Mercurial > repos > dchristiany > data_manager_proteore
comparison data_manager/resource_building.py @ 39:ec6252ad1a8e draft
planemo upload commit 43e2a01d7519104c2c16510e4dbdc023e65c49c7-dirty
author | dchristiany |
---|---|
date | Tue, 29 Jan 2019 10:25:49 -0500 |
parents | a4811c440b45 |
children | fddf4a3847f4 |
comparison
Legend (row states in the table below):
- equal
- deleted
- inserted
- replaced
38:49467e5f78a6 | 39:ec6252ad1a8e |
---|---|
280 ##BioGRID | 280 ##BioGRID |
281 if interactome=="biogrid": | 281 if interactome=="biogrid": |
282 | 282 |
283 tab2_link="https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip" | 283 tab2_link="https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip" |
284 | 284 |
285 #dowload zip file | 285 #download zip file |
286 r = requests.get(tab2_link) | 286 r = requests.get(tab2_link) |
287 with open("BioGRID.zip", "wb") as code: | 287 with open("BioGRID.zip", "wb") as code: |
288 code.write(r.content) | 288 code.write(r.content) |
289 | 289 |
290 #unzip files | 290 #unzip files |
296 file_path="tmp_BioGRID/BIOGRID-ORGANISM-"+species_dict[species].replace(" ","_")+"-3.5.167.tab2.txt" | 296 file_path="tmp_BioGRID/BIOGRID-ORGANISM-"+species_dict[species].replace(" ","_")+"-3.5.167.tab2.txt" |
297 with open(file_path,"r") as handle : | 297 with open(file_path,"r") as handle : |
298 tab_file = csv.reader(handle,delimiter="\t") | 298 tab_file = csv.reader(handle,delimiter="\t") |
299 dico_network = {} | 299 dico_network = {} |
300 GeneID_index=1 | 300 GeneID_index=1 |
301 network_cols=[1,2,7,8,11,12,18,20] | 301 network_cols=[1,2,7,8,11,12,14,18,20] |
302 for line in tab_file : | 302 for line in tab_file : |
303 if line[GeneID_index] not in dico_network: | 303 if line[GeneID_index] not in dico_network: |
304 dico_network[line[GeneID_index]]=[[line[i] for i in network_cols]] | 304 dico_network[line[GeneID_index]]=[[line[i] for i in network_cols]] |
305 else: | 305 else: |
306 dico_network[line[GeneID_index]].append([line[i] for i in network_cols]) | 306 dico_network[line[GeneID_index]].append([line[i] for i in network_cols]) |
307 | 307 |
308 #delete tmp_BioGRID directory | 308 #delete tmp_BioGRID directory |
309 os.remove("BioGRID.zip") | 309 os.remove("BioGRID.zip") |
310 shutil.rmtree("tmp_BioGRID", ignore_errors=True) | 310 shutil.rmtree("tmp_BioGRID", ignore_errors=True) |
311 | 311 |
312 #download NCBI2Reactome.txt file and build dictionary | 312 #download NCBI2Reactome.txt file and build dictionary |
313 r = requests.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') | 313 with requests.Session() as s: |
314 r.encoding ="utf-8" | 314 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') |
315 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') | 315 r.encoding = r.apparent_encoding |
316 tab_file = csv.reader(r.text.splitlines(), delimiter='\t') | |
317 | |
316 dico_nodes = {} | 318 dico_nodes = {} |
317 GeneID_index=0 | 319 uniProt_index=0 |
318 pathway_description_index=3 | 320 pathway_description_index=3 |
319 species_index=5 | 321 species_index=5 |
320 for line in tab_file : | 322 for line in tab_file : |
321 if line[species_index]==species_dict[species]: | 323 if line[species_index]==species_dict[species]: |
322 if line[GeneID_index] in dico_nodes : | 324 if line[uniProt_index] in dico_nodes : |
323 dico_nodes[line[GeneID_index]].append(line[pathway_description_index]) | 325 dico_nodes[line[uniProt_index]].append(line[pathway_description_index]) |
324 else : | 326 else : |
325 dico_nodes[line[GeneID_index]] = [line[pathway_description_index]] | 327 dico_nodes[line[uniProt_index]] = [line[pathway_description_index]] |
326 | 328 |
327 dico={} | 329 dico={} |
328 dico['network']=dico_network | 330 dico['network']=dico_network |
329 dico['nodes']=dico_nodes | 331 dico['nodes']=dico_nodes |
330 | 332 |
331 ##Bioplex | 333 ##Bioplex |
332 elif interactome=="bioplex": | 334 elif interactome=="bioplex": |
333 | 335 |
334 r = requests.get("http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv") | 336 with requests.Session() as s: |
335 r.encoding ="utf-8" | 337 r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv') |
336 bioplex = csv.reader(r.content.splitlines(), delimiter='\t') | 338 r = r.content.decode('utf-8') |
339 bioplex = csv.reader(r.splitlines(), delimiter='\t') | |
340 | |
337 dico_network = {} | 341 dico_network = {} |
338 dico_network["GeneID"]={} | 342 dico_network["GeneID"]={} |
339 network_geneid_cols=[0,1,4,5,8] | 343 network_geneid_cols=[0,1,4,5,8] |
340 dico_network["UniProt-AC"]={} | 344 dico_network["UniProt-AC"]={} |
341 network_uniprot_cols=[2,3,4,5,8] | 345 network_uniprot_cols=[2,3,4,5,8] |
342 dico_GeneID_to_UniProt = {} | 346 dico_GeneID_to_UniProt = {} |
343 dico_nodes = {} | |
344 for line in bioplex : | 347 for line in bioplex : |
345 if line[0] not in dico_network["GeneID"]: | 348 if line[0] not in dico_network["GeneID"]: |
346 dico_network["GeneID"][line[0]]=[[line[i] for i in network_geneid_cols]] | 349 dico_network["GeneID"][line[0]]=[[line[i] for i in network_geneid_cols]] |
347 else : | 350 else : |
348 dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols]) | 351 dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols]) |
350 dico_network["UniProt-AC"][line[2]]=[[line[i] for i in network_uniprot_cols]] | 353 dico_network["UniProt-AC"][line[2]]=[[line[i] for i in network_uniprot_cols]] |
351 else: | 354 else: |
352 dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols]) | 355 dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols]) |
353 dico_GeneID_to_UniProt[line[0]]=line[2] | 356 dico_GeneID_to_UniProt[line[0]]=line[2] |
354 | 357 |
355 r = requests.get("https://reactome.org/download/current/UniProt2Reactome.txt") | 358 with requests.Session() as s: |
356 r.encoding ="utf-8" | 359 download = s.get('https://reactome.org/download/current/UniProt2Reactome.txt') |
357 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') | 360 decoded_content = download.content.decode('utf-8') |
358 dico_nodes = {} | 361 tab_file = csv.reader(decoded_content.splitlines(), delimiter='\t') |
362 | |
363 dico_nodes_uniprot = {} | |
359 uniProt_index=0 | 364 uniProt_index=0 |
360 pathway_description_index=3 | 365 pathway_description_index=3 |
361 species_index=5 | 366 species_index=5 |
362 for line in tab_file : | 367 for line in tab_file : |
363 if line[species_index]==species_dict[species]: | 368 if line[species_index]==species_dict[species]: |
364 if line[uniProt_index] in dico_nodes : | 369 if line[uniProt_index] in dico_nodes_uniprot : |
365 dico_nodes[line[uniProt_index]].append(line[pathway_description_index]) | 370 dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index]) |
366 else : | 371 else : |
367 dico_nodes[line[uniProt_index]] = [line[pathway_description_index]] | 372 dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]] |
373 | |
374 with requests.Session() as s: | |
375 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') | |
376 r.encoding = r.apparent_encoding | |
377 tab_file = csv.reader(r.text.splitlines(), delimiter='\t') | |
378 | |
379 dico_nodes_geneid = {} | |
380 uniProt_index=0 | |
381 pathway_description_index=3 | |
382 species_index=5 | |
383 for line in tab_file : | |
384 if line[species_index]==species_dict[species]: | |
385 if line[uniProt_index] in dico_nodes_geneid : | |
386 dico_nodes_geneid[line[uniProt_index]].append(line[pathway_description_index]) | |
387 else : | |
388 dico_nodes_geneid[line[uniProt_index]] = [line[pathway_description_index]] | |
368 | 389 |
369 dico={} | 390 dico={} |
391 dico_nodes={} | |
392 dico_nodes['GeneID']=dico_nodes_geneid | |
393 dico_nodes['UniProt-AC']=dico_nodes_uniprot | |
370 dico['network']=dico_network | 394 dico['network']=dico_network |
371 dico['nodes']=dico_nodes | 395 dico['nodes']=dico_nodes |
372 dico['convert']=dico_GeneID_to_UniProt | 396 dico['convert']=dico_GeneID_to_UniProt |
373 | 397 |
374 #writing output | 398 #writing output |
375 output_file = species+'_'+interactome+'_dict_'+ time.strftime("%d-%m-%Y") + ".json" | 399 output_file = species+'_'+interactome+'_dict_'+ time.strftime("%d-%m-%Y") + ".json" |
376 path = os.path.join(target_directory,output_file) | 400 path = os.path.join(target_directory,output_file) |
377 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") | 401 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") |
378 id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y") | 402 id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y") |
379 | 403 |
380 with open(path, 'w') as handle: | 404 with open(path, 'w', encoding="utf-8") as handle: |
381 json.dump(dico, handle, sort_keys=True) | 405 json.dump(dico, handle, sort_keys=True) |
382 | 406 |
383 data_table_entry = dict(id=id, name = name, value = species, path = path) | 407 data_table_entry = dict(id=id, name = name, value = species, path = path) |
384 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") | 408 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") |
385 | 409 |