comparison data_manager/resource_building.py @ 39:ec6252ad1a8e draft

planemo upload commit 43e2a01d7519104c2c16510e4dbdc023e65c49c7-dirty
author dchristiany
date Tue, 29 Jan 2019 10:25:49 -0500
parents a4811c440b45
children fddf4a3847f4
comparison
equal deleted inserted replaced
38:49467e5f78a6 39:ec6252ad1a8e
280 ##BioGRID 280 ##BioGRID
281 if interactome=="biogrid": 281 if interactome=="biogrid":
282 282
283 tab2_link="https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip" 283 tab2_link="https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"
284 284
285 #dowload zip file 285 #download zip file
286 r = requests.get(tab2_link) 286 r = requests.get(tab2_link)
287 with open("BioGRID.zip", "wb") as code: 287 with open("BioGRID.zip", "wb") as code:
288 code.write(r.content) 288 code.write(r.content)
289 289
290 #unzip files 290 #unzip files
296 file_path="tmp_BioGRID/BIOGRID-ORGANISM-"+species_dict[species].replace(" ","_")+"-3.5.167.tab2.txt" 296 file_path="tmp_BioGRID/BIOGRID-ORGANISM-"+species_dict[species].replace(" ","_")+"-3.5.167.tab2.txt"
297 with open(file_path,"r") as handle : 297 with open(file_path,"r") as handle :
298 tab_file = csv.reader(handle,delimiter="\t") 298 tab_file = csv.reader(handle,delimiter="\t")
299 dico_network = {} 299 dico_network = {}
300 GeneID_index=1 300 GeneID_index=1
301 network_cols=[1,2,7,8,11,12,18,20] 301 network_cols=[1,2,7,8,11,12,14,18,20]
302 for line in tab_file : 302 for line in tab_file :
303 if line[GeneID_index] not in dico_network: 303 if line[GeneID_index] not in dico_network:
304 dico_network[line[GeneID_index]]=[[line[i] for i in network_cols]] 304 dico_network[line[GeneID_index]]=[[line[i] for i in network_cols]]
305 else: 305 else:
306 dico_network[line[GeneID_index]].append([line[i] for i in network_cols]) 306 dico_network[line[GeneID_index]].append([line[i] for i in network_cols])
307 307
308 #delete tmp_BioGRID directory 308 #delete tmp_BioGRID directory
309 os.remove("BioGRID.zip") 309 os.remove("BioGRID.zip")
310 shutil.rmtree("tmp_BioGRID", ignore_errors=True) 310 shutil.rmtree("tmp_BioGRID", ignore_errors=True)
311 311
312 #download NCBI2Reactome.txt file and build dictionary 312 #download NCBI2Reactome.txt file and build dictionary
313 r = requests.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') 313 with requests.Session() as s:
314 r.encoding ="utf-8" 314 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
315 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') 315 r.encoding = r.apparent_encoding
316 tab_file = csv.reader(r.text.splitlines(), delimiter='\t')
317
316 dico_nodes = {} 318 dico_nodes = {}
317 GeneID_index=0 319 uniProt_index=0
318 pathway_description_index=3 320 pathway_description_index=3
319 species_index=5 321 species_index=5
320 for line in tab_file : 322 for line in tab_file :
321 if line[species_index]==species_dict[species]: 323 if line[species_index]==species_dict[species]:
322 if line[GeneID_index] in dico_nodes : 324 if line[uniProt_index] in dico_nodes :
323 dico_nodes[line[GeneID_index]].append(line[pathway_description_index]) 325 dico_nodes[line[uniProt_index]].append(line[pathway_description_index])
324 else : 326 else :
325 dico_nodes[line[GeneID_index]] = [line[pathway_description_index]] 327 dico_nodes[line[uniProt_index]] = [line[pathway_description_index]]
326 328
327 dico={} 329 dico={}
328 dico['network']=dico_network 330 dico['network']=dico_network
329 dico['nodes']=dico_nodes 331 dico['nodes']=dico_nodes
330 332
331 ##Bioplex 333 ##Bioplex
332 elif interactome=="bioplex": 334 elif interactome=="bioplex":
333 335
334 r = requests.get("http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv") 336 with requests.Session() as s:
335 r.encoding ="utf-8" 337 r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
336 bioplex = csv.reader(r.content.splitlines(), delimiter='\t') 338 r = r.content.decode('utf-8')
339 bioplex = csv.reader(r.splitlines(), delimiter='\t')
340
337 dico_network = {} 341 dico_network = {}
338 dico_network["GeneID"]={} 342 dico_network["GeneID"]={}
339 network_geneid_cols=[0,1,4,5,8] 343 network_geneid_cols=[0,1,4,5,8]
340 dico_network["UniProt-AC"]={} 344 dico_network["UniProt-AC"]={}
341 network_uniprot_cols=[2,3,4,5,8] 345 network_uniprot_cols=[2,3,4,5,8]
342 dico_GeneID_to_UniProt = {} 346 dico_GeneID_to_UniProt = {}
343 dico_nodes = {}
344 for line in bioplex : 347 for line in bioplex :
345 if line[0] not in dico_network["GeneID"]: 348 if line[0] not in dico_network["GeneID"]:
346 dico_network["GeneID"][line[0]]=[[line[i] for i in network_geneid_cols]] 349 dico_network["GeneID"][line[0]]=[[line[i] for i in network_geneid_cols]]
347 else : 350 else :
348 dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols]) 351 dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols])
350 dico_network["UniProt-AC"][line[2]]=[[line[i] for i in network_uniprot_cols]] 353 dico_network["UniProt-AC"][line[2]]=[[line[i] for i in network_uniprot_cols]]
351 else: 354 else:
352 dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols]) 355 dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols])
353 dico_GeneID_to_UniProt[line[0]]=line[2] 356 dico_GeneID_to_UniProt[line[0]]=line[2]
354 357
355 r = requests.get("https://reactome.org/download/current/UniProt2Reactome.txt") 358 with requests.Session() as s:
356 r.encoding ="utf-8" 359 download = s.get('https://reactome.org/download/current/UniProt2Reactome.txt')
357 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') 360 decoded_content = download.content.decode('utf-8')
358 dico_nodes = {} 361 tab_file = csv.reader(decoded_content.splitlines(), delimiter='\t')
362
363 dico_nodes_uniprot = {}
359 uniProt_index=0 364 uniProt_index=0
360 pathway_description_index=3 365 pathway_description_index=3
361 species_index=5 366 species_index=5
362 for line in tab_file : 367 for line in tab_file :
363 if line[species_index]==species_dict[species]: 368 if line[species_index]==species_dict[species]:
364 if line[uniProt_index] in dico_nodes : 369 if line[uniProt_index] in dico_nodes_uniprot :
365 dico_nodes[line[uniProt_index]].append(line[pathway_description_index]) 370 dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index])
366 else : 371 else :
367 dico_nodes[line[uniProt_index]] = [line[pathway_description_index]] 372 dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]]
373
374 with requests.Session() as s:
375 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
376 r.encoding = r.apparent_encoding
377 tab_file = csv.reader(r.text.splitlines(), delimiter='\t')
378
379 dico_nodes_geneid = {}
380 uniProt_index=0
381 pathway_description_index=3
382 species_index=5
383 for line in tab_file :
384 if line[species_index]==species_dict[species]:
385 if line[uniProt_index] in dico_nodes_geneid :
386 dico_nodes_geneid[line[uniProt_index]].append(line[pathway_description_index])
387 else :
388 dico_nodes_geneid[line[uniProt_index]] = [line[pathway_description_index]]
368 389
369 dico={} 390 dico={}
391 dico_nodes={}
392 dico_nodes['GeneID']=dico_nodes_geneid
393 dico_nodes['UniProt-AC']=dico_nodes_uniprot
370 dico['network']=dico_network 394 dico['network']=dico_network
371 dico['nodes']=dico_nodes 395 dico['nodes']=dico_nodes
372 dico['convert']=dico_GeneID_to_UniProt 396 dico['convert']=dico_GeneID_to_UniProt
373 397
374 #writing output 398 #writing output
375 output_file = species+'_'+interactome+'_dict_'+ time.strftime("%d-%m-%Y") + ".json" 399 output_file = species+'_'+interactome+'_dict_'+ time.strftime("%d-%m-%Y") + ".json"
376 path = os.path.join(target_directory,output_file) 400 path = os.path.join(target_directory,output_file)
377 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") 401 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y")
378 id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y") 402 id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y")
379 403
380 with open(path, 'w') as handle: 404 with open(path, 'w', encoding="utf-8") as handle:
381 json.dump(dico, handle, sort_keys=True) 405 json.dump(dico, handle, sort_keys=True)
382 406
383 data_table_entry = dict(id=id, name = name, value = species, path = path) 407 data_table_entry = dict(id=id, name = name, value = species, path = path)
384 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") 408 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
385 409