comparison data_manager/resource_building.py @ 46:80fc0b28e227 draft

planemo upload commit cb633de1f04ef7b7133728909716b6c6594533d1-dirty
author dchristiany
date Fri, 01 Feb 2019 10:21:58 -0500
parents 3febf3d1139a
children 7b486b0fba4e
comparison
equal deleted inserted replaced
45:ec7a4d773c45 46:80fc0b28e227
107 107
def check_uniprot_access (id) :
    """Tell whether an id looks like a UniProt accession number.

    id: string to test.

    Returns True when the beginning of `id` matches the accession pattern,
    False otherwise. The test uses `match`, so only the start of the string
    is anchored and trailing characters after a valid accession are accepted.
    """
    uniprot_pattern = re.compile(r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
    return uniprot_pattern.match(id) is not None
115
def check_entrez_geneid (id) :
    """Tell whether an id looks like an Entrez gene id.

    id: string to test.

    Returns True when the beginning of `id` matches one of the accepted
    shapes (digits only, or one/two capital letters, an underscore, an
    optional run of up to four capital letters, then digits), False otherwise.
    The test uses `match`, so only the start of the string is anchored.
    """
    # `re.compile` (the original spelled it `re.complie`, which raised
    # AttributeError on every call).
    entrez_pattern = re.compile(r"[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+")
    return entrez_pattern.match(id) is not None
115 122
116 ####################################################################################################### 123 #######################################################################################################
270 277
271 ####################################################################################################### 278 #######################################################################################################
272 # 4. Build protein interaction maps files 279 # 4. Build protein interaction maps files
273 ####################################################################################################### 280 #######################################################################################################
274 281
def get_interactant_name(line, dico_geneid_to_gene_name):
    """Look up the gene names of the two interactants of an interaction row.

    line: sequence whose first two items are the gene ids of interactants
        A and B.
    dico_geneid_to_gene_name: mapping from gene id to the gene name value
        stored for it. It is passed in explicitly because the caller builds
        it as a local variable, so it is not visible here otherwise.

    Returns a tuple (interactant_A, interactant_B). Each element is the value
    stored in the mapping for the corresponding id, or the string "NA" when
    the id is not in the mapping.
    """
    interactant_A = dico_geneid_to_gene_name.get(line[0], "NA")
    interactant_B = dico_geneid_to_gene_name.get(line[1], "NA")

    return interactant_A, interactant_B
296
275 def PPI_ref_files(data_manager_dict, species, interactome, target_directory): 297 def PPI_ref_files(data_manager_dict, species, interactome, target_directory):
276 298
277 species_dict={'Human':'Homo sapiens',"Mouse":"Mus musculus","Rat":"Rattus norvegicus"} 299 species_dict={'Human':'Homo sapiens',"Mouse":"Mus musculus","Rat":"Rattus norvegicus"}
278 300
279 ##BioGRID 301 ##BioGRID
313 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') 335 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
314 r.encoding ="utf-8" 336 r.encoding ="utf-8"
315 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') 337 tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
316 338
317 dico_nodes = {} 339 dico_nodes = {}
318 uniProt_index=0 340 geneid_index=0
319 pathway_description_index=3 341 pathway_description_index=3
320 species_index=5 342 species_index=5
321 for line in tab_file : 343 for line in tab_file :
322 if line[species_index]==species_dict[species]: 344 if line[species_index]==species_dict[species]:
323 if line[uniProt_index] in dico_nodes : 345 if line[geneid_index] in dico_nodes :
324 dico_nodes[line[uniProt_index]].append(line[pathway_description_index]) 346 dico_nodes[line[geneid_index]].append(line[pathway_description_index])
325 else : 347 else :
326 dico_nodes[line[uniProt_index]] = [line[pathway_description_index]] 348 dico_nodes[line[geneid_index]] = [line[pathway_description_index]]
327 349
328 dico={} 350 dico={}
329 dico['network']=dico_network 351 dico['network']=dico_network
330 dico['nodes']=dico_nodes 352 dico['nodes']=dico_nodes
331 353
374 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') 396 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
375 r.encoding ="utf-8" 397 r.encoding ="utf-8"
376 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') 398 tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
377 399
378 dico_nodes_geneid = {} 400 dico_nodes_geneid = {}
379 uniProt_index=0 401 geneid_index=0
380 pathway_description_index=3 402 pathway_description_index=3
381 species_index=5 403 species_index=5
382 for line in tab_file : 404 for line in tab_file :
383 if line[species_index]==species_dict[species]: 405 if line[species_index]==species_dict[species]:
384 if line[uniProt_index] in dico_nodes_geneid : 406 if line[geneid_index] in dico_nodes_geneid :
385 dico_nodes_geneid[line[uniProt_index]].append(line[pathway_description_index]) 407 dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index])
386 else : 408 else :
387 dico_nodes_geneid[line[uniProt_index]] = [line[pathway_description_index]] 409 dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]]
388 410
389 dico={} 411 dico={}
390 dico_nodes={} 412 dico_nodes={}
391 dico_nodes['GeneID']=dico_nodes_geneid 413 dico_nodes['GeneID']=dico_nodes_geneid
392 dico_nodes['UniProt-AC']=dico_nodes_uniprot 414 dico_nodes['UniProt-AC']=dico_nodes_uniprot
393 dico['network']=dico_network 415 dico['network']=dico_network
394 dico['nodes']=dico_nodes 416 dico['nodes']=dico_nodes
395 dico['convert']=dico_GeneID_to_UniProt 417 dico['convert']=dico_GeneID_to_UniProt
418
419 ##Humap
420 elif interactome=="humap":
421
422 with requests.Session() as s:
423 r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt')
424 r = r.content.decode('utf-8')
425 humap_nodes = csv.reader(r.splitlines(), delimiter=',')
426
427 dico_geneid_to_gene_name={}
428 for line in humap_nodes :
429 if check_entrez_geneid(line[5]):
430 if line[5] not in dico_geneid_to_gene_name:
431 dico_geneid_to_gene_name[line[5]]=[line[4]]
432 else :
433 if line[4] not in dico_geneid_to_gene_name[line[5]] :
434 dico_geneid_to_gene_name[line[5]].append(line[4])
435
436 with requests.Session() as s:
437 r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt')
438 r = r.content.decode('utf-8')
439 humap = csv.reader(r.splitlines(), delimiter='\t')
440
441 dico_network = {}
442 for line in humap :
443 if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]):
444
445 interactant_A, interactant_B = get_interactant_name(line,dico_geneid_to_gene_name)
446
447 if line[0] not in dico_network:
448 dico_network[line[0]]=[line[:2]+[interactant_A,interactant_B,line[2]]]
449 else :
450 dico_network[line[0]].append(line[:2]+[interactant_A,interactant_B,line[2]])
451
452 with requests.Session() as s:
453 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
454 r.encoding ="utf-8"
455 tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
456
457 dico_nodes = {}
458 geneid_index=0
459 pathway_description_index=3
460 species_index=5
461 for line in tab_file :
462 if line[species_index]==species_dict[species]:
463 #Fill dictionary with pathways
464 if line[geneid_index] in dico_nodes :
465 dico_nodes[line[geneid_index]].append(line[pathway_description_index])
466 else :
467 dico_nodes[line[geneid_index]] = [line[pathway_description_index]]
468
469 dico={}
470 dico['network']=dico_network
471 dico['nodes']=dico_nodes
472 dico['gene_name']=dico_geneid_to_gene_name
396 473
397 #writing output 474 #writing output
398 output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json" 475 output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json"
399 path = os.path.join(target_directory,output_file) 476 path = os.path.join(target_directory,output_file)
400 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") 477 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y")
462 id_mapping_sources(data_manager_dict, species, target_directory) 539 id_mapping_sources(data_manager_dict, species, target_directory)
463 540
464 ## Download PPI ref files from biogrid/bioplex/humap 541 ## Download PPI ref files from biogrid/bioplex/humap
465 try: 542 try:
466 interactome=args.interactome 543 interactome=args.interactome
467 species=args.species 544 if interactome == "biogrid" :
545 species=args.species
546 else :
547 species="Human"
468 except NameError: 548 except NameError:
469 interactome=None 549 interactome=None
470 species=None 550 species=None
471 if interactome is not None and species is not None: 551 if interactome is not None and species is not None:
472 PPI_ref_files(data_manager_dict, species, interactome, target_directory) 552 PPI_ref_files(data_manager_dict, species, interactome, target_directory)