Mercurial > repos > dchristiany > data_manager_proteore
comparison data_manager/resource_building.py @ 46:80fc0b28e227 draft
planemo upload commit cb633de1f04ef7b7133728909716b6c6594533d1-dirty
author | dchristiany |
---|---|
date | Fri, 01 Feb 2019 10:21:58 -0500 |
parents | 3febf3d1139a |
children | 7b486b0fba4e |
comparison
equal
deleted
inserted
replaced
45:ec7a4d773c45 | 46:80fc0b28e227 |
---|---|
107 | 107 |
# Check whether an id looks like a UniProt accession number: returns True or False.
def check_uniprot_access(id):
    """Return True when `id` begins with a UniProt accession number.

    The pattern is applied with ``match``, so only the start of the string
    is tested: characters following a valid accession are not rejected.

    id -- the identifier string to test.
    """
    accession_re = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
    return accession_re.match(id) is not None
115 | |
# Check whether an id looks like an Entrez gene id: returns True or False.
def check_entrez_geneid(id):
    """Return True when `id` begins with an Entrez-style identifier.

    Accepted prefixes are:
      * one or more digits,
      * one or two uppercase letters, an underscore, then digits,
      * one or two uppercase letters, an underscore, one to four uppercase
        letters, then digits.

    The pattern is applied with ``match``, so only the start of the string
    is tested.

    id -- the identifier string to test.
    """
    # The original called `re.complie`, which raised AttributeError on every call.
    entrez_pattern = re.compile("[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+")
    return entrez_pattern.match(id) is not None
115 | 122 |
116 ####################################################################################################### | 123 ####################################################################################################### |
270 | 277 |
271 ####################################################################################################### | 278 ####################################################################################################### |
272 # 4. Build protein interaction maps files | 279 # 4. Build protein interaction maps files |
273 ####################################################################################################### | 280 ####################################################################################################### |
274 | 281 |
def get_interactant_name(line, dico_geneid_to_gene_name=None):
    """Look up the gene names of the two interactants of an interaction row.

    line -- a sequence whose first two items are the ids of interactant A
            and interactant B.
    dico_geneid_to_gene_name -- mapping from an id to its gene name entry.
            It is passed explicitly because the mapping is built locally by
            the caller and is not visible from this function's own scope.
            When omitted, no name is known and both results are "NA".

    Returns a tuple (interactant_A, interactant_B); each item is the mapping
    entry for the corresponding id, or the string "NA" when the id is absent.
    """
    if dico_geneid_to_gene_name is None:
        dico_geneid_to_gene_name = {}

    # A leftover debug print of line[0] was removed here.
    interactant_A = dico_geneid_to_gene_name.get(line[0], "NA")
    interactant_B = dico_geneid_to_gene_name.get(line[1], "NA")

    return interactant_A, interactant_B
296 | |
275 def PPI_ref_files(data_manager_dict, species, interactome, target_directory): | 297 def PPI_ref_files(data_manager_dict, species, interactome, target_directory): |
276 | 298 |
277 species_dict={'Human':'Homo sapiens',"Mouse":"Mus musculus","Rat":"Rattus norvegicus"} | 299 species_dict={'Human':'Homo sapiens',"Mouse":"Mus musculus","Rat":"Rattus norvegicus"} |
278 | 300 |
279 ##BioGRID | 301 ##BioGRID |
313 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') | 335 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') |
314 r.encoding ="utf-8" | 336 r.encoding ="utf-8" |
315 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') | 337 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') |
316 | 338 |
317 dico_nodes = {} | 339 dico_nodes = {} |
318 uniProt_index=0 | 340 geneid_index=0 |
319 pathway_description_index=3 | 341 pathway_description_index=3 |
320 species_index=5 | 342 species_index=5 |
321 for line in tab_file : | 343 for line in tab_file : |
322 if line[species_index]==species_dict[species]: | 344 if line[species_index]==species_dict[species]: |
323 if line[uniProt_index] in dico_nodes : | 345 if line[geneid_index] in dico_nodes : |
324 dico_nodes[line[uniProt_index]].append(line[pathway_description_index]) | 346 dico_nodes[line[geneid_index]].append(line[pathway_description_index]) |
325 else : | 347 else : |
326 dico_nodes[line[uniProt_index]] = [line[pathway_description_index]] | 348 dico_nodes[line[geneid_index]] = [line[pathway_description_index]] |
327 | 349 |
328 dico={} | 350 dico={} |
329 dico['network']=dico_network | 351 dico['network']=dico_network |
330 dico['nodes']=dico_nodes | 352 dico['nodes']=dico_nodes |
331 | 353 |
374 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') | 396 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') |
375 r.encoding ="utf-8" | 397 r.encoding ="utf-8" |
376 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') | 398 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') |
377 | 399 |
378 dico_nodes_geneid = {} | 400 dico_nodes_geneid = {} |
379 uniProt_index=0 | 401 geneid_index=0 |
380 pathway_description_index=3 | 402 pathway_description_index=3 |
381 species_index=5 | 403 species_index=5 |
382 for line in tab_file : | 404 for line in tab_file : |
383 if line[species_index]==species_dict[species]: | 405 if line[species_index]==species_dict[species]: |
384 if line[uniProt_index] in dico_nodes_geneid : | 406 if line[geneid_index] in dico_nodes_geneid : |
385 dico_nodes_geneid[line[uniProt_index]].append(line[pathway_description_index]) | 407 dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index]) |
386 else : | 408 else : |
387 dico_nodes_geneid[line[uniProt_index]] = [line[pathway_description_index]] | 409 dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]] |
388 | 410 |
389 dico={} | 411 dico={} |
390 dico_nodes={} | 412 dico_nodes={} |
391 dico_nodes['GeneID']=dico_nodes_geneid | 413 dico_nodes['GeneID']=dico_nodes_geneid |
392 dico_nodes['UniProt-AC']=dico_nodes_uniprot | 414 dico_nodes['UniProt-AC']=dico_nodes_uniprot |
393 dico['network']=dico_network | 415 dico['network']=dico_network |
394 dico['nodes']=dico_nodes | 416 dico['nodes']=dico_nodes |
395 dico['convert']=dico_GeneID_to_UniProt | 417 dico['convert']=dico_GeneID_to_UniProt |
418 | |
419 ##Humap | |
420 elif interactome=="humap": | |
421 | |
422 with requests.Session() as s: | |
423 r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt') | |
424 r = r.content.decode('utf-8') | |
425 humap_nodes = csv.reader(r.splitlines(), delimiter=',') | |
426 | |
427 dico_geneid_to_gene_name={} | |
428 for line in humap_nodes : | |
429 if check_entrez_geneid(line[5]): | |
430 if line[5] not in dico_geneid_to_gene_name: | |
431 dico_geneid_to_gene_name[line[5]]=[line[4]] | |
432 else : | |
433 if line[4] not in dico_geneid_to_gene_name[line[5]] : | |
434 dico_geneid_to_gene_name[line[5]].append(line[4]) | |
435 | |
436 with requests.Session() as s: | |
437 r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt') | |
438 r = r.content.decode('utf-8') | |
439 humap = csv.reader(r.splitlines(), delimiter='\t') | |
440 | |
441 dico_network = {} | |
442 for line in humap : | |
443 if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]): | |
444 | |
445 interactant_A, interactant_B = get_interactant_name(line,dico_geneid_to_gene_name) | |
446 | |
447 if line[0] not in dico_network: | |
448 dico_network[line[0]]=[line[:2]+[interactant_A,interactant_B,line[2]]] | |
449 else : | |
450 dico_network[line[0]].append(line[:2]+[interactant_A,interactant_B,line[2]]) | |
451 | |
452 with requests.Session() as s: | |
453 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') | |
454 r.encoding ="utf-8" | |
455 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') | |
456 | |
457 dico_nodes = {} | |
458 geneid_index=0 | |
459 pathway_description_index=3 | |
460 species_index=5 | |
461 for line in tab_file : | |
462 if line[species_index]==species_dict[species]: | |
463 #Fill dictionary with pathways | |
464 if line[geneid_index] in dico_nodes : | |
465 dico_nodes[line[geneid_index]].append(line[pathway_description_index]) | |
466 else : | |
467 dico_nodes[line[geneid_index]] = [line[pathway_description_index]] | |
468 | |
469 dico={} | |
470 dico['network']=dico_network | |
471 dico['nodes']=dico_nodes | |
472 dico['gene_name']=dico_geneid_to_gene_name | |
396 | 473 |
397 #writing output | 474 #writing output |
398 output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json" | 475 output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json" |
399 path = os.path.join(target_directory,output_file) | 476 path = os.path.join(target_directory,output_file) |
400 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") | 477 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") |
462 id_mapping_sources(data_manager_dict, species, target_directory) | 539 id_mapping_sources(data_manager_dict, species, target_directory) |
463 | 540 |
464 ## Download PPI ref files from biogrid/bioplex/humap | 541 ## Download PPI ref files from biogrid/bioplex/humap |
465 try: | 542 try: |
466 interactome=args.interactome | 543 interactome=args.interactome |
467 species=args.species | 544 if interactome == "biogrid" : |
545 species=args.species | |
546 else : | |
547 species="Human" | |
468 except NameError: | 548 except NameError: |
469 interactome=None | 549 interactome=None |
470 species=None | 550 species=None |
471 if interactome is not None and species is not None: | 551 if interactome is not None and species is not None: |
472 PPI_ref_files(data_manager_dict, species, interactome, target_directory) | 552 PPI_ref_files(data_manager_dict, species, interactome, target_directory) |