Mercurial > repos > dchristiany > data_manager_proteore
diff data_manager/resource_building.py @ 46:80fc0b28e227 draft
planemo upload commit cb633de1f04ef7b7133728909716b6c6594533d1-dirty
author | dchristiany |
---|---|
date | Fri, 01 Feb 2019 10:21:58 -0500 |
parents | 3febf3d1139a |
children | 7b486b0fba4e |
line wrap: on
line diff
--- a/data_manager/resource_building.py Thu Jan 31 08:58:26 2019 -0500 +++ b/data_manager/resource_building.py Fri Feb 01 10:21:58 2019 -0500 @@ -113,6 +113,13 @@ else : return False +def check_entrez_geneid (id) : + entrez_pattern = re.complie("[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+") + if entrez_pattern.match(id) : + return True + else : + return False + ####################################################################################################### # 3. ID mapping file ####################################################################################################### @@ -272,6 +279,21 @@ # 4. Build protein interaction maps files ####################################################################################################### +def get_interactant_name(line): + + if line[0] in dico_geneid_to_gene_name : + print line[0] + interactant_A = dico_geneid_to_gene_name[line[0]] + else : + interactant_A = "NA" + + if line[1] in dico_geneid_to_gene_name : + interactant_B = dico_geneid_to_gene_name[line[1]] + else : + interactant_B = "NA" + + return interactant_A, interactant_B + def PPI_ref_files(data_manager_dict, species, interactome, target_directory): species_dict={'Human':'Homo sapiens',"Mouse":"Mus musculus","Rat":"Rattus norvegicus"} @@ -315,15 +337,15 @@ tab_file = csv.reader(r.content.splitlines(), delimiter='\t') dico_nodes = {} - uniProt_index=0 + geneid_index=0 pathway_description_index=3 species_index=5 for line in tab_file : if line[species_index]==species_dict[species]: - if line[uniProt_index] in dico_nodes : - dico_nodes[line[uniProt_index]].append(line[pathway_description_index]) + if line[geneid_index] in dico_nodes : + dico_nodes[line[geneid_index]].append(line[pathway_description_index]) else : - dico_nodes[line[uniProt_index]] = [line[pathway_description_index]] + dico_nodes[line[geneid_index]] = [line[pathway_description_index]] dico={} dico['network']=dico_network @@ -376,15 +398,15 @@ tab_file = csv.reader(r.content.splitlines(), delimiter='\t') dico_nodes_geneid = {} - uniProt_index=0 + geneid_index=0 pathway_description_index=3 species_index=5 for line in tab_file : if line[species_index]==species_dict[species]: - if line[uniProt_index] in dico_nodes_geneid : - dico_nodes_geneid[line[uniProt_index]].append(line[pathway_description_index]) + if line[geneid_index] in dico_nodes_geneid : + dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index]) else : - dico_nodes_geneid[line[uniProt_index]] = [line[pathway_description_index]] + dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]] dico={} dico_nodes={} @@ -394,6 +416,61 @@ dico['nodes']=dico_nodes dico['convert']=dico_GeneID_to_UniProt + ##Humap + elif interactome=="humap": + + with requests.Session() as s: + r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt') + r = r.content.decode('utf-8') + humap_nodes = csv.reader(r.splitlines(), delimiter=',') + + dico_geneid_to_gene_name={} + for line in humap_nodes : + if check_entrez_geneid(line[5]): + if line[5] not in dico_geneid_to_gene_name: + dico_geneid_to_gene_name[line[5]]=[line[4]] + else : + if line[4] not in dico_geneid_to_gene_name[line[5]] : + dico_geneid_to_gene_name[line[5]].append(line[4]) + + with requests.Session() as s: + r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt') + r = r.content.decode('utf-8') + humap = csv.reader(r.splitlines(), delimiter='\t') + + dico_network = {} + for line in humap : + if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]): + + interactant_A, interactant_B = get_interactant_name(line,dico_geneid_to_gene_name) + + if line[0] not in dico_network: + dico_network[line[0]]=[line[:2]+[interactant_A,interactant_B,line[2]]] + else : + dico_network[line[0]].append(line[:2]+[interactant_A,interactant_B,line[2]]) + + with requests.Session() as s: + r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') + r.encoding ="utf-8" + tab_file = csv.reader(r.content.splitlines(), delimiter='\t') + + dico_nodes = {} + geneid_index=0 + pathway_description_index=3 + species_index=5 + for line in tab_file : + if line[species_index]==species_dict[species]: + #Fill dictionary with pathways + if line[geneid_index] in dico_nodes : + dico_nodes[line[geneid_index]].append(line[pathway_description_index]) + else : + dico_nodes[line[geneid_index]] = [line[pathway_description_index]] + + dico={} + dico['network']=dico_network + dico['nodes']=dico_nodes + dico['gene_name']=dico_geneid_to_gene_name + #writing output output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json" path = os.path.join(target_directory,output_file) @@ -464,7 +541,10 @@ ## Download PPI ref files from biogrid/bioplex/humap try: interactome=args.interactome - species=args.species + if interactome == "biogrid" : + species=args.species + else : + species="Human" except NameError: interactome=None species=None