Mercurial > repos > dchristiany > data_manager_proteore
comparison data_manager/resource_building.py @ 15:83f57ba70416 draft
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
author | dchristiany |
---|---|
date | Tue, 15 Jan 2019 04:29:28 -0500 |
parents | a1530507fee4 |
children | 454c2e2984ea |
comparison
equal
deleted
inserted
replaced
14:f8ed6fc5f3ae | 15:83f57ba70416 |
---|---|
1 """ | 1 """ |
2 The purpose of this script is to create source files from different databases to be used in other proteore tools | 2 The purpose of this script is to create source files from different databases to be used in other proteore tools |
3 """ | 3 """ |
4 | 4 |
5 import os, sys, argparse, requests, time, csv, re | 5 import os, sys, argparse, requests, time, csv, re, json, zipfile, shutil |
6 from io import BytesIO | 6 from io import BytesIO |
7 from zipfile import ZipFile | 7 from zipfile import ZipFile |
8 from galaxy.util.json import from_json_string, to_json_string | 8 from galaxy.util.json import from_json_string, to_json_string |
9 | 9 |
10 ####################################################################################################### | 10 ####################################################################################################### |
115 | 115 |
116 | 116 |
117 ####################################################################################################### | 117 ####################################################################################################### |
118 # 3. ID mapping file | 118 # 3. ID mapping file |
119 ####################################################################################################### | 119 ####################################################################################################### |
120 import ftplib, gzip, pickle | 120 import ftplib, gzip |
121 csv.field_size_limit(sys.maxsize) # to handle big files | 121 csv.field_size_limit(sys.maxsize) # to handle big files |
122 | 122 |
123 def id_mapping_sources (data_manager_dict, species, target_directory) : | 123 def id_mapping_sources (data_manager_dict, species, target_directory) : |
124 | 124 |
125 human = species == "human" | 125 human = species == "human" |
126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" } | 126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" } |
127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] | 127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] |
128 | 128 |
129 #header | 129 #header |
130 if human : | 130 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] |
131 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"] | 131 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] |
132 else : | |
133 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"] | |
134 tab = [ids_list] | |
135 | 132 |
136 #print("header ok") | 133 #print("header ok") |
137 | 134 |
138 #selected.tab and keep only ids of interest | 135 #get selected.tab and keep only ids of interest |
139 selected_tab_file=species_dict[species]+"_"+files[0] | 136 selected_tab_file=species_dict[species]+"_"+files[0] |
140 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) | 137 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) |
141 with gzip.open(tab_path,"rt") as select : | 138 with gzip.open(tab_path,"rt") as select : |
142 tab_reader = csv.reader(select,delimiter="\t") | 139 tab_reader = csv.reader(select,delimiter="\t") |
143 for line in tab_reader : | 140 for line in tab_reader : |
149 """ | 146 """ |
150 Supplementary ID to get from HUMAN_9606_idmapping.dat : | 147 Supplementary ID to get from HUMAN_9606_idmapping.dat : |
151 -NextProt,BioGrid,STRING,KEGG | 148 -NextProt,BioGrid,STRING,KEGG |
152 """ | 149 """ |
153 | 150 |
151 #there's more id type for human | |
154 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file | 152 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file |
155 else : ids = ['BioGrid','STRING','KEGG' ] | 153 else : ids = ['BioGrid','STRING','KEGG' ] |
156 unidict = {} | 154 unidict = {} |
157 | 155 |
158 #keep only ids of interest in dictionaries | 156 #keep only ids of interest in dictionaries |
210 uniprotID=line[0] | 208 uniprotID=line[0] |
211 nextprotID=line[13] | 209 nextprotID=line[13] |
212 if nextprotID == '' and uniprotID in next_dict : | 210 if nextprotID == '' and uniprotID in next_dict : |
213 line[13]=next_dict[uniprotID] | 211 line[13]=next_dict[uniprotID] |
214 | 212 |
215 #create empty dictionary and dictionary index | 213 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" |
216 ids_dictionary, ids_dictionary_index = create_ids_dictionary(ids_list) | |
217 | |
218 #fill dictionary and sub dictionaries with ids | |
219 for line in tab[1:] : | |
220 for index, ids in enumerate(line) : | |
221 other_id_type_index = [accession_id for accession_id in ids_dictionary_index.keys() if accession_id!=index] | |
222 for id in ids.replace(" ","").split(";") : #if there's more than one id, one key per id (example : GO) | |
223 if id not in ids_dictionary[ids_dictionary_index[index]] : #if the key is not created yet | |
224 ids_dictionary[ids_dictionary_index[index]][id]={} | |
225 for other_id_type in other_id_type_index : | |
226 if ids_dictionary_index[other_id_type] not in ids_dictionary[ids_dictionary_index[index]][id] : | |
227 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";")) | |
228 else : | |
229 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";")) | |
230 if len(ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] : | |
231 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]].remove('') | |
232 | |
233 ##writing output files | |
234 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".pickle" | |
235 path = os.path.join(target_directory,output_file) | 214 path = os.path.join(target_directory,output_file) |
236 | 215 |
237 #save ids_dictionary | 216 with open(path,"w") as out : |
238 with open(output_dict, 'wb') as handle: | 217 w = csv.writer(out,delimiter='\t') |
239 pickle.dump(ids_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL) | 218 w.writerows(tab) |
240 | 219 |
241 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} | 220 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} |
242 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") | 221 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") |
243 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") | 222 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") |
244 | 223 |
245 data_table_entry = dict(id=id, name = name, value = species, path = path) | 224 data_table_entry = dict(id=id, name = name, value = species, path = path) |
246 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_dictionaries") | 225 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping") |
247 | 226 |
248 def download_from_uniprot_ftp(file,target_directory) : | 227 def download_from_uniprot_ftp(file,target_directory) : |
249 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" | 228 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" |
250 path = os.path.join(target_directory, file) | 229 path = os.path.join(target_directory, file) |
251 ftp = ftplib.FTP("ftp.uniprot.org") | 230 ftp = ftplib.FTP("ftp.uniprot.org") |
287 else : | 266 else : |
288 return (tmp[1]) | 267 return (tmp[1]) |
289 else : | 268 else : |
290 return (next_id) | 269 return (next_id) |
291 | 270 |
292 #create empty dictionary with index for tab | 271 |
293 def create_ids_dictionary (ids_list) : | 272 ####################################################################################################### |
294 ids_dictionary = {} | 273 # 4. Build protein interaction maps files |
295 for id_type in ids_list : | 274 ####################################################################################################### |
296 ids_dictionary[id_type]={} | 275 |
def PPI_ref_files(data_manager_dict, species, interactome, target_directory):
    """
    Build a protein-protein interaction (PPI) reference dictionary for one
    species from one interactome source, write it as a JSON file into
    target_directory and register it in data_manager_dict.

    Parameters:
        data_manager_dict : dict updated in place with the new data table entry.
        species           : one of 'human', 'mouse', 'rat'.
        interactome       : 'biogrid' or 'bioplex'.
        target_directory  : directory receiving the output .json file.

    Raises:
        ValueError : if interactome is not a supported source (the original
                     code fell through both branches and crashed later with
                     a NameError on 'dico').
    """
    species_dict = {'human': 'Homo sapiens', "mouse": "Mus musculus", "rat": "Rattus norvegicus"}

    ##BioGRID
    if interactome == "biogrid":
        dico = _build_biogrid_dico(species_dict[species])
    ##Bioplex
    elif interactome == "bioplex":
        dico = _build_bioplex_dico(species_dict[species])
    else:
        # fail fast with a clear message instead of an undefined-name crash
        raise ValueError("unsupported interactome source: " + str(interactome))

    #writing output
    output_file = species + '_' + interactome + '_dict_' + time.strftime("%d-%m-%Y") + ".json"
    path = os.path.join(target_directory, output_file)
    name = species + " (" + species_dict[species] + ") " + time.strftime("%d/%m/%Y")
    # separator added before the date (the original concatenated
    # species+date directly, yielding ids like 'biogrid_human15-01-2019')
    id = interactome + "_" + species + "_" + time.strftime("%d-%m-%Y")

    with open(path, 'w') as handle:
        json.dump(dico, handle, sort_keys=True)

    data_table_entry = dict(id=id, name=name, value=species, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_" + interactome + "_dictionaries")

def _build_biogrid_dico(species_name):
    """
    Download the BioGRID 3.5.167 tab2 archive and Reactome's NCBI2Reactome
    mapping, and return {'network': ..., 'nodes': ...} keyed by GeneID.
    """
    tab2_link = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"

    #download zip file
    r = requests.get(tab2_link)
    with open("BioGRID.zip", "wb") as code:
        code.write(r.content)

    #unzip files
    with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
        if not os.path.exists("tmp_BioGRID"):
            os.makedirs("tmp_BioGRID")
        zip_ref.extractall("tmp_BioGRID")

    #import file of interest and build the network dictionary
    # NOTE(review): keyed on interactor-A GeneID only, so a gene involved in
    # several interactions keeps only the last line read -- confirm this is
    # what the downstream tool expects
    file_path = "tmp_BioGRID/BIOGRID-ORGANISM-" + species_name.replace(" ", "_") + "-3.5.167.tab2.txt"
    with open(file_path, "r") as handle:
        tab_file = csv.reader(handle, delimiter="\t")
        dico_network = {}
        GeneID_index = 1
        network_cols = [1, 2, 7, 8, 11, 12, 18, 20]
        for line in tab_file:
            dico_network[line[GeneID_index]] = [line[i] for i in network_cols]

    #delete temporary archive and tmp_BioGRID directory
    os.remove("BioGRID.zip")
    shutil.rmtree("tmp_BioGRID", ignore_errors=True)

    #download NCBI2Reactome.txt file and build the nodes (pathways) dictionary
    download = requests.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
    decoded_content = download.content.decode('utf-8')
    tab_file = csv.reader(decoded_content.splitlines(), delimiter='\t')
    dico_nodes = {}
    GeneID_index = 0
    pathway_description_index = 3
    species_index = 5
    for line in tab_file:
        if line[species_index] == species_name:
            # one GeneID can belong to several pathways
            dico_nodes.setdefault(line[GeneID_index], []).append(line[pathway_description_index])

    return {'network': dico_network, 'nodes': dico_nodes}

def _build_bioplex_dico(species_name):
    """
    Download the BioPlex v4a interaction list and Reactome's UniProt2Reactome
    mapping, and return {'network': ..., 'nodes': ..., 'convert': ...} where
    'convert' maps GeneID -> UniProt-AC.
    """
    download = requests.get("http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv")
    decoded_content = download.content.decode('utf-8')
    bioplex = csv.reader(decoded_content.splitlines(), delimiter='\t')
    # two parallel network views: one keyed by GeneID, one by UniProt-AC
    dico_network = {"GeneID": {}, "UniProt-AC": {}}
    network_geneid_cols = [0, 1, 4, 5, 8]
    network_uniprot_cols = [2, 3, 4, 5, 8]
    dico_GeneID_to_UniProt = {}
    for line in bioplex:
        dico_network["GeneID"][line[0]] = [line[i] for i in network_geneid_cols]
        dico_network["UniProt-AC"][line[2]] = [line[i] for i in network_uniprot_cols]
        dico_GeneID_to_UniProt[line[0]] = line[2]

    #download UniProt2Reactome.txt file and build the nodes (pathways) dictionary
    download = requests.get("https://reactome.org/download/current/UniProt2Reactome.txt")
    decoded_content = download.content.decode('utf-8')
    tab_file = csv.reader(decoded_content.splitlines(), delimiter='\t')
    # (original initialized dico_nodes twice; once is enough)
    dico_nodes = {}
    uniProt_index = 0
    pathway_description_index = 3
    species_index = 5
    for line in tab_file:
        if line[species_index] == species_name:
            dico_nodes.setdefault(line[uniProt_index], []).append(line[pathway_description_index])

    return {'network': dico_network, 'nodes': dico_nodes, 'convert': dico_GeneID_to_UniProt}
376 | |
303 | 377 |
304 ####################################################################################################### | 378 ####################################################################################################### |
305 # Main function | 379 # Main function |
306 ####################################################################################################### | 380 ####################################################################################################### |
307 def main(): | 381 def main(): |
308 parser = argparse.ArgumentParser() | 382 parser = argparse.ArgumentParser() |
309 parser.add_argument("--hpa", metavar = ("HPA_OPTION")) | 383 parser.add_argument("--hpa", metavar = ("HPA_OPTION")) |
310 parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID")) | 384 parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID")) |
311 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES")) | 385 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES")) |
386 parser.add_argument("--interactome", metavar = ("PPI")) | |
387 parser.add_argument("--species") | |
312 parser.add_argument("-o", "--output") | 388 parser.add_argument("-o", "--output") |
313 args = parser.parse_args() | 389 args = parser.parse_args() |
314 | 390 |
315 data_manager_dict = {} | 391 data_manager_dict = {} |
316 # Extract json file params | 392 # Extract json file params |
348 id_mapping = None | 424 id_mapping = None |
349 if id_mapping is not None: | 425 if id_mapping is not None: |
350 id_mapping = id_mapping .split(",") | 426 id_mapping = id_mapping .split(",") |
351 for species in id_mapping : | 427 for species in id_mapping : |
352 id_mapping_sources(data_manager_dict, species, target_directory) | 428 id_mapping_sources(data_manager_dict, species, target_directory) |
429 | |
430 ## Download PPI ref files from biogrid/bioplex/humap | |
431 try: | |
432 interactome=args.interactome | |
433 species=args.species | |
434 except NameError: | |
435 interactome=None | |
436 species=None | |
437 if interactome is not None and species is not None: | |
438 PPI_ref_files(data_manager_dict, species, interactome, target_directory) | |
353 | 439 |
354 #save info to json file | 440 #save info to json file |
355 filename = args.output | 441 filename = args.output |
356 open(filename, 'wb').write(to_json_string(data_manager_dict)) | 442 open(filename, 'wb').write(to_json_string(data_manager_dict)) |
357 | 443 |