comparison data_manager/resource_building.py @ 15:83f57ba70416 draft

planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
author dchristiany
date Tue, 15 Jan 2019 04:29:28 -0500
parents a1530507fee4
children 454c2e2984ea
comparison
equal deleted inserted replaced
14:f8ed6fc5f3ae 15:83f57ba70416
1 """ 1 """
2 The purpose of this script is to create source files from different databases to be used in other proteore tools 2 The purpose of this script is to create source files from different databases to be used in other proteore tools
3 """ 3 """
4 4
5 import os, sys, argparse, requests, time, csv, re 5 import os, sys, argparse, requests, time, csv, re, json, zipfile, shutil
6 from io import BytesIO 6 from io import BytesIO
7 from zipfile import ZipFile 7 from zipfile import ZipFile
8 from galaxy.util.json import from_json_string, to_json_string 8 from galaxy.util.json import from_json_string, to_json_string
9 9
10 ####################################################################################################### 10 #######################################################################################################
115 115
116 116
117 ####################################################################################################### 117 #######################################################################################################
118 # 3. ID mapping file 118 # 3. ID mapping file
119 ####################################################################################################### 119 #######################################################################################################
120 import ftplib, gzip, pickle 120 import ftplib, gzip
121 csv.field_size_limit(sys.maxsize) # to handle big files 121 csv.field_size_limit(sys.maxsize) # to handle big files
122 122
123 def id_mapping_sources (data_manager_dict, species, target_directory) : 123 def id_mapping_sources (data_manager_dict, species, target_directory) :
124 124
125 human = species == "human" 125 human = species == "human"
126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" } 126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" }
127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] 127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
128 128
129 #header 129 #header
130 if human : 130 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
131 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"] 131 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
132 else :
133 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]
134 tab = [ids_list]
135 132
136 #print("header ok") 133 #print("header ok")
137 134
138 #selected.tab and keep only ids of interest 135 #get selected.tab and keep only ids of interest
139 selected_tab_file=species_dict[species]+"_"+files[0] 136 selected_tab_file=species_dict[species]+"_"+files[0]
140 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) 137 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
141 with gzip.open(tab_path,"rt") as select : 138 with gzip.open(tab_path,"rt") as select :
142 tab_reader = csv.reader(select,delimiter="\t") 139 tab_reader = csv.reader(select,delimiter="\t")
143 for line in tab_reader : 140 for line in tab_reader :
149 """ 146 """
150 Supplementary ID to get from HUMAN_9606_idmapping.dat : 147 Supplementary ID to get from HUMAN_9606_idmapping.dat :
151 -NextProt,BioGrid,STRING,KEGG 148 -NextProt,BioGrid,STRING,KEGG
152 """ 149 """
153 150
151 #there's more id type for human
154 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file 152 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file
155 else : ids = ['BioGrid','STRING','KEGG' ] 153 else : ids = ['BioGrid','STRING','KEGG' ]
156 unidict = {} 154 unidict = {}
157 155
158 #keep only ids of interest in dictionaries 156 #keep only ids of interest in dictionaries
210 uniprotID=line[0] 208 uniprotID=line[0]
211 nextprotID=line[13] 209 nextprotID=line[13]
212 if nextprotID == '' and uniprotID in next_dict : 210 if nextprotID == '' and uniprotID in next_dict :
213 line[13]=next_dict[uniprotID] 211 line[13]=next_dict[uniprotID]
214 212
215 #create empty dictionary and dictionary index 213 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
216 ids_dictionary, ids_dictionary_index = create_ids_dictionary(ids_list)
217
218 #fill dictionary and sub dictionaries with ids
219 for line in tab[1:] :
220 for index, ids in enumerate(line) :
221 other_id_type_index = [accession_id for accession_id in ids_dictionary_index.keys() if accession_id!=index]
222 for id in ids.replace(" ","").split(";") : #if there's more than one id, one key per id (example : GO)
223 if id not in ids_dictionary[ids_dictionary_index[index]] : #if the key is not created yet
224 ids_dictionary[ids_dictionary_index[index]][id]={}
225 for other_id_type in other_id_type_index :
226 if ids_dictionary_index[other_id_type] not in ids_dictionary[ids_dictionary_index[index]][id] :
227 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";"))
228 else :
229 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";"))
230 if len(ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] :
231 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]].remove('')
232
233 ##writing output files
234 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".pickle"
235 path = os.path.join(target_directory,output_file) 214 path = os.path.join(target_directory,output_file)
236 215
237 #save ids_dictionary 216 with open(path,"w") as out :
238 with open(output_dict, 'wb') as handle: 217 w = csv.writer(out,delimiter='\t')
239 pickle.dump(ids_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL) 218 w.writerows(tab)
240 219
241 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} 220 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"}
242 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") 221 name = name_dict[species]+" "+time.strftime("%d/%m/%Y")
243 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") 222 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
244 223
245 data_table_entry = dict(id=id, name = name, value = species, path = path) 224 data_table_entry = dict(id=id, name = name, value = species, path = path)
246 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_dictionaries") 225 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping")
247 226
248 def download_from_uniprot_ftp(file,target_directory) : 227 def download_from_uniprot_ftp(file,target_directory) :
249 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" 228 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
250 path = os.path.join(target_directory, file) 229 path = os.path.join(target_directory, file)
251 ftp = ftplib.FTP("ftp.uniprot.org") 230 ftp = ftplib.FTP("ftp.uniprot.org")
287 else : 266 else :
288 return (tmp[1]) 267 return (tmp[1])
289 else : 268 else :
290 return (next_id) 269 return (next_id)
291 270
292 #create empty dictionary with index for tab 271
293 def create_ids_dictionary (ids_list) : 272 #######################################################################################################
294 ids_dictionary = {} 273 # 4. Build protein interaction maps files
295 for id_type in ids_list : 274 #######################################################################################################
296 ids_dictionary[id_type]={} 275
def PPI_ref_files(data_manager_dict, species, interactome, target_directory):
    """
    Download a protein-protein interaction reference (BioGRID or BioPlex),
    build the network/nodes dictionaries, dump them as JSON into
    target_directory and register the file in the data manager tables.

    Parameters
    ----------
    data_manager_dict : dict
        Galaxy data-manager accumulator, mutated via _add_data_table_entry.
    species : str
        One of "human", "mouse", "rat".
    interactome : str
        "biogrid" or "bioplex" (any other value raises ValueError).
    target_directory : str
        Directory where the JSON output file is written.

    Raises
    ------
    ValueError
        If interactome is not a supported source.
    """
    species_dict = {'human': 'Homo sapiens', "mouse": "Mus musculus", "rat": "Rattus norvegicus"}

    ## BioGRID
    if interactome == "biogrid":

        tab2_link = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"

        # download zip file
        r = requests.get(tab2_link)
        with open("BioGRID.zip", "wb") as code:
            code.write(r.content)

        # unzip files
        with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
            if not os.path.exists("tmp_BioGRID"):
                os.makedirs("tmp_BioGRID")
            zip_ref.extractall("tmp_BioGRID")

        # import file of interest and build dictionary keyed on GeneID
        file_path = "tmp_BioGRID/BIOGRID-ORGANISM-" + species_dict[species].replace(" ", "_") + "-3.5.167.tab2.txt"
        with open(file_path, "r") as handle:
            tab_file = csv.reader(handle, delimiter="\t")
            dico_network = {}
            GeneID_index = 1
            # columns kept: interactor GeneIDs, official symbols, pubmed, throughput, scores
            network_cols = [1, 2, 7, 8, 11, 12, 18, 20]
            for line in tab_file:
                dico_network[line[GeneID_index]] = [line[i] for i in network_cols]

        # clean up the downloaded archive and the tmp_BioGRID directory
        os.remove("BioGRID.zip")
        shutil.rmtree("tmp_BioGRID", ignore_errors=True)

        # download NCBI2Reactome.txt file and build GeneID -> pathways dictionary
        download = requests.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
        decoded_content = download.content.decode('utf-8')
        tab_file = csv.reader(decoded_content.splitlines(), delimiter='\t')
        dico_nodes = {}
        GeneID_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                # a GeneID can map to several pathways: accumulate descriptions
                if line[GeneID_index] in dico_nodes:
                    dico_nodes[line[GeneID_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[GeneID_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes

    ## BioPlex
    elif interactome == "bioplex":

        download = requests.get("http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv")
        decoded_content = download.content.decode('utf-8')
        bioplex = csv.reader(decoded_content.splitlines(), delimiter='\t')
        # two parallel networks: one keyed on GeneID, one on UniProt-AC
        dico_network = {}
        dico_network["GeneID"] = {}
        network_geneid_cols = [0, 1, 4, 5, 8]
        dico_network["UniProt-AC"] = {}
        network_uniprot_cols = [2, 3, 4, 5, 8]
        dico_GeneID_to_UniProt = {}
        for line in bioplex:
            dico_network["GeneID"][line[0]] = [line[i] for i in network_geneid_cols]
            dico_network["UniProt-AC"][line[2]] = [line[i] for i in network_uniprot_cols]
            dico_GeneID_to_UniProt[line[0]] = line[2]

        # download UniProt2Reactome.txt and build UniProt-AC -> pathways dictionary
        download = requests.get("https://reactome.org/download/current/UniProt2Reactome.txt")
        decoded_content = download.content.decode('utf-8')
        tab_file = csv.reader(decoded_content.splitlines(), delimiter='\t')
        dico_nodes = {}
        uniProt_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[uniProt_index] in dico_nodes:
                    dico_nodes[line[uniProt_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[uniProt_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
        dico['convert'] = dico_GeneID_to_UniProt

    else:
        # fail early with a clear message instead of a NameError on 'dico' below
        raise ValueError("unsupported interactome source: " + str(interactome))

    # writing output
    output_file = species + '_' + interactome + '_dict_' + time.strftime("%d-%m-%Y") + ".json"
    path = os.path.join(target_directory, output_file)
    name = species + " (" + species_dict[species] + ") " + time.strftime("%d/%m/%Y")
    entry_id = interactome + "_" + species + time.strftime("%d-%m-%Y")

    with open(path, 'w') as handle:
        json.dump(dico, handle, sort_keys=True)

    data_table_entry = dict(id=entry_id, name=name, value=species, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_" + interactome + "_dictionaries")
303 377
304 ####################################################################################################### 378 #######################################################################################################
305 # Main function 379 # Main function
306 ####################################################################################################### 380 #######################################################################################################
307 def main(): 381 def main():
308 parser = argparse.ArgumentParser() 382 parser = argparse.ArgumentParser()
309 parser.add_argument("--hpa", metavar = ("HPA_OPTION")) 383 parser.add_argument("--hpa", metavar = ("HPA_OPTION"))
310 parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID")) 384 parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
311 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES")) 385 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES"))
386 parser.add_argument("--interactome", metavar = ("PPI"))
387 parser.add_argument("--species")
312 parser.add_argument("-o", "--output") 388 parser.add_argument("-o", "--output")
313 args = parser.parse_args() 389 args = parser.parse_args()
314 390
315 data_manager_dict = {} 391 data_manager_dict = {}
316 # Extract json file params 392 # Extract json file params
348 id_mapping = None 424 id_mapping = None
349 if id_mapping is not None: 425 if id_mapping is not None:
350 id_mapping = id_mapping .split(",") 426 id_mapping = id_mapping .split(",")
351 for species in id_mapping : 427 for species in id_mapping :
352 id_mapping_sources(data_manager_dict, species, target_directory) 428 id_mapping_sources(data_manager_dict, species, target_directory)
429
430 ## Download PPI ref files from biogrid/bioplex/humap
431 try:
432 interactome=args.interactome
433 species=args.species
434 except NameError:
435 interactome=None
436 species=None
437 if interactome is not None and species is not None:
438 PPI_ref_files(data_manager_dict, species, interactome, target_directory)
353 439
354 #save info to json file 440 #save info to json file
355 filename = args.output 441 filename = args.output
356 open(filename, 'wb').write(to_json_string(data_manager_dict)) 442 open(filename, 'wb').write(to_json_string(data_manager_dict))
357 443