Mercurial > repos > dchristiany > data_manager_proteore
comparison data_manager/resource_building.py @ 15:83f57ba70416 draft
planemo upload commit 8040003119a3d54866ec6ee9b9f659f2af554817-dirty
author | dchristiany |
---|---|
date | Tue, 15 Jan 2019 04:29:28 -0500 |
parents | a1530507fee4 |
children | 454c2e2984ea |
comparison
equal
deleted
inserted
replaced
14:f8ed6fc5f3ae | 15:83f57ba70416 |
---|---|
1 """ | 1 """ |
2 The purpose of this script is to create source files from different databases to be used in other proteore tools | 2 The purpose of this script is to create source files from different databases to be used in other proteore tools |
3 """ | 3 """ |
4 | 4 |
5 import os, sys, argparse, requests, time, csv, re | 5 import os, sys, argparse, requests, time, csv, re, json, zipfile, shutil |
6 from io import BytesIO | 6 from io import BytesIO |
7 from zipfile import ZipFile | 7 from zipfile import ZipFile |
8 from galaxy.util.json import from_json_string, to_json_string | 8 from galaxy.util.json import from_json_string, to_json_string |
9 | 9 |
10 ####################################################################################################### | 10 ####################################################################################################### |
115 | 115 |
116 | 116 |
117 ####################################################################################################### | 117 ####################################################################################################### |
118 # 3. ID mapping file | 118 # 3. ID mapping file |
119 ####################################################################################################### | 119 ####################################################################################################### |
120 import ftplib, gzip, pickle | 120 import ftplib, gzip |
121 csv.field_size_limit(sys.maxsize) # to handle big files | 121 csv.field_size_limit(sys.maxsize) # to handle big files |
122 | 122 |
123 def id_mapping_sources (data_manager_dict, species, target_directory) : | 123 def id_mapping_sources (data_manager_dict, species, target_directory) : |
124 | 124 |
125 human = species == "human" | 125 human = species == "human" |
126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" } | 126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" } |
127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] | 127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] |
128 | 128 |
129 #header | 129 #header |
130 if human : | 130 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] |
131 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"] | 131 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] |
132 else : | |
133 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"] | |
134 tab = [ids_list] | |
135 | 132 |
136 #print("header ok") | 133 #print("header ok") |
137 | 134 |
138 #selected.tab and keep only ids of interest | 135 #get selected.tab and keep only ids of interest |
139 selected_tab_file=species_dict[species]+"_"+files[0] | 136 selected_tab_file=species_dict[species]+"_"+files[0] |
140 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) | 137 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) |
141 with gzip.open(tab_path,"rt") as select : | 138 with gzip.open(tab_path,"rt") as select : |
142 tab_reader = csv.reader(select,delimiter="\t") | 139 tab_reader = csv.reader(select,delimiter="\t") |
143 for line in tab_reader : | 140 for line in tab_reader : |
149 """ | 146 """ |
150 Supplementary ID to get from HUMAN_9606_idmapping.dat : | 147 Supplementary ID to get from HUMAN_9606_idmapping.dat : |
151 -NextProt,BioGrid,STRING,KEGG | 148 -NextProt,BioGrid,STRING,KEGG |
152 """ | 149 """ |
153 | 150 |
151 #there's more id type for human | |
154 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file | 152 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file |
155 else : ids = ['BioGrid','STRING','KEGG' ] | 153 else : ids = ['BioGrid','STRING','KEGG' ] |
156 unidict = {} | 154 unidict = {} |
157 | 155 |
158 #keep only ids of interest in dictionaries | 156 #keep only ids of interest in dictionaries |
210 uniprotID=line[0] | 208 uniprotID=line[0] |
211 nextprotID=line[13] | 209 nextprotID=line[13] |
212 if nextprotID == '' and uniprotID in next_dict : | 210 if nextprotID == '' and uniprotID in next_dict : |
213 line[13]=next_dict[uniprotID] | 211 line[13]=next_dict[uniprotID] |
214 | 212 |
215 #create empty dictionary and dictionary index | 213 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" |
216 ids_dictionary, ids_dictionary_index = create_ids_dictionary(ids_list) | |
217 | |
218 #fill dictionary and sub dictionaries with ids | |
219 for line in tab[1:] : | |
220 for index, ids in enumerate(line) : | |
221 other_id_type_index = [accession_id for accession_id in ids_dictionary_index.keys() if accession_id!=index] | |
222 for id in ids.replace(" ","").split(";") : #if there's more than one id, one key per id (example : GO) | |
223 if id not in ids_dictionary[ids_dictionary_index[index]] : #if the key is not created yet | |
224 ids_dictionary[ids_dictionary_index[index]][id]={} | |
225 for other_id_type in other_id_type_index : | |
226 if ids_dictionary_index[other_id_type] not in ids_dictionary[ids_dictionary_index[index]][id] : | |
227 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";")) | |
228 else : | |
229 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";")) | |
230 if len(ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] : | |
231 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]].remove('') | |
232 | |
233 ##writing output files | |
234 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".pickle" | |
235 path = os.path.join(target_directory,output_file) | 214 path = os.path.join(target_directory,output_file) |
236 | 215 |
237 #save ids_dictionary | 216 with open(path,"w") as out : |
238 with open(output_dict, 'wb') as handle: | 217 w = csv.writer(out,delimiter='\t') |
239 pickle.dump(ids_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL) | 218 w.writerows(tab) |
240 | 219 |
241 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} | 220 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} |
242 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") | 221 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") |
243 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") | 222 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") |
244 | 223 |
245 data_table_entry = dict(id=id, name = name, value = species, path = path) | 224 data_table_entry = dict(id=id, name = name, value = species, path = path) |
246 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_dictionaries") | 225 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping") |
247 | 226 |
248 def download_from_uniprot_ftp(file,target_directory) : | 227 def download_from_uniprot_ftp(file,target_directory) : |
249 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" | 228 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" |
250 path = os.path.join(target_directory, file) | 229 path = os.path.join(target_directory, file) |
251 ftp = ftplib.FTP("ftp.uniprot.org") | 230 ftp = ftplib.FTP("ftp.uniprot.org") |
287 else : | 266 else : |
288 return (tmp[1]) | 267 return (tmp[1]) |
289 else : | 268 else : |
290 return (next_id) | 269 return (next_id) |
291 | 270 |
292 #create empty dictionary with index for tab | 271 |
293 def create_ids_dictionary (ids_list) : | 272 ####################################################################################################### |
294 ids_dictionary = {} | 273 # 4. Build protein interaction maps files |
295 for id_type in ids_list : | 274 ####################################################################################################### |
296 ids_dictionary[id_type]={} | 275 |
def PPI_ref_files(data_manager_dict, species, interactome, target_directory):
    """
    Build a protein-protein interaction (PPI) reference dictionary for one
    species from one interactome source, write it as a JSON file into
    target_directory and register it in data_manager_dict.

    Parameters:
        data_manager_dict : dict updated in place with the new data table entry.
        species           : one of 'human', 'mouse', 'rat'.
        interactome       : 'biogrid' or 'bioplex'.
        target_directory  : directory receiving the output .json file.

    Raises:
        ValueError : if interactome is not a supported source (the original
                     code fell through both branches and crashed later with
                     a NameError on 'dico').
    """
    species_dict = {'human': 'Homo sapiens', "mouse": "Mus musculus", "rat": "Rattus norvegicus"}

    ##BioGRID
    if interactome == "biogrid":
        dico = _build_biogrid_dico(species_dict[species])
    ##Bioplex
    elif interactome == "bioplex":
        dico = _build_bioplex_dico(species_dict[species])
    else:
        # fail fast with a clear message instead of an undefined-name crash
        raise ValueError("unsupported interactome source: " + str(interactome))

    #writing output
    output_file = species + '_' + interactome + '_dict_' + time.strftime("%d-%m-%Y") + ".json"
    path = os.path.join(target_directory, output_file)
    name = species + " (" + species_dict[species] + ") " + time.strftime("%d/%m/%Y")
    # separator added before the date (the original concatenated
    # species+date directly, yielding ids like 'biogrid_human15-01-2019')
    id = interactome + "_" + species + "_" + time.strftime("%d-%m-%Y")

    with open(path, 'w') as handle:
        json.dump(dico, handle, sort_keys=True)

    data_table_entry = dict(id=id, name=name, value=species, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_" + interactome + "_dictionaries")

def _build_biogrid_dico(species_name):
    """
    Download the BioGRID 3.5.167 tab2 archive and Reactome's NCBI2Reactome
    mapping, and return {'network': ..., 'nodes': ...} keyed by GeneID.
    """
    tab2_link = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"

    #download zip file
    r = requests.get(tab2_link)
    with open("BioGRID.zip", "wb") as code:
        code.write(r.content)

    #unzip files
    with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
        if not os.path.exists("tmp_BioGRID"):
            os.makedirs("tmp_BioGRID")
        zip_ref.extractall("tmp_BioGRID")

    #import file of interest and build the network dictionary
    # NOTE(review): keyed on interactor-A GeneID only, so a gene involved in
    # several interactions keeps only the last line read -- confirm this is
    # what the downstream tool expects
    file_path = "tmp_BioGRID/BIOGRID-ORGANISM-" + species_name.replace(" ", "_") + "-3.5.167.tab2.txt"
    with open(file_path, "r") as handle:
        tab_file = csv.reader(handle, delimiter="\t")
        dico_network = {}
        GeneID_index = 1
        network_cols = [1, 2, 7, 8, 11, 12, 18, 20]
        for line in tab_file:
            dico_network[line[GeneID_index]] = [line[i] for i in network_cols]

    #delete temporary archive and tmp_BioGRID directory
    os.remove("BioGRID.zip")
    shutil.rmtree("tmp_BioGRID", ignore_errors=True)

    #download NCBI2Reactome.txt file and build the nodes (pathways) dictionary
    download = requests.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
    decoded_content = download.content.decode('utf-8')
    tab_file = csv.reader(decoded_content.splitlines(), delimiter='\t')
    dico_nodes = {}
    GeneID_index = 0
    pathway_description_index = 3
    species_index = 5
    for line in tab_file:
        if line[species_index] == species_name:
            # one GeneID can belong to several pathways
            dico_nodes.setdefault(line[GeneID_index], []).append(line[pathway_description_index])

    return {'network': dico_network, 'nodes': dico_nodes}

def _build_bioplex_dico(species_name):
    """
    Download the BioPlex v4a interaction list and Reactome's UniProt2Reactome
    mapping, and return {'network': ..., 'nodes': ..., 'convert': ...} where
    'convert' maps GeneID -> UniProt-AC.
    """
    download = requests.get("http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv")
    decoded_content = download.content.decode('utf-8')
    bioplex = csv.reader(decoded_content.splitlines(), delimiter='\t')
    # two parallel network views: one keyed by GeneID, one by UniProt-AC
    dico_network = {"GeneID": {}, "UniProt-AC": {}}
    network_geneid_cols = [0, 1, 4, 5, 8]
    network_uniprot_cols = [2, 3, 4, 5, 8]
    dico_GeneID_to_UniProt = {}
    for line in bioplex:
        dico_network["GeneID"][line[0]] = [line[i] for i in network_geneid_cols]
        dico_network["UniProt-AC"][line[2]] = [line[i] for i in network_uniprot_cols]
        dico_GeneID_to_UniProt[line[0]] = line[2]

    #download UniProt2Reactome.txt file and build the nodes (pathways) dictionary
    download = requests.get("https://reactome.org/download/current/UniProt2Reactome.txt")
    decoded_content = download.content.decode('utf-8')
    tab_file = csv.reader(decoded_content.splitlines(), delimiter='\t')
    # (original initialized dico_nodes twice; once is enough)
    dico_nodes = {}
    uniProt_index = 0
    pathway_description_index = 3
    species_index = 5
    for line in tab_file:
        if line[species_index] == species_name:
            dico_nodes.setdefault(line[uniProt_index], []).append(line[pathway_description_index])

    return {'network': dico_network, 'nodes': dico_nodes, 'convert': dico_GeneID_to_UniProt}
376 | |
303 | 377 |
304 ####################################################################################################### | 378 ####################################################################################################### |
305 # Main function | 379 # Main function |
306 ####################################################################################################### | 380 ####################################################################################################### |
307 def main(): | 381 def main(): |
308 parser = argparse.ArgumentParser() | 382 parser = argparse.ArgumentParser() |
309 parser.add_argument("--hpa", metavar = ("HPA_OPTION")) | 383 parser.add_argument("--hpa", metavar = ("HPA_OPTION")) |
310 parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID")) | 384 parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID")) |
311 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES")) | 385 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES")) |
386 parser.add_argument("--interactome", metavar = ("PPI")) | |
387 parser.add_argument("--species") | |
312 parser.add_argument("-o", "--output") | 388 parser.add_argument("-o", "--output") |
313 args = parser.parse_args() | 389 args = parser.parse_args() |
314 | 390 |
315 data_manager_dict = {} | 391 data_manager_dict = {} |
316 # Extract json file params | 392 # Extract json file params |
348 id_mapping = None | 424 id_mapping = None |
349 if id_mapping is not None: | 425 if id_mapping is not None: |
350 id_mapping = id_mapping .split(",") | 426 id_mapping = id_mapping .split(",") |
351 for species in id_mapping : | 427 for species in id_mapping : |
352 id_mapping_sources(data_manager_dict, species, target_directory) | 428 id_mapping_sources(data_manager_dict, species, target_directory) |
429 | |
430 ## Download PPI ref files from biogrid/bioplex/humap | |
431 try: | |
432 interactome=args.interactome | |
433 species=args.species | |
434 except NameError: | |
435 interactome=None | |
436 species=None | |
437 if interactome is not None and species is not None: | |
438 PPI_ref_files(data_manager_dict, species, interactome, target_directory) | |
353 | 439 |
354 #save info to json file | 440 #save info to json file |
355 filename = args.output | 441 filename = args.output |
356 open(filename, 'wb').write(to_json_string(data_manager_dict)) | 442 open(filename, 'wb').write(to_json_string(data_manager_dict)) |
357 | 443 |