comparison data_manager/resource_building.py @ 13:a1530507fee4 draft

planemo upload commit 08d8f131da0e66113519ffaa7f7e7632cb3d1eff-dirty
author dchristiany
date Fri, 04 Jan 2019 04:21:42 -0500
parents 60cb0a5ae661
children 83f57ba70416
comparison
equal deleted inserted replaced
12:60cb0a5ae661 13:a1530507fee4
115 115
116 116
117 ####################################################################################################### 117 #######################################################################################################
118 # 3. ID mapping file 118 # 3. ID mapping file
119 ####################################################################################################### 119 #######################################################################################################
120 import ftplib, gzip 120 import ftplib, gzip, pickle
121 csv.field_size_limit(sys.maxsize) # to handle big files 121 csv.field_size_limit(sys.maxsize) # to handle big files
122 122
123 def id_mapping_sources (data_manager_dict, species, target_directory) : 123 def id_mapping_sources (data_manager_dict, species, target_directory) :
124 124
125 human = species == "human" 125 human = species == "human"
126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" } 126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" }
127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] 127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
128 128
129 #header 129 #header
130 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] 130 if human :
131 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] 131 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]
132 else :
133 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]
134 tab = [ids_list]
132 135
133 #print("header ok") 136 #print("header ok")
134 137
135 #selected.tab and keep only ids of interest 138 #selected.tab and keep only ids of interest
136 selected_tab_file=species_dict[species]+"_"+files[0] 139 selected_tab_file=species_dict[species]+"_"+files[0]
207 uniprotID=line[0] 210 uniprotID=line[0]
208 nextprotID=line[13] 211 nextprotID=line[13]
209 if nextprotID == '' and uniprotID in next_dict : 212 if nextprotID == '' and uniprotID in next_dict :
210 line[13]=next_dict[uniprotID] 213 line[13]=next_dict[uniprotID]
211 214
212 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" 215 #create empty dictionary and dictionary index
216 ids_dictionary, ids_dictionary_index = create_ids_dictionary(ids_list)
217
218 #fill dictionary and sub dictionaries with ids
219 for line in tab[1:] :
220 for index, ids in enumerate(line) :
221 other_id_type_index = [accession_id for accession_id in ids_dictionary_index.keys() if accession_id!=index]
222 for id in ids.replace(" ","").split(";") : #if there's more than one id, one key per id (example : GO)
223 if id not in ids_dictionary[ids_dictionary_index[index]] : #if the key is not created yet
224 ids_dictionary[ids_dictionary_index[index]][id]={}
225 for other_id_type in other_id_type_index :
226 if ids_dictionary_index[other_id_type] not in ids_dictionary[ids_dictionary_index[index]][id] :
227 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";"))
228 else :
229 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";"))
230 if len(ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] :
231 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]].remove('')
232
233 ##writing output files
234 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".pickle"
213 path = os.path.join(target_directory,output_file) 235 path = os.path.join(target_directory,output_file)
214 236
215 with open(path,"w") as out : 237 #save ids_dictionary
216 w = csv.writer(out,delimiter='\t') 238 with open(output_dict, 'wb') as handle:
217 w.writerows(tab) 239 pickle.dump(ids_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)
218 240
219 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} 241 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"}
220 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") 242 name = name_dict[species]+" "+time.strftime("%d/%m/%Y")
221 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") 243 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
222 244
223 data_table_entry = dict(id=id, name = name, value = species, path = path) 245 data_table_entry = dict(id=id, name = name, value = species, path = path)
224 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping") 246 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_dictionaries")
225 247
226 def download_from_uniprot_ftp(file,target_directory) : 248 def download_from_uniprot_ftp(file,target_directory) :
227 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" 249 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
228 path = os.path.join(target_directory, file) 250 path = os.path.join(target_directory, file)
229 ftp = ftplib.FTP("ftp.uniprot.org") 251 ftp = ftplib.FTP("ftp.uniprot.org")
265 else : 287 else :
266 return (tmp[1]) 288 return (tmp[1])
267 else : 289 else :
268 return (next_id) 290 return (next_id)
269 291
#create empty dictionary with index for tab
def create_ids_dictionary (ids_list) :
    """Build the empty ID-mapping dictionary and its column index.

    ids_list : ordered list of ID type names (the header row of the tab).

    Returns a tuple (ids_dictionary, ids_dictionary_index) where
    ids_dictionary maps each ID type name to its own empty dict and
    ids_dictionary_index maps each position in ids_list to the ID type
    name found at that position.
    """
    # one distinct empty sub-dictionary per ID type (never a shared object)
    ids_dictionary = {id_type: {} for id_type in ids_list}
    # position in ids_list -> ID type name
    ids_dictionary_index = dict(enumerate(ids_list))

    return (ids_dictionary, ids_dictionary_index)
270 303
271 ####################################################################################################### 304 #######################################################################################################
272 # Main function 305 # Main function
273 ####################################################################################################### 306 #######################################################################################################
274 def main(): 307 def main():