Mercurial > repos > dchristiany > data_manager_proteore
comparison data_manager/resource_building.py @ 13:a1530507fee4 draft
planemo upload commit 08d8f131da0e66113519ffaa7f7e7632cb3d1eff-dirty
author | dchristiany |
---|---|
date | Fri, 04 Jan 2019 04:21:42 -0500 |
parents | 60cb0a5ae661 |
children | 83f57ba70416 |
comparison
equal
deleted
inserted
replaced
12:60cb0a5ae661 | 13:a1530507fee4 |
---|---|
115 | 115 |
116 | 116 |
117 ####################################################################################################### | 117 ####################################################################################################### |
118 # 3. ID mapping file | 118 # 3. ID mapping file |
119 ####################################################################################################### | 119 ####################################################################################################### |
120 import ftplib, gzip | 120 import ftplib, gzip, pickle |
121 csv.field_size_limit(sys.maxsize) # to handle big files | 121 csv.field_size_limit(sys.maxsize) # to handle big files |
122 | 122 |
123 def id_mapping_sources (data_manager_dict, species, target_directory) : | 123 def id_mapping_sources (data_manager_dict, species, target_directory) : |
124 | 124 |
125 human = species == "human" | 125 human = species == "human" |
126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" } | 126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" } |
127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] | 127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] |
128 | 128 |
129 #header | 129 #header |
130 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] | 130 if human : |
131 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] | 131 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"] |
132 else : | |
133 ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"] | |
134 tab = [ids_list] | |
132 | 135 |
133 #print("header ok") | 136 #print("header ok") |
134 | 137 |
135 #selected.tab and keep only ids of interest | 138 #selected.tab and keep only ids of interest |
136 selected_tab_file=species_dict[species]+"_"+files[0] | 139 selected_tab_file=species_dict[species]+"_"+files[0] |
207 uniprotID=line[0] | 210 uniprotID=line[0] |
208 nextprotID=line[13] | 211 nextprotID=line[13] |
209 if nextprotID == '' and uniprotID in next_dict : | 212 if nextprotID == '' and uniprotID in next_dict : |
210 line[13]=next_dict[uniprotID] | 213 line[13]=next_dict[uniprotID] |
211 | 214 |
212 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" | 215 #create empty dictionary and dictionary index |
216 ids_dictionary, ids_dictionary_index = create_ids_dictionary(ids_list) | |
217 | |
218 #fill dictionary and sub dictionaries with ids | |
219 for line in tab[1:] : | |
220 for index, ids in enumerate(line) : | |
221 other_id_type_index = [accession_id for accession_id in ids_dictionary_index.keys() if accession_id!=index] | |
222 for id in ids.replace(" ","").split(";") : #if there's more than one id, one key per id (example : GO) | |
223 if id not in ids_dictionary[ids_dictionary_index[index]] : #if the key is not created yet | |
224 ids_dictionary[ids_dictionary_index[index]][id]={} | |
225 for other_id_type in other_id_type_index : | |
226 if ids_dictionary_index[other_id_type] not in ids_dictionary[ids_dictionary_index[index]][id] : | |
227 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";")) | |
228 else : | |
229 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";")) | |
230 if len(ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] : | |
231 ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]].remove('') | |
232 | |
233 ##writing output files | |
234 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".pickle" | |
213 path = os.path.join(target_directory,output_file) | 235 path = os.path.join(target_directory,output_file) |
214 | 236 |
215 with open(path,"w") as out : | 237 #save ids_dictionary |
216 w = csv.writer(out,delimiter='\t') | 238 with open(output_dict, 'wb') as handle: |
217 w.writerows(tab) | 239 pickle.dump(ids_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL) |
218 | 240 |
219 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} | 241 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} |
220 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") | 242 name = name_dict[species]+" "+time.strftime("%d/%m/%Y") |
221 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") | 243 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") |
222 | 244 |
223 data_table_entry = dict(id=id, name = name, value = species, path = path) | 245 data_table_entry = dict(id=id, name = name, value = species, path = path) |
224 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping") | 246 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_dictionaries") |
225 | 247 |
226 def download_from_uniprot_ftp(file,target_directory) : | 248 def download_from_uniprot_ftp(file,target_directory) : |
227 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" | 249 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" |
228 path = os.path.join(target_directory, file) | 250 path = os.path.join(target_directory, file) |
229 ftp = ftplib.FTP("ftp.uniprot.org") | 251 ftp = ftplib.FTP("ftp.uniprot.org") |
265 else : | 287 else : |
266 return (tmp[1]) | 288 return (tmp[1]) |
267 else : | 289 else : |
268 return (next_id) | 290 return (next_id) |
269 | 291 |
292 #create empty dictionary with index for tab | |
293 def create_ids_dictionary (ids_list) : | |
294 ids_dictionary = {} | |
295 for id_type in ids_list : | |
296 ids_dictionary[id_type]={} | |
297 ids_dictionary_index = {} | |
298 | |
299 for i,id in enumerate(ids_list) : | |
300 ids_dictionary_index[i]=id | |
301 | |
302 return(ids_dictionary,ids_dictionary_index) | |
270 | 303 |
271 ####################################################################################################### | 304 ####################################################################################################### |
272 # Main function | 305 # Main function |
273 ####################################################################################################### | 306 ####################################################################################################### |
274 def main(): | 307 def main(): |