data_manager_proteore: data_manager/resource

comparison data_manager/resource_building.py @ 13:a1530507fee4 draft

planemo upload commit 08d8f131da0e66113519ffaa7f7e7632cb3d1eff-dirty

author	dchristiany
date	Fri, 04 Jan 2019 04:21:42 -0500
parents	60cb0a5ae661
children	83f57ba70416

comparison

equal deleted inserted replaced

-:60cb0a5ae661
+:a1530507fee4
 #######################################################################################################
 # 3. ID mapping file
 #######################################################################################################
-import ftplib, gzip
+import ftplib, gzip, pickle
 csv.field_size_limit(sys.maxsize) # to handle big files
 def id_mapping_sources (data_manager_dict, species, target_directory) :
 human = species == "human"
 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" }
 files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
 #header
-if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
+if human :
-else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]
+else :
+ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]
+tab = [ids_list]
 #print("header ok")
 #selected.tab and keep only ids of interest
 selected_tab_file=species_dict[species]+"_"+files[0]
 uniprotID=line[0]
 nextprotID=line[13]
 if nextprotID == '' and uniprotID in next_dict :
 line[13]=next_dict[uniprotID]
-output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
+#create empty dictionary and dictionary index
+ids_dictionary, ids_dictionary_index = create_ids_dictionary(ids_list)
+#fill dictionary and sub dictionaries with ids
+for line in tab[1:] :
+for index, ids in enumerate(line) :
+other_id_type_index = [accession_id for accession_id in ids_dictionary_index.keys() if accession_id!=index]
+for id in ids.replace(" ","").split(";") :       #if there's more than one id, one key per id (example : GO)
+if id not in ids_dictionary[ids_dictionary_index[index]] :      #if the key is not created yet
+ids_dictionary[ids_dictionary_index[index]][id]={}
+for other_id_type in other_id_type_index :
+if ids_dictionary_index[other_id_type] not in ids_dictionary[ids_dictionary_index[index]][id] :
+ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";"))
+else :
+ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";"))
+if len(ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] :
+ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]].remove('')
+##writing output files
+output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".pickle"
 path = os.path.join(target_directory,output_file)
-with open(path,"w") as out :
+#save ids_dictionary
-w = csv.writer(out,delimiter='\t')
+with open(output_dict, 'wb') as handle:
-w.writerows(tab)
+pickle.dump(ids_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)
 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"}
 name = name_dict[species]+" "+time.strftime("%d/%m/%Y")
 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
 data_table_entry = dict(id=id, name = name, value = species, path = path)
-_add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping")
+_add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_dictionaries")
 def download_from_uniprot_ftp(file,target_directory) :
 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
 path = os.path.join(target_directory, file)
 ftp = ftplib.FTP("ftp.uniprot.org")
 else :
 return (tmp[1])
 else :
 return (next_id)
+#create empty dictionary with index for tab
def create_ids_dictionary(ids_list):
    """Build the empty per-ID-type dictionaries and the position lookup.

    ids_list -- ordered list of ID type names.

    Returns a tuple (ids_dictionary, ids_dictionary_index):
      ids_dictionary        maps each ID type name to its own new empty dict
      ids_dictionary_index  maps each position in ids_list to the ID type
                            name found at that position
    """
    # A fresh {} per ID type: the sub-dictionaries must not share one object,
    # otherwise filling one ID type would fill all of them.
    ids_dictionary = {id_type: {} for id_type in ids_list}
    # position -> ID type name, in the order given by ids_list
    ids_dictionary_index = dict(enumerate(ids_list))
    return ids_dictionary, ids_dictionary_index
 #######################################################################################################
 # Main function
 #######################################################################################################
 def main():

Mercurial > repos > dchristiany > data_manager_proteore

comparison data_manager/resource_building.py @ 13:a1530507fee4 draft