diff data_manager/resource_building.py @ 13:a1530507fee4 draft

planemo upload commit 08d8f131da0e66113519ffaa7f7e7632cb3d1eff-dirty
author dchristiany
date Fri, 04 Jan 2019 04:21:42 -0500
parents 60cb0a5ae661
children 83f57ba70416
line wrap: on
line diff
--- a/data_manager/resource_building.py	Tue Oct 23 08:18:32 2018 -0400
+++ b/data_manager/resource_building.py	Fri Jan 04 04:21:42 2019 -0500
@@ -117,7 +117,7 @@
 #######################################################################################################
 # 3. ID mapping file
 #######################################################################################################
-import ftplib, gzip
+import ftplib, gzip, pickle
 csv.field_size_limit(sys.maxsize) # to handle big files
 
 def id_mapping_sources (data_manager_dict, species, target_directory) :
@@ -127,8 +127,11 @@
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
 
     #header
-    if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
-    else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+    if human : 
+        ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]
+    else : 
+        ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"] 
+    tab = [ids_list] 
 
     #print("header ok")
 
@@ -209,19 +212,38 @@
             if nextprotID == '' and uniprotID in next_dict :
                 line[13]=next_dict[uniprotID]
 
-    output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
+    #create empty dictionary and dictionary index
+    ids_dictionary, ids_dictionary_index = create_ids_dictionary(ids_list)
+
+    #fill dictionary and sub dictionaries with ids
+    for line in tab[1:] :
+        for index, ids in enumerate(line) :
+            other_id_type_index = [accession_id for accession_id in ids_dictionary_index.keys() if accession_id!=index]
+            for id in ids.replace(" ","").split(";") :       #if there's more than one id, one key per id (example : GO)
+                if id not in ids_dictionary[ids_dictionary_index[index]] :      #if the key is not created yet
+                    ids_dictionary[ids_dictionary_index[index]][id]={}
+                for other_id_type in other_id_type_index :
+                    if ids_dictionary_index[other_id_type] not in ids_dictionary[ids_dictionary_index[index]][id] :
+                        ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";"))
+                    else :
+                        ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";"))
+                    if len(ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] : 
+                        ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]].remove('')
+
+    ##writing output files
+    output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".pickle"
     path = os.path.join(target_directory,output_file)
 
-    with open(path,"w") as out :
-        w = csv.writer(out,delimiter='\t')
-        w.writerows(tab)
+    #save ids_dictionary
+    with open(output_dict, 'wb') as handle:
+        pickle.dump(ids_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
     name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"}
     name = name_dict[species]+" "+time.strftime("%d/%m/%Y")
     id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
 
     data_table_entry = dict(id=id, name = name, value = species, path = path)
-    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping")
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_dictionaries")
 
 def download_from_uniprot_ftp(file,target_directory) :
     ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
@@ -267,6 +289,17 @@
     else :
         return (next_id)
 
+#create one empty dictionary per id type, plus an index mapping each tab column position to its id type
+def create_ids_dictionary (ids_list) :
+    """Build the empty per-id-type dictionary and its position index.
+
+    Returns a tuple (ids_dictionary, ids_dictionary_index): the first maps
+    each id type of ids_list to a fresh empty dict, the second maps each
+    position in ids_list to the id type found at that position.
+    """
+    empty_by_type = {id_type: {} for id_type in ids_list}
+    type_by_position = dict(enumerate(ids_list))
+    return (empty_by_type, type_by_position)
 
 #######################################################################################################
 # Main function