Mercurial > repos > dchristiany > data_manager_proteore

--- a/data_manager/resource_building.py	Tue Oct 23 08:18:32 2018 -0400
+++ b/data_manager/resource_building.py	Fri Jan 04 04:21:42 2019 -0500
@@ -117,7 +117,7 @@
 #######################################################################################################
 # 3. ID mapping file
 #######################################################################################################
-import ftplib, gzip
+import ftplib, gzip, pickle
 csv.field_size_limit(sys.maxsize) # to handle big files

 def id_mapping_sources (data_manager_dict, species, target_directory) :
@@ -127,8 +127,11 @@
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]

     #header
-    if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
-    else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+    if human :
+        ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]
+    else :
+        ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]
+    tab = [ids_list]

     #print("header ok")

@@ -209,19 +212,38 @@
             if nextprotID == '' and uniprotID in next_dict :
                 line[13]=next_dict[uniprotID]

-    output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
+    #create empty dictionary and dictionary index
+    ids_dictionary, ids_dictionary_index = create_ids_dictionary(ids_list)
+
+    #fill dictionary and sub dictionaries with ids
+    for line in tab[1:] :
+        for index, ids in enumerate(line) :
+            other_id_type_index = [accession_id for accession_id in ids_dictionary_index.keys() if accession_id!=index]
+            for id in ids.replace(" ","").split(";") :       #if there's more than one id, one key per id (example : GO)
+                if id not in ids_dictionary[ids_dictionary_index[index]] :      #if the key is not created yet
+                    ids_dictionary[ids_dictionary_index[index]][id]={}
+                for other_id_type in other_id_type_index :
+                    if ids_dictionary_index[other_id_type] not in ids_dictionary[ids_dictionary_index[index]][id] :
+                        ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";"))
+                    else :
+                        ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";"))
+                    if len(ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] :
+                        ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]].remove('')
+
+    ##writing output files
+    output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".pickle"
     path = os.path.join(target_directory,output_file)

-    with open(path,"w") as out :
-        w = csv.writer(out,delimiter='\t')
-        w.writerows(tab)
+    #save ids_dictionary as a pickle at `path`, the same location registered in data_table_entry below (output_dict was never defined)
+    with open(path, 'wb') as handle:
+        pickle.dump(ids_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

     name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"}
     name = name_dict[species]+" "+time.strftime("%d/%m/%Y")
     id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")

     data_table_entry = dict(id=id, name = name, value = species, path = path)
-    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping")
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_dictionaries")

 def download_from_uniprot_ftp(file,target_directory) :
     ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
@@ -267,6 +289,17 @@
     else :
         return (next_id)

+#create_ids_dictionary(ids_list) returns (ids_dictionary, ids_dictionary_index): one empty sub-dictionary per id type, plus a column index -> id type lookup for the rows of tab
+def create_ids_dictionary (ids_list) :
+    ids_dictionary = {}
+    for id_type in ids_list :
+        ids_dictionary[id_type]={}
+    ids_dictionary_index = {}
+    #map each position in ids_list (i.e. the column index in a row of tab) to its id type name
+    for i,id in enumerate(ids_list) :
+        ids_dictionary_index[i]=id
+
+    return(ids_dictionary,ids_dictionary_index)

 #######################################################################################################
 # Main function
--- a/data_manager/resource_building.xml	Tue Oct 23 08:18:32 2018 -0400
+++ b/data_manager/resource_building.xml	Fri Jan 04 04:21:42 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2018.10.23.3" tool_type="manage_data">
+<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.01.04" tool_type="manage_data">
 <description>
 to create or update reference files for proteore tools
 </description>
--- a/data_manager_conf.xml	Tue Oct 23 08:18:32 2018 -0400
+++ b/data_manager_conf.xml	Fri Jan 04 04:21:42 2019 -0500
@@ -38,9 +38,9 @@
                 <column name="path" output_ref="output" >
                     <move type="file">
                         <!--source>${path}</source-->
-                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">id_mapping/</target>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">id_mapping_dictionaries/</target>
                     </move>
-                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${id}.tsv</value_translation>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping_dictionaries/${id}.pickle</value_translation>
                     <value_translation type="function">abspath</value_translation>
                 </column>
             </output>
--- a/tool-data/proteore_id_mapping.loc.sample	Tue Oct 23 08:18:32 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-#This file lists the locations of reference file for id_converter tool
-#<id>	<name>	<value>	<path>
-#human_id_mapping_01-01-2018	Human (homo sapiens)	human_id_mapping	tool-data/human_id_mapping_file.tsv
-#mouse_id_mapping_01-01-2018	Mouse (Mus musculus)	mouse_id_mapping	tool-data/mouse_id_mapping.tsv
-#rat_id_mapping_01-01-2018	Rat (Rattus norvegicus)	rat_id_mapping	tool-data/rat_id_mapping.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_id_mapping_dictionaries.loc.sample	Fri Jan 04 04:21:42 2019 -0500
@@ -0,0 +1,5 @@
+#This file lists the locations of the ID mapping dictionaries (pickle files) for the id_converter tool
+#<id>	<name>	<value>	<path>
+#human_id_mapping_01-01-2018	Human (homo sapiens)	human_id_mapping	tool-data/human_id_mapping_file.pickle
+#mouse_id_mapping_01-01-2018	Mouse (Mus musculus)	mouse_id_mapping	tool-data/mouse_id_mapping.pickle
+#rat_id_mapping_01-01-2018	Rat (Rattus norvegicus)	rat_id_mapping	tool-data/rat_id_mapping.pickle
--- a/tool_data_table_conf.xml.sample	Tue Oct 23 08:18:32 2018 -0400
+++ b/tool_data_table_conf.xml.sample	Fri Jan 04 04:21:42 2019 -0500
@@ -8,8 +8,8 @@
       <columns>id, name, value, path</columns>
       <file path="tool-data/proteore_protein_atlas.loc" />
     </table>
-    <table name="proteore_id_mapping" comment_char="#">
+    <table name="proteore_id_mapping_dictionaries" comment_char="#">
       <columns>id, name, value, path</columns>
-      <file path="tool-data/proteore_id_mapping.loc" />
+      <file path="tool-data/proteore_id_mapping_dictionaries.loc" />
     </table>
 </tables>