Mercurial > repos > dchristiany > data_manager_proteore
changeset 13:a1530507fee4 draft
planemo upload commit 08d8f131da0e66113519ffaa7f7e7632cb3d1eff-dirty
author | dchristiany |
---|---|
date | Fri, 04 Jan 2019 04:21:42 -0500 |
parents | 60cb0a5ae661 |
children | f8ed6fc5f3ae |
files | data_manager/resource_building.py data_manager/resource_building.xml data_manager_conf.xml tool-data/proteore_id_mapping.loc.sample tool-data/proteore_id_mapping_dictionaries.loc.sample tool_data_table_conf.xml.sample |
diffstat | 6 files changed, 51 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/resource_building.py Tue Oct 23 08:18:32 2018 -0400 +++ b/data_manager/resource_building.py Fri Jan 04 04:21:42 2019 -0500 @@ -117,7 +117,7 @@ ####################################################################################################### # 3. ID mapping file ####################################################################################################### -import ftplib, gzip +import ftplib, gzip, pickle csv.field_size_limit(sys.maxsize) # to handle big files def id_mapping_sources (data_manager_dict, species, target_directory) : @@ -127,8 +127,11 @@ files=["idmapping_selected.tab.gz","idmapping.dat.gz"] #header - if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] - else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] + if human : + ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"] + else : + ids_list = ["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"] + tab = [ids_list] #print("header ok") @@ -209,19 +212,38 @@ if nextprotID == '' and uniprotID in next_dict : line[13]=next_dict[uniprotID] - output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" + #create empty dictionary and dictionary index + ids_dictionary, ids_dictionary_index = create_ids_dictionary(ids_list) + + #fill dictionary and sub dictionaries with ids + for line in tab[1:] : + for index, ids in enumerate(line) : + other_id_type_index = [accession_id for accession_id in ids_dictionary_index.keys() if accession_id!=index] + for id in ids.replace(" ","").split(";") : #if there's more than one id, one key per id (example : GO) + if id not in ids_dictionary[ids_dictionary_index[index]] : #if the key is not created yet + ids_dictionary[ids_dictionary_index[index]][id]={} + for other_id_type in other_id_type_index : + if ids_dictionary_index[other_id_type] not in ids_dictionary[ids_dictionary_index[index]][id] : + ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";")) + else : + ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";")) + if len(ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]] : + ids_dictionary[ids_dictionary_index[index]][id][ids_dictionary_index[other_id_type]].remove('') + + ##writing output files + output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".pickle" path = os.path.join(target_directory,output_file) - with open(path,"w") as out : - w = csv.writer(out,delimiter='\t') - w.writerows(tab) + #save ids_dictionary + with open(output_dict, 'wb') as handle: + pickle.dump(ids_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL) name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"} name = name_dict[species]+" "+time.strftime("%d/%m/%Y") id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") data_table_entry = dict(id=id, name = name, value = species, path = path) - _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping") + _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_dictionaries") def download_from_uniprot_ftp(file,target_directory) : ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" @@ -267,6 +289,17 @@ else : return (next_id) +#create empty dictionary with index for tab +def create_ids_dictionary (ids_list) : + ids_dictionary = {} + for id_type in ids_list : + ids_dictionary[id_type]={} + ids_dictionary_index = {} + + for i,id in enumerate(ids_list) : + ids_dictionary_index[i]=id + + return(ids_dictionary,ids_dictionary_index) ####################################################################################################### # Main function
--- a/data_manager/resource_building.xml Tue Oct 23 08:18:32 2018 -0400 +++ b/data_manager/resource_building.xml Fri Jan 04 04:21:42 2019 -0500 @@ -1,4 +1,4 @@ -<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2018.10.23.3" tool_type="manage_data"> +<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.01.04" tool_type="manage_data"> <description> to create or update reference files for proteore tools </description>
--- a/data_manager_conf.xml Tue Oct 23 08:18:32 2018 -0400 +++ b/data_manager_conf.xml Fri Jan 04 04:21:42 2019 -0500 @@ -38,9 +38,9 @@ <column name="path" output_ref="output" > <move type="file"> <!--source>${path}</source--> - <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">id_mapping/</target> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">id_mapping_dictionaries/</target> </move> - <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${id}.tsv</value_translation> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping_dictionaries/${id}.pickle</value_translation> <value_translation type="function">abspath</value_translation> </column> </output>
--- a/tool-data/proteore_id_mapping.loc.sample Tue Oct 23 08:18:32 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -#This file lists the locations of reference file for id_converter tool -#<id> <name> <value> <path> -#human_id_mapping_01-01-2018 Human (homo sapiens) human_id_mapping tool-data/human_id_mapping_file.tsv -#mouse_id_mapping_01-01-2018 Mouse (Mus musculus) mouse_id_mapping tool-data/mouse_id_mapping.tsv -#rat_id_mapping_01-01-2018 Rat (Rattus norvegicus) rat_id_mapping tool-data/rat_id_mapping.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/proteore_id_mapping_dictionaries.loc.sample Fri Jan 04 04:21:42 2019 -0500 @@ -0,0 +1,5 @@ +#This file lists the locations of reference file for id_converter tool +#<id> <name> <value> <path> +#human_id_mapping_01-01-2018 Human (homo sapiens) human_id_mapping tool-data/human_id_mapping_file.tsv +#mouse_id_mapping_01-01-2018 Mouse (Mus musculus) mouse_id_mapping tool-data/mouse_id_mapping.tsv +#rat_id_mapping_01-01-2018 Rat (Rattus norvegicus) rat_id_mapping tool-data/rat_id_mapping.tsv
--- a/tool_data_table_conf.xml.sample Tue Oct 23 08:18:32 2018 -0400 +++ b/tool_data_table_conf.xml.sample Fri Jan 04 04:21:42 2019 -0500 @@ -8,8 +8,8 @@ <columns>id, name, value, path</columns> <file path="tool-data/proteore_protein_atlas.loc" /> </table> - <table name="proteore_id_mapping" comment_char="#"> + <table name="proteore_id_mapping_dictionaries" comment_char="#"> <columns>id, name, value, path</columns> - <file path="tool-data/proteore_id_mapping.loc" /> + <file path="tool-data/proteore_id_mapping_dictionaries.loc" /> </table> </tables>