Mercurial > repos > dchristiany > data_manager_proteore
changeset 0:2de84fea8367 draft
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
author | dchristiany |
---|---|
date | Thu, 18 Oct 2018 12:16:09 -0400 |
parents | |
children | c60497a290e8 |
files | data_manager/resource_building.py data_manager/resource_building.xml data_manager_conf.xml tool-data/id_mapping.loc.sample tool-data/peptide_atlas.loc.sample tool-data/proteinatlas.loc.sample tool_data_table_conf.xml.sample |
diffstat | 7 files changed, 507 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""
The purpose of this script is to create source files from different databases to be used in other tools
"""

import os, sys, argparse, requests, time, csv, re
from io import BytesIO
from zipfile import ZipFile

# galaxy.util.json's from_json_string/to_json_string are thin aliases of the
# stdlib json loads/dumps; fall back to json so the script can also be
# imported and tested outside a Galaxy environment.
try:
    from galaxy.util.json import from_json_string, to_json_string
except ImportError:
    from json import loads as from_json_string, dumps as to_json_string

#######################################################################################################
# General functions
#######################################################################################################
def unzip(url, output_file):
    """
    Download a zip archive from `url` and extract its (single) member to
    `output_file`.

    :param url: HTTP(S) address of the zip archive.
    :param output_file: path the extracted content is written to.
    :raises requests.HTTPError: when the download fails.
    """
    response = requests.get(url)
    response.raise_for_status()  # fail early instead of unzipping an HTML error page
    archive = ZipFile(BytesIO(response.content))
    # The archives used here contain exactly one member.
    data = archive.open(archive.namelist()[0]).read()
    # ZipFile.read() returns bytes: write in binary mode and close the handle
    # deterministically (the original text-mode "w" combined with str += bytes
    # fails under Python 3 and the handle was never closed on error).
    with open(output_file, "wb") as output:
        output.write(data)

def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    """
    Append `data_table_entry` to data_manager_dict['data_tables'][data_table],
    creating the intermediate containers when missing.

    :return: the (mutated) data_manager_dict, for chaining.
    """
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)
    return data_manager_dict

#######################################################################################################
# 1. Human Protein Atlas
#    - Normal tissue
#    - Pathology
#    - Full Atlas
#######################################################################################################
def HPA_sources(data_manager_dict, tissue, target_directory):
    """
    Download one Human Protein Atlas TSV archive into `target_directory` and
    register the extracted file in the "proteinatlas" data table.

    :param tissue: one of "HPA_normal_tissue", "HPA_pathology", "HPA_full_atlas".
    :raises ValueError: on an unknown `tissue` value (the original fell through
        and died with an opaque NameError on `url`).
    """
    if tissue == "HPA_normal_tissue":
        tissue_name = "HPA normal tissue"
        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
    elif tissue == "HPA_pathology":
        tissue_name = "HPA pathology"
        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
    elif tissue == "HPA_full_atlas":
        tissue_name = "HPA full atlas"
        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
    else:
        raise ValueError("Unknown HPA option: %s" % tissue)
    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)
    print(str(os.path.isfile(path)))  # trace for the Galaxy job log
    # (removed: an unused readlines() of the whole downloaded file that only
    # wasted memory and leaked a file handle)
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
    data_table_entry = dict(value=tissue, name=tissue_name, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteinatlas")
#######################################################################################################
# 2. Peptide Atlas
#######################################################################################################
def peptide_atlas_sources(data_manager_dict, tissue, target_directory):
    """
    Query PeptideAtlas for the peptides observed in one sample category,
    aggregate the number of observations per Uniprot accession and write the
    result as a two-column TSV registered in the "peptide_atlas" data table.

    :param tissue: "<sample_category_id>-<tissue_name>", e.g. "1-Human_Liver".
    """
    # Define PA Human build released number (here early 2018)
    atlas_build_id = "472"
    # Define organism_id (here Human) - to be upgraded when other organisms are added
    organism_id = "2"
    # Extract sample_category_id and output filename
    sample_category_id = tissue.split("-")[0]
    output_file = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    # fixed: the original URL contained a stray space after "organism_id="
    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?atlas_build_id=" + \
            atlas_build_id + "&display_options=ShowMappings&organism_id=" + \
            organism_id + "&sample_category_id=" + sample_category_id + \
            "&QUERY_NAME=AT_GetPeptides&output_mode=tsv&apply_action=QUERY"
    download = requests.get(query)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    # build dictionary by only keeping uniprot accession (not isoform) as key
    # and sum of observations as value
    uni_dict = build_dictionary(cr)

    tissue_value = tissue.split("-")[1]
    tissue = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y")
    tissue_name = " ".join(tissue_value.split("_")) + " " + time.strftime("%d/%m/%Y")
    path = os.path.join(target_directory, output_file)

    # text mode + newline="" is the Python 3 contract for csv.writer
    # (the original "wb" handle raises TypeError under Python 3)
    with open(path, "w", newline="") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(value=path, name=tissue_name, tissue=tissue)
    _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas")

#function to count the number of observations by uniprot id
def build_dictionary(csv):
    """
    Sum the observation counts per Uniprot accession.

    :param csv: iterable of rows where column 2 is the accession and column 4
        the observation count (parameter name shadows the csv module; kept for
        interface compatibility).
    :return: dict {uniprot_accession: total_observations}; isoforms
        (accessions containing "-") and non-accession rows (e.g. the header)
        are skipped.
    """
    uni_dict = {}
    for line in csv:
        if "-" not in line[2] and check_uniprot_access(line[2]):
            if line[2] in uni_dict:
                uni_dict[line[2]] += int(line[4])
            else:
                uni_dict[line[2]] = int(line[4])
    return uni_dict

# Uniprot accession pattern, compiled once at module level
# (the original recompiled it on every call).
_UNIPROT_AC_PATTERN = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")

#function to check if an id is an uniprot accession number : return True or False
def check_uniprot_access(id):
    """
    Return True when `id` begins with a Uniprot accession number.

    NOTE(review): uses match(), not fullmatch(), so a valid accession followed
    by extra characters is also accepted — confirm this is intended.
    (`id` shadows the builtin; kept for interface compatibility.)
    """
    return _UNIPROT_AC_PATTERN.match(id) is not None


#######################################################################################################
# 3. ID mapping file
#######################################################################################################
import ftplib, gzip
csv.field_size_limit(sys.maxsize)  # to handle big files

def id_mapping_sources(data_manager_dict, species, target_directory):
    """
    Build a per-species ID mapping TSV from the Uniprot idmapping files
    (plus the neXtProt accession list for human) and register it in the
    "id_mapping_tab" data table.

    :param species: "human", "mouse" or "rat".
    """
    human = species == "human"
    species_dict = {"human": "HUMAN_9606", "mouse": "MOUSE_10090", "rat": "RAT_10116"}
    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]

    # header (human gets an extra neXtProt column)
    if human:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene",
                "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "neXtProt", "BioGrid", "STRING", "KEGG"]]
    else:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene",
                "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "BioGrid", "STRING", "KEGG"]]

    # parse idmapping_selected.tab and keep only the columns of interest
    selected_tab_file = species_dict[species] + "_" + files[0]
    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
    with gzip.open(tab_path, "rt") as select:
        tab_reader = csv.reader(select, delimiter="\t")
        for line in tab_reader:
            tab.append([line[i] for i in [0, 1, 2, 3, 4, 5, 6, 11, 13, 14, 18, 19, 20]])
    os.remove(tab_path)

    # Supplementary IDs to get from <species>_idmapping.dat:
    # neXtProt (human only), BioGrid, STRING, KEGG
    if human:
        ids = ['neXtProt', 'BioGrid', 'STRING', 'KEGG']
    else:
        ids = ['BioGrid', 'STRING', 'KEGG']
    unidict = {}

    # keep only ids of interest in dictionaries
    dat_file = species_dict[species] + "_" + files[1]
    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
    with gzip.open(dat_path, "rt") as dat:
        dat_reader = csv.reader(dat, delimiter="\t")
        for line in dat_reader:
            uniprotID = line[0]  # UniProtID as key
            id_type = line[1]    # ID type of corresponding id, key of sub-dictionary
            cor_id = line[2]     # corresponding id
            if "-" not in id_type:  # we don't keep isoforms
                if id_type in ids and uniprotID in unidict:
                    if id_type in unidict[uniprotID]:
                        # several ids of the same type: join them with ";"
                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])
                    else:
                        unidict[uniprotID].update({id_type: cor_id})
                elif id_type in ids:
                    unidict[uniprotID] = {id_type: cor_id}
    os.remove(dat_path)

    # add ids from idmapping.dat to the final tab
    for line in tab[1:]:
        uniprotID = line[0]
        if human:
            if uniprotID in unidict:
                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
                if nextprot != '':
                    nextprot = clean_nextprot_id(nextprot, line[0])
                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", "", ""])
        else:
            if uniprotID in unidict:
                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", ""])

    # add missing neXtProt IDs for human
    if human:
        # build next_dict from the official accession list
        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
        next_dict = {}
        for nextid in nextprot_ids:
            next_dict[nextid.replace("NX_", "")] = nextid
        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))

        for line in tab[1:]:
            uniprotID = line[0]
            nextprotID = line[13]
            if nextprotID == '' and uniprotID in next_dict:
                line[13] = next_dict[uniprotID]

    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)

    # text mode + newline="" for csv under Python 3
    with open(path, "w", newline="") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerows(tab)

    name_dict = {"human": "Homo sapiens", "mouse": "Mus musculus", "rat": "Rattus norvegicus"}
    name = name_dict[species] + " (" + time.strftime("%d-%m-%Y") + ")"

    data_table_entry = dict(value=species + "_id_mapping_" + time.strftime("%d-%m-%Y"), name=name, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping_tab")

def download_from_uniprot_ftp(file, target_directory):
    """
    Fetch `file` from the Uniprot idmapping FTP directory into
    `target_directory` and return the local path.
    """
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # context manager closes the local handle even on transfer error
        # (the original open(path,'wb').write leaked the handle)
        with open(path, 'wb') as out:
            ftp.retrbinary("RETR " + file, out.write)
    finally:
        ftp.quit()
    return path

def id_list_from_nextprot_ftp(file, target_directory):
    """
    Fetch the neXtProt accession list `file` from the neXtProt FTP server and
    return its lines as a list of accession strings.
    """
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        with open(path, 'wb') as out:
            ftp.retrbinary("RETR " + file, out.write)
    finally:
        ftp.quit()
    with open(path, 'r') as handle:
        nextprot_ids = handle.read().splitlines()
    return nextprot_ids

#return '' if there's no value in a dictionary, avoid error
def access_dictionary(dico, key1, key2):
    """
    Return dico[key1][key2], or "" when either key is missing (avoids KeyError).
    """
    if key1 in dico and key2 in dico[key1]:
        return dico[key1][key2]
    return ''

#if there are several nextprot ID for one uniprotID, return the uniprot like ID
def clean_nextprot_id(next_id, uniprotAc):
    """
    When several neXtProt IDs are joined with ";", prefer "NX_<uniprotAc>";
    otherwise keep the second entry. A single ID is returned unchanged.
    """
    parts = next_id.split(";")
    if len(parts) > 1:
        if "NX_" + uniprotAc in parts:
            return "NX_" + uniprotAc
        return parts[1]
    return next_id


#######################################################################################################
# Main function
#######################################################################################################
def main():
    """
    Parse Galaxy's JSON parameter file, download the requested source files
    into the dataset's extra_files_path, and write the data-table entries
    back to the same file as JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar=("HPA_OPTION"))
    parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
    parser.add_argument("--id_mapping", metavar=("ID_MAPPING_SPECIES"))
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    data_manager_dict = {}
    # Extract json file params
    filename = args.output
    with open(filename) as param_file:
        params = from_json_string(param_file.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    # argparse attributes always exist (default None), so the original
    # try/except NameError around each access was dead code.

    ## Download source files from HPA
    if args.hpa is not None:
        for hpa_tissue in args.hpa.split(","):
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from Peptide Atlas query
    if args.peptideatlas is not None:
        for pa_tissue in args.peptideatlas.split(","):
            peptide_atlas_sources(data_manager_dict, pa_tissue, target_directory)

    ## Download ID_mapping source file from Uniprot
    if args.id_mapping is not None:
        for species in args.id_mapping.split(","):
            id_mapping_sources(data_manager_dict, species, target_directory)

    # save info to json file (text mode + with: the original wrote a str to a
    # 'wb' handle, which fails under Python 3, and never closed the file)
    with open(filename, 'w') as out:
        out.write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()
<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2018.10.18.9" tool_type="manage_data">
    <description>to create or update reference files for proteore tools</description>
    <requirements>
    </requirements>
    <stdio>
        <exit_code range="1:" />
    </stdio>
    <command><![CDATA[

        python $__tool_directory__/resource_building.py
        #if $database.database == "human_protein_atlas"
            --hpa "$database.tissues"
        #else if $database.database == "peptide_atlas"
            --peptideatlas "$database.tissues"
        #else if $database.database == "id_mapping"
            --id_mapping="$database.species"
        #end if
        --output "$output"

    ]]></command>

    <inputs>
        <conditional name="database">
            <!-- fixed: this driving select had no label, so Galaxy displayed the bare parameter name -->
            <param name="database" type="select" label="Please select the data source to build">
                <option value="human_protein_atlas">Human Protein Atlas</option>
                <option value="peptide_atlas">Peptide Atlas</option>
                <option value="id_mapping">ID mapping</option>
            </param>
            <when value="human_protein_atlas">
                <param name="tissues" type="select" multiple="false" label="Please select tissue">
                    <option value="HPA_normal_tissue">Normal tissue</option>
                    <option value="HPA_pathology">Pathology</option>
                    <option value="HPA_full_atlas">Full Atlas</option>
                </param>
            </when>
            <when value="peptide_atlas">
                <!-- option values are "<sample_category_id>-<tissue_name>"; the script splits on "-" -->
                <param name="tissues" type="select" multiple="false" label="Please select the tissue">
                    <option value="1-Human_Liver">Human liver</option>
                    <option value="2-Human_Brain">Human brain</option>
                    <option value="4-Human_Heart">Human heart</option>
                    <option value="5-Human_Kidney">Human kidney</option>
                    <option value="10-Human_Plasma">Human blood plasma</option>
                    <option value="13-Human_Urine">Human urine</option>
                    <option value="24-Human_CSF">Human cerebrospinal fluid</option>
                </param>
            </when>
            <when value="id_mapping">
                <param name="species" type="select" multiple="false" label="Please select the species">
                    <option value="human">Homo sapiens</option>
                    <option value="mouse">Mus musculus</option>
                    <option value="rat">Rattus norvegicus</option>
                </param>
            </when>
        </conditional>
    </inputs>

    <outputs>
        <!--data format="tabular" name="output">
            <discover_datasets pattern="(?P<designation>.+).tsv" ext="tabular" visible="true" assign_primary_output="true" />
        </data-->
        <data name="output" format="data_manager_json"/>
    </outputs>

    <tests>
    </tests>

    <help><![CDATA[

TODO

-----

.. class:: infomark

**Authors**

Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR

Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform

This work has been partially funded through the French National Agency for Research (ANR) IFB project.

Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.

    ]]></help>
    <citations>
    </citations>

</tool>
<?xml version="1.0"?>
<!-- Registers the three data tables fed by data_manager/resource_building.xml.
     For each table, <move> relocates the produced file under
     GALAXY_DATA_MANAGER_DATA_PATH and the <value_translation> pair rewrites
     the column to the final absolute path. -->
<data_managers>
    <data_manager tool_file="data_manager/resource_building.xml" id="data_manager_proteore">
        <!-- Human Protein Atlas downloads -->
        <data_table name="proteinatlas">
            <output>
                <column name="value" />
                <column name="name" />
                <column name="path" output_ref="output" >
                    <move type="file">
                        <!-- NOTE(review): ${path} holds the dataset's absolute path here, so the
                             move target expands to proteinatlas/<entire original path>; the deeply
                             nested directories shown in tool-data/proteinatlas.loc.sample suggest
                             this should use only the file name - confirm. -->
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">proteinatlas/${path}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/proteinatlas/${path}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
        <!-- PeptideAtlas observation counts; the "value" column carries the file -->
        <data_table name="peptide_atlas">
            <output>
                <column name="tissue" />
                <column name="name" />
                <column name="value" output_ref="output" >
                    <move type="file">
                        <!--source>${path}/${value}.tsv</source-->
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">peptide_atlas/</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/peptide_atlas/${tissue}.tsv</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
        <!-- Uniprot/neXtProt ID mapping tables -->
        <data_table name="id_mapping_tab">
            <output>
                <column name="value" />
                <column name="name" />
                <column name="path" output_ref="output" >
                    <move type="file">
                        <!--source>${path}</source-->
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">id_mapping/</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${value}.tsv</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
    </data_manager>
</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/id_mapping.loc.sample Thu Oct 18 12:16:09 2018 -0400 @@ -0,0 +1,5 @@ +#This file lists the locations of reference files for the id_converter tool +#<name> <value> <path> +#human_id_mapping Human (Homo sapiens) tool-data/human_id_mapping_file.tsv +#mouse_id_mapping Mouse (Mus musculus) tool-data/mouse_id_mapping.tsv +#rat_id_mapping Rat (Rattus norvegicus) tool-data/rat_id_mapping.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/peptide_atlas.loc.sample Thu Oct 18 12:16:09 2018 -0400 @@ -0,0 +1,15 @@ +#This file lists the locations name and values of reference files +#for number of MS/MS observations in a tissue +#This is a tab separated file (TAB, not 4 spaces !) +# +#<tissue> <name> <value> + + +#Human_Heart_20-07-2018 Human Heart 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Heart_20-07-2018.tsv +#Human_Liver_20-07-2018 Human Liver 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Liver_20-07-2018.tsv +#Human_Urine_20-07-2018 Human Urine 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Urine_20-07-2018.tsv +#Human_Brain_20-07-2018 Human Brain 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Brain_20-07-2018.tsv +#Human_Kidney_20-07-2018 Human Kidney 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Kidney_20-07-2018.tsv +#Human_Plasma_20-07-2018 Human Plasma 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Plasma_20-07-2018.tsv +#Human_CSF_20-07-2018 Human CSF 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_CSF_20-07-2018.tsv +#Human_Liver_23-07-2018 Human Liver 23/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Liver_23-07-2018.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/proteinatlas.loc.sample Thu Oct 18 12:16:09 2018 -0400 @@ -0,0 +1,12 @@ +#This file lists the locations name and values of reference files +#for Get expression data tool +#This is a tab separated file (TAB, not 4 spaces !) +# +#<name> <value> <path> +# +#proteinatlas.loc could look something like this: +# +#HPA normal tissue 19/07/2018 HPA_normal_tissue /projet/galaxydev/galaxy/tool-data/proteinatlas/projet/galaxydev/galaxy/database/jobs_directory/019/19159/dataset_39307_files/HPA_normal_tissue_19-07-2018.tsv +#HPA pathology 19/07/2018 HPA_pathology /projet/galaxydev/galaxy/tool-data/proteinatlas/projet/galaxydev/galaxy/database/jobs_directory/019/19160/dataset_39308_files/HPA_pathology_19-07-2018.tsv +#HPA full atlas 19/07/2018 HPA_full_atlas /projet/galaxydev/galaxy/tool-data/proteinatlas/projet/galaxydev/galaxy/database/jobs_directory/019/19161/dataset_39309_files/HPA_full_atlas_19-07-2018.tsv +#
<?xml version="1.0"?>
<!-- Sample registry of the data tables produced by the ProteoRE data manager.
     Fixed: consistent double-quoted attributes (the first table used single
     quotes), consistent comma-separated <columns> without stray spaces, and a
     trailing newline at end of file. -->
<tables>
    <!-- MS/MS observation counts per tissue (PeptideAtlas) -->
    <table name="peptide_atlas" comment_char="#">
        <columns>tissue,name,value</columns>
        <file path="tool-data/peptide_atlas.loc"/>
    </table>
    <!-- Human Protein Atlas expression data -->
    <table name="proteinatlas" comment_char="#">
        <columns>name,value,path</columns>
        <file path="tool-data/proteinatlas.loc"/>
    </table>
    <!-- Uniprot ID mapping files -->
    <table name="id_mapping_tab" comment_char="#">
        <columns>name,value,path</columns>
        <file path="tool-data/id_mapping.loc"/>
    </table>
</tables>