Mercurial > repos > dchristiany > data_manager_proteore
changeset 0:2de84fea8367 draft
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
author | dchristiany |
---|---|
date | Thu, 18 Oct 2018 12:16:09 -0400 |
parents | |
children | c60497a290e8 |
files | data_manager/resource_building.py data_manager/resource_building.xml data_manager_conf.xml tool-data/id_mapping.loc.sample tool-data/peptide_atlas.loc.sample tool-data/proteinatlas.loc.sample tool_data_table_conf.xml.sample |
diffstat | 7 files changed, 507 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""
The purpose of this script is to create source files from different databases to be used in other tools
"""

import os, sys, argparse, requests, time, csv, re
from io import BytesIO
from zipfile import ZipFile

# galaxy.util.json's from_json_string/to_json_string are thin aliases of the
# stdlib json loads/dumps; fall back to json so the script can also be
# imported and tested outside a Galaxy environment.
try:
    from galaxy.util.json import from_json_string, to_json_string
except ImportError:
    from json import loads as from_json_string, dumps as to_json_string

#######################################################################################################
# General functions
#######################################################################################################
def unzip(url, output_file):
    """
    Download a zip archive from `url` and extract its (single) member to
    `output_file`.

    :param url: HTTP(S) address of the zip archive.
    :param output_file: path the extracted content is written to.
    :raises requests.HTTPError: when the download fails.
    """
    response = requests.get(url)
    response.raise_for_status()  # fail early instead of unzipping an HTML error page
    archive = ZipFile(BytesIO(response.content))
    # The archives used here contain exactly one member.
    data = archive.open(archive.namelist()[0]).read()
    # ZipFile.read() returns bytes: write in binary mode and close the handle
    # deterministically (the original text-mode "w" combined with str += bytes
    # fails under Python 3 and the handle was never closed on error).
    with open(output_file, "wb") as output:
        output.write(data)

def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    """
    Append `data_table_entry` to data_manager_dict['data_tables'][data_table],
    creating the intermediate containers when missing.

    :return: the (mutated) data_manager_dict, for chaining.
    """
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)
    return data_manager_dict

#######################################################################################################
# 1. Human Protein Atlas
#    - Normal tissue
#    - Pathology
#    - Full Atlas
#######################################################################################################
def HPA_sources(data_manager_dict, tissue, target_directory):
    """
    Download one Human Protein Atlas TSV archive into `target_directory` and
    register the extracted file in the "proteinatlas" data table.

    :param tissue: one of "HPA_normal_tissue", "HPA_pathology", "HPA_full_atlas".
    :raises ValueError: on an unknown `tissue` value (the original fell through
        and died with an opaque NameError on `url`).
    """
    if tissue == "HPA_normal_tissue":
        tissue_name = "HPA normal tissue"
        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
    elif tissue == "HPA_pathology":
        tissue_name = "HPA pathology"
        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
    elif tissue == "HPA_full_atlas":
        tissue_name = "HPA full atlas"
        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
    else:
        raise ValueError("Unknown HPA option: %s" % tissue)
    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)
    print(str(os.path.isfile(path)))  # trace for the Galaxy job log
    # (removed: an unused readlines() of the whole downloaded file that only
    # wasted memory and leaked a file handle)
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
    data_table_entry = dict(value=tissue, name=tissue_name, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteinatlas")
#######################################################################################################
# 2. Peptide Atlas
#######################################################################################################
def peptide_atlas_sources(data_manager_dict, tissue, target_directory):
    """
    Query PeptideAtlas for the peptides observed in one sample category,
    aggregate the number of observations per Uniprot accession and write the
    result as a two-column TSV registered in the "peptide_atlas" data table.

    :param tissue: "<sample_category_id>-<tissue_name>", e.g. "1-Human_Liver".
    """
    # Define PA Human build released number (here early 2018)
    atlas_build_id = "472"
    # Define organism_id (here Human) - to be upgraded when other organisms are added
    organism_id = "2"
    # Extract sample_category_id and output filename
    sample_category_id = tissue.split("-")[0]
    output_file = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    # fixed: the original URL contained a stray space after "organism_id="
    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?atlas_build_id=" + \
            atlas_build_id + "&display_options=ShowMappings&organism_id=" + \
            organism_id + "&sample_category_id=" + sample_category_id + \
            "&QUERY_NAME=AT_GetPeptides&output_mode=tsv&apply_action=QUERY"
    download = requests.get(query)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    # build dictionary by only keeping uniprot accession (not isoform) as key
    # and sum of observations as value
    uni_dict = build_dictionary(cr)

    tissue_value = tissue.split("-")[1]
    tissue = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y")
    tissue_name = " ".join(tissue_value.split("_")) + " " + time.strftime("%d/%m/%Y")
    path = os.path.join(target_directory, output_file)

    # text mode + newline="" is the Python 3 contract for csv.writer
    # (the original "wb" handle raises TypeError under Python 3)
    with open(path, "w", newline="") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(value=path, name=tissue_name, tissue=tissue)
    _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas")

#function to count the number of observations by uniprot id
def build_dictionary(csv):
    """
    Sum the observation counts per Uniprot accession.

    :param csv: iterable of rows where column 2 is the accession and column 4
        the observation count (parameter name shadows the csv module; kept for
        interface compatibility).
    :return: dict {uniprot_accession: total_observations}; isoforms
        (accessions containing "-") and non-accession rows (e.g. the header)
        are skipped.
    """
    uni_dict = {}
    for line in csv:
        if "-" not in line[2] and check_uniprot_access(line[2]):
            if line[2] in uni_dict:
                uni_dict[line[2]] += int(line[4])
            else:
                uni_dict[line[2]] = int(line[4])
    return uni_dict

# Uniprot accession pattern, compiled once at module level
# (the original recompiled it on every call).
_UNIPROT_AC_PATTERN = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")

#function to check if an id is an uniprot accession number : return True or False
def check_uniprot_access(id):
    """
    Return True when `id` begins with a Uniprot accession number.

    NOTE(review): uses match(), not fullmatch(), so a valid accession followed
    by extra characters is also accepted — confirm this is intended.
    (`id` shadows the builtin; kept for interface compatibility.)
    """
    return _UNIPROT_AC_PATTERN.match(id) is not None


#######################################################################################################
# 3. ID mapping file
#######################################################################################################
import ftplib, gzip
csv.field_size_limit(sys.maxsize)  # to handle big files

def id_mapping_sources(data_manager_dict, species, target_directory):
    """
    Build a per-species ID mapping TSV from the Uniprot idmapping files
    (plus the neXtProt accession list for human) and register it in the
    "id_mapping_tab" data table.

    :param species: "human", "mouse" or "rat".
    """
    human = species == "human"
    species_dict = {"human": "HUMAN_9606", "mouse": "MOUSE_10090", "rat": "RAT_10116"}
    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]

    # header (human gets an extra neXtProt column)
    if human:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene",
                "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "neXtProt", "BioGrid", "STRING", "KEGG"]]
    else:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene",
                "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "BioGrid", "STRING", "KEGG"]]

    # parse idmapping_selected.tab and keep only the columns of interest
    selected_tab_file = species_dict[species] + "_" + files[0]
    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
    with gzip.open(tab_path, "rt") as select:
        tab_reader = csv.reader(select, delimiter="\t")
        for line in tab_reader:
            tab.append([line[i] for i in [0, 1, 2, 3, 4, 5, 6, 11, 13, 14, 18, 19, 20]])
    os.remove(tab_path)

    # Supplementary IDs to get from <species>_idmapping.dat:
    # neXtProt (human only), BioGrid, STRING, KEGG
    if human:
        ids = ['neXtProt', 'BioGrid', 'STRING', 'KEGG']
    else:
        ids = ['BioGrid', 'STRING', 'KEGG']
    unidict = {}

    # keep only ids of interest in dictionaries
    dat_file = species_dict[species] + "_" + files[1]
    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
    with gzip.open(dat_path, "rt") as dat:
        dat_reader = csv.reader(dat, delimiter="\t")
        for line in dat_reader:
            uniprotID = line[0]  # UniProtID as key
            id_type = line[1]    # ID type of corresponding id, key of sub-dictionary
            cor_id = line[2]     # corresponding id
            if "-" not in id_type:  # we don't keep isoforms
                if id_type in ids and uniprotID in unidict:
                    if id_type in unidict[uniprotID]:
                        # several ids of the same type: join them with ";"
                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])
                    else:
                        unidict[uniprotID].update({id_type: cor_id})
                elif id_type in ids:
                    unidict[uniprotID] = {id_type: cor_id}
    os.remove(dat_path)

    # add ids from idmapping.dat to the final tab
    for line in tab[1:]:
        uniprotID = line[0]
        if human:
            if uniprotID in unidict:
                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
                if nextprot != '':
                    nextprot = clean_nextprot_id(nextprot, line[0])
                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", "", ""])
        else:
            if uniprotID in unidict:
                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", ""])

    # add missing neXtProt IDs for human
    if human:
        # build next_dict from the official accession list
        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
        next_dict = {}
        for nextid in nextprot_ids:
            next_dict[nextid.replace("NX_", "")] = nextid
        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))

        for line in tab[1:]:
            uniprotID = line[0]
            nextprotID = line[13]
            if nextprotID == '' and uniprotID in next_dict:
                line[13] = next_dict[uniprotID]

    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)

    # text mode + newline="" for csv under Python 3
    with open(path, "w", newline="") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerows(tab)

    name_dict = {"human": "Homo sapiens", "mouse": "Mus musculus", "rat": "Rattus norvegicus"}
    name = name_dict[species] + " (" + time.strftime("%d-%m-%Y") + ")"

    data_table_entry = dict(value=species + "_id_mapping_" + time.strftime("%d-%m-%Y"), name=name, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping_tab")

def download_from_uniprot_ftp(file, target_directory):
    """
    Fetch `file` from the Uniprot idmapping FTP directory into
    `target_directory` and return the local path.
    """
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # context manager closes the local handle even on transfer error
        # (the original open(path,'wb').write leaked the handle)
        with open(path, 'wb') as out:
            ftp.retrbinary("RETR " + file, out.write)
    finally:
        ftp.quit()
    return path

def id_list_from_nextprot_ftp(file, target_directory):
    """
    Fetch the neXtProt accession list `file` from the neXtProt FTP server and
    return its lines as a list of accession strings.
    """
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        with open(path, 'wb') as out:
            ftp.retrbinary("RETR " + file, out.write)
    finally:
        ftp.quit()
    with open(path, 'r') as handle:
        nextprot_ids = handle.read().splitlines()
    return nextprot_ids

#return '' if there's no value in a dictionary, avoid error
def access_dictionary(dico, key1, key2):
    """
    Return dico[key1][key2], or "" when either key is missing (avoids KeyError).
    """
    if key1 in dico and key2 in dico[key1]:
        return dico[key1][key2]
    return ''

#if there are several nextprot ID for one uniprotID, return the uniprot like ID
def clean_nextprot_id(next_id, uniprotAc):
    """
    When several neXtProt IDs are joined with ";", prefer "NX_<uniprotAc>";
    otherwise keep the second entry. A single ID is returned unchanged.
    """
    parts = next_id.split(";")
    if len(parts) > 1:
        if "NX_" + uniprotAc in parts:
            return "NX_" + uniprotAc
        return parts[1]
    return next_id


#######################################################################################################
# Main function
#######################################################################################################
def main():
    """
    Parse Galaxy's JSON parameter file, download the requested source files
    into the dataset's extra_files_path, and write the data-table entries
    back to the same file as JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar=("HPA_OPTION"))
    parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
    parser.add_argument("--id_mapping", metavar=("ID_MAPPING_SPECIES"))
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    data_manager_dict = {}
    # Extract json file params
    filename = args.output
    with open(filename) as param_file:
        params = from_json_string(param_file.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    # argparse attributes always exist (default None), so the original
    # try/except NameError around each access was dead code.

    ## Download source files from HPA
    if args.hpa is not None:
        for hpa_tissue in args.hpa.split(","):
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from Peptide Atlas query
    if args.peptideatlas is not None:
        for pa_tissue in args.peptideatlas.split(","):
            peptide_atlas_sources(data_manager_dict, pa_tissue, target_directory)

    ## Download ID_mapping source file from Uniprot
    if args.id_mapping is not None:
        for species in args.id_mapping.split(","):
            id_mapping_sources(data_manager_dict, species, target_directory)

    # save info to json file (text mode + with: the original wrote a str to a
    # 'wb' handle, which fails under Python 3, and never closed the file)
    with open(filename, 'w') as out:
        out.write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()
<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2018.10.18.9" tool_type="manage_data">
    <description>to create or update reference files for proteore tools</description>
    <requirements>
    </requirements>
    <stdio>
        <exit_code range="1:" />
    </stdio>
    <command><![CDATA[

        python $__tool_directory__/resource_building.py
        #if $database.database == "human_protein_atlas"
            --hpa "$database.tissues"
        #else if $database.database == "peptide_atlas"
            --peptideatlas "$database.tissues"
        #else if $database.database == "id_mapping"
            --id_mapping="$database.species"
        #end if
        --output "$output"

    ]]></command>

    <inputs>
        <conditional name="database">
            <!-- fixed: this driving select had no label, so Galaxy displayed the bare parameter name -->
            <param name="database" type="select" label="Please select the data source to build">
                <option value="human_protein_atlas">Human Protein Atlas</option>
                <option value="peptide_atlas">Peptide Atlas</option>
                <option value="id_mapping">ID mapping</option>
            </param>
            <when value="human_protein_atlas">
                <param name="tissues" type="select" multiple="false" label="Please select tissue">
                    <option value="HPA_normal_tissue">Normal tissue</option>
                    <option value="HPA_pathology">Pathology</option>
                    <option value="HPA_full_atlas">Full Atlas</option>
                </param>
            </when>
            <when value="peptide_atlas">
                <!-- option values are "<sample_category_id>-<tissue_name>"; the script splits on "-" -->
                <param name="tissues" type="select" multiple="false" label="Please select the tissue">
                    <option value="1-Human_Liver">Human liver</option>
                    <option value="2-Human_Brain">Human brain</option>
                    <option value="4-Human_Heart">Human heart</option>
                    <option value="5-Human_Kidney">Human kidney</option>
                    <option value="10-Human_Plasma">Human blood plasma</option>
                    <option value="13-Human_Urine">Human urine</option>
                    <option value="24-Human_CSF">Human cerebrospinal fluid</option>
                </param>
            </when>
            <when value="id_mapping">
                <param name="species" type="select" multiple="false" label="Please select the species">
                    <option value="human">Homo sapiens</option>
                    <option value="mouse">Mus musculus</option>
                    <option value="rat">Rattus norvegicus</option>
                </param>
            </when>
        </conditional>
    </inputs>

    <outputs>
        <!--data format="tabular" name="output">
            <discover_datasets pattern="(?P<designation>.+).tsv" ext="tabular" visible="true" assign_primary_output="true" />
        </data-->
        <data name="output" format="data_manager_json"/>
    </outputs>

    <tests>
    </tests>

    <help><![CDATA[

TODO

-----

.. class:: infomark

**Authors**

Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR

Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform

This work has been partially funded through the French National Agency for Research (ANR) IFB project.

Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.

    ]]></help>
    <citations>
    </citations>

</tool>
<?xml version="1.0"?>
<!-- Registers the three data tables fed by data_manager/resource_building.xml.
     For each table, <move> relocates the produced file under
     GALAXY_DATA_MANAGER_DATA_PATH and the <value_translation> pair rewrites
     the column to the final absolute path. -->
<data_managers>
    <data_manager tool_file="data_manager/resource_building.xml" id="data_manager_proteore">
        <!-- Human Protein Atlas downloads -->
        <data_table name="proteinatlas">
            <output>
                <column name="value" />
                <column name="name" />
                <column name="path" output_ref="output" >
                    <move type="file">
                        <!-- NOTE(review): ${path} holds the dataset's absolute path here, so the
                             move target expands to proteinatlas/<entire original path>; the deeply
                             nested directories shown in tool-data/proteinatlas.loc.sample suggest
                             this should use only the file name - confirm. -->
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">proteinatlas/${path}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/proteinatlas/${path}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
        <!-- PeptideAtlas observation counts; the "value" column carries the file -->
        <data_table name="peptide_atlas">
            <output>
                <column name="tissue" />
                <column name="name" />
                <column name="value" output_ref="output" >
                    <move type="file">
                        <!--source>${path}/${value}.tsv</source-->
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">peptide_atlas/</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/peptide_atlas/${tissue}.tsv</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
        <!-- Uniprot/neXtProt ID mapping tables -->
        <data_table name="id_mapping_tab">
            <output>
                <column name="value" />
                <column name="name" />
                <column name="path" output_ref="output" >
                    <move type="file">
                        <!--source>${path}</source-->
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">id_mapping/</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${value}.tsv</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
    </data_manager>
</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/id_mapping.loc.sample Thu Oct 18 12:16:09 2018 -0400 @@ -0,0 +1,5 @@ +#This file lists the locations of reference files for the id_converter tool +#<name> <value> <path> +#human_id_mapping Human (Homo sapiens) tool-data/human_id_mapping_file.tsv +#mouse_id_mapping Mouse (Mus musculus) tool-data/mouse_id_mapping.tsv +#rat_id_mapping Rat (Rattus norvegicus) tool-data/rat_id_mapping.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/peptide_atlas.loc.sample Thu Oct 18 12:16:09 2018 -0400 @@ -0,0 +1,15 @@ +#This file lists the locations name and values of reference files +#for number of MS/MS observations in a tissue +#This is a tab separated file (TAB, not 4 spaces !) +# +#<tissue> <name> <value> + + +#Human_Heart_20-07-2018 Human Heart 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Heart_20-07-2018.tsv +#Human_Liver_20-07-2018 Human Liver 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Liver_20-07-2018.tsv +#Human_Urine_20-07-2018 Human Urine 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Urine_20-07-2018.tsv +#Human_Brain_20-07-2018 Human Brain 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Brain_20-07-2018.tsv +#Human_Kidney_20-07-2018 Human Kidney 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Kidney_20-07-2018.tsv +#Human_Plasma_20-07-2018 Human Plasma 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Plasma_20-07-2018.tsv +#Human_CSF_20-07-2018 Human CSF 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_CSF_20-07-2018.tsv +#Human_Liver_23-07-2018 Human Liver 23/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Liver_23-07-2018.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/proteinatlas.loc.sample Thu Oct 18 12:16:09 2018 -0400 @@ -0,0 +1,12 @@ +#This file lists the locations name and values of reference files +#for Get expression data tool +#This is a tab separated file (TAB, not 4 spaces !) +# +#<name> <value> <path> +# +#proteinatlas.loc could look something like this: +# +#HPA normal tissue 19/07/2018 HPA_normal_tissue /projet/galaxydev/galaxy/tool-data/proteinatlas/projet/galaxydev/galaxy/database/jobs_directory/019/19159/dataset_39307_files/HPA_normal_tissue_19-07-2018.tsv +#HPA pathology 19/07/2018 HPA_pathology /projet/galaxydev/galaxy/tool-data/proteinatlas/projet/galaxydev/galaxy/database/jobs_directory/019/19160/dataset_39308_files/HPA_pathology_19-07-2018.tsv +#HPA full atlas 19/07/2018 HPA_full_atlas /projet/galaxydev/galaxy/tool-data/proteinatlas/projet/galaxydev/galaxy/database/jobs_directory/019/19161/dataset_39309_files/HPA_full_atlas_19-07-2018.tsv +#
<?xml version="1.0"?>
<!-- Sample registry of the data tables produced by the ProteoRE data manager.
     Fixed: consistent double-quoted attributes (the first table used single
     quotes), consistent comma-separated <columns> without stray spaces, and a
     trailing newline at end of file. -->
<tables>
    <!-- MS/MS observation counts per tissue (PeptideAtlas) -->
    <table name="peptide_atlas" comment_char="#">
        <columns>tissue,name,value</columns>
        <file path="tool-data/peptide_atlas.loc"/>
    </table>
    <!-- Human Protein Atlas expression data -->
    <table name="proteinatlas" comment_char="#">
        <columns>name,value,path</columns>
        <file path="tool-data/proteinatlas.loc"/>
    </table>
    <!-- Uniprot ID mapping files -->
    <table name="id_mapping_tab" comment_char="#">
        <columns>name,value,path</columns>
        <file path="tool-data/id_mapping.loc"/>
    </table>
</tables>