# HG changeset patch
# User dchristiany
# Date 1539879369 14400
# Node ID 2de84fea83672e0e1bd3ba7d49e182fda6f92ec9
planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
diff -r 000000000000 -r 2de84fea8367 data_manager/resource_building.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/resource_building.py Thu Oct 18 12:16:09 2018 -0400
@@ -0,0 +1,323 @@
+"""
+The purpose of this script is to create source files from different databases to be used in other tools
+"""
+
+import os, sys, argparse, requests, time, csv, re, ftplib, gzip
+from io import BytesIO
+from zipfile import ZipFile
+from galaxy.util.json import from_json_string, to_json_string
+
+#######################################################################################################
+# General functions
+#######################################################################################################
+def unzip(url, output_file):
+    """
+    Download a zip archive from a url and extract its first member to output_file
+    """
+    content = requests.get(url)
+    zipfile = ZipFile(BytesIO(content.content))
+    output_content = zipfile.open(zipfile.namelist()[0]).read()
+    with open(output_file, "wb") as output:
+        output.write(output_content)
+
+def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
+ data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+ data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
+ data_manager_dict['data_tables'][data_table].append(data_table_entry)
+ return data_manager_dict
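+
+# Sketch (illustration, not part of the upstream code) of the structure built up
+# by repeated calls, e.g. after one HPA entry has been registered:
+#   {"data_tables": {"proteinatlas": [
+#       {"value": "HPA_normal_tissue", "name": "HPA normal tissue 18/10/2018",
+#        "path": "/<target_directory>/HPA_normal_tissue_18-10-2018.tsv"}]}}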
+
+#######################################################################################################
+# 1. Human Protein Atlas
+# - Normal tissue
+# - Pathology
+# - Full Atlas
+#######################################################################################################
+def HPA_sources(data_manager_dict, tissue, target_directory):
+    if tissue == "HPA_normal_tissue":
+        tissue_name = "HPA normal tissue"
+        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
+    elif tissue == "HPA_pathology":
+        tissue_name = "HPA pathology"
+        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
+    elif tissue == "HPA_full_atlas":
+        tissue_name = "HPA full atlas"
+        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
+    else:
+        raise ValueError("Unknown HPA option: " + tissue)
+    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
+    path = os.path.join(target_directory, output_file)
+    unzip(url, path)
+    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
+    data_table_entry = dict(value=tissue, name=tissue_name, path=path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteinatlas")
+
+
+#######################################################################################################
+# 2. Peptide Atlas
+#######################################################################################################
+def peptide_atlas_sources(data_manager_dict, tissue, target_directory):
+    # PeptideAtlas Human build number (here: early 2018)
+    atlas_build_id = "472"
+    # Organism id (here: Human) - to be updated when other organisms are added to the project
+    organism_id = "2"
+    # Extract sample_category_id and output filename
+    sample_category_id = tissue.split("-")[0]
+    output_file = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y") + ".tsv"
+    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?atlas_build_id=" + \
+            atlas_build_id + "&display_options=ShowMappings&organism_id=" + \
+            organism_id + "&sample_category_id=" + sample_category_id + \
+            "&QUERY_NAME=AT_GetPeptides&output_mode=tsv&apply_action=QUERY"
+    download = requests.get(query)
+    decoded_content = download.content.decode('utf-8')
+    cr = csv.reader(decoded_content.splitlines(), delimiter='\t')
+
+    # Build a dictionary keeping only UniProt accessions (no isoforms) as keys
+    # and the summed number of observations as values
+    uni_dict = build_dictionary(cr)
+
+    tissue_value = tissue.split("-")[1]
+    tissue = tissue_value + "_" + time.strftime("%d-%m-%Y")
+    tissue_name = " ".join(tissue_value.split("_")) + " " + time.strftime("%d/%m/%Y")
+    path = os.path.join(target_directory, output_file)
+
+    with open(path, "w") as out:
+        w = csv.writer(out, delimiter='\t')
+        w.writerow(["Uniprot_AC", "nb_obs"])
+        w.writerows(uni_dict.items())
+
+    data_table_entry = dict(value=path, name=tissue_name, tissue=tissue)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas")
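+
+# Illustration with a hypothetical sample_category_id: a --peptideatlas value of
+# "40-Human_Liver" would query sample_category_id=40 and write Human_Liver_<date>.tsv
+# with two tab-separated columns, Uniprot_AC and nb_obs.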
+
+#count the number of observations per UniProt accession
+def build_dictionary(csv_reader):
+    uni_dict = {}
+    for line in csv_reader:
+        if "-" not in line[2] and check_uniprot_access(line[2]):
+            if line[2] in uni_dict:
+                uni_dict[line[2]] += int(line[4])
+            else:
+                uni_dict[line[2]] = int(line[4])
+
+    return uni_dict
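+
+# Illustration (column positions assumed from the GetPeptides tsv output above):
+# rows such as [..., ..., "P31946", ..., "12"] and [..., ..., "P31946", ..., "3"]
+# yield {"P31946": 15}; isoform accessions like "P31946-2" are skipped.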
+
+#check whether an id matches the UniProt accession number pattern: return True or False
+def check_uniprot_access(identifier):
+    uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
+    return uniprot_pattern.match(identifier) is not None
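+
+# Expected behaviour of the pattern (illustration only):
+#   check_uniprot_access("P31946")          -> True   (canonical accession)
+#   check_uniprot_access("A0A024R161")      -> True   (10-character accession)
+#   check_uniprot_access("ENSP00000000233") -> False  (Ensembl protein id)
+# Note: re.match anchors only at the start, so a longer string that merely
+# begins with a valid accession would also return True.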
+
+
+
+#######################################################################################################
+# 3. ID mapping file
+#######################################################################################################
+csv.field_size_limit(sys.maxsize) # to handle big files
+
+def id_mapping_sources(data_manager_dict, species, target_directory):
+
+    human = species == "human"
+    species_dict = {"human": "HUMAN_9606", "mouse": "MOUSE_10090", "rat": "RAT_10116"}
+    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]
+
+    #header
+    if human: tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
+    else: tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+
+    #read idmapping_selected.tab and keep only the ids of interest
+    selected_tab_file = species_dict[species] + "_" + files[0]
+    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
+    with gzip.open(tab_path, "rt") as select:
+        tab_reader = csv.reader(select, delimiter="\t")
+        for line in tab_reader:
+            tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
+    os.remove(tab_path)
+
+    # Supplementary ids to fetch from <species>_idmapping.dat:
+    # neXtProt (human only), BioGrid, STRING, KEGG
+    if human: ids = ['neXtProt','BioGrid','STRING','KEGG']   #ids to get from dat_file
+    else: ids = ['BioGrid','STRING','KEGG']
+    unidict = {}
+
+    #keep only ids of interest in dictionaries
+    dat_file = species_dict[species] + "_" + files[1]
+    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
+    with gzip.open(dat_path, "rt") as dat:
+        dat_reader = csv.reader(dat, delimiter="\t")
+        for line in dat_reader:
+            uniprotID = line[0]  #UniProt accession, used as key
+            id_type = line[1]    #type of the corresponding id, key of the sub-dictionary
+            cor_id = line[2]     #corresponding id
+            if "-" not in id_type:  #we don't keep isoforms
+                if id_type in ids and uniprotID in unidict:
+                    if id_type in unidict[uniprotID]:
+                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])  #append when there is already a value
+                    else:
+                        unidict[uniprotID].update({id_type: cor_id})
+                elif id_type in ids:
+                    unidict[uniprotID] = {id_type: cor_id}
+    os.remove(dat_path)
+
+    #add ids from idmapping.dat to the final tab
+    for line in tab[1:]:
+        uniprotID = line[0]
+        if human:
+            if uniprotID in unidict:
+                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
+                if nextprot != '': nextprot = clean_nextprot_id(nextprot, line[0])
+                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'),
+                             access_dictionary(unidict, uniprotID, 'STRING'),
+                             access_dictionary(unidict, uniprotID, 'KEGG')])
+            else:
+                line.extend(["", "", "", ""])
+        else:
+            if uniprotID in unidict:
+                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'),
+                             access_dictionary(unidict, uniprotID, 'STRING'),
+                             access_dictionary(unidict, uniprotID, 'KEGG')])
+            else:
+                line.extend(["", "", ""])
+
+    #add missing neXtProt ids for human
+    if human:
+        #build next_dict from the full neXtProt accession list
+        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
+        next_dict = {}
+        for nextid in nextprot_ids:
+            next_dict[nextid.replace("NX_", "")] = nextid
+        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))
+
+        #fill in the neXtProt column where idmapping.dat had no entry
+        for line in tab[1:]:
+            uniprotID = line[0]
+            nextprotID = line[13]
+            if nextprotID == '' and uniprotID in next_dict:
+                line[13] = next_dict[uniprotID]
+
+    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
+    path = os.path.join(target_directory, output_file)
+
+    with open(path, "w") as out:
+        w = csv.writer(out, delimiter='\t')
+        w.writerows(tab)
+
+    name_dict = {"human": "Homo sapiens", "mouse": "Mus musculus", "rat": "Rattus norvegicus"}
+    name = name_dict[species] + " (" + time.strftime("%d-%m-%Y") + ")"
+
+    data_table_entry = dict(value=species + "_id_mapping_" + time.strftime("%d-%m-%Y"), name=name, path=path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping_tab")
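+
+# Illustration of the resulting human table (columns 0-12 come straight from
+# idmapping_selected.tab, the last four are filled in from idmapping.dat):
+#   UniProt-AC  UniProt-ID   GeneID  ...  neXtProt   BioGrid  STRING  KEGG
+#   P31946      1433B_HUMAN  7529    ...  NX_P31946  ...      ...     hsa:7529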
+
+def download_from_uniprot_ftp(filename, target_directory):
+    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
+    path = os.path.join(target_directory, filename)
+    ftp = ftplib.FTP("ftp.uniprot.org")
+    ftp.login("anonymous", "anonymous")
+    ftp.cwd(ftp_dir)
+    with open(path, 'wb') as dest:
+        ftp.retrbinary("RETR " + filename, dest.write)
+    ftp.quit()
+    return path
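+
+# e.g. download_from_uniprot_ftp("HUMAN_9606_idmapping_selected.tab.gz", target_directory)
+# fetches the file from
+# ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/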
+
+def id_list_from_nextprot_ftp(filename, target_directory):
+    ftp_dir = "pub/current_release/ac_lists/"
+    path = os.path.join(target_directory, filename)
+    ftp = ftplib.FTP("ftp.nextprot.org")
+    ftp.login("anonymous", "anonymous")
+    ftp.cwd(ftp_dir)
+    with open(path, 'wb') as dest:
+        ftp.retrbinary("RETR " + filename, dest.write)
+    ftp.quit()
+    with open(path, 'r') as nextprot_ids:
+        nextprot_ids = nextprot_ids.read().splitlines()
+    return nextprot_ids
+
+#return '' when a key is missing from the dictionary, to avoid a KeyError
+def access_dictionary(dico, key1, key2):
+    if key1 in dico:
+        if key2 in dico[key1]:
+            return dico[key1][key2]
+        else:
+            return ''
+    else:
+        return ''
+
+#if there are several neXtProt ids for one UniProt accession, return the UniProt-like id
+def clean_nextprot_id(next_id, uniprotAc):
+    if len(next_id.split(";")) > 1:
+        tmp = next_id.split(";")
+        if "NX_" + uniprotAc in tmp:
+            return "NX_" + uniprotAc
+        else:
+            return tmp[1]
+    else:
+        return next_id
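+
+# Illustration: clean_nextprot_id("NX_P31946;NX_Q04917", "P31946") returns "NX_P31946";
+# when the accession is not in the list, the second id (tmp[1]) is kept.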
+
+
+#######################################################################################################
+# Main function
+#######################################################################################################
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--hpa", metavar="HPA_OPTION")
+    parser.add_argument("--peptideatlas", metavar="SAMPLE_CATEGORY_ID")
+    parser.add_argument("--id_mapping", metavar="ID_MAPPING_SPECIES")
+    parser.add_argument("-o", "--output")
+    args = parser.parse_args()
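+
+    # Example invocation via the data manager wrapper (illustrative values):
+    #   python resource_building.py --hpa HPA_normal_tissue,HPA_pathology -o output.json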
+
+    data_manager_dict = {}
+    # Extract json file params
+    filename = args.output
+    params = from_json_string(open(filename).read())
+    target_directory = params['output_data'][0]['extra_files_path']
+    os.mkdir(target_directory)
+
+    ## Download source files from HPA
+    if args.hpa is not None:
+        for hpa_tissue in args.hpa.split(","):
+            HPA_sources(data_manager_dict, hpa_tissue, target_directory)
+
+    ## Download source file from PeptideAtlas query
+    if args.peptideatlas is not None:
+        for pa_tissue in args.peptideatlas.split(","):
+            peptide_atlas_sources(data_manager_dict, pa_tissue, target_directory)
+
+    ## Download id mapping source files from UniProt
+    if args.id_mapping is not None:
+        for species in args.id_mapping.split(","):
+            id_mapping_sources(data_manager_dict, species, target_directory)
+
+    #save info to json file
+    with open(filename, 'w') as out:
+        out.write(to_json_string(data_manager_dict))
+
+if __name__ == "__main__":
+ main()
diff -r 000000000000 -r 2de84fea8367 data_manager/resource_building.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/resource_building.xml Thu Oct 18 12:16:09 2018 -0400
@@ -0,0 +1,91 @@
+<!-- resource_building.xml: tool markup not recovered; surviving description: "to create or update reference files for proteore tools" -->
diff -r 000000000000 -r 2de84fea8367 data_manager_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml Thu Oct 18 12:16:09 2018 -0400
@@ -0,0 +1,46 @@
diff -r 000000000000 -r 2de84fea8367 tool-data/id_mapping.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/id_mapping.loc.sample Thu Oct 18 12:16:09 2018 -0400
@@ -0,0 +1,5 @@
+#This file lists the locations of reference files for the id_converter tool
+#
+#human_id_mapping Human (homo sapiens) tool-data/human_id_mapping_file.tsv
+#mouse_id_mapping Mouse (Mus musculus) tool-data/mouse_id_mapping.tsv
+#rat_id_mapping Rat (Rattus norvegicus) tool-data/rat_id_mapping.tsv
diff -r 000000000000 -r 2de84fea8367 tool-data/peptide_atlas.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/peptide_atlas.loc.sample Thu Oct 18 12:16:09 2018 -0400
@@ -0,0 +1,15 @@
+#This file lists the names, values and locations of the reference files
+#giving the number of MS/MS observations per tissue
+#This is a tab separated file (TAB, not 4 spaces!)
+#
+#
+
+
+#Human_Heart_20-07-2018 Human Heart 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Heart_20-07-2018.tsv
+#Human_Liver_20-07-2018 Human Liver 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Liver_20-07-2018.tsv
+#Human_Urine_20-07-2018 Human Urine 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Urine_20-07-2018.tsv
+#Human_Brain_20-07-2018 Human Brain 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Brain_20-07-2018.tsv
+#Human_Kidney_20-07-2018 Human Kidney 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Kidney_20-07-2018.tsv
+#Human_Plasma_20-07-2018 Human Plasma 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Plasma_20-07-2018.tsv
+#Human_CSF_20-07-2018 Human CSF 20/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_CSF_20-07-2018.tsv
+#Human_Liver_23-07-2018 Human Liver 23/07/2018 /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Liver_23-07-2018.tsv
diff -r 000000000000 -r 2de84fea8367 tool-data/proteinatlas.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteinatlas.loc.sample Thu Oct 18 12:16:09 2018 -0400
@@ -0,0 +1,12 @@
+#This file lists the names, values and locations of the reference files
+#for the Get expression data tool
+#This is a tab separated file (TAB, not 4 spaces!)
+#
+#
+#
+#proteinatlas.loc could look something like this:
+#
+#HPA normal tissue 19/07/2018 HPA_normal_tissue /projet/galaxydev/galaxy/tool-data/proteinatlas/projet/galaxydev/galaxy/database/jobs_directory/019/19159/dataset_39307_files/HPA_normal_tissue_19-07-2018.tsv
+#HPA pathology 19/07/2018 HPA_pathology /projet/galaxydev/galaxy/tool-data/proteinatlas/projet/galaxydev/galaxy/database/jobs_directory/019/19160/dataset_39308_files/HPA_pathology_19-07-2018.tsv
+#HPA full atlas 19/07/2018 HPA_full_atlas /projet/galaxydev/galaxy/tool-data/proteinatlas/projet/galaxydev/galaxy/database/jobs_directory/019/19161/dataset_39309_files/HPA_full_atlas_19-07-2018.tsv
+#
diff -r 000000000000 -r 2de84fea8367 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Thu Oct 18 12:16:09 2018 -0400
@@ -0,0 +1,15 @@