diff data_manager/resource_building.py @ 0:2de84fea8367 draft

planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
author dchristiany
date Thu, 18 Oct 2018 12:16:09 -0400
parents
children 9c75521e4a64
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/resource_building.py	Thu Oct 18 12:16:09 2018 -0400
@@ -0,0 +1,323 @@
+"""
+The purpose of this script is to create source files from different databases to be used in other tools
+"""
+
+import os, sys, argparse, requests, time, csv, re, ftplib, gzip
+from io import BytesIO
+from zipfile import ZipFile
+from galaxy.util.json import from_json_string, to_json_string
+
+#######################################################################################################
+# General functions
+#######################################################################################################
+def unzip(url, output_file):
+    """
+    Get a zip file's content from a link and extract its first member to output_file
+    """
+    content = requests.get(url)
+    zipfile = ZipFile(BytesIO(content.content))
+    with open(output_file, "wb") as output:
+        output.write(zipfile.open(zipfile.namelist()[0]).read())
+
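+# Shared helper: accumulates one entry per Galaxy data table, producing e.g.
+# {'data_tables': {'proteinatlas': [{'value': ..., 'name': ..., 'path': ...}]}}
+# in the JSON consumed by Galaxy's data manager machinery.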
+def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
+    data_manager_dict['data_tables'][data_table].append(data_table_entry)
+    return data_manager_dict
+
+#######################################################################################################
+# 1. Human Protein Atlas
+#    - Normal tissue
+#    - Pathology
+#    - Full Atlas
+#######################################################################################################
+def HPA_sources(data_manager_dict, tissue, target_directory):
+    if tissue == "HPA_normal_tissue":
+        tissue_name = "HPA normal tissue"
+        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
+    elif tissue == "HPA_pathology":
+        tissue_name = "HPA pathology"
+        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
+    elif tissue == "HPA_full_atlas":
+        tissue_name = "HPA full atlas"
+        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
+    else:
+        raise ValueError("Unknown HPA option: " + tissue)
+    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
+    path = os.path.join(target_directory, output_file)
+    unzip(url, path)
+    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
+    data_table_entry = dict(value=tissue, name=tissue_name, path=path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteinatlas")
+
+
+#######################################################################################################
+# 2. Peptide Atlas
+#######################################################################################################
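+# `tissue` is expected as "<sample_category_id>-<tissue_name>" as passed by the
+# data manager wrapper (hypothetical example: "40-Human_Brain"); the numeric prefix
+# selects the PeptideAtlas sample category and the suffix names the output file.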
+def peptide_atlas_sources(data_manager_dict, tissue, target_directory):
+    # PeptideAtlas Human build release number (here: early 2018)
+    atlas_build_id = "472"
+    # Organism id (here: Human) - to be updated when other organisms are added to the project
+    organism_id = "2"
+    # Extract sample_category_id and output filename
+    sample_category_id = tissue.split("-")[0]
+    output_file = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y") + ".tsv"
+    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?atlas_build_id=" + \
+            atlas_build_id + "&display_options=ShowMappings&organism_id=" + \
+            organism_id + "&sample_category_id=" + sample_category_id + \
+            "&QUERY_NAME=AT_GetPeptides&output_mode=tsv&apply_action=QUERY"
+    download = requests.get(query)
+    decoded_content = download.content.decode('utf-8')
+    cr = csv.reader(decoded_content.splitlines(), delimiter='\t')
+
+    #build a dictionary keeping only UniProt accessions (not isoforms) as keys and the sum of observations as values
+    uni_dict = build_dictionary(cr)
+
+    tissue_value = tissue.split("-")[1]
+    tissue = tissue_value + "_" + time.strftime("%d-%m-%Y")
+    tissue_name = " ".join(tissue_value.split("_")) + " " + time.strftime("%d/%m/%Y")
+    path = os.path.join(target_directory, output_file)
+
+    with open(path, "wb") as out:
+        w = csv.writer(out, delimiter='\t')
+        w.writerow(["Uniprot_AC", "nb_obs"])
+        w.writerows(uni_dict.items())
+
+    data_table_entry = dict(value=path, name=tissue_name, tissue=tissue)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas")
+
+#count the number of observations per UniProt accession (line[2] holds the accession,
+#line[4] the observation count); isoform accessions are skipped
+def build_dictionary(csv_reader):
+    uni_dict = {}
+    for line in csv_reader:
+        if "-" not in line[2] and check_uniprot_access(line[2]):
+            if line[2] in uni_dict:
+                uni_dict[line[2]] += int(line[4])
+            else:
+                uni_dict[line[2]] = int(line[4])
+
+    return uni_dict
+
+#check whether an id is a UniProt accession number: returns True or False
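+# The regex below is the UniProt accession format documented at
+# https://www.uniprot.org/help/accession_numbers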
+def check_uniprot_access(id):
+    uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
+    return uniprot_pattern.match(id) is not None
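+# e.g. check_uniprot_access("P31946") -> True, check_uniprot_access("ENSG00000139618") -> False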
+
+
+
+#######################################################################################################
+# 3. ID mapping file
+#######################################################################################################
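+# Builds one tab-separated mapping file per species by merging UniProt's
+# idmapping_selected.tab.gz (bulk cross-references) with idmapping.dat.gz
+# (neXtProt, BioGrid, STRING, KEGG), then back-filling missing neXtProt
+# accessions from the neXtProt accession list (human only).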
+csv.field_size_limit(sys.maxsize) # to handle very large fields in the mapping files
+
+def id_mapping_sources(data_manager_dict, species, target_directory):
+
+    human = species == "human"
+    species_dict = {"human": "HUMAN_9606", "mouse": "MOUSE_10090", "rat": "RAT_10116"}
+    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]
+
+    #header
+    if human:
+        tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
+    else:
+        tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+
+    #print("header ok")
+
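+    # 0-based columns kept below (column layout as documented in UniProt's idmapping README):
+    # 0 UniProtKB-AC, 1 UniProtKB-ID, 2 GeneID, 3 RefSeq, 4 GI, 5 PDB, 6 GO,
+    # 11 PIR, 13 MIM, 14 UniGene, 18 Ensembl, 19 Ensembl_TRS, 20 Ensembl_PRO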
+    #read idmapping_selected.tab and keep only the columns of interest
+    selected_tab_file = species_dict[species] + "_" + files[0]
+    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
+    with gzip.open(tab_path, "rt") as select:
+        tab_reader = csv.reader(select, delimiter="\t")
+        for line in tab_reader:
+            tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
+    os.remove(tab_path)
+
+    """
+    Supplementary ID to get from HUMAN_9606_idmapping.dat :
+    -NextProt,BioGrid,STRING,KEGG
+    """
+
+    if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ]   #ids to get from dat_file
+    else : ids = ['BioGrid','STRING','KEGG' ]
+    unidict = {}
+
+    #keep only ids of interest in dictionaries
+    dat_file = species_dict[species] + "_" + files[1]
+    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
+    with gzip.open(dat_path, "rt") as dat:
+        dat_reader = csv.reader(dat, delimiter="\t")
+        for line in dat_reader:
+            uniprotID = line[0]     #UniProt accession, used as key
+            id_type = line[1]       #ID type, key of the sub-dictionary
+            cor_id = line[2]        #corresponding id
+            if "-" not in uniprotID:                    #skip isoform accessions
+                if id_type in ids and uniprotID in unidict:
+                    if id_type in unidict[uniprotID]:
+                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])   #append when there is already a value
+                    else:
+                        unidict[uniprotID].update({id_type: cor_id})
+                elif id_type in ids:
+                    unidict[uniprotID] = {id_type: cor_id}
+    os.remove(dat_path)
+
+    #add ids from idmapping.dat to the final tab
+    for line in tab[1:]:
+        uniprotID = line[0]
+        if human:
+            if uniprotID in unidict:
+                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
+                if nextprot != '':
+                    nextprot = clean_nextprot_id(nextprot, line[0])
+                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'),
+                             access_dictionary(unidict, uniprotID, 'STRING'),
+                             access_dictionary(unidict, uniprotID, 'KEGG')])
+            else:
+                line.extend(["", "", "", ""])
+        else:
+            if uniprotID in unidict:
+                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'),
+                             access_dictionary(unidict, uniprotID, 'STRING'),
+                             access_dictionary(unidict, uniprotID, 'KEGG')])
+            else:
+                line.extend(["", "", ""])
+
+    #add missing neXtProt IDs for human
+    if human:
+        #build next_dict
+        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
+        next_dict = {}
+        for nextid in nextprot_ids:
+            next_dict[nextid.replace("NX_", "")] = nextid
+        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))
+
+        #fill in missing neXtProt IDs (column 13 of the tab)
+        for line in tab[1:]:
+            uniprotID = line[0]
+            nextprotID = line[13]
+            if nextprotID == '' and uniprotID in next_dict:
+                line[13] = next_dict[uniprotID]
+
+    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
+    path = os.path.join(target_directory, output_file)
+
+    with open(path, "w") as out:
+        w = csv.writer(out, delimiter='\t')
+        w.writerows(tab)
+
+    name_dict = {"human": "Homo sapiens", "mouse": "Mus musculus", "rat": "Rattus norvegicus"}
+    name = name_dict[species] + " (" + time.strftime("%d-%m-%Y") + ")"
+
+    data_table_entry = dict(value=species + "_id_mapping_" + time.strftime("%d-%m-%Y"), name=name, path=path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping_tab")
+
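+# Fetch one idmapping file (e.g. HUMAN_9606_idmapping.dat.gz) from the UniProt FTP
+# by_organism directory and return its local path.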
+def download_from_uniprot_ftp(file, target_directory):
+    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
+    path = os.path.join(target_directory, file)
+    ftp = ftplib.FTP("ftp.uniprot.org")
+    ftp.login("anonymous", "anonymous")
+    ftp.cwd(ftp_dir)
+    with open(path, 'wb') as output:
+        ftp.retrbinary("RETR " + file, output.write)
+    ftp.quit()
+    return path
+
+def id_list_from_nextprot_ftp(file, target_directory):
+    ftp_dir = "pub/current_release/ac_lists/"
+    path = os.path.join(target_directory, file)
+    ftp = ftplib.FTP("ftp.nextprot.org")
+    ftp.login("anonymous", "anonymous")
+    ftp.cwd(ftp_dir)
+    with open(path, 'wb') as output:
+        ftp.retrbinary("RETR " + file, output.write)
+    ftp.quit()
+    with open(path, 'r') as id_file:
+        nextprot_ids = id_file.read().splitlines()
+    return nextprot_ids
+
+#return '' when a key is missing from the dictionary, to avoid a KeyError
+def access_dictionary(dico, key1, key2):
+    if key1 in dico:
+        if key2 in dico[key1]:
+            return dico[key1][key2]
+        else:
+            return ''
+    else:
+        return ''
+
+#if there are several neXtProt IDs for one UniProt accession, prefer the UniProt-like one
+def clean_nextprot_id(next_id, uniprotAc):
+    if len(next_id.split(";")) > 1:
+        tmp = next_id.split(";")
+        if "NX_" + uniprotAc in tmp:
+            return "NX_" + uniprotAc
+        else:
+            return tmp[1]
+    else:
+        return next_id
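+# e.g. clean_nextprot_id("NX_A0A024R161;NX_P31946", "P31946") -> "NX_P31946" (hypothetical IDs)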
+
+
+#######################################################################################################
+# Main function
+#######################################################################################################
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--hpa", metavar=("HPA_OPTION"))
+    parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
+    parser.add_argument("--id_mapping", metavar=("ID_MAPPING_SPECIES"))
+    parser.add_argument("-o", "--output")
+    args = parser.parse_args()
+
+    data_manager_dict = {}
+    # Extract the params Galaxy wrote to the output json file
+    filename = args.output
+    with open(filename) as fh:
+        params = from_json_string(fh.read())
+    target_directory = params['output_data'][0]['extra_files_path']
+    os.mkdir(target_directory)
+
+    ## Download source files from HPA
+    hpa = args.hpa
+    if hpa is not None:
+        hpa = hpa.split(",")
+        for hpa_tissue in hpa:
+            HPA_sources(data_manager_dict, hpa_tissue, target_directory)
+
+    ## Download source file from the PeptideAtlas query
+    peptide_atlas = args.peptideatlas
+    if peptide_atlas is not None:
+        peptide_atlas = peptide_atlas.split(",")
+        for pa_tissue in peptide_atlas:
+            peptide_atlas_sources(data_manager_dict, pa_tissue, target_directory)
+
+    ## Download ID mapping source files from UniProt
+    id_mapping = args.id_mapping
+    if id_mapping is not None:
+        id_mapping = id_mapping.split(",")
+        for species in id_mapping:
+            id_mapping_sources(data_manager_dict, species, target_directory)
+
+    #save data table info to the json output file
+    with open(filename, 'w') as out:
+        out.write(to_json_string(data_manager_dict))
+
+if __name__ == "__main__":
+    main()