annotate data_manager/resource_building.py @ 0:55efb19f0b34 draft

planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
author dchristiany
date Tue, 23 Oct 2018 04:59:28 -0400
parents
children d98f0163932b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
1 """
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
2 The purpose of this script is to create source files from different databases to be used in other proteore tools
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
3 """
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
4
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
5 import os, sys, argparse, requests, time, csv, re
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
6 from io import BytesIO
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
7 from zipfile import ZipFile
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
8 from galaxy.util.json import from_json_string, to_json_string
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
9
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
10 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
11 # General functions
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
12 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def unzip(url, output_file):
    """
    Download a zip archive from `url` and write its first member to `output_file`.

    Only the first entry listed in the archive is extracted; any other
    members are ignored.

    Raises requests.HTTPError when the server answers with an error status.
    """
    response = requests.get(url)
    # Fail early on an HTTP error instead of trying to unzip an error page.
    response.raise_for_status()
    # Context managers guarantee the archive and the output file are closed
    # even when reading or writing fails.
    with ZipFile(BytesIO(response.content)) as archive:
        first_member = archive.namelist()[0]
        payload = archive.read(first_member)
    # The archive member is raw bytes: write it in binary mode so the data is
    # copied untouched (a text-mode file rejects bytes on Python 3).
    with open(output_file, "wb") as output:
        output.write(payload)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
24
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
25 def _add_data_table_entry(data_manager_dict, data_table_entry,data_table):
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
26 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
27 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
28 data_manager_dict['data_tables'][data_table].append(data_table_entry)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
29 return data_manager_dict
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
30
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
31 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
32 # 1. Human Protein Atlas
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
33 # - Normal tissue
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
34 # - Pathology
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
35 # - Full Atlas
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
36 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def HPA_sources(data_manager_dict, tissue, target_directory):
    """
    Download one Human Protein Atlas dataset and register it in the
    "protein_atlas" data table.

    `tissue` selects the dataset and must be one of "HPA_normal_tissue",
    "HPA_pathology" or "HPA_full_atlas". The matching archive is downloaded
    and unzipped to <target_directory>/<tissue>_<dd-mm-YYYY>.tsv, then an
    entry (id, name, value, path) describing that file is appended to
    `data_manager_dict`.

    Raises ValueError when `tissue` is not one of the supported values.
    """
    # Supported datasets: selector -> (display name, download URL).
    hpa_datasets = {
        "HPA_normal_tissue": ("HPA normal tissue",
                              "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"),
        "HPA_pathology": ("HPA pathology",
                          "https://www.proteinatlas.org/download/pathology.tsv.zip"),
        "HPA_full_atlas": ("HPA full atlas",
                           "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"),
    }
    if tissue not in hpa_datasets:
        # Without this guard an unknown selector only fails later with an
        # UnboundLocalError on `url` / `tissue_name`.
        raise ValueError("Unknown Human Protein Atlas source: %r" % (tissue,))
    tissue_name, url = hpa_datasets[tissue]

    # Read the clock once so the file name and the table entry carry the same date.
    today = time.localtime()
    output_file = tissue + "_" + time.strftime("%d-%m-%Y", today) + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)  # download and save file
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y", today)
    tissue_id = tissue_name.replace(" ", "_").replace("/", "-")

    data_table_entry = dict(id=tissue_id, name=tissue_name, value=tissue, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "protein_atlas")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
56
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
57
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
58 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
59 # 2. Peptide Atlas
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
60 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def peptide_atlas_sources(data_manager_dict, tissue, target_directory):
    """
    Download the peptides of one Peptide Atlas sample category, sum the
    observations per UniProt accession and register the resulting file in
    the "peptide_atlas" data table.

    `tissue` must look like "<sample_category_id>-<tissue name>". The result
    is written as a two-column TSV ("Uniprot_AC", "nb_obs") to
    <target_directory>/<tissue name>_<dd-mm-YYYY>.tsv.

    Raises ValueError when `tissue` contains no "-" separator, and
    requests.HTTPError when the server answers with an error status.
    """
    # Extract sample_category_id and the tissue label; validate before any
    # network access so a malformed selector fails fast with a clear message.
    parts = tissue.split("-")
    if len(parts) < 2:
        raise ValueError(
            "tissue must look like '<sample_category_id>-<tissue name>', got %r" % (tissue,))
    sample_category_id = parts[0]
    tissue_label = parts[1]

    # Define PA Human build released number (here early 2018)
    atlas_build_id = "472"
    # Define organism_id (here Human) - to be upgraded when other organisms are added to the project
    organism_id = "2"

    date = time.strftime("%d-%m-%Y")
    output_file = tissue_label + "_" + date + ".tsv"
    # No whitespace is allowed inside the query string ("organism_id=" used
    # to be followed by a stray space).
    query = ("https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?atlas_build_id="
             + atlas_build_id + "&display_options=ShowMappings&organism_id="
             + organism_id + "&sample_category_id=" + sample_category_id
             + "&QUERY_NAME=AT_GetPeptides&output_mode=tsv&apply_action=QUERY")
    download = requests.get(query)
    # Do not parse an HTTP error page as if it were the TSV export.
    download.raise_for_status()
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    # build dictionary by only keeping uniprot accession (not isoform) as key and sum of observations as value
    uni_dict = build_dictionary(cr)

    # columns of data table peptide_atlas
    tissue_id = tissue_label + "_" + date
    tissue_name = tissue_id.replace("-", "/").replace("_", " ")
    path = os.path.join(target_directory, output_file)

    # csv.writer wants a binary file on Python 2 and a text file opened with
    # newline="" on Python 3.
    if sys.version_info[0] < 3:
        out = open(path, "wb")
    else:
        out = open(path, "w", newline="")
    with out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(id=tissue_id, name=tissue_name, value=path, tissue=tissue_label)
    _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
94
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
95 #function to count the number of observations by uniprot id
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def build_dictionary (csv) :
    """
    Sum the number of observations per UniProt accession.

    `csv` is an iterable of rows (lists of strings). Column 2 is read as the
    identifier and column 4 as the observation count. Rows whose identifier
    contains "-" (isoforms) or is not recognised by check_uniprot_access are
    ignored, as are rows with fewer than five columns.

    Returns a dict mapping accession -> summed observation count (int).
    """
    uni_dict = {}
    for line in csv :
        # Blank or truncated rows have no identifier/count columns; skip them
        # instead of failing with an IndexError.
        if len(line) < 5 :
            continue
        accession = line[2]
        if "-" in accession or not check_uniprot_access(accession) :
            continue
        uni_dict[accession] = uni_dict.get(accession, 0) + int(line[4])

    return uni_dict
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
106
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
107 #function to check if an id is a UniProt accession number: returns True or False
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def check_uniprot_access (id) :
    """
    Tell whether `id` begins with a UniProt accession number.

    Returns True when the accession pattern matches at the start of `id`,
    False otherwise.
    """
    accession_regex = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
    # NOTE: re.match anchors at the start only, so trailing characters after a
    # valid accession are accepted.
    return re.match(accession_regex, id) is not None
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
114
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
115
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
116
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
117 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
118 # 3. ID mapping file
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
119 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
120 import ftplib, gzip
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
# Raise the csv module's maximum field size to sys.maxsize for every reader
# created afterwards in this process (module-level side effect at import time).
csv.field_size_limit(sys.maxsize) # to handle big files
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
122
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
123 def id_mapping_sources (data_manager_dict, species, target_directory) :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
124
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
125 human = species == "human"
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
126 species_dict = { "human" : "HUMAN_9606", "mouse" : "MOUSE_10090", "rat" : "RAT_10116" }
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
127 files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
128
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
129 #header
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
130 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
131 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
132
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
133 #print("header ok")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
134
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
135 #selected.tab and keep only ids of interest
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
136 selected_tab_file=species_dict[species]+"_"+files[0]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
137 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
138 with gzip.open(tab_path,"rt") as select :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
139 tab_reader = csv.reader(select,delimiter="\t")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
140 for line in tab_reader :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
141 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
142 os.remove(tab_path)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
143
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
144 #print("selected_tab ok")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
145
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
146 """
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
147 Supplementary ID to get from HUMAN_9606_idmapping.dat :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
148 -NextProt,BioGrid,STRING,KEGG
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
149 """
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
150
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
151 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
152 else : ids = ['BioGrid','STRING','KEGG' ]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
153 unidict = {}
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
154
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
155 #keep only ids of interest in dictionaries
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
156 dat_file=species_dict[species]+"_"+files[1]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
157 dat_path = download_from_uniprot_ftp(dat_file,target_directory)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
158 with gzip.open(dat_path,"rt") as dat :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
159 dat_reader = csv.reader(dat,delimiter="\t")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
160 for line in dat_reader :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
161 uniprotID=line[0] #UniProtID as key
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
162 id_type=line[1] #ID type of corresponding id, key of sub-dictionnary
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
163 cor_id=line[2] #corresponding id
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
164 if "-" not in id_type : #we don't keep isoform
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
165 if id_type in ids and uniprotID in unidict :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
166 if id_type in unidict[uniprotID] :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
167 unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionnary
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
168 else :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
169 unidict[uniprotID].update({ id_type : cor_id })
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
170 elif id_type in ids :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
171 unidict[uniprotID]={id_type : cor_id}
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
172 os.remove(dat_path)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
173
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
174 #print("dat_file ok")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
175
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
176 #add ids from idmapping.dat to the final tab
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
177 for line in tab[1:] :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
178 uniprotID=line[0]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
179 if human :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
180 if uniprotID in unidict :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
181 nextprot = access_dictionary(unidict,uniprotID,'neXtProt')
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
182 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0])
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
183 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
184 access_dictionary(unidict,uniprotID,'KEGG')])
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
185 else :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
186 line.extend(["","","",""])
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
187 else :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
188 if uniprotID in unidict :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
189 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
190 access_dictionary(unidict,uniprotID,'KEGG')])
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
191 else :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
192 line.extend(["","",""])
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
193
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
194 #print ("tab ok")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
195
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
196 #add missing nextprot ID for human
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
197 if human :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
198 #build next_dict
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
199 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
200 next_dict = {}
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
201 for nextid in nextprot_ids :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
202 next_dict[nextid.replace("NX_","")] = nextid
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
203 os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
204
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
205 #add missing nextprot ID
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
206 for line in tab[1:] :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
207 uniprotID=line[0]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
208 nextprotID=line[13]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
209 if nextprotID == '' and uniprotID in next_dict :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
210 line[13]=next_dict[uniprotID]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
211
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
212 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
213 path = os.path.join(target_directory,output_file)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
214
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
215 with open(path,"w") as out :
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
216 w = csv.writer(out,delimiter='\t')
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
217 w.writerows(tab)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
218
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
219 name_dict={"human" : "Homo sapiens", "mouse" : "Mus musculus", "rat" : "Rattus norvegicus"}
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
220 name = name_dict[species]+" "+time.strftime("%d/%m/%Y")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
221 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
222
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
223 data_table_entry = dict(id=id, name = name, value = species, path = path)
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
224 _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping")
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
225
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def download_from_uniprot_ftp(file, target_directory):
    """Download one id-mapping file from the UniProt FTP server.

    file             -- name of the remote file inside the UniProt
                        ``idmapping/by_organism`` directory; also used as the
                        local file name
    target_directory -- existing local directory that receives the file

    Returns the local path of the downloaded file.
    Errors raised by ftplib or by opening the local file propagate to the caller.
    """
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # the context manager guarantees the local file is flushed and closed
        # before the path is handed back to the caller
        with open(path, 'wb') as local_file:
            ftp.retrbinary("RETR " + file, local_file.write)
        ftp.quit()
    finally:
        # releases the socket when any step above failed; harmless after quit()
        ftp.close()
    return path
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
235
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def id_list_from_nextprot_ftp(file, target_directory):
    """Download an accession list from the neXtProt FTP server and read it.

    file             -- name of the remote file inside ``pub/current_release/ac_lists/``;
                        also used as the local file name
    target_directory -- existing local directory that receives the file

    Returns the downloaded file's content as a list of lines (without line
    endings). The downloaded file is left in target_directory; removing it is
    up to the caller.
    Errors raised by ftplib or by opening the local file propagate to the caller.
    """
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # close the local file explicitly so that everything is on disk
        # before it is read back below
        with open(path, 'wb') as local_file:
            ftp.retrbinary("RETR " + file, local_file.write)
        ftp.quit()
    finally:
        # releases the socket when any step above failed; harmless after quit()
        ftp.close()
    with open(path, 'r') as id_file:
        nextprot_ids = id_file.read().splitlines()
    return nextprot_ids
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
247
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
248 #return '' if there's no value in a dictionary, avoid error
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def access_dictionary(dico, key1, key2):
    """Return dico[key1][key2], or '' when either key is missing.

    dico -- two-level mapping (mapping of mappings)
    key1 -- key looked up in dico
    key2 -- key looked up in dico[key1]

    Lets callers fill table cells without having to handle missing entries.
    """
    if key1 in dico and key2 in dico[key1]:
        return dico[key1][key2]
    return ""
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
258
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
259 #if there are several nextprot ID for one uniprotID, return the uniprot like ID
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def clean_nextprot_id(next_id, uniprotAc):
    """Reduce a ';'-separated list of neXtProt IDs to a single ID.

    next_id   -- one neXtProt ID, or several joined with ';'
    uniprotAc -- UniProt accession the IDs were mapped from

    A value without ';' is returned unchanged. Otherwise the ID equal to
    "NX_" + uniprotAc is returned when it is in the list; if it is not, the
    second ID of the list is returned.
    """
    candidates = next_id.split(";")
    if len(candidates) == 1:
        return next_id
    uniprot_like = "NX_" + uniprotAc
    if uniprot_like in candidates:
        return uniprot_like
    # no uniprot-like ID available: fall back on the second entry of the list
    return candidates[1]
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
269
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
270
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
271 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
272 # Main function
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
273 #######################################################################################################
55efb19f0b34 planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty
dchristiany
parents:
diff changeset
def main():
    """Command-line entry point: build the requested resource files.

    Options (each value is a comma-separated list; an omitted option is skipped):
      --hpa           items passed one by one to HPA_sources
      --peptideatlas  items passed one by one to peptide_atlas_sources
      --id_mapping    species passed one by one to id_mapping_sources
      -o / --output   JSON file; it is first read to find the target directory
                      (params['output_data'][0]['extra_files_path']) and is
                      finally overwritten with the data manager dictionary
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar = ("HPA_OPTION"))
    parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
    parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES"))
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    data_manager_dict = {}
    # Extract json file params
    filename = args.output
    with open(filename) as params_file:
        params = from_json_string(params_file.read())
    target_directory = params['output_data'][0]['extra_files_path']
    # tolerate an already existing directory and create missing parents
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)

    # argparse sets an option that was not given to None, so a plain
    # "is not None" test is enough to know whether a resource was requested

    ## Download source files from HPA
    if args.hpa is not None:
        for hpa_tissue in args.hpa.split(","):
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from Peptide Atlas query
    if args.peptideatlas is not None:
        for pa_tissue in args.peptideatlas.split(","):
            peptide_atlas_sources(data_manager_dict, pa_tissue, target_directory)

    ## Download ID_mapping source file from Uniprot
    if args.id_mapping is not None:
        for species in args.id_mapping.split(","):
            id_mapping_sources(data_manager_dict, species, target_directory)

    #save info to json file
    # text mode: the JSON payload is written as a string (a binary-mode file
    # rejects str on Python 3)
    with open(filename, 'w') as json_out:
        json_out.write(to_json_string(data_manager_dict))


if __name__ == "__main__":
    main()