comparison data_manager/resource_building.py @ 0:55efb19f0b34 draft
planemo upload commit e5e768b479ddc6b36270a1b5b0443a4c80d693bc-dirty

author:   dchristiany
date:     Tue, 23 Oct 2018 04:59:28 -0400
parents:
children: d98f0163932b
comparing -1:000000000000 with 0:55efb19f0b34
1 """ | |
2 The purpose of this script is to create source files from different databases to be used in other proteore tools | |
3 """ | |
4 | |
5 import os, sys, argparse, requests, time, csv, re | |
6 from io import BytesIO | |
7 from zipfile import ZipFile | |
8 from galaxy.util.json import from_json_string, to_json_string | |
9 | |
10 ####################################################################################################### | |
11 # General functions | |
12 ####################################################################################################### | |
13 def unzip(url, output_file): | |
14 """ | |
15 Get a zip file content from a link and unzip | |
16 """ | |
17 content = requests.get(url) | |
18 zipfile = ZipFile(BytesIO(content.content)) | |
19 output_content = "" | |
20 output_content += zipfile.open(zipfile.namelist()[0]).read() | |
21 output = open(output_file, "w") | |
22 output.write(output_content) | |
23 output.close() | |
24 | |
25 def _add_data_table_entry(data_manager_dict, data_table_entry,data_table): | |
26 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) | |
27 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) | |
28 data_manager_dict['data_tables'][data_table].append(data_table_entry) | |
29 return data_manager_dict | |
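# For reference, the populated structure looks like this (illustrative values only):
# {"data_tables": {"protein_atlas": [{"id": "...", "name": "...", "value": "...", "path": "..."}]}}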

#######################################################################################################
# 1. Human Protein Atlas
#    - Normal tissue
#    - Pathology
#    - Full Atlas
#######################################################################################################
def HPA_sources(data_manager_dict, tissue, target_directory):
    if tissue == "HPA_normal_tissue":
        tissue_name = "HPA normal tissue"
        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
    elif tissue == "HPA_pathology":
        tissue_name = "HPA pathology"
        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
    elif tissue == "HPA_full_atlas":
        tissue_name = "HPA full atlas"
        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"

    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)    #download and save file
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
    tissue_id = tissue_name.replace(" ", "_").replace("/", "-")

    data_table_entry = dict(id=tissue_id, name=tissue_name, value=tissue, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "protein_atlas")
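# Illustrative example: for "HPA_normal_tissue" downloaded on 23/10/2018 this yields
# id "HPA_normal_tissue_23-10-2018" and name "HPA normal tissue 23/10/2018".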


#######################################################################################################
# 2. Peptide Atlas
#######################################################################################################
def peptide_atlas_sources(data_manager_dict, tissue, target_directory):
    # Define the PeptideAtlas Human build number (here: the early-2018 release)
    atlas_build_id = "472"
    # Define organism_id (here: Human) - to be updated when other organisms are added to the project
    organism_id = "2"
    # Extract sample_category_id and output filename
    sample_category_id = tissue.split("-")[0]
    output_file = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?atlas_build_id=" + \
            atlas_build_id + "&display_options=ShowMappings&organism_id=" + \
            organism_id + "&sample_category_id=" + sample_category_id + \
            "&QUERY_NAME=AT_GetPeptides&output_mode=tsv&apply_action=QUERY"
    download = requests.get(query)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    #build a dictionary keeping only the UniProt accession (not isoforms) as key and the sum of observations as value
    uni_dict = build_dictionary(cr)

    #columns of data table peptide_atlas
    date = time.strftime("%d-%m-%Y")
    tissue = tissue.split("-")[1]
    tissue_id = tissue + "_" + date
    tissue_name = tissue_id.replace("-", "/").replace("_", " ")
    path = os.path.join(target_directory, output_file)

    with open(path, "wb") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(id=tissue_id, name=tissue_name, value=path, tissue=tissue)
    _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas")

#function to count the number of observations per UniProt accession
def build_dictionary(csv):
    uni_dict = {}
    for line in csv:
        if "-" not in line[2] and check_uniprot_access(line[2]):
            if line[2] in uni_dict:
                uni_dict[line[2]] += int(line[4])
            else:
                uni_dict[line[2]] = int(line[4])

    return uni_dict
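# Illustrative example for build_dictionary: two rows carrying accession "P31946" in
# the third column with observation counts 5 and 3 in the fifth column accumulate to
# uni_dict["P31946"] == 8.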

#function to check whether an id is a UniProt accession number: returns True or False
def check_uniprot_access(id):
    uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
    if uniprot_pattern.match(id):
        return True
    else:
        return False
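# Illustrative only: accessions such as "P31946" or "A0A024R161" match this pattern,
# whereas an Ensembl identifier like "ENSP00000354587" does not.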


#######################################################################################################
# 3. ID mapping file
#######################################################################################################
import ftplib, gzip
csv.field_size_limit(sys.maxsize) # to handle big files

def id_mapping_sources(data_manager_dict, species, target_directory):

    human = species == "human"
    species_dict = {"human": "HUMAN_9606", "mouse": "MOUSE_10090", "rat": "RAT_10116"}
    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]

    #header
    if human: tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
    else: tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]

    #print("header ok")

    #read idmapping_selected.tab and keep only the ids of interest
    selected_tab_file = species_dict[species] + "_" + files[0]
    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
    with gzip.open(tab_path, "rt") as select:
        tab_reader = csv.reader(select, delimiter="\t")
        for line in tab_reader:
            tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
    os.remove(tab_path)

    #print("selected_tab ok")

    # Supplementary IDs to get from HUMAN_9606_idmapping.dat:
    # neXtProt, BioGrid, STRING, KEGG

    if human: ids = ['neXtProt','BioGrid','STRING','KEGG']    #ids to get from the dat file
    else: ids = ['BioGrid','STRING','KEGG']
    unidict = {}

    #keep only the ids of interest in dictionaries
    dat_file = species_dict[species] + "_" + files[1]
    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
    with gzip.open(dat_path, "rt") as dat:
        dat_reader = csv.reader(dat, delimiter="\t")
        for line in dat_reader:
            uniprotID = line[0]    #UniProt AC, used as key
            id_type = line[1]      #ID type of the corresponding id, key of the sub-dictionary
            cor_id = line[2]       #corresponding id
            if "-" not in id_type:    #skip isoforms
                if id_type in ids and uniprotID in unidict:
                    if id_type in unidict[uniprotID]:
                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])    #there is already a value in the dictionary
                    else:
                        unidict[uniprotID].update({id_type: cor_id})
                elif id_type in ids:
                    unidict[uniprotID] = {id_type: cor_id}
    os.remove(dat_path)

    #print("dat_file ok")

    #add ids from idmapping.dat to the final tab
    for line in tab[1:]:
        uniprotID = line[0]
        if human:
            if uniprotID in unidict:
                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
                if nextprot != '': nextprot = clean_nextprot_id(nextprot, line[0])
                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'), access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", "", ""])
        else:
            if uniprotID in unidict:
                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'), access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", ""])

    #print("tab ok")

    #add missing neXtProt IDs for human
    if human:
        #build next_dict
        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
        next_dict = {}
        for nextid in nextprot_ids:
            next_dict[nextid.replace("NX_", "")] = nextid
        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))

        #add missing neXtProt IDs
        for line in tab[1:]:
            uniprotID = line[0]
            nextprotID = line[13]
            if nextprotID == '' and uniprotID in next_dict:
                line[13] = next_dict[uniprotID]

    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerows(tab)

    name_dict = {"human": "Homo sapiens", "mouse": "Mus musculus", "rat": "Rattus norvegicus"}
    name = name_dict[species] + " " + time.strftime("%d/%m/%Y")
    id = species + "_id_mapping_" + time.strftime("%d-%m-%Y")

    data_table_entry = dict(id=id, name=name, value=species, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping")

def download_from_uniprot_ftp(file, target_directory):
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    ftp.retrbinary("RETR " + file, open(path, 'wb').write)
    ftp.quit()
    return (path)
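# For instance, the human case above resolves to downloading HUMAN_9606_idmapping_selected.tab.gz
# and HUMAN_9606_idmapping.dat.gz from
# ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/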

def id_list_from_nextprot_ftp(file, target_directory):
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    ftp.retrbinary("RETR " + file, open(path, 'wb').write)
    ftp.quit()
    with open(path, 'r') as nextprot_ids:
        nextprot_ids = nextprot_ids.read().splitlines()
    return (nextprot_ids)

#return '' if the key is missing from the dictionary, avoiding a KeyError
def access_dictionary(dico, key1, key2):
    if key1 in dico:
        if key2 in dico[key1]:
            return (dico[key1][key2])
        else:
            return ("")
            #print (key2,"not in ",dico,"[",key1,"]")
    else:
        return ('')
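# Illustrative only: access_dictionary({"P31946": {"KEGG": "hsa:7529"}}, "P31946", "STRING")
# returns '' because the "STRING" sub-key is absent.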

#if there are several neXtProt IDs for one UniProt AC, return the UniProt-like ID
def clean_nextprot_id(next_id, uniprotAc):
    if len(next_id.split(";")) > 1:
        tmp = next_id.split(";")
        if "NX_" + uniprotAc in tmp:
            return ("NX_" + uniprotAc)
        else:
            return (tmp[1])
    else:
        return (next_id)
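# Illustrative only: clean_nextprot_id("NX_A6NER0;NX_P63104", "P63104") returns "NX_P63104",
# while clean_nextprot_id("NX_A6NER0;NX_Q9UQ13", "P63104") falls back to the second entry, "NX_Q9UQ13".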


#######################################################################################################
# Main function
#######################################################################################################
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar="HPA_OPTION")
    parser.add_argument("--peptideatlas", metavar="SAMPLE_CATEGORY_ID")
    parser.add_argument("--id_mapping", metavar="ID_MAPPING_SPECIES")
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    data_manager_dict = {}
    # Extract json file params
    filename = args.output
    params = from_json_string(open(filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    ## Download source files from HPA
    hpa = args.hpa
    if hpa is not None:
        #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
        hpa = hpa.split(",")
        for hpa_tissue in hpa:
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from Peptide Atlas query
    peptide_atlas = args.peptideatlas
    if peptide_atlas is not None:
        #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
        peptide_atlas = peptide_atlas.split(",")
        for pa_tissue in peptide_atlas:
            peptide_atlas_sources(data_manager_dict, pa_tissue, target_directory)

    ## Download ID_mapping source file from Uniprot
    id_mapping = args.id_mapping
    if id_mapping is not None:
        id_mapping = id_mapping.split(",")
        for species in id_mapping:
            id_mapping_sources(data_manager_dict, species, target_directory)

    #save info to json file
    filename = args.output
    open(filename, 'wb').write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()
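
# Hypothetical command line (this script is normally invoked by the Galaxy data manager
# framework, which supplies the JSON parameter file passed to -o/--output):
#   python resource_building.py --hpa HPA_normal_tissue,HPA_pathology -o galaxy_params.json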