comparison data_manager/resource_building.py @ 0:2de84fea8367 draft

planemo upload commit d703392579d96e480c6461ce679516b12cefb3de-dirty
author dchristiany
date Thu, 18 Oct 2018 12:16:09 -0400
parents
children 9c75521e4a64
"""
The purpose of this script is to create source files from different databases to be used in other tools
"""
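
# Illustrative invocation (a sketch only; in practice the Galaxy data manager wrapper
# supplies these arguments and the JSON parameter/output file given to -o):
#   python resource_building.py --hpa HPA_normal_tissue,HPA_pathology \
#       --peptideatlas 40-Human_Brain --id_mapping human -o output.json
# The --peptideatlas value above is a made-up example of the "<sample_category_id>-<tissue_name>"
# format expected by peptide_atlas_sources(); the --hpa and --id_mapping values are taken from
# the options handled below.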

import os, sys, argparse, requests, time, csv, re, ftplib, gzip
from io import BytesIO
from zipfile import ZipFile
from galaxy.util.json import from_json_string, to_json_string

#######################################################################################################
# General functions
#######################################################################################################
def unzip(url, output_file):
    """
    Get a zip file from a URL and write its first member to output_file
    """
    content = requests.get(url)
    zipfile = ZipFile(BytesIO(content.content))
    output_content = zipfile.open(zipfile.namelist()[0]).read()
    with open(output_file, "wb") as output:
        output.write(output_content)

def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)
    return data_manager_dict
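
# For illustration only: after one HPA_sources() call the dictionary passed around looks
# roughly like this (date and path values are hypothetical):
#   {"data_tables": {"proteinatlas": [{"value": "HPA_normal_tissue",
#                                      "name": "HPA normal tissue 18/10/2018",
#                                      "path": "/path/to/HPA_normal_tissue_18-10-2018.tsv"}]}}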

#######################################################################################################
# 1. Human Protein Atlas
#    - Normal tissue
#    - Pathology
#    - Full Atlas
#######################################################################################################
def HPA_sources(data_manager_dict, tissue, target_directory):
    if tissue == "HPA_normal_tissue":
        tissue_name = "HPA normal tissue"
        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
    elif tissue == "HPA_pathology":
        tissue_name = "HPA pathology"
        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
    elif tissue == "HPA_full_atlas":
        tissue_name = "HPA full atlas"
        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)
    print(str(os.path.isfile(path)))
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
    data_table_entry = dict(value=tissue, name=tissue_name, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteinatlas")


#######################################################################################################
# 2. Peptide Atlas
#######################################################################################################
def peptide_atlas_sources(data_manager_dict, tissue, target_directory):
    # Define the PeptideAtlas Human build number (here: early 2018 release)
    atlas_build_id = "472"
    # Define organism_id (here: Human) - to be upgraded when other organisms are added to the project
    organism_id = "2"
    # Extract sample_category_id and build the output file name
    sample_category_id = tissue.split("-")[0]
    output_file = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetPeptides?atlas_build_id=" + \
            atlas_build_id + "&display_options=ShowMappings&organism_id=" + \
            organism_id + "&sample_category_id=" + sample_category_id + \
            "&QUERY_NAME=AT_GetPeptides&output_mode=tsv&apply_action=QUERY"
    download = requests.get(query)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    # Build a dictionary keeping only UniProt accessions (not isoforms) as keys
    # and the sum of observations as values
    uni_dict = build_dictionary(cr)

    tissue_id = "_".join([atlas_build_id, organism_id, sample_category_id, time.strftime("%d-%m-%Y")])
    tissue_value = tissue.split("-")[1]
    tissue = tissue.split("-")[1] + "_" + time.strftime("%d-%m-%Y")
    tissue_name = " ".join(tissue_value.split("_")) + " " + time.strftime("%d/%m/%Y")
    path = os.path.join(target_directory, output_file)

    with open(path, "wb") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(value=path, name=tissue_name, tissue=tissue)
    _add_data_table_entry(data_manager_dict, data_table_entry, "peptide_atlas")

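# For illustration only, the TSV written above looks like this (accessions and counts are made up):
#   Uniprot_AC    nb_obs
#   P04637        42
#   Q9Y6K9        7
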
# Count the number of observations per UniProt accession
def build_dictionary(csv_reader):
    uni_dict = {}
    for line in csv_reader:
        if "-" not in line[2] and check_uniprot_access(line[2]):
            if line[2] in uni_dict:
                uni_dict[line[2]] += int(line[4])
            else:
                uni_dict[line[2]] = int(line[4])

    return uni_dict

# Check whether an identifier is a UniProt accession number: return True or False
def check_uniprot_access(id):
    uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
    if uniprot_pattern.match(id):
        return True
    else:
        return False
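
# Illustrative behaviour of the pattern above (identifiers are examples only):
#   check_uniprot_access("P04637")          -> True   (UniProt accession format)
#   check_uniprot_access("ENSP00000269305") -> False  (Ensembl protein identifier format)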


#######################################################################################################
# 3. ID mapping file
#######################################################################################################
csv.field_size_limit(sys.maxsize)  # to handle big files

def id_mapping_sources(data_manager_dict, species, target_directory):

    human = species == "human"
    species_dict = {"human": "HUMAN_9606", "mouse": "MOUSE_10090", "rat": "RAT_10116"}
    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]

    # header
    if human:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "neXtProt", "BioGrid", "STRING", "KEGG"]]
    else:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "BioGrid", "STRING", "KEGG"]]

    # read idmapping_selected.tab and keep only the columns of interest
    selected_tab_file = species_dict[species] + "_" + files[0]
    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
    with gzip.open(tab_path, "rt") as select:
        tab_reader = csv.reader(select, delimiter="\t")
        for line in tab_reader:
            tab.append([line[i] for i in [0, 1, 2, 3, 4, 5, 6, 11, 13, 14, 18, 19, 20]])
    os.remove(tab_path)

    """
    Supplementary IDs to get from HUMAN_9606_idmapping.dat:
    neXtProt, BioGrid, STRING, KEGG
    """

    if human:
        ids = ['neXtProt', 'BioGrid', 'STRING', 'KEGG']  # ids to get from the dat file
    else:
        ids = ['BioGrid', 'STRING', 'KEGG']
    unidict = {}

    # keep only the ids of interest, in a dictionary of dictionaries
    dat_file = species_dict[species] + "_" + files[1]
    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
    with gzip.open(dat_path, "rt") as dat:
        dat_reader = csv.reader(dat, delimiter="\t")
        for line in dat_reader:
            uniprotID = line[0]  # UniProt accession, used as key
            id_type = line[1]    # ID type of the corresponding id, key of the sub-dictionary
            cor_id = line[2]     # corresponding id
            if "-" not in id_type:  # we don't keep isoforms
                if id_type in ids and uniprotID in unidict:
                    if id_type in unidict[uniprotID]:
                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])  # there is already a value in the dictionary
                    else:
                        unidict[uniprotID].update({id_type: cor_id})
                elif id_type in ids:
                    unidict[uniprotID] = {id_type: cor_id}
    os.remove(dat_path)

    # add ids from idmapping.dat to the final tab
    for line in tab[1:]:
        uniprotID = line[0]
        if human:
            if uniprotID in unidict:
                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
                if nextprot != '':
                    nextprot = clean_nextprot_id(nextprot, line[0])
                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'), access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", "", ""])
        else:
            if uniprotID in unidict:
                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'), access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", ""])

    # add missing neXtProt IDs for human
    if human:
        # build next_dict
        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
        next_dict = {}
        for nextid in nextprot_ids:
            next_dict[nextid.replace("NX_", "")] = nextid
        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))

        # add missing neXtProt IDs
        for line in tab[1:]:
            uniprotID = line[0]
            nextprotID = line[13]
            if nextprotID == '' and uniprotID in next_dict:
                line[13] = next_dict[uniprotID]

    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerows(tab)

    name_dict = {"human": "Homo sapiens", "mouse": "Mus musculus", "rat": "Rattus norvegicus"}
    name = name_dict[species] + " (" + time.strftime("%d-%m-%Y") + ")"

    data_table_entry = dict(value=species + "_id_mapping_" + time.strftime("%d-%m-%Y"), name=name, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "id_mapping_tab")

def download_from_uniprot_ftp(file, target_directory):
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as output:
        ftp.retrbinary("RETR " + file, output.write)
    ftp.quit()
    return path

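# For illustration only (this file name is one actually requested by id_mapping_sources;
# the target directory here is hypothetical):
#   local_path = download_from_uniprot_ftp("HUMAN_9606_idmapping_selected.tab.gz", "/tmp")
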
def id_list_from_nextprot_ftp(file, target_directory):
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as output:
        ftp.retrbinary("RETR " + file, output.write)
    ftp.quit()
    with open(path, 'r') as nextprot_ids_file:
        nextprot_ids = nextprot_ids_file.read().splitlines()
    return nextprot_ids

# Return '' if there is no value in the dictionary, to avoid a KeyError
def access_dictionary(dico, key1, key2):
    if key1 in dico:
        if key2 in dico[key1]:
            return dico[key1][key2]
        else:
            return ""
    else:
        return ""

# If there are several neXtProt IDs for one UniProt accession, return the one matching the accession
def clean_nextprot_id(next_id, uniprotAc):
    if len(next_id.split(";")) > 1:
        tmp = next_id.split(";")
        if "NX_" + uniprotAc in tmp:
            return "NX_" + uniprotAc
        else:
            return tmp[1]
    else:
        return next_id
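
# Illustrative behaviour (accessions are examples only):
#   clean_nextprot_id("NX_P04637;NX_A0A024R2Q3", "P04637") -> "NX_P04637"
#   clean_nextprot_id("NX_Q9Y6K9", "Q9Y6K9")               -> "NX_Q9Y6K9"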


#######################################################################################################
# Main function
#######################################################################################################
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar=("HPA_OPTION"))
    parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
    parser.add_argument("--id_mapping", metavar=("ID_MAPPING_SPECIES"))
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    data_manager_dict = {}
    # Extract the json file params
    filename = args.output
    params = from_json_string(open(filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    ## Download source files from HPA
    hpa = args.hpa
    if hpa is not None:
        hpa = hpa.split(",")
        for hpa_tissue in hpa:
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from a PeptideAtlas query
    peptide_atlas = args.peptideatlas
    if peptide_atlas is not None:
        peptide_atlas = peptide_atlas.split(",")
        for pa_tissue in peptide_atlas:
            peptide_atlas_sources(data_manager_dict, pa_tissue, target_directory)

    ## Download the ID mapping source files from UniProt
    id_mapping = args.id_mapping
    if id_mapping is not None:
        id_mapping = id_mapping.split(",")
        for species in id_mapping:
            id_mapping_sources(data_manager_dict, species, target_directory)

    # Save info to the json file
    filename = args.output
    with open(filename, 'w') as output:
        output.write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()