Mercurial repository: pjbriggs/data_manager_mothur_toolsuite
File: data_manager/fetch_mothur_reference_data.py @ 0:b90e0f2bf4b1 (draft, default, tip)
Commit message: Initial version.
Author: pjbriggs
Date: Tue, 18 Nov 2014 09:57:33 -0500
#!/usr/bin/env python
#
# Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
import sys
import os
import optparse
import tempfile
import shutil
import urllib2
import zipfile
import tarfile

from galaxy.util.json import from_json_string, to_json_string

# When extracting files from archives, skip names that
# start with the following strings
IGNORE_PATHS = ('.','__MACOSX/','__')

# Map file extensions to data table names
MOTHUR_FILE_TYPES = { ".map": "map",
                      ".fasta": "aligndb",
                      ".pat": "lookup",
                      ".tax": "taxonomy" }

# Reference data URLs
MOTHUR_REFERENCE_DATA = {
    # Look up data
    # http://www.mothur.org/wiki/Lookup_files
    "lookup_titanium": {
        "GS FLX Titanium": ["http://www.mothur.org/w/images/9/96/LookUp_Titanium.zip",]
    },
    "lookup_gsflx": {
        "GSFLX": ["http://www.mothur.org/w/images/8/84/LookUp_GSFLX.zip",]
    },
    "lookup_gs20": {
        "GS20": ["http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip",]
    },
    # RDP reference files
    # http://www.mothur.org/wiki/RDP_reference_files
    "RDP_v10": {
        "16S rRNA RDP training set 10":
        ["http://www.mothur.org/w/images/b/b5/Trainset10_082014.rdp.tgz",],
        "16S rRNA PDS training set 10":
        ["http://www.mothur.org/w/images/2/24/Trainset10_082014.pds.tgz",],
    },
    "RDP_v9": {
        "16S rRNA RDP training set 9":
        ["http://www.mothur.org/w/images/7/72/Trainset9_032012.rdp.zip",],
        "16S rRNA PDS training set 9":
        ["http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip",],
    },
    "RDP_v7": {
        "16S rRNA RDP training set 7":
        ["http://www.mothur.org/w/images/2/29/Trainset7_112011.rdp.zip",],
        "16S rRNA PDS training set 7":
        ["http://www.mothur.org/w/images/4/4a/Trainset7_112011.pds.zip",],
        "8S rRNA Fungi training set 7":
        ["http://www.mothur.org/w/images/3/36/FungiLSU_train_v7.zip",],
    },
    "RDP_v6": {
        "RDP training set 6":
        ["http://www.mothur.org/w/images/4/49/RDPTrainingSet.zip",],
    },
    # Silva reference files
    # http://www.mothur.org/wiki/Silva_reference_files
    "silva_release_119": {
        "SILVA release 119":
        ["http://www.mothur.org/w/images/2/27/Silva.nr_v119.tgz",
         "http://www.mothur.org/w/images/5/56/Silva.seed_v119.tgz",],
    },
    "silva_release_102": {
        "SILVA release 102":
        ["http://www.mothur.org/w/images/9/98/Silva.bacteria.zip",
         "http://www.mothur.org/w/images/3/3c/Silva.archaea.zip",
         "http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip",],
    },
    "silva_gold_bacteria": {
        "SILVA gold":
        ["http://www.mothur.org/w/images/f/f1/Silva.gold.bacteria.zip",],
    },
    # Greengenes
    # http://www.mothur.org/wiki/Greengenes-formatted_databases
    "greengenes_August2013": {
        "Greengenes August 2013":
        ["http://www.mothur.org/w/images/1/19/Gg_13_8_99.refalign.tgz",
         "http://www.mothur.org/w/images/6/68/Gg_13_8_99.taxonomy.tgz",],
    },
    "greengenes_May2013": {
        "Greengenes May 2013":
        ["http://www.mothur.org/w/images/c/cd/Gg_13_5_99.refalign.tgz",
         "http://www.mothur.org/w/images/9/9d/Gg_13_5_99.taxonomy.tgz",],
    },
    "greengenes_old": {
        "Greengenes pre-May 2013":
        ["http://www.mothur.org/w/images/7/72/Greengenes.alignment.zip",
         "http://www.mothur.org/w/images/1/16/Greengenes.tax.tgz",],
    },
    "greengenes_gold_alignment": {
        "Greengenes gold alignment":
        ["http://www.mothur.org/w/images/2/21/Greengenes.gold.alignment.zip",],
    },
    # Secondary structure maps
    # http://www.mothur.org/wiki/Secondary_structure_map
    "secondary_structure_maps_silva": {
        "SILVA":
        ["http://www.mothur.org/w/images/6/6d/Silva_ss_map.zip",],
    },
    "secondary_structure_maps_greengenes": {
        "Greengenes":
        ["http://www.mothur.org/w/images/4/4b/Gg_ss_map.zip",],
    },
    # Lane masks: not used here?
    "lane_masks": {
        "Greengenes-compatible":
        ["http://www.mothur.org/w/images/2/2a/Lane1241.gg.filter",
         "http://www.mothur.org/w/images/a/a0/Lane1287.gg.filter",
         "http://www.mothur.org/w/images/3/3d/Lane1349.gg.filter",],
        "SILVA-compatible":
        ["http://www.mothur.org/w/images/6/6d/Lane1349.silva.filter",]
    },
}

# Utility functions for interacting with Galaxy JSON

def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict,extra_files_path)

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially, it is the job of the script
    to create it if necessary.

    """
    params = from_json_string(open(jsonfile).read())
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])

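# For illustration only, a minimal sketch of the input JSON (field values
# are hypothetical; the script only uses 'param_dict' and the first
# 'output_data' entry's 'extra_files_path'):
#
# {
#   "param_dict": { ... },
#   "output_data": [ { "extra_files_path": "/path/to/extra/files" } ]
# }
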
# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d,'my_data')
# >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
# >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
# >>> print str(to_json_string(d))

def create_data_tables_dict():
    """Return a dictionary for storing data table information

    Returns a dictionary that can be used with 'add_data_table'
    and 'add_data_table_entry' to store information about a
    data table. It can be converted to JSON to be sent back to
    the data manager.

    """
    d = {}
    d['data_tables'] = {}
    return d

def add_data_table(d,table):
    """Add a data table to the data tables dictionary

    Creates a placeholder for a data table called 'table'.

    """
    d['data_tables'][table] = []

def add_data_table_entry(d,table,entry):
    """Add an entry to a data table

    Appends an entry to the data table 'table'. 'entry'
    should be a dictionary where the keys are the names of
    columns in the data table.

    Raises an exception if the named data table doesn't
    exist.

    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)

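# As a worked illustration of the three helpers above (using the
# hypothetical values from the example usage comment), the final print
# would produce something along the lines of:
#
# {"data_tables": {"my_data": [{"dbkey": "hg19", "value": "human"},
#                              {"dbkey": "mm9", "value": "mouse"}]}}
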
# Utility functions for downloading and unpacking archive files

def download_file(url,target=None,wd=None):
    """Download a file from a URL

    Fetches a file from the specified URL.

    If 'target' is specified then the file is saved to this
    name; otherwise it's saved as the basename of the URL.

    If 'wd' is specified then it is used as the 'working
    directory' where the file will be saved on the local
    system.

    Returns the name that the file is saved with.

    """
    print "Downloading %s" % url
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd,target)
    print "Saving to %s" % target
    open(target,'wb').write(urllib2.urlopen(url).read())
    return target

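# For example (URL taken from MOTHUR_REFERENCE_DATA above; the working
# directory is hypothetical):
#
# >>> download_file("http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip",wd="/tmp/work")
# Downloading http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip
# Saving to /tmp/work/LookUp_GS20.zip
# '/tmp/work/LookUp_GS20.zip'
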
def unpack_zip_archive(filen,wd=None):
    """Extract files from a ZIP archive

    Given a ZIP archive, extract the files it contains
    and return a list of the resulting file names and
    paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the ZIP archive
    file is deleted from the file system.

    """
    if not zipfile.is_zipfile(filen):
        print "%s: not ZIP formatted file" % filen
        return [filen]
    file_list = []
    z = zipfile.ZipFile(filen)
    for name in z.namelist():
        if reduce(lambda x,y: x or name.startswith(y),IGNORE_PATHS,False):
            print "Ignoring %s" % name
            continue
        if wd:
            target = os.path.join(wd,name)
        else:
            target = name
        if name.endswith('/'):
            # Make directory
            print "Creating dir %s" % target
            try:
                os.makedirs(target)
            except OSError:
                pass
        else:
            # Extract file
            print "Extracting %s" % name
            try:
                os.makedirs(os.path.dirname(target))
            except OSError:
                pass
            open(target,'wb').write(z.read(name))
            file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list

def unpack_tar_archive(filen,wd=None):
    """Extract files from a TAR archive

    Given a TAR archive (which optionally can be
    compressed with either gzip or bz2), extract the
    files it contains and return a list of the
    resulting file names and paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the TAR archive
    file is deleted from the file system.

    """
    file_list = []
    if wd:
        path = wd
    else:
        path = '.'
    if not tarfile.is_tarfile(filen):
        print "%s: not TAR file" % filen
        return [filen]
    t = tarfile.open(filen)
    for name in t.getnames():
        # Check for unwanted files
        if reduce(lambda x,y: x or name.startswith(y),IGNORE_PATHS,False):
            print "Ignoring %s" % name
            continue
        # Extract file
        print "Extracting %s" % name
        t.extract(name,path)
        if wd:
            target = os.path.join(wd,name)
        else:
            target = name
        file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list

def unpack_archive(filen,wd=None):
    """Extract files from an archive

    Wrapper function that calls the appropriate
    unpacking function depending on the archive
    type, and returns a list of files that have
    been extracted.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    """
    print "Unpack %s" % filen
    ext = os.path.splitext(filen)[1]
    print "Extension: %s" % ext
    if ext == ".zip":
        return unpack_zip_archive(filen,wd=wd)
    elif ext == ".tgz":
        return unpack_tar_archive(filen,wd=wd)
    else:
        return [filen]

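# As an illustration of the dispatch above: '.zip' archives go to
# unpack_zip_archive, '.tgz' archives go to unpack_tar_archive, and any
# other file (e.g. the '.filter' lane masks) is passed through untouched:
#
# >>> unpack_archive("Lane1349.gg.filter")
# Unpack Lane1349.gg.filter
# Extension: .filter
# ['Lane1349.gg.filter']
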
def fetch_files(urls,wd=None,files=None):
    """Download and unpack files from a list of URLs

    Given a list of URLs, download and unpack each
    one, and return a list of the extracted files.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    If 'files' is given then the list of extracted
    files will be appended to this list before being
    returned.

    """
    if files is None:
        files = []
    for url in urls:
        filen = download_file(url,wd=wd)
        files.extend(unpack_archive(filen,wd=wd))
    return files

# Utility functions specific to the Mothur reference data

def identify_type(filen):
    """Return the data table name based on the file name

    """
    ext = os.path.splitext(filen)[1]
    try:
        return MOTHUR_FILE_TYPES[ext]
    except KeyError:
        return None

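# For example, using the extension-to-table mapping in MOTHUR_FILE_TYPES
# above (file names here are purely illustrative; unrecognised extensions
# return None):
#
# >>> identify_type("trainset9_032012.pds.tax")
# 'taxonomy'
# >>> identify_type("core_set_aligned.imputed.fasta")
# 'aligndb'
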
def get_name(filen):
    """Generate a descriptive name based on the file name
    """
    type_ = identify_type(filen)
    name = os.path.splitext(os.path.basename(filen))[0]
    for delim in ('.','_'):
        name = name.replace(delim,' ')
    return name

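# For example (path is hypothetical): the extension is stripped and any
# '.' or '_' in the remaining basename is replaced with a space:
#
# >>> get_name("/tmp/work/Trainset10_082014.rdp.tax")
# 'Trainset10 082014 rdp'
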
def fetch_from_mothur_website(data_tables,target_dir,datasets):
    """Fetch reference data from the Mothur website

    For each dataset in the list 'datasets', download (and if
    necessary unpack) the related files from the Mothur website,
    copy them to the data manager's target directory, and add
    references to the files to the appropriate data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the downloaded files
      datasets: a list of dataset names corresponding to keys in
        the MOTHUR_REFERENCE_DATA dictionary

    """
    # Make working dir
    wd = tempfile.mkdtemp(suffix=".mothur",dir=os.getcwd())
    print "Working dir %s" % wd
    # Iterate over all requested reference data URLs
    for dataset in datasets:
        print "Handling dataset '%s'" % dataset
        for name in MOTHUR_REFERENCE_DATA[dataset]:
            for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name],wd=wd):
                type_ = identify_type(f)
                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0],name)
                print "%s\t\'%s'\t.../%s" % (type_,
                                             entry_name,
                                             os.path.basename(f))
                if type_ is not None:
                    # Move to target dir
                    ref_data_file = os.path.basename(f)
                    f1 = os.path.join(target_dir,ref_data_file)
                    print "Moving %s to %s" % (f,f1)
                    os.rename(f,f1)
                    # Add entry to data table
                    table_name = "mothur_%s" % type_
                    add_data_table_entry(data_tables,table_name,dict(name=entry_name,
                                                                     value=ref_data_file))
    # Remove working dir
    print "Removing %s" % wd
    shutil.rmtree(wd)

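# A worked sketch of the bookkeeping above (the unpacked file name is
# hypothetical): fetching the 'lookup_gs20' dataset downloads
# LookUp_GS20.zip and unpacks it; if that yields a file LookUp_GS20.pat
# then identify_type() returns 'lookup', the file is moved into
# 'target_dir', and an entry along the lines of
#
#   dict(name="LookUp_GS20 (GS20)",value="LookUp_GS20.pat")
#
# is added to the 'mothur_lookup' data table.
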
def files_from_filesystem_paths(paths):
    """Return list of file paths from arbitrary input paths

    Given a list of filesystem paths, return a list of
    full paths corresponding to all files found recursively
    from under those paths.

    """
    # Collect files to add
    files = []
    for path in paths:
        path = os.path.abspath(path)
        print "Examining '%s'..." % path
        if os.path.isfile(path):
            # Store full path for file
            files.append(path)
        elif os.path.isdir(path):
            # Descend into directory and collect the files
            for f in os.listdir(path):
                files.extend(files_from_filesystem_paths((os.path.join(path,f),)))
        else:
            print "Not a file or directory, ignored"
    return files

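# For illustration (directory layout is hypothetical): given a directory
# /data/mothur_refs containing silva.gold.ncbi.tax plus a subdirectory
# lookups/ holding LookUp_GS20.pat, the call
#
# >>> files_from_filesystem_paths(["/data/mothur_refs"])
#
# would descend into lookups/ and return the absolute paths of both files.
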
def import_from_server(data_tables,target_dir,paths,description,
                       link_to_data=False):
    """Import reference data from filesystem paths

    Creates references to the specified file(s) on the Galaxy
    server in the appropriate data table (determined from the
    file extension).

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the copy of (or link to) the data file
      paths: list of file and/or directory paths to import
      description: text to associate with the files
      link_to_data: boolean, if False then copy the data file
        into Galaxy (default); if True then make a symlink to
        the data file

    """
    # Collect list of files based on input paths
    files = files_from_filesystem_paths(paths)
    # Handle each file individually
    for f in files:
        type_ = identify_type(f)
        if type_ is None:
            print "%s: unrecognised type, skipped" % f
            continue
        ref_data_file = os.path.basename(f)
        target_file = os.path.join(target_dir,ref_data_file)
        entry_name = "%s" % os.path.splitext(ref_data_file)[0]
        if description:
            entry_name += " (%s)" % description
        print "%s\t\'%s'\t.../%s" % (type_,
                                     entry_name,
                                     ref_data_file)
        # Link to or copy the data
        if link_to_data:
            os.symlink(f,target_file)
        else:
            shutil.copyfile(f,target_file)
        # Add entry to data table
        table_name = "mothur_%s" % type_
        add_data_table_entry(data_tables,table_name,dict(name=entry_name,
                                                         value=ref_data_file))

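# How the script is typically invoked by the Galaxy data manager tool (a
# sketch only; the JSON file name, dataset choices and paths below are
# hypothetical):
#
#   python fetch_mothur_reference_data.py --source=mothur_website \
#       --datasets=lookup_gs20,RDP_v9 params.json
#
# or, to import existing files already on the Galaxy server:
#
#   python fetch_mothur_reference_data.py --source=filesystem_paths \
#       --paths=/data/mothur_refs --description="Local copy" --link params.json
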
if __name__ == "__main__":
    print "Starting..."

    # Read command line
    parser = optparse.OptionParser()
    parser.add_option('--source',action='store',dest='data_source')
    parser.add_option('--datasets',action='store',dest='datasets',default='')
    parser.add_option('--paths',action='store',dest='paths',default=[])
    parser.add_option('--description',action='store',dest='description',default='')
    parser.add_option('--link',action='store_true',dest='link_to_data')
    options,args = parser.parse_args()
    print "options: %s" % options
    print "args : %s" % args

    # Check for JSON file
    if len(args) != 1:
        parser.error("Need to supply JSON file name")
    jsonfile = args[0]

    # Read the input JSON
    params,target_dir = read_input_json(jsonfile)

    # Make the target directory
    print "Making %s" % target_dir
    os.mkdir(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables,'mothur_lookup')
    add_data_table(data_tables,'mothur_aligndb')
    add_data_table(data_tables,'mothur_map')
    add_data_table(data_tables,'mothur_taxonomy')

    # Fetch data from specified data sources
    if options.data_source == 'mothur_website':
        datasets = options.datasets.split(',')
        fetch_from_mothur_website(data_tables,target_dir,datasets)
    elif options.data_source == 'filesystem_paths':
        # Check description text
        description = options.description.strip()
        # Get list of paths (need to remove any escapes for '\n' and '\r'
        # that might have been inserted by Galaxy)
        paths = options.paths.replace('__cn__','\n').replace('__cr__','\r').split()
        import_from_server(data_tables,target_dir,paths,description,
                           link_to_data=options.link_to_data)
    # Write output JSON
    print "Outputting JSON"
    print str(to_json_string(data_tables))
    open(jsonfile,'wb').write(to_json_string(data_tables))
    print "Done."