data_manager/fetch_mothur_reference_data.py @ 0:b90e0f2bf4b1 (draft, default, tip)

Initial version.
author pjbriggs
date Tue, 18 Nov 2014 09:57:33 -0500
#!/usr/bin/env python
#
# Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
import sys
import os
import optparse
import tempfile
import shutil
import urllib2
import zipfile
import tarfile

from galaxy.util.json import from_json_string, to_json_string

# When extracting files from archives, skip names that
# start with the following strings
IGNORE_PATHS = ('.','__MACOSX/','__')

# Map file extensions to data table names
MOTHUR_FILE_TYPES = { ".map": "map",
                      ".fasta": "aligndb",
                      ".pat": "lookup",
                      ".tax": "taxonomy" }

# Reference data URLs
MOTHUR_REFERENCE_DATA = {
    # Look up data
    # http://www.mothur.org/wiki/Lookup_files
    "lookup_titanium": {
        "GS FLX Titanium": ["http://www.mothur.org/w/images/9/96/LookUp_Titanium.zip",]
    },
    "lookup_gsflx": {
        "GSFLX": ["http://www.mothur.org/w/images/8/84/LookUp_GSFLX.zip",]
    },
    "lookup_gs20": {
        "GS20": ["http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip",]
    },
    # RDP reference files
    # http://www.mothur.org/wiki/RDP_reference_files
    "RDP_v10": {
        "16S rRNA RDP training set 10":
        ["http://www.mothur.org/w/images/b/b5/Trainset10_082014.rdp.tgz",],
        "16S rRNA PDS training set 10":
        ["http://www.mothur.org/w/images/2/24/Trainset10_082014.pds.tgz",],
    },
    "RDP_v9": {
        "16S rRNA RDP training set 9":
        ["http://www.mothur.org/w/images/7/72/Trainset9_032012.rdp.zip",],
        "16S rRNA PDS training set 9":
        ["http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip",],
    },
    "RDP_v7": {
        "16S rRNA RDP training set 7":
        ["http://www.mothur.org/w/images/2/29/Trainset7_112011.rdp.zip",],
        "16S rRNA PDS training set 7":
        ["http://www.mothur.org/w/images/4/4a/Trainset7_112011.pds.zip",],
57 "8S rRNA Fungi training set 7":
58 ["http://www.mothur.org/w/images/3/36/FungiLSU_train_v7.zip",],
    },
    "RDP_v6": {
        "RDP training set 6":
        ["http://www.mothur.org/w/images/4/49/RDPTrainingSet.zip",],
    },
    # Silva reference files
    # http://www.mothur.org/wiki/Silva_reference_files
    "silva_release_119": {
        "SILVA release 119":
        ["http://www.mothur.org/w/images/2/27/Silva.nr_v119.tgz",
         "http://www.mothur.org/w/images/5/56/Silva.seed_v119.tgz",],
    },
    "silva_release_102": {
        "SILVA release 102":
        ["http://www.mothur.org/w/images/9/98/Silva.bacteria.zip",
         "http://www.mothur.org/w/images/3/3c/Silva.archaea.zip",
         "http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip",],
    },
    "silva_gold_bacteria": {
        "SILVA gold":
        ["http://www.mothur.org/w/images/f/f1/Silva.gold.bacteria.zip",],
    },
    # Greengenes
    # http://www.mothur.org/wiki/Greengenes-formatted_databases
    "greengenes_August2013": {
        "Greengenes August 2013":
        ["http://www.mothur.org/w/images/1/19/Gg_13_8_99.refalign.tgz",
         "http://www.mothur.org/w/images/6/68/Gg_13_8_99.taxonomy.tgz",],
    },
    "greengenes_May2013": {
        "Greengenes May 2013":
        ["http://www.mothur.org/w/images/c/cd/Gg_13_5_99.refalign.tgz",
         "http://www.mothur.org/w/images/9/9d/Gg_13_5_99.taxonomy.tgz",],
    },
    "greengenes_old": {
        "Greengenes pre-May 2013":
        ["http://www.mothur.org/w/images/7/72/Greengenes.alignment.zip",
         "http://www.mothur.org/w/images/1/16/Greengenes.tax.tgz",],
    },
    "greengenes_gold_alignment": {
        "Greengenes gold alignment":
        ["http://www.mothur.org/w/images/2/21/Greengenes.gold.alignment.zip",],
    },
    # Secondary structure maps
    # http://www.mothur.org/wiki/Secondary_structure_map
    "secondary_structure_maps_silva": {
        "SILVA":
        ["http://www.mothur.org/w/images/6/6d/Silva_ss_map.zip",],
    },
    "secondary_structure_maps_greengenes": {
        "Greengenes":
        ["http://www.mothur.org/w/images/4/4b/Gg_ss_map.zip",],
    },
    # Lane masks: not used here?
    "lane_masks": {
        "Greengenes-compatible":
        ["http://www.mothur.org/w/images/2/2a/Lane1241.gg.filter",
         "http://www.mothur.org/w/images/a/a0/Lane1287.gg.filter",
         "http://www.mothur.org/w/images/3/3d/Lane1349.gg.filter",],
        "SILVA-compatible":
        ["http://www.mothur.org/w/images/6/6d/Lane1349.silva.filter",]
    },
}

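# Each top-level key above is a dataset identifier that can be passed to
# this script via the --datasets option; it maps display names to lists of
# URLs to fetch. A new dataset would be added as another entry of the same
# shape, e.g. (hypothetical entry and URL, for illustration only):
#
#     "my_new_dataset": {
#         "My new reference set":
#         ["http://example.org/My_new_reference_set.zip",],
#     },
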
# Utility functions for interacting with Galaxy JSON

def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict,extra_files_path)

    'param_dict' is an arbitrary dictionary of parameters
    supplied as input to the tool; 'extra_files_path' is the
    path to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially; it is the job of this script
    to create it if necessary.

    """
    params = from_json_string(open(jsonfile).read())
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])

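# For reference, the JSON supplied by Galaxy has (at least) the structure
# sketched below; only the keys read above are shown and the values are
# hypothetical:
#
#     {
#       "param_dict": {"data_source": "mothur_website", ...},
#       "output_data": [
#         {"extra_files_path": "/galaxy/jobs/000/123/dataset_456_files"}
#       ]
#     }
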
# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d,'my_data')
# >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
# >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
# >>> print str(to_json_string(d))

def create_data_tables_dict():
    """Return a dictionary for storing data table information

    Returns a dictionary that can be used with 'add_data_table'
    and 'add_data_table_entry' to store information about a
    data table. It can be converted to JSON to be sent back to
    the data manager.

    """
    d = {}
    d['data_tables'] = {}
    return d

def add_data_table(d,table):
    """Add a data table to the data tables dictionary

    Creates a placeholder for a data table called 'table'.

    """
    d['data_tables'][table] = []

def add_data_table_entry(d,table,entry):
    """Add an entry to a data table

    Appends an entry to the data table 'table'. 'entry'
    should be a dictionary where the keys are the names of
    columns in the data table.

    Raises an exception if the named data table doesn't
    exist.

    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)

# Utility functions for downloading and unpacking archive files

def download_file(url,target=None,wd=None):
    """Download a file from a URL

    Fetches a file from the specified URL.

    If 'target' is specified then the file is saved to this
    name; otherwise it's saved as the basename of the URL.

    If 'wd' is specified then it is used as the 'working
    directory' where the file will be saved on the local
    system.

    Returns the name that the file is saved with.

    """
    print "Downloading %s" % url
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd,target)
    print "Saving to %s" % target
    open(target,'wb').write(urllib2.urlopen(url).read())
    return target

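# Example (illustrative; the working directory is hypothetical):
#
# >>> download_file("http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip",
# ...               wd="/tmp/work")
# '/tmp/work/LookUp_GS20.zip'
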
def unpack_zip_archive(filen,wd=None):
    """Extract files from a ZIP archive

    Given a ZIP archive, extract the files it contains
    and return a list of the resulting file names and
    paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the ZIP archive
    file is deleted from the file system.

    """
    if not zipfile.is_zipfile(filen):
        print "%s: not ZIP formatted file" % filen
        return [filen]
    file_list = []
    z = zipfile.ZipFile(filen)
    for name in z.namelist():
        if reduce(lambda x,y: x or name.startswith(y),IGNORE_PATHS,False):
            print "Ignoring %s" % name
            continue
        if wd:
            target = os.path.join(wd,name)
        else:
            target = name
        if name.endswith('/'):
            # Make directory
            print "Creating dir %s" % target
            try:
                os.makedirs(target)
            except OSError:
                pass
        else:
            # Extract file
            print "Extracting %s" % name
            try:
                os.makedirs(os.path.dirname(target))
            except OSError:
                pass
            open(target,'wb').write(z.read(name))
            file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list

def unpack_tar_archive(filen,wd=None):
    """Extract files from a TAR archive

    Given a TAR archive (which optionally can be
    compressed with either gzip or bz2), extract the
    files it contains and return a list of the
    resulting file names and paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the TAR archive
    file is deleted from the file system.

    """
    file_list = []
    if wd:
        path = wd
    else:
        path = '.'
    if not tarfile.is_tarfile(filen):
        print "%s: not TAR file" % filen
        return [filen]
    t = tarfile.open(filen)
    for name in t.getnames():
        # Check for unwanted files
        if reduce(lambda x,y: x or name.startswith(y),IGNORE_PATHS,False):
            print "Ignoring %s" % name
            continue
        # Extract file
        print "Extracting %s" % name
        t.extract(name,path)
        if wd:
            target = os.path.join(wd,name)
        else:
            target = name
        file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list

def unpack_archive(filen,wd=None):
    """Extract files from an archive

    Wrapper function that calls the appropriate
    unpacking function depending on the archive
    type, and returns a list of files that have
    been extracted.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    """
    print "Unpack %s" % filen
    ext = os.path.splitext(filen)[1]
    print "Extension: %s" % ext
    if ext == ".zip":
        return unpack_zip_archive(filen,wd=wd)
    elif ext == ".tgz":
        return unpack_tar_archive(filen,wd=wd)
    else:
        return [filen]

def fetch_files(urls,wd=None,files=None):
    """Download and unpack files from a list of URLs

    Given a list of URLs, download and unpack each
    one, and return a list of the extracted files.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    If 'files' is given then the list of extracted
    files will be appended to this list before being
    returned.

    """
    if files is None:
        files = []
    for url in urls:
        filen = download_file(url,wd=wd)
        files.extend(unpack_archive(filen,wd=wd))
    return files

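# Example (illustrative; the working directory is hypothetical):
#
# >>> fetch_files(MOTHUR_REFERENCE_DATA["lookup_gs20"]["GS20"],wd="/tmp/work")
#
# downloads LookUp_GS20.zip into /tmp/work, unpacks it there and returns
# the paths of the extracted files.
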
# Utility functions specific to the Mothur reference data

def identify_type(filen):
    """Return the data table name based on the file name

    Returns None if the file extension isn't recognised.

    """
    ext = os.path.splitext(filen)[1]
    try:
        return MOTHUR_FILE_TYPES[ext]
    except KeyError:
        return None

def get_name(filen):
    """Generate a descriptive name based on the file name
    """
    type_ = identify_type(filen)
    name = os.path.splitext(os.path.basename(filen))[0]
    for delim in ('.','_'):
        name = name.replace(delim,' ')
    return name

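# Examples (illustrative file names):
#
# >>> identify_type("silva.bacteria.fasta")
# 'aligndb'
# >>> get_name("silva.bacteria.fasta")
# 'silva bacteria'
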
def fetch_from_mothur_website(data_tables,target_dir,datasets):
    """Fetch reference data from the Mothur website

    For each dataset in the list 'datasets', download (and if
    necessary unpack) the related files from the Mothur website,
    copy them to the data manager's target directory, and add
    references to the files to the appropriate data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the downloaded files
      datasets: a list of dataset names corresponding to keys in
        the MOTHUR_REFERENCE_DATA dictionary

    """
    # Make working dir
    wd = tempfile.mkdtemp(suffix=".mothur",dir=os.getcwd())
    print "Working dir %s" % wd
    # Iterate over all requested reference data URLs
    for dataset in datasets:
        print "Handling dataset '%s'" % dataset
        for name in MOTHUR_REFERENCE_DATA[dataset]:
            for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name],wd=wd):
                type_ = identify_type(f)
                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0],name)
                print "%s\t\'%s'\t.../%s" % (type_,
                                             entry_name,
                                             os.path.basename(f))
                if type_ is not None:
                    # Move to target dir
                    ref_data_file = os.path.basename(f)
                    f1 = os.path.join(target_dir,ref_data_file)
                    print "Moving %s to %s" % (f,f1)
                    os.rename(f,f1)
                    # Add entry to data table
                    table_name = "mothur_%s" % type_
                    add_data_table_entry(data_tables,table_name,dict(name=entry_name,
                                                                     value=ref_data_file))
    # Remove working dir
    print "Removing %s" % wd
    shutil.rmtree(wd)

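# Example (illustrative; the target directory is hypothetical):
#
# >>> d = create_data_tables_dict()
# >>> add_data_table(d,'mothur_lookup')
# >>> fetch_from_mothur_website(d,'/galaxy/tool-data/mothur',['lookup_gs20'])
#
# downloads and unpacks the GS20 lookup data, moves the recognised files
# into /galaxy/tool-data/mothur and records each one in the 'mothur_lookup'
# entries held in 'd'.
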
def files_from_filesystem_paths(paths):
    """Return list of file paths from arbitrary input paths

    Given a list of filesystem paths, return a list of
    full paths corresponding to all files found recursively
    from under those paths.

    """
    # Collect files to add
    files = []
    for path in paths:
        path = os.path.abspath(path)
        print "Examining '%s'..." % path
        if os.path.isfile(path):
            # Store full path for file
            files.append(path)
        elif os.path.isdir(path):
            # Descend into directory and collect the files
            for f in os.listdir(path):
                files.extend(files_from_filesystem_paths((os.path.join(path,f),)))
        else:
            print "Not a file or directory, ignored"
    return files

def import_from_server(data_tables,target_dir,paths,description,
                       link_to_data=False):
    """Import reference data from filesystem paths

    Creates references to the specified file(s) on the Galaxy
    server in the appropriate data table (determined from the
    file extension).

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the copy of, or link to, the
        data file
      paths: list of file and/or directory paths to import
      description: text to associate with the files
      link_to_data: boolean, if False then copy the data file
        into Galaxy (default); if True then make a symlink to
        the data file

    """
    # Collect list of files based on input paths
    files = files_from_filesystem_paths(paths)
    # Handle each file individually
    for f in files:
        type_ = identify_type(f)
        if type_ is None:
            print "%s: unrecognised type, skipped" % f
            continue
        ref_data_file = os.path.basename(f)
        target_file = os.path.join(target_dir,ref_data_file)
        entry_name = "%s" % os.path.splitext(ref_data_file)[0]
        if description:
            entry_name += " (%s)" % description
        print "%s\t\'%s'\t.../%s" % (type_,
                                     entry_name,
                                     ref_data_file)
        # Link to or copy the data
        if link_to_data:
            os.symlink(f,target_file)
        else:
            shutil.copyfile(f,target_file)
        # Add entry to data table
        table_name = "mothur_%s" % type_
        add_data_table_entry(data_tables,table_name,dict(name=entry_name,
                                                         value=ref_data_file))

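# Example (illustrative; all paths are hypothetical):
#
# >>> d = create_data_tables_dict()
# >>> add_data_table(d,'mothur_taxonomy')
# >>> import_from_server(d,'/galaxy/tool-data/mothur',
# ...                    ['/data/refs/trainset9_032012.pds.tax'],
# ...                    'local copy',link_to_data=True)
#
# symlinks the .tax file into /galaxy/tool-data/mothur and adds an entry
# named "trainset9_032012.pds (local copy)" to the 'mothur_taxonomy' table.
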
if __name__ == "__main__":
    print "Starting..."

    # Read command line
    parser = optparse.OptionParser()
    parser.add_option('--source',action='store',dest='data_source')
    parser.add_option('--datasets',action='store',dest='datasets',default='')
    parser.add_option('--paths',action='store',dest='paths',default=[])
    parser.add_option('--description',action='store',dest='description',default='')
    parser.add_option('--link',action='store_true',dest='link_to_data')
    options,args = parser.parse_args()
    print "options: %s" % options
    print "args   : %s" % args

    # Check for JSON file
    if len(args) != 1:
        parser.error("Need to supply JSON file name")
    jsonfile = args[0]

    # Read the input JSON
    params,target_dir = read_input_json(jsonfile)

    # Make the target directory
    print "Making %s" % target_dir
    os.mkdir(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables,'mothur_lookup')
    add_data_table(data_tables,'mothur_aligndb')
    add_data_table(data_tables,'mothur_map')
    add_data_table(data_tables,'mothur_taxonomy')

    # Fetch data from specified data sources
    if options.data_source == 'mothur_website':
        datasets = options.datasets.split(',')
        fetch_from_mothur_website(data_tables,target_dir,datasets)
    elif options.data_source == 'filesystem_paths':
        # Check description text
        description = options.description.strip()
        # Get list of paths (need to remove any escapes for '\n' and '\r'
        # that might have been inserted by Galaxy)
        paths = options.paths.replace('__cn__','\n').replace('__cr__','\r').split()
        import_from_server(data_tables,target_dir,paths,description,
                           link_to_data=options.link_to_data)

    # Write output JSON
    print "Outputting JSON"
    print str(to_json_string(data_tables))
    open(jsonfile,'wb').write(to_json_string(data_tables))
    print "Done."
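
# Illustrative invocations (option values and the JSON file name are
# hypothetical; in practice the command line is assembled by the data
# manager tool wrapper):
#
#   python fetch_mothur_reference_data.py --source mothur_website \
#       --datasets lookup_titanium,RDP_v9 galaxy_input.json
#
#   python fetch_mothur_reference_data.py --source filesystem_paths \
#       --paths "/data/mothur_refs" --description "Local data" \
#       --link galaxy_input.json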