#!/usr/bin/env python
#
# Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
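#
# Example invocations (illustrative; in practice the command line is built by
# the Galaxy data manager tool wrapper, and the script file name used here is
# an assumption):
#
#   python fetch_mothur_reference_data.py --source=mothur_website \
#       --datasets=lookup_titanium,RDP_v9 <JSON file>
#
#   python fetch_mothur_reference_data.py --source=filesystem_paths \
#       --paths=/path/to/reference/data --description="local copy" --link \
#       <JSON file>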
import sys
import os
import optparse
import tempfile
import shutil
import urllib2
import zipfile
import tarfile

from galaxy.util.json import from_json_string, to_json_string

# When extracting files from archives, skip names that
# start with the following strings
IGNORE_PATHS = ('.','__MACOSX/','__')

# Map file extensions to data table names
MOTHUR_FILE_TYPES = { ".map": "map",
                      ".fasta": "aligndb",
                      ".pat": "lookup",
                      ".tax": "taxonomy" }

# Reference data URLs
MOTHUR_REFERENCE_DATA = {
    # Look up data
    # http://www.mothur.org/wiki/Lookup_files
    "lookup_titanium": {
        "GS FLX Titanium": ["http://www.mothur.org/w/images/9/96/LookUp_Titanium.zip",]
    },
    "lookup_gsflx": {
        "GSFLX": ["http://www.mothur.org/w/images/8/84/LookUp_GSFLX.zip",]
    },
    "lookup_gs20": {
        "GS20": ["http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip",]
    },
    # RDP reference files
    # http://www.mothur.org/wiki/RDP_reference_files
    "RDP_v10": {
        "16S rRNA RDP training set 10":
        ["http://www.mothur.org/w/images/b/b5/Trainset10_082014.rdp.tgz",],
        "16S rRNA PDS training set 10":
        ["http://www.mothur.org/w/images/2/24/Trainset10_082014.pds.tgz",],
    },
    "RDP_v9": {
        "16S rRNA RDP training set 9":
        ["http://www.mothur.org/w/images/7/72/Trainset9_032012.rdp.zip",],
        "16S rRNA PDS training set 9":
        ["http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip",],
    },
    "RDP_v7": {
        "16S rRNA RDP training set 7":
        ["http://www.mothur.org/w/images/2/29/Trainset7_112011.rdp.zip",],
        "16S rRNA PDS training set 7":
        ["http://www.mothur.org/w/images/4/4a/Trainset7_112011.pds.zip",],
        "28S rRNA Fungi training set 7":
        ["http://www.mothur.org/w/images/3/36/FungiLSU_train_v7.zip",],
    },
    "RDP_v6": {
        "RDP training set 6":
        ["http://www.mothur.org/w/images/4/49/RDPTrainingSet.zip",],
    },
    # Silva reference files
    # http://www.mothur.org/wiki/Silva_reference_files
    "silva_release_119": {
        "SILVA release 119":
        ["http://www.mothur.org/w/images/2/27/Silva.nr_v119.tgz",
         "http://www.mothur.org/w/images/5/56/Silva.seed_v119.tgz",],
    },
    "silva_release_102": {
        "SILVA release 102":
        ["http://www.mothur.org/w/images/9/98/Silva.bacteria.zip",
         "http://www.mothur.org/w/images/3/3c/Silva.archaea.zip",
         "http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip",],
    },
    "silva_gold_bacteria": {
        "SILVA gold":
        ["http://www.mothur.org/w/images/f/f1/Silva.gold.bacteria.zip",],
    },
    # Greengenes
    # http://www.mothur.org/wiki/Greengenes-formatted_databases
    "greengenes_August2013": {
        "Greengenes August 2013":
        ["http://www.mothur.org/w/images/1/19/Gg_13_8_99.refalign.tgz",
         "http://www.mothur.org/w/images/6/68/Gg_13_8_99.taxonomy.tgz",],
    },
    "greengenes_May2013": {
        "Greengenes May 2013":
        ["http://www.mothur.org/w/images/c/cd/Gg_13_5_99.refalign.tgz",
         "http://www.mothur.org/w/images/9/9d/Gg_13_5_99.taxonomy.tgz",],
    },
    "greengenes_old": {
        "Greengenes pre-May 2013":
        ["http://www.mothur.org/w/images/7/72/Greengenes.alignment.zip",
         "http://www.mothur.org/w/images/1/16/Greengenes.tax.tgz",],
    },
    "greengenes_gold_alignment": {
        "Greengenes gold alignment":
        ["http://www.mothur.org/w/images/2/21/Greengenes.gold.alignment.zip",],
    },
    # Secondary structure maps
    # http://www.mothur.org/wiki/Secondary_structure_map
    "secondary_structure_maps_silva": {
        "SILVA":
        ["http://www.mothur.org/w/images/6/6d/Silva_ss_map.zip",],
    },
    "secondary_structure_maps_greengenes": {
        "Greengenes":
        ["http://www.mothur.org/w/images/4/4b/Gg_ss_map.zip",],
    },
    # Lane masks: not used here?
    "lane_masks": {
        "Greengenes-compatible":
        ["http://www.mothur.org/w/images/2/2a/Lane1241.gg.filter",
         "http://www.mothur.org/w/images/a/a0/Lane1287.gg.filter",
         "http://www.mothur.org/w/images/3/3d/Lane1349.gg.filter",],
        "SILVA-compatible":
        ["http://www.mothur.org/w/images/6/6d/Lane1349.silva.filter",]
    },
}

# Utility functions for interacting with Galaxy JSON

def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict,extra_files_path)

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially; it is the job of the script
    to create it if necessary.

    """
    params = from_json_string(open(jsonfile).read())
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])

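# Rough sketch of the input JSON consumed by read_input_json (only the keys
# actually read above are shown; the real file supplied by Galaxy contains
# more):
#
#   {
#       "param_dict": { ...tool parameters... },
#       "output_data": [ { "extra_files_path": "/path/to/extra/files", ... } ]
#   }
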
# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d,'my_data')
# >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
# >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
# >>> print str(to_json_string(d))

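# After the example above, to_json_string(d) would produce JSON along the
# lines of (illustrative):
#
#   {"data_tables": {"my_data": [{"dbkey": "hg19", "value": "human"},
#                                {"dbkey": "mm9", "value": "mouse"}]}}
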
def create_data_tables_dict():
    """Return a dictionary for storing data table information

    Returns a dictionary that can be used with 'add_data_table'
    and 'add_data_table_entry' to store information about a
    data table. It can be converted to JSON to be sent back to
    the data manager.

    """
    d = {}
    d['data_tables'] = {}
    return d

def add_data_table(d,table):
    """Add a data table to the data tables dictionary

    Creates a placeholder for a data table called 'table'.

    """
    d['data_tables'][table] = []

def add_data_table_entry(d,table,entry):
    """Add an entry to a data table

    Appends an entry to the data table 'table'. 'entry'
    should be a dictionary where the keys are the names of
    columns in the data table.

    Raises an exception if the named data table doesn't
    exist.

    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)

# Utility functions for downloading and unpacking archive files

def download_file(url,target=None,wd=None):
    """Download a file from a URL

    Fetches a file from the specified URL.

    If 'target' is specified then the file is saved to this
    name; otherwise it's saved as the basename of the URL.

    If 'wd' is specified then it is used as the 'working
    directory' where the file will be saved on the local
    system.

    Returns the name that the file is saved with.

    """
    print "Downloading %s" % url
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd,target)
    print "Saving to %s" % target
    open(target,'wb').write(urllib2.urlopen(url).read())
    return target

def unpack_zip_archive(filen,wd=None):
    """Extract files from a ZIP archive

    Given a ZIP archive, extract the files it contains
    and return a list of the resulting file names and
    paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the ZIP archive
    file is deleted from the file system.

    """
    if not zipfile.is_zipfile(filen):
        print "%s: not a ZIP-formatted file" % filen
        return [filen]
    file_list = []
    z = zipfile.ZipFile(filen)
    for name in z.namelist():
        if reduce(lambda x,y: x or name.startswith(y),IGNORE_PATHS,False):
            print "Ignoring %s" % name
            continue
        if wd:
            target = os.path.join(wd,name)
        else:
            target = name
        if name.endswith('/'):
            # Make directory
            print "Creating dir %s" % target
            try:
                os.makedirs(target)
            except OSError:
                pass
        else:
            # Extract file
            print "Extracting %s" % name
            try:
                os.makedirs(os.path.dirname(target))
            except OSError:
                pass
            open(target,'wb').write(z.read(name))
            file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list

def unpack_tar_archive(filen,wd=None):
    """Extract files from a TAR archive

    Given a TAR archive (which optionally can be
    compressed with either gzip or bz2), extract the
    files it contains and return a list of the
    resulting file names and paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the TAR archive
    file is deleted from the file system.

    """
    file_list = []
    if wd:
        path = wd
    else:
        path = '.'
    if not tarfile.is_tarfile(filen):
        print "%s: not a TAR file" % filen
        return [filen]
    t = tarfile.open(filen)
    for name in t.getnames():
        # Check for unwanted files
        if reduce(lambda x,y: x or name.startswith(y),IGNORE_PATHS,False):
            print "Ignoring %s" % name
            continue
        # Extract file
        print "Extracting %s" % name
        t.extract(name,path)
        if wd:
            target = os.path.join(wd,name)
        else:
            target = name
        file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list

def unpack_archive(filen,wd=None):
    """Extract files from an archive

    Wrapper function that calls the appropriate
    unpacking function depending on the archive
    type, and returns a list of files that have
    been extracted.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    """
    print "Unpack %s" % filen
    ext = os.path.splitext(filen)[1]
    print "Extension: %s" % ext
    if ext == ".zip":
        return unpack_zip_archive(filen,wd=wd)
    elif ext == ".tgz":
        return unpack_tar_archive(filen,wd=wd)
    else:
        return [filen]

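# NB unpack_archive dispatches on the literal file extension only: ".zip" and
# ".tgz" archives are unpacked, anything else is returned unchanged. All URLs
# in MOTHUR_REFERENCE_DATA end in ".zip", ".tgz" or plain ".filter" files, so
# this covers the data handled here.
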
def fetch_files(urls,wd=None,files=None):
    """Download and unpack files from a list of URLs

    Given a list of URLs, download and unpack each
    one, and return a list of the extracted files.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    If 'files' is given then the list of extracted
    files will be appended to this list before being
    returned.

    """
    if files is None:
        files = []
    for url in urls:
        filen = download_file(url,wd=wd)
        files.extend(unpack_archive(filen,wd=wd))
    return files

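# Illustrative use of the download/unpack helpers above (the working
# directory is a hypothetical example; the file names returned depend on the
# archive contents):
#
# >>> urls = MOTHUR_REFERENCE_DATA["lookup_gs20"]["GS20"]
# >>> extracted = fetch_files(urls,wd="/tmp/mothur_scratch")
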
# Utility functions specific to the Mothur reference data

def identify_type(filen):
    """Return the data table name based on the file name

    """
    ext = os.path.splitext(filen)[1]
    try:
        return MOTHUR_FILE_TYPES[ext]
    except KeyError:
        return None

def get_name(filen):
    """Generate a descriptive name based on the file name
    """
    name = os.path.splitext(os.path.basename(filen))[0]
    for delim in ('.','_'):
        name = name.replace(delim,' ')
    return name

def fetch_from_mothur_website(data_tables,target_dir,datasets):
    """Fetch reference data from the Mothur website

    For each dataset in the list 'datasets', download (and if
    necessary unpack) the related files from the Mothur website,
    copy them to the data manager's target directory, and add
    references to the files to the appropriate data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the downloaded files
      datasets: a list of dataset names corresponding to keys in
        the MOTHUR_REFERENCE_DATA dictionary

    """
    # Make working dir
    wd = tempfile.mkdtemp(suffix=".mothur",dir=os.getcwd())
    print "Working dir %s" % wd
    # Iterate over all requested reference data URLs
    for dataset in datasets:
        print "Handling dataset '%s'" % dataset
        for name in MOTHUR_REFERENCE_DATA[dataset]:
            for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name],wd=wd):
                type_ = identify_type(f)
                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0],name)
                print "%s\t\'%s'\t.../%s" % (type_,
                                             entry_name,
                                             os.path.basename(f))
                if type_ is not None:
                    # Move to target dir
                    ref_data_file = os.path.basename(f)
                    f1 = os.path.join(target_dir,ref_data_file)
                    print "Moving %s to %s" % (f,f1)
                    os.rename(f,f1)
                    # Add entry to data table
                    table_name = "mothur_%s" % type_
                    add_data_table_entry(data_tables,table_name,dict(name=entry_name,
                                                                     value=ref_data_file))
    # Remove working dir
    print "Removing %s" % wd
    shutil.rmtree(wd)

def files_from_filesystem_paths(paths):
    """Return list of file paths from arbitrary input paths

    Given a list of filesystem paths, return a list of
    full paths corresponding to all files found recursively
    under those paths.

    """
    # Collect files to add
    files = []
    for path in paths:
        path = os.path.abspath(path)
        print "Examining '%s'..." % path
        if os.path.isfile(path):
            # Store full path for file
            files.append(path)
        elif os.path.isdir(path):
            # Descend into directory and collect the files
            for f in os.listdir(path):
                files.extend(files_from_filesystem_paths((os.path.join(path,f),)))
        else:
            print "Not a file or directory, ignored"
    return files

def import_from_server(data_tables,target_dir,paths,description,
                       link_to_data=False):
    """Import reference data from filesystem paths

    Creates references to the specified file(s) on the Galaxy
    server in the appropriate data table (determined from the
    file extension).

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the copy of, or link to, the data file
      paths: list of file and/or directory paths to import
      description: text to associate with the files
      link_to_data: boolean, if False then copy the data file
        into Galaxy (default); if True then make a symlink to
        the data file

    """
    # Collect list of files based on input paths
    files = files_from_filesystem_paths(paths)
    # Handle each file individually
    for f in files:
        type_ = identify_type(f)
        if type_ is None:
            print "%s: unrecognised type, skipped" % f
            continue
        ref_data_file = os.path.basename(f)
        target_file = os.path.join(target_dir,ref_data_file)
        entry_name = "%s" % os.path.splitext(ref_data_file)[0]
        if description:
            entry_name += " (%s)" % description
        print "%s\t\'%s'\t.../%s" % (type_,
                                     entry_name,
                                     ref_data_file)
        # Link to or copy the data
        if link_to_data:
            os.symlink(f,target_file)
        else:
            shutil.copyfile(f,target_file)
        # Add entry to data table
        table_name = "mothur_%s" % type_
        add_data_table_entry(data_tables,table_name,dict(name=entry_name,
                                                         value=ref_data_file))

if __name__ == "__main__":
    print "Starting..."

    # Read command line
    parser = optparse.OptionParser()
    parser.add_option('--source',action='store',dest='data_source')
    parser.add_option('--datasets',action='store',dest='datasets',default='')
    parser.add_option('--paths',action='store',dest='paths',default='')
    parser.add_option('--description',action='store',dest='description',default='')
    parser.add_option('--link',action='store_true',dest='link_to_data')
    options,args = parser.parse_args()
    print "options: %s" % options
    print "args : %s" % args

    # Check for JSON file
    if len(args) != 1:
        parser.error("Need to supply JSON file name")
    jsonfile = args[0]

    # Read the input JSON
    params,target_dir = read_input_json(jsonfile)

    # Make the target directory
    print "Making %s" % target_dir
    os.mkdir(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables,'mothur_lookup')
    add_data_table(data_tables,'mothur_aligndb')
    add_data_table(data_tables,'mothur_map')
    add_data_table(data_tables,'mothur_taxonomy')

    # Fetch data from specified data sources
    if options.data_source == 'mothur_website':
        datasets = options.datasets.split(',')
        fetch_from_mothur_website(data_tables,target_dir,datasets)
    elif options.data_source == 'filesystem_paths':
        # Check description text
        description = options.description.strip()
        # Get list of paths (need to remove any escapes for '\n' and '\r'
        # that might have been inserted by Galaxy)
        paths = options.paths.replace('__cn__','\n').replace('__cr__','\r').split()
        import_from_server(data_tables,target_dir,paths,description,
                           link_to_data=options.link_to_data)

    # Write output JSON
    print "Outputting JSON"
    print str(to_json_string(data_tables))
    open(jsonfile,'wb').write(to_json_string(data_tables))
    print "Done."