Mercurial > repos > pjbriggs > data_manager_mothur_toolsuite

data_manager/fetch_mothur_reference_data.py @ 0:b90e0f2bf4b1 draft default tip

Initial version.

| author   | pjbriggs                        |
|----------|---------------------------------|
| date     | Tue, 18 Nov 2014 09:57:33 -0500 |
| parents  |                                 |
| children |                                 |
#!/usr/bin/env python
#
# Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
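#
# Example invocations (an illustrative sketch based on the options defined
# in the main block below; the JSON file name and filesystem paths are
# hypothetical, as the JSON is normally supplied by Galaxy at run time):
#
#   fetch_mothur_reference_data.py --source mothur_website \
#       --datasets lookup_titanium,RDP_v10 input.json
#
#   fetch_mothur_reference_data.py --source filesystem_paths \
#       --paths /path/to/reference/files --description "Local data" \
#       --link input.json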
import sys
import os
import optparse
import tempfile
import shutil
import urllib2
import zipfile
import tarfile

from galaxy.util.json import from_json_string, to_json_string

# When extracting files from archives, skip names that
# start with the following strings
IGNORE_PATHS = ('.','__MACOSX/','__')

# Map file extensions to data table names
MOTHUR_FILE_TYPES = { ".map": "map",
                      ".fasta": "aligndb",
                      ".pat": "lookup",
                      ".tax": "taxonomy" }

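# For example (illustrative only, not part of the original mapping), a file
# called 'example.fasta' is typed as 'aligndb' and ends up in the
# 'mothur_aligndb' data table, since entries are added to tables named
# 'mothur_<type>' by the functions further down:
#
#   MOTHUR_FILE_TYPES[os.path.splitext('example.fasta')[1]]  # -> 'aligndb'
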
# Reference data URLs
MOTHUR_REFERENCE_DATA = {
    # Look up data
    # http://www.mothur.org/wiki/Lookup_files
    "lookup_titanium": {
        "GS FLX Titanium": ["http://www.mothur.org/w/images/9/96/LookUp_Titanium.zip",]
    },
    "lookup_gsflx": {
        "GSFLX": ["http://www.mothur.org/w/images/8/84/LookUp_GSFLX.zip",]
    },
    "lookup_gs20": {
        "GS20": ["http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip",]
    },
    # RDP reference files
    # http://www.mothur.org/wiki/RDP_reference_files
    "RDP_v10": {
        "16S rRNA RDP training set 10":
        ["http://www.mothur.org/w/images/b/b5/Trainset10_082014.rdp.tgz",],
        "16S rRNA PDS training set 10":
        ["http://www.mothur.org/w/images/2/24/Trainset10_082014.pds.tgz",],
    },
    "RDP_v9": {
        "16S rRNA RDP training set 9":
        ["http://www.mothur.org/w/images/7/72/Trainset9_032012.rdp.zip",],
        "16S rRNA PDS training set 9":
        ["http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip",],
    },
    "RDP_v7": {
        "16S rRNA RDP training set 7":
        ["http://www.mothur.org/w/images/2/29/Trainset7_112011.rdp.zip",],
        "16S rRNA PDS training set 7":
        ["http://www.mothur.org/w/images/4/4a/Trainset7_112011.pds.zip",],
| 57 "8S rRNA Fungi training set 7": | |
| 58 ["http://www.mothur.org/w/images/3/36/FungiLSU_train_v7.zip",], | |
| 59 }, | |
| 60 "RDP_v6": { | |
| 61 "RDP training set 6": | |
| 62 ["http://www.mothur.org/w/images/4/49/RDPTrainingSet.zip",], | |
| 63 }, | |
| 64 # Silva reference files | |
| 65 # http://www.mothur.org/wiki/Silva_reference_files | |
| 66 "silva_release_119": { | |
| 67 "SILVA release 119": | |
| 68 ["http://www.mothur.org/w/images/2/27/Silva.nr_v119.tgz", | |
| 69 "http://www.mothur.org/w/images/5/56/Silva.seed_v119.tgz",], | |
| 70 }, | |
| 71 "silva_release_102": { | |
| 72 "SILVA release 102": | |
| 73 ["http://www.mothur.org/w/images/9/98/Silva.bacteria.zip", | |
| 74 "http://www.mothur.org/w/images/3/3c/Silva.archaea.zip", | |
| 75 "http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip",], | |
| 76 }, | |
| 77 "silva_gold_bacteria": { | |
| 78 "SILVA gold": | |
| 79 ["http://www.mothur.org/w/images/f/f1/Silva.gold.bacteria.zip",], | |
| 80 }, | |
| 81 # Greengenes | |
| 82 # http://www.mothur.org/wiki/Greengenes-formatted_databases | |
| 83 "greengenes_August2013": { | |
| 84 "Greengenes August 2013": | |
| 85 ["http://www.mothur.org/w/images/1/19/Gg_13_8_99.refalign.tgz", | |
| 86 "http://www.mothur.org/w/images/6/68/Gg_13_8_99.taxonomy.tgz",], | |
| 87 }, | |
| 88 "greengenes_May2013": { | |
| 89 "Greengenes May 2013": | |
| 90 ["http://www.mothur.org/w/images/c/cd/Gg_13_5_99.refalign.tgz", | |
| 91 "http://www.mothur.org/w/images/9/9d/Gg_13_5_99.taxonomy.tgz",], | |
| 92 }, | |
| 93 "greengenes_old": { | |
| 94 "Greengenes pre-May 2013": | |
| 95 ["http://www.mothur.org/w/images/7/72/Greengenes.alignment.zip", | |
| 96 "http://www.mothur.org/w/images/1/16/Greengenes.tax.tgz",], | |
| 97 }, | |
| 98 "greengenes_gold_alignment": { | |
| 99 "Greengenes gold alignment": | |
| 100 ["http://www.mothur.org/w/images/2/21/Greengenes.gold.alignment.zip",], | |
| 101 }, | |
| 102 # Secondary structure maps | |
| 103 # http://www.mothur.org/wiki/Secondary_structure_map | |
| 104 "secondary_structure_maps_silva": { | |
| 105 "SILVA": | |
| 106 ["http://www.mothur.org/w/images/6/6d/Silva_ss_map.zip",], | |
| 107 }, | |
| 108 "secondary_structure_maps_greengenes": { | |
| 109 "Greengenes": | |
| 110 ["http://www.mothur.org/w/images/4/4b/Gg_ss_map.zip",], | |
| 111 }, | |
| 112 # Lane masks: not used here? | |
| 113 "lane_masks": { | |
| 114 "Greengenes-compatible": | |
| 115 ["http://www.mothur.org/w/images/2/2a/Lane1241.gg.filter", | |
| 116 "http://www.mothur.org/w/images/a/a0/Lane1287.gg.filter", | |
| 117 "http://www.mothur.org/w/images/3/3d/Lane1349.gg.filter",], | |
| 118 "SILVA-compatible": | |
| 119 ["http://www.mothur.org/w/images/6/6d/Lane1349.silva.filter",] | |
| 120 }, | |
| 121 } | |
| 122 | |
# Utility functions for interacting with Galaxy JSON

def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict,extra_files_path)

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially; it is the job of the script
    to create it if necessary.

    """
    params = from_json_string(open(jsonfile).read())
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])

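# For reference, an illustrative sketch of the minimal JSON structure that
# read_input_json relies on (only the keys actually read above are shown;
# this is not a full description of the Galaxy data manager JSON):
#
#   {
#     "param_dict": { "...tool parameters...": "..." },
#     "output_data": [ { "extra_files_path": "/path/for/output/files" } ]
#   }
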
# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d,'my_data')
# >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
# >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
# >>> print str(to_json_string(d))

def create_data_tables_dict():
    """Return a dictionary for storing data table information

    Returns a dictionary that can be used with 'add_data_table'
    and 'add_data_table_entry' to store information about a
    data table. It can be converted to JSON to be sent back to
    the data manager.

    """
    d = {}
    d['data_tables'] = {}
    return d

def add_data_table(d,table):
    """Add a data table to the data tables dictionary

    Creates a placeholder for a data table called 'table'.

    """
    d['data_tables'][table] = []

def add_data_table_entry(d,table,entry):
    """Add an entry to a data table

    Appends an entry to the data table 'table'. 'entry'
    should be a dictionary where the keys are the names of
    columns in the data table.

    Raises an exception if the named data table doesn't
    exist.

    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)

# Utility functions for downloading and unpacking archive files

def download_file(url,target=None,wd=None):
    """Download a file from a URL

    Fetches a file from the specified URL.

    If 'target' is specified then the file is saved to this
    name; otherwise it's saved as the basename of the URL.

    If 'wd' is specified then it is used as the 'working
    directory' where the file will be saved on the local
    system.

    Returns the name that the file is saved with.

    """
    print "Downloading %s" % url
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd,target)
    print "Saving to %s" % target
    open(target,'wb').write(urllib2.urlopen(url).read())
    return target

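# Illustrative usage sketch (the working directory name here is hypothetical;
# the URL is one of the lane mask files listed above):
#
#   filen = download_file("http://www.mothur.org/w/images/2/2a/Lane1241.gg.filter",
#                         wd="/tmp/mothur_work")
#   # filen -> "/tmp/mothur_work/Lane1241.gg.filter"
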
def unpack_zip_archive(filen,wd=None):
    """Extract files from a ZIP archive

    Given a ZIP archive, extract the files it contains
    and return a list of the resulting file names and
    paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the ZIP archive
    file is deleted from the file system.

    """
    if not zipfile.is_zipfile(filen):
        print "%s: not ZIP formatted file" % filen
        return [filen]
    file_list = []
    z = zipfile.ZipFile(filen)
    for name in z.namelist():
        if reduce(lambda x,y: x or name.startswith(y),IGNORE_PATHS,False):
            print "Ignoring %s" % name
            continue
        if wd:
            target = os.path.join(wd,name)
        else:
            target = name
        if name.endswith('/'):
            # Make directory
            print "Creating dir %s" % target
            try:
                os.makedirs(target)
            except OSError:
                pass
        else:
            # Extract file
            print "Extracting %s" % name
            try:
                os.makedirs(os.path.dirname(target))
            except OSError:
                pass
            open(target,'wb').write(z.read(name))
            file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list

def unpack_tar_archive(filen,wd=None):
    """Extract files from a TAR archive

    Given a TAR archive (which optionally can be
    compressed with either gzip or bz2), extract the
    files it contains and return a list of the
    resulting file names and paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Once all the files are extracted the TAR archive
    file is deleted from the file system.

    """
    file_list = []
    if wd:
        path = wd
    else:
        path = '.'
    if not tarfile.is_tarfile(filen):
        print "%s: not TAR file" % filen
        return [filen]
    t = tarfile.open(filen)
    for name in t.getnames():
        # Check for unwanted files
        if reduce(lambda x,y: x or name.startswith(y),IGNORE_PATHS,False):
            print "Ignoring %s" % name
            continue
        # Extract file
        print "Extracting %s" % name
        t.extract(name,path)
        if wd:
            target = os.path.join(wd,name)
        else:
            target = name
        file_list.append(target)
    print "Removing %s" % filen
    os.remove(filen)
    return file_list

def unpack_archive(filen,wd=None):
    """Extract files from an archive

    Wrapper function that calls the appropriate
    unpacking function depending on the archive
    type, and returns a list of files that have
    been extracted.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    """
    print "Unpack %s" % filen
    ext = os.path.splitext(filen)[1]
    print "Extension: %s" % ext
    if ext == ".zip":
        return unpack_zip_archive(filen,wd=wd)
    elif ext == ".tgz":
        return unpack_tar_archive(filen,wd=wd)
    else:
        return [filen]

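# Illustrative behaviour sketch (file names taken from the URLs above, paths
# omitted for brevity): '.zip' and '.tgz' downloads are expanded and removed,
# while anything else (e.g. the '.filter' lane mask files) passes through
# untouched:
#
#   unpack_archive("LookUp_GS20.zip",wd=wd)     # -> list of extracted files
#   unpack_archive("Lane1349.gg.filter",wd=wd)  # -> ["Lane1349.gg.filter"]
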
def fetch_files(urls,wd=None,files=None):
    """Download and unpack files from a list of URLs

    Given a list of URLs, download and unpack each
    one, and return a list of the extracted files.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    If 'files' is given then the list of extracted
    files will be appended to this list before being
    returned.

    """
    if files is None:
        files = []
    for url in urls:
        filen = download_file(url,wd=wd)
        files.extend(unpack_archive(filen,wd=wd))
    return files

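# Illustrative usage sketch (the working directory is hypothetical): fetch and
# unpack all files for one of the reference datasets defined above:
#
#   files = fetch_files(MOTHUR_REFERENCE_DATA["silva_gold_bacteria"]["SILVA gold"],
#                       wd="/tmp/mothur_work")
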
# Utility functions specific to the Mothur reference data

def identify_type(filen):
    """Return the data table name based on the file name

    """
    ext = os.path.splitext(filen)[1]
    try:
        return MOTHUR_FILE_TYPES[ext]
    except KeyError:
        return None

def get_name(filen):
    """Generate a descriptive name based on the file name
    """
    type_ = identify_type(filen)
    name = os.path.splitext(os.path.basename(filen))[0]
    for delim in ('.','_'):
        name = name.replace(delim,' ')
    return name

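# Worked examples (the file name is hypothetical, used only for illustration):
#
#   identify_type("/tmp/work/example_v1.fasta")  # -> "aligndb"
#   get_name("/tmp/work/example_v1.fasta")       # -> "example v1"
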
def fetch_from_mothur_website(data_tables,target_dir,datasets):
    """Fetch reference data from the Mothur website

    For each dataset in the list 'datasets', download (and if
    necessary unpack) the related files from the Mothur website,
    copy them to the data manager's target directory, and add
    references to the files to the appropriate data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the downloaded files
      datasets: a list of dataset names corresponding to keys in
        the MOTHUR_REFERENCE_DATA dictionary

    """
    # Make working dir
    wd = tempfile.mkdtemp(suffix=".mothur",dir=os.getcwd())
    print "Working dir %s" % wd
    # Iterate over all requested reference data URLs
    for dataset in datasets:
        print "Handling dataset '%s'" % dataset
        for name in MOTHUR_REFERENCE_DATA[dataset]:
            for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name],wd=wd):
                type_ = identify_type(f)
                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0],name)
                print "%s\t\'%s'\t.../%s" % (type_,
                                             entry_name,
                                             os.path.basename(f))
                if type_ is not None:
                    # Move to target dir
                    ref_data_file = os.path.basename(f)
                    f1 = os.path.join(target_dir,ref_data_file)
                    print "Moving %s to %s" % (f,f1)
                    os.rename(f,f1)
                    # Add entry to data table
                    table_name = "mothur_%s" % type_
                    add_data_table_entry(data_tables,table_name,dict(name=entry_name,
                                                                     value=ref_data_file))
    # Remove working dir
    print "Removing %s" % wd
    shutil.rmtree(wd)

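# Illustrative sketch of the resulting data table entry (the extracted file
# name is hypothetical): if a fetched archive yields 'example.fasta' under
# the dataset name "SILVA gold", the entry added to the 'mothur_aligndb'
# table is roughly:
#
#   dict(name="example (SILVA gold)",value="example.fasta")
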
def files_from_filesystem_paths(paths):
    """Return list of file paths from arbitrary input paths

    Given a list of filesystem paths, return a list of
    full paths corresponding to all files found recursively
    from under those paths.

    """
    # Collect files to add
    files = []
    for path in paths:
        path = os.path.abspath(path)
        print "Examining '%s'..." % path
        if os.path.isfile(path):
            # Store full path for file
            files.append(path)
        elif os.path.isdir(path):
            # Descend into directory and collect the files
            for f in os.listdir(path):
                files.extend(files_from_filesystem_paths((os.path.join(path,f),)))
        else:
            print "Not a file or directory, ignored"
    return files

def import_from_server(data_tables,target_dir,paths,description,
                       link_to_data=False):
    """Import reference data from filesystem paths

    Creates references to the specified file(s) on the Galaxy
    server in the appropriate data table (determined from the
    file extension).

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the copy of, or link to, the data file
      paths: list of file and/or directory paths to import
      description: text to associate with the files
      link_to_data: boolean, if False then copy the data file
        into Galaxy (default); if True then make a symlink to
        the data file

    """
    # Collect list of files based on input paths
    files = files_from_filesystem_paths(paths)
    # Handle each file individually
    for f in files:
        type_ = identify_type(f)
        if type_ is None:
            print "%s: unrecognised type, skipped" % f
            continue
        ref_data_file = os.path.basename(f)
        target_file = os.path.join(target_dir,ref_data_file)
        entry_name = "%s" % os.path.splitext(ref_data_file)[0]
        if description:
            entry_name += " (%s)" % description
        print "%s\t\'%s'\t.../%s" % (type_,
                                     entry_name,
                                     ref_data_file)
        # Link to or copy the data
        if link_to_data:
            os.symlink(f,target_file)
        else:
            shutil.copyfile(f,target_file)
        # Add entry to data table
        table_name = "mothur_%s" % type_
        add_data_table_entry(data_tables,table_name,dict(name=entry_name,
                                                         value=ref_data_file))

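# Illustrative usage sketch (the path and description are hypothetical):
# register a locally-held taxonomy file by symlinking rather than copying:
#
#   import_from_server(data_tables,target_dir,
#                      ["/data/mothur_refs/custom_trainset.tax"],
#                      "Local training set",link_to_data=True)
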
if __name__ == "__main__":
    print "Starting..."

    # Read command line
    parser = optparse.OptionParser()
    parser.add_option('--source',action='store',dest='data_source')
    parser.add_option('--datasets',action='store',dest='datasets',default='')
    parser.add_option('--paths',action='store',dest='paths',default='')
    parser.add_option('--description',action='store',dest='description',default='')
    parser.add_option('--link',action='store_true',dest='link_to_data')
    options,args = parser.parse_args()
    print "options: %s" % options
    print "args   : %s" % args

    # Check for JSON file
    if len(args) != 1:
        parser.error("Need to supply JSON file name")
    jsonfile = args[0]

    # Read the input JSON
    params,target_dir = read_input_json(jsonfile)

    # Make the target directory
    print "Making %s" % target_dir
    os.mkdir(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables,'mothur_lookup')
    add_data_table(data_tables,'mothur_aligndb')
    add_data_table(data_tables,'mothur_map')
    add_data_table(data_tables,'mothur_taxonomy')

    # Fetch data from specified data sources
    if options.data_source == 'mothur_website':
        datasets = options.datasets.split(',')
        fetch_from_mothur_website(data_tables,target_dir,datasets)
    elif options.data_source == 'filesystem_paths':
        # Check description text
        description = options.description.strip()
        # Get list of paths (need to remove any escapes for '\n' and '\r'
        # that might have been inserted by Galaxy)
        paths = options.paths.replace('__cn__','\n').replace('__cr__','\r').split()
        import_from_server(data_tables,target_dir,paths,description,
                           link_to_data=options.link_to_data)
    # Write output JSON
    print "Outputting JSON"
    print str(to_json_string(data_tables))
    open(jsonfile,'wb').write(to_json_string(data_tables))
    print "Done."
