Mercurial > repos > rhpvorderman > data_manager_mothur_toolsuite
changeset 0:3f6f7ca5f95a draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit 821349cd1c414dcad2128ba78d2b9d9051b3e75b"
author | rhpvorderman |
---|---|
date | Tue, 22 Jun 2021 12:07:41 +0000 |
parents | |
children | 3f8a5d7fe819 |
files | README data_manager/data_manager_fetch_mothur_reference_data.xml data_manager/fetch_mothur_reference_data.py data_manager_conf.xml tool-data/mothur_aligndb.loc.sample tool-data/mothur_lookup.loc.sample tool-data/mothur_map.loc.sample tool-data/mothur_taxonomy.loc.sample tool_data_table_conf.xml.sample |
diffstat | 9 files changed, 1071 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,3 @@ +Data manager to install reference data for Mothur toolsuite + +Imported from https://github.com/fls-bioinformatics-core/galaxy-tools/tree/master/data_manager_mothur_toolsuite
<?xml version="1.0"?>
<tool id="data_manager_fetch_mothur_reference_data" name="Fetch Mothur toolsuite reference data" version="0.1.5" tool_type="manage_data" profile="19.05">
    <description>Fetch and install reference data for Mothur</description>
    <requirements>
        <requirement type="package" version="3.8">python</requirement>
    </requirements>
    <command><![CDATA[
        python '$__tool_directory__/fetch_mothur_reference_data.py'
        --source='$data_source.data_source_selector'
        #if str( $data_source.data_source_selector ) == "mothur_website"
            --datasets '${data_source.ref_data}'
        #elif str( $data_source.data_source_selector ) == "filesystem_paths"
            --description '${data_source.description}'
            --paths '${data_source.paths}'
            #if $data_source.create_symlink
                --link
            #end if
        #end if
        '${out_file}'
    ]]></command>
    <inputs>
        <conditional name="data_source">
            <param name="data_source_selector" type="select"
                   label="Choose the source for the reference data">
                <option value="mothur_website">Mothur website</option>
                <option value="filesystem_paths">Filesystem paths</option>
            </param>
            <when value="mothur_website">
                <param name="ref_data" type="select" display="checkboxes" multiple="true"
                       label="Reference dataset to install">
                    <option value="lookup_titanium">GS FLX Titanium lookup files</option>
                    <option value="lookup_gsflx">GSFLX lookup files</option>
                    <option value="lookup_gs20">GS20 lookup files</option>
                    <option value="RDP_v16">RDP reference files (training set version 16)</option>
                    <option value="RDP_v14">RDP reference files (training set version 14)</option>
                    <option value="RDP_v10">RDP reference files (training set version 10)</option>
                    <option value="RDP_v9">RDP reference files (training set version 9)</option>
                    <option value="RDP_v7">RDP reference files (training set version 7)</option>
                    <option value="RDP_v6">RDP reference files (training set version 6)</option>
                    <option value="silva_release_128">SILVA reference files (release 128)</option>
                    <option value="silva_release_123">SILVA reference files (release 123)</option>
                    <option value="silva_release_119">SILVA reference files (release 119)</option>
                    <option value="silva_release_102">SILVA reference files (release 102)</option>
                    <option value="greengenes_August2013">Greengenes reference taxonomy and alignment v13.8 (August 2013)</option>
                    <option value="greengenes_May2013">Greengenes reference taxonomy and alignment v13.5 (May 2013)</option>
                    <option value="greengenes_old">Greengenes reference taxonomy and alignment (pre-May 2013)</option>
                    <option value="greengenes_gold_alignment">Greengenes gold alignment</option>
                    <option value="secondary_structure_maps_silva">SILVA secondary structure maps</option>
                    <option value="secondary_structure_maps_greengenes">Greengenes secondary structure maps</option>
                </param>
            </when>
            <when value="filesystem_paths">
                <!-- 'size' is a deprecated no-op attribute and has been dropped;
                     boolean-valued attributes must be lowercase per the tool schema -->
                <param name="description" type="text" value=""
                       label="Description of the data" optional="false"/>
                <param name="paths" type="text" value="" area="true"
                       label="Paths to upload" optional="false"
                       help="Upload all files pasted in the box. The (recursive) contents of any pasted directories will be added as well."/>
                <param name="create_symlink" type="boolean" truevalue="create_symlink"
                       falsevalue="copy_file"
                       label="Create symlinks to data instead of copying into Galaxy" checked="true"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="out_file" format="data_manager_json"/>
    </outputs>
    <tests>
        <test>
            <param name="data_source|ref_data" value="lookup_titanium"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="GS FLX Titanium"/>
                    <has_text text="LookUp_Titanium.pat"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="lookup_gsflx"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="GSFLX"/>
                    <has_text text="LookUp_GSFLX.pat"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="lookup_gs20"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="GS20"/>
                    <has_text text="LookUp_GS20.pat"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v16"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA RDP training set 16"/>
                    <has_text text="trainset16_022016.rdp.fasta"/>
                    <has_text text="trainset16_022016.rdp.tax"/>
                    <has_text text="trainset16_022016.pds.fasta"/>
                    <has_text text="trainset16_022016.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v14"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA RDP training set 14"/>
                    <has_text text="trainset14_032015.rdp.fasta"/>
                    <has_text text="trainset14_032015.rdp.tax"/>
                    <has_text text="trainset14_032015.pds.fasta"/>
                    <has_text text="trainset14_032015.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v10"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA RDP training set 10"/>
                    <has_text text="trainset10_082014.rdp.fasta"/>
                    <has_text text="trainset10_082014.rdp.tax"/>
                    <has_text text="trainset10_082014.pds.fasta"/>
                    <has_text text="trainset10_082014.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v9"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA PDS training set 9"/>
                    <has_text text="trainset9_032012.rdp.fasta"/>
                    <has_text text="trainset9_032012.rdp.tax"/>
                    <has_text text="trainset9_032012.pds.fasta"/>
                    <has_text text="trainset9_032012.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v7"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA RDP training set 7"/>
                    <has_text text="FungiLSU_train_1400bp_8506_mod.fasta"/>
                    <has_text text="FungiLSU_train_1400bp_8506_mod.tax"/>
                    <has_text text="trainset7_112011.rdp.fasta"/>
                    <has_text text="trainset7_112011.rdp.tax"/>
                    <has_text text="trainset7_112011.pds.fasta"/>
                    <has_text text="trainset7_112011.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v6"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="RDP training set 6"/>
                    <has_text text="trainset6_032010.rdp.fasta"/>
                    <has_text text="trainset6_032010.rdp.tax"/>
                </assert_contents>
            </output>
        </test>
        <!-- SILVA data is too large (>1GB each) for CI testing on github actions
             so we skip them -->
        <!--<test>
            <param name="data_source|ref_data" value="silva_release_128"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA release 128"/>
                    <has_text text="silva.nr_v128.tax"/>
                    <has_text text="silva.seed_v128.tax"/>
                    <has_text text="silva.nr_v128.align"/>
                    <has_text text="silva.seed_v128.align"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="silva_release_123"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA release 123"/>
                    <has_text text="silva.nr_v123.align"/>
                    <has_text text="silva.seed_v123.align"/>
                    <has_text text="silva.nr_v123.tax"/>
                    <has_text text="silva.seed_v123.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="silva_release_119"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA release 119"/>
                    <has_text text="silva.nr_v119.align"/>
                    <has_text text="silva.seed_v119.align"/>
                    <has_text text="silva.nr_v119.tax"/>
                    <has_text text="silva.seed_v119.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="silva_release_102"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA release 102"/>
                    <has_text text="silva.bacteria.fasta"/>
                    <has_text text="silva.gold.ng.fasta"/>
                    <has_text text="nogap.archaea.fasta"/>
                    <has_text text="silva.archaea.fasta"/>
                    <has_text text="nogap.eukarya.fasta"/>
                    <has_text text="silva.eukarya.fasta"/>
                    <has_text text="silva.bacteria.gg.tax"/>
                    <has_text text="silva.bacteria.ncbi.tax"/>
                    <has_text text="silva.bacteria.rdp.tax"/>
                    <has_text text="silva.bacteria.rdp6.tax"/>
                    <has_text text="silva.bacteria.silva.tax"/>
                    <has_text text="silva.archaea.gg.tax"/>
                    <has_text text="silva.archaea.ncbi.tax"/>
                    <has_text text="silva.archaea.rdp.tax"/>
                    <has_text text="silva.archaea.silva.tax"/>
                    <has_text text="silva.eukarya.ncbi.tax"/>
                    <has_text text="silva.eukarya.silva.tax"/>
                </assert_contents>
            </output>
        </test>-->

        <!-- also greengenes is large (400MB-1.5GB) so only tests for older
             (smaller) releases are executed -->
        <!--<test>
            <param name="data_source|ref_data" value="greengenes_August2013"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes August 2013"/>
                    <has_text text="gg_13_8_99.gg.tax"/>
                    <has_text text="gg_13_8_99.fasta"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="greengenes_May2013"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes May 2013"/>
                    <has_text text="gg_13_5_99.pds.tax"/>
                    <has_text text="gg_13_5_99.gg.tax"/>
                    <has_text text="gg_13_5_99.align"/>
                    <has_text text="gg_13_5_99.fasta"/>
                </assert_contents>
            </output>
        </test>-->
        <test>
            <param name="data_source|ref_data" value="greengenes_old"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes pre-May 2013"/>
                    <has_text text="gg_99.pds.tax"/>
                    <has_text text="core_set_aligned.imputed.fasta"/>
                    <has_text text="gg_99.pds.ng.fasta"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="greengenes_gold_alignment"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes gold alignment"/>
                    <has_text text="rRNA16S.gold.NAST_ALIGNED.fasta"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="secondary_structure_maps_silva"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA"/>
                    <has_text text="silva.ss.map"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="secondary_structure_maps_greengenes"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes"/>
                    <has_text text="gg.ss.map"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
.. class:: infomark

**What it does**

This tool fetches reference data used by the mothur_toolsuite set of Galaxy tools,
and populates the appropriate data tables.

The reference data can be imported directly from the Mothur website, or from files
in a server directory.

Files are added to the following data tables based on file extension:

 * **mothur_lookup**: for .pat files
 * **mothur_aligndb**: for .fasta files
 * **mothur_map**: for .map files
 * **mothur_taxonomy**: for .tax files

------

**Importing from Mothur website**

Reference data sets provided by the Mothur developers can be downloaded from the
Mothur website. See the following pages to get more information about each dataset:

 * Lookup data: http://www.mothur.org/wiki/Lookup_files
 * RDP reference data: http://www.mothur.org/wiki/RDP_reference_files
 * Silva data: http://www.mothur.org/wiki/Silva_reference_files
 * Greengenes data: http://www.mothur.org/wiki/Greengenes-formatted_databases
 * Secondary structure maps: http://www.mothur.org/wiki/Secondary_structure_map

**Importing from file system paths**

If reference data is already on the server filesystem then use this option to
import it into the Mothur data tables. The appropriate data tables are determined
based on the file extensions.

Optionally a description can be added which will appear next to the base of the
reference file name in the data table entry.

------

.. class:: warningmark

**A note on Lane masks**

Lane mask data is also available via the Mothur website (files ending in ".filter"):

 * http://www.mothur.org/wiki/Lane_mask

but as these data are not currently used in the toolsuite, they cannot be imported
using this data manager.

    </help>
</tool>
#!/usr/bin/env python3
#
# Data manager for reference data for the 'mothur_toolsuite' Galaxy tools.
#
# Downloads (or copies/links) Mothur reference data, puts the files in the
# data manager's target directory, and writes a Galaxy data manager JSON
# description of the new data table entries back to the supplied JSON file.
import json
import optparse
import os
import shutil
import sys
import tarfile
import tempfile
import urllib.request
import zipfile

# When extracting files from archives, skip names that
# start with the following strings (note that '.' also
# filters out '..' components)
IGNORE_PATHS = ('.', '__MACOSX/', '__')

# Map file extensions to data table names
MOTHUR_FILE_TYPES = {".map": "map",
                     ".fasta": "aligndb",
                     ".align": "aligndb",
                     ".pat": "lookup",
                     ".tax": "taxonomy"}

# Reference data URLs, keyed by dataset id, then by display name
MOTHUR_REFERENCE_DATA = {
    # Look up data
    # http://www.mothur.org/wiki/Lookup_files
    "lookup_titanium": {
        "GS FLX Titanium": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_titanium.zip"],
    },
    "lookup_gsflx": {
        "GSFLX": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gsflx.zip"],
    },
    "lookup_gs20": {
        "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip"],
    },
    # RDP reference files
    # http://www.mothur.org/wiki/RDP_reference_files
    "RDP_v18": {
        "16S rRNA RDP training set 18":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz"],
        "16S rRNA PDS training set 18":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.pds.tgz"],
    },
    "RDP_v16": {
        "16S rRNA RDP training set 16":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz"],
        "16S rRNA PDS training set 16":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz"],
    },
    "RDP_v14": {
        "16S rRNA RDP training set 14":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset14_032015.rdp.tgz"],
        "16S rRNA PDS training set 14":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset14_032015.pds.tgz"],
    },
    "RDP_v10": {
        "16S rRNA RDP training set 10":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset10_082014.rdp.tgz"],
        "16S rRNA PDS training set 10":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset10_082014.pds.tgz"],
    },
    "RDP_v9": {
        "16S rRNA RDP training set 9":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset9_032012.rdp.zip"],
        "16S rRNA PDS training set 9":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset9_032012.pds.zip"],
    },
    "RDP_v7": {
        "16S rRNA RDP training set 7":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset7_112011.rdp.zip"],
        "16S rRNA PDS training set 7":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset7_112011.pds.zip"],
        "8S rRNA Fungi training set 7":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/fungilsu_train_v7.zip"],
    },
    "RDP_v6": {
        "RDP training set 6":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip"],
    },
    # Silva reference files
    # http://www.mothur.org/wiki/Silva_reference_files
    "silva_release_138.1": {
        "SILVA release 138.1":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz"],
    },
    "silva_release_128": {
        "SILVA release 128":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz"],
    },
    "silva_release_123": {
        "SILVA release 123":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v123.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v123.tgz"],
    },
    "silva_release_119": {
        "SILVA release 119":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v119.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v119.tgz"],
    },
    "silva_release_102": {
        "SILVA release 102":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.bacteria.zip",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.archaea.zip",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.eukarya.zip"],
    },
    "silva_gold_bacteria": {
        "SILVA gold":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.gold.bacteria.zip"],
    },
    # Greengenes
    # http://www.mothur.org/wiki/Greengenes-formatted_databases
    "greengenes_August2013": {
        "Greengenes August 2013":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_13_8_99.refalign.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_13_8_99.taxonomy.tgz"],
    },
    "greengenes_May2013": {
        "Greengenes May 2013":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_13_5_99.refalign.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_13_5_99.taxonomy.tgz"],
    },
    "greengenes_old": {
        "Greengenes pre-May 2013":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/greengenes.alignment.zip",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/greengenes.tax.tgz"],
    },
    "greengenes_gold_alignment": {
        "Greengenes gold alignment":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/greengenes.gold.alignment.zip"],
    },
    # Secondary structure maps
    # http://www.mothur.org/wiki/Secondary_structure_map
    "secondary_structure_maps_silva": {
        "SILVA":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva_ss_map.zip"],
    },
    "secondary_structure_maps_greengenes": {
        "Greengenes":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_ss_map.zip"],
    },
    # Lane masks: not used here?
    "lane_masks": {
        "Greengenes-compatible":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/Lane1241.gg.filter",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/lane1287.gg.filter",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/lane1349.gg.filter"],
        "SILVA-compatible":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lane1349.silva.filter"],
    },
}


# Utility functions for interacting with Galaxy JSON
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool.

    Returns a tuple (param_dict, extra_files_path).

    'param_dict' is an arbitrary dictionary of parameters input into
    the tool; 'extra_files_path' is the path to a directory where
    output files must be put for the receiving data manager to pick
    them up.

    NB the directory pointed to by 'extra_files_path' doesn't exist
    initially; it is the job of the script to create it if necessary.
    """
    with open(jsonfile) as fh:
        params = json.load(fh)
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])


# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d, 'my_data')
# >>> add_data_table_entry(d, 'my_data', dict(dbkey='hg19', value='human'))
# >>> print(json.dumps(d))
def create_data_tables_dict():
    """Return a dictionary for storing data table information.

    The returned dictionary can be used with 'add_data_table' and
    'add_data_table_entry' and converted to JSON to be sent back to
    the data manager.
    """
    return {'data_tables': {}}


def add_data_table(d, table):
    """Create a placeholder for a data table called 'table' in 'd'."""
    d['data_tables'][table] = []


def add_data_table_entry(d, table, entry):
    """Append 'entry' to the data table 'table'.

    'entry' should be a dictionary whose keys are the names of columns
    in the data table.

    Raises an exception if the named data table doesn't exist.
    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)


# Utility functions for downloading and unpacking archive files
def download_file(url, target=None, wd=None):
    """Download a file from a URL.

    If 'target' is specified then the file is saved to this name;
    otherwise it's saved as the basename of the URL. If 'wd' is
    specified then it is used as the working directory where the file
    will be saved on the local system.

    Returns the name that the file is saved with.
    """
    print("Downloading %s" % url)
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd, target)
    print("Saving to %s" % target)
    # Stream the response to disk instead of reading it all into
    # memory: some reference archives are larger than 1GB
    with urllib.request.urlopen(url) as resp, open(target, 'wb') as fh:
        shutil.copyfileobj(resp, fh)
    return target


def unpack_zip_archive(filen, wd=None):
    """Extract files from a ZIP archive.

    Returns a list of the extracted file names/paths. 'wd' specifies
    the working directory to extract to (default: current directory).
    If 'filen' is not actually a ZIP file it is returned unextracted
    as a single-element list.

    Once all the files are extracted the ZIP archive file is deleted
    from the file system.
    """
    if not zipfile.is_zipfile(filen):
        # Fix: the original message left the '%s' placeholder unfilled
        print("%s: not ZIP formatted file" % filen)
        return [filen]
    file_list = []
    with zipfile.ZipFile(filen) as z:
        for name in z.namelist():
            # str.startswith accepts a tuple of prefixes
            if name.startswith(IGNORE_PATHS):
                print("Ignoring %s" % name)
                continue
            target = os.path.join(wd, name) if wd else name
            if name.endswith('/'):
                # Archive member is a directory
                print("Creating dir %s" % target)
                os.makedirs(target, exist_ok=True)
            else:
                # Archive member is a file
                print("Extracting %s" % name)
                dirname = os.path.dirname(target)
                if dirname:
                    os.makedirs(dirname, exist_ok=True)
                with open(target, 'wb') as fh:
                    fh.write(z.read(name))
                file_list.append(target)
    print("Removing %s" % filen)
    os.remove(filen)
    return file_list


def unpack_tar_archive(filen, wd=None):
    """Extract files from a TAR archive.

    The archive can optionally be compressed with gzip or bz2.
    Returns a list of the extracted names (directories included, as in
    the original implementation). 'wd' specifies the working directory
    to extract to (default: current directory). If 'filen' is not
    actually a TAR file it is returned unextracted as a single-element
    list.

    Once all the files are extracted the TAR archive file is deleted
    from the file system.
    """
    if not tarfile.is_tarfile(filen):
        # Fix: the original message left the '%s' placeholder unfilled
        print("%s: not TAR file" % filen)
        return [filen]
    file_list = []
    with tarfile.open(filen) as t:
        for name in t.getnames():
            # Skip unwanted names; also skip absolute member paths,
            # which could escape the working directory (IGNORE_PATHS
            # already filters names starting with '.', covering '..')
            if name.startswith(IGNORE_PATHS) or os.path.isabs(name):
                print("Ignoring %s" % name)
                continue
            print("Extracting %s" % name)
            t.extract(name, wd)
            file_list.append(os.path.join(wd, name) if wd else name)
    print("Removing %s" % filen)
    os.remove(filen)
    return file_list


def unpack_archive(filen, wd=None):
    """Extract files from an archive.

    Wrapper that dispatches to the appropriate unpacking function
    based on the file extension; anything that isn't a '.zip' or
    '.tgz' archive is returned as-is in a single-element list.

    'wd' specifies the working directory to extract to (default:
    current directory).
    """
    print("Unpack %s" % filen)
    ext = os.path.splitext(filen)[1]
    print("Extension: %s" % ext)
    if ext == ".zip":
        return unpack_zip_archive(filen, wd=wd)
    elif ext == ".tgz":
        return unpack_tar_archive(filen, wd=wd)
    return [filen]


def fetch_files(urls, wd=None, files=None):
    """Download and unpack files from a list of URLs.

    Returns a list of the extracted files. 'wd' specifies the working
    directory to extract to. If 'files' is given then the extracted
    files are appended to this list before it is returned.
    """
    if files is None:
        files = []
    for url in urls:
        filen = download_file(url, wd=wd)
        files.extend(unpack_archive(filen, wd=wd))
    return files


# Utility functions specific to the Mothur reference data
def identify_type(filen):
    """Return the data table type for 'filen' based on its extension.

    Returns None (with a warning) for unrecognised extensions.
    """
    ext = os.path.splitext(filen)[1]
    try:
        return MOTHUR_FILE_TYPES[ext]
    except KeyError:
        print("WARNING: unknown file type for " + filen + ", skipping")
        return None


def get_name(filen):
    """Generate a descriptive name based on the file name.

    The extension is dropped and '.' and '_' are turned into spaces.
    """
    name = os.path.splitext(os.path.basename(filen))[0]
    for delim in ('.', '_'):
        name = name.replace(delim, ' ')
    return name


def fetch_from_mothur_website(data_tables, target_dir, datasets):
    """Fetch reference data from the Mothur website.

    For each dataset in the list 'datasets', download (and if
    necessary unpack) the related files, move them to the data
    manager's target directory, and add references to the files to
    the appropriate data table.

    The 'data_tables' dictionary should have been created using the
    'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the downloaded files
      datasets: a list of dataset names corresponding to keys in
        the MOTHUR_REFERENCE_DATA dictionary
    """
    # Make a scratch dir for downloads; removed at the end
    wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd())
    print("Working dir %s" % wd)
    for dataset in datasets:
        print("Handling dataset '%s'" % dataset)
        for name, urls in MOTHUR_REFERENCE_DATA[dataset].items():
            for f in fetch_files(urls, wd=wd):
                type_ = identify_type(f)
                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name)
                print("%s\t'%s'\t.../%s" % (type_, entry_name, os.path.basename(f)))
                if type_ is not None:
                    # Move to target dir
                    ref_data_file = os.path.basename(f)
                    f1 = os.path.join(target_dir, ref_data_file)
                    print("Moving %s to %s" % (f, f1))
                    shutil.move(f, f1)
                    # Add entry to data table
                    table_name = "mothur_%s" % type_
                    add_data_table_entry(data_tables, table_name,
                                         dict(name=entry_name, value=ref_data_file))
    print("Removing %s" % wd)
    shutil.rmtree(wd)


def files_from_filesystem_paths(paths):
    """Return the list of files found (recursively) under 'paths'.

    Given a list of filesystem paths, returns the full paths of all
    files found directly or in any subdirectory; anything that is
    neither a file nor a directory is ignored with a message.
    """
    files = []
    for path in paths:
        path = os.path.abspath(path)
        print("Examining '%s'..." % path)
        if os.path.isfile(path):
            files.append(path)
        elif os.path.isdir(path):
            # Descend into directory and collect the files
            for f in os.listdir(path):
                files.extend(files_from_filesystem_paths((os.path.join(path, f),)))
        else:
            print("Not a file or directory, ignored")
    return files


def import_from_server(data_tables, target_dir, paths, description, link_to_data=False):
    """Import reference data from filesystem paths.

    Creates references to the specified file(s) on the Galaxy server
    in the appropriate data table (determined from the file
    extension).

    The 'data_tables' dictionary should have been created using the
    'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the copy or link to the data file
      paths: list of file and/or directory paths to import
      description: text to associate with the files
      link_to_data: if False (default) copy the data file into
        Galaxy; if True make a symlink to the data file
    """
    files = files_from_filesystem_paths(paths)
    for f in files:
        type_ = identify_type(f)
        if type_ is None:
            print("%s: unrecognised type, skipped" % f)
            continue
        ref_data_file = os.path.basename(f)
        target_file = os.path.join(target_dir, ref_data_file)
        entry_name = os.path.splitext(ref_data_file)[0]
        if description:
            entry_name += " (%s)" % description
        print("%s\t'%s'\t.../%s" % (type_, entry_name, ref_data_file))
        # Link to or copy the data
        if link_to_data:
            os.symlink(f, target_file)
        else:
            shutil.copyfile(f, target_file)
        # Add entry to data table
        table_name = "mothur_%s" % type_
        add_data_table_entry(data_tables, table_name,
                             dict(name=entry_name, value=ref_data_file))


if __name__ == "__main__":
    print("Starting...")

    # Read command line
    parser = optparse.OptionParser()
    parser.add_option('--source', action='store', dest='data_source')
    parser.add_option('--datasets', action='store', dest='datasets', default='')
    # Fix: default was a list ([]), but the value is later treated as a
    # string ('.replace()'/'.split()')
    parser.add_option('--paths', action='store', dest='paths', default='')
    parser.add_option('--description', action='store', dest='description', default='')
    parser.add_option('--link', action='store_true', dest='link_to_data')
    options, args = parser.parse_args()
    print("options: %s" % options)
    print("args   : %s" % args)

    # Check for JSON file
    if len(args) != 1:
        sys.stderr.write("Need to supply JSON file name")
        sys.exit(1)

    jsonfile = args[0]

    # Read the input JSON
    params, target_dir = read_input_json(jsonfile)

    # Make the target directory
    print("Making %s" % target_dir)
    os.mkdir(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, 'mothur_lookup')
    add_data_table(data_tables, 'mothur_aligndb')
    add_data_table(data_tables, 'mothur_map')
    add_data_table(data_tables, 'mothur_taxonomy')

    # Fetch data from specified data sources
    if options.data_source == 'mothur_website':
        datasets = options.datasets.split(',')
        fetch_from_mothur_website(data_tables, target_dir, datasets)
    elif options.data_source == 'filesystem_paths':
        # Check description text
        description = options.description.strip()
        # Get list of paths (need to remove any escapes for '\n' and '\r'
        # that might have been inserted by Galaxy)
        paths = options.paths.replace('__cn__', '\n').replace('__cr__', '\r').split()
        import_from_server(data_tables, target_dir, paths, description,
                           link_to_data=options.link_to_data)

    # Write output JSON
    print("Outputting JSON")
    with open(jsonfile, 'w') as fh:
        json.dump(data_tables, fh, sort_keys=True)
    print("Done.")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,57 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/data_manager_fetch_mothur_reference_data.xml" id="data_manager_mothur_fetch_reference_data"> + <data_table name="mothur_aligndb"> + <output> + <column name="name" /> + <column name="value" output_ref="out_file" > + <move type="file"> + <source>${value}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mothur/aligndb/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mothur/aligndb/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + <data_table name="mothur_lookup"> + <output> + <column name="name" /> + <column name="value" output_ref="out_file" > + <move type="file"> + <source>${value}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mothur/lookup/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mothur/lookup/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + <data_table name="mothur_map"> + <output> + <column name="name" /> + <column name="value" output_ref="out_file" > + <move type="file"> + <source>${value}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mothur/map/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mothur/map/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + <data_table name="mothur_taxonomy"> + <output> + <column name="name" /> + <column name="value" output_ref="out_file" > + <move type="file"> + <source>${value}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mothur/taxonomy/${value}</target> + </move> + 
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mothur/taxonomy/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/mothur_aligndb.loc.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,19 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of metagenomics files. +#file has this format (white space characters are TAB characters): +# +# Reference Alignments: http://www.mothur.org/wiki/Alignment_database +# +#<dbname> <file_base> +# +#greengenes /project/db/galaxy/mothur/core_set_aligned.imputed.fasta +#silva archaea /project/db/galaxy/mothur/Silva.archaea/silva.archaea.fasta +#silva bacteria /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.fasta +#silva eukarya /project/db/galaxy/mothur/silva.eukarya.fasta +#silva archaea nogap /project/db/galaxy/mothur/Silva.archaea/nogap.archaea.fasta +#silva bacteria nogap /project/db/galaxy/mothur/silva.bacteria/nogap.bacteria.fasta +#silva eukarya nogap /project/db/galaxy/mothur/nogap.eukarya.fasta +#FungiLSU_train_1400bp_8506_mod.fasta /project/db/galaxy/mothur/RDP/FungiLSU_train_1400bp_8506_mod.fasta +#trainset6_032010.rdp.fasta /project/db/galaxy/mothur/RDP/trainset6_032010.rdp.fasta +#trainset7_112011.pds.fasta /project/db/galaxy/mothur/RDP/trainset7_112011.pds.fasta +#trainset7_112011.rdp.fasta /project/db/galaxy/mothur/RDP/trainset7_112011.rdp.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/mothur_lookup.loc.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,11 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of metagenomics files. +#file has this format (white space characters are TAB characters): +# +# lookup files from: http://www.mothur.org/wiki/Lookup_files +# +#<name> <file_base> +# +#GS20 /project/db/galaxy/mothur/lookup/LookUp_GS20.pat +#GSFLX /project/db/galaxy/mothur/lookup/LookUp_GSFLX.pat +#Titanium /project/db/galaxy/mothur/lookup/LookUp_Titanium.pat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/mothur_map.loc.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,10 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of metagenomics files. +#file has this format (white space characters are TAB characters): +# +# Secondary structure maps: http://www.mothur.org/wiki/Secondary_structure_map +# +#<name> <file_base> +# +#greengenes /project/db/galaxy/mothur/gg.ss.map +#silva /project/db/galaxy/mothur/silva.ss.map
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/mothur_taxonomy.loc.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,24 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of metagenomics files. +#file has this format (white space characters are TAB characters): +# +# Silva reference files: http://www.mothur.org/wiki/Silva_reference_files +# +#<taxonomyname> <file_base> +# +#archaea.gg /project/db/galaxy/mothur/Silva.archaea/silva.archaea.gg.tax +#archaea.silva /project/db/galaxy/mothur/Silva.archaea/silva.archaea.silva.tax +#archaea.rdp /project/db/galaxy/mothur/Silva.archaea/silva.archaea.rdp.tax +#archaea.ncbi /project/db/galaxy/mothur/Silva.archaea/silva.archaea.ncbi.tax +#bacteria.gg /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.gg.tax +#bacteria.silva /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.silva.tax +#bacteria.ncbi /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.ncbi.tax +#bacteria.rdp /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.rdp.tax +#bacteria.rdp6 /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.rdp6.tax +#eukarya.silva /project/db/galaxy/mothur/silva.eukarya.silva.tax +#eukarya.ncbi /project/db/galaxy/mothur/silva.eukarya.ncbi.tax +#trainset6_032010.rdp.tax /project/db/galaxy/mothur/RDP/trainset6_032010.rdp.tax +#trainset7_112011.pds.tax /project/db/galaxy/mothur/RDP/trainset7_112011.pds.tax +#trainset7_112011.rdp.tax /project/db/galaxy/mothur/RDP/trainset7_112011.rdp.tax +#FungiLSU_train_1400bp_8506_mod.tax /project/db/galaxy/mothur/RDP/FungiLSU_train_1400bp_8506_mod.tax +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,18 @@ +<tables> + <table name="mothur_aligndb" comment_char="#" allow_duplicate_entries="False"> + <columns>name, value</columns> + <file path="tool-data/mothur_aligndb.loc" /> + </table> + <table name="mothur_lookup" comment_char="#" allow_duplicate_entries="False"> + <columns>name, value</columns> + <file path="tool-data/mothur_lookup.loc" /> + </table> + <table name="mothur_map" comment_char="#" allow_duplicate_entries="False"> + <columns>name, value</columns> + <file path="tool-data/mothur_map.loc" /> + </table> + <table name="mothur_taxonomy" comment_char="#" allow_duplicate_entries="False"> + <columns>name, value</columns> + <file path="tool-data/mothur_taxonomy.loc" /> + </table> +</tables>