Mercurial > repos > rhpvorderman > data_manager_mothur_toolsuite
changeset 0:3f6f7ca5f95a draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit 821349cd1c414dcad2128ba78d2b9d9051b3e75b"
author | rhpvorderman |
---|---|
date | Tue, 22 Jun 2021 12:07:41 +0000 |
parents | |
children | 3f8a5d7fe819 |
files | README data_manager/data_manager_fetch_mothur_reference_data.xml data_manager/fetch_mothur_reference_data.py data_manager_conf.xml tool-data/mothur_aligndb.loc.sample tool-data/mothur_lookup.loc.sample tool-data/mothur_map.loc.sample tool-data/mothur_taxonomy.loc.sample tool_data_table_conf.xml.sample |
diffstat | 9 files changed, 1071 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,3 @@ +Data manager to install reference data for Mothur toolsuite + +Imported from https://github.com/fls-bioinformatics-core/galaxy-tools/tree/master/data_manager_mothur_toolsuite
<?xml version="1.0"?>
<tool id="data_manager_fetch_mothur_reference_data" name="Fetch Mothur toolsuite reference data" version="0.1.5" tool_type="manage_data" profile="19.05">
    <description>Fetch and install reference data for Mothur</description>
    <requirements>
        <requirement type="package" version="3.8">python</requirement>
    </requirements>
    <command><![CDATA[
        python '$__tool_directory__/fetch_mothur_reference_data.py'
        --source='$data_source.data_source_selector'
        #if str( $data_source.data_source_selector ) == "mothur_website"
            --datasets '${data_source.ref_data}'
        #elif str( $data_source.data_source_selector ) == "filesystem_paths"
            --description '${data_source.description}'
            --paths '${data_source.paths}'
            #if $data_source.create_symlink
                --link
            #end if
        #end if
        '${out_file}'
    ]]></command>
    <inputs>
        <conditional name="data_source">
            <param name="data_source_selector" type="select"
                   label="Choose the source for the reference data">
                <option value="mothur_website">Mothur website</option>
                <option value="filesystem_paths">Filesystem paths</option>
            </param>
            <when value="mothur_website">
                <param name="ref_data" type="select" display="checkboxes" multiple="true"
                       label="Reference dataset to install">
                    <option value="lookup_titanium">GS FLX Titanium lookup files</option>
                    <option value="lookup_gsflx">GSFLX lookup files</option>
                    <option value="lookup_gs20">GS20 lookup files</option>
                    <option value="RDP_v16">RDP reference files (training set version 16)</option>
                    <option value="RDP_v14">RDP reference files (training set version 14)</option>
                    <option value="RDP_v10">RDP reference files (training set version 10)</option>
                    <option value="RDP_v9">RDP reference files (training set version 9)</option>
                    <option value="RDP_v7">RDP reference files (training set version 7)</option>
                    <option value="RDP_v6">RDP reference files (training set version 6)</option>
                    <option value="silva_release_128">SILVA reference files (release 128)</option>
                    <option value="silva_release_123">SILVA reference files (release 123)</option>
                    <option value="silva_release_119">SILVA reference files (release 119)</option>
                    <option value="silva_release_102">SILVA reference files (release 102)</option>
                    <option value="greengenes_August2013">Greengenes reference taxonomy and alignment v13.8 (August 2013)</option>
                    <option value="greengenes_May2013">Greengenes reference taxonomy and alignment v13.5 (May 2013)</option>
                    <option value="greengenes_old">Greengenes reference taxonomy and alignment (pre-May 2013)</option>
                    <option value="greengenes_gold_alignment">Greengenes gold alignment</option>
                    <option value="secondary_structure_maps_silva">SILVA secondary structure maps</option>
                    <option value="secondary_structure_maps_greengenes">Greengenes secondary structure maps</option>
                </param>
            </when>
            <when value="filesystem_paths">
                <!-- 'size' is a deprecated no-op attribute and has been dropped;
                     boolean-valued attributes must be lowercase per the tool schema -->
                <param name="description" type="text" value=""
                       label="Description of the data" optional="false"/>
                <param name="paths" type="text" value="" area="true"
                       label="Paths to upload" optional="false"
                       help="Upload all files pasted in the box. The (recursive) contents of any pasted directories will be added as well."/>
                <param name="create_symlink" type="boolean" truevalue="create_symlink"
                       falsevalue="copy_file"
                       label="Create symlinks to data instead of copying into Galaxy" checked="true"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="out_file" format="data_manager_json"/>
    </outputs>
    <tests>
        <test>
            <param name="data_source|ref_data" value="lookup_titanium"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="GS FLX Titanium"/>
                    <has_text text="LookUp_Titanium.pat"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="lookup_gsflx"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="GSFLX"/>
                    <has_text text="LookUp_GSFLX.pat"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="lookup_gs20"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="GS20"/>
                    <has_text text="LookUp_GS20.pat"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v16"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA RDP training set 16"/>
                    <has_text text="trainset16_022016.rdp.fasta"/>
                    <has_text text="trainset16_022016.rdp.tax"/>
                    <has_text text="trainset16_022016.pds.fasta"/>
                    <has_text text="trainset16_022016.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v14"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA RDP training set 14"/>
                    <has_text text="trainset14_032015.rdp.fasta"/>
                    <has_text text="trainset14_032015.rdp.tax"/>
                    <has_text text="trainset14_032015.pds.fasta"/>
                    <has_text text="trainset14_032015.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v10"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA RDP training set 10"/>
                    <has_text text="trainset10_082014.rdp.fasta"/>
                    <has_text text="trainset10_082014.rdp.tax"/>
                    <has_text text="trainset10_082014.pds.fasta"/>
                    <has_text text="trainset10_082014.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v9"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA PDS training set 9"/>
                    <has_text text="trainset9_032012.rdp.fasta"/>
                    <has_text text="trainset9_032012.rdp.tax"/>
                    <has_text text="trainset9_032012.pds.fasta"/>
                    <has_text text="trainset9_032012.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v7"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="16S rRNA RDP training set 7"/>
                    <has_text text="FungiLSU_train_1400bp_8506_mod.fasta"/>
                    <has_text text="FungiLSU_train_1400bp_8506_mod.tax"/>
                    <has_text text="trainset7_112011.rdp.fasta"/>
                    <has_text text="trainset7_112011.rdp.tax"/>
                    <has_text text="trainset7_112011.pds.fasta"/>
                    <has_text text="trainset7_112011.pds.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="RDP_v6"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="RDP training set 6"/>
                    <has_text text="trainset6_032010.rdp.fasta"/>
                    <has_text text="trainset6_032010.rdp.tax"/>
                </assert_contents>
            </output>
        </test>
        <!-- SILVA data is too large (>1GB each) for CI testing on github actions
             so we skip them -->
        <!--<test>
            <param name="data_source|ref_data" value="silva_release_128"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA release 128"/>
                    <has_text text="silva.nr_v128.tax"/>
                    <has_text text="silva.seed_v128.tax"/>
                    <has_text text="silva.nr_v128.align"/>
                    <has_text text="silva.seed_v128.align"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="silva_release_123"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA release 123"/>
                    <has_text text="silva.nr_v123.align"/>
                    <has_text text="silva.seed_v123.align"/>
                    <has_text text="silva.nr_v123.tax"/>
                    <has_text text="silva.seed_v123.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="silva_release_119"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA release 119"/>
                    <has_text text="silva.nr_v119.align"/>
                    <has_text text="silva.seed_v119.align"/>
                    <has_text text="silva.nr_v119.tax"/>
                    <has_text text="silva.seed_v119.tax"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="silva_release_102"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA release 102"/>
                    <has_text text="silva.bacteria.fasta"/>
                    <has_text text="silva.gold.ng.fasta"/>
                    <has_text text="nogap.archaea.fasta"/>
                    <has_text text="silva.archaea.fasta"/>
                    <has_text text="nogap.eukarya.fasta"/>
                    <has_text text="silva.eukarya.fasta"/>
                    <has_text text="silva.bacteria.gg.tax"/>
                    <has_text text="silva.bacteria.ncbi.tax"/>
                    <has_text text="silva.bacteria.rdp.tax"/>
                    <has_text text="silva.bacteria.rdp6.tax"/>
                    <has_text text="silva.bacteria.silva.tax"/>
                    <has_text text="silva.archaea.gg.tax"/>
                    <has_text text="silva.archaea.ncbi.tax"/>
                    <has_text text="silva.archaea.rdp.tax"/>
                    <has_text text="silva.archaea.silva.tax"/>
                    <has_text text="silva.eukarya.ncbi.tax"/>
                    <has_text text="silva.eukarya.silva.tax"/>
                </assert_contents>
            </output>
        </test>-->

        <!-- also greengenes is large (400MB-1.5GB) so only tests for older
             (smaller) releases are executed -->
        <!--<test>
            <param name="data_source|ref_data" value="greengenes_August2013"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes August 2013"/>
                    <has_text text="gg_13_8_99.gg.tax"/>
                    <has_text text="gg_13_8_99.fasta"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="greengenes_May2013"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes May 2013"/>
                    <has_text text="gg_13_5_99.pds.tax"/>
                    <has_text text="gg_13_5_99.gg.tax"/>
                    <has_text text="gg_13_5_99.align"/>
                    <has_text text="gg_13_5_99.fasta"/>
                </assert_contents>
            </output>
        </test>-->
        <test>
            <param name="data_source|ref_data" value="greengenes_old"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes pre-May 2013"/>
                    <has_text text="gg_99.pds.tax"/>
                    <has_text text="core_set_aligned.imputed.fasta"/>
                    <has_text text="gg_99.pds.ng.fasta"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="greengenes_gold_alignment"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes gold alignment"/>
                    <has_text text="rRNA16S.gold.NAST_ALIGNED.fasta"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="secondary_structure_maps_silva"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="SILVA"/>
                    <has_text text="silva.ss.map"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="data_source|ref_data" value="secondary_structure_maps_greengenes"/>
            <output name="out_file">
                <assert_contents>
                    <has_text text="Greengenes"/>
                    <has_text text="gg.ss.map"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
.. class:: infomark

**What it does**

This tool fetches reference data used by the mothur_toolsuite set of Galaxy tools,
and populates the appropriate data tables.

The reference data can be imported directly from the Mothur website, or from files
in a server directory.

Files are added to the following data tables based on file extension:

 * **mothur_lookup**: for .pat files
 * **mothur_aligndb**: for .fasta files
 * **mothur_map**: for .map files
 * **mothur_taxonomy**: for .tax files

------

**Importing from Mothur website**

Reference data sets provided by the Mothur developers can be downloaded from the
Mothur website. See the following pages to get more information about each dataset:

 * Lookup data: http://www.mothur.org/wiki/Lookup_files
 * RDP reference data: http://www.mothur.org/wiki/RDP_reference_files
 * Silva data: http://www.mothur.org/wiki/Silva_reference_files
 * Greengenes data: http://www.mothur.org/wiki/Greengenes-formatted_databases
 * Secondary structure maps: http://www.mothur.org/wiki/Secondary_structure_map

**Importing from file system paths**

If reference data is already on the server filesystem then use this option to
import it into the Mothur data tables. The appropriate data tables are determined
based on the file extensions.

Optionally a description can be added which will appear next to the base of the
reference file name in the data table entry.

------

.. class:: warningmark

**A note on Lane masks**

Lane mask data is also available via the Mothur website (files ending in ".filter"):

 * http://www.mothur.org/wiki/Lane_mask

but as these data are not currently used in the toolsuite, they cannot be imported
using this data manager.

    </help>
</tool>
#!/usr/bin/env python3
#
# Data manager for reference data for the 'mothur_toolsuite' Galaxy tools.
#
# Downloads (or copies/links) Mothur reference data, puts the files in the
# data manager's target directory, and writes a Galaxy data manager JSON
# description of the new data table entries back to the supplied JSON file.
import json
import optparse
import os
import shutil
import sys
import tarfile
import tempfile
import urllib.request
import zipfile

# When extracting files from archives, skip names that
# start with the following strings (note that '.' also
# filters out '..' components)
IGNORE_PATHS = ('.', '__MACOSX/', '__')

# Map file extensions to data table names
MOTHUR_FILE_TYPES = {".map": "map",
                     ".fasta": "aligndb",
                     ".align": "aligndb",
                     ".pat": "lookup",
                     ".tax": "taxonomy"}

# Reference data URLs, keyed by dataset id, then by display name
MOTHUR_REFERENCE_DATA = {
    # Look up data
    # http://www.mothur.org/wiki/Lookup_files
    "lookup_titanium": {
        "GS FLX Titanium": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_titanium.zip"],
    },
    "lookup_gsflx": {
        "GSFLX": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gsflx.zip"],
    },
    "lookup_gs20": {
        "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip"],
    },
    # RDP reference files
    # http://www.mothur.org/wiki/RDP_reference_files
    "RDP_v18": {
        "16S rRNA RDP training set 18":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz"],
        "16S rRNA PDS training set 18":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.pds.tgz"],
    },
    "RDP_v16": {
        "16S rRNA RDP training set 16":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz"],
        "16S rRNA PDS training set 16":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz"],
    },
    "RDP_v14": {
        "16S rRNA RDP training set 14":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset14_032015.rdp.tgz"],
        "16S rRNA PDS training set 14":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset14_032015.pds.tgz"],
    },
    "RDP_v10": {
        "16S rRNA RDP training set 10":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset10_082014.rdp.tgz"],
        "16S rRNA PDS training set 10":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset10_082014.pds.tgz"],
    },
    "RDP_v9": {
        "16S rRNA RDP training set 9":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset9_032012.rdp.zip"],
        "16S rRNA PDS training set 9":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset9_032012.pds.zip"],
    },
    "RDP_v7": {
        "16S rRNA RDP training set 7":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset7_112011.rdp.zip"],
        "16S rRNA PDS training set 7":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset7_112011.pds.zip"],
        "8S rRNA Fungi training set 7":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/fungilsu_train_v7.zip"],
    },
    "RDP_v6": {
        "RDP training set 6":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip"],
    },
    # Silva reference files
    # http://www.mothur.org/wiki/Silva_reference_files
    "silva_release_138.1": {
        "SILVA release 138.1":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz"],
    },
    "silva_release_128": {
        "SILVA release 128":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz"],
    },
    "silva_release_123": {
        "SILVA release 123":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v123.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v123.tgz"],
    },
    "silva_release_119": {
        "SILVA release 119":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v119.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v119.tgz"],
    },
    "silva_release_102": {
        "SILVA release 102":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.bacteria.zip",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.archaea.zip",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.eukarya.zip"],
    },
    "silva_gold_bacteria": {
        "SILVA gold":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.gold.bacteria.zip"],
    },
    # Greengenes
    # http://www.mothur.org/wiki/Greengenes-formatted_databases
    "greengenes_August2013": {
        "Greengenes August 2013":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_13_8_99.refalign.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_13_8_99.taxonomy.tgz"],
    },
    "greengenes_May2013": {
        "Greengenes May 2013":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_13_5_99.refalign.tgz",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_13_5_99.taxonomy.tgz"],
    },
    "greengenes_old": {
        "Greengenes pre-May 2013":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/greengenes.alignment.zip",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/greengenes.tax.tgz"],
    },
    "greengenes_gold_alignment": {
        "Greengenes gold alignment":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/greengenes.gold.alignment.zip"],
    },
    # Secondary structure maps
    # http://www.mothur.org/wiki/Secondary_structure_map
    "secondary_structure_maps_silva": {
        "SILVA":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva_ss_map.zip"],
    },
    "secondary_structure_maps_greengenes": {
        "Greengenes":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/gg_ss_map.zip"],
    },
    # Lane masks: not used here?
    "lane_masks": {
        "Greengenes-compatible":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/Lane1241.gg.filter",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/lane1287.gg.filter",
             "https://mothur.s3.us-east-2.amazonaws.com/wiki/lane1349.gg.filter"],
        "SILVA-compatible":
            ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lane1349.silva.filter"],
    },
}


# Utility functions for interacting with Galaxy JSON
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool.

    Returns a tuple (param_dict, extra_files_path).

    'param_dict' is an arbitrary dictionary of parameters input into
    the tool; 'extra_files_path' is the path to a directory where
    output files must be put for the receiving data manager to pick
    them up.

    NB the directory pointed to by 'extra_files_path' doesn't exist
    initially; it is the job of the script to create it if necessary.
    """
    with open(jsonfile) as fh:
        params = json.load(fh)
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])


# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d, 'my_data')
# >>> add_data_table_entry(d, 'my_data', dict(dbkey='hg19', value='human'))
# >>> print(json.dumps(d))
def create_data_tables_dict():
    """Return a dictionary for storing data table information.

    The returned dictionary can be used with 'add_data_table' and
    'add_data_table_entry' and converted to JSON to be sent back to
    the data manager.
    """
    return {'data_tables': {}}


def add_data_table(d, table):
    """Create a placeholder for a data table called 'table' in 'd'."""
    d['data_tables'][table] = []


def add_data_table_entry(d, table, entry):
    """Append 'entry' to the data table 'table'.

    'entry' should be a dictionary whose keys are the names of columns
    in the data table.

    Raises an exception if the named data table doesn't exist.
    """
    try:
        d['data_tables'][table].append(entry)
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)


# Utility functions for downloading and unpacking archive files
def download_file(url, target=None, wd=None):
    """Download a file from a URL.

    If 'target' is specified then the file is saved to this name;
    otherwise it's saved as the basename of the URL. If 'wd' is
    specified then it is used as the working directory where the file
    will be saved on the local system.

    Returns the name that the file is saved with.
    """
    print("Downloading %s" % url)
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd, target)
    print("Saving to %s" % target)
    # Stream the response to disk instead of reading it all into
    # memory: some reference archives are larger than 1GB
    with urllib.request.urlopen(url) as resp, open(target, 'wb') as fh:
        shutil.copyfileobj(resp, fh)
    return target


def unpack_zip_archive(filen, wd=None):
    """Extract files from a ZIP archive.

    Returns a list of the extracted file names/paths. 'wd' specifies
    the working directory to extract to (default: current directory).
    If 'filen' is not actually a ZIP file it is returned unextracted
    as a single-element list.

    Once all the files are extracted the ZIP archive file is deleted
    from the file system.
    """
    if not zipfile.is_zipfile(filen):
        # Fix: the original message left the '%s' placeholder unfilled
        print("%s: not ZIP formatted file" % filen)
        return [filen]
    file_list = []
    with zipfile.ZipFile(filen) as z:
        for name in z.namelist():
            # str.startswith accepts a tuple of prefixes
            if name.startswith(IGNORE_PATHS):
                print("Ignoring %s" % name)
                continue
            target = os.path.join(wd, name) if wd else name
            if name.endswith('/'):
                # Archive member is a directory
                print("Creating dir %s" % target)
                os.makedirs(target, exist_ok=True)
            else:
                # Archive member is a file
                print("Extracting %s" % name)
                dirname = os.path.dirname(target)
                if dirname:
                    os.makedirs(dirname, exist_ok=True)
                with open(target, 'wb') as fh:
                    fh.write(z.read(name))
                file_list.append(target)
    print("Removing %s" % filen)
    os.remove(filen)
    return file_list


def unpack_tar_archive(filen, wd=None):
    """Extract files from a TAR archive.

    The archive can optionally be compressed with gzip or bz2.
    Returns a list of the extracted names (directories included, as in
    the original implementation). 'wd' specifies the working directory
    to extract to (default: current directory). If 'filen' is not
    actually a TAR file it is returned unextracted as a single-element
    list.

    Once all the files are extracted the TAR archive file is deleted
    from the file system.
    """
    if not tarfile.is_tarfile(filen):
        # Fix: the original message left the '%s' placeholder unfilled
        print("%s: not TAR file" % filen)
        return [filen]
    file_list = []
    with tarfile.open(filen) as t:
        for name in t.getnames():
            # Skip unwanted names; also skip absolute member paths,
            # which could escape the working directory (IGNORE_PATHS
            # already filters names starting with '.', covering '..')
            if name.startswith(IGNORE_PATHS) or os.path.isabs(name):
                print("Ignoring %s" % name)
                continue
            print("Extracting %s" % name)
            t.extract(name, wd)
            file_list.append(os.path.join(wd, name) if wd else name)
    print("Removing %s" % filen)
    os.remove(filen)
    return file_list


def unpack_archive(filen, wd=None):
    """Extract files from an archive.

    Wrapper that dispatches to the appropriate unpacking function
    based on the file extension; anything that isn't a '.zip' or
    '.tgz' archive is returned as-is in a single-element list.

    'wd' specifies the working directory to extract to (default:
    current directory).
    """
    print("Unpack %s" % filen)
    ext = os.path.splitext(filen)[1]
    print("Extension: %s" % ext)
    if ext == ".zip":
        return unpack_zip_archive(filen, wd=wd)
    elif ext == ".tgz":
        return unpack_tar_archive(filen, wd=wd)
    return [filen]


def fetch_files(urls, wd=None, files=None):
    """Download and unpack files from a list of URLs.

    Returns a list of the extracted files. 'wd' specifies the working
    directory to extract to. If 'files' is given then the extracted
    files are appended to this list before it is returned.
    """
    if files is None:
        files = []
    for url in urls:
        filen = download_file(url, wd=wd)
        files.extend(unpack_archive(filen, wd=wd))
    return files


# Utility functions specific to the Mothur reference data
def identify_type(filen):
    """Return the data table type for 'filen' based on its extension.

    Returns None (with a warning) for unrecognised extensions.
    """
    ext = os.path.splitext(filen)[1]
    try:
        return MOTHUR_FILE_TYPES[ext]
    except KeyError:
        print("WARNING: unknown file type for " + filen + ", skipping")
        return None


def get_name(filen):
    """Generate a descriptive name based on the file name.

    The extension is dropped and '.' and '_' are turned into spaces.
    """
    name = os.path.splitext(os.path.basename(filen))[0]
    for delim in ('.', '_'):
        name = name.replace(delim, ' ')
    return name


def fetch_from_mothur_website(data_tables, target_dir, datasets):
    """Fetch reference data from the Mothur website.

    For each dataset in the list 'datasets', download (and if
    necessary unpack) the related files, move them to the data
    manager's target directory, and add references to the files to
    the appropriate data table.

    The 'data_tables' dictionary should have been created using the
    'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the downloaded files
      datasets: a list of dataset names corresponding to keys in
        the MOTHUR_REFERENCE_DATA dictionary
    """
    # Make a scratch dir for downloads; removed at the end
    wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd())
    print("Working dir %s" % wd)
    for dataset in datasets:
        print("Handling dataset '%s'" % dataset)
        for name, urls in MOTHUR_REFERENCE_DATA[dataset].items():
            for f in fetch_files(urls, wd=wd):
                type_ = identify_type(f)
                entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name)
                print("%s\t'%s'\t.../%s" % (type_, entry_name, os.path.basename(f)))
                if type_ is not None:
                    # Move to target dir
                    ref_data_file = os.path.basename(f)
                    f1 = os.path.join(target_dir, ref_data_file)
                    print("Moving %s to %s" % (f, f1))
                    shutil.move(f, f1)
                    # Add entry to data table
                    table_name = "mothur_%s" % type_
                    add_data_table_entry(data_tables, table_name,
                                         dict(name=entry_name, value=ref_data_file))
    print("Removing %s" % wd)
    shutil.rmtree(wd)


def files_from_filesystem_paths(paths):
    """Return the list of files found (recursively) under 'paths'.

    Given a list of filesystem paths, returns the full paths of all
    files found directly or in any subdirectory; anything that is
    neither a file nor a directory is ignored with a message.
    """
    files = []
    for path in paths:
        path = os.path.abspath(path)
        print("Examining '%s'..." % path)
        if os.path.isfile(path):
            files.append(path)
        elif os.path.isdir(path):
            # Descend into directory and collect the files
            for f in os.listdir(path):
                files.extend(files_from_filesystem_paths((os.path.join(path, f),)))
        else:
            print("Not a file or directory, ignored")
    return files


def import_from_server(data_tables, target_dir, paths, description, link_to_data=False):
    """Import reference data from filesystem paths.

    Creates references to the specified file(s) on the Galaxy server
    in the appropriate data table (determined from the file
    extension).

    The 'data_tables' dictionary should have been created using the
    'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the copy or link to the data file
      paths: list of file and/or directory paths to import
      description: text to associate with the files
      link_to_data: if False (default) copy the data file into
        Galaxy; if True make a symlink to the data file
    """
    files = files_from_filesystem_paths(paths)
    for f in files:
        type_ = identify_type(f)
        if type_ is None:
            print("%s: unrecognised type, skipped" % f)
            continue
        ref_data_file = os.path.basename(f)
        target_file = os.path.join(target_dir, ref_data_file)
        entry_name = os.path.splitext(ref_data_file)[0]
        if description:
            entry_name += " (%s)" % description
        print("%s\t'%s'\t.../%s" % (type_, entry_name, ref_data_file))
        # Link to or copy the data
        if link_to_data:
            os.symlink(f, target_file)
        else:
            shutil.copyfile(f, target_file)
        # Add entry to data table
        table_name = "mothur_%s" % type_
        add_data_table_entry(data_tables, table_name,
                             dict(name=entry_name, value=ref_data_file))


if __name__ == "__main__":
    print("Starting...")

    # Read command line
    parser = optparse.OptionParser()
    parser.add_option('--source', action='store', dest='data_source')
    parser.add_option('--datasets', action='store', dest='datasets', default='')
    # Fix: default was a list ([]), but the value is later treated as a
    # string ('.replace()'/'.split()')
    parser.add_option('--paths', action='store', dest='paths', default='')
    parser.add_option('--description', action='store', dest='description', default='')
    parser.add_option('--link', action='store_true', dest='link_to_data')
    options, args = parser.parse_args()
    print("options: %s" % options)
    print("args   : %s" % args)

    # Check for JSON file
    if len(args) != 1:
        sys.stderr.write("Need to supply JSON file name")
        sys.exit(1)

    jsonfile = args[0]

    # Read the input JSON
    params, target_dir = read_input_json(jsonfile)

    # Make the target directory
    print("Making %s" % target_dir)
    os.mkdir(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, 'mothur_lookup')
    add_data_table(data_tables, 'mothur_aligndb')
    add_data_table(data_tables, 'mothur_map')
    add_data_table(data_tables, 'mothur_taxonomy')

    # Fetch data from specified data sources
    if options.data_source == 'mothur_website':
        datasets = options.datasets.split(',')
        fetch_from_mothur_website(data_tables, target_dir, datasets)
    elif options.data_source == 'filesystem_paths':
        # Check description text
        description = options.description.strip()
        # Get list of paths (need to remove any escapes for '\n' and '\r'
        # that might have been inserted by Galaxy)
        paths = options.paths.replace('__cn__', '\n').replace('__cr__', '\r').split()
        import_from_server(data_tables, target_dir, paths, description,
                           link_to_data=options.link_to_data)

    # Write output JSON
    print("Outputting JSON")
    with open(jsonfile, 'w') as fh:
        json.dump(data_tables, fh, sort_keys=True)
    print("Done.")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,57 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/data_manager_fetch_mothur_reference_data.xml" id="data_manager_mothur_fetch_reference_data"> + <data_table name="mothur_aligndb"> + <output> + <column name="name" /> + <column name="value" output_ref="out_file" > + <move type="file"> + <source>${value}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mothur/aligndb/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mothur/aligndb/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + <data_table name="mothur_lookup"> + <output> + <column name="name" /> + <column name="value" output_ref="out_file" > + <move type="file"> + <source>${value}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mothur/lookup/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mothur/lookup/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + <data_table name="mothur_map"> + <output> + <column name="name" /> + <column name="value" output_ref="out_file" > + <move type="file"> + <source>${value}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mothur/map/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mothur/map/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + <data_table name="mothur_taxonomy"> + <output> + <column name="name" /> + <column name="value" output_ref="out_file" > + <move type="file"> + <source>${value}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">mothur/taxonomy/${value}</target> + </move> + 
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/mothur/taxonomy/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/mothur_aligndb.loc.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,19 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of metagenomics files. +#file has this format (white space characters are TAB characters): +# +# Reference Alignments: http://www.mothur.org/wiki/Alignment_database +# +#<dbname> <file_base> +# +#greengenes /project/db/galaxy/mothur/core_set_aligned.imputed.fasta +#silva archaea /project/db/galaxy/mothur/Silva.archaea/silva.archaea.fasta +#silva bacteria /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.fasta +#silva eukarya /project/db/galaxy/mothur/silva.eukarya.fasta +#silva archaea nogap /project/db/galaxy/mothur/Silva.archaea/nogap.archaea.fasta +#silva bacteria nogap /project/db/galaxy/mothur/silva.bacteria/nogap.bacteria.fasta +#silva eukarya nogap /project/db/galaxy/mothur/nogap.eukarya.fasta +#FungiLSU_train_1400bp_8506_mod.fasta /project/db/galaxy/mothur/RDP/FungiLSU_train_1400bp_8506_mod.fasta +#trainset6_032010.rdp.fasta /project/db/galaxy/mothur/RDP/trainset6_032010.rdp.fasta +#trainset7_112011.pds.fasta /project/db/galaxy/mothur/RDP/trainset7_112011.pds.fasta +#trainset7_112011.rdp.fasta /project/db/galaxy/mothur/RDP/trainset7_112011.rdp.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/mothur_lookup.loc.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,11 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of metagenomics files. +#file has this format (white space characters are TAB characters): +# +# lookup files from: http://www.mothur.org/wiki/Lookup_files +# +#<name> <file_base> +# +#GS20 /project/db/galaxy/mothur/lookup/LookUp_GS20.pat +#GSFLX /project/db/galaxy/mothur/lookup/LookUp_GSFLX.pat +#Titanium /project/db/galaxy/mothur/lookup/LookUp_Titanium.pat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/mothur_map.loc.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,10 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of metagenomics files. +#file has this format (white space characters are TAB characters): +# +# Secondary structure maps: http://www.mothur.org/wiki/Secondary_structure_map +# +#<name> <file_base> +# +#greengenes /project/db/galaxy/mothur/gg.ss.map +#silva /project/db/galaxy/mothur/silva.ss.map
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/mothur_taxonomy.loc.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,24 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of metagenomics files. +#file has this format (white space characters are TAB characters): +# +# Silva reference files: http://www.mothur.org/wiki/Silva_reference_files +# +#<taxonomyname> <file_base> +# +#archaea.gg /project/db/galaxy/mothur/Silva.archaea/silva.archaea.gg.tax +#archaea.silva /project/db/galaxy/mothur/Silva.archaea/silva.archaea.silva.tax +#archaea.rdp /project/db/galaxy/mothur/Silva.archaea/silva.archaea.rdp.tax +#archaea.ncbi /project/db/galaxy/mothur/Silva.archaea/silva.archaea.ncbi.tax +#bacteria.gg /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.gg.tax +#bacteria.silva /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.silva.tax +#bacteria.ncbi /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.ncbi.tax +#bacteria.rdp /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.rdp.tax +#bacteria.rdp6 /project/db/galaxy/mothur/silva.bacteria/silva.bacteria.rdp6.tax +#eukarya.silva /project/db/galaxy/mothur/silva.eukarya.silva.tax +#eukarya.ncbi /project/db/galaxy/mothur/silva.eukarya.ncbi.tax +#trainset6_032010.rdp.tax /project/db/galaxy/mothur/RDP/trainset6_032010.rdp.tax +#trainset7_112011.pds.tax /project/db/galaxy/mothur/RDP/trainset7_112011.pds.tax +#trainset7_112011.rdp.tax /project/db/galaxy/mothur/RDP/trainset7_112011.rdp.tax +#FungiLSU_train_1400bp_8506_mod.tax /project/db/galaxy/mothur/RDP/FungiLSU_train_1400bp_8506_mod.tax +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Jun 22 12:07:41 2021 +0000 @@ -0,0 +1,18 @@ +<tables> + <table name="mothur_aligndb" comment_char="#" allow_duplicate_entries="False"> + <columns>name, value</columns> + <file path="tool-data/mothur_aligndb.loc" /> + </table> + <table name="mothur_lookup" comment_char="#" allow_duplicate_entries="False"> + <columns>name, value</columns> + <file path="tool-data/mothur_lookup.loc" /> + </table> + <table name="mothur_map" comment_char="#" allow_duplicate_entries="False"> + <columns>name, value</columns> + <file path="tool-data/mothur_map.loc" /> + </table> + <table name="mothur_taxonomy" comment_char="#" allow_duplicate_entries="False"> + <columns>name, value</columns> + <file path="tool-data/mothur_taxonomy.loc" /> + </table> +</tables>