Mercurial > repos > iuc > data_manager_mothur_toolsuite
changeset 3:2004bb845685 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit f845716f6ac93500f143a30abef97eaba406344e"
author | iuc |
---|---|
date | Fri, 25 Jun 2021 09:36:36 +0000 |
parents | 2ffd2cdc5089 |
children | |
files | data_manager/data_manager_fetch_mothur_reference_data.xml data_manager/fetch_mothur_reference_data.py |
diffstat | 2 files changed, 112 insertions(+), 72 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/data_manager_fetch_mothur_reference_data.xml Thu Sep 17 09:37:20 2020 +0000 +++ b/data_manager/data_manager_fetch_mothur_reference_data.xml Fri Jun 25 09:36:36 2021 +0000 @@ -1,8 +1,8 @@ <?xml version="1.0"?> -<tool id="data_manager_fetch_mothur_reference_data" name="Fetch Mothur toolsuite reference data" version="0.1.5" tool_type="manage_data" profile="19.05"> +<tool id="data_manager_fetch_mothur_reference_data" name="Fetch Mothur toolsuite reference data" version="0.2.1" tool_type="manage_data" profile="19.05"> <description>Fetch and install reference data for Mothur</description> <requirements> - <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="3.8">python</requirement> </requirements> <command><![CDATA[ python '$__tool_directory__/fetch_mothur_reference_data.py' @@ -31,12 +31,14 @@ <option value="lookup_titanium">GS FLX Titanium lookup files</option> <option value="lookup_gsflx">GSFLX lookup files</option> <option value="lookup_gs20">GS20 lookup files</option> + <option value="RDP_v18">RDP reference files (training set version 18)</option> <option value="RDP_v16">RDP reference files (training set version 16)</option> <option value="RDP_v14">RDP reference files (training set version 14)</option> <option value="RDP_v10">RDP reference files (training set version 10)</option> <option value="RDP_v9">RDP reference files (training set version 9)</option> <option value="RDP_v7">RDP reference files (training set version 7)</option> <option value="RDP_v6">RDP reference files (training set version 6)</option> + <option value="silva_release_138.1">SILVA reference files (release 138.1)</option> <option value="silva_release_128">SILVA reference files (release 128)</option> <option value="silva_release_123">SILVA reference files (release 123)</option> <option value="silva_release_119">SILVA reference files (release 119)</option> @@ -93,6 +95,18 @@ </output> </test> <test> + <param 
name="data_source|ref_data" value="RDP_v18"/> + <output name="out_file"> + <assert_contents> + <has_text text="16S rRNA RDP training set 18" /> + <has_text text="trainset18_062020.rdp.fasta" /> + <has_text text="trainset18_062020.rdp.tax" /> + <has_text text="trainset18_062020.pds.fasta" /> + <has_text text="trainset18_062020.pds.tax" /> + </assert_contents> + </output> + </test> + <test> <param name="data_source|ref_data" value="RDP_v16"/> <output name="out_file"> <assert_contents>
--- a/data_manager/fetch_mothur_reference_data.py Thu Sep 17 09:37:20 2020 +0000 +++ b/data_manager/fetch_mothur_reference_data.py Fri Jun 25 09:36:36 2021 +0000 @@ -1,6 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools +import io import json import optparse import os @@ -8,7 +9,9 @@ import sys import tarfile import tempfile -import urllib2 +import urllib.error +import urllib.parse +import urllib.request import zipfile from functools import reduce @@ -38,6 +41,14 @@ }, # RDP reference files # http://www.mothur.org/wiki/RDP_reference_files + "RDP_v18": { + "16S rRNA RDP training set 18": + [ + "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz", ], + "16S rRNA PDS training set 18": + [ + "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.pds.tgz", ], + }, "RDP_v16": { "16S rRNA RDP training set 16": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ], @@ -76,6 +87,12 @@ }, # Silva reference files # http://www.mothur.org/wiki/Silva_reference_files + "silva_release_138.1": { + "SILVA release 138.1": + [ + "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz", + "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz", ], + }, "silva_release_128": { "SILVA release 128": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz", @@ -160,7 +177,8 @@ to create it if necessary. 
""" - params = json.loads(open(jsonfile).read()) + with open(jsonfile) as fh: + params = json.load(fh) return (params['param_dict'], params['output_data'][0]['extra_files_path']) @@ -172,7 +190,7 @@ # >>> add_data_table(d,'my_data') # >>> add_data_table_entry(dict(dbkey='hg19',value='human')) # >>> add_data_table_entry(dict(dbkey='mm9',value='mouse')) -# >>> print str(json.dumps(d)) +# >>> print(json.dumps(d)) def create_data_tables_dict(): """Return a dictionary for storing data table information @@ -229,13 +247,19 @@ Returns the name that the file is saved with. """ - print("Downloading %s" % url) + print(f"Downloading {url}") if not target: target = os.path.basename(url) if wd: target = os.path.join(wd, target) - print("Saving to %s" % target) - open(target, 'wb').write(urllib2.urlopen(url).read()) + print(f"Saving to {target}") + with open(target, 'wb') as fh: + url_h = urllib.request.urlopen(url) + while True: + buffer = url_h.read(io.DEFAULT_BUFFER_SIZE) + if buffer == b"": + break + fh.write(buffer) return target @@ -255,35 +279,36 @@ """ if not zipfile.is_zipfile(filen): - print("%s: not ZIP formatted file") + print(f"{filen}: not ZIP formatted file") return [filen] file_list = [] - z = zipfile.ZipFile(filen) - for name in z.namelist(): - if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): - print("Ignoring %s" % name) - continue - if wd: - target = os.path.join(wd, name) - else: - target = name - if name.endswith('/'): - # Make directory - print("Creating dir %s" % target) - try: - os.makedirs(target) - except OSError: - pass - else: - # Extract file - print("Extracting %s" % name) - try: - os.makedirs(os.path.dirname(target)) - except OSError: - pass - open(target, 'wb').write(z.read(name)) - file_list.append(target) - print("Removing %s" % filen) + with zipfile.ZipFile(filen) as z: + for name in z.namelist(): + if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): + print(f"Ignoring {name}") + continue + if wd: + target 
= os.path.join(wd, name) + else: + target = name + if name.endswith('/'): + # Make directory + print(f"Creating dir {target}") + try: + os.makedirs(target) + except OSError: + pass + else: + # Extract file + print(f"Extracting {target}") + try: + os.makedirs(os.path.dirname(target)) + except OSError: + pass + with open(target, 'wb') as fh: + fh.write(z.read(name)) + file_list.append(target) + print(f"Removing {filen}") + os.remove(filen) return file_list @@ -306,23 +331,23 @@ """ file_list = [] if not tarfile.is_tarfile(filen): - print("%s: not TAR file") + print(f"{filen}: not TAR file") return [filen] - t = tarfile.open(filen) - for name in t.getnames(): - # Check for unwanted files - if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): - print("Ignoring %s" % name) - continue - # Extract file - print("Extracting %s" % name) - t.extract(name, wd) - if wd: - target = os.path.join(wd, name) - else: - target = name - file_list.append(target) - print("Removing %s" % filen) + with tarfile.open(filen) as t: + for name in t.getnames(): + # Check for unwanted files + if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): + print(f"Ignoring {name}") + continue + # Extract file + print(f"Extracting {name}") + t.extract(name, wd) + if wd: + target = os.path.join(wd, name) + else: + target = name + file_list.append(target) + print(f"Removing {filen}") + os.remove(filen) return file_list @@ -340,9 +365,9 @@ current working directory.
""" - print("Unpack %s" % filen) + print(f"Unpack {filen}") ext = os.path.splitext(filen)[1] - print("Extension: %s" % ext) + print(f"Extension: {ext}") if ext == ".zip": return unpack_zip_archive(filen, wd=wd) elif ext == ".tgz": @@ -383,7 +408,7 @@ try: return MOTHUR_FILE_TYPES[ext] except KeyError: - print("WARNING: unknown file type for " + filen + ", skipping") + print(f"WARNING: unknown file type for {filen}, skipping") return None @@ -416,26 +441,27 @@ """ # Make working dir wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd()) - print("Working dir %s" % wd) + print(f"Working dir {wd}") # Iterate over all requested reference data URLs for dataset in datasets: - print("Handling dataset '%s'" % dataset) + print(f"Handling dataset '{dataset}'") for name in MOTHUR_REFERENCE_DATA[dataset]: for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd): type_ = identify_type(f) - entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name) - print("%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f))) + name_from_file = os.path.splitext(os.path.basename(f))[0] + entry_name = f"{name_from_file} ({name})" + print(f"{type_}\t\'{entry_name}'\t.../{os.path.basename(f)}") if type_ is not None: # Move to target dir ref_data_file = os.path.basename(f) f1 = os.path.join(target_dir, ref_data_file) - print("Moving %s to %s" % (f, f1)) - os.rename(f, f1) + print(f"Moving {f} to {f1}") + shutil.move(f, f1) # Add entry to data table - table_name = "mothur_%s" % type_ + table_name = f"mothur_{type_}" add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) # Remove working dir - print("Removing %s" % wd) + print(f"Removing {wd}") shutil.rmtree(wd) @@ -451,7 +477,7 @@ files = [] for path in paths: path = os.path.abspath(path) - print("Examining '%s'..." 
% path) + print(f"Examining '{path}'...") if os.path.isfile(path): # Store full path for file files.append(path) @@ -490,21 +516,21 @@ for f in files: type_ = identify_type(f) if type_ is None: - print("%s: unrecognised type, skipped" % f) + print(f"{f}: unrecognised type, skipped") continue ref_data_file = os.path.basename(f) target_file = os.path.join(target_dir, ref_data_file) entry_name = "%s" % os.path.splitext(ref_data_file)[0] if description: entry_name += " (%s)" % description - print("%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file)) + print(f"{type_}\t\'{entry_name}'\t.../{ref_data_file}") # Link to or copy the data if link_to_data: os.symlink(f, target_file) else: shutil.copyfile(f, target_file) # Add entry to data table - table_name = "mothur_%s" % type_ + table_name = f"mothur_{type_}" add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) @@ -519,8 +545,8 @@ parser.add_option('--description', action='store', dest='description', default='') parser.add_option('--link', action='store_true', dest='link_to_data') options, args = parser.parse_args() - print("options: %s" % options) - print("args : %s" % args) + print(f"options: {options}") + print(f"args : {args}") # Check for JSON file if len(args) != 1: @@ -533,7 +559,7 @@ params, target_dir = read_input_json(jsonfile) # Make the target directory - print("Making %s" % target_dir) + print(f"Making {target_dir}") os.mkdir(target_dir) # Set up data tables dictionary @@ -556,6 +582,6 @@ import_from_server(data_tables, target_dir, paths, description, link_to_data=options.link_to_data) # Write output JSON print("Outputting JSON") - print(json.dumps(data_tables)) - open(jsonfile, 'w').write(json.dumps(data_tables, sort_keys=True)) + with open(jsonfile, 'w') as fh: + json.dump(data_tables, fh, sort_keys=True) print("Done.")