changeset 0:3ab83cb7e2d2 draft

Uploaded
author greg
date Tue, 15 Mar 2022 15:32:31 +0000
parents
children 7093598fa300
files .shed.yml data_manager/gtdbtk_database_installer.py data_manager/gtdbtk_database_installer.xml data_manager_conf.xml test-data/gtdbtk_database.loc tool-data/gtdbtk_database.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 8 files changed, 210 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Tue Mar 15 15:32:31 2022 +0000
@@ -0,0 +1,9 @@
+categories:
+- Data Managers
+description: Install GTDB-Tk databases
+homepage_url: https://github.com/Ecogenomics/GTDBTk
+long_description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. It is designed to work with recent advances that allow hundreds or thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also be applied to isolate and single-cell genomes.  The GTDB-Tk is open source and released under the GNU General Public License (Version 3).
+owner: iuc
+name: gtdbtk_database_installer
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer
+type: unrestricted
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/gtdbtk_database_installer.py	Tue Mar 15 15:32:31 2022 +0000
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+import os
+import sys
+import tarfile
+from urllib.request import Request, urlopen
+from urllib.parse import urlparse
+
+
+def url_download(url, work_dir):
+    url_parts = urlparse(url)
+    file_path = os.path.abspath(os.path.join(work_dir, os.path.basename(url_parts.path)))
+    src = None
+    dst = None
+    try:
+        req = Request(url)
+        src = urlopen(req)
+        with open(file_path, 'wb') as dst:
+            while True:
+                chunk = src.read(2**10)
+                if chunk:
+                    dst.write(chunk)
+                else:
+                    break
+    except Exception as e:
+        sys.exit(str(e))
+    finally:
+        if src:
+            src.close()
+    if tarfile.is_tarfile(file_path):
+        fh = tarfile.open(file_path, 'r:*')
+    else:
+        return file_path
+    fh.extractall(work_dir)
+    os.remove(file_path)
+    return work_dir
+
+
+def download(database_id, database_name, url, out_file):
+
+    with open(out_file) as fh:
+        params = json.load(fh)
+
+    work_dir = params['output_data'][0]['extra_files_path']
+    os.makedirs(work_dir)
+    file_path = url_download(url, work_dir)
+
+    data_manager_json = {"data_tables": {}}
+    data_manager_entry = {}
+    data_manager_entry['value'] = database_id
+    data_manager_entry['name'] = database_name
+    data_manager_entry['path'] = file_path
+    data_manager_json["data_tables"]["gtdbtk_database"] = data_manager_entry
+
+    with open(out_file, 'w') as fh:
+        json.dump(data_manager_json, fh, sort_keys=True)
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('--database_name', dest='database_name', help='GTDB-Tk database display name')
+parser.add_argument('--database_id', dest='database_id', help='Unique GTDB-Tk database id')
+parser.add_argument('--url', dest='url', help='URL to download GTDB-Tk databse version')
+parser.add_argument('--out_file', dest='out_file', help='JSON output file')
+
+args = parser.parse_args()
+
+download(args.database_id, args.database_name, args.url, args.out_file)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/gtdbtk_database_installer.xml	Tue Mar 15 15:32:31 2022 +0000
@@ -0,0 +1,48 @@
+<tool id="gtdbtk_database_installer" name="GTDB-Tk Database Installer" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description></description>
+    <macros>
+        <token name="@TOOL_VERSION@">202</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+        <token name="@PROFILE@">20.09</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="3.10.2">python</requirement>
+    </requirements>
+    <command>
+    <![CDATA[
+        python '$__tool_directory__/gtdbtk_database_installer.py'
+          --database_id '$database_id'
+          --database_name '$database_name'
+          --url '$url'
+          --out_file '$out_file'
+    ]]>
+    </command>
+    <inputs>
+        <param name="database_name" type="text" value="" label="Database name or description" help="This value will be displayed in the GTDB-Tk Database select list"/>
+        <param name="database_id" type="text" value="" label="Database id" help="This value must be unique and must not contain whitespace; use underscores instead"/>
+        <param name="url" type="text" value="https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_data.tar.gz" label="URL for downloading the selected version of the GTDB-Tk database"/>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <tests>
+        <test>
+            <!-- Not actually installing a huge GTDB-Tk database -->
+            <param name="database_id" value="release202"/>
+            <param name="database_name" value="GTDB-Tk database release 202"/>
+            <param name="url" value="https://data.gtdb.ecogenomic.org/releases/release202/202.0/VERSION"/>
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text="GTDB-Tk database release 202"/>
+                    <has_text text="release202"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+    </help>
+    <citations>
+        <citation type="doi">10.1038/s41587-020-0501-8</citation>
+        <citation type="doi">10.1038/nbt.4229</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Tue Mar 15 15:32:31 2022 +0000
@@ -0,0 +1,17 @@
+<data_managers>
+    <data_manager tool_file="data_manager/gtdbtk_database_installer.xml" id="gtdbtk_database_installer">
+        <data_table name="gtdbtk_database">
+            <output>
+                <column name="value"/>
+                <column name="name"/>
+                <column name="db_path" output_ref="out_file">
+                    <move type="directory" relativize_symlinks="True">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database/${value}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database/${value}/${db_path}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gtdbtk_database.loc	Tue Mar 15 15:32:31 2022 +0000
@@ -0,0 +1,26 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB-Tk databases.  The gtdbtk_database.loc
+# file has this format (longer white space characters are TAB characters):
+#
+# <unique_build_id> <display_name>  <directory_path>
+#
+# So, for example, if you have the GTDB-Tk release 202 database stored in
+# /depot/data2/galaxy/gtdbtk/release202/,
+# then the gtdbtk_database.loc entry would look like this:
+#
+# release202    gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202
+#
+# and your /depot/data2/galaxy/gtdbtk/release202 directory
+# would contain GTDB-Tk database files for release 202, something like this:
+#
+#drwxr-sr-x  3 gvk G-824019    4096 Apr 20  2021 fastani/
+#-rw-r--r--  1 gvk G-824019 4810764 Apr 22  2021 manifest.tsv
+#drwxr-sr-x  4 gvk G-824019    4096 Apr 21  2021 markers/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 masks/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 metadata/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 21  2021 mrca_red/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 msa/
+#drwxr-sr-x  4 gvk G-824019    4096 Apr 21  2021 pplacer/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 radii/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 taxonomy/
+release202	GTDB-Tk database release 202	/depot/data2/galaxy/tool-data/gtdbtk_database/release202
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gtdbtk_database.loc.sample	Tue Mar 15 15:32:31 2022 +0000
@@ -0,0 +1,26 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB-Tk databases.  The gtdbtk_database.loc
+# file has this format (longer white space characters are TAB characters):
+#
+# <unique_build_id> <display_name>  <directory_path>
+#
+# So, for example, if you have the GTDB-Tk release 202 database stored in
+# /depot/data2/galaxy/gtdbtk/release202/,
+# then the gtdbtk_database.loc entry would look like this:
+#
+# release202    gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202
+#
+# and your /depot/data2/galaxy/gtdbtk/release202 directory
+# would contain GTDB-Tk database files for release 202, something like this:
+#
+#drwxr-sr-x  3 gvk G-824019    4096 Apr 20  2021 fastani/
+#-rw-r--r--  1 gvk G-824019 4810764 Apr 22  2021 manifest.tsv
+#drwxr-sr-x  4 gvk G-824019    4096 Apr 21  2021 markers/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 masks/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 metadata/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 21  2021 mrca_red/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 msa/
+#drwxr-sr-x  4 gvk G-824019    4096 Apr 21  2021 pplacer/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 radii/
+#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 taxonomy/
+release202	GTDB-Tk database release 202	/depot/data2/galaxy/tool-data/gtdbtk_database/release202
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Mar 15 15:32:31 2022 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Locations of GTDB-Tk database versions 202 and higher -->
+    <table name="gtdbtk_database" comment_char="#">
+        <columns>value, name, db_path</columns>
+        <file path="tool-data/gtdbtk_database.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Tue Mar 15 15:32:31 2022 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Location of databases for gtdbtk version 202 and higher -->
+    <table name="gtdbtk_database" comment_char="#">
+        <columns>value, name, db_path</columns>
+        <file path="${__HERE__}/test-data/gtdbtk_database.loc" />
+    </table>
+</tables>