Mercurial > repos > greg > data_manager_gtdbtk_database_installer
changeset 0:3ab83cb7e2d2 draft
Uploaded
author | greg |
---|---|
date | Tue, 15 Mar 2022 15:32:31 +0000 |
parents | |
children | 7093598fa300 |
files | .shed.yml data_manager/gtdbtk_database_installer.py data_manager/gtdbtk_database_installer.xml data_manager_conf.xml test-data/gtdbtk_database.loc tool-data/gtdbtk_database.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 8 files changed, 210 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Tue Mar 15 15:32:31 2022 +0000 @@ -0,0 +1,9 @@ +categories: +- Data Managers +description: Install GTDB-Tk databases +homepage_url: https://github.com/Ecogenomics/GTDBTk +long_description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. It is designed to work with recent advances that allow hundreds or thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also be applied to isolate and single-cell genomes. The GTDB-Tk is open source and released under the GNU General Public License (Version 3). +owner: iuc +name: gtdbtk_database_installer +remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer +type: unrestricted
#!/usr/bin/env python
# data_manager/gtdbtk_database_installer.py
"""Galaxy data manager script that downloads and installs a GTDB-Tk database.

The script fetches a URL into the ``extra_files_path`` directory named in the
JSON parameter file, unpacks the download when it is a tar archive, and then
overwrites that same JSON file with the resulting ``gtdbtk_database`` entry.
"""

import argparse
import json
import os
import shutil
import sys
import tarfile
from urllib.parse import urlparse
from urllib.request import Request, urlopen

# Copy granularity for the download. GTDB-Tk archives are very large, so a
# 64 KiB buffer avoids the overhead of many tiny read/write calls.
CHUNK_SIZE = 2 ** 16


def url_download(url, work_dir):
    """Download *url* into *work_dir* and unpack it if it is a tar archive.

    Args:
        url: Location of the file to fetch; the last component of its path
            becomes the local file name.
        work_dir: Existing directory that receives the download.

    Returns:
        The absolute path of the downloaded file when it is not a tar
        archive; otherwise *work_dir*, after the archive has been extracted
        into it and the archive file itself removed.

    Raises:
        SystemExit: If the URL has no file-name component or the download
            fails; the exit message carries the underlying error text.
    """
    file_name = os.path.basename(urlparse(url).path)
    if not file_name:
        # Without a name the target would be work_dir itself and open()
        # would fail with a confusing "is a directory" error.
        sys.exit("Cannot determine a file name from URL: %s" % url)
    file_path = os.path.abspath(os.path.join(work_dir, file_name))

    try:
        with urlopen(Request(url)) as src, open(file_path, 'wb') as dst:
            shutil.copyfileobj(src, dst, CHUNK_SIZE)
    except Exception as e:
        # Top-level boundary of a command-line tool: report any download
        # failure as the process exit message instead of a traceback.
        sys.exit(str(e))

    if not tarfile.is_tarfile(file_path):
        return file_path

    # The context manager guarantees the archive handle is closed before the
    # archive file is deleted.
    with tarfile.open(file_path, 'r:*') as archive:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from a user-supplied URL; the 'data' filter
            # refuses members that would land outside work_dir. Older
            # interpreters without extraction filters fall back to the
            # plain call.
            archive.extractall(work_dir, filter='data')
        else:
            archive.extractall(work_dir)
    os.remove(file_path)
    return work_dir


def download(database_id, database_name, url, out_file):
    """Install a database and write its data table entry to *out_file*.

    Args:
        database_id: Value stored under the entry's ``value`` key.
        database_name: Value stored under the entry's ``name`` key.
        url: Location of the database file or archive to download.
        out_file: JSON file that is first read for
            ``output_data[0].extra_files_path`` and then overwritten with
            the data manager result.
    """
    with open(out_file) as fh:
        params = json.load(fh)

    work_dir = params['output_data'][0]['extra_files_path']
    # Tolerate a directory that already exists instead of crashing.
    os.makedirs(work_dir, exist_ok=True)
    file_path = url_download(url, work_dir)

    data_manager_entry = {
        'value': database_id,
        'name': database_name,
        # NOTE(review): the gtdbtk_database table declared alongside this
        # script lists a "db_path" column, not "path" - confirm which key
        # the data manager configuration expects.
        'path': file_path,
    }
    data_manager_json = {"data_tables": {"gtdbtk_database": data_manager_entry}}

    with open(out_file, 'w') as fh:
        json.dump(data_manager_json, fh, sort_keys=True)


def main():
    """Parse the command line and run the installation."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--database_name', dest='database_name', help='GTDB-Tk database display name')
    parser.add_argument('--database_id', dest='database_id', help='Unique GTDB-Tk database id')
    # Both options below are needed for the script to do anything; requiring
    # them turns a late TypeError into a clear usage message.
    parser.add_argument('--url', dest='url', required=True, help='URL to download GTDB-Tk databse version')
    parser.add_argument('--out_file', dest='out_file', required=True, help='JSON output file')
    args = parser.parse_args()

    download(args.database_id, args.database_name, args.url, args.out_file)


if __name__ == '__main__':
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/gtdbtk_database_installer.xml Tue Mar 15 15:32:31 2022 +0000 @@ -0,0 +1,48 @@ +<tool id="gtdbtk_database_installer" name="GTDB-Tk Database Installer" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description></description> + <macros> + <token name="@TOOL_VERSION@">202</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">20.09</token> + </macros> + <requirements> + <requirement type="package" version="3.10.2">python</requirement> + </requirements> + <command> + <![CDATA[ + python '$__tool_directory__/gtdbtk_database_installer.py' + --database_id '$database_id' + --database_name '$database_name' + --url '$url' + --out_file '$out_file' + ]]> + </command> + <inputs> + <param name="database_name" type="text" value="" label="Database name or description" help="This value will be displayed in the GTDB-Tk Database select list"/> + <param name="database_id" type="text" value="" label="Database id" help="This value must be unique, with no whitespace allowed - use underscores"/> + <param name="url" type="text" value="https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_data.tar.gz" label="URL for downloading the selected version of the GTDB-Tk database"/> + </inputs> + <outputs> + <data name="out_file" format="data_manager_json" /> + </outputs> + <tests> + <test> + <!-- Not actually installing a huge GTDB-Tk database --> + <param name="database_id" value="release202"/> + <param name="database_name" value="GTDB-Tk database release 202"/> + <param name="url" value="https://data.gtdb.ecogenomic.org/releases/release202/202.0/VERSION"/> + <output name="out_file"> + <assert_contents> + <has_text text="GTDB-Tk database release 202"/> + <has_text text="release202"/> + </assert_contents> + </output> + </test> + </tests> + <help> + </help> + <citations> + <citation type="doi">10.1038/s41587-020-0501-8</citation> + <citation
type="doi">10.1038/nbt.4229</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Tue Mar 15 15:32:31 2022 +0000 @@ -0,0 +1,17 @@ +<data_managers> + <data_manager tool_file="data_manager/gtdbtk_database_installer.xml" id="gtdbtk_database_installer"> + <data_table name="gtdbtk_database"> + <output> + <column name="value"/> + <column name="name"/> + <column name="db_path" output_ref="out_file"> + <move type="directory" relativize_symlinks="True"> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database/${value}/${db_path}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gtdbtk_database.loc Tue Mar 15 15:32:31 2022 +0000 @@ -0,0 +1,26 @@ +# This is a sample file distributed with Galaxy that enables tools +# to use a directory of GTDB-Tk databases. The gtdbtk_database.loc +# file has this format (longer white space characters are TAB characters): +# +# <unique_build_id> <display_name> <directory_path> +# +# So, for example, if you have GTDB-Tk release 202 stored in +# /depot/data2/galaxy/gtdbtk/release202/, +# then the gtdbtk_database.loc entry would look like this: +# +# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202 +# +# and your /depot/data2/galaxy/gtdbtk/release202 directory +# would contain GTDB-Tk database files for release 202, something like this: +# +#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/ +#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv +#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/ +#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/ +release202 GTDB-Tk database release 202 /depot/data2/galaxy/tool-data/gtdbtk_database/release202
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gtdbtk_database.loc.sample Tue Mar 15 15:32:31 2022 +0000 @@ -0,0 +1,26 @@ +# This is a sample file distributed with Galaxy that enables tools +# to use a directory of GTDB-Tk databases. The gtdbtk_database.loc +# file has this format (longer white space characters are TAB characters): +# +# <unique_build_id> <display_name> <directory_path> +# +# So, for example, if you have GTDB-Tk release 202 stored in +# /depot/data2/galaxy/gtdbtk/release202/, +# then the gtdbtk_database.loc entry would look like this: +# +# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202 +# +# and your /depot/data2/galaxy/gtdbtk/release202 directory +# would contain GTDB-Tk database files for release 202, something like this: +# +#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/ +#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv +#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/ +#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/ +release202 GTDB-Tk database release 202 /depot/data2/galaxy/tool-data/gtdbtk_database/release202
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Mar 15 15:32:31 2022 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of GTDB-Tk database versions 202 and higher --> + <table name="gtdbtk_database" comment_char="#"> + <columns>value, name, db_path</columns> + <file path="tool-data/gtdbtk_database.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Tue Mar 15 15:32:31 2022 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Location of databases for gtdbtk version 202 and higher --> + <table name="gtdbtk_database" comment_char="#"> + <columns>value, name, db_path</columns> + <file path="${__HERE__}/test-data/gtdbtk_database.loc" /> + </table> +</tables>