changeset 0:43ec3aadda50 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_amrfinderplus commit 60348db40f25b746db8fd85d6d62ff7569ce28d3
author pimarin
date Tue, 20 Dec 2022 13:43:51 +0000
parents
children 75f79148cb06
files data_manager/data_manager_build_amrfinderplus.py data_manager/data_manager_build_amrfinderplus.xml data_manager/macro.xml data_manager_conf.xml test-data/amrfinderplus.loc.test test-data/amrfinderplus_test_data_manager.json test-data/amrfinderplus_test_data_manager_1.json test-data/amrfinderplus_test_data_manager_2.json tool-data/amrfinderplus.loc tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 11 files changed, 415 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_build_amrfinderplus.py	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,257 @@
+import argparse
+import json
+import os
+import subprocess as sp
+from ftplib import FTP
+
+import pandas as pd
+from io import BytesIO
+from pathlib import Path
+
+
+class GetAmrFinderPlusDataManager:
+    """
+    Build the data table entry describing the database for the galaxy data manager
+    """
+
+    def __init__(self,
+                 amrfinderplus_database="amrfinderplus_database",
+                 db_name="amrfinderplus-db",
+                 amrfinderplus_version="latest",
+                 date_version=None):
+        self.data_table_name = amrfinderplus_database
+        self._db_name = db_name
+        self._amrfinderplus_version = amrfinderplus_version
+        self._amrfinderplus_date_version = date_version
+        self.data_table_entry = None
+        self.amrfinderplus_table_list = None
+
+    def get_data_table_format(self):
+        """
+        Build the empty "data_tables" skeleton keyed by the data table name
+        return: the skeleton dict, which is also stored in self.data_table_entry
+        """
+        self.data_table_entry = {
+            "data_tables": {
+                self.data_table_name: {}
+            }
+        }
+        return self.data_table_entry
+
+    def get_data_manager(self):
+        """
+        Create the empty skeleton and fill it with one value/name/path entry
+        return: the data table dict holding that single database entry
+        """
+        self.amrfinderplus_table_list = self.get_data_table_format()
+        amrfinderplus_value = f"amrfinderplus_V{self._amrfinderplus_version}" \
+                              f"_{self._amrfinderplus_date_version}"
+        amrfinderplus_name = f"V{self._amrfinderplus_version}" \
+                             f"-{self._amrfinderplus_date_version}"
+        data_info = dict(value=amrfinderplus_value,
+                         name=amrfinderplus_name,
+                         path=self._db_name)
+        self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info]
+        return self.amrfinderplus_table_list
+
+
+class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager):
+    """
+    Download the amrfinderplus database from the ncbi.
+    Make the database available with hmm and indexed files
+    Build the data manager infos for galaxy
+    """
+
+    def __init__(self,
+                 output_dir=Path.cwd(),
+                 ncbi_url="ftp.ncbi.nlm.nih.gov",
+                 ftp_login="anonymous",
+                 ftp_password="anonymous",
+                 amrfinderplus_database="amrfinderplus_database",
+                 db_name="amrfinderplus-db",
+                 amrfinderplus_version="latest",
+                 json_file_path=None,
+                 date_version=None,
+                 amrfinderplus_db_path=None,
+                 test_mode=False):
+        # forward the shared settings so that data_table_name follows amrfinderplus_database
+        super().__init__(amrfinderplus_database, db_name, amrfinderplus_version, date_version)
+        self.json_file_path = json_file_path
+        self._output_dir = output_dir
+        self._ncbi_ftp_url = ncbi_url
+        self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database"
+        self._login = ftp_login
+        self._password = ftp_password
+        self._amrfinderplus_database = amrfinderplus_database
+        self._db_name = db_name
+        self._amrfinderplus_version = amrfinderplus_version
+        self._amrfinderplus_date_version = date_version
+        self.species_list = None
+        self.test_mode = test_mode
+        self.amrfinderplus_db_path = amrfinderplus_db_path
+
+    @staticmethod
+    def subprocess_cmd(command, *args):
+        """
+        Method to call external tools with any parameters
+        :param command: command name from the tool used (e.g. wget or makeblastdb)
+        :param args: any number of arguments needed by the command (e.g. -r, -P ...)
+        :return: None; a non-zero exit status is only reported with a printed message
+        """
+        cmd = [command]
+        cmd.extend(args)
+        proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
+        if proc.returncode != 0:
+            print(f'Error type {proc.returncode} with : \n {proc}')
+
+    def download_amrfinderplus_db(self):
+        """
+        Download the database from the ncbi ftp server with wget (only a few files in test mode)
+        """
+        self.amrfinderplus_db_path = f'{self._output_dir}/{self._db_name}'
+        os.makedirs(self.amrfinderplus_db_path)
+        if self._amrfinderplus_version == 'latest':
+            self.get_amrfinderplus_version()
+
+        amrfinderplus_ftp_path = f"ftp://{self._login}:" \
+                                 f"{self._password}@{self._ncbi_ftp_url}/" \
+                                 f"{self._ncbi_database_path}/" \
+                                 f"{self._amrfinderplus_version}/" \
+                                 f"{self._amrfinderplus_date_version}"
+        if self.test_mode is True:
+            file_list = ["AMR_DNA-Escherichia", "version.txt", "taxgroup.tab", "database_format_version.txt"]
+            output_option = "-O"
+            for file in file_list:
+                self.subprocess_cmd("wget",
+                                    "-nd",
+                                    "-np",
+                                    "-r",
+                                    f"{amrfinderplus_ftp_path}/{file}",
+                                    output_option,
+                                    f"{self.amrfinderplus_db_path}/{file}")
+        else:
+            output_option = "-P"
+            self.subprocess_cmd("wget",
+                                "-nd",
+                                "-np",
+                                "-r",
+                                amrfinderplus_ftp_path,
+                                output_option,
+                                self.amrfinderplus_db_path)
+
+    def make_hmm_profile(self):
+        """
+        Run hmmpress on the previously downloaded AMR.LIB file (never run in test mode)
+        """
+        hmm_file = Path(f"{self.amrfinderplus_db_path}/AMR.LIB")
+        if Path.exists(hmm_file) and self.test_mode is False:
+            self.subprocess_cmd("hmmpress", "-f", hmm_file)
+        else:
+            print("hmm_file file is missing to make hmm profiles")
+
+    def extract_filelist_makeblast(self):
+        """
+        Extract the list of species having nucleotide reference genes in taxgroup.tab
+        return: None, the filtered species list is stored in self.species_list
+        """
+        taxa_group_path = Path(f"{self.amrfinderplus_db_path}/taxgroup.tab")
+        if Path.exists(taxa_group_path):
+            taxa_table = pd.read_table(taxa_group_path)
+            taxa_table.columns = ["taxgroup", "gpipe_taxgroup", "number_of_nucl_ref_genes"]
+            taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter(items=["taxgroup"], axis=1)
+            if self.test_mode is True:
+                taxa_df = taxa_df[taxa_df.taxgroup == "Escherichia"].taxgroup
+            else:
+                taxa_df = taxa_df.taxgroup
+            self.species_list = list(taxa_df)
+        else:
+            print("taxgroup.tab file is missing to list available species")
+
+    def make_blastdb(self):
+        """Index the downloaded fasta files (AMR_DNA-*, AMR_CDS, AMRProt) for blast"""
+        self.extract_filelist_makeblast()
+        # species_list stays None when taxgroup.tab is missing: still index the other files
+        nucl_file_db_list = [f'{self.amrfinderplus_db_path}/AMR_DNA-{specie}' for specie in self.species_list or []]
+        amr_dna = f'{self.amrfinderplus_db_path}/AMR_CDS'
+        amr_prot = f'{self.amrfinderplus_db_path}/AMRProt'
+        os.chdir(self.amrfinderplus_db_path)
+        if Path(amr_dna).exists():
+            nucl_file_db_list.append(amr_dna)
+        else:
+            print("No file AMR_CDS detected for indexing")
+        if Path(amr_prot).exists():
+            self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot")
+        else:
+            print("No file AMRProt detected for indexing")
+        for file in nucl_file_db_list:
+            self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl")
+
+    def get_amrfinderplus_version(self, version_file="version.txt",
+                                  database_version_file="database_format_version.txt"):
+        """
+        Read the date version and the major.minor version in the ftp directory of the current version
+        param version_file: name of the file whose first line is stored as the date version
+        param database_version_file: name of the file whose first line gives the major.minor version
+        """
+        with FTP(self._ncbi_ftp_url) as ftp:  # the context manager closes the connection
+            ftp.login(self._login, self._password)
+            ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}")
+            db_version = BytesIO()
+            db_date_version = BytesIO()
+            ftp.retrbinary(f'RETR {version_file}', db_version.write)
+            ftp.retrbinary(f'RETR {database_version_file}', db_date_version.write)
+        self._amrfinderplus_date_version = db_version.getvalue().decode("utf-8").splitlines()[0]
+        self._amrfinderplus_version = '.'.join(
+            db_date_version.getvalue().decode("utf-8").splitlines()[0].split(".")[:2])
+
+    def read_json_input_file(self):
+        """
+        Read the galaxy json file, create its extra_files_path and use it as output dir
+        """
+        with open(self.json_file_path) as fh:
+            params = json.load(fh)
+        target_dir = params['output_data'][0]['extra_files_path']
+        os.makedirs(target_dir)
+        self._output_dir = target_dir
+
+    def write_json_infos(self):
+        """
+        Overwrite the galaxy json file with the data table built by get_data_manager
+        """
+        with open(self.json_file_path, 'w') as fh:
+            json.dump(self.get_data_manager(), fh, sort_keys=True)
+
+
+def parse_arguments():
+    """
+    Parse the command line arguments provided by the user
+    return: the parsed arguments (data_manager_json, db_version, db_date, test)
+    """
+    # positional json file path, then the optional version, date and test switches
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("data_manager_json",
+                            help="json file from galaxy")
+    arg_parser.add_argument("--db_version", default="latest",
+                            help="select the major version of the database (e.g. 3.10, 3.8), default is latest")
+    arg_parser.add_argument("--db_date",
+                            help="select the date into the database version (e.g. 2022-10-11.2)")
+    arg_parser.add_argument("--test", action='store_true',
+                            help="option to test the script with an lighted database")
+    return arg_parser.parse_args()
+
+
+def main():
+    """Download the database, index it and write the data manager json"""
+    options = parse_arguments()
+    builder = DownloadAmrFinderPlusDatabase(
+        amrfinderplus_version=options.db_version, date_version=options.db_date,
+        json_file_path=options.data_manager_json, test_mode=options.test)
+    builder.read_json_input_file()
+    builder.download_amrfinderplus_db()
+    builder.make_hmm_profile()
+    builder.make_blastdb()
+    builder.write_json_infos()
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_build_amrfinderplus.xml	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,92 @@
+<tool id="data_manager_build_amrfinderplus" name="amrfinderplus_datamanager" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>AMRfinderplus database builder</description>
+    <macros>
+        <import>macro.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code">
+      <![CDATA[
+        python '$__tool_directory__/data_manager_build_amrfinderplus.py'
+        '$output_file'
+        --db_version '$database_list.database_version_select'
+        #if $database_list.database_version_select != 'latest':
+            --db_date '$database_list.database_date_select'
+        #end if
+        $test_data_manager
+      ]]></command>
+    <inputs>
+        <conditional name="database_list">
+            <param name="database_version_select" type="select" label="Database version">
+                <option value="latest" selected="true">Latest available version</option>
+                <option value="3.10">V3.10</option>
+                <option value="3.9">V3.9</option>
+                <option value="3.8">V3.8</option>
+                <option value="3.6">V3.6</option>
+            </param>
+            <when value="latest">
+            </when>
+            <when value="3.10">
+                <param name="database_date_select" type="select" label="Date version">
+                <option value="2022-10-11.2" selected="true">2022-10-11.2</option>
+                <option value="2022-08-09.1">2022-08-09.1</option>
+                <option value="2022-05-26.1">2022-05-26.1</option>
+                <option value="2022-04-04.1">2022-04-04.1</option>
+                <option value="2021-12-21.1">2021-12-21.1</option>
+                <option value="2021-09-30.1">2021-09-30.1</option>
+                <option value="2021-08-11.1">2021-08-11.1</option>
+                <option value="2021-06-01.1">2021-06-01.1</option>
+                <option value="2021-03-01.1">2021-03-01.1</option>
+                </param>
+            </when>
+            <when value="3.9">
+                <param name="database_date_select" type="select" label="Date version">
+                <option value="2020-11-09.1" selected="true">2020-11-09.1</option>
+                <option value="2020-12-17.1">2020-12-17.1</option>
+                </param>
+            </when>
+             <when value="3.8">
+                <param name="database_date_select" type="select" label="Date version">
+                <option value="2020-09-30.1" selected="true">2020-09-30.1</option>
+                <option value="2020-09-22.2">2020-09-22.2</option>
+                <option value="2020-07-16.2">2020-07-16.2</option>
+                <option value="2020-06-11.1">2020-06-11.1</option>
+                <option value="2020-05-04.1">2020-05-04.1</option>
+                </param>
+            </when>
+            <when value="3.6">
+                <param name="database_date_select" type="select" label="Date version">
+                <option value="2020-01-22.1" selected="true">2020-01-22.1</option>
+                <option value="2020-03-20.1">2020-03-20.1</option>
+                </param>
+            </when>
+        </conditional>
+         <param name="test_data_manager" type="hidden" value=""/>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="data_manager_json"/>
+    </outputs>
+    <tests>
+        <!-- Test_1 DB latest -->
+        <test expect_num_outputs="1">
+            <param name="test_data_manager" value="--test"/>
+            <output name="output_file" value="amrfinderplus_test_data_manager_1.json"/>
+        </test>
+        <!-- Test_2 DB 3.6 -->
+        <test expect_num_outputs="1">
+            <param name="test_data_manager" value="--test"/>
+            <conditional name="database_list">
+                <param name="database_version_select" value="3.6"/>
+                <param name="database_date_select" value="2020-03-20.1"/>
+            </conditional>
+            <output name="output_file" value="amrfinderplus_test_data_manager_2.json"/>
+        </test>
+
+
+    </tests>
+    <help><![CDATA[
+        Download amrfinderplus database from the NCBI server
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1038/s41598-021-91456-0</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/macro.xml	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<macros>
+    <token name="@TOOL_VERSION@">3.10.45</token>
+    <token name="@PYTHON_VERSION@">3.10.6</token>
+    <token name="@PANDAS@">1.5.1</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.05</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">ncbi-amrfinderplus</requirement>
+            <requirement type="package" version="@PYTHON_VERSION@">python</requirement>
+            <requirement type="package" version="@PANDAS@">pandas</requirement>
+        </requirements>
+    </xml>
+</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/data_manager_build_amrfinderplus.xml" id="data_manager_build_amrfinderplus" version="@TOOL_VERSION@">
+        <data_table name="amrfinderplus_database">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <column name="path" output_ref="output_file">
+                    <move type="directory" relativize_symlinks="True">
+                        <source>${path}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">amrfinderplus-db/${value}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/amrfinderplus-db/${value}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/amrfinderplus.loc.test	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,8 @@
+# this is a tab separated file describing the location of amrfinderplus database
+#
+# the columns are:
+# value, name, path
+#
+# for example
+amrfinderplus_V3.10_2022-10-11.2	V3.10-2022-10-11.2	amrfinderplus-db
+amrfinderplus_V3.6_2020-03-20.1	V3.6-2020-03-20.1	amrfinderplus-db
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/amrfinderplus_test_data_manager.json	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"amrfinderplus_database": [{"name": "V3.6-2020-03-20.1", "path": "amrfinderplus-db", "value": "amrfinderplus_V3.6_2020-03-20.1"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/amrfinderplus_test_data_manager_1.json	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"amrfinderplus_database": [{"name": "V3.10-2022-10-11.2", "path": "amrfinderplus-db", "value": "amrfinderplus_V3.10_2022-10-11.2"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/amrfinderplus_test_data_manager_2.json	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"amrfinderplus_database": [{"name": "V3.6-2020-03-20.1", "path": "amrfinderplus-db", "value": "amrfinderplus_V3.6_2020-03-20.1"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/amrfinderplus.loc	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,7 @@
+# this is a tab separated file describing the location of amrfinderplus database
+#
+# the columns are:
+# value, name, path
+#
+# for example
+amrfinderplus_V3.6_2020-03-20.1    V3.6-2020-03-20.1    amrfinderplus-db
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Locations of amrfinderplus database in the required format -->
+    <table name="amrfinderplus_database" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/amrfinderplus.loc" />
+    </table>
+</tables>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Tue Dec 20 13:43:51 2022 +0000
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Locations of amrfinderplus database in the required format -->
+    <table name="amrfinderplus_database" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/amrfinderplus.loc.test"/>
+    </table>
+</tables>