# HG changeset patch
# User pimarin
# Date 1671543831 0
# Node ID 43ec3aadda5066bca1412c3befafaa01d35ef054
# planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_amrfinderplus commit 60348db40f25b746db8fd85d6d62ff7569ce28d3
#
# NOTE: the diff metadata of the changeset is retained as comments so that the
# Python payload below stays importable.
# diff -r 000000000000 -r 43ec3aadda50 data_manager/data_manager_build_amrfinderplus.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/data_manager/data_manager_build_amrfinderplus.py Tue Dec 20 13:43:51 2022 +0000
# @@ -0,0 +1,257 @@
import argparse
import json
import os
import subprocess as sp
from ftplib import FTP
from io import BytesIO
from pathlib import Path

import pandas as pd


class GetAmrFinderPlusDataManager:
    """
    Create the json file with database information for galaxy data manager
    """

    def __init__(self,
                 amrfinderplus_database="amrfinderplus_database",
                 db_name="amrfinderplus-db",
                 amrfinderplus_version="latest",
                 date_version=None):
        """
        :param amrfinderplus_database: name of the data table (key of the json output)
        :param db_name: value written in the "path" column of the data table entry
        :param amrfinderplus_version: version string used to build the entry value/name
        :param date_version: date version string used to build the entry value/name
        """
        self.data_table_name = amrfinderplus_database
        self._db_name = db_name
        self._amrfinderplus_version = amrfinderplus_version
        self._amrfinderplus_date_version = date_version
        self.data_table_entry = None
        self.amrfinderplus_table_list = None

    def get_data_table_format(self):
        """
        Skeleton of a data_table format
        return: a data table formatted for json output
        """
        self.data_table_entry = {
            "data_tables": {
                self.data_table_name: {}
            }
        }
        return self.data_table_entry

    def get_data_manager(self):
        """
        Create the empty data table format and add all the information into
        return: The data table with database information
        """
        self.amrfinderplus_table_list = self.get_data_table_format()
        version = self._amrfinderplus_version
        date_version = self._amrfinderplus_date_version
        data_info = dict(value=f"amrfinderplus_V{version}_{date_version}",
                         name=f"V{version}-{date_version}",
                         path=self._db_name)
        # The empty skeleton value is replaced by a one-element list of entries.
        self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info]
        return self.amrfinderplus_table_list


class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager):
    """
    Download the amrfinderplus database from the ncbi.
    Make the database available with hmm and indexed files
    Build the data manager infos for galaxy
    """

    def __init__(self,
                 output_dir=Path.cwd(),
                 ncbi_url="ftp.ncbi.nlm.nih.gov",
                 ftp_login="anonymous",
                 ftp_password="anonymous",
                 amrfinderplus_database="amrfinderplus_database",
                 db_name="amrfinderplus-db",
                 amrfinderplus_version="latest",
                 json_file_path=None,
                 date_version=None,
                 amrfinderplus_db_path=None,
                 test_mode=False):
        """
        :param output_dir: directory in which the database directory is created
        :param ncbi_url: host name of the ftp server
        :param ftp_login: ftp login
        :param ftp_password: ftp password
        :param amrfinderplus_database: name of the data table
        :param db_name: name of the database directory created in output_dir
        :param amrfinderplus_version: database version, "latest" triggers a lookup on the ftp server
        :param json_file_path: json file read for the output directory and rewritten with the data table
        :param date_version: date version of the database
        :param amrfinderplus_db_path: path of the database directory (overwritten by the download step)
        :param test_mode: when true only a reduced set of files/species is processed
        """
        # Forward the shared settings to the parent: calling it without
        # arguments would leave data_table_name on its default value whatever
        # amrfinderplus_database the caller provided.
        super().__init__(amrfinderplus_database=amrfinderplus_database,
                         db_name=db_name,
                         amrfinderplus_version=amrfinderplus_version,
                         date_version=date_version)
        self.json_file_path = json_file_path
        self._output_dir = output_dir
        self._ncbi_ftp_url = ncbi_url
        self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database"
        self._login = ftp_login
        self._password = ftp_password
        self._amrfinderplus_database = amrfinderplus_database
        self.species_list = None
        self.test_mode = test_mode
        self.amrfinderplus_db_path = amrfinderplus_db_path

    @staticmethod
    def subprocess_cmd(command, *args):
        """
        Method to call external tools with any parameters
        :param command: command name from the tool used (e.g. wget or makeblastdb)
        :param args: free number of argument need for the command tool (e.g. -r, -P ...)
        :return: the subprocess.CompletedProcess of the command; a non-zero
                 return code is reported on stdout and does not raise
        """
        cmd = [command, *args]
        proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
        if proc.returncode != 0:
            print(f'Error type {proc.returncode} with : \n {proc}')
        return proc

    def download_amrfinderplus_db(self):
        """
        Download the amrfinderplus database from the ncbi ftp server
        raises ValueError: when a version other than "latest" is requested
                           without a date version
        """
        if self._amrfinderplus_version == 'latest':
            self.get_amrfinderplus_version()
        elif self._amrfinderplus_date_version is None:
            # Without a date the ftp path would end with "/None" and a bogus
            # "V<version>-None" entry would be registered afterwards.
            raise ValueError("a database date version is required when the "
                             "database version is not 'latest'")

        self.amrfinderplus_db_path = f'{self._output_dir}/{self._db_name}'
        os.makedirs(self.amrfinderplus_db_path, exist_ok=True)

        amrfinderplus_ftp_path = f"ftp://{self._login}:" \
                                 f"{self._password}@{self._ncbi_ftp_url}/" \
                                 f"{self._ncbi_database_path}/" \
                                 f"{self._amrfinderplus_version}/" \
                                 f"{self._amrfinderplus_date_version}"
        if self.test_mode:
            # Test mode: fetch only a handful of files, each to an explicit target.
            file_list = ["AMR_DNA-Escherichia", "version.txt", "taxgroup.tab", "database_format_version.txt"]
            for file_name in file_list:
                self.subprocess_cmd("wget",
                                    "-nd",
                                    "-np",
                                    "-r",
                                    f"{amrfinderplus_ftp_path}/{file_name}",
                                    "-O",
                                    f"{self.amrfinderplus_db_path}/{file_name}")
        else:
            self.subprocess_cmd("wget",
                                "-nd",
                                "-np",
                                "-r",
                                amrfinderplus_ftp_path,
                                "-P",
                                self.amrfinderplus_db_path)

    def make_hmm_profile(self):
        """
        Make the hmm profile using the AMR.LIB file previously download
        (skipped in test mode)
        """
        hmm_file = Path(f"{self.amrfinderplus_db_path}/AMR.LIB")
        if not hmm_file.exists():
            print("hmm_file file is missing to make hmm profiles")
        elif not self.test_mode:
            self.subprocess_cmd("hmmpress", "-f", hmm_file)

    def extract_filelist_makeblast(self):
        """
        Extract the list of species which have file in the database
        The result is stored in self.species_list: the taxgroup values of
        taxgroup.tab having number_of_nucl_ref_genes > 0 (only "Escherichia"
        in test mode), or an empty list when taxgroup.tab is missing
        """
        taxa_group_path = Path(f"{self.amrfinderplus_db_path}/taxgroup.tab")
        if not taxa_group_path.exists():
            print("taxgroup.tab file is missing to list available species")
            # Keep the attribute iterable so make_blastdb can still run.
            self.species_list = []
            return

        taxa_table = pd.read_table(taxa_group_path)
        taxa_table.columns = ["taxgroup", "gpipe_taxgroup", "number_of_nucl_ref_genes"]
        species = taxa_table.loc[taxa_table["number_of_nucl_ref_genes"] > 0, "taxgroup"]
        if self.test_mode:
            species = species[species == "Escherichia"]
        self.species_list = species.tolist()

    def make_blastdb(self):
        """
        Index fasta file for blast
        makeblastdb is run from inside the database directory; the previous
        working directory is restored afterwards
        """
        self.extract_filelist_makeblast()
        # Resolve before chdir: a relative database path would otherwise be
        # looked up relative to the database directory itself.
        db_dir = Path(self.amrfinderplus_db_path).resolve()
        nucl_file_db_list = [str(db_dir / f'AMR_DNA-{specie}') for specie in self.species_list]
        amr_dna = db_dir / 'AMR_CDS'
        amr_prot = db_dir / 'AMRProt'

        previous_cwd = os.getcwd()
        os.chdir(db_dir)
        try:
            if amr_dna.exists():
                nucl_file_db_list.append(str(amr_dna))
            else:
                print("No file AMR_CDS detected for indexing")
            if amr_prot.exists():
                self.subprocess_cmd("makeblastdb", "-in", str(amr_prot), "-dbtype", "prot")
            else:
                print("No file AMRProt detected for indexing")
            for nucl_file in nucl_file_db_list:
                self.subprocess_cmd("makeblastdb", "-in", nucl_file, "-dbtype", "nucl")
        finally:
            # Later steps (e.g. writing the json file) must not be affected by
            # the directory change.
            os.chdir(previous_cwd)

    def get_amrfinderplus_version(self, version_file="version.txt",
                                  database_version_file="database_format_version.txt"):
        """
        Check the version when latest if provided and update the number
        param version_file: name of the file containing version information
        param database_version_file: name of the file containing date version information
        """
        date_buffer = BytesIO()
        format_buffer = BytesIO()
        # The context manager closes the ftp connection even on failure.
        with FTP(self._ncbi_ftp_url) as ftp:
            ftp.login(self._login, self._password)
            ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}")
            ftp.retrbinary(f'RETR {version_file}', date_buffer.write)
            ftp.retrbinary(f'RETR {database_version_file}', format_buffer.write)
        # First line of version_file becomes the date version.
        self._amrfinderplus_date_version = date_buffer.getvalue().decode("utf-8").splitlines()[0]
        # First two dot-separated fields of database_version_file become the version.
        format_version = format_buffer.getvalue().decode("utf-8").splitlines()[0]
        self._amrfinderplus_version = '.'.join(format_version.split(".")[:2])

    def read_json_input_file(self):
        """
        Import the json file
        The "extra_files_path" of the first "output_data" item is created when
        needed and becomes the output directory
        """
        with open(self.json_file_path) as fh:
            params = json.load(fh)
        target_dir = params['output_data'][0]['extra_files_path']
        os.makedirs(target_dir, exist_ok=True)
        self._output_dir = target_dir

    def write_json_infos(self):
        """
        Write in the imported json file
        """
        with open(self.json_file_path, 'w') as fh:
            json.dump(self.get_data_manager(), fh, sort_keys=True)


def parse_arguments(argv=None):
    """
    List of arguments provided by the user
    param argv: optional list of arguments, the command line is used when None
    return: parsed arguments
    """
    # parse options and arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("data_manager_json",
                            help="json file from galaxy")
    arg_parser.add_argument("--db_version", default="latest",
                            help="select the major version of the database (e.g. 3.10, 3.8), default is latest")
    arg_parser.add_argument("--db_date",
                            help="select the date into the database version (e.g. 2022-10-11.2)")
    arg_parser.add_argument("--test", action='store_true',
                            help="option to test the script with an lighted database")
    return arg_parser.parse_args(argv)


def main(argv=None):
    """
    Download the database, build the hmm/blast indexes and write the data
    table into the json file
    param argv: optional list of arguments, the command line is used when None
    """
    all_args = parse_arguments(argv)
    amrfinderplus_download = DownloadAmrFinderPlusDatabase(amrfinderplus_version=all_args.db_version,
                                                           date_version=all_args.db_date,
                                                           json_file_path=all_args.data_manager_json,
                                                           test_mode=all_args.test)
    amrfinderplus_download.read_json_input_file()
    amrfinderplus_download.download_amrfinderplus_db()
    amrfinderplus_download.make_hmm_profile()
    amrfinderplus_download.make_blastdb()
    amrfinderplus_download.write_json_infos()


if __name__ == '__main__':
    main()

# Remaining (non-Python) parts of the changeset, retained verbatim as comments.
# diff -r 000000000000 -r 43ec3aadda50 data_manager/data_manager_build_amrfinderplus.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/data_manager/data_manager_build_amrfinderplus.xml Tue Dec 20 13:43:51 2022 +0000
# @@ -0,0 +1,92 @@
# + + AMRfinderplus database builder + + macro.xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1038/s41598-021-91456-0 + +
# diff -r 000000000000 -r 43ec3aadda50 data_manager/macro.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/data_manager/macro.xml Tue Dec 20 13:43:51 2022 +0000
# @@ -0,0 +1,15 @@
# + + + 3.10.45 + 3.10.6 + 1.5.1 + 0 + 21.05 + + + ncbi-amrfinderplus + python + pandas + + +
# \ No newline at end of file
# diff -r 000000000000 -r 43ec3aadda50 data_manager_conf.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/data_manager_conf.xml Tue Dec 20 13:43:51 2022 +0000
# @@ -0,0 +1,19 @@
# + + + + + + + + + + ${path} + amrfinderplus-db/${value} + + ${GALAXY_DATA_MANAGER_DATA_PATH}/amrfinderplus-db/${value} + abspath + + + + +
# diff -r 000000000000 -r 43ec3aadda50 test-data/amrfinderplus.loc.test
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/test-data/amrfinderplus.loc.test Tue Dec 20 13:43:51 2022 +0000
# @@ -0,0 +1,8 @@ +# this is a tab
separated file describing the location of amrfinderplus database +# +# the columns are: +# value, name, path +# +# for example +amrfinderplus_V3.10_2022-10-11.2 V3.10-2022-10-11.2 amrfinderplus-db +amrfinderplus_V3.6_2020-03-20.1 V3.6-2020-03-20.1 amrfinderplus-db diff -r 000000000000 -r 43ec3aadda50 test-data/amrfinderplus_test_data_manager.json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/amrfinderplus_test_data_manager.json Tue Dec 20 13:43:51 2022 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"amrfinderplus_database": [{"name": "V3.6-2020-03-20.1", "path": "amrfinderplus-db", "value": "amrfinderplus_V3.6_2020-03-20.1"}]}} \ No newline at end of file diff -r 000000000000 -r 43ec3aadda50 test-data/amrfinderplus_test_data_manager_1.json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/amrfinderplus_test_data_manager_1.json Tue Dec 20 13:43:51 2022 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"amrfinderplus_database": [{"name": "V3.10-2022-10-11.2", "path": "amrfinderplus-db", "value": "amrfinderplus_V3.10_2022-10-11.2"}]}} \ No newline at end of file diff -r 000000000000 -r 43ec3aadda50 test-data/amrfinderplus_test_data_manager_2.json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/amrfinderplus_test_data_manager_2.json Tue Dec 20 13:43:51 2022 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"amrfinderplus_database": [{"name": "V3.6-2020-03-20.1", "path": "amrfinderplus-db", "value": "amrfinderplus_V3.6_2020-03-20.1"}]}} \ No newline at end of file diff -r 000000000000 -r 43ec3aadda50 tool-data/amrfinderplus.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/amrfinderplus.loc Tue Dec 20 13:43:51 2022 +0000 @@ -0,0 +1,7 @@ +# this is a tab separated file describing the location of amrfinderplus database +# +# the columns are: +# value, name, path +# +# for example +amrfinderplus_V3.6_2020-03-20.1 V3.6-2020-03-20.1 amrfinderplus-db \ No newline at end of file diff -r 000000000000 -r 43ec3aadda50 tool_data_table_conf.xml.sample --- 
/dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Dec 20 13:43:51 2022 +0000 @@ -0,0 +1,7 @@ + + + + value, name, path + +
+
\ No newline at end of file diff -r 000000000000 -r 43ec3aadda50 tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Tue Dec 20 13:43:51 2022 +0000 @@ -0,0 +1,7 @@ + + + + value, name, path + +
+