Mercurial > repos > iuc > data_manager_nextclade
changeset 0:6e64cb3d2b1d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_nextclade commit 3d6dabd066dcbe31cfa38fbfac340e253d8a984d
author | iuc |
---|---|
date | Sat, 30 Jul 2022 08:09:07 +0000 |
parents | |
children | 8b7bb3c635b1 |
files | data_manager/nextclade_dm.py data_manager/nextclade_dm.xml data_manager_conf.xml test-data/nextclade.loc tool-data/nextclade.loc tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 7 files changed, 367 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# Diff header of this file in the changeset:
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/nextclade_dm.py Sat Jul 30 08:09:07 2022 +0000 @@ -0,0 +1,189 @@
"""Galaxy data manager script that downloads nextclade datasets.

The script asks the ``nextclade`` CLI for the list of available datasets,
selects releases (either the latest one per dataset or those within a date
range), downloads each selected release and writes the resulting data table
rows as JSON to the Galaxy config file given on the command line.
"""

import argparse
import datetime
import json
import operator
import pathlib
import subprocess
import sys
from typing import Container, List, Optional


def parse_date(d: str) -> datetime.datetime:
    """Parse a nextclade release tag timestamp or a user-supplied date.

    Two formats are accepted: ``YYYY-MM-DDTHH:MM:SSZ`` and ``YYYY-MM-DD``.

    Raises:
        ValueError: if ``d`` matches neither format.
    """
    try:
        return datetime.datetime.strptime(d, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        return datetime.datetime.strptime(d, "%Y-%m-%d")


def entry_to_tag(entry: dict) -> str:
    """Return the ``<name>_<tag>`` identifier of a ``nextclade dataset list`` entry."""
    attributes = entry["attributes"]
    return attributes["name"]["value"] + "_" + attributes["tag"]["value"]


def get_database_list() -> List[dict]:
    """Run ``nextclade dataset list`` and return one dict per dataset release.

    Each returned dict has the keys ``value``, ``database_name``,
    ``description``, ``date`` (a naive ``datetime``), ``tag`` and
    ``min_nextclade_version``. The order of the CLI output is preserved.

    Raises:
        subprocess.CalledProcessError: if the nextclade command fails.
    """
    list_cmd = [
        "nextclade",
        "dataset",
        "list",
        "--json",
        "--include-old",
        "--include-incompatible",
    ]
    list_proc = subprocess.run(list_cmd, capture_output=True, check=True)
    entry_list = []
    for db_entry in json.loads(list_proc.stdout):
        attributes = db_entry["attributes"]
        tag = attributes["tag"]["value"]
        entry_list.append(
            {
                "value": entry_to_tag(db_entry),
                "database_name": attributes["name"]["value"],
                "description": attributes["name"]["valueFriendly"],
                # the trailing "Z" is dropped before parsing, which yields a
                # naive datetime comparable with the output of parse_date()
                "date": datetime.datetime.fromisoformat(tag.replace("Z", "")),
                "tag": tag,
                "min_nextclade_version": db_entry["compatibility"]["nextcladeCli"][
                    "min"
                ],
            }
        )
    return entry_list


def filter_by_date(
    existing_release_tags: Container[str],
    name: str,
    releases: list,
    start_date: Optional[datetime.datetime] = None,
    end_date: Optional[datetime.datetime] = None,
) -> List[dict]:
    """Select the releases of dataset ``name`` that fall within a date range.

    Args:
        existing_release_tags: ``value`` identifiers that are already
            installed; matching releases are skipped.
        name: dataset name to select (compared with ``database_name``).
        releases: release dicts as produced by ``get_database_list``.
        start_date: if given, releases dated before it are excluded.
        end_date: if given, releases dated after it are excluded.

    Returns:
        The matching releases, in the order they appear in ``releases``.
    """
    ret = []
    for release in releases:
        if (
            release["database_name"] != name
            or release["value"] in existing_release_tags
        ):
            continue
        # Skip (rather than stop at) out-of-range releases so that the result
        # does not depend on the input being sorted newest-first.
        if start_date and release["date"] < start_date:
            continue
        if end_date and release["date"] > end_date:
            continue
        ret.append(release)
    return ret


def download_and_unpack(name: str, release: str, output_directory: str) -> pathlib.Path:
    """Download one dataset release with ``nextclade dataset get``.

    The dataset is written to ``<output_directory>/<name>_<release>``, with
    every ``:`` in ``release`` replaced by ``-`` in the directory name.

    Returns:
        The path of the directory the dataset was downloaded to.

    Raises:
        subprocess.CalledProcessError: if the nextclade command fails.
    """
    output_path = pathlib.Path(output_directory) / (
        name + "_" + release.replace(":", "-")
    )
    download_cmd = [
        "nextclade",
        "dataset",
        "get",
        "--name",
        name,
        "--tag",
        release,
        "--output-dir",
        str(output_path),
    ]
    subprocess.run(download_cmd, check=True)
    return output_path


def comma_split(args: str) -> List[str]:
    """Split a comma-separated string, dropping blank items and padding.

    An empty string yields an empty list instead of ``[""]``.
    """
    return [item.strip() for item in args.split(",") if item.strip()]


def _latest_releases(
    existing_release_tags: Container[str], datasets: List[str], releases: List[dict]
) -> List[dict]:
    """Pick the first listed release of each dataset unless it is already installed.

    NOTE(review): this treats the first entry listed for a dataset as its
    latest release, i.e. it assumes the list is ordered newest-first -
    confirm against the nextclade CLI output.
    """
    selected = []
    for dataset in datasets:
        newest = next(
            (rel for rel in releases if rel["database_name"] == dataset), None
        )
        if newest is not None and newest["value"] not in existing_release_tags:
            selected.append(newest)
    return selected


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point; ``argv`` defaults to ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--testmode", default=False, action="store_true")
    parser.add_argument("--latest", default=False, action="store_true")
    parser.add_argument("--start_date", type=parse_date)
    parser.add_argument("--end_date", type=parse_date)
    parser.add_argument("--known_revisions", type=comma_split)
    parser.add_argument("--datasets", type=comma_split, default=["sars-cov-2"])
    parser.add_argument("datatable_name", default="nextclade")
    parser.add_argument("galaxy_config")
    args = parser.parse_args(argv)

    # known-revisions is populated from the Galaxy data table by the wrapper
    existing_release_tags = set(args.known_revisions or [])

    releases_available = get_database_list()
    if args.testmode:
        # test mode only prints the matching releases; nothing is downloaded
        # and already-installed releases are not excluded
        for name in args.datasets:
            for release in filter_by_date(
                [],
                name,
                releases_available,
                start_date=args.start_date,
                end_date=args.end_date,
            ):
                print(
                    release["value"],
                    release["description"],
                    release["date"].isoformat(),
                    release["min_nextclade_version"],
                )
        return

    with open(args.galaxy_config) as fh:
        config = json.load(fh)

    output_directory = config.get("output_data", [{}])[0].get("extra_files_path", None)

    if args.latest:
        releases = _latest_releases(
            existing_release_tags, args.datasets, releases_available
        )
    else:
        releases = []
        for dataset in args.datasets:
            releases.extend(
                filter_by_date(
                    existing_release_tags,
                    dataset,
                    releases_available,
                    start_date=args.start_date,
                    end_date=args.end_date,
                )
            )

    if releases and output_directory is None:
        # fail with a clear message instead of a TypeError from pathlib
        sys.exit("No extra_files_path found in the Galaxy config output_data")

    rows = []
    for release in releases:
        fname = download_and_unpack(
            release["database_name"], release["tag"], output_directory
        )
        rows.append(
            {
                "value": release["value"],
                "database_name": release["database_name"],
                "description": release["description"],
                "min_nextclade_version": release["min_nextclade_version"],
                "date": release["date"].isoformat(),  # ISO 8601 is easily sortable
                # fname already includes output_directory
                "path": str(fname),
            }
        )
    rows.sort(key=operator.itemgetter("value"), reverse=True)

    data_manager_dict = {"data_tables": {args.datatable_name: rows}}
    with open(args.galaxy_config, "w") as fh:
        json.dump(data_manager_dict, fh, indent=2, sort_keys=True)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/nextclade_dm.xml Sat Jul 30 08:09:07 2022 +0000 @@ -0,0 +1,121 @@ +<tool id="data_manager_nextclade" name="nextclade data manager" version="0.0.1+galaxy0" tool_type="manage_data" profile="20.01"> + + <requirements> + <requirement type="package" version="3.8">python</requirement> + <requirement type="package" version="2.3.0">nextclade</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + #set $data_table = $__app__.tool_data_tables.get('nextclade') + #if $data_table is not None and len($data_table.get_fields()) != 0 + #set $known_revisions = '--known_revisions=' + ','.join([row[0] for row in $data_table.get_fields()]) + #else + #set $known_revisions = '' + #end if + #if str($additional_datasets).strip() != '' + #if str($datasets) != '' + #set $dataset_list = $datasets + ',' + str($additional_datasets).strip() + #else + #set $dataset_list = str($additional_datasets) + #end if + #else + #set $dataset_list = str($datasets) + #end if + python '$__tool_directory__/nextclade_dm.py' + $known_revisions + #if $release.which == "latest" + --latest + #else if $release.which == "date_range" + #if str($release.start_date).strip() != "" + --start_date '$release.start_date' + #end if + #if str($release.end_date).strip() != "" + --end_date '$release.end_date' + #end if + #end if + --datasets '$dataset_list' + 'nextclade' + '${output_file}' + ]]></command> + <inputs> + <param name="datasets" type="select" label="Select nextclade datasets" multiple="true"> + <option value="sars-cov-2" selected="true">SARS-CoV-2</option> + <option value="MPXV">Monkeypox (All Clades)</option> + <option value="hMPXV">Human Monkeypox (hMPXV)</option> + <option value="hMPXV_B1">Human Monkeypox Clade B.1</option> + <option value="flu_h1n1pdm_ha">Influenza A H1N1pdm HA</option> + <option value="flu_h3n2_ha">Influenza A H3N2 HA</option> + <option value="flu_vic_ha">Influenza B Victoria HA</option> + <option 
value="flu_yam_ha">Influenza B Yamagata HA</option> + <option value="sars-cov-2-no-recomb">SARS-CoV-2 without recombinants</option> + </param> + <param name="additional_datasets" type="text" label="Additional nextclade dataset names" help="If you want to download datasets that are not in the list above, enter their names here, separated by commas"> + <validator type="regex" message="Dataset names consist of letters, numbers, underscore and hyphens, with multiple names separated by ,">^[-A-Za-z0-9_]?[-A-Za-z0-9_,]*$</validator> + </param> + <conditional name="release"> + <param name="which" type="select" label="Select nextclade dataset(s) release"> + <option value="latest" selected="true">Latest</option> + <option value="date_range">Date range</option> + </param> + <when value="latest"> + </when> + <when value="date_range"> + <param name="start_date" type="text" label="Start date (YYYY-MM-DD)" help="Don't download models older than this date" optional="true"> + <validator type="regex" message="Dates are in YYYY-MM-DD format">\d{4}-\d{2}-\d{2}$</validator> + </param> + <param name="end_date" type="text" label="End date (YYYY-MM-DD)" help="Don't download models newer than this date" optional="true"> + <validator type="regex" message="Dates are in YYYY-MM-DD format">\d{4}-\d{2}-\d{2}$</validator> + </param> + </when> + </conditional> + </inputs> + <outputs> + <data name="output_file" format="data_manager_json"/> + </outputs> + <tests> + <test expect_num_outputs="1"> + <conditional name="release"> + <param name="which" value="date_range" /> + <param name="start_date" value="2022-03-01" /> + <param name="end_date" value="2022-04-01" /> + </conditional> + <output name="output_file"> + <assert_contents> + <has_text text='"database_name": "sars-cov-2"' /> + <has_text text='sars-cov-2_2022-03-31T12-00-00Z' /> + <has_text text='sars-cov-2_2022-03-24T12-00-00Z' /> + <has_text text='sars-cov-2_2022-03-14T12-00-00Z"' /> + <has_text text='"min_nextclade_version": "1.10.0"' /> + 
</assert_contents> + </output> + </test> + <test expect_num_outputs="1"> + <param name="datasets" value="MPXV,hMPXV" /> + <conditional name="release"> + <param name="which" value="latest" /> + </conditional> + <output name="output_file"> + <assert_contents> + <has_text text='"database_name": "MPXV"' /> + <has_text text='"database_name": "hMPXV"' /> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ + This data manager fetches databases for the nextclade_ viral genome typing tool and + updates the nextclade database. + + The default is to fetch the latest version of the data tables, but ranges of dates + can also be specified to fetch releases that are within those dates. The data manager + has a built-in list of databases that can be fetched and users can specify others by name + if they want something that is not on the list. + + The data manager will read the existing data tables and not re-download or replace databases + that are already present in those data tables. + + .. _nextclade: https://clades.nextstrain.org/ + ]]></help> + <citations> + <citation type="doi">10.21105/joss.03773</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Sat Jul 30 08:09:07 2022 +0000 @@ -0,0 +1,23 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/nextclade_dm.xml" id="data_manager_nextclade"> + <data_table name="nextclade"> + <output> + <column name="value" /> + <column name="database_name" /> + <column name="description" /> + <column name="min_nextclade_version" /> + <column name="date" /> + <column name="path" output_ref="output_file" > + <!-- note: the Python script sanitises the possibly user-supplied scheme name ('value') --> + <move type="directory" relativize_symlinks="True"> + <source>${path}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">nextclade/#echo str($value).replace(':', '-')#</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/nextclade/#echo str($value).replace(':', '-')#</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/nextclade.loc Sat Jul 30 08:09:07 2022 +0000 @@ -0,0 +1,10 @@ +# this is a tab separated file describing the location of nextclade databases used for the +# nextclade viral genome typing tool +# +# the columns are: +# value database_name description min_nextclade_version date path +# +# min_nextclade_version is the minimum nextclade tool version that is needed to read the nextclade data +# +# for example +#sars-cov-2_2022-06-14T12:00:00Z sars-cov-2 SARS-CoV-2 1.10.0 2022-06-14T12:00:00 /srv/galaxy/tool-data/nextclade/sars-cov-2_2022-06-14T12-00-00Z
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/nextclade.loc Sat Jul 30 08:09:07 2022 +0000 @@ -0,0 +1,10 @@ +# this is a tab separated file describing the location of nextclade databases used for the +# nextclade viral genome typing tool +# +# the columns are: +# value database_name description min_nextclade_version date path +# +# min_nextclade_version is the minimum nextclade tool version that is needed to read the nextclade data +# +# for example +#sars-cov-2_2022-06-14T12:00:00Z sars-cov-2 SARS-CoV-2 1.10.0 2022-06-14T12:00:00 /srv/galaxy/tool-data/nextclade/sars-cov-2_2022-06-14T12-00-00Z
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Sat Jul 30 08:09:07 2022 +0000 @@ -0,0 +1,7 @@ +<tables> + <table name="nextclade" comment_char="#" allow_duplicate_entries="False"> + <!-- min_nextclade_version is the minimum nextclade tool version that is needed to read the nextclade data --> + <columns>value, database_name, description, min_nextclade_version, date, path</columns> + <file path="tool-data/nextclade.loc" /> + </table> +</tables> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Sat Jul 30 08:09:07 2022 +0000 @@ -0,0 +1,7 @@ +<tables> + <table name="nextclade" comment_char="#" allow_duplicate_entries="False"> + <!-- min_nextclade_version is the minimum nextclade tool version that is needed to read the nextclade data --> + <columns>value, database_name, description, min_nextclade_version, date, path</columns> + <file path="${__HERE__}/test-data/nextclade.loc" /> + </table> +</tables> \ No newline at end of file