Mercurial > repos > diodupima > data_manager_coast_taxonomic_filters

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Wed Jul 07 15:13:50 2021 +0000
@@ -0,0 +1,8 @@
+# COAST's Taxonomic Filters - Data Manager
+
+This data manager helps an admin provide the users of a Galaxy instance with pre-cached taxonomic filters. This helps increase
+the speed of queries against larger databases.
+This tool is aimed at BLAST, because DIAMOND is already capable of taking higher-order taxids as filters.
+
+To use it, you only need to provide the taxid of your desired root node, and the tool will take it from there.
+Optionally, you can provide a name, which will also be visible to your users alongside the taxid, to help them identify the filter.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Wed Jul 07 15:13:50 2021 +0000
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<!-- Registers the COAST taxonomic filter builder and maps its JSON output onto the
+     coast_taxonomic_filters tool data table. -->
+<data_managers>
+    <data_manager tool_file="txids_dm.xml" id="coast_taxonomic_filter_builder">
+        <data_table name="coast_taxonomic_filters">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <column name="node_name" />
+                <column name="path" output_ref="out_file" >
+                    <!-- The whole extra-files directory of out_file is moved under the data manager path. -->
+                    <move type="directory">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">COAST_taxonomic_filters/${path}</target>
+                    </move>
+                    <!-- Must use the same directory name as the move target above. The taxid list
+                         itself is written as <value>.txids inside the moved directory; only the
+                         table columns (value, name, node_name, path) are available here. -->
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/COAST_taxonomic_filters/${path}/${value}.txids</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/high_order2species.sh	Wed Jul 07 15:13:50 2021 +0000
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+CURRENT_DATE=`date +"%Y.%m.%d_%H.%M.%S"`
+high_order=$1
+name=$2
+directory=$PWD
+
+# Get the data
+wget -c ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
+tar -zxvf taxdump.tar.gz
+
+# Removed indentation so it only has 1 collumn
+taxonkit list --ids $high_order --data-dir $directory \
+  | taxonkit filter --equal-to Species --lower-than Species --save-predictable-norank --data-dir $directory  \
+   > ${name}.txids
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ho2s.py	Wed Jul 07 15:13:50 2021 +0000
@@ -0,0 +1,44 @@
+import argparse
+import datetime
+import json
+import os
+import subprocess
+
+from galaxy.util.json import from_json_string, to_json_string
+
+
+def script_cli():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--file",
+                        help="JSON options file",
+                        type=int)
+    parser.add_argument("--tool_data_table_name",
+                        help="Tool data table name",
+                        type=str)
+    args = parser.parse_args()
+
+    params = from_json_string(open(args.filename).read())
+    target_directory = params["output_data"][0]["extra_files_path"]
+    os.mkdir(target_directory)
+
+    taxid = params.params["output_data"]["taxid"]
+    node_name = params.params["output_data"]["node_name"]
+    name = " ".join([node_name, f"(taxid - {taxid})", f"(date - {datetime.datetime.now().strftime('%Y_%m_%d_%H')})"])
+    tool_data_table_name = args.tool_data_table_name
+    data_id = "_".join([taxid, datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")])
+
+    subprocess.call(f". high_order2species.sh {taxid} {data_id}", shell=True, cwd=target_directory)
+    data_table_entry = {
+        "value": data_id,
+        "name": name,
+        "node_name": node_name,
+        "path": os.path.join(taxid, data_id)+".txids"
+    }
+    data_manager_dict = {
+        "data_tables": {tool_data_table_name: [data_table_entry]}
+    }
+    output_json = open(args.filename, "wb")
+    output_json.write(to_json_string(data_manager_dict))
+    output_json.close()
+
+
+if __name__ == "__main__":
+    script_cli()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/txids_dm.xml	Wed Jul 07 15:13:50 2021 +0000
@@ -0,0 +1,57 @@
+<!-- Galaxy data manager tool: turns a high-order taxid into a species-level taxid list
+     and registers it in the coast_taxonomic_filters data table. -->
+<tool id="data_manager_taxonomic_filters" name="COAST taxonomic filter generator" version="0.1" tool_type="manage_data">
+    <description>
+    </description>
+    <requirements>
+        <requirement type="package" version="0.8">taxonkit</requirement>
+    </requirements>
+    <!-- The script is addressed through $__tool_directory__ instead of the deprecated
+         "interpreter" attribute. -->
+    <command><![CDATA[python '$__tool_directory__/ho2s.py' --file '${out_file}' --tool_data_table_name 'coast_taxonomic_filters']]></command>
+    <inputs>
+        <param name="taxid" type="integer" label="TAXID for the desired top node" help="Root Taxonomy node." optional="False"/>
+        <param name="node_name" type="text" label="The name you want for the node" help="Label for the filter." optional="False"/>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="taxid" value="10239"/>
+            <param name="node_name" value="Virus"/>
+            <output name="out_file" file="10239_out.json"/>
+        </test>
+    </tests>
+    <help>
+        Generates taxonomic filters from high-order taxid nodes.
+        Used by PhageCOAST for superkingdom-scale filtering.
+        It generates a species-level taxid list in a file, ready to be provided to BLAST.
+        Can also be used for lower-ranked taxids, if you wish to provide, for example, a genus- or phylum-level filter.
+    </help>
+    <citations>
+        <citation type="bibtex">@misc{noauthor_coast_nodate,
+                title = {{COAST} - {Compartive} {Ominc} {Alignment} {Search} {Tool}},
+                url = {https://gitlab.com/coast_tool/COAST},
+                abstract = {Alignment search tool that identifies close proteomes},
+                language = {en},
+                urldate = {2021-06-22},
+            }
+        </citation>
+        <citation type="bibtex">@article{shen_taxonkit_2021,
+                abstract = {The National Center for Biotechnology Information (NCBI) Taxonomy is widely applied in biomedical and ecological studies. Typical demands include querying taxonomy identifier (TaxIds) by taxonomy names, querying complete taxonomic lineages by TaxIds, listing descendants of given TaxIds, and others. However, existed tools are either limited in functionalities or inefficient in terms of runtime. In this work, we present TaxonKit, a command-line toolkit for comprehensive and efficient manipulation of NCBI Taxonomy data. TaxonKit comprises seven core subcommands providing functions, including TaxIds querying, listing, filtering, lineage retrieving and reformatting, lowest common ancestor computation, and TaxIds change tracking. The practical functions, competitive processing performance, scalability with different scales of datasets and good accessibility could facilitate taxonomy data manipulations. TaxonKit provides free access under the permissive MIT license on GitHub, Brewsci, and Bioconda. The documents are also available at https://bioinf.shenwei.me/taxonkit/.},
+                author = {Shen, Wei and Ren, Hong},
+                doi = {10.1016/j.jgg.2021.03.006},
+                file = {ScienceDirect Snapshot:/home/dm/Zotero/storage/Q3KYT6QS/S1673852721000837.html:text/html},
+                issn = {1673-8527},
+                journal = {Journal of Genetics and Genomics},
+                keywords = {Lineage; NCBI Taxonomy; TaxId; TaxId changelog; TaxonKit},
+                language = {en},
+                month = apr,
+                shorttitle = {{TaxonKit}},
+                title = {{TaxonKit}: {A} practical and efficient {NCBI} taxonomy toolkit},
+                url = {https://www.sciencedirect.com/science/article/pii/S1673852721000837},
+                urldate = {2021-06-21},
+                year = {2021}
+            }
+        </citation>
+    </citations>
+</tool>
\ No newline at end of file