changeset 10:92f9975f08e2 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bracken_database/ commit 57b36cfbdd1663aef43d03b76e37364cc5bdeef3-dirty"
author dfornika
date Thu, 04 Nov 2021 21:33:16 +0000
parents 472b3834d65d
children 2a4d2363ebb9
files data_manager/bracken_build_database.py data_manager/bracken_build_database.xml data_manager_conf.xml test-data/bracken_databases.loc test-data/kraken2_databases.loc test-data/nodes_patterns.txt test-data/reproduce_test_dataset.sh test-data/test_db/hash.k2d test-data/test_db/library/added/9C7DdW7GAD.fna test-data/test_db/library/added/9C7DdW7GAD.fna.masked test-data/test_db/library/added/cWk1IBlK73.fna test-data/test_db/library/added/cWk1IBlK73.fna.masked test-data/test_db/library/added/prelim_map.txt test-data/test_db/library/added/prelim_map_QXr8C5PiOX.txt test-data/test_db/library/added/prelim_map_l8ftMYsZv0.txt test-data/test_db/opts.k2d test-data/test_db/seqid2taxid.map test-data/test_db/taxo.k2d test-data/test_db/taxonomy/names.dmp test-data/test_db/taxonomy/nodes.dmp test-data/test_db/taxonomy/nucl_gb.accession2taxid test-data/test_db/taxonomy/prelim_map.txt test-data/test_db/unmapped.txt tool_data_table_conf.xml.test
diffstat 21 files changed, 156 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/bracken_build_database.py	Tue Oct 15 17:10:29 2019 -0400
+++ b/data_manager/bracken_build_database.py	Thu Nov 04 21:33:16 2021 +0000
@@ -3,14 +3,10 @@
 from __future__ import print_function
 
 import argparse
-import datetime
 import errno
 import json
 import os
-import shutil
-import string
 import subprocess
-import sys
 import uuid
 
 
@@ -19,14 +15,11 @@
 
 def bracken_build_database(target_directory, bracken_build_args, database_name, data_table_name=DATA_TABLE_NAME):
 
-    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")
-
     database_value = str(uuid.uuid4())
 
     database_name = database_name
 
-    database_path = os.path.join(bracken_build_args['kraken_database'], 'database' + str(bracken_build_args['read_len']) + 'mers.kmer_distrib') 
-
+    database_path = os.path.join(bracken_build_args['kraken_database'], 'database' + str(bracken_build_args['read_len']) + 'mers.kmer_distrib')
 
     bracken_build_args_list = [
         '-t', bracken_build_args['threads'],
@@ -37,7 +30,6 @@
 
     subprocess.check_call(['bracken-build'] + bracken_build_args_list)
 
-    
     data_table_entry = {
         "data_tables": {
             data_table_name: [
@@ -56,14 +48,15 @@
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('data_manager_json')
-    parser.add_argument('--threads', dest='threads', default=1, help='threads' )
-    parser.add_argument('--kmer-len', dest='kmer_len', help='K-mer length' )
-    parser.add_argument('--read-len', dest='read_len', help='Read length' )
-    parser.add_argument('--kraken-db', dest='kraken_database', help='Kraken Database' )
+    parser.add_argument('--threads', dest='threads', default=1, help='threads')
+    parser.add_argument('--kmer-len', dest='kmer_len', help='K-mer length')
+    parser.add_argument('--read-len', dest='read_len', help='Read length')
+    parser.add_argument('--kraken-db', dest='kraken_database', help='Kraken Database')
     parser.add_argument('--database-name', dest='database_name', help='Database Name')
     args = parser.parse_args()
 
-    data_manager_input = json.loads(open(args.data_manager_json).read())
+    with open(args.data_manager_json) as fh:
+        data_manager_input = json.load(fh)
 
     target_directory = data_manager_input['output_data'][0]['extra_files_path']
 
@@ -75,9 +68,9 @@
     }
 
     try:
-        os.mkdir( target_directory )
+        os.mkdir(target_directory)
     except OSError as exc:
-        if exc.errno == errno.EEXIST and os.path.isdir( target_directory ):
+        if exc.errno == errno.EEXIST and os.path.isdir(target_directory):
             pass
         else:
             raise
@@ -90,7 +83,8 @@
         args.database_name,
     )
 
-    open(args.data_manager_json, 'wb').write(json.dumps(data_manager_output))
+    with open(args.data_manager_json, 'w') as fh:
+        json.dump(data_manager_output, fh, sort_keys=True)
 
 
 if __name__ == "__main__":
--- a/data_manager/bracken_build_database.xml	Tue Oct 15 17:10:29 2019 -0400
+++ b/data_manager/bracken_build_database.xml	Thu Nov 04 21:33:16 2021 +0000
@@ -1,16 +1,20 @@
 <?xml version="1.0"?>
-<tool id="bracken_build_database" name="Bracken Database Builder" tool_type="manage_data" version="2.5+galaxy0">
+<tool id="bracken_build_database" name="Bracken Database Builder" tool_type="manage_data" version="2.5+galaxy1" profile="19.01">
     <description>bracken database builder</description>
     <requirements>
         <requirement type="package" version="2.5">bracken</requirement>
-        <requirement type="package" version="2.0.8_beta">kraken2</requirement>
+        <requirement type="package" version="2.1.1">kraken2</requirement>
     </requirements>
-    <version_command>bracken --version</version_command>
-    <command detect_errors="exit_code">
+    <command>
     <![CDATA[
+        #import os
+        #set db_dir = os.path.basename($kraken_db.fields.path)
+
+        mkdir '$db_dir' &&
+        ln -s '${kraken_db.fields.path}'/* '$db_dir/' &&
         python '$__tool_directory__/bracken_build_database.py'
           '${out_file}'
-          --kraken-db ${kraken_db.fields.path}
+          --kraken-db '$db_dir'
           --threads \${GALAXY_SLOTS:-1}
           --kmer-len ${kmer_len}
           --read-len ${read_len}
@@ -23,13 +27,29 @@
                 <validator type="no_options" message="No Kraken2 databases are available" />
             </options>
         </param>
-        <param name="kmer_len" type="integer" min="8" max="256" value="35" label="K-mer length" />
-	<param name="read_len" type="integer" min="8" max="1000" value="100" label="Read length" />
-        <param name="database_name" type="text" label="Database Name" />
+	<conditional name="prebuilt">
+	  <param name="prebuilt" type="boolean" checked="false" truevalue="--prebuilt" falsevalue="" label="Use Pre-built DB" />
+	  <when value="">
+            <param name="kmer_len" type="integer" min="8" max="256" value="35" label="K-mer length" />
+            <param name="read_len" type="integer" min="8" max="1000" value="100" label="Read length" />
+	  </when>
+	</conditional>
+	<param name="database_name" type="text" label="Database Name" />
     </inputs>
     <outputs>
         <data name="out_file" format="data_manager_json" />
     </outputs>
+    <tests>
+        <test>
+            <param name="kraken_db" value="test_entry" />
+            <param name="database_name" value="database" />
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text="test_db/database100mers.kmer_distrib" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
     <help>
     </help>
     <citations>
--- a/data_manager_conf.xml	Tue Oct 15 17:10:29 2019 -0400
+++ b/data_manager_conf.xml	Thu Nov 04 21:33:16 2021 +0000
@@ -1,5 +1,5 @@
 <data_managers>
-    <data_manager tool_file="data_manager/bracken_build_database.xml" id="bracken_build_database" version="2.5+galaxy0">
+    <data_manager tool_file="data_manager/bracken_build_database.xml" id="bracken_build_database" version="2.6+galaxy0">
         <data_table name="bracken_databases">
             <output>
                 <column name="value"/>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/kraken2_databases.loc	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,6 @@
+# Tab separated with three columns:
+# - value (Galaxy records this in the Galaxy DB)
+# - name (Galaxy shows this in the UI)
+# - path (folder name containing the Kraken DB)
+#
+test_entry	"Test Database"	${__HERE__}/test_db
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/nodes_patterns.txt	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,15 @@
+^220341\s
+^90370\s
+^59201\s
+^28901\s
+^590\s
+^543\s
+^91347\s
+^1236\s
+^1224\s
+^2\s
+^131567\s
+^1\s
+^585057\s
+^562\s
+^561\s
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reproduce_test_dataset.sh	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# This script produces a small kraken2 database containing only a ~1 kb portion of each of a Salmonella and an E. coli genome.
+# It requires kraken2 and entrez-direct (available on bioconda).
+kraken2-build --db test_db --download-taxonomy
+mv test_db/taxonomy/nucl_gb.accession2taxid test_db/taxonomy/nucl_gb.accession2taxid_full
+grep -e 'NC_003198.1' -e 'NC_011750.1' test_db/taxonomy/nucl_gb.accession2taxid_full > test_db/taxonomy/nucl_gb.accession2taxid
+mv test_db/taxonomy/nodes.dmp test_db/taxonomy/nodes.dmp_full
+grep -f nodes_patterns.txt test_db/taxonomy/nodes.dmp_full > test_db/taxonomy/nodes.dmp
+mv test_db/taxonomy/names.dmp test_db/taxonomy/names.dmp_full
+grep -e '^220341\s' -e '^585057\s' test_db/taxonomy/names.dmp_full > test_db/taxonomy/names.dmp
+esearch -db nucleotide -query "NC_003198.1" | efetch -format fasta > NC_003198.1.fasta
+esearch -db nucleotide -query "NC_011750.1" | efetch -format fasta > NC_011750.1.fasta
+head -n 14 NC_003198.1.fasta > NC_003198.1_1kb.fasta
+head -n 14 NC_011750.1.fasta > NC_011750.1_1kb.fasta
+kraken2-build --db test_db --add-to-library NC_003198.1_1kb.fasta
+kraken2-build --db test_db --add-to-library NC_011750.1_1kb.fasta
+kraken2-build --db test_db --build
Binary file test-data/test_db/hash.k2d has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/9C7DdW7GAD.fna	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,17 @@
+>NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome
+AGAGATTACGTCTGGTTGCAAGAGATCATAACAGGGGAAATTGATTGAAAATAAATATAT
+CGCCAGCAGCACATGAACAAGTTTCGGAATGTGATCAATTTAAAAATTTATTGACTTAGG
+CGGGCAGATACTTTAACCAATATAGGAATACAAGACAGACAAATAAAAATGACAGAGTAC
+ACAACATCCATGAACCGCATCAGxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxAGGT
+AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGAACAGTGCGG
+GCxxxxxxxxCGACCAGAGATCACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT
+ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATTCC
+AGGCAAGGGCAGGTAGCGACCGTACTTTCCGCCCCCGCGAAAATTACCAACCATCTGGTG
+GCGATGATTGAAAAAACTATCGGCGGCCAGGATGCTTTGCCGAATATCAGCGATGCCGAA
+CGTATTTTTTCTGACCTGCTCGCAGGACTTGCCAGCGCGCAGCCGGGATTCCCGCTTGCA
+CGGTTGAAAATGGTTGTCGAACAAGAATTCGCTCAGATCAAACATGTTTTGCATGGTATC
+AGCCTGCTGGGTCAGTGCCCGGATAGCATCAACGCCGCGCTGATTTGCCGTGGCGAAAAA
+ATGTCGATCGCGATTATGGCGGGACTCCTGGAGGCGCGTGGACATCGCGTCACGGTGATC
+GATCCGGTAGAAAAACTGCTGGCGGTGGGCCATTACCTTGAATCTACCGTCGATATCGCG
+GAATCGACTCGCCGTATCGCCGCCAGCCAGATCCCGGCCGATCACATGATCCTGATGGCG
+GGCTTTACTG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/cWk1IBlK73.fna	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,17 @@
+>NC_011750.1 Escherichia coli IAI39 chromosome, complete genome
+GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTxxxxxxxGAGTGTCT
+GATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGT
+CACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACA
+CAACATCCATGAAACGCATTAGxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxAGGTA
+ACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGC
+xxxxxxxxCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGTAC
+ATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAG
+GCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGC
+GATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACG
+TATTTTTGCCGAACTTCTGACGGGACTCGCCGCTGCCCAACCGGGATTCCCGCTGGCGCA
+ACTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAG
+TTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAAT
+GTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACCGTTATCGA
+TCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGA
+GTCCACCCGCCGTATTGCGGCAAGTCGTATTCCGGCTGATCACATGGTGCTGATGGCAGG
+TTTCACCGCC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/prelim_map.txt	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,2 @@
+ACCNUM	NC_011750.1	NC_011750
+ACCNUM	NC_003198.1	NC_003198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/prelim_map_QXr8C5PiOX.txt	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,1 @@
+ACCNUM	NC_003198.1	NC_003198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/library/added/prelim_map_l8ftMYsZv0.txt	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,1 @@
+ACCNUM	NC_011750.1	NC_011750
Binary file test-data/test_db/opts.k2d has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/seqid2taxid.map	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,1 @@
+NC_011750.1	585057
Binary file test-data/test_db/taxo.k2d has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/names.dmp	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,5 @@
+220341	|	Salmonella enterica subsp. enterica serovar Typhi CT18	|		|	equivalent name	|
+220341	|	Salmonella enterica subsp. enterica serovar Typhi str. CT18	|		|	scientific name	|
+220341	|	Salmonella enterica subsp. enterica serovar Typhi strain CT18	|		|	equivalent name	|
+220341	|	Salmonella typhi CT18	|		|	equivalent name	|
+585057	|	Escherichia coli IAI39	|		|	scientific name	|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/nodes.dmp	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,15 @@
+1	|	1	|	no rank	|		|	8	|	0	|	1	|	0	|	0	|	0	|	0	|	0	|		|
+2	|	131567	|	superkingdom	|		|	0	|	0	|	11	|	0	|	0	|	0	|	0	|	0	|		|
+543	|	91347	|	family	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+561	|	543	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+562	|	561	|	species	|	EC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+590	|	543	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+1224	|	2	|	phylum	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+1236	|	1224	|	class	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+28901	|	590	|	species	|	SE	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+59201	|	28901	|	subspecies	|	SE	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+90370	|	59201	|	no rank	|		|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+91347	|	1236	|	order	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+131567	|	1	|	no rank	|		|	8	|	1	|	1	|	1	|	0	|	1	|	1	|	0	|		|
+220341	|	90370	|	no rank	|		|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+585057	|	562	|	no rank	|		|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/nucl_gb.accession2taxid	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,2 @@
+NC_003198	NC_003198.1	220341	16758993
+NC_011750	NC_011750.1	585057	218698419
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/prelim_map.txt	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,2 @@
+ACCNUM	NC_011750.1	NC_011750
+ACCNUM	NC_003198.1	NC_003198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/unmapped.txt	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,1 @@
+NC_003198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Thu Nov 04 21:33:16 2021 +0000
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- Locations of Kraken database in the required format -->
+    <table name="kraken2_databases" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/kraken2_databases.loc" />
+    </table>
+    <!-- Locations of bracken databases in the required format -->
+    <table name="bracken_databases" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/bracken_databases.loc" />
+    </table>
+</tables>