Galaxy |

Changeset 39:22b7cdcf4960 (2014-02-20)

Previous changeset 38:2f7fac29bb3c (2014-01-15) Next changeset 40:f83e5d79b6ab (2014-02-26)

Commit message:
Uploaded v0.1.0 preview 2, includes missing new test files

modified:
tools/ncbi_blast_plus/README.rst
tools/ncbi_blast_plus/blastxml_to_tabular.py
tools/ncbi_blast_plus/blastxml_to_tabular.xml
tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml
tools/ncbi_blast_plus/ncbi_macros.xml
tools/ncbi_blast_plus/ncbi_makeblastdb.xml
tools/ncbi_blast_plus/tool_dependencies.xml

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/blastn_rhodopsin_vs_three_human.columns.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_rhodopsin_vs_three_human.columns.tabular Thu Feb 20 05:39:48 2014 -0500

@@ -0,0 +1,7 @@
+gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 92.07 1047 1213
+gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 91.59 4301 1213
+gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 91.36 4301 1213
+gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 94.22 4301 1213
+gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 92.94 4301 1213
+gi|283855822|gb|GQ290312.1| ENA|BC112106|BC112106.1 91.55 983 1213
+gi|18148870|dbj|AB062417.1| ENA|BC112106|BC112106.1 87.50 1047 1213

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/blastn_rhodopsin_vs_three_human.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_rhodopsin_vs_three_human.xml Thu Feb 20 05:39:48 2014 -0500

b'@@ -0,0 +1,549 @@\n+<?xml version="1.0"?>\n+<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n+<BlastOutput>\n+ <BlastOutput_program>blastn</BlastOutput_program>\n+ <BlastOutput_version>BLASTN 2.2.29+</BlastOutput_version>\n+ <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>\n+ <BlastOutput_db></BlastOutput_db>\n+ <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n+ <BlastOutput_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</BlastOutput_query-def>\n+ <BlastOutput_query-len>1047</BlastOutput_query-len>\n+ <BlastOutput_param>\n+ <Parameters>\n+ <Parameters_expect>1e-40</Parameters_expect>\n+ <Parameters_sc-match>1</Parameters_sc-match>\n+ <Parameters_sc-mismatch>-2</Parameters_sc-mismatch>\n+ <Parameters_gap-open>0</Parameters_gap-open>\n+ <Parameters_gap-extend>0</Parameters_gap-extend>\n+ <Parameters_filter>L;m;</Parameters_filter>\n+ </Parameters>\n+ </BlastOutput_param>\n+<BlastOutput_iterations>\n+<Iteration>\n+ <Iteration_iter-num>1</Iteration_iter-num>\n+ <Iteration_query-ID>Query_1</Iteration_query-ID>\n+ <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n+ <Iteration_query-len>1047</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>0</Statistics_db-num>\n+ <Statistics_db-len>0</Statistics_db-len>\n+ <Statistics_hsp-len>15</Statistics_hsp-len>\n+ <Statistics_eff-space>4933992</Statistics_eff-space>\n+ <Statistics_kappa>0.46</Statistics_kappa>\n+ <Statistics_lambda>1.28</Statistics_lambda>\n+ <Statistics_entropy>0.85</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ 
<Iteration_iter-num>2</Iteration_iter-num>\n+ <Iteration_query-ID>Query_1</Iteration_query-ID>\n+ <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n+ <Iteration_query-len>1047</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>0</Statistics_db-num>\n+ <Statistics_db-len>0</Statistics_db-len>\n+ <Statistics_hsp-len>15</Statistics_hsp-len>\n+ <Statistics_eff-space>4933992</Statistics_eff-space>\n+ <Statistics_kappa>0.46</Statistics_kappa>\n+ <Statistics_lambda>1.28</Statistics_lambda>\n+ <Statistics_entropy>0.85</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>3</Iteration_iter-num>\n+ <Iteration_query-ID>Query_1</Iteration_query-ID>\n+ <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n+ <Iteration_query-len>1047</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+ <Hit_num>1</Hit_num>\n+ <Hit_id>Subject_3</Hit_id>\n+ <Hit_def>ENA|BC112106|BC112106.1 Homo sapiens rhodopsin, mRNA (cDNA clone MGC:138311 IMAGE:8327574), complete cds</Hit_def>\n+ <Hit_accession>Subject_3</Hit_accession>\n+ <Hit_len>1213</Hit_len>\n+ <Hit_hsps>\n+ <Hsp>\n+ <Hsp_num>1</Hsp_num>\n+ <Hsp_bit-score>1474.75</Hsp_bit-score>\n+ <Hsp_score>798</Hsp_score>\n+ <Hsp_evalue>0</Hsp_evalue>\n+ <Hsp_query-from>1</Hsp_query-from>\n+ <Hsp_query-to>1047</Hsp_query-to>\n+ <Hsp_hit-from>88</Hsp_hit-from>\n+ <Hsp_hit-to>1134</Hsp_hit-to>\n+ <Hsp_query-frame>1</Hsp_query-frame>\n+ <Hsp_hit-frame>1</Hsp_hit-frame>\n+ <Hsp_identity>964</Hsp_identity>\n+ <Hsp_positive>964</Hsp_positive>\n+ <Hsp_gaps>0</Hsp_gaps>\n+ <Hsp_align-len>1047</Hsp_align-len>\n+ <Hsp_qseq>ATGAACGGGACGGAGGGCCCGAACTTCTACGTGCCC'..b'|||||||| |||||||||||||||||| | || || ||||||||||||||||||||||| |||||||| || ||||||||||| | || |||||||||| |||||| || 
||||||||||| || |||||||| ||||| || || || ||||| | |||||||||||||||||| | |||||||||||||| ||||||||||||||||| || ||||||||||||||||| |||||||| ||||||||||||||||||||||||||||||||||| |||||||||||||| || ||||||||||||||||| |||||||| || || ||||| |||| ||||||||| || |||||||| ||||| ||||||||||||| || ||||| |||||||||| | | |||| |||||| ||||| || ||||||||||||||||| || ||||||| ||||||| ||||| |||| || |||||||| |||||||| |||||||||||||||||||| || ||||||||||||||||| |||||||| |||||||| |||||||||||||| || ||||||||||||||||||||||||||||| || |||||| |||||||||| | ||||| ||||||||||||||||| ||||| ||| |||| || |||||||||||||||||||| || ||||||||||||| || | ||| |||| ||||| |||||||| ||||||||||||||||||||||||||||||||| ||||||| ||||||| ||||||||||| || |||||||| |||||||| | |||||||||||||| ||||| ||||| |||||||| ||||||</Hsp_midline>\n+ </Hsp>\n+ </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>0</Statistics_db-num>\n+ <Statistics_db-len>0</Statistics_db-len>\n+ <Statistics_hsp-len>15</Statistics_hsp-len>\n+ <Statistics_eff-space>4933992</Statistics_eff-space>\n+ <Statistics_kappa>0.46</Statistics_kappa>\n+ <Statistics_lambda>1.28</Statistics_lambda>\n+ <Statistics_entropy>0.85</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>16</Iteration_iter-num>\n+ <Iteration_query-ID>Query_6</Iteration_query-ID>\n+ <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n+ <Iteration_query-len>1344</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>0</Statistics_db-num>\n+ <Statistics_db-len>0</Statistics_db-len>\n+ <Statistics_hsp-len>15</Statistics_hsp-len>\n+ <Statistics_eff-space>6353949</Statistics_eff-space>\n+ <Statistics_kappa>0.46</Statistics_kappa>\n+ <Statistics_lambda>1.28</Statistics_lambda>\n+ <Statistics_entropy>0.85</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ 
<Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>17</Iteration_iter-num>\n+ <Iteration_query-ID>Query_6</Iteration_query-ID>\n+ <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n+ <Iteration_query-len>1344</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>0</Statistics_db-num>\n+ <Statistics_db-len>0</Statistics_db-len>\n+ <Statistics_hsp-len>15</Statistics_hsp-len>\n+ <Statistics_eff-space>6353949</Statistics_eff-space>\n+ <Statistics_kappa>0.46</Statistics_kappa>\n+ <Statistics_lambda>1.28</Statistics_lambda>\n+ <Statistics_entropy>0.85</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>18</Iteration_iter-num>\n+ <Iteration_query-ID>Query_6</Iteration_query-ID>\n+ <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n+ <Iteration_query-len>1344</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>0</Statistics_db-num>\n+ <Statistics_db-len>0</Statistics_db-len>\n+ <Statistics_hsp-len>15</Statistics_hsp-len>\n+ <Statistics_eff-space>6353949</Statistics_eff-space>\n+ <Statistics_kappa>0.46</Statistics_kappa>\n+ <Statistics_lambda>1.28</Statistics_lambda>\n+ <Statistics_entropy>0.85</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+</BlastOutput_iterations>\n+</BlastOutput>\n+\n'

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.log
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins_taxid.fasta.log Thu Feb 20 05:39:48 2014 -0500

@@ -0,0 +1,10 @@
+
+
+Building a new DB, current time: 02/10/2014 18:40:09
+New DB name: four_human_proteins_taxid.fasta
+New DB title: Just 4 human proteins
+Sequence type: Protein
+Keep Linkouts: T
+Keep MBits: T
+Maximum file size: 1000000000B
+Adding sequences from FASTA; added 4 sequences in 0.00230002 seconds.

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.phd
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins_taxid.fasta.phd Thu Feb 20 05:39:48 2014 -0500

@@ -0,0 +1,4 @@
+11117184492
+29249033410
+36665887501
+5392473183

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.phi

Binary file test-data/four_human_proteins_taxid.fasta.phi has changed

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.phr

Binary file test-data/four_human_proteins_taxid.fasta.phr has changed

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.pin

Binary file test-data/four_human_proteins_taxid.fasta.pin has changed

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.pog

Binary file test-data/four_human_proteins_taxid.fasta.pog has changed

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.psd
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins_taxid.fasta.psd Thu Feb 20 05:39:48 2014 -0500

@@ -0,0 +1,4 @@
+gnl|bl_ord_id|00
+gnl|bl_ord_id|11
+gnl|bl_ord_id|22
+gnl|bl_ord_id|33

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.psi

Binary file test-data/four_human_proteins_taxid.fasta.psi has changed

diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.psq

Binary file test-data/four_human_proteins_taxid.fasta.psq has changed

diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/README.rst
--- a/tools/ncbi_blast_plus/README.rst Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/README.rst Thu Feb 20 05:39:48 2014 -0500

@@ -25,17 +25,7 @@
(``blastxml``) and protein and nucleotide BLAST databases (``blastdbp`` and
``blastdbn``).

-You must tell Galaxy about any system level BLAST databases using configuration
-files blastdb.loc (nucleotide databases like NT) and blastdb_p.loc (protein
-databases like NR), and blastdb_d.loc (protein domain databases like CDD or
-SMART) which are located in the tool-data/ folder. Sample files are included
-which explain the tab-based format to use.
-
-You can download the NCBI provided databases as tar-balls from here:
-
-* ftp://ftp.ncbi.nlm.nih.gov/blast/db/ (nucleotide and protein databases like NR)
-* ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/ (domain databases like CDD)
-
+See the configuration notes below.

Manual Installation
===================
@@ -78,6 +68,31 @@

     ./run_functional_tests.sh -sid NCBI_BLAST+-ncbi_blast_plus_tools

+Configuration
+=============
+
+You must tell Galaxy about any system level BLAST databases using configuration
+files blastdb.loc (nucleotide databases like NT) and blastdb_p.loc (protein
+databases like NR), and blastdb_d.loc (protein domain databases like CDD or
+SMART) which are located in the tool-data/ folder. Sample files are included
+which explain the tab-based format to use.
+
+You can download the NCBI provided databases as tar-balls from here:
+
+* ftp://ftp.ncbi.nlm.nih.gov/blast/db/ (nucleotide and protein databases like NR)
+* ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/ (domain databases like CDD)
+
+The BLAST+ binaries support multi-threaded operation, which is handled via the
+$GALAXY_SLOTS environment variable. This should be set automatically by Galaxy
+via your job runner settings, which allows you to (for example) allocate four
+cores to each BLAST job.
+
+In addition, the BLAST+ wrappers also support high level parallelism by task
+splitting if ``use_tasked_jobs = True`` is enabled in your ``universe_wsgi.ini``
+configuration file. Essentially, the FASTA input query files are broken up into
+batches of 1000 sequences, a separate BLAST child job is run for each chunk,
+and then the BLAST output files are merged (in order). This is transparent
+for the end user.

History
=======
@@ -105,7 +120,7 @@
           (all too often our users were having to re-run searches just to
           get one of the missing columns like query or subject length)
v0.0.18 - Defensive quoting of filenames in case of spaces (where possible,
-          BLAST+ handling of some mult-file arguments is problematic).
+          BLAST+ handling of some multi-file arguments is problematic).
v0.0.19 - Added wrappers for rpsblast and rpstblastn, and new blastdb_d.loc
           for the domain databases they use (e.g. CDD, PFAM or SMART).
         - Correct case of exception regular expression (for error handling
@@ -139,6 +154,7 @@
         - Tabular output now includes option to pick specific columns
         - BLAST XML to tabular tool supports multiple input files.
         - More detailed descriptions for BLASTN and BLASTP task option
+        - Supports setting a taxonomy ID in makeblastdb wrapper.
======= ======================================================================

diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/blastxml_to_tabular.py
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Thu Feb 20 05:39:48 2014 -0500

[

@@ -66,7 +66,7 @@
from optparse import OptionParser

if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.23"
+    print "v0.1.00"
     sys.exit(0)

if sys.version_info[:2] >= ( 2, 5 ):

diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/blastxml_to_tabular.xml
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml Thu Feb 20 05:39:48 2014 -0500

@@ -28,33 +28,33 @@
           <when value="ext"/>
           <when value="cols">
             <param name="std_cols" type="select" multiple="true" display="checkboxes" label="Standard columns">
-              <option selected="true" value="qseqid">1 qseqid = Query Seq-id (ID of your sequence)</option>
-              <option selected="true" value="sseqid">2 sseqid = Subject Seq-id (ID of the database hit)</option>
-              <option selected="true" value="pident">3 pident = Percentage of identical matches</option>
-              <option selected="true" value="length">4 length = Alignment length</option>
-              <option selected="true" value="mismatch">5 mismatch = Number of mismatches</option>
-              <option selected="true" value="gapopen">6 gapopen = Number of gap openings</option>
-              <option selected="true" value="qstart">7 qstart = Start of alignment in query</option>
-              <option selected="true" value="qend">8 qend = End of alignment in query</option>
-              <option selected="true" value="sstart">9 sstart = Start of alignment in subject (database hit)</option>
-              <option selected="true" value="send">10 send = End of alignment in subject (database hit)</option>
-              <option selected="true" value="evalue">11 evalue = Expectation value (E-value)</option>
-              <option selected="true" value="bitscore">12 bitscore = Bit score</option>
+              <option selected="true" value="qseqid">qseqid = Query Seq-id (ID of your sequence)</option>
+              <option selected="true" value="sseqid">sseqid = Subject Seq-id (ID of the database hit)</option>
+              <option selected="true" value="pident">pident = Percentage of identical matches</option>
+              <option selected="true" value="length">length = Alignment length</option>
+              <option selected="true" value="mismatch">mismatch = Number of mismatches</option>
+              <option selected="true" value="gapopen">gapopen = Number of gap openings</option>
+              <option selected="true" value="qstart">qstart = Start of alignment in query</option>
+              <option selected="true" value="qend">qend = End of alignment in query</option>
+              <option selected="true" value="sstart">sstart = Start of alignment in subject (database hit)</option>
+              <option selected="true" value="send">send = End of alignment in subject (database hit)</option>
+              <option selected="true" value="evalue">evalue = Expectation value (E-value)</option>
+              <option selected="true" value="bitscore">bitscore = Bit score</option>
             </param>
             <param name="ext_cols" type="select" multiple="true" display="checkboxes" label="Extended columns">
-              <option value="sallseqid">13 sallseqid = All subject Seq-id(s), separated by a ';'</option>
-              <option value="score">14 score = Raw score</option>
-              <option value="nident">15 nident = Number of identical matches</option>
-              <option value="positive">16 positive = Number of positive-scoring matches</option>
-              <option value="gaps">17 gaps = Total number of gaps</option>
-              <option value="ppos">18 ppos = Percentage of positive-scoring matches</option>
-              <option value="qframe">19 qframe = Query frame</option>
-              <option value="sframe">20 sframe = Subject frame</option>
-              <option value="qseq">21 qseq = Aligned part of query sequence</option>
-              <option value="sseq">22 sseq = Aligned part of subject sequence</option>
-              <option value="qlen">23 qlen = Query sequence length</option>
-              <option value="slen">24 slen = Subject sequence length</option>
-              <option value="salltitles">25 salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
+              <option value="sallseqid">sallseqid = All subject Seq-id(s), separated by a ';'</option>
+              <option value="score">score = Raw score</option>
+              <option value="nident">nident = Number of identical matches</option>
+              <option value="positive">positive = Number of positive-scoring matches</option>
+              <option value="gaps">gaps = Total number of gaps</option>
+              <option value="ppos">ppos = Percentage of positive-scoring matches</option>
+              <option value="qframe">qframe = Query frame</option>
+              <option value="sframe">sframe = Subject frame</option>
+              <option value="qseq">qseq = Aligned part of query sequence</option>
+              <option value="sseq">sseq = Aligned part of subject sequence</option>
+              <option value="qlen">qlen = Query sequence length</option>
+              <option value="slen">slen = Subject sequence length</option>
+              <option value="salltitles">salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
             </param>
           </when>
         </conditional>

diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml Thu Feb 20 05:39:48 2014 -0500

@@ -83,13 +83,14 @@
<help>
**What it does**

-This tool identifies and masks out low complexity regions of a nucleotide database (or sequences in FASTA format) by using the symmetric DUST algorithm.
+This tool identifies and masks out low complexity regions of a nucleotide database (or sequences in FASTA format) by using the symmetric DUST_ algorithm.

If you select *maskinfo ASN.1* (binary or text) as output format, the output file can be used as masking data for NCBI BLAST+ makeblastdb tool.

More information about dustmasker can be found in the `BLAST Command Line Applications User Manual`_.

.. _BLAST Command Line Applications User Manual: http://www.ncbi.nlm.nih.gov/books/NBK1763/
+.. _DUST: http://www.ncbi.nlm.nih.gov/pubmed/16796549

**References**

diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/ncbi_macros.xml
--- a/tools/ncbi_blast_plus/ncbi_macros.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml Thu Feb 20 05:39:48 2014 -0500

@@ -31,33 +31,33 @@
             <when value="ext"/>
             <when value="cols">
                 <param name="std_cols" type="select" multiple="true" display="checkboxes" label="Standard columns">
-                    <option selected="true" value="qseqid">1 qseqid = Query Seq-id (ID of your sequence)</option>
-                    <option selected="true" value="sseqid">2 sseqid = Subject Seq-id (ID of the database hit)</option>
-                    <option selected="true" value="pident">3 pident = Percentage of identical matches</option>
-                    <option selected="true" value="length">4 length = Alignment length</option>
-                    <option selected="true" value="mismatch">5 mismatch = Number of mismatches</option>
-                    <option selected="true" value="gapopen">6 gapopen = Number of gap openings</option>
-                    <option selected="true" value="qstart">7 qstart = Start of alignment in query</option>
-                    <option selected="true" value="qend">8 qend = End of alignment in query</option>
-                    <option selected="true" value="sstart">9 sstart = Start of alignment in subject (database hit)</option>
-                    <option selected="true" value="send">10 send = End of alignment in subject (database hit)</option>
-                    <option selected="true" value="evalue">11 evalue = Expectation value (E-value)</option>
-                    <option selected="true" value="bitscore">12 bitscore = Bit score</option>
+                    <option selected="true" value="qseqid">qseqid = Query Seq-id (ID of your sequence)</option>
+                    <option selected="true" value="sseqid">sseqid = Subject Seq-id (ID of the database hit)</option>
+                    <option selected="true" value="pident">pident = Percentage of identical matches</option>
+                    <option selected="true" value="length">length = Alignment length</option>
+                    <option selected="true" value="mismatch">mismatch = Number of mismatches</option>
+                    <option selected="true" value="gapopen">gapopen = Number of gap openings</option>
+                    <option selected="true" value="qstart">qstart = Start of alignment in query</option>
+                    <option selected="true" value="qend">qend = End of alignment in query</option>
+                    <option selected="true" value="sstart">sstart = Start of alignment in subject (database hit)</option>
+                    <option selected="true" value="send">send = End of alignment in subject (database hit)</option>
+                    <option selected="true" value="evalue">evalue = Expectation value (E-value)</option>
+                    <option selected="true" value="bitscore">bitscore = Bit score</option>
                 </param>
                 <param name="ext_cols" type="select" multiple="true" display="checkboxes" label="Extended columns">
-                    <option value="sallseqid">13 sallseqid = All subject Seq-id(s), separated by a ';'</option>
-                    <option value="score">14 score = Raw score</option>
-                    <option value="nident">15 nident = Number of identical matches</option>
-                    <option value="positive">16 positive = Number of positive-scoring matches</option>
-                    <option value="gaps">17 gaps = Total number of gaps</option>
-                    <option value="ppos">18 ppos = Percentage of positive-scoring matches</option>
-                    <option value="qframe">19 qframe = Query frame</option>
-                    <option value="sframe">20 sframe = Subject frame</option>
-                    <option value="qseq">21 qseq = Aligned part of query sequence</option>
-                    <option value="sseq">22 sseq = Aligned part of subject sequence</option>
-                    <option value="qlen">23 qlen = Query sequence length</option>
-                    <option value="slen">24 slen = Subject sequence length</option>
-                    <option value="salltitles">25 salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
+                    <option value="sallseqid">sallseqid = All subject Seq-id(s), separated by a ';'</option>
+                    <option value="score">score = Raw score</option>
+                    <option value="nident">nident = Number of identical matches</option>
+                    <option value="positive">positive = Number of positive-scoring matches</option>
+                    <option value="gaps">gaps = Total number of gaps</option>
+                    <option value="ppos">ppos = Percentage of positive-scoring matches</option>
+                    <option value="qframe">qframe = Query frame</option>
+                    <option value="sframe">sframe = Subject frame</option>
+                    <option value="qseq">qseq = Aligned part of query sequence</option>
+                    <option value="sseq">sseq = Aligned part of subject sequence</option>
+                    <option value="qlen">qlen = Query sequence length</option>
+                    <option value="slen">slen = Subject sequence length</option>
+                    <option value="salltitles">salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
                 </param>
                 
             </when>

diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/ncbi_makeblastdb.xml
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Thu Feb 20 05:39:48 2014 -0500

@@ -29,7 +29,10 @@
##Would default to being based on the cryptic Galaxy filenames, which is unhelpful
-title "BLAST Database"
#end if
--dbtype $dbtype
+-dbtype $dbtype
+## --------------------------------------------------------------------
+## Masking
+## --------------------------------------------------------------------
#set $mask_string = ''
#set $sep = '-mask_data '
#for $i in $mask_data
@@ -44,11 +47,15 @@
## #set $sep = ','
## #end for
## $gi_mask_string
-## #if $tax.select == 'id':
-## -taxid $tax.id
-## #else if $tax.select == 'map':
-## -taxid_map $tax.map
-## #end if
+## --------------------------------------------------------------------
+## Taxonomy
+## --------------------------------------------------------------------
+#if $tax.taxselect == 'id':
+-taxid $tax.taxid
+## TODO - Can we use a tabular file for the taxonomy mapping?
+## #else if $tax.taxselect == 'map':
+## -taxid_map $tax.taxmap
+#end if
## --------------------------------------------------------------------
## Capture the stdout log information to the primary file (plain text):
>> "$outfile"
@@ -83,23 +90,25 @@
         -->

         
-        
             </param>
             <when value="">
             </when>
             <when value="id">
-                <param name="id" type="integer" value="" label="NCBI taxonomy ID" help="Integer >=0" />
+                <param name="taxid" type="integer" value="" label="NCBI taxonomy ID" help="Integer >=0, e.g. 9606 for Homo sapiens" min="0" />
             </when>
+            
         </conditional>
-        -->
     </inputs>
     <outputs>
         
@@ -112,6 +121,8 @@
     </outputs>
     <tests>
         
         <test>
             <param name="dbtype" value="prot" />
@@ -130,6 +141,25 @@
                 <extra_files type="file" value="four_human_proteins.fasta.psi" name="blastdb.psi" />
             </output>
         </test>
+        <test>
+            <param name="dbtype" value="prot" />
+            <param name="file" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="title" value="Just 4 human proteins" />
+            <param name="parse_seqids" value="" />
+            <param name="hash_index" value="true" />
+            <param name="taxselect" value="id" />
+            <param name="taxid" value="9606" />
+            <output name="out_file" file="four_human_proteins_taxid.fasta.log" ftype="blastdbp" lines_diff="6">
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.phr" name="blastdb.phr" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.pin" name="blastdb.pin" lines_diff="2" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.psq" name="blastdb.psq" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.pog" name="blastdb.pog" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.phd" name="blastdb.phd" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.phi" name="blastdb.phi" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.psd" name="blastdb.psd" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.psi" name="blastdb.psi" />
+            </output>
+        </test>
     </tests>
     <help>
**What it does**

diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/tool_dependencies.xml
--- a/tools/ncbi_blast_plus/tool_dependencies.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/tool_dependencies.xml Thu Feb 20 05:39:48 2014 -0500

@@ -1,6 +1,6 @@
<?xml version="1.0"?>
<tool_dependency>
     <package name="blast+" version="2.2.29">
-        <repository changeset_revision="61c4017d3bf2" name="package_blast_plus_2_2_29" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="c021862e9ea8" name="package_blast_plus_2_2_29" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" />
     </package>
</tool_dependency>