Repository 'ncbi_blast_plus'
hg clone https://testtoolshed.g2.bx.psu.edu/repos/peterjc/ncbi_blast_plus

Changeset 39:22b7cdcf4960 (2014-02-20)
Previous changeset 38:2f7fac29bb3c (2014-01-15) Next changeset 40:f83e5d79b6ab (2014-02-26)
Commit message:
Uploaded v0.1.0 preview 2, includes missing new test files
modified:
tools/ncbi_blast_plus/README.rst
tools/ncbi_blast_plus/blastxml_to_tabular.py
tools/ncbi_blast_plus/blastxml_to_tabular.xml
tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml
tools/ncbi_blast_plus/ncbi_macros.xml
tools/ncbi_blast_plus/ncbi_makeblastdb.xml
tools/ncbi_blast_plus/tool_dependencies.xml
added:
test-data/blastn_rhodopsin_vs_three_human.columns.tabular
test-data/blastn_rhodopsin_vs_three_human.xml
test-data/four_human_proteins_taxid.fasta.log
test-data/four_human_proteins_taxid.fasta.phd
test-data/four_human_proteins_taxid.fasta.phi
test-data/four_human_proteins_taxid.fasta.phr
test-data/four_human_proteins_taxid.fasta.pin
test-data/four_human_proteins_taxid.fasta.pog
test-data/four_human_proteins_taxid.fasta.psd
test-data/four_human_proteins_taxid.fasta.psi
test-data/four_human_proteins_taxid.fasta.psq
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/blastn_rhodopsin_vs_three_human.columns.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_rhodopsin_vs_three_human.columns.tabular Thu Feb 20 05:39:48 2014 -0500
b
@@ -0,0 +1,7 @@
+gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 92.07 1047 1213
+gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 91.59 4301 1213
+gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 91.36 4301 1213
+gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 94.22 4301 1213
+gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 92.94 4301 1213
+gi|283855822|gb|GQ290312.1| ENA|BC112106|BC112106.1 91.55 983 1213
+gi|18148870|dbj|AB062417.1| ENA|BC112106|BC112106.1 87.50 1047 1213
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/blastn_rhodopsin_vs_three_human.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_rhodopsin_vs_three_human.xml Thu Feb 20 05:39:48 2014 -0500
b
b'@@ -0,0 +1,549 @@\n+<?xml version="1.0"?>\n+<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n+<BlastOutput>\n+  <BlastOutput_program>blastn</BlastOutput_program>\n+  <BlastOutput_version>BLASTN 2.2.29+</BlastOutput_version>\n+  <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), &quot;A greedy algorithm for aligning DNA sequences&quot;, J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>\n+  <BlastOutput_db></BlastOutput_db>\n+  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n+  <BlastOutput_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</BlastOutput_query-def>\n+  <BlastOutput_query-len>1047</BlastOutput_query-len>\n+  <BlastOutput_param>\n+    <Parameters>\n+      <Parameters_expect>1e-40</Parameters_expect>\n+      <Parameters_sc-match>1</Parameters_sc-match>\n+      <Parameters_sc-mismatch>-2</Parameters_sc-mismatch>\n+      <Parameters_gap-open>0</Parameters_gap-open>\n+      <Parameters_gap-extend>0</Parameters_gap-extend>\n+      <Parameters_filter>L;m;</Parameters_filter>\n+    </Parameters>\n+  </BlastOutput_param>\n+<BlastOutput_iterations>\n+<Iteration>\n+  <Iteration_iter-num>1</Iteration_iter-num>\n+  <Iteration_query-ID>Query_1</Iteration_query-ID>\n+  <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n+  <Iteration_query-len>1047</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>15</Statistics_hsp-len>\n+      <Statistics_eff-space>4933992</Statistics_eff-space>\n+      <Statistics_kappa>0.46</Statistics_kappa>\n+      <Statistics_lambda>1.28</Statistics_lambda>\n+      <Statistics_entropy>0.85</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+  
<Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+  <Iteration_iter-num>2</Iteration_iter-num>\n+  <Iteration_query-ID>Query_1</Iteration_query-ID>\n+  <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n+  <Iteration_query-len>1047</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>15</Statistics_hsp-len>\n+      <Statistics_eff-space>4933992</Statistics_eff-space>\n+      <Statistics_kappa>0.46</Statistics_kappa>\n+      <Statistics_lambda>1.28</Statistics_lambda>\n+      <Statistics_entropy>0.85</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+  <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+  <Iteration_iter-num>3</Iteration_iter-num>\n+  <Iteration_query-ID>Query_1</Iteration_query-ID>\n+  <Iteration_query-def>gi|57163782|ref|NM_001009242.1| Felis catus rhodopsin (RHO), mRNA</Iteration_query-def>\n+  <Iteration_query-len>1047</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+  <Hit_num>1</Hit_num>\n+  <Hit_id>Subject_3</Hit_id>\n+  <Hit_def>ENA|BC112106|BC112106.1 Homo sapiens rhodopsin, mRNA (cDNA clone MGC:138311 IMAGE:8327574), complete cds</Hit_def>\n+  <Hit_accession>Subject_3</Hit_accession>\n+  <Hit_len>1213</Hit_len>\n+  <Hit_hsps>\n+    <Hsp>\n+      <Hsp_num>1</Hsp_num>\n+      <Hsp_bit-score>1474.75</Hsp_bit-score>\n+      <Hsp_score>798</Hsp_score>\n+      <Hsp_evalue>0</Hsp_evalue>\n+      <Hsp_query-from>1</Hsp_query-from>\n+      <Hsp_query-to>1047</Hsp_query-to>\n+      <Hsp_hit-from>88</Hsp_hit-from>\n+      <Hsp_hit-to>1134</Hsp_hit-to>\n+      <Hsp_query-frame>1</Hsp_query-frame>\n+      <Hsp_hit-frame>1</Hsp_hit-frame>\n+      <Hsp_identity>964</Hsp_identity>\n+      <Hsp_positive>964</Hsp_positive>\n+      
<Hsp_gaps>0</Hsp_gaps>\n+      <Hsp_align-len>1047</Hsp_align-len>\n+      <Hsp_qseq>ATGAACGGGACGGAGGGCCCGAACTTCTACGTGCCC'..b'|||||||| |||||||||||||||||| | || || ||||||||||||||||||||||| |||||||| || |||||||||||  | || |||||||||| |||||| || ||||||||||| || |||||||| ||||| || || || |||||  | |||||||||||||||||| | |||||||||||||| ||||||||||||||||| || ||||||||||||||||| |||||||| ||||||||||||||||||||||||||||||||||| |||||||||||||| || ||||||||||||||||| |||||||| || || ||||| |||| ||||||||| || |||||||| |||||  ||||||||||||| || ||||| |||||||||| | |   ||||  |||||| ||||| || ||||||||||||||||| || ||||||| ||||||| ||||| |||| || |||||||| |||||||| |||||||||||||||||||| || ||||||||||||||||| |||||||| |||||||| |||||||||||||| || ||||||||||||||||||||||||||||| || |||||| ||||||||||  | ||||| ||||||||||||||||| ||||| ||| |||| || |||||||||||||||||||| || ||||||||||||| || | ||| |||| ||||| |||||||| ||||||||||||||||||||||||||||||||| |||||||  ||||||| ||||||||||| || |||||||| ||||||||  | |||||||||||||| ||||| ||||| |||||||| ||||||</Hsp_midline>\n+    </Hsp>\n+  </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>15</Statistics_hsp-len>\n+      <Statistics_eff-space>4933992</Statistics_eff-space>\n+      <Statistics_kappa>0.46</Statistics_kappa>\n+      <Statistics_lambda>1.28</Statistics_lambda>\n+      <Statistics_entropy>0.85</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+</Iteration>\n+<Iteration>\n+  <Iteration_iter-num>16</Iteration_iter-num>\n+  <Iteration_query-ID>Query_6</Iteration_query-ID>\n+  <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n+  <Iteration_query-len>1344</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      
<Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>15</Statistics_hsp-len>\n+      <Statistics_eff-space>6353949</Statistics_eff-space>\n+      <Statistics_kappa>0.46</Statistics_kappa>\n+      <Statistics_lambda>1.28</Statistics_lambda>\n+      <Statistics_entropy>0.85</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+  <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+  <Iteration_iter-num>17</Iteration_iter-num>\n+  <Iteration_query-ID>Query_6</Iteration_query-ID>\n+  <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n+  <Iteration_query-len>1344</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>15</Statistics_hsp-len>\n+      <Statistics_eff-space>6353949</Statistics_eff-space>\n+      <Statistics_kappa>0.46</Statistics_kappa>\n+      <Statistics_lambda>1.28</Statistics_lambda>\n+      <Statistics_entropy>0.85</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+  <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+  <Iteration_iter-num>18</Iteration_iter-num>\n+  <Iteration_query-ID>Query_6</Iteration_query-ID>\n+  <Iteration_query-def>gi|12583664|dbj|AB043817.1| Conger myriaster conf gene for fresh water form rod opsin, complete cds</Iteration_query-def>\n+  <Iteration_query-len>1344</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+  <Iteration_stat>\n+    <Statistics>\n+      <Statistics_db-num>0</Statistics_db-num>\n+      <Statistics_db-len>0</Statistics_db-len>\n+      <Statistics_hsp-len>15</Statistics_hsp-len>\n+      <Statistics_eff-space>6353949</Statistics_eff-space>\n+      <Statistics_kappa>0.46</Statistics_kappa>\n+      
<Statistics_lambda>1.28</Statistics_lambda>\n+      <Statistics_entropy>0.85</Statistics_entropy>\n+    </Statistics>\n+  </Iteration_stat>\n+  <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+</BlastOutput_iterations>\n+</BlastOutput>\n+\n'
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.log
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins_taxid.fasta.log Thu Feb 20 05:39:48 2014 -0500
b
@@ -0,0 +1,10 @@
+
+
+Building a new DB, current time: 02/10/2014 18:40:09
+New DB name:   four_human_proteins_taxid.fasta
+New DB title:  Just 4 human proteins
+Sequence type: Protein
+Keep Linkouts: T
+Keep MBits: T
+Maximum file size: 1000000000B
+Adding sequences from FASTA; added 4 sequences in 0.00230002 seconds.
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.phd
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins_taxid.fasta.phd Thu Feb 20 05:39:48 2014 -0500
b
@@ -0,0 +1,4 @@
+11117184492
+29249033410
+36665887501
+5392473183
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.phi
b
Binary file test-data/four_human_proteins_taxid.fasta.phi has changed
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.phr
b
Binary file test-data/four_human_proteins_taxid.fasta.phr has changed
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.pin
b
Binary file test-data/four_human_proteins_taxid.fasta.pin has changed
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.pog
b
Binary file test-data/four_human_proteins_taxid.fasta.pog has changed
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.psd
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins_taxid.fasta.psd Thu Feb 20 05:39:48 2014 -0500
b
@@ -0,0 +1,4 @@
+gnl|bl_ord_id|00
+gnl|bl_ord_id|11
+gnl|bl_ord_id|22
+gnl|bl_ord_id|33
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.psi
b
Binary file test-data/four_human_proteins_taxid.fasta.psi has changed
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 test-data/four_human_proteins_taxid.fasta.psq
b
Binary file test-data/four_human_proteins_taxid.fasta.psq has changed
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/README.rst
--- a/tools/ncbi_blast_plus/README.rst Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/README.rst Thu Feb 20 05:39:48 2014 -0500
b
@@ -25,17 +25,7 @@
 (``blastxml``) and protein and nucleotide BLAST databases (``blastdbp`` and
 ``blastdbn``).
 
-You must tell Galaxy about any system level BLAST databases using configuration
-files blastdb.loc (nucleotide databases like NT) and blastdb_p.loc (protein
-databases like NR), and blastdb_d.loc (protein domain databases like CDD or
-SMART) which are located in the tool-data/ folder. Sample files are included
-which explain the tab-based format to use.
-
-You can download the NCBI provided databases as tar-balls from here:
-
-* ftp://ftp.ncbi.nlm.nih.gov/blast/db/ (nucleotide and protein databases like NR)
-* ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/ (domain databases like CDD)
-
+See the configuration notes below.
 
 Manual Installation
 ===================
@@ -78,6 +68,31 @@
 
     ./run_functional_tests.sh -sid NCBI_BLAST+-ncbi_blast_plus_tools
 
+Configuration
+=============
+
+You must tell Galaxy about any system level BLAST databases using configuration
+files blastdb.loc (nucleotide databases like NT) and blastdb_p.loc (protein
+databases like NR), and blastdb_d.loc (protein domain databases like CDD or
+SMART) which are located in the tool-data/ folder. Sample files are included
+which explain the tab-based format to use.
+
+You can download the NCBI provided databases as tar-balls from here:
+
+* ftp://ftp.ncbi.nlm.nih.gov/blast/db/ (nucleotide and protein databases like NR)
+* ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/ (domain databases like CDD)
+
+The BLAST+ binaries support multi-threaded operation, which is handled via the
+$GALAXY_SLOTS environment variable. This should be set automatically by Galaxy
+via your job runner settings, which allows you to (for example) allocate four
+cores to each BLAST job.
+
+In addition, the BLAST+ wrappers also support high level parallelism by task
+splitting if ``use_tasked_jobs = True`` is enabled in your ``universe_wsgi.ini``
+configuration file. Essentially, the FASTA input query files are broken up into
+batches of 1000 sequences, a separate BLAST child job is run for each chunk,
+and then the BLAST output files are merged (in order). This is transparent
+for the end user.
 
 History
 =======
@@ -105,7 +120,7 @@
           (all too often our users were having to re-run searches just to
           get one of the missing columns like query or subject length)
 v0.0.18 - Defensive quoting of filenames in case of spaces (where possible,
-          BLAST+ handling of some mult-file arguments is problematic).
+          BLAST+ handling of some multi-file arguments is problematic).
 v0.0.19 - Added wrappers for rpsblast and rpstblastn, and new blastdb_d.loc
           for the domain databases they use (e.g. CDD, PFAM or SMART).
         - Correct case of exception regular expression (for error handling
@@ -139,6 +154,7 @@
         - Tabular output now includes option to pick specific columns
         - BLAST XML to tabular tool supports multiple input files.
         - More detailed descriptions for BLASTN and BLASTP task option
+        - Supports setting a taxonomy ID in makeblastdb wrapper.
 ======= ======================================================================
 
 
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/blastxml_to_tabular.py
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Thu Feb 20 05:39:48 2014 -0500
[
@@ -66,7 +66,7 @@
 from optparse import OptionParser
 
 if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.23"
+    print "v0.1.00"
     sys.exit(0)
 
 if sys.version_info[:2] >= ( 2, 5 ):
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/blastxml_to_tabular.xml
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml Thu Feb 20 05:39:48 2014 -0500
b
@@ -28,33 +28,33 @@
           <when value="ext"/>
           <when value="cols">
             <param name="std_cols" type="select" multiple="true" display="checkboxes" label="Standard columns">
-              <option selected="true" value="qseqid">1 qseqid = Query Seq-id (ID of your sequence)</option>
-              <option selected="true" value="sseqid">2 sseqid = Subject Seq-id (ID of the database hit)</option>
-              <option selected="true" value="pident">3 pident = Percentage of identical matches</option>
-              <option selected="true" value="length">4 length = Alignment length</option>
-              <option selected="true" value="mismatch">5 mismatch = Number of mismatches</option>
-              <option selected="true" value="gapopen">6 gapopen = Number of gap openings</option>
-              <option selected="true" value="qstart">7 qstart = Start of alignment in query</option>
-              <option selected="true" value="qend">8 qend = End of alignment in query</option>
-              <option selected="true" value="sstart">9 sstart = Start of alignment in subject (database hit)</option>
-              <option selected="true" value="send">10 send = End of alignment in subject (database hit)</option>
-              <option selected="true" value="evalue">11 evalue = Expectation value (E-value)</option>
-              <option selected="true" value="bitscore">12 bitscore = Bit score</option>
+              <option selected="true" value="qseqid">qseqid = Query Seq-id (ID of your sequence)</option>
+              <option selected="true" value="sseqid">sseqid = Subject Seq-id (ID of the database hit)</option>
+              <option selected="true" value="pident">pident = Percentage of identical matches</option>
+              <option selected="true" value="length">length = Alignment length</option>
+              <option selected="true" value="mismatch">mismatch = Number of mismatches</option>
+              <option selected="true" value="gapopen">gapopen = Number of gap openings</option>
+              <option selected="true" value="qstart">qstart = Start of alignment in query</option>
+              <option selected="true" value="qend">qend = End of alignment in query</option>
+              <option selected="true" value="sstart">sstart = Start of alignment in subject (database hit)</option>
+              <option selected="true" value="send">send = End of alignment in subject (database hit)</option>
+              <option selected="true" value="evalue">evalue = Expectation value (E-value)</option>
+              <option selected="true" value="bitscore">bitscore = Bit score</option>
             </param>
             <param name="ext_cols" type="select" multiple="true" display="checkboxes" label="Extended columns">
-              <option value="sallseqid">13 sallseqid = All subject Seq-id(s), separated by a ';'</option>
-              <option value="score">14 score = Raw score</option>
-              <option value="nident">15 nident = Number of identical matches</option>
-              <option value="positive">16 positive = Number of positive-scoring matches</option>
-              <option value="gaps">17 gaps = Total number of gaps</option>
-              <option value="ppos">18 ppos = Percentage of positive-scoring matches</option>
-              <option value="qframe">19 qframe = Query frame</option>
-              <option value="sframe">20 sframe = Subject frame</option>
-              <option value="qseq">21 qseq = Aligned part of query sequence</option>
-              <option value="sseq">22 sseq = Aligned part of subject sequence</option>
-              <option value="qlen">23 qlen = Query sequence length</option>
-              <option value="slen">24 slen = Subject sequence length</option>
-              <option value="salltitles">25 salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
+              <option value="sallseqid">sallseqid = All subject Seq-id(s), separated by a ';'</option>
+              <option value="score">score = Raw score</option>
+              <option value="nident">nident = Number of identical matches</option>
+              <option value="positive">positive = Number of positive-scoring matches</option>
+              <option value="gaps">gaps = Total number of gaps</option>
+              <option value="ppos">ppos = Percentage of positive-scoring matches</option>
+              <option value="qframe">qframe = Query frame</option>
+              <option value="sframe">sframe = Subject frame</option>
+              <option value="qseq">qseq = Aligned part of query sequence</option>
+              <option value="sseq">sseq = Aligned part of subject sequence</option>
+              <option value="qlen">qlen = Query sequence length</option>
+              <option value="slen">slen = Subject sequence length</option>
+              <option value="salltitles">salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
             </param>
           </when>
         </conditional>
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml Thu Feb 20 05:39:48 2014 -0500
b
@@ -83,13 +83,14 @@
     <help>
 **What it does**
 
-This tool identifies and masks out low complexity regions of a nucleotide database (or sequences in FASTA format) by using the symmetric DUST algorithm.
+This tool identifies and masks out low complexity regions of a nucleotide database (or sequences in FASTA format) by using the symmetric DUST_ algorithm.
 
 If you select *maskinfo ASN.1* (binary or text) as output format, the output file can be used as masking data for NCBI BLAST+ makeblastdb tool.
 
 More information about dustmasker can be found in the `BLAST Command Line Applications User Manual`_.
 
 .. _BLAST Command Line Applications User Manual: http://www.ncbi.nlm.nih.gov/books/NBK1763/
+.. _DUST: http://www.ncbi.nlm.nih.gov/pubmed/16796549
 
 **References**
 
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/ncbi_macros.xml
--- a/tools/ncbi_blast_plus/ncbi_macros.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml Thu Feb 20 05:39:48 2014 -0500
b
@@ -31,33 +31,33 @@
             <when value="ext"/>
             <when value="cols">
                 <param name="std_cols" type="select" multiple="true" display="checkboxes" label="Standard columns">
-                    <option selected="true" value="qseqid">1 qseqid = Query Seq-id (ID of your sequence)</option>
-                    <option selected="true" value="sseqid">2 sseqid = Subject Seq-id (ID of the database hit)</option>
-                    <option selected="true" value="pident">3 pident = Percentage of identical matches</option>
-                    <option selected="true" value="length">4 length = Alignment length</option>
-                    <option selected="true" value="mismatch">5 mismatch = Number of mismatches</option>
-                    <option selected="true" value="gapopen">6 gapopen = Number of gap openings</option>
-                    <option selected="true" value="qstart">7 qstart = Start of alignment in query</option>
-                    <option selected="true" value="qend">8 qend = End of alignment in query</option>
-                    <option selected="true" value="sstart">9 sstart = Start of alignment in subject (database hit)</option>
-                    <option selected="true" value="send">10 send = End of alignment in subject (database hit)</option>
-                    <option selected="true" value="evalue">11 evalue = Expectation value (E-value)</option>
-                    <option selected="true" value="bitscore">12 bitscore = Bit score</option>
+                    <option selected="true" value="qseqid">qseqid = Query Seq-id (ID of your sequence)</option>
+                    <option selected="true" value="sseqid">sseqid = Subject Seq-id (ID of the database hit)</option>
+                    <option selected="true" value="pident">pident = Percentage of identical matches</option>
+                    <option selected="true" value="length">length = Alignment length</option>
+                    <option selected="true" value="mismatch">mismatch = Number of mismatches</option>
+                    <option selected="true" value="gapopen">gapopen = Number of gap openings</option>
+                    <option selected="true" value="qstart">qstart = Start of alignment in query</option>
+                    <option selected="true" value="qend">qend = End of alignment in query</option>
+                    <option selected="true" value="sstart">sstart = Start of alignment in subject (database hit)</option>
+                    <option selected="true" value="send">send = End of alignment in subject (database hit)</option>
+                    <option selected="true" value="evalue">evalue = Expectation value (E-value)</option>
+                    <option selected="true" value="bitscore">bitscore = Bit score</option>
                 </param>
                 <param name="ext_cols" type="select" multiple="true" display="checkboxes" label="Extended columns">
-                    <option value="sallseqid">13 sallseqid = All subject Seq-id(s), separated by a ';'</option>
-                    <option value="score">14 score = Raw score</option>
-                    <option value="nident">15 nident = Number of identical matches</option>
-                    <option value="positive">16 positive = Number of positive-scoring matches</option>
-                    <option value="gaps">17 gaps = Total number of gaps</option>
-                    <option value="ppos">18 ppos = Percentage of positive-scoring matches</option>
-                    <option value="qframe">19 qframe = Query frame</option>
-                    <option value="sframe">20 sframe = Subject frame</option>
-                    <option value="qseq">21 qseq = Aligned part of query sequence</option>
-                    <option value="sseq">22 sseq = Aligned part of subject sequence</option>
-                    <option value="qlen">23 qlen = Query sequence length</option>
-                    <option value="slen">24 slen = Subject sequence length</option>
-                    <option value="salltitles">25 salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
+                    <option value="sallseqid">sallseqid = All subject Seq-id(s), separated by a ';'</option>
+                    <option value="score">score = Raw score</option>
+                    <option value="nident">nident = Number of identical matches</option>
+                    <option value="positive">positive = Number of positive-scoring matches</option>
+                    <option value="gaps">gaps = Total number of gaps</option>
+                    <option value="ppos">ppos = Percentage of positive-scoring matches</option>
+                    <option value="qframe">qframe = Query frame</option>
+                    <option value="sframe">sframe = Subject frame</option>
+                    <option value="qseq">qseq = Aligned part of query sequence</option>
+                    <option value="sseq">sseq = Aligned part of subject sequence</option>
+                    <option value="qlen">qlen = Query sequence length</option>
+                    <option value="slen">slen = Subject sequence length</option>
+                    <option value="salltitles">salltitles = All subject title(s), separated by a '&lt;&gt;'</option>
                 </param>
                 <!-- TODO, the other columns, like taxonomy -->
             </when>
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/ncbi_makeblastdb.xml
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Thu Feb 20 05:39:48 2014 -0500
b
@@ -29,7 +29,10 @@
 ##Would default to being based on the cryptic Galaxy filenames, which is unhelpful
 -title "BLAST Database"
 #end if
--dbtype $dbtype 
+-dbtype $dbtype
+## --------------------------------------------------------------------
+## Masking
+## --------------------------------------------------------------------
 #set $mask_string = ''
 #set $sep = '-mask_data '
 #for $i in $mask_data
@@ -44,11 +47,15 @@
 ## #set $sep = ','
 ## #end for
 ## $gi_mask_string
-## #if $tax.select == 'id':
-## -taxid $tax.id
-## #else if $tax.select == 'map':
-## -taxid_map $tax.map
-## #end if
+## --------------------------------------------------------------------
+## Taxonomy
+## --------------------------------------------------------------------
+#if $tax.taxselect == 'id':
+-taxid $tax.taxid
+## TODO - Can we use a tabular file for the taxonomy mapping?
+## #else if $tax.taxselect == 'map':
+## -taxid_map $tax.taxmap
+#end if
 ## --------------------------------------------------------------------
 ## Capture the stdout log information to the primary file (plain text):
 &gt;&gt; "$outfile"
@@ -83,23 +90,25 @@
         -->
 
         <!-- TAXONOMY OPTIONS -->
-        <!-- TODO
         <conditional name="tax">
-            <param name="select" type="select" label="Taxonomy options">
-                <option value="">Do not assign sequences to Taxonomy IDs</option>
-                <option value="id">Assign all sequences to one Taxonomy ID</option>
+            <param name="taxselect" type="select" label="Taxonomy options">
+                <option value="">Do not assign a Taxonomy ID to the sequences</option>
+                <option value="id">Assign the same Taxonomy ID to all the sequences</option>
+                <!--
                 <option value="map">Supply text file mapping sequence IDs to taxnomy IDs</option>
+                -->
             </param>
             <when value="">
             </when>
             <when value="id">
-                <param name="id" type="integer" value="" label="NCBI taxonomy ID" help="Integer &gt;=0" />
+                <param name="taxid" type="integer" value="" label="NCBI taxonomy ID" help="Integer &gt;=0, e.g. 9606 for Homo sapiens" min="0" />
             </when>
+            <!-- TODO: File format?
             <when value="map">
-                <param name="file" type="data" format="txt" label="Seq ID : Tax ID mapping file" help="Format: SequenceId TaxonomyId" />
+                <param name="taxmap" type="data" format="txt" label="Seq ID : Tax ID mapping file" help="Format: SequenceId TaxonomyId" />
             </when>
+            -->
         </conditional>
-        -->
     </inputs>
     <outputs>
         <!-- If we only accepted one FASTA file, we could use its human name here... -->
@@ -112,6 +121,8 @@
     </outputs>
     <tests>
         <!-- Note the (two line) PIN file is not reproducible run to run.
+             Likewise there is a datestamp in the log file as well.
+             With and without the taxid the only real difference is in the *.phr file.
         -->
         <test>
             <param name="dbtype" value="prot" />
@@ -130,6 +141,25 @@
                 <extra_files type="file" value="four_human_proteins.fasta.psi" name="blastdb.psi" />
             </output>
         </test>
+        <test>
+            <param name="dbtype" value="prot" />
+            <param name="file" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="title" value="Just 4 human proteins" />
+            <param name="parse_seqids" value="" />
+            <param name="hash_index" value="true" />
+            <param name="taxselect" value="id" />
+            <param name="taxid" value="9606" />
+            <output name="out_file" file="four_human_proteins_taxid.fasta.log" ftype="blastdbp" lines_diff="6">
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.phr" name="blastdb.phr" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.pin" name="blastdb.pin" lines_diff="2" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.psq" name="blastdb.psq" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.pog" name="blastdb.pog" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.phd" name="blastdb.phd" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.phi" name="blastdb.phi" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.psd" name="blastdb.psd" />
+                <extra_files type="file" value="four_human_proteins_taxid.fasta.psi" name="blastdb.psi" />
+            </output>
+        </test>
     </tests>
     <help>
 **What it does**
b
diff -r 2f7fac29bb3c -r 22b7cdcf4960 tools/ncbi_blast_plus/tool_dependencies.xml
--- a/tools/ncbi_blast_plus/tool_dependencies.xml Wed Jan 15 05:38:14 2014 -0500
+++ b/tools/ncbi_blast_plus/tool_dependencies.xml Thu Feb 20 05:39:48 2014 -0500
b
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
     <package name="blast+" version="2.2.29">
-        <repository changeset_revision="61c4017d3bf2" name="package_blast_plus_2_2_29" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="c021862e9ea8" name="package_blast_plus_2_2_29" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>