changeset 33:42e6f199d11f draft

v0.3.0 Updated for NCBI BLAST+ 2.7.1
author peterjc
date Sat, 30 Jun 2018 15:13:38 -0400
parents 360352490a06
children 8f82e05831dc
files test-data/chimera.fasta.gz test-data/rhodopsin_nucs.fasta.gz test-data/three_human_mRNA.fasta.gz tools/ncbi_blast_plus/README.rst tools/ncbi_blast_plus/check_no_duplicates.py tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml tools/ncbi_blast_plus/ncbi_macros.xml tools/ncbi_blast_plus/ncbi_makeblastdb.xml
diffstat 8 files changed, 134 insertions(+), 86 deletions(-) [+]
line wrap: on
line diff
Binary file test-data/chimera.fasta.gz has changed
Binary file test-data/rhodopsin_nucs.fasta.gz has changed
Binary file test-data/three_human_mRNA.fasta.gz has changed
--- a/tools/ncbi_blast_plus/README.rst	Tue Jun 05 11:42:10 2018 -0400
+++ b/tools/ncbi_blast_plus/README.rst	Sat Jun 30 15:13:38 2018 -0400
@@ -1,10 +1,9 @@
 Galaxy wrappers for NCBI BLAST+ suite
 =====================================
 
-These wrappers are copyright 2010-2017 by Peter Cock (The James Hutton Institute,
-UK) and additional contributors including Edward Kirton, John Chilton,
-Nicola Soranzo, Jim Johnson, Bjoern Gruening, and Caleb Easterly.
-
+These wrappers are copyright 2010-2018 by Peter Cock (James Hutton Institute,
+UK) and additional contributors including Edward Kirton, John Chilton, Nicola
+Soranzo, Jim Johnson, Bjoern Gruening, Caleb Easterly, and Anton Nekrutenko.
 See the licence text below.
 
 Note this does not work with the NCBI 'legacy' BLAST suite written in C
@@ -259,6 +258,7 @@
         - Depends on BioConda or legacy ToolShed ``package_blast_plus_2_7_1``.
         - Document the BLAST+ 2.6.0 change in the standard 12 column output
           from ``qacc,sacc,...`` to ``qaccver,saccver,...`` instead.
+        - Accept gzipped FASTA inputs (contribution from Anton Nekrutenko).
 ======= ======================================================================
 
 
--- a/tools/ncbi_blast_plus/check_no_duplicates.py	Tue Jun 05 11:42:10 2018 -0400
+++ b/tools/ncbi_blast_plus/check_no_duplicates.py	Sat Jun 30 15:13:38 2018 -0400
@@ -9,10 +9,11 @@
 will return a non-zero error if any duplicate identifiers
 are found.
 """
-
+import gzip
 import os
 import sys
 
+
 if "-v" in sys.argv or "--version" in sys.argv:
     print("v0.0.23")
     sys.exit(0)
@@ -24,7 +25,19 @@
         sys.stderr.write("Missing FASTA file %r\n" % filename)
         sys.exit(2)
     files += 1
-    handle = open(filename)
+
+    with open(filename, "rb") as binary_handle:
+        magic = binary_handle.read(2)
+    if not magic:
+        # Empty file, special case
+        continue
+    elif magic == b'\x1f\x8b':
+        # Gzipped
+        handle = gzip.open(filename, "rt")
+    elif magic[0:1] == b">":
+        # Not gzipped, shoudl be plain FASTA
+        handle = open(filename, "r")
+
     for line in handle:
         if line.startswith(">"):
             # The split will also take care of the new line character,
--- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml	Tue Jun 05 11:42:10 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml	Sat Jun 30 15:13:38 2018 -0400
@@ -7,24 +7,29 @@
     <expand macro="parallelism" />
     <expand macro="preamble" />
     <command detect_errors="aggressive">
+<![CDATA[
 ## The command is a Cheetah template which allows some Python based syntax.
 ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
 blastn
--query '$query'
+#if $query.is_of_type('fasta.gz'):
+-query <(gunzip -c '${query}')
+#else:
+-query '${query}'
+#end if
 @BLAST_DB_SUBJECT@
--task $blast_type
--evalue $evalue_cutoff
+-task '${blast_type}'
+-evalue '${evalue_cutoff}'
 @BLAST_OUTPUT@
 @THREADS@
 #if $adv_opts.adv_opts_selector=="advanced":
-$adv_opts.strand
+${adv_opts.strand}
 @ADV_FILTER_QUERY@
 @ADV_MAX_HITS@
 @ADV_WORD_SIZE@
 #if (str($adv_opts.identity_cutoff) and float(str($adv_opts.identity_cutoff)) > 0 ):
--perc_identity $adv_opts.identity_cutoff
+-perc_identity '${adv_opts.identity_cutoff}'
 #end if
-$adv_opts.ungapped
+${adv_opts.ungapped}
 @ADV_ID_LIST_FILTER@
 @ADV_QCOV_HSP_PERC@
 ## only use window size if dc-megablast mode is used
@@ -35,9 +40,10 @@
 @ADV_GAPEXTEND@
 ## End of advanced options:
 #end if
+]]>
     </command>
     <inputs>
-        <param argument="-query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
+        <param argument="-query" type="data" format="fasta,fasta.gz" label="Nucleotide query sequence(s)"/>
         <expand macro="input_conditional_nucleotide_db" />
         <param name="blast_type" argument="-task" type="select" display="radio" label="Type of BLAST">
             <option value="megablast">megablast - Traditional megablast used to find very similar (e.g., intraspecies or closely related species) sequences</option>
@@ -102,6 +108,16 @@
         <test>
             <param name="query" value="rhodopsin_nucs.fasta" ftype="fasta" />
             <param name="db_opts_selector" value="file" />
+            <param name="subject" value="three_human_mRNA.fasta.gz" ftype="fasta.gz" />
+            <param name="database" value="" />
+            <param name="evalue_cutoff" value="1e-40" />
+            <param name="out_format" value="6" />
+            <param name="adv_opts_selector" value="basic" />
+            <output name="output1" file="blastn_rhodopsin_vs_three_human.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="query" value="rhodopsin_nucs.fasta" ftype="fasta" />
+            <param name="db_opts_selector" value="file" />
             <param name="subject" value="three_human_mRNA.fasta" ftype="fasta" />
             <param name="database" value="" />
             <param name="evalue_cutoff" value="1e-40" />
--- a/tools/ncbi_blast_plus/ncbi_macros.xml	Tue Jun 05 11:42:10 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml	Sat Jun 30 15:13:38 2018 -0400
@@ -357,7 +357,7 @@
             <when value="file">
                 <param name="database" type="hidden" value="" />
                 <param name="histdb" type="hidden" value="" />
-                <param argument="-subject" type="data" format="fasta" label="Nucleotide FASTA subject file to use instead of a database"/>
+                <param argument="-subject" type="data" format="fasta,fasta.gz" label="Nucleotide FASTA subject file to use instead of a database"/>
             </when>
         </conditional>
     </xml>
@@ -533,42 +533,46 @@
         </conditional>
     </xml>
 <!--Tokens-->
-    <token name="@ADV_MATRIX_GAPCOSTS@">
+    <token name="@ADV_MATRIX_GAPCOSTS@"><![CDATA[
 #if str($adv_opts.matrix_gapcosts.matrix):
-    -matrix $adv_opts.matrix_gapcosts.matrix
-    $adv_opts.matrix_gapcosts.gap_costs
+    -matrix '${adv_opts.matrix_gapcosts.matrix}'
+    ${adv_opts.matrix_gapcosts.gap_costs}
 #end if
-    </token>
+    ]]></token>
 
-    <token name="@ADV_QCOV_HSP_PERC@">
-#if float(str($adv_opts.qcov_hsp_perc)) &gt; 0:
-    -qcov_hsp_perc $adv_opts.qcov_hsp_perc
+    <token name="@ADV_QCOV_HSP_PERC@"><![CDATA[
+#if float(str($adv_opts.qcov_hsp_perc)) > 0:
+    -qcov_hsp_perc '${adv_opts.qcov_hsp_perc}'
 #end if
-    </token>
+    ]]></token>
 
-    <token name="@ADV_ID_LIST_FILTER@">
+    <token name="@ADV_ID_LIST_FILTER@"><![CDATA[
 #if $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == 'negative_gilist':
-    -negative_gilist $adv_opts.adv_optional_id_files_opts.negative_gilist
+    -negative_gilist '${adv_opts.adv_optional_id_files_opts.negative_gilist}'
 #elif $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == 'gilist':
-    -gilist $adv_opts.adv_optional_id_files_opts.gilist
+    -gilist '{$adv_opts.adv_optional_id_files_opts.gilist}'
 #elif $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == 'seqidlist':
-    -seqidlist $adv_opts.adv_optional_id_files_opts.seqidlist
+    -seqidlist '${adv_opts.adv_optional_id_files_opts.seqidlist}'
 #end if
-    </token>
+    ]]></token>
 
     <token name="@THREADS@">-num_threads "\${GALAXY_SLOTS:-8}"</token>
 
-    <token name="@BLAST_DB_SUBJECT@">
+    <token name="@BLAST_DB_SUBJECT@"><![CDATA[
 #if $db_opts.db_opts_selector == "db":
   -db '${" ".join(str($db_opts.database.fields.path).split(","))}'
 #elif $db_opts.db_opts_selector == "histdb":
   -db '${os.path.join($db_opts.histdb.extra_files_path, "blastdb")}'
 #else:
-  -subject '$db_opts.subject'
+    #if $db_opts.subject.is_of_type('fasta.gz'):
+        -subject <(gunzip -c '${$db_opts.subject}')
+    #else:
+        -subject '${db_opts.subject}'
+    #end if
 #end if
-    </token>
+    ]]></token>
 
-    <token name="@BLAST_OUTPUT@">-out '$output1'
+    <token name="@BLAST_OUTPUT@"><![CDATA[ -out '$output1'
 ##Set the extended list here so when we add things, saved workflows are not affected
 #if str($output.out_format)=="ext":
     -outfmt '6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen salltitles'
@@ -579,79 +583,81 @@
     -outfmt '6 $cols'
 #else:
 ## Note do not quote this as can be '0 -html' which is really two arguments
-    -outfmt $output.out_format
+    -outfmt ${output.out_format}
 #end if
-    </token>
+    ]]></token>
     <token name="@ADV_FILTER_QUERY@">$adv_opts.filter_query</token>
-    <token name="@ADV_MAX_HITS@">
+    <token name="@ADV_MAX_HITS@"><![CDATA[
 ## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
 ## Note -max_target_seqs used to simply override -num_descriptions and -num_alignments
 ## but this was changed in BLAST+ 2.2.27 onwards to force their use (raised with NCBI)
 #if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
     #if str($output.out_format) in ["6", "ext", "cols", "5"]:
         ## Most output formats use this, including tabular and XML:
-        -max_target_seqs $adv_opts.max_hits
+        -max_target_seqs '${adv_opts.max_hits}'
     #else
         ## Text and HTML output formats 0-4 currently need this instead:
         -num_descriptions $adv_opts.max_hits -num_alignments $adv_opts.max_hits
     #end if
 #end if
 #if str($adv_opts.max_hsps)
-    -max_hsps $adv_opts.max_hsps
+    -max_hsps '${adv_opts.max_hsps}'
 #end if
-    </token>
-    <token name="@ADV_WORD_SIZE@">
+    ]]></token>
+    <token name="@ADV_WORD_SIZE@"><![CDATA[
 #if str($adv_opts.word_size):
--word_size $adv_opts.word_size
+-word_size '${adv_opts.word_size}'
 #end if
 $adv_opts.parse_deflines
-    </token>
-    <token name="@ADV_WINDOW_SIZE@">
+    ]]></token>
+    <token name="@ADV_WINDOW_SIZE@"><![CDATA[
 #if str($adv_opts.window_size):
--window_size $adv_opts.window_size
+-window_size '${adv_opts.window_size}'
 #end if
-    </token>
+    ]]></token>
 
-    <token name="@ADV_THRESHOLD@">
+    <token name="@ADV_THRESHOLD@"><![CDATA[
 #if str($adv_opts.threshold):
--threshold $adv_opts.threshold
+-threshold '${adv_opts.threshold}'
 #end if
-    </token>
+    ]]></token>
 
-    <token name="@ADV_COMP_BASED_STATS@">
+    <token name="@ADV_COMP_BASED_STATS@"><![CDATA[
 #if str($adv_opts.comp_based_stats):
--comp_based_stats $adv_opts.comp_based_stats
+-comp_based_stats '${adv_opts.comp_based_stats}'
 #end if
-    </token>
+    ]]></token>
 
-    <token name="@ADV_GAPOPEN@">
+    <token name="@ADV_GAPOPEN@"><![CDATA[
 #if str($adv_opts.gapopen):
--gapopen $adv_opts.gapopen
+-gapopen '${adv_opts.gapopen}'
 #end if
-    </token>
+    ]]></token>
 
-    <token name="@ADV_GAPEXTEND@">
+    <token name="@ADV_GAPEXTEND@"><![CDATA[
 #if str($adv_opts.gapextend):
--gapextend $adv_opts.gapextend
+-gapextend '${adv_opts.gapextend}'
 #end if
-    </token>
+    ]]></token>
 
-    <token name="@ADV_MATRIX@">
+    <token name="@ADV_MATRIX@"><![CDATA[
 #if str($adv_opts.matrix):
--matrix $adv_opts.matrix
+-matrix '${adv_opts.matrix}'
 #end if
-    </token>
+    ]]></token>
 
     <!-- @ON_DB_SUBJECT@ is for use with @BLAST_DB_SUBJECT@ -->
-    <token name="@ON_DB_SUBJECT@">#if str($db_opts.db_opts_selector)=='db'
+    <token name="@ON_DB_SUBJECT@"><![CDATA[
+#if str($db_opts.db_opts_selector)=='db'
 '${db_opts.database}'
 #elif str($db_opts.db_opts_selector)=='histdb'
 '${db_opts.histdb.name}'
 #else
 '${db_opts.subject.name}'
-#end if</token>
+#end if
+]]></token>
 
-    <token name="@REFERENCES@">
+    <token name="@REFERENCES@"><![CDATA[
 Peter J. A. Cock, John M. Chilton, Björn Grüning, James E. Johnson, Nicola Soranzo (2015).
 NCBI BLAST+ integrated into Galaxy. *GigaScience* 4:39
 https://doi.org/10.1186/s13742-015-0080-7
@@ -663,14 +669,16 @@
 
 This wrapper is available to install into other Galaxy Instances via the Galaxy
 Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus
-    </token>
+    ]]></token>
     <xml name="blast_citations">
         <citations>
+            <citation type="doi">10.1093/nar/25.17.3389</citation>
             <citation type="doi">10.1186/1471-2105-10-421</citation>
             <citation type="doi">10.1186/s13742-015-0080-7</citation>
         </citations>
     </xml>
-    <token name="@OUTPUT_FORMAT@">**Output format**
+    <token name="@OUTPUT_FORMAT@"><![CDATA[
+**Output format**
 
 Because Galaxy focuses on processing tabular data, the default output of this
 tool is tabular. The standard BLAST+ tabular output contains 12 columns:
@@ -720,7 +728,7 @@
     22 sseq          Aligned part of subject sequence
     23 qlen          Query sequence length
     24 slen          Subject sequence length
-    25 salltitles    All subject title(s), separated by a '&lt;&gt;'
+    25 salltitles    All subject title(s), separated by a '<>'
 ====== ============= ===========================================
 
 The third option is to customise the tabular output by selecting which
@@ -735,8 +743,9 @@
 The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
 The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
 and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
-    </token>
-    <token name="@FASTA_WARNING@">.. class:: warningmark
+    ]]></token>
+    <token name="@FASTA_WARNING@"><![CDATA[
+.. class:: warningmark
 
 You can also search against a FASTA file of subject (target)
 sequences. This is *not* advised because it is slower (only one
@@ -744,25 +753,26 @@
 searches (very small e-values which will look overly signficiant).
 In most cases you should instead turn the other FASTA file into a
 database first using *makeblastdb* and search against that.
-    </token>
-    <token name="@SEARCH_TIME_WARNING@">.. class:: warningmark
+    ]]></token>
+    <token name="@SEARCH_TIME_WARNING@"><![CDATA[
+.. class:: warningmark
 
 **Note**. Database searches may take a substantial amount of time.
 For large input datasets it is advisable to allow overnight processing.
 
 -----
-    </token>
-
-    <token name="@CLI_OPTIONS@">**Advanced Options**
+    ]]></token>
+    <token name="@CLI_OPTIONS@"><![CDATA[
+**Advanced Options**
 
 For help with advanced options and their default values, visit the
 NCBI BLAST® Command Line Applications User Manual, Appendices,
 `Options for the command-line applications
-&lt;https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_Options_for_the_commandline_a_&gt;`_.
+<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_Options_for_the_commandline_a_>`_.
 
 For amino acid substitution matrices, see `BLAST Substitution Matrices
-&lt;https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_BLAST_Substitution_Matrices_&gt;`_ in the same
+<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_BLAST_Substitution_Matrices_>`_ in the same
 appendices.
 
-    </token>
+    ]]></token>
 </macros>
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml	Tue Jun 05 11:42:10 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml	Sat Jun 30 15:13:38 2018 -0400
@@ -5,21 +5,30 @@
         <import>ncbi_macros.xml</import>
     </macros>
     <expand macro="preamble" />
-    <command detect_errors="aggressive" strict="true">
+    <command detect_errors="aggressive" strict="true"><![CDATA[
 python $__tool_directory__/check_no_duplicates.py
 ##First check for duplicates (since BLAST+ 2.2.28 fails to do so)
 ##and abort (via the ampersand ampersand trick) if any are found.
 #for i in $input_file#'${i}' #end for#
-&amp;&amp;
-makeblastdb -out '${os.path.join($outfile.files_path, "blastdb")}'
+&&
+##makeblastdb does not like input redirects of the sort
+##makeblastdb -in <(gunzip -c gzipped_fasta_file)
+##therefore we're cramming everything
+##into a single cat command below
+cat
+#for i in $input_file:
+    #if $i.is_of_type('fasta.gz'):
+        <(gunzip -c ${i})
+    #else:
+        ${i}
+    #end if
+#end for
+| makeblastdb -out '${os.path.join($outfile.files_path, "blastdb")}'
 $parse_seqids
 $hash_index
-## Single call to -in with multiple filenames space separated with outer quotes
-## (presumably any filenames with spaces would be a problem). Note this gives
-## some extra spaces, e.g. -in "file1 file2 file3 " but BLAST seems happy:
--in '#for i in $input_file#${i} #end for#'
+-in -
 #if $title:
--title '$title'
+-title '${title}'
 #else:
 ##Would default to being based on the cryptic Galaxy filenames, which is unhelpful
 -title 'BLAST Database'
@@ -46,8 +55,8 @@
 #end if
 ## --------------------------------------------------------------------
 ## Capture the stdout log information to the primary file (plain text):
-&gt; "$outfile"
-    </command>
+> '$outfile'
+    ]]></command>
     <inputs>
         <param argument="-dbtype" type="select" display="radio" label="Molecule type of input">
             <option value="prot">protein</option>
@@ -57,7 +66,7 @@
              NOTE Double check the new database would be self contained first
         -->
         <!-- Note this is a mandatory parameter - default should be most recent FASTA file -->
-        <param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta" label="Input FASTA files(s)" help="One or more FASTA files" />
+        <param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta,fasta.gz" label="Input FASTA files(s)" help="One or more FASTA files" />
         <param argument="-title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" />
         <param argument="-parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="false" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" />
         <param argument="-hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." />
@@ -158,7 +167,7 @@
         </test>
         <test>
             <param name="dbtype" value="nucl" />
-            <param name="input_file" value="three_human_mRNA.fasta" ftype="fasta" />
+            <param name="input_file" value="three_human_mRNA.fasta.gz" ftype="fasta.gz" />
             <param name="title" value="Just 3 human mRNA sequences" />
             <param name="parse_seqids" value="" />
             <param name="hash_index" value="true" />