Previous changeset 32:360352490a06 (2018-06-05) Next changeset 34:8f82e05831dc (2018-06-30) |
Commit message:
v0.3.0 Updated for NCBI BLAST+ 2.7.1 |
modified:
tools/ncbi_blast_plus/README.rst tools/ncbi_blast_plus/check_no_duplicates.py tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml tools/ncbi_blast_plus/ncbi_macros.xml tools/ncbi_blast_plus/ncbi_makeblastdb.xml |
added:
test-data/chimera.fasta.gz test-data/rhodopsin_nucs.fasta.gz test-data/three_human_mRNA.fasta.gz |
b |
diff -r 360352490a06 -r 42e6f199d11f test-data/chimera.fasta.gz |
b |
Binary file test-data/chimera.fasta.gz has changed |
b |
diff -r 360352490a06 -r 42e6f199d11f test-data/rhodopsin_nucs.fasta.gz |
b |
Binary file test-data/rhodopsin_nucs.fasta.gz has changed |
b |
diff -r 360352490a06 -r 42e6f199d11f test-data/three_human_mRNA.fasta.gz |
b |
Binary file test-data/three_human_mRNA.fasta.gz has changed |
b |
diff -r 360352490a06 -r 42e6f199d11f tools/ncbi_blast_plus/README.rst --- a/tools/ncbi_blast_plus/README.rst Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/README.rst Sat Jun 30 15:13:38 2018 -0400 |
b |
@@ -1,10 +1,9 @@ Galaxy wrappers for NCBI BLAST+ suite ===================================== -These wrappers are copyright 2010-2017 by Peter Cock (The James Hutton Institute, -UK) and additional contributors including Edward Kirton, John Chilton, -Nicola Soranzo, Jim Johnson, Bjoern Gruening, and Caleb Easterly. - +These wrappers are copyright 2010-2018 by Peter Cock (James Hutton Institute, +UK) and additional contributors including Edward Kirton, John Chilton, Nicola +Soranzo, Jim Johnson, Bjoern Gruening, Caleb Easterly, and Anton Nekrutenko. See the licence text below. Note this does not work with the NCBI 'legacy' BLAST suite written in C @@ -259,6 +258,7 @@ - Depends on BioConda or legacy ToolShed ``package_blast_plus_2_7_1``. - Document the BLAST+ 2.6.0 change in the standard 12 column output from ``qacc,sacc,...`` to ``qaccver,saccver,...`` instead. + - Accept gzipped FASTA inputs (contribution from Anton Nekrutenko). ======= ====================================================================== |
b |
diff -r 360352490a06 -r 42e6f199d11f tools/ncbi_blast_plus/check_no_duplicates.py --- a/tools/ncbi_blast_plus/check_no_duplicates.py Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/check_no_duplicates.py Sat Jun 30 15:13:38 2018 -0400 |
[ |
@@ -9,10 +9,11 @@ will return a non-zero error if any duplicate identifiers are found. """ - +import gzip import os import sys + if "-v" in sys.argv or "--version" in sys.argv: print("v0.0.23") sys.exit(0) @@ -24,7 +25,19 @@ sys.stderr.write("Missing FASTA file %r\n" % filename) sys.exit(2) files += 1 - handle = open(filename) + + with open(filename, "rb") as binary_handle: + magic = binary_handle.read(2) + if not magic: + # Empty file, special case + continue + elif magic == b'\x1f\x8b': + # Gzipped + handle = gzip.open(filename, "rt") + elif magic[0:1] == b">": + # Not gzipped, should be plain FASTA + handle = open(filename, "r") + for line in handle: if line.startswith(">"): # The split will also take care of the new line character, |
b |
diff -r 360352490a06 -r 42e6f199d11f tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Sat Jun 30 15:13:38 2018 -0400 |
[ |
@@ -7,24 +7,29 @@ <expand macro="parallelism" /> <expand macro="preamble" /> <command detect_errors="aggressive"> +<![CDATA[ ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces blastn --query '$query' +#if $query.is_of_type('fasta.gz'): +-query <(gunzip -c '${query}') +#else: +-query '${query}' +#end if @BLAST_DB_SUBJECT@ --task $blast_type --evalue $evalue_cutoff +-task '${blast_type}' +-evalue '${evalue_cutoff}' @BLAST_OUTPUT@ @THREADS@ #if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.strand +${adv_opts.strand} @ADV_FILTER_QUERY@ @ADV_MAX_HITS@ @ADV_WORD_SIZE@ #if (str($adv_opts.identity_cutoff) and float(str($adv_opts.identity_cutoff)) > 0 ): --perc_identity $adv_opts.identity_cutoff +-perc_identity '${adv_opts.identity_cutoff}' #end if -$adv_opts.ungapped +${adv_opts.ungapped} @ADV_ID_LIST_FILTER@ @ADV_QCOV_HSP_PERC@ ## only use window size if dc-megablast mode is used @@ -35,9 +40,10 @@ @ADV_GAPEXTEND@ ## End of advanced options: #end if +]]> </command> <inputs> - <param argument="-query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> + <param argument="-query" type="data" format="fasta,fasta.gz" label="Nucleotide query sequence(s)"/> <expand macro="input_conditional_nucleotide_db" /> <param name="blast_type" argument="-task" type="select" display="radio" label="Type of BLAST"> <option value="megablast">megablast - Traditional megablast used to find very similar (e.g., intraspecies or closely related species) sequences</option> @@ -102,6 +108,16 @@ <test> <param name="query" value="rhodopsin_nucs.fasta" ftype="fasta" /> <param name="db_opts_selector" value="file" /> + <param name="subject" value="three_human_mRNA.fasta.gz" ftype="fasta.gz" /> + <param name="database" value="" /> + <param name="evalue_cutoff" value="1e-40" /> + <param name="out_format" value="6" /> + <param name="adv_opts_selector" value="basic" /> + <output 
name="output1" file="blastn_rhodopsin_vs_three_human.tabular" ftype="tabular" /> + </test> + <test> + <param name="query" value="rhodopsin_nucs.fasta" ftype="fasta" /> + <param name="db_opts_selector" value="file" /> <param name="subject" value="three_human_mRNA.fasta" ftype="fasta" /> <param name="database" value="" /> <param name="evalue_cutoff" value="1e-40" /> |
b |
diff -r 360352490a06 -r 42e6f199d11f tools/ncbi_blast_plus/ncbi_macros.xml --- a/tools/ncbi_blast_plus/ncbi_macros.xml Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_macros.xml Sat Jun 30 15:13:38 2018 -0400 |
[ |
b'@@ -357,7 +357,7 @@\n <when value="file">\n <param name="database" type="hidden" value="" />\n <param name="histdb" type="hidden" value="" />\n- <param argument="-subject" type="data" format="fasta" label="Nucleotide FASTA subject file to use instead of a database"/>\n+ <param argument="-subject" type="data" format="fasta,fasta.gz" label="Nucleotide FASTA subject file to use instead of a database"/>\n </when>\n </conditional>\n </xml>\n@@ -533,42 +533,46 @@\n </conditional>\n </xml>\n <!--Tokens-->\n- <token name="@ADV_MATRIX_GAPCOSTS@">\n+ <token name="@ADV_MATRIX_GAPCOSTS@"><![CDATA[\n #if str($adv_opts.matrix_gapcosts.matrix):\n- -matrix $adv_opts.matrix_gapcosts.matrix\n- $adv_opts.matrix_gapcosts.gap_costs\n+ -matrix \'${adv_opts.matrix_gapcosts.matrix}\'\n+ ${adv_opts.matrix_gapcosts.gap_costs}\n #end if\n- </token>\n+ ]]></token>\n \n- <token name="@ADV_QCOV_HSP_PERC@">\n-#if float(str($adv_opts.qcov_hsp_perc)) > 0:\n- -qcov_hsp_perc $adv_opts.qcov_hsp_perc\n+ <token name="@ADV_QCOV_HSP_PERC@"><![CDATA[\n+#if float(str($adv_opts.qcov_hsp_perc)) > 0:\n+ -qcov_hsp_perc \'${adv_opts.qcov_hsp_perc}\'\n #end if\n- </token>\n+ ]]></token>\n \n- <token name="@ADV_ID_LIST_FILTER@">\n+ <token name="@ADV_ID_LIST_FILTER@"><![CDATA[\n #if $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == \'negative_gilist\':\n- -negative_gilist $adv_opts.adv_optional_id_files_opts.negative_gilist\n+ -negative_gilist \'${adv_opts.adv_optional_id_files_opts.negative_gilist}\'\n #elif $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == \'gilist\':\n- -gilist $adv_opts.adv_optional_id_files_opts.gilist\n+ -gilist \'{$adv_opts.adv_optional_id_files_opts.gilist}\'\n #elif $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == \'seqidlist\':\n- -seqidlist $adv_opts.adv_optional_id_files_opts.seqidlist\n+ -seqidlist \'${adv_opts.adv_optional_id_files_opts.seqidlist}\'\n #end if\n- </token>\n+ ]]></token>\n \n <token 
name="@THREADS@">-num_threads "\\${GALAXY_SLOTS:-8}"</token>\n \n- <token name="@BLAST_DB_SUBJECT@">\n+ <token name="@BLAST_DB_SUBJECT@"><![CDATA[\n #if $db_opts.db_opts_selector == "db":\n -db \'${" ".join(str($db_opts.database.fields.path).split(","))}\'\n #elif $db_opts.db_opts_selector == "histdb":\n -db \'${os.path.join($db_opts.histdb.extra_files_path, "blastdb")}\'\n #else:\n- -subject \'$db_opts.subject\'\n+ #if $db_opts.subject.is_of_type(\'fasta.gz\'):\n+ -subject <(gunzip -c \'${$db_opts.subject}\')\n+ #else:\n+ -subject \'${db_opts.subject}\'\n+ #end if\n #end if\n- </token>\n+ ]]></token>\n \n- <token name="@BLAST_OUTPUT@">-out \'$output1\'\n+ <token name="@BLAST_OUTPUT@"><![CDATA[ -out \'$output1\'\n ##Set the extended list here so when we add things, saved workflows are not affected\n #if str($output.out_format)=="ext":\n -outfmt \'6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen salltitles\'\n@@ -579,79 +583,81 @@\n -outfmt \'6 $cols\'\n #else:\n ## Note do not quote this as can be \'0 -html\' which is really two arguments\n- -outfmt $output.out_format\n+ -outfmt ${output.out_format}\n #end if\n- </token>\n+ ]]></token>\n <token name="@ADV_FILTER_QUERY@">$adv_opts.filter_query</token>\n- <token name="@ADV_MAX_HITS@">\n+ <token name="@ADV_MAX_HITS@"><![CDATA[\n ## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string\n ## Note -max_target_seqs used to simply override -num_descriptions and -num_alignments\n ## but this was changed in BLAST+ 2.2.27 onwards to force their use (raised with NCBI)\n #if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):\n #if str($output.out_format) in ["6", "ext", "cols", "5"]:\n ## Most output formats use this, including tabular and XML:\n- -max_target_seqs $adv_opts.max_hits\n+ -max_target_s'..b'ts.matrix}\'\n #end if\n- </token>\n+ ]]></token>\n \n <!-- @ON_DB_SUBJECT@ is for use with @BLAST_DB_SUBJECT@ -->\n- <token 
name="@ON_DB_SUBJECT@">#if str($db_opts.db_opts_selector)==\'db\'\n+ <token name="@ON_DB_SUBJECT@"><![CDATA[\n+#if str($db_opts.db_opts_selector)==\'db\'\n \'${db_opts.database}\'\n #elif str($db_opts.db_opts_selector)==\'histdb\'\n \'${db_opts.histdb.name}\'\n #else\n \'${db_opts.subject.name}\'\n-#end if</token>\n+#end if\n+]]></token>\n \n- <token name="@REFERENCES@">\n+ <token name="@REFERENCES@"><![CDATA[\n Peter J. A. Cock, John M. Chilton, Bj\xc3\xb6rn Gr\xc3\xbcning, James E. Johnson, Nicola Soranzo (2015).\n NCBI BLAST+ integrated into Galaxy. *GigaScience* 4:39\n https://doi.org/10.1186/s13742-015-0080-7\n@@ -663,14 +669,16 @@\n \n This wrapper is available to install into other Galaxy Instances via the Galaxy\n Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus\n- </token>\n+ ]]></token>\n <xml name="blast_citations">\n <citations>\n+ <citation type="doi">10.1093/nar/25.17.3389</citation>\n <citation type="doi">10.1186/1471-2105-10-421</citation>\n <citation type="doi">10.1186/s13742-015-0080-7</citation>\n </citations>\n </xml>\n- <token name="@OUTPUT_FORMAT@">**Output format**\n+ <token name="@OUTPUT_FORMAT@"><![CDATA[\n+**Output format**\n \n Because Galaxy focuses on processing tabular data, the default output of this\n tool is tabular. 
The standard BLAST+ tabular output contains 12 columns:\n@@ -720,7 +728,7 @@\n 22 sseq Aligned part of subject sequence\n 23 qlen Query sequence length\n 24 slen Subject sequence length\n- 25 salltitles All subject title(s), separated by a \'<>\'\n+ 25 salltitles All subject title(s), separated by a \'<>\'\n ====== ============= ===========================================\n \n The third option is to customise the tabular output by selecting which\n@@ -735,8 +743,9 @@\n The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.\n The two query anchored outputs show a multiple sequence alignment between the query and all the matches,\n and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).\n- </token>\n- <token name="@FASTA_WARNING@">.. class:: warningmark\n+ ]]></token>\n+ <token name="@FASTA_WARNING@"><![CDATA[\n+.. class:: warningmark\n \n You can also search against a FASTA file of subject (target)\n sequences. This is *not* advised because it is slower (only one\n@@ -744,25 +753,26 @@\n searches (very small e-values which will look overly significant).\n In most cases you should instead turn the other FASTA file into a\n database first using *makeblastdb* and search against that.\n- </token>\n- <token name="@SEARCH_TIME_WARNING@">.. class:: warningmark\n+ ]]></token>\n+ <token name="@SEARCH_TIME_WARNING@"><![CDATA[\n+.. class:: warningmark\n \n **Note**. 
Database searches may take a substantial amount of time.\n For large input datasets it is advisable to allow overnight processing.\n \n -----\n- </token>\n-\n- <token name="@CLI_OPTIONS@">**Advanced Options**\n+ ]]></token>\n+ <token name="@CLI_OPTIONS@"><![CDATA[\n+**Advanced Options**\n \n For help with advanced options and their default values, visit the\n NCBI BLAST\xc2\xae Command Line Applications User Manual, Appendices,\n `Options for the command-line applications\n-<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_Options_for_the_commandline_a_>`_.\n+<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_Options_for_the_commandline_a_>`_.\n \n For amino acid substitution matrices, see `BLAST Substitution Matrices\n-<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_BLAST_Substitution_Matrices_>`_ in the same\n+<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_BLAST_Substitution_Matrices_>`_ in the same\n appendices.\n \n- </token>\n+ ]]></token>\n </macros>\n' |
b |
diff -r 360352490a06 -r 42e6f199d11f tools/ncbi_blast_plus/ncbi_makeblastdb.xml --- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Sat Jun 30 15:13:38 2018 -0400 |
[ |
@@ -5,21 +5,30 @@ <import>ncbi_macros.xml</import> </macros> <expand macro="preamble" /> - <command detect_errors="aggressive" strict="true"> + <command detect_errors="aggressive" strict="true"><![CDATA[ python $__tool_directory__/check_no_duplicates.py ##First check for duplicates (since BLAST+ 2.2.28 fails to do so) ##and abort (via the ampersand ampersand trick) if any are found. #for i in $input_file#'${i}' #end for# -&& -makeblastdb -out '${os.path.join($outfile.files_path, "blastdb")}' +&& +##makeblastdb does not like input redirects of the sort +##makeblastdb -in <(gunzip -c gzipped_fasta_file) +##therefore we're cramming everything +##into a single cat command below +cat +#for i in $input_file: + #if $i.is_of_type('fasta.gz'): + <(gunzip -c ${i}) + #else: + ${i} + #end if +#end for +| makeblastdb -out '${os.path.join($outfile.files_path, "blastdb")}' $parse_seqids $hash_index -## Single call to -in with multiple filenames space separated with outer quotes -## (presumably any filenames with spaces would be a problem). Note this gives -## some extra spaces, e.g. 
-in "file1 file2 file3 " but BLAST seems happy: --in '#for i in $input_file#${i} #end for#' +-in - #if $title: --title '$title' +-title '${title}' #else: ##Would default to being based on the cryptic Galaxy filenames, which is unhelpful -title 'BLAST Database' @@ -46,8 +55,8 @@ #end if ## -------------------------------------------------------------------- ## Capture the stdout log information to the primary file (plain text): -> "$outfile" - </command> +> '$outfile' + ]]></command> <inputs> <param argument="-dbtype" type="select" display="radio" label="Molecule type of input"> <option value="prot">protein</option> @@ -57,7 +66,7 @@ NOTE Double check the new database would be self contained first --> <!-- Note this is a mandatory parameter - default should be most recent FASTA file --> - <param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta" label="Input FASTA files(s)" help="One or more FASTA files" /> + <param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta,fasta.gz" label="Input FASTA files(s)" help="One or more FASTA files" /> <param argument="-title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" /> <param argument="-parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="false" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" /> <param argument="-hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." 
/> @@ -158,7 +167,7 @@ </test> <test> <param name="dbtype" value="nucl" /> - <param name="input_file" value="three_human_mRNA.fasta" ftype="fasta" /> + <param name="input_file" value="three_human_mRNA.fasta.gz" ftype="fasta.gz" /> <param name="title" value="Just 3 human mRNA sequences" /> <param name="parse_seqids" value="" /> <param name="hash_index" value="true" /> |