Mercurial > repos > peterjc > ncbi_blast_plus
changeset 32:b2795652d2b4 draft
Uploaded v0.0.22a, more macros, $GALAXY_SLOTS, more descriptive output names, test makeblastdb
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/README.rst Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/README.rst Mon Nov 25 10:58:46 2013 -0500 @@ -127,6 +127,10 @@ defined in updated blast_datatypes on Galaxy ToolShed. - Tests updated for BLAST+ 2.2.27 instead of BLAST+ 2.2.26 - Now depends on package_blast_plus_2_2_27 in ToolShed +v0.0.22 - More use of macros to simplify the wrappers + - Set number of threads via $GALAXY_SLOTS environment variable + - More descriptive default output names + - Tests require updated BLAST DB definitions (blast_datatypes v0.0.18) ======= ======================================================================
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Mon Nov 25 10:58:46 2013 -0500 @@ -63,7 +63,7 @@ import re if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.12" + print "v0.0.22" sys.exit(0) if sys.version_info[:2] >= ( 2, 5 ): @@ -228,7 +228,10 @@ ] if extended: - sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">")) + try: + sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(" >")) + except IndexError as e: + stop_err("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e)) #print hit_def, "-->", sallseqid positive = hsp.findtext("Hsp_positive") ppos = "%0.2f" % (100*float(positive)/float(length))
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,4 +1,4 @@ -<tool id="blastxml_to_tabular" name="BLAST XML to tabular" version="0.0.11"> +<tool id="blastxml_to_tabular" name="BLAST XML to tabular" version="0.0.22"> <description>Convert BLAST XML output to tabular</description> <version_command interpreter="python">blastxml_to_tabular.py --version</version_command> <command interpreter="python"> @@ -17,7 +17,7 @@ </param> </inputs> <outputs> - <data name="tabular_file" format="tabular" label="BLAST results as tabular" /> + <data name="tabular_file" format="tabular" label="$blastxml_file.display_name (as tabular)" /> </outputs> <requirements> </requirements>
--- a/tools/ncbi_blast_plus/ncbi_blastdbcmd_info.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_info.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,13 +1,10 @@ -<tool id="ncbi_blastdbcmd_info" name="NCBI BLAST+ database info" version="0.0.21"> +<tool id="ncbi_blastdbcmd_info" name="NCBI BLAST+ database info" version="0.0.22"> <description>Show BLAST database information from blastdbcmd</description> - <requirements> - <requirement type="binary">blastdbcmd</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>blastdbcmd -version</version_command> <macros> + <token name="@BINARY@">blastdbcmd</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}" -info -out "$info" </command> @@ -33,17 +30,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. -http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,13 +1,10 @@ -<tool id="ncbi_blastdbcmd_wrapper" name="NCBI BLAST+ blastdbcmd entry(s)" version="0.0.21"> +<tool id="ncbi_blastdbcmd_wrapper" name="NCBI BLAST+ blastdbcmd entry(s)" version="0.0.22"> <description>Extract sequence(s) from BLAST database</description> - <requirements> - <requirement type="binary">blastdbcmd</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>blastdbcmd -version</version_command> <macros> + <token name="@BINARY@">blastdbcmd</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces @@ -105,17 +102,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. -http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,53 +1,29 @@ -<tool id="ncbi_blastn_wrapper" name="NCBI BLAST+ blastn" version="0.0.21"> +<tool id="ncbi_blastn_wrapper" name="NCBI BLAST+ blastn" version="0.0.22"> <description>Search nucleotide database with nucleotide query sequence(s)</description> <!-- If job splitting is enabled, break up the query file into parts --> - <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject,histdb" merge_outputs="output1"></parallelism> - <requirements> - <requirement type="binary">blastn</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>blastn -version</version_command> + <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1"></parallelism> <macros> + <token name="@BINARY@">blastn</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. 
Galaxy will turn newlines into spaces blastn -query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#elif $db_opts.db_opts_selector == "histdb": - -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" -#else: - -subject "$db_opts.subject" -#end if +@BLAST_DB_SUBJECT@ -task $blast_type -evalue $evalue_cutoff --out "$output1" -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 +@BLAST_OUTPUT@ +@THREADS@ #if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query $adv_opts.strand -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if +@ADVANCED_OPTIONS@ #if (str($adv_opts.identity_cutoff) and float(str($adv_opts.identity_cutoff)) > 0 ): -perc_identity $adv_opts.identity_cutoff #end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if $adv_opts.ungapped -$adv_opts.parse_deflines ## End of advanced options: #end if </command> @@ -69,38 +45,25 @@ <option value="vecscreen">vecscreen</option> --> </param> - <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <expand macro="input_evalue" /> <expand macro="input_out_format" /> - <conditional name="adv_opts"> - <param name="adv_opts_selector" type="select" label="Advanced Options"> - <option value="basic" selected="True">Hide Advanced Options</option> - <option value="advanced">Show Advanced Options</option> + <expand macro="advanced_options"> + <!-- Could use a select (yes, no, other) where other allows setting 
'level window linker' --> + <param name="filter_query" type="boolean" label="Filter out low complexity regions (with DUST)" truevalue="-dust yes" falsevalue="-dust no" checked="true" /> + <expand macro="input_strand" /> + <expand macro="input_max_hits" /> + <param name="identity_cutoff" type="float" min="0" max="100" value="0" label="Percent identity cutoff (-perc_identity)" help="Use zero for no cutoff" /> + + <!-- I'd like word_size to be optional, with minimum 4 for blastn --> + <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 4."> + <validator type="in_range" min="0" /> </param> - <when value="basic" /> - <when value="advanced"> - <!-- Could use a select (yes, no, other) where other allows setting 'level window linker' --> - <param name="filter_query" type="boolean" label="Filter out low complexity regions (with DUST)" truevalue="-dust yes" falsevalue="-dust no" checked="true" /> - <param name="strand" type="select" label="Query strand(s) to search against database/subject"> - <option value="-strand both">Both</option> - <option value="-strand plus">Plus (forward)</option> - <option value="-strand minus">Minus (reverse complement)</option> - </param> - <!-- Why doesn't optional override a validator? 
I want to accept an empty string OR a non-negative integer --> - <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> - <validator type="in_range" min="0" /> - </param> - <param name="identity_cutoff" type="float" min="0" max="100" value="0" label="Percent identity cutoff (-perc_identity)" help="Use zero for no cutoff" /> - <!-- I'd like word_size to be optional, with minimum 4 for blastn --> - <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 4."> - <validator type="in_range" min="0" /> - </param> - <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped" falsevalue="" checked="false" /> - <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> - </when> - </conditional> + <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped" falsevalue="" checked="false" /> + <expand macro="input_parse_deflines" /> + </expand> </inputs> <outputs> - <data name="output1" format="tabular" label="${blast_type.value_label} on ${on_string}"> + <data name="output1" format="tabular" label="${blast_type.value_label} $query.name vs @ON_DB_SUBJECT@"> <expand macro="output_change_format" /> </data> </outputs> @@ -118,12 +81,7 @@ </tests> <help> -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ +@SEARCH_TIME_WARNING@ **What it does** @@ -131,71 +89,11 @@ using the NCBI BLAST+ blastn command line tool. Algorithms include blastn, megablast, and discontiguous megablast. -.. 
class:: warningmark - -You can also search against a FASTA file of subject nucleotide -sequences. This is *not* advised because it is slower (only one -CPU is used), but more importantly gives e-values for pairwise -searches (very small e-values which will look overly signficiant). -In most cases you should instead turn the other FASTA file into a -database first using *makeblastdb* and search against that. +@FASTA_WARNING@ ----- -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. Galaxy now uses this extended 24 column output by default. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). +@OUTPUT_FORMAT@ ------- @@ -204,17 +102,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. 
-http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,51 +1,27 @@ -<tool id="ncbi_blastp_wrapper" name="NCBI BLAST+ blastp" version="0.0.21"> +<tool id="ncbi_blastp_wrapper" name="NCBI BLAST+ blastp" version="0.0.22"> <description>Search protein database with protein query sequence(s)</description> <!-- If job splitting is enabled, break up the query file into parts --> - <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject,histdb" merge_outputs="output1"></parallelism> - <requirements> - <requirement type="binary">blastp</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>blastp -version</version_command> + <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1" /> <macros> + <token name="@BINARY@">blastp</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. 
Galaxy will turn newlines into spaces blastp -query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#elif $db_opts.db_opts_selector == "histdb": - -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" -#else: - -subject "$db_opts.subject" -#end if +@BLAST_DB_SUBJECT@ -task $blast_type -evalue $evalue_cutoff --out "$output1" -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 +@BLAST_OUTPUT@ +@THREADS@ #if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query -matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if +@ADVANCED_OPTIONS@ ##Ungapped disabled for now - see comments below ##$adv_opts.ungapped -$adv_opts.parse_deflines ## End of advanced options: #end if </command> @@ -61,40 +37,25 @@ <option value="blastp">blastp</option> <option value="blastp-short">blastp-short</option> </param> - <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <expand macro="input_evalue" /> <expand macro="input_out_format" /> - <conditional name="adv_opts"> - <param name="adv_opts_selector" type="select" label="Advanced Options"> - <option value="basic" selected="True">Hide Advanced Options</option> - <option value="advanced">Show Advanced Options</option> - </param> - <when value="basic" /> - <when value="advanced"> - <!-- Could use a select (yes, no, other) where other allows setting 'window 
locut hicut' --> - <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" /> - - <expand macro="input_scoring_matrix" /> - - <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> - <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> - <validator type="in_range" min="0" /> - </param> - <!-- I'd like word_size to be optional, with minimum 2 for blastp --> - <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> - <validator type="in_range" min="0" /> - </param> - <!-- - Can't use '-ungapped' on its own, error back is: - Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search - Tried using '-ungapped -comp_based_stats F' and blastp crashed with 'Attempt to access NULL pointer.' - <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" /> - --> - <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" 
truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> - </when> - </conditional> + <expand macro="advanced_options"> + <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> + <expand macro="input_filter_query_default_false" /> + <expand macro="input_scoring_matrix" /> + <expand macro="input_max_hits" /> + <expand macro="input_word_size" /> + <!-- + Can't use '-ungapped' on its own, error back is: + Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search + Tried using '-ungapped -comp_based_stats F' and blastp crashed with 'Attempt to access NULL pointer.' + <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" /> + --> + <expand macro="input_parse_deflines" /> + </expand> </inputs> <outputs> - <data name="output1" format="tabular" label="${blast_type.value_label} on ${on_string}"> + <data name="output1" format="tabular" label="${blast_type.value_label} $query.name vs @ON_DB_SUBJECT@"> <expand macro="output_change_format" /> </data> </outputs> @@ -161,83 +122,18 @@ </tests> <help> -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ +@SEARCH_TIME_WARNING@ **What it does** Search a *protein database* using a *protein query*, using the NCBI BLAST+ blastp command line tool. -.. class:: warningmark - -You can also search against a FASTA file of subject protein -sequences. This is *not* advised because it is slower (only one -CPU is used), but more importantly gives e-values for pairwise -searches (very small e-values which will look overly signficiant). 
-In most cases you should instead turn the other FASTA file into a -database first using *makeblastdb* and search against that. +@FASTA_WARNING@ ----- -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. Galaxy now uses this extended 24 column output by default. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). +@OUTPUT_FORMAT@ ------- @@ -246,17 +142,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. 
-http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,51 +1,27 @@ -<tool id="ncbi_blastx_wrapper" name="NCBI BLAST+ blastx" version="0.0.21"> +<tool id="ncbi_blastx_wrapper" name="NCBI BLAST+ blastx" version="0.0.22"> <description>Search protein database with translated nucleotide query sequence(s)</description> <!-- If job splitting is enabled, break up the query file into parts --> - <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject,histdb" merge_outputs="output1"></parallelism> - <requirements> - <requirement type="binary">blastx</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>blastx -version</version_command> + <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1"></parallelism> <macros> + <token name="@BINARY@">blastx</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. 
Galaxy will turn newlines into spaces blastx -query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#elif $db_opts.db_opts_selector == "histdb": - -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" -#else: - -subject "$db_opts.subject" -#end if +@BLAST_DB_SUBJECT@ -query_gencode $query_gencode -evalue $evalue_cutoff --out "$output1" -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 +@BLAST_OUTPUT@ +@THREADS@ #if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query $adv_opts.strand -matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if +@ADVANCED_OPTIONS@ $adv_opts.ungapped -$adv_opts.parse_deflines ## End of advanced options: #end if </command> @@ -57,42 +33,22 @@ <expand macro="input_conditional_protein_db" /> <expand macro="input_query_gencode" /> - <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <expand macro="input_evalue" /> <expand macro="input_out_format" /> - - <conditional name="adv_opts"> - <param name="adv_opts_selector" type="select" label="Advanced Options"> - <option value="basic" selected="True">Hide Advanced Options</option> - <option value="advanced">Show Advanced Options</option> - </param> - <when value="basic" /> - <when value="advanced"> - <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> - <param 
name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="true" /> - <param name="strand" type="select" label="Query strand(s) to search against database/subject"> - <option value="-strand both">Both</option> - <option value="-strand plus">Plus (forward)</option> - <option value="-strand minus">Minus (reverse complement)</option> - </param> - - <expand macro="input_scoring_matrix" /> - - <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> - <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> - <validator type="in_range" min="0" /> - </param> - <!-- I'd like word_size to be optional, with minimum 2 for blastx --> - <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> - <validator type="in_range" min="0" /> - </param> - <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped" falsevalue="" checked="false" /> - <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> - </when> - </conditional> + <expand macro="advanced_options"> + <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> + <expand macro="input_filter_query_default_true" /> + <expand macro="input_strand" /> + <expand macro="input_scoring_matrix" /> + <expand macro="input_max_hits" /> + <expand macro="input_word_size" /> + <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" 
truevalue="-ungapped" falsevalue="" checked="false" /> + <expand macro="input_parse_deflines" /> + </expand> </inputs> <outputs> - <data name="output1" format="tabular" label="blastx on ${on_string}"> + <data name="output1" format="tabular" label="blastx $query.name vs @ON_DB_SUBJECT@"> <expand macro="output_change_format" /> </data> </outputs> @@ -130,83 +86,18 @@ </tests> <help> -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ +@SEARCH_TIME_WARNING@ **What it does** Search a *protein database* using a *translated nucleotide query*, using the NCBI BLAST+ blastx command line tool. -.. class:: warningmark - -You can also search against a FASTA file of subject protein -sequences. This is *not* advised because it is slower (only one -CPU is used), but more importantly gives e-values for pairwise -searches (very small e-values which will look overly signficiant). -In most cases you should instead turn the other FASTA file into a -database first using *makeblastdb* and search against that. +@FASTA_WARNING@ ----- -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. 
The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. Galaxy now uses this extended 24 column output by default. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). +@OUTPUT_FORMAT@ ------- @@ -215,17 +106,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. 
-http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,4 +1,4 @@ -<tool id="ncbi_dustmasker_wrapper" name="NCBI dustmasker" version="0.0.21"> +<tool id="ncbi_dustmasker_wrapper" name="NCBI dustmasker" version="0.0.22"> <!-- dustmasker wrapper from Edward Kirton and Nicola Soranzo --> <description>masks low complexity regions</description> <requirements> @@ -99,17 +99,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers (a more specific paper covering this wrapper is planned): -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. -http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_macros.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_macros.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,6 +1,5 @@ <macros> - <macro name="output_change_format"> - + <xml name="output_change_format"> <change_format> <when input="out_format" value="0" format="txt"/> <when input="out_format" value="0 -html" format="html"/> @@ -10,9 +9,8 @@ <when input="out_format" value="4 -html" format="html"/> <when input="out_format" value="5" format="blastxml"/> </change_format> - - </macro> - <macro name="input_out_format"> + </xml> + <xml name="input_out_format"> <param name="out_format" type="select" label="Output format"> <option value="6">Tabular (standard 12 columns)</option> <option value="ext" selected="True">Tabular (extended 24 columns)</option> @@ -27,8 +25,8 @@ <option value="-outfmt 11">BLAST archive format (ASN.1)</option> --> </param> - </macro> - <macro name="input_scoring_matrix"> + </xml> + <xml name="input_scoring_matrix"> <param name="matrix" type="select" label="Scoring matrix"> <option value="BLOSUM90">BLOSUM90</option> <option value="BLOSUM80">BLOSUM80</option> @@ -39,8 +37,8 @@ <option value="PAM70">PAM70</option> <option value="PAM30">PAM30</option> </param> - </macro> - <macro name="stdio"> + </xml> + <xml name="stdio"> <stdio> <!-- Anything other than zero is an error --> <exit_code range="1:" /> @@ -49,8 +47,8 @@ <regex match="Error:" /> <regex match="Exception:" /> </stdio> - </macro> - <macro name="input_query_gencode"> + </xml> + <xml name="input_query_gencode"> <param name="query_gencode" type="select" label="Query genetic code"> <!-- See http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi for details --> <option value="1" select="True">1. Standard</option> @@ -72,9 +70,8 @@ <option value="23">23. Thraustochytrium Mitochondrial Code</option> <option value="24">24. 
Pterobranchia mitochondrial code</option> </param> - </macro> - - <macro name="input_db_gencode"> + </xml> + <xml name="input_db_gencode"> <param name="db_gencode" type="select" label="Database/subject genetic code"> <!-- See http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi for details --> <option value="1" select="True">1. Standard</option> @@ -96,9 +93,8 @@ <option value="23">23. Thraustochytrium Mitochondrial Code</option> <option value="24">24. Pterobranchia mitochondrial code</option> </param> - </macro> - - <macro name="input_conditional_nucleotide_db"> + </xml> + <xml name="input_conditional_nucleotide_db"> <conditional name="db_opts"> <param name="db_opts_selector" type="select" label="Subject database/sequences"> <option value="db" selected="True">Locally installed BLAST database</option> @@ -127,9 +123,8 @@ <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/> </when> </conditional> - </macro> - - <macro name="input_conditional_protein_db"> + </xml> + <xml name="input_conditional_protein_db"> <conditional name="db_opts"> <param name="db_opts_selector" type="select" label="Subject database/sequences"> <option value="db" selected="True">Locally installed BLAST database</option> @@ -158,9 +153,8 @@ <param name="subject" type="data" format="fasta" label="Protein FASTA file to use as database"/> </when> </conditional> - </macro> - - <macro name="input_conditional_pssm"> + </xml> + <xml name="input_conditional_pssm"> <conditional name="db_opts"> <param name="db_opts_selector" type="select" label="Protein domain database (PSSM)"> <option value="db" selected="True">Locally installed BLAST database</option> @@ -187,9 +181,8 @@ </when> --> </conditional> - </macro> - - <macro name="input_conditional_choose_db_type"> + </xml> + <xml name="input_conditional_choose_db_type"> <conditional name="db_opts"> <param name="db_type" type="select" label="Type of BLAST database"> <option value="nucl" 
selected="True">Nucleotide</option> @@ -214,7 +207,175 @@ </param> </when> </conditional> - </macro> + </xml> + <xml name="input_parse_deflines"> + <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> + </xml> + <xml name="input_filter_query_default_false"> + <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" /> + </xml> + <xml name="input_filter_query_default_true"> + <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="true" /> + </xml> + <xml name="input_max_hits"> + <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> + <validator type="in_range" min="0" /> + </param> + </xml> + <xml name="input_evalue"> + <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + </xml> + <xml name="input_word_size"> + <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> + <validator type="in_range" min="0" /> + </param> + </xml> + <xml name="input_strand"> + <param name="strand" type="select" label="Query strand(s) to search against database/subject"> + <option value="-strand both">Both</option> + <option value="-strand plus">Plus (forward)</option> + <option value="-strand minus">Minus (reverse complement)</option> + </param> + </xml> + <xml name="requirements"> + <requirements> + <requirement type="binary">@BINARY@</requirement> + <requirement type="package" version="2.2.27">blast+</requirement> + </requirements> + <version_command>@BINARY@ -version</version_command> + </xml> + <xml name="advanced_options"> + 
<conditional name="adv_opts"> + <param name="adv_opts_selector" type="select" label="Advanced Options"> + <option value="basic" selected="True">Hide Advanced Options</option> + <option value="advanced">Show Advanced Options</option> + </param> + <when value="basic" /> + <when value="advanced"> + <yield /> + </when> + </conditional> + </xml> + <token name="@THREADS@">-num_threads "\${GALAXY_SLOTS:-8}"</token> + <token name="@BLAST_DB_SUBJECT@"> +#if $db_opts.db_opts_selector == "db": + -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" +#else: + -subject "$db_opts.subject" +#end if + </token> + <token name="@BLAST_OUTPUT@">-out "$output1" +##Set the extended list here so if/when we add things, saved workflows are not affected +#if str($out_format)=="ext": + -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" +#else: + -outfmt $out_format +#end if + </token> + <token name="@ADVANCED_OPTIONS@">$adv_opts.filter_query +## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string +## Note -max_target_seqs overrides -num_descriptions and -num_alignments +#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): +-max_target_seqs $adv_opts.max_hits +#end if +#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): +-word_size $adv_opts.word_size +#end if +$adv_opts.parse_deflines + </token> + <!-- @ON_DB_SUBJECT@ is for use with @BLAST_DB_SUBJECT@ --> + <token name="@ON_DB_SUBJECT@">#if str($db_opts.db_opts_selector)=='db' +${db_opts.database} +#elif str($db_opts.db_opts_selector)=='histdb' +${db_opts.histdb.name} +#else +${db_opts.subject.name} +#end if</token> + <token name="@REFERENCES@"> +Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). +Galaxy tools and workflows for sequence analysis with applications +in molecular plant pathology. 
PeerJ 1:e167 +http://dx.doi.org/10.7717/peerj.167 +Christiam Camacho et al. (2009). +BLAST+: architecture and applications. +BMC Bioinformatics. 15;10:421. +http://dx.doi.org/10.1186/1471-2105-10-421 +This wrapper is available to install into other Galaxy Instances via the Galaxy +Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus + </token> + <token name="@OUTPUT_FORMAT@">**Output format** + +Because Galaxy focuses on processing tabular data, the default output of this +tool is tabular. The standard BLAST+ tabular output contains 12 columns: + +====== ========= ============================================ +Column NCBI name Description +------ --------- -------------------------------------------- + 1 qseqid Query Seq-id (ID of your sequence) + 2 sseqid Subject Seq-id (ID of the database hit) + 3 pident Percentage of identical matches + 4 length Alignment length + 5 mismatch Number of mismatches + 6 gapopen Number of gap openings + 7 qstart Start of alignment in query + 8 qend End of alignment in query + 9 sstart Start of alignment in subject (database hit) + 10 send End of alignment in subject (database hit) + 11 evalue Expectation value (E-value) + 12 bitscore Bit score +====== ========= ============================================ + +The BLAST+ tools can optionally output additional columns of information, +but this takes longer to calculate. Most (but not all) of these columns are +included by selecting the extended tabular output. The extra columns are +included *after* the standard 12 columns. This is so that you can write +workflow filtering steps that accept either the 12 or 24 column tabular +BLAST output. Galaxy now uses this extended 24 column output by default. 
+ +====== ============= =========================================== +Column NCBI name Description +------ ------------- ------------------------------------------- + 13 sallseqid All subject Seq-id(s), separated by a ';' + 14 score Raw score + 15 nident Number of identical matches + 16 positive Number of positive-scoring matches + 17 gaps Total number of gaps + 18 ppos Percentage of positive-scoring matches + 19 qframe Query frame + 20 sframe Subject frame + 21 qseq Aligned part of query sequence + 22 sseq Aligned part of subject sequence + 23 qlen Query sequence length + 24 slen Subject sequence length +====== ============= =========================================== + +The third option is BLAST XML output, which is designed to be parsed by +another program, and is understood by some Galaxy tools. + +You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). +The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. +The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. +The two query anchored outputs show a multiple sequence alignment between the query and all the matches, +and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). + </token> + <token name="@FASTA_WARNING@">.. class:: warningmark + +You can also search against a FASTA file of subject (target) +sequences. This is *not* advised because it is slower (only one +CPU is used), but more importantly gives e-values for pairwise +searches (very small e-values which will look overly significant). +In most cases you should instead turn the other FASTA file into a +database first using *makeblastdb* and search against that. + </token> + <token name="@SEARCH_TIME_WARNING@">.. class:: warningmark + +**Note**. Database searches may take a substantial amount of time. 
+For large input datasets it is advisable to allow overnight processing. + +----- + </token> </macros>
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,13 +1,10 @@ -<tool id="ncbi_makeblastdb" name="NCBI BLAST+ makeblastdb" version="0.0.21"> +<tool id="ncbi_makeblastdb" name="NCBI BLAST+ makeblastdb" version="0.0.22"> <description>Make BLAST database</description> - <requirements> - <requirement type="binary">makeblastdb</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>makeblastdb -version</version_command> <macros> + <token name="@BINARY@">makeblastdb</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> makeblastdb -out "${os.path.join($outfile.extra_files_path,'blastdb')}" $parse_seqids @@ -46,34 +43,36 @@ ## #else if $tax.select == 'map': ## -taxid_map $tax.map ## #end if +## -------------------------------------------------------------------- +## Capture the stdout log information to the primary file (plain text): +>> "$outfile" </command> - <expand macro="stdio" /> - <inputs> <param name="dbtype" type="select" display="radio" label="Molecule type of input"> <option value="prot">protein</option> <option value="nucl">nucleotide</option> </param> <!-- TODO Allow merging of existing BLAST databases (conditional on the database type) + NOTE Double check the new database would be self contained first <repeat name="in" title="BLAST or FASTA Database" min="1"> <param name="file" type="data" format="fasta,blastdbn,blastdbp" label="BLAST or FASTA database" /> </repeat> --> + <!-- TODO Switch this to using <param ... multiple="true" /> instead of <repeat> block? 
--> <repeat name="in" title="FASTA file" min="1"> <param name="file" type="data" format="fasta" /> </repeat> <param name="title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" /> <param name="parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="False" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" /> - <param name="hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values." help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." /> - + <param name="hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." /> <!-- SEQUENCE MASKING OPTIONS --> <repeat name="mask_data" title="Masking data file"> - <param name="file" type="data" format="maskinfo-asn1,maskinfo-asn1-binary" label="ASN.1 file containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" /> + <param name="mask_data_file" type="data" format="maskinfo-asn1,maskinfo-asn1-binary" label="ASN.1 file containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" /> </repeat> <!-- TODO <repeat name="gi_mask" title="Create GI indexed masking data"> - <param name="file" type="data" format="asnb" label="Masking data output file" /> + <param name="gi_mask_file" type="data" format="asnb" label="Masking data output file" /> </repeat> --> @@ -106,6 +105,25 @@ </data> </outputs> <tests> + <!-- Note the (two line) PIN file is not reproducible run to run. 
+ --> + <test> + <param name="dbtype" value="prot" /> + <param name="file" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="title" value="Just 4 human proteins" /> + <param name="parse_seqids" value="" /> + <param name="hash_index" value="true" /> + <output name="outfile" file="four_human_proteins.fasta.log" ftype="blastdbp" lines_diff="6"> + <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" /> + <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" /> + <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" /> + <extra_files type="file" value="four_human_proteins.fasta.pog" name="blastdb.pog" /> + <extra_files type="file" value="four_human_proteins.fasta.phd" name="blastdb.phd" /> + <extra_files type="file" value="four_human_proteins.fasta.phi" name="blastdb.phi" /> + <extra_files type="file" value="four_human_proteins.fasta.psd" name="blastdb.psd" /> + <extra_files type="file" value="four_human_proteins.fasta.psi" name="blastdb.psi" /> + </output> + </test> </tests> <help> **What it does** @@ -129,17 +147,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. -http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_rpsblast_wrapper.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_rpsblast_wrapper.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,15 +1,12 @@ -<tool id="ncbi_rpsblast_wrapper" name="NCBI BLAST+ rpsblast" version="0.0.21"> +<tool id="ncbi_rpsblast_wrapper" name="NCBI BLAST+ rpsblast" version="0.0.22"> <description>Search protein domain database (PSSMs) with protein query sequence(s)</description> <!-- If job splitting is enabled, break up the query file into parts --> - <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject" merge_outputs="output1"></parallelism> - <requirements> - <requirement type="binary">rpsblast</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>rpsblast -version</version_command> + <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1" /> <macros> + <token name="@BINARY@">rpsblast</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. 
Galaxy will turn newlines into spaces @@ -21,25 +18,10 @@ -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" #end if -evalue $evalue_cutoff --out "$output1" -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 +@BLAST_OUTPUT@ +@THREADS@ #if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -$adv_opts.parse_deflines +@ADVANCED_OPTIONS@ ## End of advanced options: #end if </command> @@ -51,30 +33,17 @@ <expand macro="input_conditional_pssm" /> - <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <expand macro="input_evalue" /> <expand macro="input_out_format" /> - <conditional name="adv_opts"> - <param name="adv_opts_selector" type="select" label="Advanced Options"> - <option value="basic" selected="True">Hide Advanced Options</option> - <option value="advanced">Show Advanced Options</option> - </param> - <when value="basic" /> - <when value="advanced"> - <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> - <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" /> - <!-- Why doesn't optional override a validator? 
I want to accept an empty string OR a non-negative integer --> - <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> - <validator type="in_range" min="0" /> - </param> - <!-- I'd like word_size to be optional, with minimum 2 for rpsblast --> - <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> - <validator type="in_range" min="0" /> - </param> - <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> - </when> - </conditional> + <expand macro="advanced_options"> + <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> + <expand macro="input_filter_query_default_false" /> + <expand macro="input_max_hits" /> + <expand macro="input_word_size" /> + <expand macro="input_parse_deflines" /> + </expand> </inputs> <outputs> <data name="output1" format="tabular" label="rpsblast on ${on_string}"> @@ -85,12 +54,7 @@ </outputs> <help> -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ +@SEARCH_TIME_WARNING@ **What it does** @@ -129,60 +93,7 @@ ----- -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. 
The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. Galaxy now uses this extended 24 column output by default. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). +@OUTPUT_FORMAT@ ------- @@ -191,17 +102,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. 
-http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,15 +1,12 @@ -<tool id="ncbi_rpstblastn_wrapper" name="NCBI BLAST+ rpstblastn" version="0.0.21"> +<tool id="ncbi_rpstblastn_wrapper" name="NCBI BLAST+ rpstblastn" version="0.0.22"> <description>Search protein domain database (PSSMs) with translated nucleotide query sequence(s)</description> <!-- If job splitting is enabled, break up the query file into parts --> - <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject" merge_outputs="output1"></parallelism> - <requirements> - <requirement type="binary">rpstblastn</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>rpstblastn -version</version_command> + <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1"></parallelism> <macros> + <token name="@BINARY@">rpstblastn</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces @@ -21,26 +18,11 @@ -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" #end if -evalue $evalue_cutoff --out "$output1" -## Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if +@BLAST_OUTPUT@ ## rpstblastn does not support multiple threads up to release 2.2.27+. Added in BLAST 2.2.28+. 
##-num_threads 8 #if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -$adv_opts.parse_deflines +@ADVANCED_OPTIONS@ ## End of advanced options: #end if </command> @@ -50,30 +32,18 @@ <expand macro="input_conditional_pssm" /> - <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <expand macro="input_evalue" /> <expand macro="input_out_format" /> - <conditional name="adv_opts"> - <param name="adv_opts_selector" type="select" label="Advanced Options"> - <option value="basic" selected="True">Hide Advanced Options</option> - <option value="advanced">Show Advanced Options</option> - </param> - <when value="basic" /> - <when value="advanced"> - <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> - <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" /> - <!-- Why doesn't optional override a validator? 
I want to accept an empty string OR a non-negative integer --> - <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> - <validator type="in_range" min="0" /> - </param> - <!-- I'd like word_size to be optional, with minimum 2 for rpsblast --> - <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> - <validator type="in_range" min="0" /> - </param> - <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> - </when> - </conditional> + <expand macro="advanced_options"> + <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> + <expand macro="input_filter_query_default_false" /> + <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> + <expand macro="input_max_hits" /> + <expand macro="input_word_size" /> + <expand macro="input_parse_deflines" /> + </expand> </inputs> <outputs> <data name="output1" format="tabular" label="rpstblastn on ${on_string}"> @@ -82,12 +52,7 @@ </outputs> <help> -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ +@SEARCH_TIME_WARNING@ **What it does** @@ -126,60 +91,7 @@ ----- -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. 
The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. Galaxy now uses this extended 24 column output by default. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). +@OUTPUT_FORMAT@ ------- @@ -188,17 +100,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. 
-http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,51 +1,27 @@ -<tool id="ncbi_tblastn_wrapper" name="NCBI BLAST+ tblastn" version="0.0.21"> +<tool id="ncbi_tblastn_wrapper" name="NCBI BLAST+ tblastn" version="0.0.22"> <description>Search translated nucleotide database with protein query sequence(s)</description> <!-- If job splitting is enabled, break up the query file into parts --> - <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject,histdb" merge_outputs="output1"></parallelism> - <requirements> - <requirement type="binary">tblastn</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>tblastn -version</version_command> + <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1"></parallelism> <macros> + <token name="@BINARY@">tblastn</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. 
Galaxy will turn newlines into spaces tblastn -query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#elif $db_opts.db_opts_selector == "histdb": - -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" -#else: - -subject "$db_opts.subject" -#end if +@BLAST_DB_SUBJECT@ -evalue $evalue_cutoff --out "$output1" -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 +@BLAST_OUTPUT@ +@THREADS@ #if $adv_opts.adv_opts_selector=="advanced": -db_gencode $adv_opts.db_gencode -$adv_opts.filter_query -matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if +@ADVANCED_OPTIONS@ ##Ungapped disabled for now - see comments below ##$adv_opts.ungapped -$adv_opts.parse_deflines ## End of advanced options: #end if </command> @@ -57,43 +33,26 @@ <expand macro="input_conditional_nucleotide_db" /> <expand macro="input_out_format" /> - <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> - - <conditional name="adv_opts"> - <param name="adv_opts_selector" type="select" label="Advanced Options"> - <option value="basic" selected="True">Hide Advanced Options</option> - <option value="advanced">Show Advanced Options</option> - </param> - <when value="basic" /> - <when value="advanced"> - - <expand macro="input_db_gencode" /> - - <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> - 
<param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="true" /> - - <expand macro="input_scoring_matrix" /> + <expand macro="input_evalue" /> + <expand macro="advanced_options"> + <expand macro="input_db_gencode" /> - <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> - <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> - <validator type="in_range" min="0" /> - </param> - <!-- I'd like word_size to be optional, with minimum 2 for blastp --> - <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> - <validator type="in_range" min="0" /> - </param> - <!-- - Can't use '-ungapped' on its own, error back is: - Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search - Tried using '-ungapped -comp_based_stats F' and tblastn crashed with 'Attempt to access NULL pointer.' - <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" /> - --> - <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" 
truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> - </when> - </conditional> + <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> + <expand macro="input_filter_query_default_true" /> + <expand macro="input_scoring_matrix" /> + <expand macro="input_max_hits" /> + <expand macro="input_word_size" /> + <!-- + Can't use '-ungapped' on its own, error back is: + Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search + Tried using '-ungapped -comp_based_stats F' and tblastn crashed with 'Attempt to access NULL pointer.' + <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" /> + --> + <expand macro="input_parse_deflines" /> + </expand> </inputs> <outputs> - <data name="output1" format="tabular" label="tblastn on ${on_string}"> + <data name="output1" format="tabular" label="tblastn $query.name vs @ON_DB_SUBJECT@"> <expand macro="output_change_format" /> </data> </outputs> @@ -177,83 +136,18 @@ </tests> <help> -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ +@SEARCH_TIME_WARNING@ **What it does** Search a *translated nucleotide database* using a *protein query*, using the NCBI BLAST+ tblastn command line tool. -.. class:: warningmark - -You can also search against a FASTA file of subject nucleotide -sequences. This is *not* advised because it is slower (only one -CPU is used), but more importantly gives e-values for pairwise -searches (very small e-values which will look overly signficiant). -In most cases you should instead turn the other FASTA file into a -database first using *makeblastdb* and search against that. 
+@FASTA_WARNING@ ----- -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. Galaxy now uses this extended 24 column output by default. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). +@OUTPUT_FORMAT@ ------- @@ -262,17 +156,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. 
-http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,51 +1,29 @@ -<tool id="ncbi_tblastx_wrapper" name="NCBI BLAST+ tblastx" version="0.0.21"> +<tool id="ncbi_tblastx_wrapper" name="NCBI BLAST+ tblastx" version="0.0.22"> <description>Search translated nucleotide database with translated nucleotide query sequence(s)</description> <!-- If job splitting is enabled, break up the query file into parts --> - <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject,histdb" merge_outputs="output1"></parallelism> - <requirements> - <requirement type="binary">tblastx</requirement> - <requirement type="package" version="2.2.27">blast+</requirement> - </requirements> - <version_command>tblastx -version</version_command> + <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" merge_outputs="output1"></parallelism> <macros> + <token name="@BINARY@">tblastx</token> <import>ncbi_macros.xml</import> </macros> + <expand macro="requirements" /> <command> ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. 
Galaxy will turn newlines into spaces tblastx -query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#elif $db_opts.db_opts_selector == "histdb": - -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" -#else: - -subject "$db_opts.subject" -#end if +@BLAST_DB_SUBJECT@ -query_gencode $query_gencode -evalue $evalue_cutoff --out "$output1" -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 +@BLAST_OUTPUT@ +@THREADS@ #if $adv_opts.adv_opts_selector=="advanced": -db_gencode $adv_opts.db_gencode -$adv_opts.filter_query $adv_opts.strand -matrix $adv_opts.matrix ## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string ## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -$adv_opts.parse_deflines +@ADVANCED_OPTIONS@ ## End of advanced options: #end if </command> @@ -57,42 +35,24 @@ <expand macro="input_conditional_nucleotide_db" /> <expand macro="input_query_gencode" /> - <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <expand macro="input_evalue" /> <expand macro="input_out_format" /> - - <conditional name="adv_opts"> - <param name="adv_opts_selector" type="select" label="Advanced Options"> - <option value="basic" selected="True">Hide Advanced Options</option> - <option value="advanced">Show Advanced Options</option> - </param> - <when value="basic" /> - <when value="advanced"> - - <expand macro="input_db_gencode" /> + <expand macro="advanced_options"> + <expand 
macro="input_db_gencode" /> - <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> - <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="true" /> - <param name="strand" type="select" label="Query strand(s) to search against database/subject"> - <option value="-strand both">Both</option> - <option value="-strand plus">Plus (forward)</option> - <option value="-strand minus">Minus (reverse complement)</option> - </param> - <expand macro="input_scoring_matrix" /> - <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> - <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> - <validator type="in_range" min="0" /> - </param> - <!-- I'd like word_size to be optional, with minimum 2 for tblastx --> - <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> - <validator type="in_range" min="0" /> - </param> - <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" 
truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> - </when> - </conditional> + <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> + <expand macro="input_filter_query_default_true" /> + <expand macro="input_strand" /> + <expand macro="input_scoring_matrix" /> + <expand macro="input_max_hits" /> + <!-- I'd like word_size to be optional, with minimum 2 for tblastx --> + <expand macro="input_word_size" /> + <expand macro="input_parse_deflines" /> + </expand> </inputs> <outputs> - <data name="output1" format="tabular" label="tblastx on ${on_string}"> + <data name="output1" format="tabular" label="tblastx $query.name vs @ON_DB_SUBJECT@"> <expand macro="output_change_format" /> </data> </outputs> @@ -110,83 +70,18 @@ </tests> <help> -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ +@SEARCH_TIME_WARNING@ **What it does** Search a *translated nucleotide database* using a *translated nucleotide query*, using the NCBI BLAST+ tblastx command line tool. -.. class:: warningmark - -You can also search against a FASTA file of subject nucleotide -sequences. This is *not* advised because it is slower (only one -CPU is used), but more importantly gives e-values for pairwise -searches (very small e-values which will look overly signficiant). -In most cases you should instead turn the other FASTA file into a -database first using *makeblastdb* and search against that. +@FASTA_WARNING@ ----- -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular.
The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. Galaxy now uses this extended 24 column output by default. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). +@OUTPUT_FORMAT@ ------- @@ -195,17 +90,6 @@ If you use this Galaxy tool in work leading to a scientific publication please cite the following papers: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 - -Christiam Camacho et al. (2009). -BLAST+: architecture and applications. -BMC Bioinformatics. 15;10:421. 
-http://dx.doi.org/10.1186/1471-2105-10-421 - -This wrapper is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus +@REFERENCES@ </help> </tool>
--- a/tools/ncbi_blast_plus/repository_dependencies.xml Thu Oct 10 11:53:57 2013 -0400 +++ b/tools/ncbi_blast_plus/repository_dependencies.xml Mon Nov 25 10:58:46 2013 -0500 @@ -1,4 +1,4 @@ <?xml version="1.0"?> <repositories description="This requires the BLAST datatype definitions (e.g. the BLAST XML format)."> - <repository changeset_revision="7ceb2ae30ff4" name="blast_datatypes" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + <repository changeset_revision="e36c60d13c94" name="blast_datatypes" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" /> </repositories>