Mercurial > repos > edward-kirton > blast
changeset 0:d9ce1081d80b default tip
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | edward-kirton |
---|---|
date | Tue, 07 Jun 2011 16:48:42 -0400 |
parents | |
children | |
files | blast/README blast/blastdb.py blast/blastdb_wrapper.sh blast/blastn.xml blast/blastp.xml blast/blastx.xml blast/dustmasker.xml blast/makeblastdb.xml blast/suite_config.xml blast/tblastn.xml blast/tblastx.xml |
diffstat | 11 files changed, 1359 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast/README Tue Jun 07 16:48:42 2011 -0400 @@ -0,0 +1,5 @@ +the blast xml files are just modified versions of those included with the galaxy distribution; they add support for blastdb files so that users can create a database and reuse it. +the makeblastdb tool was added for this purpose. dustmasker was also added to allow masking of low-complexity sequences. +additional ncbi blast+ tools will be added in the near future. + +blastdb.py goes in lib/galaxy/datatypes and must be registered in both the datatypes_conf.xml and registry.py files.
# Extracted from changeset diff: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast/blastdb.py Tue Jun 07 16:48:42 2011 -0400 @@ -0,0 +1,50 @@
"""
BLAST Database classes
"""

import data
import logging
import re
import string
from cgi import escape
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes import metadata
import galaxy.model
from galaxy import util
from sniff import *

log = logging.getLogger(__name__)


class BlastDb( data.Data ):
    """Composite datatype for BLAST database files.

    Every component file is registered in ``__init__`` under the common
    ``blastdb`` base name; three components are mandatory and five are
    optional.
    """

    file_ext = 'blastdb'
    composite_type = 'basic'

    # NOTE(review): this call passes no ``name`` for the metadata element.
    # Left exactly as found -- confirm against galaxy.datatypes.metadata
    # whether an unnamed element is intended here.
    MetadataElement( readonly=True, optional=True, visible=False, no_value=0 )

    def __init__( self, **kwd ):
        """Initialise the datatype and register its component files.

        ``kwd`` is forwarded unchanged to ``data.Data.__init__``.
        """
        data.Data.__init__( self, **kwd )
        # Registration order is kept: mandatory components first, then the
        # optional ones.
        for required_name in ( 'blastdb.nhr', 'blastdb.nin', 'blastdb.nsq' ):
            self.add_composite_file( required_name )
        for optional_name in ( 'blastdb.nhd', 'blastdb.nsi', 'blastdb.nhi',
                               'blastdb.nog', 'blastdb.nsd' ):
            self.add_composite_file( optional_name, optional=True )

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set ``dataset.peek`` and ``dataset.blurb`` to fixed descriptions.

        ``is_multi_byte`` is accepted but not used by this implementation.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Folder of multiple files"
            dataset.blurb = "Folder of multiple files"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return ``dataset.peek``, or a generic description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # Catch Exception rather than everything, so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            return "Folder of multiple files"

    def get_mime( self ):
        """Returns the mime type of the datatype"""
        return 'text/plain'
#!/bin/bash
# Extracted from changeset diff: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast/blastdb_wrapper.sh Tue Jun 07 16:48:42 2011 -0400 @@ -0,0 +1,18 @@
#
# Usage: blastdb_wrapper.sh DIR COMMAND [ARG...]
#
# Creates DIR, then runs COMMAND with its arguments, capturing stdout and
# stderr together. If COMMAND exits with status 0 the captured output is
# written to stdout and this script exits 0; otherwise the captured output
# is written to stderr and this script exits 1.
#
# NOTE(review): presumably stderr is folded into stdout because the calling
# framework treats any stderr output as a failure -- confirm with the caller.

# Quoted so that an empty or whitespace-containing first argument is tested
# as a single word.
if [ -z "$1" ]
then
    echo "Missing arguments" 1>&2
    exit 1
fi

# -p: an already existing directory is not an error and prints nothing to
# stderr. Stop early if the directory really cannot be created.
mkdir -p "$1" || exit 1
shift

# "$@" hands every remaining argument to the command exactly as received;
# an unquoted $* would re-split arguments on whitespace and expand globs.
OUT=$("$@" 2>&1)
if [ $? -ne 0 ]
then
    # Quoting keeps the line breaks of the captured output intact.
    printf '%s\n' "$OUT" 1>&2
    exit 1
else
    printf '%s\n' "$OUT"
    exit 0
fi
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast/blastn.xml Tue Jun 07 16:48:42 2011 -0400 @@ -0,0 +1,212 @@ +<tool id="blastn" name="blastn" version="0.0.1"> +<description>Search nucleotide database with nucleotide query sequence(s)</description> +<command> +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +blastn +-query "$query" +#if $db_opts.db_opts_selector == "db": + -db "$db_opts.database" +#elif $db_opts.db_opts_selector == "user_db": + -db ${os.path.join($db_opts.db.extra_files_path,'blastdb')} +#else: + -subject "$db_opts.subject" +#end if +-task $blast_type +-evalue $evalue_cutoff +-out $output1 +-outfmt "$out_format" +#if $adv_opts.adv_opts_selector=="advanced": +$adv_opts.filter_query +$adv_opts.strand +## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string +## Note -max_target_seqs overrides -num_descriptions and -num_alignments +#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): +-max_target_seqs $adv_opts.max_hits +#end if +#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): +-word_size $adv_opts.word_size +#end if +$adv_opts.ungapped +$adv_opts.parse_deflines +## End of advanced options: +#end if + </command> + <inputs> + <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> + <conditional name="db_opts"> + <param name="db_opts_selector" type="select" label="Subject database/sequences"> + <option value="db" selected="True">Precompiled BLAST Database</option> + <option value="user_db">BLAST Database in your History</option> + <option value="fasta">FASTA file</option> + </param> + <when value="db"> + <param name="database" type="select" label="Precompiled Nucleotide BLAST database"> + <!-- The BLAST loc file has three columns: + column 0 is an identifier (not used here, see legacy megablast wrapper), + column 1 is the caption (show 
this to the user), + column 2 is the database path (given to BLAST+) --> + <options from_file="blastdb.loc"> + <column name="name" index="1"/> + <column name="value" index="2"/> + </options> + </param> + <param name="subject" type="hidden" value="" /> + </when> + <when value="user_db"> + <param name="database" type="hidden" value="" /> + <param name="db" type="data" format="blastdb" label="Blast DB" /> + </when> + <when value="fasta"> + <param name="database" type="hidden" value="" /> + <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/> + </when> + </conditional> + <param name="blast_type" type="select" display="radio" label="Type of BLAST"> + <option value="megablast">megablast</option> + <option value="blastn">blastn</option> + <option value="blastn-short">blastn-short</option> + <option value="dc-megablast">dc-megablast</option> + <!-- Using BLAST 2.2.24+ this gives an error: + BLAST engine error: Program type 'vecscreen' not supported + <option value="vecscreen">vecscreen</option> + --> + </param> + <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <param name="out_format" type="select" label="Output format"> + <option value="6" selected="True">Tabular (standard 12 columns)</option> + <option value="6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq">Tabular (extended 22 columns)</option> + <option value="5">BLAST XML</option> + <option value="0">Pairwise text</option> + <option value="0 -html">Pairwise HTML</option> + <option value="2">Query-anchored text</option> + <option value="2 -html">Query-anchored HTML</option> + <option value="4">Flat query-anchored text</option> + <option value="4 -html">Flat query-anchored HTML</option> + <!-- + <option value="-outfmt 11">BLAST archive format (ASN.1)</option> + --> + </param> + <conditional name="adv_opts"> + <param name="adv_opts_selector" type="select" label="Advanced Options"> + <option 
value="basic" selected="True">Hide Advanced Options</option> + <option value="advanced">Show Advanced Options</option> + </param> + <when value="basic" /> + <when value="advanced"> + <!-- Could use a select (yes, no, other) where other allows setting 'level window linker' --> + <param name="filter_query" type="boolean" label="Filter out low complexity regions (with DUST)" truevalue="-dust yes" falsevalue="-dust no" checked="true" /> + <param name="strand" type="select" label="Query strand(s) to search against database/subject"> + <option value="-strand both">Both</option> + <option value="-strand plus">Plus (forward)</option> + <option value="-strand minus">Minus (reverse complement)</option> + </param> + <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> + <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> + <validator type="in_range" min="0" /> + </param> + <!-- I'd like word_size to be optional, with minimum 4 for blastn --> + <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 4."> + <validator type="in_range" min="0" /> + </param> + <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped" falsevalue="" checked="false" /> + <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" 
truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> + </when> + </conditional> + </inputs> + <outputs> + <data name="output1" format="tabular" label="${blast_type.value_label} on ${db_opts.db_opts_selector}"> + <change_format> + <when input="out_format" value="0" format="txt"/> + <when input="out_format" value="0 -html" format="html"/> + <when input="out_format" value="2" format="txt"/> + <when input="out_format" value="2 -html" format="html"/> + <when input="out_format" value="4" format="txt"/> + <when input="out_format" value="4 -html" format="html"/> + <when input="out_format" value="5" format="blastxml"/> + </change_format> + </data> + </outputs> + <requirements> + <requirement type="binary">blastn</requirement> + </requirements> + <tests> + </tests> + <help> + +.. class:: warningmark + +**Note**. Database searches may take a substantial amount of time. +For large input datasets it is advisable to allow overnight processing. + +----- + +**What it does** + +Search a *nucleotide database* using a *nucleotide query*, +using the NCBI BLAST+ blastn command line tool. +Algorithms include blastn, megablast, and discontiguous megablast. + +----- + +**Output format** + +Because Galaxy focuses on processing tabular data, the default output of this +tool is tabular. 
The standard BLAST+ tabular output contains 12 columns: + +====== ========= ============================================ +Column NCBI name Description +------ --------- -------------------------------------------- + 1 qseqid Query Seq-id (ID of your sequence) + 2 sseqid Subject Seq-id (ID of the database hit) + 3 pident Percentage of identical matches + 4 length Alignment length + 5 mismatch Number of mismatches + 6 gapopen Number of gap openings + 7 qstart Start of alignment in query + 8 qend End of alignment in query + 9 sstart Start of alignment in subject (database hit) + 10 send End of alignment in subject (database hit) + 11 evalue Expectation value (E-value) + 12 bitscore Bit score +====== ========= ============================================ + +The BLAST+ tools can optionally output additional columns of information, +but this takes longer to calculate. Most (but not all) of these columns are +included by selecting the extended tabular output. The extra columns are +included *after* the standard 12 columns. This is so that you can write +workflow filtering steps that accept either the 12 or 22 column tabular +BLAST output. + +====== ============= =========================================== +Column NCBI name Description +------ ------------- ------------------------------------------- + 13 sallseqid All subject Seq-id(s), separated by a ';' + 14 score Raw score + 15 nident Number of identical matches + 16 positive Number of positive-scoring matches + 17 gaps Total number of gaps + 18 ppos Percentage of positive-scoring matches + 19 qframe Query frame + 20 sframe Subject frame + 21 qseq Aligned part of query sequence + 22 sseq Aligned part of subject sequence +====== ============= =========================================== + +The third option is BLAST XML output, which is designed to be parsed by +another program, and is understood by some Galaxy tools. 
+ +You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). +The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. +The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. +The two query anchored outputs show a multiple sequence alignment between the query and all the matches, +and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). + +------- + +**References** + +Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. J Comput Biol. 7(1-2):203-214. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast/blastp.xml Tue Jun 07 16:48:42 2011 -0400 @@ -0,0 +1,259 @@ +<tool id="blastp" name="blastp" version="0.0.1"> + <description>Search protein database with protein query sequence(s)</description> +<command> +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +blastp +-query "$query" +#if $db_opts.db_opts_selector == "db": + -db "$db_opts.database" +#elif $db_opts.db_opts_selector == "user_db": +## A BLAST database from the history is a composite dataset: pass the 'blastdb' file prefix inside its extra_files_path (as blastn.xml does), not the dataset file itself + -db "${os.path.join($db_opts.db.extra_files_path,'blastdb')}" +#else: + -subject "$db_opts.subject" +#end if +-task $blast_type +-evalue $evalue_cutoff +-out $output1 +-outfmt "$out_format" +#if $adv_opts.adv_opts_selector=="advanced": +$adv_opts.filter_query +-matrix $adv_opts.matrix +## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string +## Note -max_target_seqs overrides -num_descriptions and -num_alignments +#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): +-max_target_seqs $adv_opts.max_hits +#end if +#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): +-word_size $adv_opts.word_size +#end if +##Ungapped disabled for now - see comments below +##$adv_opts.ungapped +$adv_opts.parse_deflines +## End of advanced options: +#end if + </command> + <inputs> + <param name="query" type="data" format="fasta" label="Protein query sequence(s)"/> + <conditional name="db_opts"> + <param name="db_opts_selector" type="select" label="Subject database/sequences"> + <option value="db" selected="True">Precompiled BLAST Database</option> + <option value="user_db">BLAST Database in your History</option> + <option value="fasta">FASTA file</option> + </param> + <when value="db"> + <param name="database" type="select" label="Precompiled Protein BLAST database"> + <!-- The BLAST loc file has three columns: + column 0 is an identifier (not used), + column 1 is the caption (show this to the user), + column 2 
is the database path (given to BLAST+) --> + <options from_file="blastdb_p.loc"> + <column name="name" index="1"/> + <column name="value" index="2"/> + </options> + </param> + <param name="subject" type="hidden" value="" /> + </when> + <when value="user_db"> + <param name="database" type="hidden" value="" /> + <param name="db" type="data" format="blastdb" label="Blast DB" /> + </when> + <when value="fasta"> + <param name="database" type="hidden" value="" /> + <param name="subject" type="data" format="fasta" label="Protein FASTA file to use as database"/> + </when> + </conditional> + <param name="blast_type" type="select" display="radio" label="Type of BLAST"> + <option value="blastp">blastp</option> + <option value="blastp-short">blastp-short</option> + </param> + <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <param name="out_format" type="select" label="Output format"> + <option value="6" selected="True">Tabular (standard 12 columns)</option> + <option value="6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq">Tabular (extended 22 columns)</option> + <option value="5">BLAST XML</option> + <option value="0">Pairwise text</option> + <option value="0 -html">Pairwise HTML</option> + <option value="2">Query-anchored text</option> + <option value="2 -html">Query-anchored HTML</option> + <option value="4">Flat query-anchored text</option> + <option value="4 -html">Flat query-anchored HTML</option> + <!-- + <option value="-outfmt 11">BLAST archive format (ASN.1)</option> + --> + </param> + <conditional name="adv_opts"> + <param name="adv_opts_selector" type="select" label="Advanced Options"> + <option value="basic" selected="True">Hide Advanced Options</option> + <option value="advanced">Show Advanced Options</option> + </param> + <when value="basic" /> + <when value="advanced"> + <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> + <param 
name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" /> + <param name="matrix" type="select" label="Scoring matrix"> + <option value="BLOSUM90">BLOSUM90</option> + <option value="BLOSUM80">BLOSUM80</option> + <option value="BLOSUM62" selected="true">BLOSUM62 (default)</option> + <option value="BLOSUM50">BLOSUM50</option> + <option value="BLOSUM45">BLOSUM45</option> + <option value="PAM250">PAM250</option> + <option value="PAM70">PAM70</option> + <option value="PAM30">PAM30</option> + </param> + <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> + <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> + <validator type="in_range" min="0" /> + </param> + <!-- I'd like word_size to be optional, with minimum 2 for blastp --> + <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> + <validator type="in_range" min="0" /> + </param> + <!-- + Can't use '-ungapped' on its own, error back is: + Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search + Tried using '-ungapped -comp_based_stats F' and blastp crashed with 'Attempt to access NULL pointer.' + <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" /> + --> + <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" 
truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> + </when> + </conditional> + </inputs> + <outputs> + <data name="output1" format="tabular" label="${blast_type.value_label} on ${db_opts.db_opts_selector}"> + <change_format> + <when input="out_format" value="0" format="txt"/> + <when input="out_format" value="0 -html" format="html"/> + <when input="out_format" value="2" format="txt"/> + <when input="out_format" value="2 -html" format="html"/> + <when input="out_format" value="4" format="txt"/> + <when input="out_format" value="4 -html" format="html"/> + <when input="out_format" value="5" format="blastxml"/> + </change_format> + </data> + </outputs> + <requirements> + <requirement type="binary">blastp</requirement> + </requirements> + <tests> + <!-- db_opts_selector must be one of the option values declared in inputs: db, user_db or fasta --> + <test> + <param name="query" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="db_opts_selector" value="fasta" /> + <param name="subject" value="rhodopsin_proteins.fasta" ftype="fasta" /> + <param name="database" value="" /> + <param name="evalue_cutoff" value="1e-8" /> + <param name="adv_opts_selector" value="advanced" /> + <param name="filter_query" value="True" /> + <param name="matrix" value="BLOSUM62" /> + <param name="max_hits" value="0" /> + <param name="word_size" value="0" /> + <param name="parse_deflines" value="True" /> + <output name="output1" file="blastp_four_human_vs_rhodopsin.tabular" ftype="tabular" /> + </test> + <test> + <param name="query" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="db_opts_selector" value="fasta" /> + <param name="subject" value="rhodopsin_proteins.fasta" ftype="fasta" /> + <param name="database" value="" /> + <param name="evalue_cutoff" value="1e-8" /> + <param name="blast_type" value="blastp" /> + <param name="out_format" value="6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq" /> + <param name="adv_opts_selector" value="advanced" /> + <param 
name="filter_query" value="True" /> + <param name="matrix" value="BLOSUM62" /> + <param name="max_hits" value="0" /> + <param name="word_size" value="0" /> + <param name="parse_deflines" value="True" /> + <output name="output1" file="blastp_four_human_vs_rhodopsin_22c.tabular" ftype="tabular" /> + </test> + <test> + <param name="query" value="rhodopsin_proteins.fasta" ftype="fasta" /> + <param name="db_opts_selector" value="fasta" /> + <param name="subject" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="database" value="" /> + <param name="evalue_cutoff" value="1e-8" /> + <param name="blast_type" value="blastp" /> + <param name="out_format" value="6" /> + <param name="adv_opts_selector" value="basic" /> + <output name="output1" file="blastp_rhodopsin_vs_four_human.tabular" ftype="tabular" /> + </test> + </tests> + <help> + +.. class:: warningmark + +**Note**. Database searches may take a substantial amount of time. +For large input datasets it is advisable to allow overnight processing. + +----- + +**What it does** + +Search a *protein database* using a *protein query*, +using the NCBI BLAST+ blastp command line tool. + +----- + +**Output format** + +Because Galaxy focuses on processing tabular data, the default output of this +tool is tabular. 
The standard BLAST+ tabular output contains 12 columns: + +====== ========= ============================================ +Column NCBI name Description +------ --------- -------------------------------------------- + 1 qseqid Query Seq-id (ID of your sequence) + 2 sseqid Subject Seq-id (ID of the database hit) + 3 pident Percentage of identical matches + 4 length Alignment length + 5 mismatch Number of mismatches + 6 gapopen Number of gap openings + 7 qstart Start of alignment in query + 8 qend End of alignment in query + 9 sstart Start of alignment in subject (database hit) + 10 send End of alignment in subject (database hit) + 11 evalue Expectation value (E-value) + 12 bitscore Bit score +====== ========= ============================================ + +The BLAST+ tools can optionally output additional columns of information, +but this takes longer to calculate. Most (but not all) of these columns are +included by selecting the extended tabular output. The extra columns are +included *after* the standard 12 columns. This is so that you can write +workflow filtering steps that accept either the 12 or 22 column tabular +BLAST output. + +====== ============= =========================================== +Column NCBI name Description +------ ------------- ------------------------------------------- + 13 sallseqid All subject Seq-id(s), separated by a ';' + 14 score Raw score + 15 nident Number of identical matches + 16 positive Number of positive-scoring matches + 17 gaps Total number of gaps + 18 ppos Percentage of positive-scoring matches + 19 qframe Query frame + 20 sframe Subject frame + 21 qseq Aligned part of query sequence + 22 sseq Aligned part of subject sequence +====== ============= =========================================== + +The third option is BLAST XML output, which is designed to be parsed by +another program, and is understood by some Galaxy tools. 
+ +You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). +The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. +The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. +The two query anchored outputs show a multiple sequence alignment between the query and all the matches, +and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). + +------- + +**References** + +Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. + +Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast/blastx.xml Tue Jun 07 16:48:42 2011 -0400 @@ -0,0 +1,211 @@ +<tool id="blastx" name="blastx" version="0.0.1"> + <description>Search protein database with translated nucleotide query sequence(s)</description> +<command> +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +blastx +-query "$query" +#if $db_opts.db_opts_selector == "db": + -db "$db_opts.database" +#elif $db_opts.db_opts_selector == "user_db": +## A BLAST database from the history is a composite dataset: pass the 'blastdb' file prefix inside its extra_files_path (as blastn.xml does), not the dataset file itself + -db "${os.path.join($db_opts.db.extra_files_path,'blastdb')}" +#else: + -subject "$db_opts.subject" +#end if +-evalue $evalue_cutoff +-out $output1 +-outfmt "$out_format" +#if $adv_opts.adv_opts_selector=="advanced": +$adv_opts.filter_query +$adv_opts.strand +-matrix $adv_opts.matrix +## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string +## Note -max_target_seqs overrides -num_descriptions and -num_alignments +#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): +-max_target_seqs $adv_opts.max_hits +#end if +#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): +-word_size $adv_opts.word_size +#end if +$adv_opts.ungapped +$adv_opts.parse_deflines +## End of advanced options: +#end if + </command> + <inputs> + <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> + <conditional name="db_opts"> + <param name="db_opts_selector" type="select" label="Subject database/sequences"> + <option value="db" selected="True">Precompiled BLAST Database</option> + <option value="user_db">BLAST Database in your History</option> + <option value="fasta">FASTA file</option> + </param> + <when value="db"> + <param name="database" type="select" label="Precompiled Protein BLAST database"> + <!-- The BLAST loc file has three columns: + column 0 is an identifier (not used), + column 1 is the caption (show this to the user), + column 2 is the database path (given to 
BLAST+) --> + <options from_file="blastdb_p.loc"> + <column name="name" index="1"/> + <column name="value" index="2"/> + </options> + </param> + <param name="subject" type="hidden" value="" /> + </when> + <when value="user_db"> + <param name="database" type="hidden" value="" /> + <param name="db" type="data" format="blastdb" label="Blast DB" /> + </when> + <when value="fasta"> + <param name="database" type="hidden" value="" /> + <param name="subject" type="data" format="fasta" label="Protein FASTA file to use as database"/> + </when> + </conditional> + <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <param name="out_format" type="select" label="Output format"> + <option value="6" selected="True">Tabular (standard 12 columns)</option> + <option value="6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq">Tabular (extended 22 columns)</option> + <option value="5">BLAST XML</option> + <option value="0">Pairwise text</option> + <option value="0 -html">Pairwise HTML</option> + <option value="2">Query-anchored text</option> + <option value="2 -html">Query-anchored HTML</option> + <option value="4">Flat query-anchored text</option> + <option value="4 -html">Flat query-anchored HTML</option> + <!-- + <option value="-outfmt 11">BLAST archive format (ASN.1)</option> + --> + </param> + <conditional name="adv_opts"> + <param name="adv_opts_selector" type="select" label="Advanced Options"> + <option value="basic" selected="True">Hide Advanced Options</option> + <option value="advanced">Show Advanced Options</option> + </param> + <when value="basic" /> + <when value="advanced"> + <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> + <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="true" /> + <param name="strand" type="select" label="Query strand(s) to search against 
database/subject"> + <option value="-strand both">Both</option> + <option value="-strand plus">Plus (forward)</option> + <option value="-strand minus">Minus (reverse complement)</option> + </param> + <param name="matrix" type="select" label="Scoring matrix"> + <option value="BLOSUM90">BLOSUM90</option> + <option value="BLOSUM80">BLOSUM80</option> + <option value="BLOSUM62" selected="true">BLOSUM62 (default)</option> + <option value="BLOSUM50">BLOSUM50</option> + <option value="BLOSUM45">BLOSUM45</option> + <option value="PAM250">PAM250</option> + <option value="PAM70">PAM70</option> + <option value="PAM30">PAM30</option> + </param> + <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> + <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> + <validator type="in_range" min="0" /> + </param> + <!-- I'd like word_size to be optional, with minimum 2 for blastx --> + <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> + <validator type="in_range" min="0" /> + </param> + <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped" falsevalue="" checked="false" /> + <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" 
truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> + </when> + </conditional> + </inputs> + <outputs> + <data name="output1" format="tabular" label="blastx on ${db_opts.db_opts_selector}"> + <change_format> + <when input="out_format" value="0" format="txt"/> + <when input="out_format" value="0 -html" format="html"/> + <when input="out_format" value="2" format="txt"/> + <when input="out_format" value="2 -html" format="html"/> + <when input="out_format" value="4" format="txt"/> + <when input="out_format" value="4 -html" format="html"/> + <when input="out_format" value="5" format="blastxml"/> + </change_format> + </data> + </outputs> + <requirements> + <requirement type="binary">blastx</requirement> + </requirements> + <tests> + </tests> + <help> + +.. class:: warningmark + +**Note**. Database searches may take a substantial amount of time. +For large input datasets it is advisable to allow overnight processing. + +----- + +**What it does** + +Search a *protein database* using a *translated nucleotide query*, +using the NCBI BLAST+ blastx command line tool. + +----- + +**Output format** + +Because Galaxy focuses on processing tabular data, the default output of this +tool is tabular. 
The standard BLAST+ tabular output contains 12 columns: + +====== ========= ============================================ +Column NCBI name Description +------ --------- -------------------------------------------- + 1 qseqid Query Seq-id (ID of your sequence) + 2 sseqid Subject Seq-id (ID of the database hit) + 3 pident Percentage of identical matches + 4 length Alignment length + 5 mismatch Number of mismatches + 6 gapopen Number of gap openings + 7 qstart Start of alignment in query + 8 qend End of alignment in query + 9 sstart Start of alignment in subject (database hit) + 10 send End of alignment in subject (database hit) + 11 evalue Expectation value (E-value) + 12 bitscore Bit score +====== ========= ============================================ + +The BLAST+ tools can optionally output additional columns of information, +but this takes longer to calculate. Most (but not all) of these columns are +included by selecting the extended tabular output. The extra columns are +included *after* the standard 12 columns. This is so that you can write +workflow filtering steps that accept either the 12 or 22 column tabular +BLAST output. + +====== ============= =========================================== +Column NCBI name Description +------ ------------- ------------------------------------------- + 13 sallseqid All subject Seq-id(s), separated by a ';' + 14 score Raw score + 15 nident Number of identical matches + 16 positive Number of positive-scoring matches + 17 gaps Total number of gaps + 18 ppos Percentage of positive-scoring matches + 19 qframe Query frame + 20 sframe Subject frame + 21 qseq Aligned part of query sequence + 22 sseq Aligned part of subject sequence +====== ============= =========================================== + +The third option is BLAST XML output, which is designed to be parsed by +another program, and is understood by some Galaxy tools. 
+ +You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). +The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. +The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. +The two query anchored outputs show a multiple sequence alignment between the query and all the matches, +and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). + +------- + +**References** + +Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blast/dustmasker.xml Tue Jun 07 16:48:42 2011 -0400
@@ -0,0 +1,46 @@
+<tool id="dustmasker" name="dustmasker" version="0.0.1">
+<description>Low complexity region masker</description>
+<!-- Runs NCBI dustmasker on either a FASTA dataset or a BLAST database from the history.
+     blastdb_wrapper.sh receives the output's extra_files_path followed by the dustmasker command line. -->
+<command interpreter='bash'>blastdb_wrapper.sh $outfile.extra_files_path
+dustmasker -outfmt seqloc_asn1_text -out ${os.path.join($outfile.extra_files_path,'blastdb')}
+#if $in.fmt == 'blastdb':
+-infmt blastdb -in ${os.path.join($in.file.extra_files_path,'blastdb')}
+#else:
+-infmt fasta -in $in.file -parse_seqids
+#end if
+-window $window -level $level -linker $linker
+</command>
+<inputs>
+    <conditional name="in">
+        <param name="fmt" type="select" label="Input format">
+            <option value="blastdb">BLAST Database</option>
+            <option value="fasta">Fasta Database</option>
+        </param>
+        <when value="blastdb">
+            <param name="file" type="data" format="blastdb" label="BLAST database" />
+        </when>
+        <when value="fasta">
+            <param name="file" type="data" format="fasta" label="FASTA file" />
+        </when>
+    </conditional>
+    <param name="window" type="integer" value="64" label="DUST window length" />
+    <param name="level" type="integer" value="20" label="DUST level" help="Score threshold for subwindows" />
+    <param name="linker" type="integer" value="1" label="DUST linker" help="How close masked intervals should be to get merged together" />
+</inputs>
+<outputs>
+    <data name="outfile" format="asn1" />
+</outputs>
+<requirements>
+    <requirement type="binary">dustmasker</requirement>
+</requirements>
+<help>
+**What it does**
+
+Low complexity region masker based on Symmetric DUST algorithm
+
+**Documentation**
+
+http://www.ncbi.nlm.nih.gov/books/NBK1763/
+</help>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blast/makeblastdb.xml Tue Jun 07 16:48:42 2011 -0400
@@ -0,0 +1,102 @@
+<tool id="makeblastdb" name="makeblastdb" version="0.0.1">
+<description>Make BLAST database</description>
+<!-- Builds a BLAST database in the output dataset's extra_files_path using the base name 'blastdb';
+     the makeblastdb log file is written to the output dataset itself. -->
+<command interpreter="bash">blastdb_wrapper.sh $outfile.extra_files_path
+makeblastdb -logfile $outfile -out ${os.path.join($outfile.extra_files_path,'blastdb')}
+-parse_seqids
+$hash_index
+## All input files are passed as ONE quoted, space-separated -in argument
+#set $in_files = []
+#for $i in $in
+#silent $in_files.append(str($i.file))
+#end for
+#if $in_files:
+#set $in_arg = ' '.join($in_files)
+-in "$in_arg"
+#end if
+## Skip -title when it is blank, otherwise the following option would be consumed as the title
+#if str($title).strip():
+-title "$title"
+#end if
+-dbtype $dbtype
+## -mask_data and -gi_mask_name each take a single comma-separated list
+#set $mask_files = []
+#for $i in $mask_data
+#silent $mask_files.append(str($i.file))
+#end for
+#if $mask_files:
+#set $mask_arg = ','.join($mask_files)
+-mask_data "$mask_arg"
+#end if
+#set $gi_mask_files = []
+#for $i in $gi_mask
+#silent $gi_mask_files.append(str($i.file))
+#end for
+#if $gi_mask_files:
+#set $gi_mask_arg = ','.join($gi_mask_files)
+-gi_mask -gi_mask_name "$gi_mask_arg"
+#end if
+#if $tax.select == 'id':
+-taxid $tax.id
+#elif $tax.select == 'map':
+-taxid_map $tax.file
+#end if
+</command>
+<inputs>
+    <repeat name="in" title="Blast or Fasta Database">
+        <param name="file" type="data" format="fasta,blastdb" label="Blast or Fasta database" />
+    </repeat>
+    <param name="title" type="text" value="" label="Title for BLAST database" />
+    <param name="dbtype" type="select" display="radio" label="[-dbtype] Molecule type of input">
+        <option value="prot">protein</option>
+        <option value="nucl">nucleotide</option>
+    </param>
+    <param name="hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="[-hash_index] Enables the creation of sequence hash values. These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." />
+
+    <!-- SEQUENCE MASKING OPTIONS -->
+    <repeat name="mask_data" title="[-mask_data] Provide one or more files containing masking data">
+        <param name="file" type="data" format="asnb" label="File containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" />
+    </repeat>
+    <repeat name="gi_mask" title="[-gi_mask_name] Create GI indexed masking data">
+        <param name="file" type="data" format="asnb" label="Masking data output file" />
+    </repeat>
+
+    <!-- TAXONOMY OPTIONS -->
+    <conditional name="tax">
+        <param name="select" type="select" label="Taxonomy options">
+            <option value="">Do not assign sequences to Taxonomy IDs</option>
+            <option value="id">[-taxid] Assign all sequences to one Taxonomy ID</option>
+            <option value="map">[-taxid_map] Supply text file mapping sequence IDs to taxonomy IDs</option>
+        </param>
+        <when value="">
+        </when>
+        <when value="id">
+            <param name="id" type="integer" value="0" label="NCBI taxonomy ID" help="Integer >=0" />
+        </when>
+        <when value="map">
+            <param name="file" type="data" format="txt" label="Seq ID : Tax ID mapping file" help="Format: SequenceId TaxonomyId" />
+        </when>
+    </conditional>
+</inputs>
+<outputs>
+    <data name="outfile" format="blastdb" />
+</outputs>
+<requirements>
+    <requirement type="binary">makeblastdb</requirement>
+</requirements>
+<help>
+**What it does**
+
+Make BLAST database from one or more FASTA files and/or BLAST databases.
+This application serves as a replacement for formatdb.
+
+Applying masks to an existing BLAST database will not change the original database; a new database will be created.
+For this reason, it's best to apply all masks at once to minimize the number of unnecessary intermediate databases.
+
+
+**Documentation**
+
+http://www.ncbi.nlm.nih.gov/books/NBK1763/
+</help>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blast/suite_config.xml Tue Jun 07 16:48:42 2011 -0400
@@ -0,0 +1,24 @@
+<!-- Suite manifest: names the suite and lists each of its seven tools with id, name, version and a short description. -->
+<suite id="blast" name="Modified NCBI Blast+ Tools" version="1.0.0">
+    <description>Modified Galaxy wrappers add support for makeblastdb files and add dustmasker</description>
+    <tool id="blastn" name="blastn" version="0.0.1">
+        <description>blastn with blastdb support</description>
+    </tool>
+    <tool id="blastp" name="blastp" version="0.0.1">
+        <description>blastp with blastdb support</description>
+    </tool>
+    <tool id="blastx" name="blastx" version="0.0.1">
+        <description>blastx with blastdb support</description>
+    </tool>
+    <tool id="tblastn" name="tblastn" version="0.0.1">
+        <description>tblastn with blastdb support</description>
+    </tool>
+    <tool id="tblastx" name="tblastx" version="0.0.1">
+        <description>tblastx with blastdb support</description>
+    </tool>
+    <tool id="makeblastdb" name="makeblastdb" version="0.0.1">
+        <description>Make blast Db file</description>
+    </tool>
+    <tool id="dustmasker" name="dustmasker" version="0.0.1">
+        <description>dust masking of blast db file</description>
+    </tool>
+</suite>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blast/tblastn.xml Tue Jun 07 16:48:42 2011 -0400
@@ -0,0 +1,244 @@
+<tool id="tblastn" name="tblastn" version="0.0.1">
+    <description>Search translated nucleotide database with protein query sequence(s)</description>
+<command>
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+tblastn
+-query "$query"
+#if $db_opts.db_opts_selector == "db":
+  -db "$db_opts.database"
+#elif $db_opts.db_opts_selector == "user_db":
+## A history BLAST database keeps its files in the dataset's extra_files_path under the
+## base name 'blastdb' (the layout makeblastdb.xml writes), not in the dataset file itself
+  -db "${os.path.join($db_opts.db.extra_files_path,'blastdb')}"
+#else:
+  -subject "$db_opts.subject"
+#end if
+-evalue $evalue_cutoff
+-out $output1
+-outfmt "$out_format"
+#if $adv_opts.adv_opts_selector=="advanced":
+$adv_opts.filter_query
+-matrix $adv_opts.matrix
+## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
+## Note -max_target_seqs overrides -num_descriptions and -num_alignments
+#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
+-max_target_seqs $adv_opts.max_hits
+#end if
+#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
+-word_size $adv_opts.word_size
+#end if
+##Ungapped disabled for now - see comments below
+##$adv_opts.ungapped
+$adv_opts.parse_deflines
+## End of advanced options:
+#end if
+    </command>
+    <inputs>
+        <param name="query" type="data" format="fasta" label="Protein query sequence(s)"/>
+        <conditional name="db_opts">
+            <param name="db_opts_selector" type="select" label="Subject database/sequences">
+              <option value="db" selected="True">Precompiled BLAST Database</option>
+              <option value="user_db">BLAST Database in your History</option>
+              <option value="fasta">FASTA file</option>
+            </param>
+            <when value="db">
+                <param name="database" type="select" label="Precompiled Nucleotide BLAST database">
+                    <!-- The BLAST loc file has three columns:
+                    column 0 is an identifier (not used here, see legacy megablast wrapper),
+                    column 1 is the caption (show this to the user),
+                    column 2 is the database path (given to BLAST+) -->
+                    <options from_file="blastdb.loc">
+                      <column name="name" index="1"/>
+                      <column name="value" index="2"/>
+                    </options>
+                </param>
+                <param name="subject" type="hidden" value="" />
+            </when>
+            <when value="user_db">
+                <param name="database" type="hidden" value="" />
+                <param name="db" type="data" format="blastdb" label="Blast DB" />
+            </when>
+            <when value="fasta">
+                <param name="database" type="hidden" value="" />
+                <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/>
+            </when>
+        </conditional>
+        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" />
+        <param name="out_format" type="select" label="Output format">
+            <option value="6" selected="True">Tabular (standard 12 columns)</option>
+            <option value="6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq">Tabular (extended 22 columns)</option>
+            <option value="5">BLAST XML</option>
+            <option value="0">Pairwise text</option>
+            <option value="0 -html">Pairwise HTML</option>
+            <option value="2">Query-anchored text</option>
+            <option value="2 -html">Query-anchored HTML</option>
+            <option value="4">Flat query-anchored text</option>
+            <option value="4 -html">Flat query-anchored HTML</option>
+            <!--
+            <option value="-outfmt 11">BLAST archive format (ASN.1)</option>
+            -->
+        </param>
+        <conditional name="adv_opts">
+            <param name="adv_opts_selector" type="select" label="Advanced Options">
+              <option value="basic" selected="True">Hide Advanced Options</option>
+              <option value="advanced">Show Advanced Options</option>
+            </param>
+            <when value="basic" />
+            <when value="advanced">
+                <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' -->
+                <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="true" />
+                <param name="matrix" type="select" label="Scoring matrix">
+                    <option value="BLOSUM90">BLOSUM90</option>
+                    <option value="BLOSUM80">BLOSUM80</option>
+                    <option value="BLOSUM62" selected="true">BLOSUM62 (default)</option>
+                    <option value="BLOSUM50">BLOSUM50</option>
+                    <option value="BLOSUM45">BLOSUM45</option>
+                    <option value="PAM250">PAM250</option>
+                    <option value="PAM70">PAM70</option>
+                    <option value="PAM30">PAM30</option>
+                </param>
+                <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer -->
+                <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits">
+                    <validator type="in_range" min="0" />
+                </param>
+                <!-- I'd like word_size to be optional, with minimum 2 for tblastn -->
+                <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2.">
+                    <validator type="in_range" min="0" />
+                </param>
+                <!--
+                Can't use '-ungapped' on its own, error back is:
+                Composition-adjusted searched are not supported with an ungapped search, please add -comp_based_stats F or do a gapped search
+                Tried using '-ungapped -comp_based_stats F' and tblastn crashed with 'Attempt to access NULL pointer.'
+                <param name="ungapped" type="boolean" label="Perform ungapped alignment only?" truevalue="-ungapped -comp_based_stats F" falsevalue="" checked="false" />
+                -->
+                <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output1" format="tabular" label="tblastn on ${db_opts.db_opts_selector}">
+            <change_format>
+                <when input="out_format" value="0" format="txt"/>
+                <when input="out_format" value="0 -html" format="html"/>
+                <when input="out_format" value="2" format="txt"/>
+                <when input="out_format" value="2 -html" format="html"/>
+                <when input="out_format" value="4" format="txt"/>
+                <when input="out_format" value="4 -html" format="html"/>
+                <when input="out_format" value="5" format="blastxml"/>
+            </change_format>
+        </data>
+    </outputs>
+    <requirements>
+        <requirement type="binary">tblastn</requirement>
+    </requirements>
+    <tests>
+        <test>
+            <param name="query" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="db_opts_selector" value="fasta" />
+            <param name="subject" value="rhodopsin_nucs.fasta" ftype="fasta" />
+            <param name="database" value="" />
+            <param name="evalue_cutoff" value="1e-10" />
+            <param name="out_format" value="6" />
+            <param name="adv_opts_selector" value="advanced" />
+            <param name="filter_query" value="false" />
+            <param name="matrix" value="BLOSUM80" />
+            <param name="max_hits" value="0" />
+            <param name="word_size" value="0" />
+            <param name="parse_deflines" value="false" />
+            <output name="output1" file="tblastn_four_human_vs_rhodopsin.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <!-- Same as above, but parse deflines -->
+            <param name="query" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="db_opts_selector" value="fasta" />
+            <param name="subject" value="rhodopsin_nucs.fasta" ftype="fasta" />
+            <param name="database" value="" />
+            <param name="evalue_cutoff" value="1e-10" />
+            <param name="out_format" value="6" />
+            <param name="adv_opts_selector" value="advanced" />
+            <param name="filter_query" value="false" />
+            <param name="matrix" value="BLOSUM80" />
+            <param name="max_hits" value="0" />
+            <param name="word_size" value="0" />
+            <param name="parse_deflines" value="true" />
+            <output name="output1" file="tblastn_four_human_vs_rhodopsin_parse_deflines.tabular" ftype="tabular" />
+        </test>
+    </tests>
+    <help>
+
+.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time.
+For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+Search a *translated nucleotide database* using a *protein query*,
+using the NCBI BLAST+ tblastn command line tool.
+
+-----
+
+**Output format**
+
+Because Galaxy focuses on processing tabular data, the default output of this
+tool is tabular. The standard BLAST+ tabular output contains 12 columns:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+     1 qseqid    Query Seq-id (ID of your sequence)
+     2 sseqid    Subject Seq-id (ID of the database hit)
+     3 pident    Percentage of identical matches
+     4 length    Alignment length
+     5 mismatch  Number of mismatches
+     6 gapopen   Number of gap openings
+     7 qstart    Start of alignment in query
+     8 qend      End of alignment in query
+     9 sstart    Start of alignment in subject (database hit)
+    10 send      End of alignment in subject (database hit)
+    11 evalue    Expectation value (E-value)
+    12 bitscore  Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 22 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name     Description
+------ ------------- -------------------------------------------
+    13 sallseqid     All subject Seq-id(s), separated by a ';'
+    14 score         Raw score
+    15 nident        Number of identical matches
+    16 positive      Number of positive-scoring matches
+    17 gaps          Total number of gaps
+    18 ppos          Percentage of positive-scoring matches
+    19 qframe        Query frame
+    20 sframe        Subject frame
+    21 qseq          Aligned part of query sequence
+    22 sseq          Aligned part of subject sequence
+====== ============= ===========================================
+
+The third option is BLAST XML output, which is designed to be parsed by
+another program, and is understood by some Galaxy tools.
+
+You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
+The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
+The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
+The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
+and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+
+-------
+
+**References**
+
+Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.
+
+    </help>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/blast/tblastx.xml Tue Jun 07 16:48:42 2011 -0400
@@ -0,0 +1,211 @@
+<tool id="tblastx" name="tblastx" version="0.0.1">
+    <description>Search translated nucleotide database with translated nucleotide query sequence(s)</description>
+<command>
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+tblastx
+-query "$query"
+#if $db_opts.db_opts_selector == "db":
+  -db "$db_opts.database"
+#elif $db_opts.db_opts_selector == "user_db":
+## A history BLAST database keeps its files in the dataset's extra_files_path under the
+## base name 'blastdb' (the layout makeblastdb.xml writes), not in the dataset file itself
+  -db "${os.path.join($db_opts.db.extra_files_path,'blastdb')}"
+#else:
+  -subject "$db_opts.subject"
+#end if
+-evalue $evalue_cutoff
+-out $output1
+-outfmt "$out_format"
+#if $adv_opts.adv_opts_selector=="advanced":
+$adv_opts.filter_query
+$adv_opts.strand
+-matrix $adv_opts.matrix
+## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string
+## Note -max_target_seqs overrides -num_descriptions and -num_alignments
+#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0):
+-max_target_seqs $adv_opts.max_hits
+#end if
+#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0):
+-word_size $adv_opts.word_size
+#end if
+$adv_opts.parse_deflines
+## End of advanced options:
+#end if
+    </command>
+    <inputs>
+        <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
+        <conditional name="db_opts">
+            <param name="db_opts_selector" type="select" label="Subject database/sequences">
+              <option value="db" selected="True">Precompiled BLAST Database</option>
+              <option value="user_db">BLAST Database in your History</option>
+              <!-- The value must match the <when value="fasta"> case below -->
+              <option value="fasta">FASTA file</option>
+            </param>
+            <when value="db">
+                <param name="database" type="select" label="Precompiled Nucleotide BLAST database">
+                    <!-- The BLAST loc file has three columns:
+                    column 0 is an identifier (not used here, see legacy megablast wrapper),
+                    column 1 is the caption (show this to the user),
+                    column 2 is the database path (given to BLAST+) -->
+                    <options from_file="blastdb.loc">
+                      <column name="name" index="1"/>
+                      <column name="value" index="2"/>
+                    </options>
+                </param>
+                <param name="subject" type="hidden" value="" />
+            </when>
+            <when value="user_db">
+                <param name="database" type="hidden" value="" />
+                <param name="db" type="data" format="blastdb" label="Blast DB" />
+            </when>
+            <when value="fasta">
+                <param name="database" type="hidden" value="" />
+                <param name="subject" type="data" format="fasta" label="Nucleotide FASTA file to use as database"/>
+            </when>
+        </conditional>
+        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" />
+        <param name="out_format" type="select" label="Output format">
+            <option value="6" selected="True">Tabular (standard 12 columns)</option>
+            <option value="6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq">Tabular (extended 22 columns)</option>
+            <option value="5">BLAST XML</option>
+            <option value="0">Pairwise text</option>
+            <option value="0 -html">Pairwise HTML</option>
+            <option value="2">Query-anchored text</option>
+            <option value="2 -html">Query-anchored HTML</option>
+            <option value="4">Flat query-anchored text</option>
+            <option value="4 -html">Flat query-anchored HTML</option>
+            <!--
+            <option value="-outfmt 11">BLAST archive format (ASN.1)</option>
+            -->
+        </param>
+        <conditional name="adv_opts">
+            <param name="adv_opts_selector" type="select" label="Advanced Options">
+              <option value="basic" selected="True">Hide Advanced Options</option>
+              <option value="advanced">Show Advanced Options</option>
+            </param>
+            <when value="basic" />
+            <when value="advanced">
+                <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' -->
+                <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="true" />
+                <param name="strand" type="select" label="Query strand(s) to search against database/subject">
+                    <option value="-strand both">Both</option>
+                    <option value="-strand plus">Plus (forward)</option>
+                    <option value="-strand minus">Minus (reverse complement)</option>
+                </param>
+                <param name="matrix" type="select" label="Scoring matrix">
+                    <option value="BLOSUM90">BLOSUM90</option>
+                    <option value="BLOSUM80">BLOSUM80</option>
+                    <option value="BLOSUM62" selected="true">BLOSUM62 (default)</option>
+                    <option value="BLOSUM50">BLOSUM50</option>
+                    <option value="BLOSUM45">BLOSUM45</option>
+                    <option value="PAM250">PAM250</option>
+                    <option value="PAM70">PAM70</option>
+                    <option value="PAM30">PAM30</option>
+                </param>
+                <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer -->
+                <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits">
+                    <validator type="in_range" min="0" />
+                </param>
+                <!-- I'd like word_size to be optional, with minimum 2 for tblastx -->
+                <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2.">
+                    <validator type="in_range" min="0" />
+                </param>
+                <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output1" format="tabular" label="tblastx on ${db_opts.db_opts_selector}">
+            <change_format>
+                <when input="out_format" value="0" format="txt"/>
+                <when input="out_format" value="0 -html" format="html"/>
+                <when input="out_format" value="2" format="txt"/>
+                <when input="out_format" value="2 -html" format="html"/>
+                <when input="out_format" value="4" format="txt"/>
+                <when input="out_format" value="4 -html" format="html"/>
+                <when input="out_format" value="5" format="blastxml"/>
+            </change_format>
+        </data>
+    </outputs>
+    <requirements>
+        <requirement type="binary">tblastx</requirement>
+    </requirements>
+    <tests>
+    </tests>
+    <help>
+
+.. class:: warningmark
+
+**Note**. Database searches may take a substantial amount of time.
+For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+Search a *translated nucleotide database* using a *translated nucleotide query*,
+using the NCBI BLAST+ tblastx command line tool.
+
+-----
+
+**Output format**
+
+Because Galaxy focuses on processing tabular data, the default output of this
+tool is tabular. The standard BLAST+ tabular output contains 12 columns:
+
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+     1 qseqid    Query Seq-id (ID of your sequence)
+     2 sseqid    Subject Seq-id (ID of the database hit)
+     3 pident    Percentage of identical matches
+     4 length    Alignment length
+     5 mismatch  Number of mismatches
+     6 gapopen   Number of gap openings
+     7 qstart    Start of alignment in query
+     8 qend      End of alignment in query
+     9 sstart    Start of alignment in subject (database hit)
+    10 send      End of alignment in subject (database hit)
+    11 evalue    Expectation value (E-value)
+    12 bitscore  Bit score
+====== ========= ============================================
+
+The BLAST+ tools can optionally output additional columns of information,
+but this takes longer to calculate. Most (but not all) of these columns are
+included by selecting the extended tabular output. The extra columns are
+included *after* the standard 12 columns. This is so that you can write
+workflow filtering steps that accept either the 12 or 22 column tabular
+BLAST output.
+
+====== ============= ===========================================
+Column NCBI name     Description
+------ ------------- -------------------------------------------
+    13 sallseqid     All subject Seq-id(s), separated by a ';'
+    14 score         Raw score
+    15 nident        Number of identical matches
+    16 positive      Number of positive-scoring matches
+    17 gaps          Total number of gaps
+    18 ppos          Percentage of positive-scoring matches
+    19 qframe        Query frame
+    20 sframe        Subject frame
+    21 qseq          Aligned part of query sequence
+    22 sseq          Aligned part of subject sequence
+====== ============= ===========================================
+
+The third option is BLAST XML output, which is designed to be parsed by
+another program, and is understood by some Galaxy tools.
+
+You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program).
+The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website.
+The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query.
+The two query anchored outputs show a multiple sequence alignment between the query and all the matches,
+and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences).
+
+-------
+
+**References**
+
+Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402.
+
+    </help>
+</tool>