Mercurial > repos > peterjc > ncbi_blast_plus
changeset 7:08be1b2b0bf6 draft
Uploaded v0.0.19 preview take 4, added missing wrapper for RPS-TBLASTN
author | peterjc |
---|---|
date | Thu, 25 Apr 2013 09:05:13 -0400 |
parents | e77e30f1deeb |
children | e710b9446493 |
files | tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml |
diffstat | 1 files changed, 237 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml Thu Apr 25 09:05:13 2013 -0400 @@ -0,0 +1,237 @@ +<tool id="ncbi_rpstblastn_wrapper" name="NCBI BLAST+ rpstblastn" version="0.0.3"> + <description>Search protein domain database (PSSMs) with translated nucleotide query sequence(s)</description> + <!-- If job splitting is enabled, break up the query file into parts --> + <parallelism method="multi" split_inputs="query" split_mode="to_size" split_size="1000" shared_inputs="subject" merge_outputs="output1"></parallelism> + <requirements> + <requirement type="binary">rpstblastn</requirement> + <requirement type="package" version="2.2.26+">blast+</requirement> + </requirements> + <version_command>rpstblastn -version</version_command> + <command> +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +rpstblastn +-query "$query" +#if $db_opts.db_opts_selector == "db": + -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" +#end if +-evalue $evalue_cutoff +-out "$output1" +##Set the extended list here so if/when we add things, saved workflows are not affected +#if str($out_format)=="ext": + -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" +#else: + -outfmt $out_format +#end if +##Seems rpstblastn does not currently support multiple threads :( +##-num_threads 8 +#if $adv_opts.adv_opts_selector=="advanced": +$adv_opts.filter_query +## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string +## Note -max_target_seqs overrides -num_descriptions and -num_alignments +#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): +-max_target_seqs $adv_opts.max_hits +#end if +#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): +-word_size $adv_opts.word_size +#end if +$adv_opts.parse_deflines +## End of advanced options: +#end if + </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + <!-- In case the return code has not been set propery check stderr too --> + <regex match="Error:" /> + <regex match="Exception:" /> + </stdio> + <inputs> + <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> + <conditional name="db_opts"> + <param name="db_opts_selector" type="select" label="Protein domain database (PSSM)"> + <option value="db" selected="True">Locally installed BLAST database</option> + <!-- TODO - define new datatype + <option value="histdb">BLAST protein domain database from your history</option> + --> + </param> + <when value="db"> + <param name="database" type="select" label="Protein domain database"> + <options from_file="blastdb_d.loc"> + <column name="value" index="0"/> + <column name="name" index="1"/> + <column name="path" index="2"/> + </options> + </param> + <param name="histdb" type="hidden" value="" /> + <param name="subject" type="hidden" value="" /> + </when> + <!-- TODO - define new datatype + <when value="histdb"> + <param name="database" type="hidden" value="" /> + <param name="histdb" type="data" format="blastdbd" label="Protein domain database" /> + <param name="subject" type="hidden" value="" /> + </when> + --> + </conditional> + <param name="evalue_cutoff" type="float" size="15" value="0.001" label="Set expectation value cutoff" /> + <param name="out_format" type="select" label="Output format"> + <option value="6">Tabular (standard 12 columns)</option> + <option value="ext" selected="True">Tabular (extended 24 columns)</option> + <option value="5">BLAST XML</option> + <option value="0">Pairwise text</option> + <option value="0 -html">Pairwise HTML</option> + <option value="2">Query-anchored text</option> + <option value="2 -html">Query-anchored HTML</option> + <option value="4">Flat query-anchored text</option> + <option value="4 -html">Flat query-anchored HTML</option> + <!-- + <option value="-outfmt 11">BLAST archive format (ASN.1)</option> + --> + </param> + <conditional name="adv_opts"> + <param name="adv_opts_selector" type="select" label="Advanced Options"> + <option value="basic" selected="True">Hide Advanced Options</option> + <option value="advanced">Show Advanced Options</option> + </param> + <when value="basic" /> + <when value="advanced"> + <!-- Could use a select (yes, no, other) where other allows setting 'window locut hicut' --> + <param name="filter_query" type="boolean" label="Filter out low complexity regions (with SEG)" truevalue="-seg yes" falsevalue="-seg no" checked="false" /> + <!-- Why doesn't optional override a validator? I want to accept an empty string OR a non-negative integer --> + <param name="max_hits" type="integer" value="0" label="Maximum hits to show" help="Use zero for default limits"> + <validator type="in_range" min="0" /> + </param> + <!-- I'd like word_size to be optional, with minimum 2 for rpsblast --> + <param name="word_size" type="integer" value="0" label="Word size for wordfinder algorithm" help="Use zero for default, otherwise minimum 2."> + <validator type="in_range" min="0" /> + </param> + <param name="parse_deflines" type="boolean" label="Should the query and subject defline(s) be parsed?" truevalue="-parse_deflines" falsevalue="" checked="false" help="This affects the formatting of the query/subject ID strings"/> + </when> + </conditional> + </inputs> + <outputs> + <data name="output1" format="tabular" label="RPS-TBLASTN on ${db_opts.db_opts_selector}"> + <change_format> + <when input="out_format" value="0" format="txt"/> + <when input="out_format" value="0 -html" format="html"/> + <when input="out_format" value="2" format="txt"/> + <when input="out_format" value="2 -html" format="html"/> + <when input="out_format" value="4" format="txt"/> + <when input="out_format" value="4 -html" format="html"/> + <when input="out_format" value="5" format="blastxml"/> + </change_format> + </data> + </outputs> + <help> + +.. class:: warningmark + +**Note**. Database searches may take a substantial amount of time. +For large input datasets it is advisable to allow overnight processing. + +----- + +**What it does** + +Search a *protein domain database* using a *nucleotide query*, +using the NCBI BLAST+ rpstblastn command line tool. + +The protein domain databases use position-specific scoring matrices +(PSSMs) and are available for a number of domain collections including: + +*CDD* - NCBI curarated meta-collection of domains, see +http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#NCBI_curated_domains + +*Kog* - PSSMs from automatically aligned sequences and sequence +fragments classified in the KOGs resource, the eukaryotic +counterpart to COGs, see http://www.ncbi.nlm.nih.gov/COG/new/ + +*Cog* - PSSMs from automatically aligned sequences and sequence +fragments classified in the COGs resource, which focuses primarily +on prokaryotes, see http://www.ncbi.nlm.nih.gov/COG/new/ + +*Pfam* - PSSMs from Pfam-A seed alignment database, see +http://pfam.sanger.ac.uk/ + +*Smart* - PSSMs from SMART domain alignment database, see +http://smart.embl-heidelberg.de/ + +*Tigr* - PSSMs from TIGRFAM database of protein families, see +http://www.jcvi.org/cms/research/projects/tigrfams/overview/ + +*Prk* - PSSms from automatically aligned stable clusters in the +Protein Clusters database, see +http://www.ncbi.nlm.nih.gov/proteinclusters?cmd=search&db=proteinclusters + +The exact list of domain databases offered will depend on how your +local Galaxy has been configured. + +----- + +**Output format** + +Because Galaxy focuses on processing tabular data, the default output of this +tool is tabular. The standard BLAST+ tabular output contains 12 columns: + +====== ========= ============================================ +Column NCBI name Description +------ --------- -------------------------------------------- + 1 qseqid Query Seq-id (ID of your sequence) + 2 sseqid Subject Seq-id (ID of the database hit) + 3 pident Percentage of identical matches + 4 length Alignment length + 5 mismatch Number of mismatches + 6 gapopen Number of gap openings + 7 qstart Start of alignment in query + 8 qend End of alignment in query + 9 sstart Start of alignment in subject (database hit) + 10 send End of alignment in subject (database hit) + 11 evalue Expectation value (E-value) + 12 bitscore Bit score +====== ========= ============================================ + +The BLAST+ tools can optionally output additional columns of information, +but this takes longer to calculate. Most (but not all) of these columns are +included by selecting the extended tabular output. The extra columns are +included *after* the standard 12 columns. This is so that you can write +workflow filtering steps that accept either the 12 or 24 column tabular +BLAST output. Galaxy now uses this extended 24 column output by default. + +====== ============= =========================================== +Column NCBI name Description +------ ------------- ------------------------------------------- + 13 sallseqid All subject Seq-id(s), separated by a ';' + 14 score Raw score + 15 nident Number of identical matches + 16 positive Number of positive-scoring matches + 17 gaps Total number of gaps + 18 ppos Percentage of positive-scoring matches + 19 qframe Query frame + 20 sframe Subject frame + 21 qseq Aligned part of query sequence + 22 sseq Aligned part of subject sequence + 23 qlen Query sequence length + 24 slen Subject sequence length +====== ============= =========================================== + +The third option is BLAST XML output, which is designed to be parsed by +another program, and is understood by some Galaxy tools. + +You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). +The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. +The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. +The two query anchored outputs show a multiple sequence alignment between the query and all the matches, +and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). + +------- + +**References** + +Marchler-Bauer A, Bryant SH. CD-Search: protein domain annotations on the fly. Nucleic Acids Res. 2004 Jul 1;32(Web Server issue):W327-31. + + </help> +</tool>