# HG changeset patch # User peterjc # Date 1366895113 14400 # Node ID 08be1b2b0bf62b5f27a1ededd118d10a0e6c1d78 # Parent e77e30f1deebfa33267bfb43678ed2166aab9357 Uploaded v0.0.19 preview take 4, added missing wrapper for RPS-TBLASTN diff -r e77e30f1deeb -r 08be1b2b0bf6 tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_rpstblastn_wrapper.xml Thu Apr 25 09:05:13 2013 -0400 @@ -0,0 +1,237 @@ + + Search protein domain database (PSSMs) with translated nucleotide query sequence(s) + + + + rpstblastn + blast+ + + rpstblastn -version + +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +rpstblastn +-query "$query" +#if $db_opts.db_opts_selector == "db": + -db "${db_opts.database.fields.path}" +#elif $db_opts.db_opts_selector == "histdb": + -db "${os.path.join($db_opts.histdb.extra_files_path,'blastdb')}" +#end if +-evalue $evalue_cutoff +-out "$output1" +##Set the extended list here so if/when we add things, saved workflows are not affected +#if str($out_format)=="ext": + -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" +#else: + -outfmt $out_format +#end if +##Seems rpstblastn does not currently support multiple threads :( +##-num_threads 8 +#if $adv_opts.adv_opts_selector=="advanced": +$adv_opts.filter_query +## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string +## Note -max_target_seqs overrides -num_descriptions and -num_alignments +#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): +-max_target_seqs $adv_opts.max_hits +#end if +#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): +-word_size $adv_opts.word_size +#end if +$adv_opts.parse_deflines +## End of advanced options: +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: warningmark + +**Note**. Database searches may take a substantial amount of time. +For large input datasets it is advisable to allow overnight processing. + +----- + +**What it does** + +Search a *protein domain database* using a *nucleotide query*, +using the NCBI BLAST+ rpstblastn command line tool. + +The protein domain databases use position-specific scoring matrices +(PSSMs) and are available for a number of domain collections including: + +*CDD* - NCBI curated meta-collection of domains, see +http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#NCBI_curated_domains + +*Kog* - PSSMs from automatically aligned sequences and sequence +fragments classified in the KOGs resource, the eukaryotic +counterpart to COGs, see http://www.ncbi.nlm.nih.gov/COG/new/ + +*Cog* - PSSMs from automatically aligned sequences and sequence +fragments classified in the COGs resource, which focuses primarily +on prokaryotes, see http://www.ncbi.nlm.nih.gov/COG/new/ + +*Pfam* - PSSMs from Pfam-A seed alignment database, see +http://pfam.sanger.ac.uk/ + +*Smart* - PSSMs from SMART domain alignment database, see +http://smart.embl-heidelberg.de/ + +*Tigr* - PSSMs from TIGRFAM database of protein families, see +http://www.jcvi.org/cms/research/projects/tigrfams/overview/ + +*Prk* - PSSMs from automatically aligned stable clusters in the +Protein Clusters database, see +http://www.ncbi.nlm.nih.gov/proteinclusters?cmd=search&db=proteinclusters + +The exact list of domain databases offered will depend on how your +local Galaxy has been configured. + +----- + +**Output format** + +Because Galaxy focuses on processing tabular data, the default output of this +tool is tabular. 
The standard BLAST+ tabular output contains 12 columns: + +====== ========= ============================================ +Column NCBI name Description +------ --------- -------------------------------------------- + 1 qseqid Query Seq-id (ID of your sequence) + 2 sseqid Subject Seq-id (ID of the database hit) + 3 pident Percentage of identical matches + 4 length Alignment length + 5 mismatch Number of mismatches + 6 gapopen Number of gap openings + 7 qstart Start of alignment in query + 8 qend End of alignment in query + 9 sstart Start of alignment in subject (database hit) + 10 send End of alignment in subject (database hit) + 11 evalue Expectation value (E-value) + 12 bitscore Bit score +====== ========= ============================================ + +The BLAST+ tools can optionally output additional columns of information, +but this takes longer to calculate. Most (but not all) of these columns are +included by selecting the extended tabular output. The extra columns are +included *after* the standard 12 columns. This is so that you can write +workflow filtering steps that accept either the 12 or 24 column tabular +BLAST output. Galaxy now uses this extended 24 column output by default. 
+ +====== ============= =========================================== +Column NCBI name Description +------ ------------- ------------------------------------------- + 13 sallseqid All subject Seq-id(s), separated by a ';' + 14 score Raw score + 15 nident Number of identical matches + 16 positive Number of positive-scoring matches + 17 gaps Total number of gaps + 18 ppos Percentage of positive-scoring matches + 19 qframe Query frame + 20 sframe Subject frame + 21 qseq Aligned part of query sequence + 22 sseq Aligned part of subject sequence + 23 qlen Query sequence length + 24 slen Subject sequence length +====== ============= =========================================== + +The third option is BLAST XML output, which is designed to be parsed by +another program, and is understood by some Galaxy tools. + +You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). +The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. +The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. +The two query anchored outputs show a multiple sequence alignment between the query and all the matches, +and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). + +------- + +**References** + +Marchler-Bauer A, Bryant SH. CD-Search: protein domain annotations on the fly. Nucleic Acids Res. 2004 Jul 1;32(Web Server issue):W327-31. + + +