Mercurial > repos > peterjc > blast_rbh
annotate tools/blast_rbh/blast_rbh.xml @ 5:c84b6c21e3d4 draft
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
author | peterjc |
---|---|
date | Tue, 20 May 2014 06:33:08 -0400 |
parents | 57245c11b8cb |
children | e47960bcdccb |
rev | line source |
---|---|
0 | 1 <tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.0"> |
2 <description>from two FASTA files</description> | |
3 <requirements> | |
4 <requirement type="binary">makeblastdb</requirement> | |
5 <requirement type="binary">blastp</requirement> | |
6 <requirement type="binary">blastn</requirement> | |
7 <requirement type="package" version="2.2.29">blast+</requirement> | |
8 </requirements> | |
9 <version_command interpreter="python"> | |
10 blast_rbh.py --version | |
11 </version_command> | |
12 <command interpreter="python"> | |
13 blast_rbh.py "$fasta_a" "$fasta_b" $seq.dbtype | |
14 #if $seq.dbtype=="nucl" | |
15 $seq.nucl_type | |
16 #else | |
17 $seq.prot_type | |
18 #end if | |
19 $identity $q_cover "$output" | |
20 </command> | |
21 <inputs> | |
22 <!-- Galaxy does not have sub-types for protein vs nucleotide FASTA --> | |
23 <param name="fasta_a" type="data" format="fasta" | |
24 label="Genes/proteins from species A" | |
25 help="FASTA file, one sequence per gene/protein." /> | |
26 <param name="fasta_b" type="data" format="fasta" | |
27 label="Genes/proteins from species B" | |
28 help="FASTA file, one sequence per gene/protein." /> | |
29 <conditional name="seq"> | |
30 <param name="dbtype" type="select" label="Molecule type of FASTA inputs"> | |
31 <option value="prot">protein</option> | |
32 <option value="nucl">nucleotide</option> | |
33 </param> | |
34 <when value="prot"> | |
35 <param name="prot_type" type="select" display="radio" label="Type of BLAST"> | |
36 <option value="blastp">blastp - Traditional BLASTP to compare a protein query to a protein database</option> | |
37 <option value="blastp-short">blastp-short - BLASTP optimized for queries shorter than 30 residues</option> | |
38 </param> | |
39 </when> | |
40 <when value="nucl"> | |
41 <param name="nucl_type" type="select" display="radio" label="Type of BLAST"> | |
42 <option value="megablast">megablast - Traditional megablast used to find very similar (e.g., intraspecies or closely related species) sequences</option> | |
43 <option value="blastn">blastn - Traditional BLASTN requiring an exact match of 11, for somewhat similar sequences</option> | |
44 <option value="blastn-short">blastn-short - BLASTN program optimized for sequences shorter than 50 bases</option> | |
45 <option value="dc-megablast">dc-megablast - Discontiguous megablast used to find more distant (e.g., interspecies) sequences</option> | |
4
57245c11b8cb
Uploaded v0.1.0d, TBLASTX support; changed output columns
peterjc
parents:
2
diff
changeset
|
46 <option value="tblastx">tblastx - TBLASTX program using translated query against translated database (protein level matches)</option> |
0 | 47 </param> |
48 </when> | |
49 </conditional> | |
50 <param name="identity" type="float" value="70" min="0" max="100" | |
51 label="Minimum percentage identity for BLAST matches" | |
52 help="Default is 70%, use 0 for no filtering." /> | |
53 <param name="q_cover" type="float" value="50" min="0" max="100" | |
54 label="Minimum percentage query coverage for BLAST matches" | |
55 help="Default is 50%, use 0 for no filtering." /> | |
56 </inputs> | |
57 <outputs> | |
58 <data name="output" format="tabular" label="BLAST RBH: $fasta_a.name vs $fasta_b.name" /> | |
59 </outputs> | |
60 <requirements> | |
61 </requirements> | |
62 <tests> | |
63 <test> <!-- protein mode: the BLAST type is chosen via prot_type, not nucl_type --> | |
1 | 64 <param name="fasta_a" value="four_human_proteins.fasta" ftype="fasta"/> |
65 <param name="fasta_b" value="rhodopsin_proteins.fasta" ftype="fasta"/> | |
66 <param name="dbtype" value="prot"/> | |
67 <param name="prot_type" value="blastp"/> | |
68 <param name="identity" value="0.0"/> | |
69 <param name="q_cover" value="0.0"/> | |
70 <output name="output" file="rbh_blastp_four_human_vs_rhodopsin_proteins.tabular" ftype="tabular"/> | |
71 </test> | |
72 <test> | |
0 | 73 <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/> |
74 <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/> | |
75 <param name="dbtype" value="nucl"/> | |
76 <param name="nucl_type" value="megablast"/> | |
77 <param name="identity" value="0.0"/> | |
78 <param name="q_cover" value="0.0"/> | |
79 <output name="output" file="rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular" ftype="tabular"/> | |
80 </test> | |
81 <test> | |
82 <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/> | |
83 <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/> | |
84 <param name="dbtype" value="nucl"/> | |
85 <param name="nucl_type" value="megablast"/> | |
86 <param name="identity" value="92"/> | |
87 <param name="q_cover" value="86"/> | |
88 <output name="output" file="rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular" ftype="tabular"/> | |
89 </test> | |
90 <!-- push the percentage identity over the 92.07% level --> | |
91 <test> | |
92 <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/> | |
93 <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/> | |
94 <param name="dbtype" value="nucl"/> | |
95 <param name="nucl_type" value="megablast"/> | |
96 <param name="identity" value="92.5"/> | |
97 <param name="q_cover" value="86"/> | |
98 <output name="output" file="rbh_none.tabular" ftype="tabular"/> | |
99 </test> | |
100 <!-- push the coverage over the 86% level --> | |
101 <test> | |
102 <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/> | |
103 <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/> | |
104 <param name="dbtype" value="nucl"/> | |
105 <param name="nucl_type" value="megablast"/> | |
106 <param name="identity" value="92"/> | |
107 <param name="q_cover" value="87"/> | |
108 <output name="output" file="rbh_none.tabular" ftype="tabular"/> | |
109 </test> | |
110 <test> | |
5
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
111 <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/> |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
112 <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/> |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
113 <param name="dbtype" value="nucl"/> |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
114 <param name="nucl_type" value="tblastx"/> |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
115 <param name="identity" value="0.0"/> |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
116 <param name="q_cover" value="0.0"/> |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
117 <output name="output" file="rbh_tblastx_rhodopsin_nucs_vs_three_human_mRNA.tabular" ftype="tabular"/> |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
118 </test> |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
119 <test> |
0 | 120 <param name="fasta_a" value="three_human_mRNA.fasta" ftype="fasta"/> |
121 <param name="fasta_b" value="rhodopsin_nucs.fasta" ftype="fasta"/> | |
122 <param name="dbtype" value="nucl"/> | |
123 <param name="nucl_type" value="blastn"/> | |
124 <param name="identity" value="0.0"/> | |
125 <param name="q_cover" value="0.0"/> | |
126 <output name="output" file="rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular" ftype="tabular"/> | |
127 </test> | |
128 </tests> | |
129 <help> | |
130 **What it does** | |
131 | |
2 | 132 Takes two FASTA files (*species A* and *species B*), builds a BLAST database |
0 | 133 for each, runs reciprocal BLAST searches (*A vs B*, and *B vs A*), optionally |
2 | 134 filters the HSPs, and then compiles a list of the reciprocal best hits (RBH). |
0 | 135 |
5
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
136 The output from this tool is a tabular file containing multiple columns, with |
2 | 137 information about the BLAST matches used: |
0 | 138 |
5
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
139 ====== ================================== |
0 | 140 Column Description |
5
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
141 ------ ---------------------------------- |
2 | 142 1 ID from *species A* |
143 2 ID from *species B* | |
5
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
144 3 Length of sequence *A* |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
145 4 Length of sequence *B* |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
146 5 Percentage of sequence *A* covered |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
147 6 Percentage of sequence *B* covered |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
148 7 HSP alignment length |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
149 8 HSP percentage identity |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
150 9 HSP bitscore |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
151 ====== ================================== |
2 | 152 |
5
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
153 These values correspond to the ``qseqid``/``sseqid``, ``qlen``/``slen``, |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
154 ``qcovhsp``, ``length``, ``pident`` and ``bitscore`` values in the BLAST+ |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
155 tabular output. |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
156 |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
157 For the alignment length, bitscore and percentage identity the values for |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
158 *A vs B* and *B vs A* are typically the same, so their minimum is shown. |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
159 The coverage values are given by the HSP alignment length divided by the |
c84b6c21e3d4
Uploaded v0.1.0e, test TBLASTX mode; more columns in output
peterjc
parents:
4
diff
changeset
|
160 sequence length (adjusted by a factor of three for TBLASTX). |
0 | 161 |
162 .. class:: warningmark | |
163 | |
164 **Note** | |
165 | |
166 If you are trying to use BLAST RBH matches to identify candidate orthologues | |
167 or transfer annotation, you *must* use a percentage identity and minimum | |
168 coverage threshold or similar. See: | |
169 | |
170 Punta and Ofran (2008) The Rough Guide to In Silico Function Prediction, | |
171 or How To Use Sequence and Structure Information To Predict Protein | |
172 Function. PLoS Comput Biol 4(10): e1000160. | |
173 http://dx.doi.org/10.1371/journal.pcbi.1000160 | |
174 | |
175 The defaults are to require 70% sequence identity over the aligned region | |
176 (using ``pident`` in the BLAST+ tabular output), and that the HSP alignment | |
177 covers at least 50% of the query sequence (using ``qcovhsp`` in the BLAST+ | |
178 tabular output). | |
179 | |
180 | |
181 **References** | |
182 | |
183 A specific paper covering this tool is planned, but please also cite: | |
184 | |
185 Christiam Camacho et al. (2009). | |
186 BLAST+: architecture and applications. | |
187 BMC Bioinformatics. 15;10:421. | |
188 http://dx.doi.org/10.1186/1471-2105-10-421 | |
189 | |
190 This wrapper is available to install into other Galaxy Instances via the Galaxy | |
191 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh | |
192 </help> | |
193 </tool> |