diff tools/blast_rbh/blast_rbh.xml @ 0:a96608a125fb draft

Uploaded v0.1.0, first release
author peterjc
date Thu, 15 May 2014 12:54:09 -0400
parents
children a68f4e5789d7
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blast_rbh/blast_rbh.xml	Thu May 15 12:54:09 2014 -0400
@@ -0,0 +1,160 @@
+<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.0">
+    <description>from two FASTA files</description>
+    <!-- External programs and the BLAST+ package this wrapper depends on -->
+    <requirements>
+        <requirement type="binary">makeblastdb</requirement>
+        <requirement type="binary">blastp</requirement>
+        <requirement type="binary">blastn</requirement>
+        <requirement type="package" version="2.2.29">blast+</requirement>
+    </requirements>
+    <!-- Runs the wrapper script with its version switch; presumably Galaxy
+         records the printed text as the tool version (confirm in Galaxy docs). -->
+    <version_command interpreter="python">
+blast_rbh.py --version
+    </version_command>
+    <!-- Positional arguments passed to blast_rbh.py, in order: FASTA file A,
+         FASTA file B, database type (prot or nucl), type of BLAST (read from
+         the select that matches the chosen database type), minimum percentage
+         identity, minimum percentage query coverage, and the output file. -->
+    <command interpreter="python">
+blast_rbh.py "$fasta_a" "$fasta_b" $seq.dbtype
+#if $seq.dbtype=="nucl"
+    $seq.nucl_type
+#else
+    $seq.prot_type
+#end if
+$identity $q_cover "$output"
+    </command>
+    <inputs>
+        <!-- Galaxy does not have sub-types for protein vs nucleotide FASTA,
+             so the molecule type is chosen explicitly via the "seq" conditional. -->
+        <!-- Hint text is given with the "help" attribute, which is what Galaxy
+             shows under a parameter; "description" is not a param attribute. -->
+        <param name="fasta_a" type="data" format="fasta"
+               label="Genes/proteins from species A"
+               help="FASTA file, one sequence per gene/protein." />
+        <param name="fasta_b" type="data" format="fasta"
+               label="Genes/proteins from species B"
+               help="FASTA file, one sequence per gene/protein." />
+        <conditional name="seq">
+            <param name="dbtype" type="select" label="Molecule type of FASTA inputs">
+                <option value="prot">protein</option>
+                <option value="nucl">nucleotide</option>
+            </param>
+            <!-- Each branch offers only the BLAST types valid for that molecule type -->
+            <when value="prot">
+                <param name="prot_type" type="select" display="radio" label="Type of BLAST">
+                    <option value="blastp">blastp - Traditional BLASTP to compare a protein query to a protein database</option>
+                    <option value="blastp-short">blastp-short - BLASTP optimized for queries shorter than 30 residues</option>
+                </param>
+            </when>
+            <when value="nucl">
+                <param name="nucl_type" type="select" display="radio" label="Type of BLAST">
+                    <option value="megablast">megablast - Traditional megablast used to find very similar (e.g., intraspecies or closely related species) sequences</option>
+                    <option value="blastn">blastn - Traditional BLASTN requiring an exact match of 11, for somewhat similar sequences</option>
+                    <option value="blastn-short">blastn-short - BLASTN program optimized for sequences shorter than 50 bases</option>
+                    <option value="dc-megablast">dc-megablast - Discontiguous megablast used to find more distant (e.g., interspecies) sequences</option>
+                </param>
+            </when>
+        </conditional>
+        <!-- Both thresholds are percentages in the range 0 to 100; 0 disables the filter -->
+        <param name="identity" type="float" value="70" min="0" max="100"
+               label="Minimum percentage identity for BLAST matches"
+               help="Default is 70%, use 0 for no filtering." />
+        <param name="q_cover" type="float" value="50" min="0" max="100"
+               label="Minimum percentage query coverage for BLAST matches"
+               help="Default is 50%, use 0 for no filtering." />
+    </inputs>
+    <outputs>
+        <!-- Single four-column tabular result, labelled after both inputs -->
+        <data name="output" format="tabular"
+              label="BLAST RBH: $fasta_a.name vs $fasta_b.name" />
+    </outputs>
+    <tests>
+        <!-- megablast with both filters disabled (0%) -->
+        <test>
+            <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/>
+            <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/>
+            <param name="dbtype" value="nucl"/>
+            <param name="nucl_type" value="megablast"/>
+            <param name="identity" value="0.0"/>
+            <param name="q_cover" value="0.0"/>
+            <output name="output" file="rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular" ftype="tabular"/>
+        </test>
+        <!-- thresholds of 92% identity and 86% coverage are expected to give
+             the same output file as the unfiltered megablast run -->
+        <test>
+            <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/>
+            <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/>
+            <param name="dbtype" value="nucl"/>
+            <param name="nucl_type" value="megablast"/>
+            <param name="identity" value="92"/>
+            <param name="q_cover" value="86"/>
+            <output name="output" file="rbh_megablast_rhodopsin_nucs_vs_three_human_mRNA.tabular" ftype="tabular"/>
+        </test>
+        <!-- push the percentage identity over the 92.07% level -->
+        <test>
+            <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/>
+            <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/>
+            <param name="dbtype" value="nucl"/>
+            <param name="nucl_type" value="megablast"/>
+            <param name="identity" value="92.5"/>
+            <param name="q_cover" value="86"/>
+            <output name="output" file="rbh_none.tabular" ftype="tabular"/>
+        </test>
+	<!-- push the coverage over the 86% level -->
+        <test>
+            <param name="fasta_a" value="rhodopsin_nucs.fasta" ftype="fasta"/>
+            <param name="fasta_b" value="three_human_mRNA.fasta" ftype="fasta"/>
+            <param name="dbtype" value="nucl"/>
+            <param name="nucl_type" value="megablast"/>
+            <param name="identity" value="92"/>
+            <param name="q_cover" value="87"/>
+            <output name="output" file="rbh_none.tabular" ftype="tabular"/>
+        </test>
+        <!-- inputs swapped and blastn used instead of megablast, both filters
+             disabled (0%) -->
+        <test>
+            <param name="fasta_a" value="three_human_mRNA.fasta" ftype="fasta"/>
+            <param name="fasta_b" value="rhodopsin_nucs.fasta" ftype="fasta"/>
+            <param name="dbtype" value="nucl"/>
+            <param name="nucl_type" value="blastn"/>
+            <param name="identity" value="0.0"/>
+            <param name="q_cover" value="0.0"/>
+            <output name="output" file="rbh_blastn_three_human_mRNA_vs_rhodopsin_nucs.tabular" ftype="tabular"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Takes two FASTA files (species *A* and species *B*), builds a BLAST database
+for each, runs reciprocal BLAST searches (*A vs B*, and *B vs A*), optionally
+filters these, and then compiles a list of the reciprocal best hits (RBH).
+
+The output from this tool is a tabular file containing four columns, with
+the order taken from input file A:
+
+====== ======================
+Column Description
+------ ----------------------
+     1 ID from species *A*
+     2 ID from species *B*
+     3 Bitscore from *A vs B*
+     4 Bitscore from *B vs A*
+====== ======================
+
+.. class:: warningmark
+
+**Note**
+
+If you are trying to use BLAST RBH matches to identify candidate orthologues
+or transfer annotation, you *must* use a percentage identity and minimum
+coverage threshold or similar. See:
+
+Punta and Ofran (2008) The Rough Guide to In Silico Function Prediction,
+or How To Use Sequence and Structure Information To Predict Protein
+Function. PLoS Comput Biol 4(10): e1000160.
+http://dx.doi.org/10.1371/journal.pcbi.1000160
+
+The defaults are to require 70% sequence identity over the aligned region
+(using ``pident`` in the BLAST+ tabular output), and that the HSP alignment
+covers at least 50% of the query sequence (using ``qcovhsp`` in the BLAST+
+tabular output).
+
+
+**References**
+
+A specific paper covering this tool is planned, but please also cite:
+
+Christiam Camacho et al. (2009).
+BLAST+: architecture and applications.
+BMC Bioinformatics 10:421.
+http://dx.doi.org/10.1186/1471-2105-10-421
+
+This wrapper is available to install into other Galaxy Instances via the Galaxy
+Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
+    </help>
+</tool>