Mercurial > repos > peterjc > blast_rbh
changeset 12:661276ad882e draft
Uploaded v0.1.3, uses Biopython for making NR
author | peterjc |
---|---|
date | Tue, 14 Oct 2014 06:53:40 -0400 |
parents | 4733e4ea4dab |
children | 4454596ed127 |
files | tools/blast_rbh/README.rst tools/blast_rbh/blast_rbh.py tools/blast_rbh/blast_rbh.xml tools/blast_rbh/tool_dependencies.xml |
diffstat | 4 files changed, 89 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/blast_rbh/README.rst Thu Aug 14 07:11:59 2014 -0400 +++ b/tools/blast_rbh/README.rst Tue Oct 14 06:53:40 2014 -0400 @@ -8,9 +8,8 @@ This tool is a short Python script to run reciprocal BLAST searches on a pair of sequence files, and extract the reciprocal best hits. -This is a work in progress, and builds on an earlier implementation which -prequired the two BLAST searches be prepared in advance. Integration allows -a much simpler user experience, and can ensure sensible filters are used. +It is available from the Galaxy Tool Shed at: +http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh Automated Installation @@ -54,6 +53,8 @@ v0.1.2 - Using optparse for command line API. - Tool definition now embeds citation information. - Fixed Tool Shed dependency definition. +v0.1.3 - Option to make FASTA files non-redundant (via Biopython dependency). + - Avoid extra database and BLAST search in self-comparison mode. ======= ======================================================================
--- a/tools/blast_rbh/blast_rbh.py Thu Aug 14 07:11:59 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.py Tue Oct 14 06:53:40 2014 -0400 @@ -30,7 +30,7 @@ if "--version" in sys.argv[1:]: #TODO - Capture version of BLAST+ binaries too? - print "BLAST RBH v0.1.2" + print "BLAST RBH v0.1.3" sys.exit(0) #Parse Command Line @@ -52,6 +52,9 @@ parser.add_option("-c", "--coverage", dest="min_coverage", default="50", help="Minimum HSP coverage (optional, default 50)") +parser.add_option("--nr", dest="nr", default=False, action="store_true", + help="Preprocess FASTA files to collapse identifical " + "entries (make sequences non-redundant)") parser.add_option("-o", "--output", dest="output", default=None, metavar="FILE", help="Output filename") @@ -117,10 +120,17 @@ makeblastdb_exe = "makeblastdb" base_path = tempfile.mkdtemp() +tmp_a = os.path.join(base_path, "SpeciesA.fasta") db_a = os.path.join(base_path, "SpeciesA") -db_b = os.path.join(base_path, "SpeciesB") a_vs_b = os.path.join(base_path, "A_vs_B.tabular") -b_vs_a = os.path.join(base_path, "B_vs_A.tabular") +if self_comparison: + tmp_b = tmp_a + db_b = db_a + b_vs_a = a_vs_b +else: + tmp_b = os.path.join(base_path, "SpeciesB.fasta") + db_b = os.path.join(base_path, "SpeciesB") + b_vs_a = os.path.join(base_path, "B_vs_A.tabular") log = os.path.join(base_path, "blast.log") cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? @@ -202,18 +212,65 @@ #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) tie_warning += 1 +def make_nr(input_fasta, output_fasta, sep=";"): + #TODO - seq-hash based to avoid loading everything into RAM? + by_seq = dict() + try: + from Bio import SeqIO + except KeyError: + stop_err("Missing Biopython") + for record in SeqIO.parse(input_fasta, "fasta"): + s = str(record.seq).upper() + try: + by_seq[s].append(record.id) + except KeyError: + by_seq[s] = [record.id] + unique = 0 + representatives = dict() + duplicates = set() + for cluster in by_seq.values(): + if len(cluster) > 1: + representatives[cluster[0]] = cluster + duplicates.update(cluster[1:]) + else: + unique += 1 + del by_seq + if duplicates: + #TODO - refactor as a generator with single SeqIO.write(...) call + with open(output_fasta, "w") as handle: + for record in SeqIO.parse(input_fasta, "fasta"): + if record.id in representatives: + cluster = representatives[record.id] + record.id = sep.join(cluster) + record.description = "representing %i records" % len(cluster) + elif record.id in duplicates: + continue + SeqIO.write(record, handle, "fasta") + print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives))) + else: + os.symlink(os.path.abspath(input_fasta), output_fasta) + print("No perfect duplicates in file, %i unique entries" % unique) #print("Starting...") +if options.nr: + make_nr(fasta_a, tmp_a) + if not self_comparison: + make_nr(fasta_b, tmp_b) + fasta_a = tmp_a + fasta_b = tmp_b + #TODO - Report log in case of error? run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) -run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) +if not self_comparison: + run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) #print("BLAST databases prepared.") run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) #print("BLAST species A vs species B done.") -run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' - % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) -#print("BLAST species B vs species A done.") +if not self_comparison: + run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' + % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) + #print("BLAST species B vs species A done.") best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
--- a/tools/blast_rbh/blast_rbh.xml Thu Aug 14 07:11:59 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.xml Tue Oct 14 06:53:40 2014 -0400 @@ -1,10 +1,12 @@ -<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.2"> +<tool id="blast_reciprocal_best_hits" name="BLAST Reciprocal Best Hits (RBH)" version="0.1.3"> <description>from two FASTA files</description> <requirements> - <requirement type="binary">makeblastdb</requirement> - <requirement type="binary">blastp</requirement> - <requirement type="binary">blastn</requirement> - <requirement type="package" version="2.2.29">blast+</requirement> + <requirement type="package" version="1.64">biopython</requirement> + <requirement type="python-module">Bio</requirement> + <requirement type="binary">makeblastdb</requirement> + <requirement type="binary">blastp</requirement> + <requirement type="binary">blastn</requirement> + <requirement type="package" version="2.2.29">blast+</requirement> </requirements> <version_command interpreter="python"> blast_rbh.py --version @@ -17,6 +19,7 @@ #else -t $seq.prot_type #end if +$make_nr -i $identity -c $q_cover -o "$output" @@ -61,6 +64,9 @@ <param name="q_cover" type="float" value="50" min="0" max="100" label="Minimum percentage query coverage for BLAST matches" help="Default is 50%, use 0 for no filtering." /> + <param name="make_nr" type="boolean" checked="false" truevalue="--nr" falsevalue="" + label="Process input FASTA files to collapse identical sequences" + help="i.e. First make the input non-redundant" /> </inputs> <outputs> <data name="output" format="tabular" label="BLAST RBH: $fasta_a.name vs $fasta_b.name" /> @@ -201,6 +207,13 @@ can happen following gene duplication, or for (near) identical gene duplicates. +The tool can optionally make the FASTA files non-redundant by replacing +repeated identical sequences with a single representative before building +the databases and running BLAST. + +Finally, the tool can be run using the same FASTA input file to look for +RBH within the dataset. In this case, self matches are discarded. + .. class:: warningmark **Note**
--- a/tools/blast_rbh/tool_dependencies.xml Thu Aug 14 07:11:59 2014 -0400 +++ b/tools/blast_rbh/tool_dependencies.xml Tue Oct 14 06:53:40 2014 -0400 @@ -1,5 +1,8 @@ <?xml version="1.0"?> <tool_dependency> + <package name="biopython" version="1.64"> + <repository changeset_revision="268128adb501" name="package_biopython_1_64" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> <package name="blast+" version="2.2.29"> <repository changeset_revision="e78bbab7933d" name="package_blast_plus_2_2_29" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </package>