# HG changeset patch # User peterjc # Date 1413284020 14400 # Node ID 661276ad882e4030df3053ff1e8cf4351cf664df # Parent 4733e4ea4dab25788bac44abff5e171787e8fa76 Uploaded v0.1.3, uses Biopython for making NR diff -r 4733e4ea4dab -r 661276ad882e tools/blast_rbh/README.rst --- a/tools/blast_rbh/README.rst Thu Aug 14 07:11:59 2014 -0400 +++ b/tools/blast_rbh/README.rst Tue Oct 14 06:53:40 2014 -0400 @@ -8,9 +8,8 @@ This tool is a short Python script to run reciprocal BLAST searches on a pair of sequence files, and extract the reciprocal best hits. -This is a work in progress, and builds on an earlier implementation which -prequired the two BLAST searches be prepared in advance. Integration allows -a much simpler user experience, and can ensure sensible filters are used. +It is available from the Galaxy Tool Shed at: +http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh Automated Installation @@ -54,6 +53,8 @@ v0.1.2 - Using optparse for command line API. - Tool definition now embeds citation information. - Fixed Tool Shed dependency definition. +v0.1.3 - Option to make FASTA files non-redundant (via Biopython dependency). + - Avoid extra database and BLAST search in self-comparison mode. ======= ====================================================================== diff -r 4733e4ea4dab -r 661276ad882e tools/blast_rbh/blast_rbh.py --- a/tools/blast_rbh/blast_rbh.py Thu Aug 14 07:11:59 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.py Tue Oct 14 06:53:40 2014 -0400 @@ -30,7 +30,7 @@ if "--version" in sys.argv[1:]: #TODO - Capture version of BLAST+ binaries too? - print "BLAST RBH v0.1.2" + print "BLAST RBH v0.1.3" sys.exit(0) #Parse Command Line @@ -52,6 +52,9 @@ parser.add_option("-c", "--coverage", dest="min_coverage", default="50", help="Minimum HSP coverage (optional, default 50)") +parser.add_option("--nr", dest="nr", default=False, action="store_true", + help="Preprocess FASTA files to collapse identifical " + "entries (make sequences non-redundant)") parser.add_option("-o", "--output", dest="output", default=None, metavar="FILE", help="Output filename") @@ -117,10 +120,17 @@ makeblastdb_exe = "makeblastdb" base_path = tempfile.mkdtemp() +tmp_a = os.path.join(base_path, "SpeciesA.fasta") db_a = os.path.join(base_path, "SpeciesA") -db_b = os.path.join(base_path, "SpeciesB") a_vs_b = os.path.join(base_path, "A_vs_B.tabular") -b_vs_a = os.path.join(base_path, "B_vs_A.tabular") +if self_comparison: + tmp_b = tmp_a + db_b = db_a + b_vs_a = a_vs_b +else: + tmp_b = os.path.join(base_path, "SpeciesB.fasta") + db_b = os.path.join(base_path, "SpeciesB") + b_vs_a = os.path.join(base_path, "B_vs_A.tabular") log = os.path.join(base_path, "blast.log") cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? @@ -202,18 +212,65 @@ #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) tie_warning += 1 +def make_nr(input_fasta, output_fasta, sep=";"): + #TODO - seq-hash based to avoid loading everything into RAM? + by_seq = dict() + try: + from Bio import SeqIO + except KeyError: + stop_err("Missing Biopython") + for record in SeqIO.parse(input_fasta, "fasta"): + s = str(record.seq).upper() + try: + by_seq[s].append(record.id) + except KeyError: + by_seq[s] = [record.id] + unique = 0 + representatives = dict() + duplicates = set() + for cluster in by_seq.values(): + if len(cluster) > 1: + representatives[cluster[0]] = cluster + duplicates.update(cluster[1:]) + else: + unique += 1 + del by_seq + if duplicates: + #TODO - refactor as a generator with single SeqIO.write(...) call + with open(output_fasta, "w") as handle: + for record in SeqIO.parse(input_fasta, "fasta"): + if record.id in representatives: + cluster = representatives[record.id] + record.id = sep.join(cluster) + record.description = "representing %i records" % len(cluster) + elif record.id in duplicates: + continue + SeqIO.write(record, handle, "fasta") + print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives))) + else: + os.symlink(os.path.abspath(input_fasta), output_fasta) + print("No perfect duplicates in file, %i unique entries" % unique) #print("Starting...") +if options.nr: + make_nr(fasta_a, tmp_a) + if not self_comparison: + make_nr(fasta_b, tmp_b) + fasta_a = tmp_a + fasta_b = tmp_b + #TODO - Report log in case of error? run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) -run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) +if not self_comparison: + run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) #print("BLAST databases prepared.") run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) #print("BLAST species A vs species B done.") -run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' - % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) -#print("BLAST species B vs species A done.") +if not self_comparison: + run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' + % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) + #print("BLAST species B vs species A done.") best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) diff -r 4733e4ea4dab -r 661276ad882e tools/blast_rbh/blast_rbh.xml --- a/tools/blast_rbh/blast_rbh.xml Thu Aug 14 07:11:59 2014 -0400 +++ b/tools/blast_rbh/blast_rbh.xml Tue Oct 14 06:53:40 2014 -0400 @@ -1,10 +1,12 @@ - + from two FASTA files - makeblastdb - blastp - blastn - blast+ + biopython + Bio + makeblastdb + blastp + blastn + blast+ blast_rbh.py --version @@ -17,6 +19,7 @@ #else -t $seq.prot_type #end if +$make_nr -i $identity -c $q_cover -o "$output" @@ -61,6 +64,9 @@ + @@ -201,6 +207,13 @@ can happen following gene duplication, or for (near) identical gene duplicates. +The tool can optionally make the FASTA files non-redundant by replacing +repeated identical sequences with a single representative before building +the databases and running BLAST. + +Finally, the tool can be run using the same FASTA input file to look for +RBH within the dataset. In this case, self matches are discarded. + .. class:: warningmark **Note** diff -r 4733e4ea4dab -r 661276ad882e tools/blast_rbh/tool_dependencies.xml --- a/tools/blast_rbh/tool_dependencies.xml Thu Aug 14 07:11:59 2014 -0400 +++ b/tools/blast_rbh/tool_dependencies.xml Tue Oct 14 06:53:40 2014 -0400 @@ -1,5 +1,8 @@ + + +