# HG changeset patch
# User peterjc
# Date 1392827761 18000
# Node ID 39ad3f684a301eaff3c88930455116690ce1f043
# Parent bd2b013e0d0d7e587c1b56155159a59679564cd2
Uploaded v0.0.3 preview 1, translation checking
diff -r bd2b013e0d0d -r 39ad3f684a30 test-data/demo_nuc_align.fasta
--- a/test-data/demo_nuc_align.fasta Tue Feb 18 08:46:34 2014 -0500
+++ b/test-data/demo_nuc_align.fasta Wed Feb 19 11:36:01 2014 -0500
@@ -1,6 +1,6 @@
>Alpha
-GGUGAGGAACGA
+GATGAGGAACGA
>Beta
-GGCGGG---CGU
+GATGAG---CGU
>Gamma
-GGU------CGG
+GAT------CGG
diff -r bd2b013e0d0d -r 39ad3f684a30 test-data/demo_nucs.fasta
--- a/test-data/demo_nucs.fasta Tue Feb 18 08:46:34 2014 -0500
+++ b/test-data/demo_nucs.fasta Wed Feb 19 11:36:01 2014 -0500
@@ -1,6 +1,6 @@
>Alpha
-GGUGAGGAACGA
+GATGAGGAACGA
>Beta
-GGCGGGCGU
+GATGAGCGU
>Gamma
-GGUCGG
+GATCGG
diff -r bd2b013e0d0d -r 39ad3f684a30 test-data/demo_nucs_trailing_stop.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/demo_nucs_trailing_stop.fasta Wed Feb 19 11:36:01 2014 -0500
@@ -0,0 +1,6 @@
+>Alpha
+GATGAGGAACGA
+>Beta
+GATGAGCGU
+>Gamma
+GATCGGTAA
diff -r bd2b013e0d0d -r 39ad3f684a30 tools/align_back_trans/README.rst
--- a/tools/align_back_trans/README.rst Tue Feb 18 08:46:34 2014 -0500
+++ b/tools/align_back_trans/README.rst Wed Feb 19 11:36:01 2014 -0500
@@ -33,14 +33,15 @@
The suggested location is in a dedicated ``tools/align_back_trans`` folder.
-You will also need to modify the tools_conf.xml file to tell Galaxy to offer the
-tool. One suggested location is in the filters section. Simply add the line::
+You will also need to modify the ``tools_conf.xml`` file to tell Galaxy to offer
+the tool. One suggested location is in the multiple alignments section. Simply
+add the line::
-You will also need to install Biopython 1.54 or later. If you want to run
+You will also need to install Biopython 1.62 or later. If you want to run
the unit tests, include this line in ``tools_conf.xml.sample`` and the sample
-FASTA files under the test-data directory. Then::
+FASTA files under the ``test-data`` directory. Then::
./run_functional_tests.sh -id align_back_trans
@@ -54,6 +55,7 @@
Version Changes
------- ----------------------------------------------------------------------
v0.0.1 - Initial version, based on a previously written Python script
+v0.0.2 - Optionally check the translation is consistent
======= ======================================================================
@@ -69,7 +71,7 @@
For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
the following command from the Galaxy root folder::
- $ tar -czf align_back_trans.tar.gz tools/align_back_trans/README.rst tools/align_back_trans/align_back_trans.py tools/align_back_trans/align_back_trans.xml tools/align_back_trans/tool_dependencies.xml test-data/demo_nucs.fasta test-data/demo_prot_align.fasta test-data/demo_nuc_align.fasta
+ $ tar -czf align_back_trans.tar.gz tools/align_back_trans/README.rst tools/align_back_trans/align_back_trans.py tools/align_back_trans/align_back_trans.xml tools/align_back_trans/tool_dependencies.xml test-data/demo_nucs.fasta test-data/demo_nucs_trailing_stop.fasta test-data/demo_prot_align.fasta test-data/demo_nuc_align.fasta
Check this worked::
@@ -79,6 +81,7 @@
tools/align_back_trans/align_back_trans.xml
tools/align_back_trans/tool_dependencies.xml
test-data/demo_nucs.fasta
+ test-data/demo_nucs_trailing_stop.fasta
test-data/demo_prot_align.fasta
test-data/demo_nuc_align.fasta
diff -r bd2b013e0d0d -r 39ad3f684a30 tools/align_back_trans/align_back_trans.py
--- a/tools/align_back_trans/align_back_trans.py Tue Feb 18 08:46:34 2014 -0500
+++ b/tools/align_back_trans/align_back_trans.py Wed Feb 19 11:36:01 2014 -0500
@@ -25,9 +25,10 @@
from Bio.Align import MultipleSeqAlignment
from Bio import SeqIO
from Bio import AlignIO
+from Bio.Data.CodonTable import ambiguous_generic_by_id
if "-v" in sys.argv or "--version" in sys.argv:
- print "v0.0.2"
+ print "v0.0.3"
sys.exit(0)
def stop_err(msg, error_level=1):
@@ -35,7 +36,59 @@
sys.stderr.write("%s\n" % msg)
sys.exit(error_level)
-def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap):
+def check_trans(identifier, nuc, prot, table):
+ """Returns nucleotide sequence if works (can remove trailing stop)"""
+ if len(nuc) % 3:
+ stop_err("Nucleotide sequence for %s is length %i (not a multiple of three)"
+ % (identifier, nuc))
+
+ p = str(prot).upper().replace("*", "X")
+ t = str(nuc.translate(table)).upper().replace("*", "X")
+ if len(t) == len(p) + 1:
+ if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons:
+ #Allow this...
+ t = t[:-1]
+ nuc = nuc[:-3] #edit return value
+ if len(t) != len(p) and p in t:
+ stop_err("%s translation matched but only as subset of nucleotides, "
+ "wrong start codon?" % identifier)
+ if len(t) != len(p) and p[1:] in t:
+ stop_err("%s translation matched (ignoring first base) but only "
+ "as subset of nucleotides, wring start codon?" % identifier)
+ if len(t) != len(p):
+ stop_err("Inconsistent lengths for %s, ungapped protein %i, "
+ "tripled %i vs ungapped nucleotide %i" %
+ (identifier,
+ len(p),
+ len(p) * 3,
+ len(nuc)))
+
+
+ if t == p:
+ return nuc
+ elif p.startswith("M") and "M" + t[1:] == p:
+ #Close, was there a start codon?
+ if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons:
+ return nuc
+ else:
+ stop_err("Translation check failed for %s\n"
+ "Would match if %s was a start codon (check correct table used)\n"
+ % (identifier, nuc[0:3].upper()))
+ else:
+ #Allow * vs X here? e.g. internal stop codons
+ m = "".join("." if x==y else "!" for (x,y) in zip(p,t))
+ if len(prot) < 70:
+ sys.stderr.write("Protein: %s\n" % p)
+ sys.stderr.write(" %s\n" % m)
+ sys.stderr.write("Translation: %s\n" % t)
+ else:
+ for offset in range(0, len(p), 60):
+ sys.stderr.write("Protein: %s\n" % p[offset:offset+60])
+ sys.stderr.write(" %s\n" % m[offset:offset+60])
+ sys.stderr.write("Translation: %s\n\n" % t[offset:offset+60])
+ stop_err("Translation check failed for %s\n" % identifier)
+
+def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0):
#TODO - Separate arguments for protein gap and nucleotide gap?
if not gap or len(gap) != 1:
raise ValueError("Please supply a single gap character")
@@ -49,22 +102,27 @@
alpha = Gapped(alpha, gap)
gap_codon = gap*3
- if len(aligned_protein_record.seq.ungap(gap))*3 != len(unaligned_nucleotide_record.seq):
+ ungapped_protein = aligned_protein_record.seq.ungap(gap)
+ ungapped_nucleotide = unaligned_nucleotide_record.seq
+ if table:
+ ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table)
+ elif len(ungapped_protein) * 3 != len(ungapped_nucleotide):
stop_err("Inconsistent lengths for %s, ungapped protein %i, "
"tripled %i vs ungapped nucleotide %i" %
- (len(aligned_protein_record.seq.ungap(gap)),
- len(aligned_protein_record.seq.ungap(gap))*3,
- len(unaligned_nucleotide_record.seq)))
+ (aligned_protein_record.id,
+ len(ungapped_protein),
+ len(ungapped_protein) * 3,
+ len(ungapped_nucleotide)))
seq = []
- nuc = str(unaligned_nucleotide_record.seq)
+ nuc = str(ungapped_nucleotide)
for amino_acid in aligned_protein_record.seq:
if amino_acid == gap:
seq.append(gap_codon)
else:
seq.append(nuc[:3])
nuc = nuc[3:]
- assert not nuc, "Nucleotide sequence for %r longer than protein %s" \
+ assert not nuc, "Nucleotide sequence for %r longer than protein %r" \
% (unaligned_nucleotide_record.id, aligned_protein_record.id)
aligned_nuc = unaligned_nucleotide_record[:] #copy for most annotation
@@ -73,7 +131,7 @@
assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc)
return aligned_nuc
-def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None):
+def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0):
"""Thread nucleotide sequences onto a protein alignment."""
#TODO - Separate arguments for protein and nucleotide gap characters?
if key_function is None:
@@ -88,23 +146,28 @@
except KeyError:
raise ValueError("Could not find nucleotide sequence for protein %r" \
% protein.id)
- aligned.append(sequence_back_translate(protein, nucleotide, gap))
+ aligned.append(sequence_back_translate(protein, nucleotide, gap, table))
return MultipleSeqAlignment(aligned)
if len(sys.argv) == 4:
align_format, prot_align_file, nuc_fasta_file = sys.argv[1:]
nuc_align_file = sys.stdout
+ table = 0
elif len(sys.argv) == 5:
align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:]
+ table = 0
+elif len(sys.argv) == 6:
+ align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:]
else:
stop_err("""This is a Python script for 'back-translating' a protein alignment,
-It requires three or four arguments:
+It requires three, four or five arguments:
- alignment format (e.g. fasta, clustal),
- aligned protein file (in specified format),
- unaligned nucleotide file (in fasta format).
- aligned nucleotiode output file (in same format), optional.
+- NCBI translation table (0 for none), optional
The nucleotide alignment is printed to stdout if no output filename is given.
@@ -119,7 +182,12 @@
http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
""")
+try:
+ table = int(table)
+except:
+ stop_err("Bad table argument %r" % table)
+
prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein)
nuc_dict = SeqIO.index(nuc_fasta_file, "fasta")
-nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-")
+nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table)
AlignIO.write(nuc_align, nuc_align_file, align_format)
diff -r bd2b013e0d0d -r 39ad3f684a30 tools/align_back_trans/align_back_trans.xml
--- a/tools/align_back_trans/align_back_trans.xml Tue Feb 18 08:46:34 2014 -0500
+++ b/tools/align_back_trans/align_back_trans.xml Wed Feb 19 11:36:01 2014 -0500
@@ -1,4 +1,4 @@
-
+
Gives a codon aware alignment
biopython
@@ -6,7 +6,7 @@
align_back_trans.py --version
-align_back_trans.py $prot_align.ext $prot_align $nuc_file $out_nuc_align
+align_back_trans.py $prot_align.ext "$prot_align" "$nuc_file" "$out_nuc_align" "$table"
@@ -15,7 +15,6 @@
-
@@ -51,6 +50,13 @@
+
+
+
+
+
+
+
@@ -63,8 +69,15 @@
protein alignment to produce a codon aware nucleotide alignment - which can
be viewed as a back translation.
-Note - the provided nucleotide sequences should be exactly three times the
-length of the protein sequences (exluding the gaps).
+If you specify one of the standard NCBI genetic codes (recommended), then the
+translation is verified. This will allow fuzzy matching if stop codons in the
+protein sequence have been reprented as X, and will allow for a trailing stop
+codon present in the nucleotide sequences but not the protein.
+
+Note - the protein and nucleotide sequences must use the same identifers.
+
+Note - If not translation table is specified, the provided nucleotide sequences
+should be exactly three times the length of the protein sequences (exluding the gaps).
Note - the nucleotide FASTA file may contain extra sequences not in the
protein alignment, they will be ignored. This can be useful if for example