Mercurial > repos > peterjc > align_back_trans

--- a/test-data/demo_nuc_align.fasta	Tue Feb 18 08:46:34 2014 -0500
+++ b/test-data/demo_nuc_align.fasta	Wed Feb 19 11:36:01 2014 -0500
@@ -1,6 +1,6 @@
 >Alpha
-GGUGAGGAACGA
+GATGAGGAACGA
 >Beta
-GGCGGG---CGU
+GATGAG---CGU
 >Gamma
-GGU------CGG
+GAT------CGG
--- a/test-data/demo_nucs.fasta	Tue Feb 18 08:46:34 2014 -0500
+++ b/test-data/demo_nucs.fasta	Wed Feb 19 11:36:01 2014 -0500
@@ -1,6 +1,6 @@
 >Alpha
-GGUGAGGAACGA
+GATGAGGAACGA
 >Beta
-GGCGGGCGU
+GATGAGCGU
 >Gamma
-GGUCGG
+GATCGG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/demo_nucs_trailing_stop.fasta	Wed Feb 19 11:36:01 2014 -0500
@@ -0,0 +1,6 @@
+>Alpha
+GATGAGGAACGA
+>Beta
+GATGAGCGU
+>Gamma
+GATCGGTAA
--- a/tools/align_back_trans/README.rst	Tue Feb 18 08:46:34 2014 -0500
+++ b/tools/align_back_trans/README.rst	Wed Feb 19 11:36:01 2014 -0500
@@ -33,14 +33,15 @@

 The suggested location is in a dedicated ``tools/align_back_trans`` folder.

-You will also need to modify the tools_conf.xml file to tell Galaxy to offer the
-tool. One suggested location is in the filters section. Simply add the line::
+You will also need to modify the ``tools_conf.xml`` file to tell Galaxy to offer
+the tool. One suggested location is in the multiple alignments section. Simply
+add the line::

     <tool file="align_back_trans/align_back_trans.xml" />

-You will also need to install Biopython 1.54 or later. If you want to run
+You will also need to install Biopython 1.62 or later. If you want to run
 the unit tests, include this line in ``tools_conf.xml.sample`` and the sample
-FASTA files under the test-data directory. Then::
+FASTA files under the ``test-data`` directory. Then::

     ./run_functional_tests.sh -id align_back_trans

@@ -54,6 +55,7 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.0.1  - Initial version, based on a previously written Python script
+v0.0.2  - Optionally check the translation is consistent
 ======= ======================================================================


@@ -69,7 +71,7 @@
 For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
 the following command from the Galaxy root folder::

-    $ tar -czf align_back_trans.tar.gz tools/align_back_trans/README.rst tools/align_back_trans/align_back_trans.py tools/align_back_trans/align_back_trans.xml tools/align_back_trans/tool_dependencies.xml test-data/demo_nucs.fasta test-data/demo_prot_align.fasta test-data/demo_nuc_align.fasta
+    $ tar -czf align_back_trans.tar.gz tools/align_back_trans/README.rst tools/align_back_trans/align_back_trans.py tools/align_back_trans/align_back_trans.xml tools/align_back_trans/tool_dependencies.xml test-data/demo_nucs.fasta test-data/demo_nucs_trailing_stop.fasta test-data/demo_prot_align.fasta test-data/demo_nuc_align.fasta

 Check this worked::

@@ -79,6 +81,7 @@
     tools/align_back_trans/align_back_trans.xml
     tools/align_back_trans/tool_dependencies.xml
     test-data/demo_nucs.fasta
+    test-data/demo_nucs_trailing_stop.fasta
     test-data/demo_prot_align.fasta
     test-data/demo_nuc_align.fasta
--- a/tools/align_back_trans/align_back_trans.py	Tue Feb 18 08:46:34 2014 -0500
+++ b/tools/align_back_trans/align_back_trans.py	Wed Feb 19 11:36:01 2014 -0500
@@ -25,9 +25,10 @@
 from Bio.Align import MultipleSeqAlignment
 from Bio import SeqIO
 from Bio import AlignIO
+from Bio.Data.CodonTable import ambiguous_generic_by_id

 if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.2"
+    print "v0.0.3"
     sys.exit(0)

 def stop_err(msg, error_level=1):
@@ -35,7 +36,59 @@
     sys.stderr.write("%s\n" % msg)
     sys.exit(error_level)

-def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap):
+def check_trans(identifier, nuc, prot, table):
+    """Returns nucleotide sequence if works (can remove trailing stop)"""
+    if len(nuc) % 3:
+        stop_err("Nucleotide sequence for %s is length %i (not a multiple of three)"
+                 % (identifier, nuc))
+
+    p = str(prot).upper().replace("*", "X")
+    t = str(nuc.translate(table)).upper().replace("*", "X")
+    if len(t) == len(p) + 1:
+        if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons:
+            #Allow this...
+            t = t[:-1]
+            nuc  = nuc[:-3] #edit return value
+    if len(t) != len(p) and p in t:
+        stop_err("%s translation matched but only as subset of nucleotides, "
+                 "wrong start codon?" % identifier)
+    if len(t) != len(p) and p[1:] in t:
+        stop_err("%s translation matched (ignoring first base) but only "
+                 "as subset of nucleotides, wring start codon?" % identifier)
+    if len(t) != len(p):
+        stop_err("Inconsistent lengths for %s, ungapped protein %i, "
+                 "tripled %i vs ungapped nucleotide %i" %
+                 (identifier,
+                  len(p),
+                  len(p) * 3,
+                  len(nuc)))
+
+
+    if t == p:
+        return nuc
+    elif p.startswith("M") and "M" + t[1:] == p:
+        #Close, was there a start codon?
+        if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons:
+            return nuc
+        else:
+            stop_err("Translation check failed for %s\n"
+                     "Would match if %s was a start codon (check correct table used)\n"
+                     % (identifier, nuc[0:3].upper()))
+    else:
+        #Allow * vs X here? e.g. internal stop codons
+        m = "".join("." if x==y else "!" for (x,y) in zip(p,t))
+        if len(prot) < 70:
+            sys.stderr.write("Protein:     %s\n" % p)
+            sys.stderr.write("             %s\n" % m)
+            sys.stderr.write("Translation: %s\n" % t)
+        else:
+            for offset in range(0, len(p), 60):
+                sys.stderr.write("Protein:     %s\n" % p[offset:offset+60])
+                sys.stderr.write("             %s\n" % m[offset:offset+60])
+                sys.stderr.write("Translation: %s\n\n" % t[offset:offset+60])
+        stop_err("Translation check failed for %s\n" % identifier)
+
+def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0):
     #TODO - Separate arguments for protein gap and nucleotide gap?
     if not gap or len(gap) != 1:
         raise ValueError("Please supply a single gap character")
@@ -49,22 +102,27 @@
         alpha = Gapped(alpha, gap)
         gap_codon = gap*3

-    if len(aligned_protein_record.seq.ungap(gap))*3 != len(unaligned_nucleotide_record.seq):
+    ungapped_protein = aligned_protein_record.seq.ungap(gap)
+    ungapped_nucleotide = unaligned_nucleotide_record.seq
+    if table:
+        ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table)
+    elif len(ungapped_protein) * 3 != len(ungapped_nucleotide):
         stop_err("Inconsistent lengths for %s, ungapped protein %i, "
                  "tripled %i vs ungapped nucleotide %i" %
-                 (len(aligned_protein_record.seq.ungap(gap)),
-                  len(aligned_protein_record.seq.ungap(gap))*3,
-                  len(unaligned_nucleotide_record.seq)))
+                 (aligned_protein_record.id,
+                  len(ungapped_protein),
+                  len(ungapped_protein) * 3,
+                  len(ungapped_nucleotide)))

     seq = []
-    nuc = str(unaligned_nucleotide_record.seq)
+    nuc = str(ungapped_nucleotide)
     for amino_acid in aligned_protein_record.seq:
         if amino_acid == gap:
             seq.append(gap_codon)
         else:
             seq.append(nuc[:3])
             nuc = nuc[3:]
-    assert not nuc, "Nucleotide sequence for %r longer than protein %s" \
+    assert not nuc, "Nucleotide sequence for %r longer than protein %r" \
         % (unaligned_nucleotide_record.id, aligned_protein_record.id)

     aligned_nuc = unaligned_nucleotide_record[:] #copy for most annotation
@@ -73,7 +131,7 @@
     assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc)
     return aligned_nuc

-def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None):
+def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0):
     """Thread nucleotide sequences onto a protein alignment."""
     #TODO - Separate arguments for protein and nucleotide gap characters?
     if key_function is None:
@@ -88,23 +146,28 @@
         except KeyError:
             raise ValueError("Could not find nucleotide sequence for protein %r" \
                              % protein.id)
-        aligned.append(sequence_back_translate(protein, nucleotide, gap))
+        aligned.append(sequence_back_translate(protein, nucleotide, gap, table))
     return MultipleSeqAlignment(aligned)


 if len(sys.argv) == 4:
     align_format, prot_align_file, nuc_fasta_file = sys.argv[1:]
     nuc_align_file = sys.stdout
+    table = 0
 elif len(sys.argv) == 5:
     align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:]
+    table = 0
+elif len(sys.argv) == 6:
+    align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:]
 else:
     stop_err("""This is a Python script for 'back-translating' a protein alignment,

-It requires three or four arguments:
+It requires three, four or five arguments:
 - alignment format (e.g. fasta, clustal),
 - aligned protein file (in specified format),
 - unaligned nucleotide file (in fasta format).
 - aligned nucleotiode output file (in same format), optional.
+- NCBI translation table (0 for none), optional

 The nucleotide alignment is printed to stdout if no output filename is given.

@@ -119,7 +182,12 @@
 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
 """)

+try:
+    table = int(table)
+except:
+    stop_err("Bad table argument %r" % table)
+
 prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein)
 nuc_dict = SeqIO.index(nuc_fasta_file, "fasta")
-nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-")
+nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table)
 AlignIO.write(nuc_align, nuc_align_file, align_format)
--- a/tools/align_back_trans/align_back_trans.xml	Tue Feb 18 08:46:34 2014 -0500
+++ b/tools/align_back_trans/align_back_trans.xml	Wed Feb 19 11:36:01 2014 -0500
@@ -1,4 +1,4 @@
-<tool id="align_back_trans" name="Thread nucleotides onto a protein alignment (back-translation)" version="0.0.2">
+<tool id="align_back_trans" name="Thread nucleotides onto a protein alignment (back-translation)" version="0.0.3">
     <description>Gives a codon aware alignment</description>
     <requirements>
         <requirement type="package" version="1.63">biopython</requirement>
@@ -6,7 +6,7 @@
     </requirements>
     <version_command interpreter="python">align_back_trans.py --version</version_command>
     <command interpreter="python">
-align_back_trans.py $prot_align.ext $prot_align $nuc_file $out_nuc_align
+align_back_trans.py $prot_align.ext "$prot_align" "$nuc_file" "$out_nuc_align" "$table"
     </command>
     <stdio>
         <!-- Anything other than zero is an error -->
@@ -15,7 +15,6 @@
     </stdio>
     <inputs>
         <param name="prot_align" type="data" format="fasta,muscle,clustal" label="Aligned protein file" help="Mutliple sequence file in FASTA, ClustalW or PHYLIP format." />
-	<!-- TODO? Could verify the translation
         <param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons">
             <option value="1">1. Standard</option>
             <option value="2">2. Vertebrate Mitochondrial</option>
@@ -34,8 +33,8 @@
             <option value="21">21. Trematode Mitochondrial</option>
             <option value="22">22. Scenedesmus obliquus</option>
             <option value="23">23. Thraustochytrium Mitochondrial</option>
+            <option value="0">Don't check the translation</option>
         </param>
-	-->
         <param name="nuc_file" type="data" format="fasta" label="Unaligned nucleotide sequences" help="FASTA format, using same identifiers as your protein alignment" />
     </inputs>
     <outputs>
@@ -51,6 +50,13 @@
         <test>
             <param name="prot_align" value="demo_prot_align.fasta" />
             <param name="nuc_file" value="demo_nucs.fasta" />
+            <param name="table" value="0" />
+            <output name="out_nuc_align" file="demo_nuc_align.fasta" />
+        </test>
+        <test>
+            <param name="prot_align" value="demo_prot_align.fasta" />
+            <param name="nuc_file" value="demo_nucs_trailing_stop.fasta" />
+            <param name="table" value="11" />
             <output name="out_nuc_align" file="demo_nuc_align.fasta" />
         </test>
     </tests>
@@ -63,8 +69,15 @@
 protein alignment to produce a codon aware nucleotide alignment - which can
 be viewed as a back translation.

-Note - the provided nucleotide sequences should be exactly three times the
-length of the protein sequences (exluding the gaps).
+If you specify one of the standard NCBI genetic codes (recommended), then the
+translation is verified. This will allow fuzzy matching if stop codons in the
+protein sequence have been reprented as X, and will allow for a trailing stop
+codon present in the nucleotide sequences but not the protein.
+
+Note - the protein and nucleotide sequences must use the same identifers.
+
+Note - If not translation table is specified, the provided nucleotide sequences
+should be exactly three times the length of the protein sequences (exluding the gaps).

 Note - the nucleotide FASTA file may contain extra sequences not in the
 protein alignment, they will be ignored. This can be useful if for example