0
|
1 #!/usr/bin/env python
|
|
2 """Back-translate a protein alignment to nucleotides
|
|
3
|
|
4 This tool is a short Python script (using Biopython library functions) to
|
|
5 load a protein alignment, and matching nucleotide FASTA file of unaligned
|
|
6 sequences, in order to produce a codon aware nucleotide alignment - which
|
|
7 can be viewed as a back translation.
|
|
8
|
|
9 The development repository for this tool is here:
|
|
10
|
|
11 * https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans
|
|
12
|
|
13 This tool is available with a Galaxy wrapper from the Galaxy Tool Shed at:
|
|
14
|
|
15 * http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
|
|
16
|
|
17 See accompanying text file for licence details (MIT licence).
|
|
18 """
|
|
19
|
|
20 import sys
|
|
21 from Bio.Seq import Seq
|
|
22 from Bio.Alphabet import generic_dna, generic_protein
|
|
23 from Bio.Align import MultipleSeqAlignment
|
|
24 from Bio import SeqIO
|
|
25 from Bio import AlignIO
|
|
26 from Bio.Data.CodonTable import ambiguous_generic_by_id
|
|
27
|
|
28 if "-v" in sys.argv or "--version" in sys.argv:
|
|
29 print "v0.0.5"
|
|
30 sys.exit(0)
|
|
31
|
|
32 def sys_exit(msg, error_level=1):
|
|
33 """Print error message to stdout and quit with given error level."""
|
|
34 sys.stderr.write("%s\n" % msg)
|
|
35 sys.exit(error_level)
|
|
36
|
|
37 def check_trans(identifier, nuc, prot, table):
|
|
38 """Returns nucleotide sequence if works (can remove trailing stop)"""
|
|
39 if len(nuc) % 3:
|
|
40 sys_exit("Nucleotide sequence for %s is length %i (not a multiple of three)"
|
|
41 % (identifier, len(nuc)))
|
|
42
|
|
43 p = str(prot).upper().replace("*", "X")
|
|
44 t = str(nuc.translate(table)).upper().replace("*", "X")
|
|
45 if len(t) == len(p) + 1:
|
|
46 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons:
|
|
47 #Allow this...
|
|
48 t = t[:-1]
|
|
49 nuc = nuc[:-3] #edit return value
|
|
50 if len(t) != len(p):
|
|
51 err = ("Inconsistent lengths for %s, ungapped protein %i, "
|
|
52 "tripled %i vs ungapped nucleotide %i." %
|
|
53 (identifier, len(p), len(p) * 3, len(nuc)))
|
|
54 if t.endswith(p):
|
|
55 err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p))
|
|
56 elif t.startswith(p):
|
|
57 err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p))
|
|
58 elif p in t:
|
|
59 #TODO - Calculate and report the number to trim at start and end?
|
|
60 err += "\nHowever, protein sequence found within translated nucleotides."
|
|
61 elif p[1:] in t:
|
|
62 err += "\nHowever, ignoring first amino acid, protein sequence found within translated nucleotides."
|
|
63 sys_exit(err)
|
|
64
|
|
65
|
|
66 if t == p:
|
|
67 return nuc
|
|
68 elif p.startswith("M") and "M" + t[1:] == p:
|
|
69 #Close, was there a start codon?
|
|
70 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons:
|
|
71 return nuc
|
|
72 else:
|
|
73 sys_exit("Translation check failed for %s\n"
|
|
74 "Would match if %s was a start codon (check correct table used)\n"
|
|
75 % (identifier, nuc[0:3].upper()))
|
|
76 else:
|
|
77 #Allow * vs X here? e.g. internal stop codons
|
|
78 m = "".join("." if x==y else "!" for (x,y) in zip(p,t))
|
|
79 if len(prot) < 70:
|
|
80 sys.stderr.write("Protein: %s\n" % p)
|
|
81 sys.stderr.write(" %s\n" % m)
|
|
82 sys.stderr.write("Translation: %s\n" % t)
|
|
83 else:
|
|
84 for offset in range(0, len(p), 60):
|
|
85 sys.stderr.write("Protein: %s\n" % p[offset:offset+60])
|
|
86 sys.stderr.write(" %s\n" % m[offset:offset+60])
|
|
87 sys.stderr.write("Translation: %s\n\n" % t[offset:offset+60])
|
|
88 sys_exit("Translation check failed for %s\n" % identifier)
|
|
89
|
|
90 def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0):
|
|
91 #TODO - Separate arguments for protein gap and nucleotide gap?
|
|
92 if not gap or len(gap) != 1:
|
|
93 raise ValueError("Please supply a single gap character")
|
|
94
|
|
95 alpha = unaligned_nucleotide_record.seq.alphabet
|
|
96 if hasattr(alpha, "gap_char"):
|
|
97 gap_codon = alpha.gap_char * 3
|
|
98 assert len(gap_codon) == 3
|
|
99 else:
|
|
100 from Bio.Alphabet import Gapped
|
|
101 alpha = Gapped(alpha, gap)
|
|
102 gap_codon = gap*3
|
|
103
|
|
104 ungapped_protein = aligned_protein_record.seq.ungap(gap)
|
|
105 ungapped_nucleotide = unaligned_nucleotide_record.seq
|
|
106 if table:
|
|
107 ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table)
|
|
108 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide):
|
|
109 sys_exit("Inconsistent lengths for %s, ungapped protein %i, "
|
|
110 "tripled %i vs ungapped nucleotide %i" %
|
|
111 (aligned_protein_record.id,
|
|
112 len(ungapped_protein),
|
|
113 len(ungapped_protein) * 3,
|
|
114 len(ungapped_nucleotide)))
|
|
115
|
|
116 seq = []
|
|
117 nuc = str(ungapped_nucleotide)
|
|
118 for amino_acid in aligned_protein_record.seq:
|
|
119 if amino_acid == gap:
|
|
120 seq.append(gap_codon)
|
|
121 else:
|
|
122 seq.append(nuc[:3])
|
|
123 nuc = nuc[3:]
|
|
124 assert not nuc, "Nucleotide sequence for %r longer than protein %r" \
|
|
125 % (unaligned_nucleotide_record.id, aligned_protein_record.id)
|
|
126
|
|
127 aligned_nuc = unaligned_nucleotide_record[:] #copy for most annotation
|
|
128 aligned_nuc.letter_annotation = {} #clear this
|
|
129 aligned_nuc.seq = Seq("".join(seq), alpha) #replace this
|
|
130 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc)
|
|
131 return aligned_nuc
|
|
132
|
|
133 def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0):
|
|
134 """Thread nucleotide sequences onto a protein alignment."""
|
|
135 #TODO - Separate arguments for protein and nucleotide gap characters?
|
|
136 if key_function is None:
|
|
137 key_function = lambda x: x
|
|
138 if gap is None:
|
|
139 gap = "-"
|
|
140
|
|
141 aligned = []
|
|
142 for protein in protein_alignment:
|
|
143 protein.id = protein.id[:-2]
|
|
144 try:
|
|
145 nucleotide = nucleotide_records[key_function(protein.id)]
|
|
146 except KeyError:
|
|
147 raise ValueError("Could not find nucleotide sequence for protein %r" \
|
|
148 % protein.id)
|
|
149 aligned.append(sequence_back_translate(protein, nucleotide, gap, table))
|
|
150 return MultipleSeqAlignment(aligned)
|
|
151
|
|
152
|
|
153 if len(sys.argv) == 4:
|
|
154 align_format, prot_align_file, nuc_fasta_file = sys.argv[1:]
|
|
155 nuc_align_file = sys.stdout
|
|
156 table = 0
|
|
157 elif len(sys.argv) == 5:
|
|
158 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:]
|
|
159 table = 0
|
|
160 elif len(sys.argv) == 6:
|
|
161 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:]
|
|
162 else:
|
|
163 sys_exit("""This is a Python script for 'back-translating' a protein alignment,
|
|
164
|
|
165 It requires three, four or five arguments:
|
|
166 - alignment format (e.g. fasta, clustal),
|
|
167 - aligned protein file (in specified format),
|
|
168 - unaligned nucleotide file (in fasta format).
|
|
169 - aligned nucleotiode output file (in same format), optional.
|
|
170 - NCBI translation table (0 for none), optional
|
|
171
|
|
172 The nucleotide alignment is printed to stdout if no output filename is given.
|
|
173
|
|
174 Example usage:
|
|
175
|
|
176 $ python align_back_trans.py fasta demo_prot_align.fasta demo_nucs.fasta demo_nuc_align.fasta
|
|
177
|
|
178 Warning: If the output file already exists, it will be overwritten.
|
|
179
|
|
180 This script is available with sample data and a Galaxy wrapper here:
|
|
181 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans
|
|
182 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
|
|
183 """)
|
|
184
|
|
185 try:
|
|
186 table = int(table)
|
|
187 except:
|
|
188 sys_exit("Bad table argument %r" % table)
|
|
189
|
|
190 prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein)
|
|
191 nuc_dict = SeqIO.index(nuc_fasta_file, "fasta")
|
|
192 nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table)
|
|
193 AlignIO.write(nuc_align, nuc_align_file, align_format)
|