Mercurial > repos > jasper > align_back_trans
comparison align_back_trans.py @ 0:526c3f268982 draft
Uploaded
| author | jasper |
|---|---|
| date | Fri, 03 Feb 2017 12:49:04 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:526c3f268982 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Back-translate a protein alignment to nucleotides | |
| 3 | |
| 4 This tool is a short Python script (using Biopython library functions) to | |
| 5 load a protein alignment, and matching nucleotide FASTA file of unaligned | |
| 6 sequences, in order to produce a codon aware nucleotide alignment - which | |
| 7 can be viewed as a back translation. | |
| 8 | |
| 9 The development repository for this tool is here: | |
| 10 | |
| 11 * https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans | |
| 12 | |
| 13 This tool is available with a Galaxy wrapper from the Galaxy Tool Shed at: | |
| 14 | |
| 15 * http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans | |
| 16 | |
| 17 See accompanying text file for licence details (MIT licence). | |
| 18 """ | |
| 19 | |
| 20 import sys | |
| 21 from Bio.Seq import Seq | |
| 22 from Bio.Alphabet import generic_dna, generic_protein | |
| 23 from Bio.Align import MultipleSeqAlignment | |
| 24 from Bio import SeqIO | |
| 25 from Bio import AlignIO | |
| 26 from Bio.Data.CodonTable import ambiguous_generic_by_id | |
| 27 | |
| 28 if "-v" in sys.argv or "--version" in sys.argv: | |
| 29 print "v0.0.5" | |
| 30 sys.exit(0) | |
| 31 | |
| 32 def sys_exit(msg, error_level=1): | |
| 33 """Print error message to stdout and quit with given error level.""" | |
| 34 sys.stderr.write("%s\n" % msg) | |
| 35 sys.exit(error_level) | |
| 36 | |
| 37 def check_trans(identifier, nuc, prot, table): | |
| 38 """Returns nucleotide sequence if works (can remove trailing stop)""" | |
| 39 if len(nuc) % 3: | |
| 40 sys_exit("Nucleotide sequence for %s is length %i (not a multiple of three)" | |
| 41 % (identifier, len(nuc))) | |
| 42 | |
| 43 p = str(prot).upper().replace("*", "X") | |
| 44 t = str(nuc.translate(table)).upper().replace("*", "X") | |
| 45 if len(t) == len(p) + 1: | |
| 46 if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons: | |
| 47 #Allow this... | |
| 48 t = t[:-1] | |
| 49 nuc = nuc[:-3] #edit return value | |
| 50 if len(t) != len(p): | |
| 51 err = ("Inconsistent lengths for %s, ungapped protein %i, " | |
| 52 "tripled %i vs ungapped nucleotide %i." % | |
| 53 (identifier, len(p), len(p) * 3, len(nuc))) | |
| 54 if t.endswith(p): | |
| 55 err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p)) | |
| 56 elif t.startswith(p): | |
| 57 err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p)) | |
| 58 elif p in t: | |
| 59 #TODO - Calculate and report the number to trim at start and end? | |
| 60 err += "\nHowever, protein sequence found within translated nucleotides." | |
| 61 elif p[1:] in t: | |
| 62 err += "\nHowever, ignoring first amino acid, protein sequence found within translated nucleotides." | |
| 63 sys_exit(err) | |
| 64 | |
| 65 | |
| 66 if t == p: | |
| 67 return nuc | |
| 68 elif p.startswith("M") and "M" + t[1:] == p: | |
| 69 #Close, was there a start codon? | |
| 70 if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons: | |
| 71 return nuc | |
| 72 else: | |
| 73 sys_exit("Translation check failed for %s\n" | |
| 74 "Would match if %s was a start codon (check correct table used)\n" | |
| 75 % (identifier, nuc[0:3].upper())) | |
| 76 else: | |
| 77 #Allow * vs X here? e.g. internal stop codons | |
| 78 m = "".join("." if x==y else "!" for (x,y) in zip(p,t)) | |
| 79 if len(prot) < 70: | |
| 80 sys.stderr.write("Protein: %s\n" % p) | |
| 81 sys.stderr.write(" %s\n" % m) | |
| 82 sys.stderr.write("Translation: %s\n" % t) | |
| 83 else: | |
| 84 for offset in range(0, len(p), 60): | |
| 85 sys.stderr.write("Protein: %s\n" % p[offset:offset+60]) | |
| 86 sys.stderr.write(" %s\n" % m[offset:offset+60]) | |
| 87 sys.stderr.write("Translation: %s\n\n" % t[offset:offset+60]) | |
| 88 sys_exit("Translation check failed for %s\n" % identifier) | |
| 89 | |
| 90 def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0): | |
| 91 #TODO - Separate arguments for protein gap and nucleotide gap? | |
| 92 if not gap or len(gap) != 1: | |
| 93 raise ValueError("Please supply a single gap character") | |
| 94 | |
| 95 alpha = unaligned_nucleotide_record.seq.alphabet | |
| 96 if hasattr(alpha, "gap_char"): | |
| 97 gap_codon = alpha.gap_char * 3 | |
| 98 assert len(gap_codon) == 3 | |
| 99 else: | |
| 100 from Bio.Alphabet import Gapped | |
| 101 alpha = Gapped(alpha, gap) | |
| 102 gap_codon = gap*3 | |
| 103 | |
| 104 ungapped_protein = aligned_protein_record.seq.ungap(gap) | |
| 105 ungapped_nucleotide = unaligned_nucleotide_record.seq | |
| 106 if table: | |
| 107 ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table) | |
| 108 elif len(ungapped_protein) * 3 != len(ungapped_nucleotide): | |
| 109 sys_exit("Inconsistent lengths for %s, ungapped protein %i, " | |
| 110 "tripled %i vs ungapped nucleotide %i" % | |
| 111 (aligned_protein_record.id, | |
| 112 len(ungapped_protein), | |
| 113 len(ungapped_protein) * 3, | |
| 114 len(ungapped_nucleotide))) | |
| 115 | |
| 116 seq = [] | |
| 117 nuc = str(ungapped_nucleotide) | |
| 118 for amino_acid in aligned_protein_record.seq: | |
| 119 if amino_acid == gap: | |
| 120 seq.append(gap_codon) | |
| 121 else: | |
| 122 seq.append(nuc[:3]) | |
| 123 nuc = nuc[3:] | |
| 124 assert not nuc, "Nucleotide sequence for %r longer than protein %r" \ | |
| 125 % (unaligned_nucleotide_record.id, aligned_protein_record.id) | |
| 126 | |
| 127 aligned_nuc = unaligned_nucleotide_record[:] #copy for most annotation | |
| 128 aligned_nuc.letter_annotation = {} #clear this | |
| 129 aligned_nuc.seq = Seq("".join(seq), alpha) #replace this | |
| 130 assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc) | |
| 131 return aligned_nuc | |
| 132 | |
| 133 def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0): | |
| 134 """Thread nucleotide sequences onto a protein alignment.""" | |
| 135 #TODO - Separate arguments for protein and nucleotide gap characters? | |
| 136 if key_function is None: | |
| 137 key_function = lambda x: x | |
| 138 if gap is None: | |
| 139 gap = "-" | |
| 140 | |
| 141 aligned = [] | |
| 142 for protein in protein_alignment: | |
| 143 protein.id = protein.id[:-2] | |
| 144 try: | |
| 145 nucleotide = nucleotide_records[key_function(protein.id)] | |
| 146 except KeyError: | |
| 147 raise ValueError("Could not find nucleotide sequence for protein %r" \ | |
| 148 % protein.id) | |
| 149 aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) | |
| 150 return MultipleSeqAlignment(aligned) | |
| 151 | |
| 152 | |
| 153 if len(sys.argv) == 4: | |
| 154 align_format, prot_align_file, nuc_fasta_file = sys.argv[1:] | |
| 155 nuc_align_file = sys.stdout | |
| 156 table = 0 | |
| 157 elif len(sys.argv) == 5: | |
| 158 align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] | |
| 159 table = 0 | |
| 160 elif len(sys.argv) == 6: | |
| 161 align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:] | |
| 162 else: | |
| 163 sys_exit("""This is a Python script for 'back-translating' a protein alignment, | |
| 164 | |
| 165 It requires three, four or five arguments: | |
| 166 - alignment format (e.g. fasta, clustal), | |
| 167 - aligned protein file (in specified format), | |
| 168 - unaligned nucleotide file (in fasta format). | |
| 169 - aligned nucleotiode output file (in same format), optional. | |
| 170 - NCBI translation table (0 for none), optional | |
| 171 | |
| 172 The nucleotide alignment is printed to stdout if no output filename is given. | |
| 173 | |
| 174 Example usage: | |
| 175 | |
| 176 $ python align_back_trans.py fasta demo_prot_align.fasta demo_nucs.fasta demo_nuc_align.fasta | |
| 177 | |
| 178 Warning: If the output file already exists, it will be overwritten. | |
| 179 | |
| 180 This script is available with sample data and a Galaxy wrapper here: | |
| 181 https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans | |
| 182 http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans | |
| 183 """) | |
| 184 | |
| 185 try: | |
| 186 table = int(table) | |
| 187 except: | |
| 188 sys_exit("Bad table argument %r" % table) | |
| 189 | |
| 190 prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein) | |
| 191 nuc_dict = SeqIO.index(nuc_fasta_file, "fasta") | |
| 192 nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table) | |
| 193 AlignIO.write(nuc_align, nuc_align_file, align_format) |
