Mercurial > repos > jasper > align_back_trans
changeset 13:1df4c5372e07 draft
Deleted selected files
author | jasper |
---|---|
date | Fri, 03 Feb 2017 14:06:53 -0500 |
parents | 28459eecd18c |
children | 558e6dac53d6 |
files | align_back_trans.py |
diffstat | 1 files changed, 0 insertions(+), 193 deletions(-) [+] |
line wrap: on
line diff
--- a/align_back_trans.py Fri Feb 03 13:45:14 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,193 +0,0 @@ -#!/usr/bin/env python -"""Back-translate a protein alignment to nucleotides - -This tool is a short Python script (using Biopython library functions) to -load a protein alignment, and matching nucleotide FASTA file of unaligned -sequences, in order to produce a codon aware nucleotide alignment - which -can be viewed as a back translation. - -The development repository for this tool is here: - -* https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans - -This tool is available with a Galaxy wrapper from the Galaxy Tool Shed at: - -* http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans - -See accompanying text file for licence details (MIT licence). -""" - -import sys -from Bio.Seq import Seq -from Bio.Alphabet import generic_dna, generic_protein -from Bio.Align import MultipleSeqAlignment -from Bio import SeqIO -from Bio import AlignIO -from Bio.Data.CodonTable import ambiguous_generic_by_id - -if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.5" - sys.exit(0) - -def sys_exit(msg, error_level=1): - """Print error message to stdout and quit with given error level.""" - sys.stderr.write("%s\n" % msg) - sys.exit(error_level) - -def check_trans(identifier, nuc, prot, table): - """Returns nucleotide sequence if works (can remove trailing stop)""" - if len(nuc) % 3: - sys_exit("Nucleotide sequence for %s is length %i (not a multiple of three)" - % (identifier, len(nuc))) - - p = str(prot).upper().replace("*", "X") - t = str(nuc.translate(table)).upper().replace("*", "X") - if len(t) == len(p) + 1: - if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons: - #Allow this... - t = t[:-1] - nuc = nuc[:-3] #edit return value - if len(t) != len(p): - err = ("Inconsistent lengths for %s, ungapped protein %i, " - "tripled %i vs ungapped nucleotide %i." % - (identifier, len(p), len(p) * 3, len(nuc))) - if t.endswith(p): - err += "\nThere are %i extra nucleotides at the start." % (len(t) - len(p)) - elif t.startswith(p): - err += "\nThere are %i extra nucleotides at the end." % (len(t) - len(p)) - elif p in t: - #TODO - Calculate and report the number to trim at start and end? - err += "\nHowever, protein sequence found within translated nucleotides." - elif p[1:] in t: - err += "\nHowever, ignoring first amino acid, protein sequence found within translated nucleotides." - sys_exit(err) - - - if t == p: - return nuc - elif p.startswith("M") and "M" + t[1:] == p: - #Close, was there a start codon? - if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons: - return nuc - else: - sys_exit("Translation check failed for %s\n" - "Would match if %s was a start codon (check correct table used)\n" - % (identifier, nuc[0:3].upper())) - else: - #Allow * vs X here? e.g. internal stop codons - m = "".join("." if x==y else "!" for (x,y) in zip(p,t)) - if len(prot) < 70: - sys.stderr.write("Protein: %s\n" % p) - sys.stderr.write(" %s\n" % m) - sys.stderr.write("Translation: %s\n" % t) - else: - for offset in range(0, len(p), 60): - sys.stderr.write("Protein: %s\n" % p[offset:offset+60]) - sys.stderr.write(" %s\n" % m[offset:offset+60]) - sys.stderr.write("Translation: %s\n\n" % t[offset:offset+60]) - sys_exit("Translation check failed for %s\n" % identifier) - -def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0): - #TODO - Separate arguments for protein gap and nucleotide gap? - if not gap or len(gap) != 1: - raise ValueError("Please supply a single gap character") - - alpha = unaligned_nucleotide_record.seq.alphabet - if hasattr(alpha, "gap_char"): - gap_codon = alpha.gap_char * 3 - assert len(gap_codon) == 3 - else: - from Bio.Alphabet import Gapped - alpha = Gapped(alpha, gap) - gap_codon = gap*3 - - ungapped_protein = aligned_protein_record.seq.ungap(gap) - ungapped_nucleotide = unaligned_nucleotide_record.seq - if table: - ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table) - elif len(ungapped_protein) * 3 != len(ungapped_nucleotide): - sys_exit("Inconsistent lengths for %s, ungapped protein %i, " - "tripled %i vs ungapped nucleotide %i" % - (aligned_protein_record.id, - len(ungapped_protein), - len(ungapped_protein) * 3, - len(ungapped_nucleotide))) - - seq = [] - nuc = str(ungapped_nucleotide) - for amino_acid in aligned_protein_record.seq: - if amino_acid == gap: - seq.append(gap_codon) - else: - seq.append(nuc[:3]) - nuc = nuc[3:] - assert not nuc, "Nucleotide sequence for %r longer than protein %r" \ - % (unaligned_nucleotide_record.id, aligned_protein_record.id) - - aligned_nuc = unaligned_nucleotide_record[:] #copy for most annotation - aligned_nuc.letter_annotation = {} #clear this - aligned_nuc.seq = Seq("".join(seq), alpha) #replace this - assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc) - return aligned_nuc - -def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0): - """Thread nucleotide sequences onto a protein alignment.""" - #TODO - Separate arguments for protein and nucleotide gap characters? - if key_function is None: - key_function = lambda x: x - if gap is None: - gap = "-" - - aligned = [] - for protein in protein_alignment: - protein.id = protein.id[:-2] - try: - nucleotide = nucleotide_records[key_function(protein.id)] - except KeyError: - raise ValueError("Could not find nucleotide sequence for protein %r" \ - % protein.id) - aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) - return MultipleSeqAlignment(aligned) - - -if len(sys.argv) == 4: - align_format, prot_align_file, nuc_fasta_file = sys.argv[1:] - nuc_align_file = sys.stdout - table = 0 -elif len(sys.argv) == 5: - align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] - table = 0 -elif len(sys.argv) == 6: - align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:] -else: - sys_exit("""This is a Python script for 'back-translating' a protein alignment, - -It requires three, four or five arguments: -- alignment format (e.g. fasta, clustal), -- aligned protein file (in specified format), -- unaligned nucleotide file (in fasta format). -- aligned nucleotiode output file (in same format), optional. -- NCBI translation table (0 for none), optional - -The nucleotide alignment is printed to stdout if no output filename is given. - -Example usage: - -$ python align_back_trans.py fasta demo_prot_align.fasta demo_nucs.fasta demo_nuc_align.fasta - -Warning: If the output file already exists, it will be overwritten. - -This script is available with sample data and a Galaxy wrapper here: -https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans -http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans -""") - -try: - table = int(table) -except: - sys_exit("Bad table argument %r" % table) - -prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein) -nuc_dict = SeqIO.index(nuc_fasta_file, "fasta") -nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table) -AlignIO.write(nuc_align, nuc_align_file, align_format)