Mercurial > repos > peterjc > align_back_trans
changeset 2:39ad3f684a30 draft
Uploaded v0.0.3 preview 1, translation checking
author | peterjc |
---|---|
date | Wed, 19 Feb 2014 11:36:01 -0500 |
parents | bd2b013e0d0d |
children | 4367786266d5 |
files | test-data/demo_nuc_align.fasta test-data/demo_nucs.fasta test-data/demo_nucs_trailing_stop.fasta tools/align_back_trans/README.rst tools/align_back_trans/align_back_trans.py tools/align_back_trans/align_back_trans.xml |
diffstat | 6 files changed, 119 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/test-data/demo_nuc_align.fasta Tue Feb 18 08:46:34 2014 -0500 +++ b/test-data/demo_nuc_align.fasta Wed Feb 19 11:36:01 2014 -0500 @@ -1,6 +1,6 @@ >Alpha -GGUGAGGAACGA +GATGAGGAACGA >Beta -GGCGGG---CGU +GATGAG---CGU >Gamma -GGU------CGG +GAT------CGG
--- a/test-data/demo_nucs.fasta Tue Feb 18 08:46:34 2014 -0500 +++ b/test-data/demo_nucs.fasta Wed Feb 19 11:36:01 2014 -0500 @@ -1,6 +1,6 @@ >Alpha -GGUGAGGAACGA +GATGAGGAACGA >Beta -GGCGGGCGU +GATGAGCGU >Gamma -GGUCGG +GATCGG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/demo_nucs_trailing_stop.fasta Wed Feb 19 11:36:01 2014 -0500 @@ -0,0 +1,6 @@ +>Alpha +GATGAGGAACGA +>Beta +GATGAGCGU +>Gamma +GATCGGTAA
--- a/tools/align_back_trans/README.rst Tue Feb 18 08:46:34 2014 -0500 +++ b/tools/align_back_trans/README.rst Wed Feb 19 11:36:01 2014 -0500 @@ -33,14 +33,15 @@ The suggested location is in a dedicated ``tools/align_back_trans`` folder. -You will also need to modify the tools_conf.xml file to tell Galaxy to offer the -tool. One suggested location is in the filters section. Simply add the line:: +You will also need to modify the ``tools_conf.xml`` file to tell Galaxy to offer +the tool. One suggested location is in the multiple alignments section. Simply +add the line:: <tool file="align_back_trans/align_back_trans.xml" /> -You will also need to install Biopython 1.54 or later. If you want to run +You will also need to install Biopython 1.62 or later. If you want to run the unit tests, include this line in ``tools_conf.xml.sample`` and the sample -FASTA files under the test-data directory. Then:: +FASTA files under the ``test-data`` directory. Then:: ./run_functional_tests.sh -id align_back_trans @@ -54,6 +55,7 @@ Version Changes ------- ---------------------------------------------------------------------- v0.0.1 - Initial version, based on a previously written Python script +v0.0.2 - Optionally check the translation is consistent ======= ====================================================================== @@ -69,7 +71,7 @@ For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use the following command from the Galaxy root folder:: - $ tar -czf align_back_trans.tar.gz tools/align_back_trans/README.rst tools/align_back_trans/align_back_trans.py tools/align_back_trans/align_back_trans.xml tools/align_back_trans/tool_dependencies.xml test-data/demo_nucs.fasta test-data/demo_prot_align.fasta test-data/demo_nuc_align.fasta + $ tar -czf align_back_trans.tar.gz tools/align_back_trans/README.rst tools/align_back_trans/align_back_trans.py tools/align_back_trans/align_back_trans.xml tools/align_back_trans/tool_dependencies.xml test-data/demo_nucs.fasta test-data/demo_nucs_trailing_stop.fasta test-data/demo_prot_align.fasta test-data/demo_nuc_align.fasta Check this worked:: @@ -79,6 +81,7 @@ tools/align_back_trans/align_back_trans.xml tools/align_back_trans/tool_dependencies.xml test-data/demo_nucs.fasta + test-data/demo_nucs_trailing_stop.fasta test-data/demo_prot_align.fasta test-data/demo_nuc_align.fasta
--- a/tools/align_back_trans/align_back_trans.py Tue Feb 18 08:46:34 2014 -0500 +++ b/tools/align_back_trans/align_back_trans.py Wed Feb 19 11:36:01 2014 -0500 @@ -25,9 +25,10 @@ from Bio.Align import MultipleSeqAlignment from Bio import SeqIO from Bio import AlignIO +from Bio.Data.CodonTable import ambiguous_generic_by_id if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.2" + print "v0.0.3" sys.exit(0) def stop_err(msg, error_level=1): @@ -35,7 +36,59 @@ sys.stderr.write("%s\n" % msg) sys.exit(error_level) -def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap): +def check_trans(identifier, nuc, prot, table): + """Returns nucleotide sequence if works (can remove trailing stop)""" + if len(nuc) % 3: + stop_err("Nucleotide sequence for %s is length %i (not a multiple of three)" + % (identifier, nuc)) + + p = str(prot).upper().replace("*", "X") + t = str(nuc.translate(table)).upper().replace("*", "X") + if len(t) == len(p) + 1: + if str(nuc)[-3:].upper() in ambiguous_generic_by_id[table].stop_codons: + #Allow this... + t = t[:-1] + nuc = nuc[:-3] #edit return value + if len(t) != len(p) and p in t: + stop_err("%s translation matched but only as subset of nucleotides, " + "wrong start codon?" % identifier) + if len(t) != len(p) and p[1:] in t: + stop_err("%s translation matched (ignoring first base) but only " + "as subset of nucleotides, wring start codon?" % identifier) + if len(t) != len(p): + stop_err("Inconsistent lengths for %s, ungapped protein %i, " + "tripled %i vs ungapped nucleotide %i" % + (identifier, + len(p), + len(p) * 3, + len(nuc))) + + + if t == p: + return nuc + elif p.startswith("M") and "M" + t[1:] == p: + #Close, was there a start codon? + if str(nuc[0:3]).upper() in ambiguous_generic_by_id[table].start_codons: + return nuc + else: + stop_err("Translation check failed for %s\n" + "Would match if %s was a start codon (check correct table used)\n" + % (identifier, nuc[0:3].upper())) + else: + #Allow * vs X here? e.g. internal stop codons + m = "".join("." if x==y else "!" for (x,y) in zip(p,t)) + if len(prot) < 70: + sys.stderr.write("Protein: %s\n" % p) + sys.stderr.write(" %s\n" % m) + sys.stderr.write("Translation: %s\n" % t) + else: + for offset in range(0, len(p), 60): + sys.stderr.write("Protein: %s\n" % p[offset:offset+60]) + sys.stderr.write(" %s\n" % m[offset:offset+60]) + sys.stderr.write("Translation: %s\n\n" % t[offset:offset+60]) + stop_err("Translation check failed for %s\n" % identifier) + +def sequence_back_translate(aligned_protein_record, unaligned_nucleotide_record, gap, table=0): #TODO - Separate arguments for protein gap and nucleotide gap? if not gap or len(gap) != 1: raise ValueError("Please supply a single gap character") @@ -49,22 +102,27 @@ alpha = Gapped(alpha, gap) gap_codon = gap*3 - if len(aligned_protein_record.seq.ungap(gap))*3 != len(unaligned_nucleotide_record.seq): + ungapped_protein = aligned_protein_record.seq.ungap(gap) + ungapped_nucleotide = unaligned_nucleotide_record.seq + if table: + ungapped_nucleotide = check_trans(aligned_protein_record.id, ungapped_nucleotide, ungapped_protein, table) + elif len(ungapped_protein) * 3 != len(ungapped_nucleotide): stop_err("Inconsistent lengths for %s, ungapped protein %i, " "tripled %i vs ungapped nucleotide %i" % - (len(aligned_protein_record.seq.ungap(gap)), - len(aligned_protein_record.seq.ungap(gap))*3, - len(unaligned_nucleotide_record.seq))) + (aligned_protein_record.id, + len(ungapped_protein), + len(ungapped_protein) * 3, + len(ungapped_nucleotide))) seq = [] - nuc = str(unaligned_nucleotide_record.seq) + nuc = str(ungapped_nucleotide) for amino_acid in aligned_protein_record.seq: if amino_acid == gap: seq.append(gap_codon) else: seq.append(nuc[:3]) nuc = nuc[3:] - assert not nuc, "Nucleotide sequence for %r longer than protein %s" \ + assert not nuc, "Nucleotide sequence for %r longer than protein %r" \ % (unaligned_nucleotide_record.id, aligned_protein_record.id) aligned_nuc = unaligned_nucleotide_record[:] #copy for most annotation @@ -73,7 +131,7 @@ assert len(aligned_protein_record.seq) * 3 == len(aligned_nuc) return aligned_nuc -def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None): +def alignment_back_translate(protein_alignment, nucleotide_records, key_function=None, gap=None, table=0): """Thread nucleotide sequences onto a protein alignment.""" #TODO - Separate arguments for protein and nucleotide gap characters? if key_function is None: @@ -88,23 +146,28 @@ except KeyError: raise ValueError("Could not find nucleotide sequence for protein %r" \ % protein.id) - aligned.append(sequence_back_translate(protein, nucleotide, gap)) + aligned.append(sequence_back_translate(protein, nucleotide, gap, table)) return MultipleSeqAlignment(aligned) if len(sys.argv) == 4: align_format, prot_align_file, nuc_fasta_file = sys.argv[1:] nuc_align_file = sys.stdout + table = 0 elif len(sys.argv) == 5: align_format, prot_align_file, nuc_fasta_file, nuc_align_file = sys.argv[1:] + table = 0 +elif len(sys.argv) == 6: + align_format, prot_align_file, nuc_fasta_file, nuc_align_file, table = sys.argv[1:] else: stop_err("""This is a Python script for 'back-translating' a protein alignment, -It requires three or four arguments: +It requires three, four or five arguments: - alignment format (e.g. fasta, clustal), - aligned protein file (in specified format), - unaligned nucleotide file (in fasta format). - aligned nucleotiode output file (in same format), optional. +- NCBI translation table (0 for none), optional The nucleotide alignment is printed to stdout if no output filename is given. @@ -119,7 +182,12 @@ http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans """) +try: + table = int(table) +except: + stop_err("Bad table argument %r" % table) + prot_align = AlignIO.read(prot_align_file, align_format, alphabet=generic_protein) nuc_dict = SeqIO.index(nuc_fasta_file, "fasta") -nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-") +nuc_align = alignment_back_translate(prot_align, nuc_dict, gap="-", table=table) AlignIO.write(nuc_align, nuc_align_file, align_format)
--- a/tools/align_back_trans/align_back_trans.xml Tue Feb 18 08:46:34 2014 -0500 +++ b/tools/align_back_trans/align_back_trans.xml Wed Feb 19 11:36:01 2014 -0500 @@ -1,4 +1,4 @@ -<tool id="align_back_trans" name="Thread nucleotides onto a protein alignment (back-translation)" version="0.0.2"> +<tool id="align_back_trans" name="Thread nucleotides onto a protein alignment (back-translation)" version="0.0.3"> <description>Gives a codon aware alignment</description> <requirements> <requirement type="package" version="1.63">biopython</requirement> @@ -6,7 +6,7 @@ </requirements> <version_command interpreter="python">align_back_trans.py --version</version_command> <command interpreter="python"> -align_back_trans.py $prot_align.ext $prot_align $nuc_file $out_nuc_align +align_back_trans.py $prot_align.ext "$prot_align" "$nuc_file" "$out_nuc_align" "$table" </command> <stdio> <!-- Anything other than zero is an error --> @@ -15,7 +15,6 @@ </stdio> <inputs> <param name="prot_align" type="data" format="fasta,muscle,clustal" label="Aligned protein file" help="Mutliple sequence file in FASTA, ClustalW or PHYLIP format." /> - <!-- TODO? Could verify the translation <param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons"> <option value="1">1. Standard</option> <option value="2">2. Vertebrate Mitochondrial</option> @@ -34,8 +33,8 @@ <option value="21">21. Trematode Mitochondrial</option> <option value="22">22. Scenedesmus obliquus</option> <option value="23">23. Thraustochytrium Mitochondrial</option> + <option value="0">Don't check the translation</option> </param> - --> <param name="nuc_file" type="data" format="fasta" label="Unaligned nucleotide sequences" help="FASTA format, using same identifiers as your protein alignment" /> </inputs> <outputs> @@ -51,6 +50,13 @@ <test> <param name="prot_align" value="demo_prot_align.fasta" /> <param name="nuc_file" value="demo_nucs.fasta" /> + <param name="table" value="0" /> + <output name="out_nuc_align" file="demo_nuc_align.fasta" /> + </test> + <test> + <param name="prot_align" value="demo_prot_align.fasta" /> + <param name="nuc_file" value="demo_nucs_trailing_stop.fasta" /> + <param name="table" value="11" /> <output name="out_nuc_align" file="demo_nuc_align.fasta" /> </test> </tests> @@ -63,8 +69,15 @@ protein alignment to produce a codon aware nucleotide alignment - which can be viewed as a back translation. -Note - the provided nucleotide sequences should be exactly three times the -length of the protein sequences (exluding the gaps). +If you specify one of the standard NCBI genetic codes (recommended), then the +translation is verified. This will allow fuzzy matching if stop codons in the +protein sequence have been reprented as X, and will allow for a trailing stop +codon present in the nucleotide sequences but not the protein. + +Note - the protein and nucleotide sequences must use the same identifers. + +Note - If not translation table is specified, the provided nucleotide sequences +should be exactly three times the length of the protein sequences (exluding the gaps). Note - the nucleotide FASTA file may contain extra sequences not in the protein alignment, they will be ignored. This can be useful if for example