Mercurial > repos > bgruening > trna_prediction
changeset 1:919523f43769 draft
Uploaded
author | bgruening |
---|---|
date | Thu, 22 Jan 2015 13:13:20 -0500 |
parents | 03d127fd842b |
children | 6d97da269ee2 |
files | aragorn.xml aragorn_out_to_gff3.py tRNAscan.xml test-data/aragorn_tansl-table-1_tmRNA_tRNA.gff3 tool_dependencies.xml |
diffstat | 5 files changed, 245 insertions(+), 82 deletions(-) [+] |
line wrap: on
line diff
--- a/aragorn.xml Sun Jun 09 07:42:15 2013 -0400 +++ b/aragorn.xml Thu Jan 22 13:13:20 2015 -0500 @@ -1,11 +1,13 @@ -<tool id="aragorn_trna" name="tRNA and tmRNA" version="0.3"> - <description>prediction (Aragon)</description> +<tool id="aragorn_trna" name="tRNA and tmRNA" version="0.4"> + <description>prediction (Aragorn)</description> <requirements> <requirement type="package" version="1.2.36">aragorn</requirement> + <requirement type="set_environment">TRNAPRED_SCRIPT_PATH</requirement> </requirements> <command> - aragorn - $input +<![CDATA[ + aragorn + $input -gc$genbank_gencode $tmRNA $tRNA @@ -14,7 +16,21 @@ $topology -o $output $secondary_structure - $introns + $introns; + +#if $gff3_output: + aragorn + $input + -gc$genbank_gencode + $tmRNA + $tRNA + $mtRNA + $mam_mtRNA + $topology + -fasta + $introns | python \$TRNAPRED_SCRIPT_PATH/aragorn_out_to_gff3.py > $gff3_output_file; +#end if +]]> </command> <inputs> <param name="input" type="data" format="fasta" label="Genome Sequence"/> @@ -48,6 +64,7 @@ <param name='mam_mtRNA' type='boolean' label='Search for Mammalian mitochondrial tRNA genes (-mtmam)' truevalue='-mtmam' falsevalue='' checked="false" help='-i switch ignored. -tv switch set. Mammalian mitochondrial genetic code used.' /> <param name='introns' type='boolean' label='Search for tRNA genes with introns in anticodon loop ...' truevalue='-i' falsevalue='' checked="false" help='...with maximum length 3000 bases (-i).' /> <param name='secondary_structure' type='boolean' label='Print out secondary structure (-fasta,-fon)' truevalue='-fasta' falsevalue='-fon' checked="false" help='' /> + <param name="gff3_output" type='boolean' label='Convert output to GFF3' truevalue='True' falsevalue='' checked="false" help='' /> </inputs> <outputs> <data name="output" format="fasta"> @@ -55,6 +72,9 @@ <when input="secondary_structure" value="true" format="text"/> </change_format> </data> + <data format="gff3" name="gff3_output_file" > + <filter>gff3_output</filter> + </data> </outputs> <tests> <test> @@ -65,12 +85,15 @@ <param name="tRNA" value="-t" /> <param name="mtRNA" value="" /> <param name="mam_mtRNA" value="" /> - <param name="introns" value="" /> + <param name="introns" value="" /> <param name="secondary_structure" value="-fon" /> + <param name="gff3_output" value="True" /> <output name="output" file="aragorn_tansl-table-1_tmRNA_tRNA.fasta" ftype="fasta" /> + <output name="output" file="aragorn_tansl-table-1_tmRNA_tRNA.gff3" ftype="gff3" /> </test> </tests> <help> +<![CDATA[ **What it does** @@ -125,41 +148,41 @@ tRNA Anticodon Frequency - AAA Phe GAA Phe 1 CAA Leu 1 TAA Leu 1 - AGA Ser GGA Ser 1 CGA Ser 2 TGA Ser 1 - ACA Cys GCA Cys 2 CCA Trp 2 TCA seC - ATA Tyr GTA Tyr 1 CTA Pyl TTA Stop - AAG Leu GAG Leu 3 CAG Leu 1 TAG Leu 2 - AGG Pro GGG Pro 2 CGG Pro 2 TGG Pro 2 - ACG Arg 1 GCG Arg 2 CCG Arg 1 TCG Arg - ATG His GTG His 2 CTG Gln 2 TTG Gln 1 - AAC Val GAC Val 3 CAC Val 2 TAC Val 1 - AGC Ala GGC Ala 2 CGC Ala 3 TGC Ala 1 - ACC Gly GCC Gly 5 CCC Gly 1 TCC Gly 2 - ATC Asp GTC Asp 3 CTC Glu 2 TTC Glu 2 - AAT Ile GAT Ile 3 CAT Met 6 TAT Ile - AGT Thr GGT Thr 2 CGT Thr 1 TGT Thr 2 - ACT Ser GCT Ser 1 CCT Arg 1 TCT Arg 1 - ATT Asn GTT Asn 3 CTT Lys 3 TTT Lys 2 + AAA Phe GAA Phe 1 CAA Leu 1 TAA Leu 1 + AGA Ser GGA Ser 1 CGA Ser 2 TGA Ser 1 + ACA Cys GCA Cys 2 CCA Trp 2 TCA seC + ATA Tyr GTA Tyr 1 CTA Pyl TTA Stop + AAG Leu GAG Leu 3 CAG Leu 1 TAG Leu 2 + AGG Pro GGG Pro 2 CGG Pro 2 TGG Pro 2 + ACG Arg 1 GCG Arg 2 CCG Arg 1 TCG Arg + ATG His GTG His 2 CTG Gln 2 TTG Gln 1 + AAC Val GAC Val 3 CAC Val 2 TAC Val 1 + AGC Ala GGC Ala 2 CGC Ala 3 TGC Ala 1 + ACC Gly GCC Gly 5 CCC Gly 1 TCC Gly 2 + ATC Asp GTC Asp 3 CTC Glu 2 TTC Glu 2 + AAT Ile GAT Ile 3 CAT Met 6 TAT Ile + AGT Thr GGT Thr 2 CGT Thr 1 TGT Thr 2 + ACT Ser GCT Ser 1 CCT Arg 1 TCT Arg 1 + ATT Asn GTT Asn 3 CTT Lys 3 TTT Lys 2 Ambiguous: 1 tRNA Codon Frequency - TTT Phe TTC Phe 1 TTG Leu 1 TTA Leu 1 - TCT Ser TCC Ser 1 TCG Ser 2 TCA Ser 1 - TGT Cys TGC Cys 2 TGG Trp 2 TGA seC - TAT Tyr TAC Tyr 1 TAG Pyl TAA Stop - CTT Leu CTC Leu 3 CTG Leu 1 CTA Leu 2 - CCT Pro CCC Pro 2 CCG Pro 2 CCA Pro 2 - CGT Arg 1 CGC Arg 2 CGG Arg 1 CGA Arg - CAT His CAC His 2 CAG Gln 2 CAA Gln 1 - GTT Val GTC Val 3 GTG Val 2 GTA Val 1 - GCT Ala GCC Ala 2 GCG Ala 3 GCA Ala 1 - GGT Gly GGC Gly 5 GGG Gly 1 GGA Gly 2 - GAT Asp GAC Asp 3 GAG Glu 2 GAA Glu 2 - ATT Ile ATC Ile 3 ATG Met 6 ATA Ile - ACT Thr ACC Thr 2 ACG Thr 1 ACA Thr 2 - AGT Ser AGC Ser 1 AGG Arg 1 AGA Arg 1 - AAT Asn AAC Asn 3 AAG Lys 3 AAA Lys 2 + TTT Phe TTC Phe 1 TTG Leu 1 TTA Leu 1 + TCT Ser TCC Ser 1 TCG Ser 2 TCA Ser 1 + TGT Cys TGC Cys 2 TGG Trp 2 TGA seC + TAT Tyr TAC Tyr 1 TAG Pyl TAA Stop + CTT Leu CTC Leu 3 CTG Leu 1 CTA Leu 2 + CCT Pro CCC Pro 2 CCG Pro 2 CCA Pro 2 + CGT Arg 1 CGC Arg 2 CGG Arg 1 CGA Arg + CAT His CAC His 2 CAG Gln 2 CAA Gln 1 + GTT Val GTC Val 3 GTG Val 2 GTA Val 1 + GCT Ala GCC Ala 2 GCG Ala 3 GCA Ala 1 + GGT Gly GGC Gly 5 GGG Gly 1 GGA Gly 2 + GAT Asp GAC Asp 3 GAG Glu 2 GAA Glu 2 + ATT Ile ATC Ile 3 ATG Met 6 ATA Ile + ACT Thr ACC Thr 2 ACG Thr 1 ACA Thr 2 + AGT Ser AGC Ser 1 AGG Arg 1 AGA Arg 1 + AAT Asn AAC Asn 3 AAG Lys 3 AAA Lys 2 Ambiguous: 1 Number of tRNA genes = 86 @@ -174,7 +197,8 @@ ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences Nucl. Acids Res. (2004) 32(1): 11-16 -doi:10.1093/nar/gkh152 +doi:10.1093/nar/gkh152 +]]> </help> </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/aragorn_out_to_gff3.py Thu Jan 22 13:13:20 2015 -0500 @@ -0,0 +1,165 @@ +#!/usr/bin/env python +import re + +def start_pattern(string): + return re.match(r'^[0-9]+\.$', string) \ + or string.startswith('Number of possible') \ + or string.startswith('Searching for') + +def blank_line(string): + return re.match(r'^\s*$', string) + +def blocks(iterable): + accumulator = [] + run_of_blanklines = 0 + for line in iterable: + # Count blank lines + if blank_line(line): + run_of_blanklines += 1 + else: + run_of_blanklines = 0 + + if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line: + if accumulator: + yield accumulator + accumulator = [line] + else: + accumulator.append(line) + if accumulator: + yield accumulator + +IMPORTANT_INFO = { + 'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'), + 'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'), + 'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'), + 'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), + 'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'), +} +INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo') + +def important_info(block): + info = {} + for line in block: + for matcher in IMPORTANT_INFO: + matches = IMPORTANT_INFO[matcher].search(line) + if matches: + for group in INFO_GROUPS: + try: + info[group] = matches.group(group) + except: + pass + return info + +IMPORTANT_INFO_TMRNA = { + 'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'), + 'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), +} +INFO_GROUPS_TMRNA = ('start', 'end', 'pep') + +def important_info_tmrna(block): + info = {} + for line in block: + for matcher in IMPORTANT_INFO_TMRNA: + matches = IMPORTANT_INFO_TMRNA[matcher].search(line) + if matches: + for group in INFO_GROUPS_TMRNA: + try: + info[group] = matches.group(group) + except: + pass + return info + +import fileinput +stdin_data = [] +for line in fileinput.input(): + stdin_data.append(line) + +possible_blocks = [line for line in blocks(stdin_data)] + +seqid = None +print '##gff-version-3' +# We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible +# awful things +for block_idx in range(len(possible_blocks)): + block = possible_blocks[block_idx] + data = None + fasta_defline = None + + if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]: + # Try and get a sequence ID out of it + try: + fasta_defline = block[-2].strip() + except: + # Failing that, ignore it. + pass + else: + # They DUPLICATE results in multiple places, including a fasta version + # in the 'full report'. + possible_ugliness = [x for x in block if x.startswith('>t')] + if len(possible_ugliness) > 0: + continue + + # However, if it didn't have one of those all important pieces of + # information, then it's either a different important piece of + # information, or complete junk + data = important_info(block) + + # I am not proud of any of this. We essentially say "if that block + # didn't come up with useful info, then try making it a tmrna" + if len(data.keys()) == 0: + data = important_info_tmrna(block) + # And if that fails, just none it. + if len(data.keys()) == 0: + data = None + else: + # But if it didn't, confirm that we're a tmRNA + data['type'] = 'tmRNA' + else: + # If we did have keys, and didn't pass through any of the tmRNA + # checks, we're tRNA + data['type'] = 'tRNA' + + # If we got a sequence ID in this block, set the defline + if 'nucleotides in sequence' in block[-1]: + try: + fasta_defline = block[-2].strip() + except: + pass + + # if a defline is available, try and extract the fasta header ID + if fasta_defline is not None: + try: + seqid = fasta_defline[0:fasta_defline.index(' ')] + except: + seqid = fasta_defline + + # If there's data + if data is not None and len(data.keys()) > 1: + + # Deal with our flags/notes. + if data['type'] == 'tRNA': + # Are these acceptable GFF3 tags? + notes = { + 'Codon': data['codon'], + 'Anticodon': data['anticodon'], + } + if 'pseudo' in data: + notes['Note'] = 'Possible pseudogene' + else: + notes = { + 'Note': 'Tag peptide: ' + data['pep'] + '' + } + + notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()]) + + print '\t'.join([ + seqid, + 'aragorn', + data['type'], + data['start'], + data['end'], + '.', + '.', + '.', + notestr + ])
--- a/tRNAscan.xml Sun Jun 09 07:42:15 2013 -0400 +++ b/tRNAscan.xml Thu Jan 22 13:13:20 2015 -0500 @@ -1,10 +1,11 @@ -<tool id="trnascan" name="tRNAscan" version="0.3"> - <description>tRNA Scan</description> +<tool id="trnascan" name="tRNA prediction" version="0.3"> + <description>(tRNAscan)</description> <requirements> <requirement type="package" version="1.3.1">tRNAscan-SE</requirement> <requirement type="package" version="1.61">biopython</requirement> </requirements> <command interpreter="python"> +<![CDATA[ tRNAscan.py $organism $mode @@ -15,6 +16,7 @@ $tabular_output $inputfile $fasta_output +]]> </command> <inputs> <param name="inputfile" type="data" format="fasta" label="Genome Sequence" help="Dataset missing? See TIP below"/> @@ -54,6 +56,7 @@ </test> </tests> <help> +<![CDATA[ .. class:: warningmark @@ -176,7 +179,7 @@ pseudogene. The heuristic pseudogene detection filter uses this information to flag possible pseudogenes -- use this option to see why a hit is marked as a possible pseudogene. The user may wish to - examine score breakdowns from known tRNAs in the organism of interest + examine score breakdowns from known tRNAs in the organism of interest to get a frame of reference. ----- @@ -207,7 +210,7 @@ ======== ====== ===== ====== ==== ========== ====== ====== ========== ========== - tRNA Bounds Intron Bonds + tRNA Bounds Intron Bonds -------- ------ ---------------- ---- ---------- ---------------- ---------- ---------- Name # tRNA Begin End tRNA Anti Codon Begin End Cove Score Hit Origin ======== ====== ===== ====== ==== ========== ====== ====== ========== ========== @@ -227,8 +230,9 @@ tRNAscan-SE: A Program for Improved Detection of Transfer RNA Genes in Genomic Sequence Nucl. Acids Res. (1997) 25(5): 0955-964 -doi:10.1093/nar/25.5.0955 +doi:10.1093/nar/25.5.0955 +]]> </help> </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/aragorn_tansl-table-1_tmRNA_tRNA.gff3 Thu Jan 22 13:13:20 2015 -0500 @@ -0,0 +1,2 @@ +##gff-version-3 +gi|240255695:23036500-23037000 aragorn tRNA 381 453 . . . Anticodon=tgc;Codon=Ala
--- a/tool_dependencies.xml Sun Jun 09 07:42:15 2013 -0400 +++ b/tool_dependencies.xml Thu Jan 22 13:13:20 2015 -0500 @@ -1,47 +1,15 @@ <?xml version="1.0"?> <tool_dependency> <package name="biopython" version="1.61"> - <repository changeset_revision="627c7b41b970" name="package_biopython_1_61" owner="biopython" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + <repository changeset_revision="2f6c871cfa35" name="package_biopython_1_61" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </package> <package name="aragorn" version="1.2.36"> - <install version="1.0"> - <actions> - <action type="download_by_url">http://mbio-serv2.mbioekol.lu.se/ARAGORN/Downloads/aragorn1.2.36.tgz</action> - <action type="make_directory">$INSTALL_DIR/bin/</action> - <action type="shell_command">gcc -O3 -ffast-math -finline-functions -o aragorn aragorn1.2.36.c</action> - <action type="move_file"> - <source>aragorn</source> - <destination>$INSTALL_DIR/bin</destination> - </action> - <action type="set_environment"> - <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> - </action> - </actions> - </install> - <readme>Compiling ARAGORN requires gcc.</readme> + <repository changeset_revision="d561a0a9f601" name="package_aragorn_1_2_36" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </package> <package name="tRNAscan-SE" version="1.3.1"> - <install version="1.0"> - <actions> - <action type="download_by_url">http://lowelab.ucsc.edu/software/tRNAscan-SE.tar.gz</action> - <action type="make_directory">$INSTALL_DIR/bin/</action> - <action type="make_directory">$INSTALL_DIR/lib/tRNAscan-SE/</action> - <action type="make_directory">$INSTALL_DIR/man/</action> - <!-- replacing the hardcoded pathvariables with the real ones --> - <action type="shell_command">cd ./tRNAscan-SE-1.3.1 && sed 's%^BINDIR = .*%BINDIR = $INSTALL_DIR/bin/%' Makefile | sed 's%^LIBDIR = .*%LIBDIR = $INSTALL_DIR/lib/tRNAscan-SE/%' | sed 's%^MANDIR = .*%MANDIR = $INSTALL_DIR/man%' > Makefile_new</action> - <action type="shell_command">cd ./tRNAscan-SE-1.3.1 && rm Makefile && mv Makefile_new Makefile</action> - <action type="shell_command">cd ./tRNAscan-SE-1.3.1 && make && make install</action> - - <!-- for some reason infernal needs to be directly under the bin/ from tRNAScan --> - <action type="shell_command">wget ftp://selab.janelia.org/pub/software/infernal/infernal-1.0.2.tar.gz</action> - <action type="shell_command">tar xfvz infernal-1.0.2.tar.gz</action> - <action type="shell_command">cd infernal-1.0.2 && ./configure --prefix=$INSTALL_DIR && make && make install</action> - <action type="set_environment"> - <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> - <environment_variable action="prepend_to" name="PERL5LIB">$INSTALL_DIR/bin/</environment_variable> - </action> - </actions> - </install> - <readme>Compiling and running tRNAScan-SE requires gcc a PERL environment.</readme> + <repository changeset_revision="05896b625b03" name="package_trnascan_1_3_1" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </package> + <set_environment version="1.0"> + <environment_variable action="set_to" name="TRNAPRED_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable> + </set_environment> </tool_dependency>