Mercurial > repos > bgruening > glimmer
changeset 4:8ddf54417ade
Uploaded
author | bgruening |
---|---|
date | Fri, 07 Jun 2013 10:02:12 -0400 |
parents | 2d0c26885604 |
children | 5a97ff1a7b12 a07c49839f31 |
files | glimmer2gff.py glimmer2gff.xml glimmer2seq.py glimmer3-build-icm-wrapper.xml glimmer3-main-wrapper.xml glimmer_acgt_content.xml glimmer_build-icm.xml glimmer_orf_to_seq.py glimmer_orf_to_seq.xml glimmer_predict.py glimmer_predict.xml glimmer_w_icm.xml glimmer_wo_icm.py glimmer_wo_icm.xml readme.rst |
diffstat | 15 files changed, 572 insertions(+), 757 deletions(-) [+] |
line wrap: on
line diff
--- a/glimmer2gff.py Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -#!/usr/bin/env python - -""" -Input: Glimmer3 prediction -Output: GFF3 file -Return a GFF3 file with the genes predicted by Glimmer3 -Bjoern Gruening - -Note: Its not a full-fledged GFF3 file, its a really simple one. - -""" - -import sys, re - -def __main__(): - input_file = open(sys.argv[1], 'r') - - print '##gff-version 3\n' - for line in input_file: - line = line.strip() - if line[0] == '>': - header = line[1:] - else: - (id, start, end, frame, score) = re.split('\s+', line) - if int(end) > int(start): - strand = '+' - else: - strand = '-' - (start, end) = (end, start) - - rest = 'frame=%s;score=%s' % (frame, score) - print '\t'.join([header, 'glimmer_prediction', 'predicted_gene', start, end, '.', strand, '.', rest]) - - -if __name__ == "__main__" : - __main__()
--- a/glimmer2gff.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ -<tool id="glimmer2gff" name="Convert Glimmer to GFF" version="0.1"> - <description>Converts Glimmer Files to GFF Files</description> - <command interpreter="python"> - glimmer2gff.py - $input > $output - </command> - <inputs> - <param name="input" type="data" format="tabular" label="Glimmer Output File"/> - </inputs> - <outputs> - <data name="output" type="data" format="gff"/> - </outputs> - <tests> - <test> - - </test> - </tests> - <help> - -**What it does** - -Converts a Glimmer3 output File to an GFF Annotation File:: - -**Example** - -Input:: - >contig00097 sbe.0.234 - orf00003 2869 497 -2 5.60 - orf00005 3894 2875 -1 7.05 - orf00007 4242 4826 +3 8.04 - orf00010 4846 5403 +1 8.57 - orf00012 6858 5413 -1 10.87 - orf00013 6857 7594 +2 3.61 - orf00014 7751 9232 +2 11.34 - orf00015 9374 10357 +2 10.66 - orf00017 10603 11196 +1 13.39 - orf00021 11303 11911 +2 8.81 - orf00025 14791 12050 -2 13.51 - orf00026 15216 16199 +3 6.37 - orf00028 16333 16935 +1 8.86 - - -Output: - contig00097 sbe.0.234 glimmer gene 497 2869 . - . -2 5.60 - contig00097 sbe.0.234 glimmer gene 2875 3894 . - . -1 7.05 - contig00097 sbe.0.234 glimmer gene 4242 4826 . + . +3 8.04 - contig00097 sbe.0.234 glimmer gene 4846 5403 . + . +1 8.57 - contig00097 sbe.0.234 glimmer gene 5413 6858 . - . -1 10.87 - contig00097 sbe.0.234 glimmer gene 6857 7594 . + . +2 3.61 - contig00097 sbe.0.234 glimmer gene 7751 9232 . + . +2 11.34 - contig00097 sbe.0.234 glimmer gene 9374 10357 . + . +2 10.66 - contig00097 sbe.0.234 glimmer gene 10603 11196 . + . +1 13.39 - contig00097 sbe.0.234 glimmer gene 11303 11911 . + . +2 8.81 - contig00097 sbe.0.234 glimmer gene 12050 14791 . - . -2 13.51 - contig00097 sbe.0.234 glimmer gene 15216 16199 . + . +3 6.37 - contig00097 sbe.0.234 glimmer gene 16333 16935 . + . +1 8.86 - - ------ - - - </help> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer2seq.py Fri Jun 07 10:02:12 2013 -0400 @@ -0,0 +1,44 @@ +#!/usr/bin/env python +""" +Input: DNA FASTA file + Glimmer ORF file +Output: ORF sequences as FASTA file +Author: Bjoern Gruening +""" +import sys, os +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord + +def glimmer2seq( glimmer_prediction = sys.argv [1], genome_sequence = sys.argv[2], outfile = sys.argv[3] ): + if len(sys.argv) >= 4: + glimmerfile = open( glimmer_prediction, "r") + sequence = open( genome_sequence ) + else: + print "Missing input values." + sys.exit() + + fastafile = SeqIO.parse(sequence, "fasta") + + sequences = dict() + seq_records = list() + for entry in fastafile: + sequences[entry.description] = entry + + for line in glimmerfile: + if line.startswith('>'): + entry = sequences[ line[1:].strip() ] + else: + orf_start = int(line[8:17]) + orf_end = int(line[18:26]) + + orf_name = line[0:8] + if orf_start <= orf_end: + seq_records.append( SeqRecord( entry.seq[ orf_start-1 : orf_end ], id = orf_name, description = entry.description ) ) + else: + seq_records.append( SeqRecord( entry.seq[ orf_end-1 : orf_start ].reverse_complement(), id = orf_name, description = entry.description ) ) + + SeqIO.write( seq_records, outfile, "fasta" ) + glimmerfile.close() + sequence.close() + +if __name__ == "__main__" : + glimmer2seq()
--- a/glimmer3-build-icm-wrapper.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,119 +0,0 @@ -<tool id="glimmer_build-icm" name="ICM builder" version="0.1"> - <description>(glimmer3)</description> - <requirements> - <requirement type="package" version="3.02b">glimmer</requirement> - </requirements> - <command> - build-icm - --depth $depth - #if $no_stops: - --no_stops - #end if - --period $period - --width $width - - #if $stop_codon_opts.stop_codon_opts_selector == "gb": - --trans_table "${stop_codon_opts.genbank_gencode}" - #else: - --stop_codons "${stop_codon_opts.stop_codons}" - #end if - - $outfile < $infile - </command> - <inputs> - <param name="infile" type="data" format="fasta" label="Trainings Dataset" help="A set of known genes in FASTA format." /> - <param name="depth" type="integer" value="7" label="Set the depth of the ICM" help="The depth is the maximum number of positions in the context window that will be used to determine the probability of the predicted position." /> - <param name="period" type="integer" value="3" label="Set the period of the ICM" help="The period is the number of different submodels for different positions in the text in a cyclic pattern. E.g., if the period is 3, the first submodel will determine positions 1, 4, 7, ..." /> - <param name="width" type="integer" value="12" label="Set the width of the ICM" help="The width includes the predicted position." 
/> - <param name="no_stops" type="boolean" truevalue="--no_stops" falsevalue="" checked="true" label="Do not use any input strings with in-frame stop codons" /> - - <conditional name="stop_codon_opts"> - <param name="stop_codon_opts_selector" type="select" label="Specify start codons as"> - <option value="gb" selected="True">Genbank translation table entry</option> - <option value="free_form">Comma-separated list</option> - </param> - <when value="gb"> - <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons"> - <option value="1" select="True">1. Standard</option> - <option value="2">2. Vertebrate Mitochondrial</option> - <option value="3">3. Yeast Mitochondrial</option> - <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> - <option value="5">5. Invertebrate Mitochondrial</option> - <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option> - <option value="9">9. Echinoderm Mitochondrial</option> - <option value="10">10. Euplotid Nuclear</option> - <option value="11">11. Bacteria and Archaea</option> - <option value="12">12. Alternative Yeast Nuclear</option> - <option value="13">13. Ascidian Mitochondrial</option> - <option value="14">14. Flatworm Mitochondrial</option> - <option value="15">15. Blepharisma Macronuclear</option> - <option value="16">16. Chlorophycean Mitochondrial</option> - <option value="21">21. Trematode Mitochondrial</option> - <option value="22">22. Scenedesmus obliquus mitochondrial</option> - <option value="23">23. Thraustochytrium Mitochondrial</option> - <option value="24">24. 
Pterobranchia mitochondrial</option> - </param> - </when> - <when value="free_form"> - <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" /> - </when> - </conditional> - - </inputs> - <outputs> - <data format="data" name="outfile" /> - </outputs> - <tests> - <test> - <param name="infile" value='glimmer3/seqTest.fa'/> - <output name="outfile" file='glimmer3/buildICMTestOutput.dat'/> - </test> - </tests> - - <help> - -**What it does** - - This program constructs an interpolated context model (ICM) from an input set of sequences. - This model can be used by Glimmer3 to predict genes. - ------ - - -**Example** - -* input:: - - -Genome Sequence - - >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7 - GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT - GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT - TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT - TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC - GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA - ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG - AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA - CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA - TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC - AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA - GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC - AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC - CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA - AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC - GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT - ..... - -* output: - interpolated context model (ICM) - - -------- - -**References** - -A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). - - - </help> -</tool>
--- a/glimmer3-main-wrapper.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,232 +0,0 @@ -<tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.1"> - <description>Predict ORFs in prokaryotic genomes (knowlegde-based)</description> - <requirements> - <requirement type="package" version="3.02b">glimmer</requirement> - <requirement type="package" version="1.61">biopython</requirement> - <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement> - </requirements> - <command> - #import tempfile, os - #set $temp = tempfile.NamedTemporaryFile( delete=False ) - # $temp.close() - - glimmer3 - --max_olap $max_olap - --gene_len $gene_len - --threshold $threshold - #if float( str($gc_percent) ) > 0.0: - --gc_percent $gc_percent - #end if - - #if $stop_codon_opts.stop_codon_opts_selector == "gb": - --trans_table "${stop_codon_opts.genbank_gencode}" - #else: - --stop_codons "${stop_codon_opts.stop_codons}" - #end if - - $linear - $no_indep - $extend - $seq_input - $icm_input - $temp 2>&1; - - ## convert prediction to FASTA sequences - \$GLIMMER_SCRIPT_PATH/glimmer_orf_to_seq.py $temp".predict" $seq_input $genes_output - - #if $report: - mv $temp".predict" $prediction; - #else: - rm $temp".predict"; - #end if - - #if $detailed_report: - mv $temp".detail" $detailed; - #else: - rm $temp".detail"; - #end if - - rm $temp - </command> - <inputs> - <param name="seq_input" type="data" format="fasta" label="Genome Sequence" /> - <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" /> - - <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." 
/> - <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" hrlp="This does not include the bases in the stop codon."/> - <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." /> - <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." /> - - <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" /> - <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." /> - <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" /> - <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" /> - - <conditional name="stop_codon_opts"> - <param name="stop_codon_opts_selector" type="select" label="Specify start codons as"> - <option value="gb" selected="True">Genbank translation table entry</option> - <option value="free_form">Comma-separated list</option> - </param> - <when value="gb"> - <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons"> - <option value="1" select="True">1. Standard</option> - <option value="2">2. Vertebrate Mitochondrial</option> - <option value="3">3. Yeast Mitochondrial</option> - <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> - <option value="5">5. 
Invertebrate Mitochondrial</option> - <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option> - <option value="9">9. Echinoderm Mitochondrial</option> - <option value="10">10. Euplotid Nuclear</option> - <option value="11">11. Bacteria and Archaea</option> - <option value="12">12. Alternative Yeast Nuclear</option> - <option value="13">13. Ascidian Mitochondrial</option> - <option value="14">14. Flatworm Mitochondrial</option> - <option value="15">15. Blepharisma Macronuclear</option> - <option value="16">16. Chlorophycean Mitochondrial</option> - <option value="21">21. Trematode Mitochondrial</option> - <option value="22">22. Scenedesmus obliquus mitochondrial</option> - <option value="23">23. Thraustochytrium Mitochondrial</option> - <option value="24">24. Pterobranchia mitochondrial</option> - </param> - </when> - <when value="free_form"> - <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" /> - </when> - </conditional> - - <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/> - <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/> - </inputs> - <outputs> - <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" /> - <data name="prediction" format="text" label="Glimmer3 on ${on_string} (Gene Prediction table)"> - <filter>report == True</filter> - </data> - <data name="detailed" format="text" label="Glimmer3 on ${on_string} (detailed report)"> - <filter>detailed_report == True</filter> - </data> - </outputs> - <tests> - <test> - <param name="seqInput" value='glimmer3/seqTest.fa' /> - <param name="icmInput" value='glimmer3/icmTest.icm' /> - <param name="overlaplen" value="50" /> - <param name="genlen" value="90" /> - <param name="thresh" value="30" /> - <param 
name="linear" value="-l" /> - <output name="output1" file='glimmer3/output1Test.dat' /> - <output name="output2" file='glimmer3/output2Test.dat' /> - </test> - </tests> - <help> - - -**What it does** - - This is the main program that makes gene preditions based on an interpolated context model (ICM). - The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms. - ------ - -**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*. - ------ - -**Glimmer Overview** - -:: - -************** ************** ************** ************** -* * * * * * * * -* long-orfs * ===> * Extract * ===> * build-icm * ===> * glimmer3 * -* * * * * * * * -************** ************** ************** ************** - -**Example** - -* input:: - - -Genome Sequence - - CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7 - GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT - GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT - TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT - TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC - GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA - ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG - AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA - CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA - TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC - AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA - GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC - AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC - CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA - AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC - GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT - ..... - - - - interpolated context model (ICM) 92: glimmer3-build-icm on data 89 - - maximum overlap length 50 - - minimum gene length. 90 - - threshold score 30 - - linear True - -* output:: - - .predict file - >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7. 
- orf00001 40137 52 +2 8.68 - orf00004 603 34 -1 2.91 - orf00006 1289 1095 -3 3.16 - orf00007 1555 1391 -2 2.33 - orf00008 1809 1576 -1 1.02 - orf00010 1953 2066 +3 3.09 - orf00011 2182 2304 +1 0.89 - orf00013 2390 2521 +2 0.60 - orf00018 2570 3073 +2 2.54 - orf00020 3196 3747 +1 2.91 - orf00022 3758 4000 +2 0.83 - orf00023 4399 4157 -2 1.31 - orf00025 4463 4759 +2 2.92 - orf00026 4878 5111 +3 0.78 - orf00027 5468 5166 -3 1.64 - orf00029 5590 5832 +1 0.29 - orf00032 6023 6226 +2 6.02 - orf00033 6217 6336 +1 3.09 - ........ - - - .details file - >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7. - Sequence length = 40222 - - ----- Start ----- --- Length ---- ------------- Scores ------------- - ID Frame of Orf of Gene Stop of Orf of Gene Raw InFrm F1 F2 F3 R1 R2 R3 NC - 0001 +2 40137 40137 52 135 135 9.26 96 - 96 - - 3 - 0 - 0002 +1 58 64 180 120 114 5.01 69 69 - - 30 - - 0 - +3 300 309 422 120 111 -0.68 20 - - 20 38 - - 41 - +3 423 432 545 120 111 1.29 21 - 51 21 13 - 8 5 - 0003 +2 401 416 595 192 177 2.51 93 - 93 - 5 - - 1 - 0004 -1 645 552 34 609 516 2.33 99 - - - 99 - - 0 - +1 562 592 762 198 168 -2.54 1 1 - - - - - 98 - +1 763 772 915 150 141 -1.34 1 1 - - - - 86 11 - +3 837 846 1007 168 159 1.35 28 - 50 28 - - 17 3 - 0005 -3 1073 977 654 417 321 0.52 84 - - - - - 84 15 - 0006 -3 1373 1319 1095 276 222 3.80 99 - - - - - 99 0 - 0007 -2 1585 1555 1391 192 162 2.70 98 - - - - 98 - 1 - 0008 -1 1812 1809 1576 234 231 1.26 94 - - - 94 - - 5 - 0009 +2 1721 1730 1945 222 213 0.68 80 - 80 - - - - 19 - ..... - -------- - -**References** - -A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). - - - </help> - -</tool>
--- a/glimmer_acgt_content.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -<tool id="glimmer_acgt-content" name="ACGT Content" version="0.1"> - <description>of windows in each sequence</description> - <requirements> - <requirement type="package" version="3.02b">glimmer</requirement> - </requirements> - <command> - window-acgt - $percentage - $input_win_len - $input_win_skip - < $infile > $output - - ##TODO prettify the output - </command> - <inputs> - <param name="infile" type="data" format="fasta" label="Genome Sequence"/> - <param name="input_win_len" type="integer" value="10" label="The width of the sliding window"/> - <param name="input_win_skip" type="integer" value="10" label="The number of positions between windows to report"/> - <param name="percentage" type="boolean" truevalue="-p" falsevalue="" checked="true" label="Report percentages instead of counts"/> - </inputs> - <outputs> - <data name="output" format="tabular"/> - </outputs> - <tests> - <test> - <param name="infile" value="streptomyces_coelicolor.dna" /> - <output name="output" file="fasta_tool_convert_from_dna.out" /> - </test> - </tests> - <help> - -**What it does** - -This tool calculates the ACGT-Content from a given Sequence, given a sliding window. - -------- - -**Output** - -Output is in the format: - - window-start window-len A's C's G's T's #other %GC - -Note the last window in the sequence can be shorter than *window-len* if the sequence ends prematurely - - - - -**References** - -A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). - - - </help> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer_build-icm.xml Fri Jun 07 10:02:12 2013 -0400 @@ -0,0 +1,119 @@ +<tool id="glimmer_build-icm" name="ICM builder" version="0.1"> + <description>(glimmer)</description> + <requirements> + <requirement type="package" version="3.02b">glimmer</requirement> + </requirements> + <command> + build-icm + --depth $depth + #if $no_stops: + --no_stops + #end if + --period $period + --width $width + + #if $stop_codon_opts.stop_codon_opts_selector == "gb": + --trans_table "${stop_codon_opts.genbank_gencode}" + #else: + --stop_codons "${stop_codon_opts.stop_codons}" + #end if + + $outfile < $infile 2>&1; + </command> + <inputs> + <param name="infile" type="data" format="fasta" label="Trainings Dataset" help="A set of known genes in FASTA format." /> + <param name="depth" type="integer" value="7" label="Set the depth of the ICM" help="The depth is the maximum number of positions in the context window that will be used to determine the probability of the predicted position." /> + <param name="period" type="integer" value="3" label="Set the period of the ICM" help="The period is the number of different submodels for different positions in the text in a cyclic pattern. E.g., if the period is 3, the first submodel will determine positions 1, 4, 7, ..." /> + <param name="width" type="integer" value="12" label="Set the width of the ICM" help="The width includes the predicted position." 
/> + <param name="no_stops" type="boolean" truevalue="--no_stops" falsevalue="" checked="false" label="Do not use any input strings with in-frame stop codons" /> + + <conditional name="stop_codon_opts"> + <param name="stop_codon_opts_selector" type="select" label="Specify start codons as"> + <option value="gb" selected="True">Genbank translation table entry</option> + <option value="free_form">Comma-separated list</option> + </param> + <when value="gb"> + <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons"> + <option value="1" select="True">1. Standard</option> + <option value="2">2. Vertebrate Mitochondrial</option> + <option value="3">3. Yeast Mitochondrial</option> + <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> + <option value="5">5. Invertebrate Mitochondrial</option> + <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option> + <option value="9">9. Echinoderm Mitochondrial</option> + <option value="10">10. Euplotid Nuclear</option> + <option value="11">11. Bacteria and Archaea</option> + <option value="12">12. Alternative Yeast Nuclear</option> + <option value="13">13. Ascidian Mitochondrial</option> + <option value="14">14. Flatworm Mitochondrial</option> + <option value="15">15. Blepharisma Macronuclear</option> + <option value="16">16. Chlorophycean Mitochondrial</option> + <option value="21">21. Trematode Mitochondrial</option> + <option value="22">22. Scenedesmus obliquus mitochondrial</option> + <option value="23">23. Thraustochytrium Mitochondrial</option> + <option value="24">24. 
Pterobranchia mitochondrial</option> + </param> + </when> + <when value="free_form"> + <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" /> + </when> + </conditional> + + </inputs> + <outputs> + <data format="data" name="outfile" /> + </outputs> + <tests> + <test> + <param name="infile" value='glimmer3/seqTest.fa'/> + <output name="outfile" file='glimmer3/buildICMTestOutput.dat'/> + </test> + </tests> + + <help> + +**What it does** + + This program constructs an interpolated context model (ICM) from an input set of sequences. + This model can be used by Glimmer3 to predict genes. + +----- + + +**Example** + +* input:: + + -Genome Sequence + + >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7 + GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT + GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT + TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT + TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC + GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA + ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG + AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA + CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA + TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC + AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA + GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC + AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC + CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA + AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC + GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT + ..... + +* output: + interpolated context model (ICM) + + +------- + +**References** + +A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). + + + </help> +</tool>
--- a/glimmer_orf_to_seq.py Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -#!/usr/bin/env python -""" -Input: DNA FASTA file + Glimmer ORF file -Output: ORF sequences as FASTA file -Author: Bjoern Gruening -""" -import sys, os -import Bio.SeqIO -from Bio.SeqRecord import SeqRecord - -def glimmer2seq( glimmer_prediction = sys.argv [1], genome_sequence = sys.argv[2], outfile = sys.argv[3] ): - if len(sys.argv) >= 4: - glimmerfile = open( glimmer_prediction, "r") - sequence = open( genome_sequence ) - else: - print "Missing input values." - sys.exit() - - fastafile = Bio.SeqIO.parse(sequence, "fasta") - - sequences = dict() - seq_records = list() - for entry in fastafile: - sequences[entry.description] = entry - - for line in glimmerfile: - if line.startswith('>'): - entry = sequences[ line[1:].strip() ] - else: - orf_start = int(line[8:17]) - orf_end = int(line[18:26]) - - orf_name = line[0:8] - if orf_start <= orf_end: - seq_records.add( SeqRecord( entry.seq[ orf_start-1 : orf_end ], id = orf_name, description = entry.description ) ) - else: - seq_records.add( SeqRecord( entry.seq[ orf_end-1 : orf_start ].reverse_complement(), id = orf_name, description = entry.description ) ) - - SeqIO.write( seq_records, outfile, "fasta" ) - glimmerfile.close() - sequence.close() - -if __name__ == "__main__" : - glimmer2seq()
--- a/glimmer_orf_to_seq.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -<tool id="glimmer_orf-to-sequence" name="ORF to Sequence" version="0.1"> - <description>assigns ORF to its DNA sequence</description> - <requirements> - <requirement type="package" version="1.61">biopython</requirement> - </requirements> - <command interpreter="python"> - glimmer_orf_to_seq.py - $glimmer_orfs - $input_fasta - $output - </command> - <inputs> - <param name="input_fasta" type="data" format="fasta" label="Genome Sequence"/> - <param name="glimmer_orfs" type="data" format="tabular" label="Define Glimmer-ORFs"/> - </inputs> - <outputs> - <data name="output" type="data" format="fasta"/> - </outputs> - <tests> - <test> - </test> - </tests> - <help> - -**What it does** - -This tool extract all gene sequences from a genome, which are predicted with Glimmer3. - - </help> -</tool>
--- a/glimmer_predict.py Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -#!/usr/bin/env python -""" -Input: DNA Fasta File -Output: Tabular -Return Tabular File with predicted ORF's -Bjoern Gruening -""" -import sys, os -import tempfile -import subprocess -import shutil -from glimmer_orf_to_seq import glimmer2seq - -def main(): - genome_seq_file = sys.argv[1] - outfile_classic_glimmer = sys.argv[2] - outfile_ext_path = sys.argv[3] - oufile_genes = sys.argv[8] - - tag = 'glimmer_non_knowlegde_based_prediction' - tempdir = tempfile.gettempdir() - - trainingset = os.path.join( tempdir, tag + ".train" ) - icm = os.path.join( tempdir, tag + ".icm" ) - - longorfs = tempfile.NamedTemporaryFile() - trainingset = tempfile.NamedTemporaryFile() - icm = tempfile.NamedTemporaryFile() - - #glimmeropts = "-o0 -g110 -t30 -l" - glimmeropts = "-o%s -g%s -t%s" % (sys.argv[4], sys.argv[5], sys.argv[6]) - if sys.argv[7] == "true": - glimmeropts += " -l" - - """ - 1. Find long, non-overlapping orfs to use as a training set - """ - subprocess.Popen(["long-orfs", "-n", "-t", "1.15", - genome_seq_file, "-"], stdout = longorfs, - stderr = subprocess.PIPE).communicate() - - """ - 2. Extract the training sequences from the genome file - """ - subprocess.Popen(["extract", "-t", - genome_seq_file, longorfs.name], stdout=trainingset, - stderr=subprocess.PIPE).communicate() - - """ - 3. 
Build the icm from the training sequences - """ - - # the "-" parameter is used to redirect the output to stdout - subprocess.Popen(["build-icm", "-r", "-"], - stdin=open(trainingset.name), stdout = icm, - stderr=subprocess.PIPE).communicate() - - """ - Run Glimmer3 - """ - b = subprocess.Popen(["glimmer3", glimmeropts, - genome_seq_file, icm.name, os.path.join(tempdir, tag)], - stdout = subprocess.PIPE, stderr=subprocess.PIPE).communicate() - - shutil.copyfile( os.path.join( tempdir, tag + ".predict" ), outfile_classic_glimmer ) - if outfile_ext_path.strip() != 'None': - shutil.copyfile( os.path.join( tempdir, tag + ".detail" ), outfile_ext_path ) - - glimmer2seq( outfile_classic_glimmer, genome_seq_file, oufile_genes ) - - -if __name__ == "__main__" : - main()
--- a/glimmer_predict.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -<tool id="glimmer_not-knowlegde-based" name="Glimmer3" version="0.1"> - <description>Predict ORFs in prokaryotic genomes (not knowlegde-based)</description> - <requirements> - <requirement type="package" version="3.02b">glimmer</requirement> - <requirement type="package" version="1.61">biopython</requirement> - </requirements> - <command interpreter="python"> - glimmer_predict.py - $input - $prediction - #if $detailed_report: - $output_ext - #else: - "None" - #end if - $overlap - $gene_length - $threshold - $linear - $genes_output - </command> - <inputs> - <param name="input" type="data" format="fasta" label="Genome sequence" /> - <param name="overlap" type="integer" value="0" label="Set maximum overlap length. Overlaps this short or shorter are ignored." /> - <param name="gene_length" type="integer" value="110" label="Set minimum gene length." /> - <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene. If the in-frame score >= N, then the region is given a number and considered a potential gene." 
/> - <param name="linear" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" /> - - <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file" /> - <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output" /> - </inputs> - <outputs> - <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" /> - <data name="prediction" format="text" label="Glimmer3 on ${on_string} (Gene Prediction table)"> - <filter>report == True</filter> - </data> - <data name="detailed" format="text" label="Glimmer3 on ${on_string} (detailed report)"> - <filter>detailed_report == True</filter> - </data> - </outputs> - <tests> - <test> - <param name="input" value="streptomyces_coelicolor.dna" /> - <output name="output" file="fasta_tool_convert_from_dna.out" /> - </test> - </tests> - <help> - -**What it does** - -This tool predicts open reading frames (orfs) from a given DNA Sequence. That tool is not knowlegde-based. - -The recommended way is to use a trained Glimmer3 with ICM model. Use the knowlegde-based version for that and insert/generate a training set. 
- ------ - -**Example** - -Suppose you have the following DNA formatted sequences:: - - >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; - cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg - ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag - cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc - cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc - ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg - -Running this tool will produce this:: - - >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; - orf00001 577 699 +1 5.24 - orf00003 800 1123 +2 5.18 - orf00004 1144 3813 +1 10.62 - orf00006 3857 6220 +2 6.07 - orf00007 6226 7173 +1 1.69 - orf00008 7187 9307 +2 8.95 - orf00009 9424 10410 +1 8.29 - orf00010 10515 11363 +3 7.00 - orf00011 11812 11964 +1 2.80 - orf00012 12360 13457 +3 4.80 - orf00013 14379 14044 -1 7.41 - orf00015 15029 14739 -3 12.43 - orf00016 15066 15227 +3 1.91 - orf00020 16061 15351 -3 2.83 - orf00021 17513 17391 -3 2.20 - orf00023 17529 17675 +3 0.11 - - -------- - -**References** - -A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). - - </help> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer_w_icm.xml Fri Jun 07 10:02:12 2013 -0400 @@ -0,0 +1,232 @@ +<tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.1"> + <description>Predict ORFs in prokaryotic genomes (knowlegde-based)</description> + <requirements> + <requirement type="package" version="3.02b">glimmer</requirement> + <requirement type="package" version="1.61">biopython</requirement> + <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement> + </requirements> + <command> + #import tempfile, os + #set $temp = tempfile.NamedTemporaryFile( delete=False ) + # $temp.close() + + glimmer3 + --max_olap $max_olap + --gene_len $gene_len + --threshold $threshold + #if float( str($gc_percent) ) > 0.0: + --gc_percent $gc_percent + #end if + + #if $stop_codon_opts.stop_codon_opts_selector == "gb": + --trans_table "${stop_codon_opts.genbank_gencode}" + #else: + --stop_codons "${stop_codon_opts.stop_codons}" + #end if + + $linear + $no_indep + $extend + $seq_input + $icm_input + $temp 2>&1; + + ## convert prediction to FASTA sequences + \$GLIMMER_SCRIPT_PATH/glimmer2seq.py $temp".predict" $seq_input $genes_output + + #if $report: + mv $temp".predict" $prediction; + #else: + rm $temp".predict"; + #end if + + #if $detailed_report: + mv $temp".detail" $detailed; + #else: + rm $temp".detail"; + #end if + + rm $temp + </command> + <inputs> + <param name="seq_input" type="data" format="fasta" label="Genome Sequence" /> + <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" /> + + <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." 
/> + <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" help="This does not include the bases in the stop codon."/> + <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." /> + <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." /> + + <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" /> + <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." /> + <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" /> + <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" /> + + <conditional name="stop_codon_opts"> + <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as"> + <option value="gb" selected="True">Genbank translation table entry</option> + <option value="free_form">Comma-separated list</option> + </param> + <when value="gb"> + <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons"> + <option value="1" selected="True">1. Standard</option> + <option value="2">2. Vertebrate Mitochondrial</option> + <option value="3">3. Yeast Mitochondrial</option> + <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option> + <option value="5">5. 
Invertebrate Mitochondrial</option> + <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option> + <option value="9">9. Echinoderm Mitochondrial</option> + <option value="10">10. Euplotid Nuclear</option> + <option value="11">11. Bacteria and Archaea</option> + <option value="12">12. Alternative Yeast Nuclear</option> + <option value="13">13. Ascidian Mitochondrial</option> + <option value="14">14. Flatworm Mitochondrial</option> + <option value="15">15. Blepharisma Macronuclear</option> + <option value="16">16. Chlorophycean Mitochondrial</option> + <option value="21">21. Trematode Mitochondrial</option> + <option value="22">22. Scenedesmus obliquus mitochondrial</option> + <option value="23">23. Thraustochytrium Mitochondrial</option> + <option value="24">24. Pterobranchia mitochondrial</option> + </param> + </when> + <when value="free_form"> + <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" /> + </when> + </conditional> + + <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/> + <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/> + </inputs> + <outputs> + <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" /> + <data name="prediction" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)"> + <filter>report == True</filter> + </data> + <data name="detailed" format="txt" label="Glimmer3 on ${on_string} (detailed report)"> + <filter>detailed_report == True</filter> + </data> + </outputs> + <tests> + <test> + <param name="seqInput" value='glimmer3/seqTest.fa' /> + <param name="icmInput" value='glimmer3/icmTest.icm' /> + <param name="overlaplen" value="50" /> + <param name="genlen" value="90" /> + <param name="thresh" value="30" /> + <param 
name="linear" value="-l" /> + <output name="output1" file='glimmer3/output1Test.dat' /> + <output name="output2" file='glimmer3/output2Test.dat' /> + </test> + </tests> + <help> + + +**What it does** + + This is the main program that makes gene predictions based on an interpolated context model (ICM). + The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms. + +----- + +**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*. + +----- + +**Glimmer Overview** + +:: + +************** ************** ************** ************** +* * * * * * * * +* long-orfs * ===> * Extract * ===> * build-icm * ===> * glimmer3 * +* * * * * * * * +************** ************** ************** ************** + +**Example** + +* input:: + + -Genome Sequence + + CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7 + GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT + GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT + TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT + TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC + GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA + ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG + AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA + CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA + TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC + AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA + GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC + AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC + CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA + AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC + GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT + ..... + + + - interpolated context model (ICM) 92: glimmer3-build-icm on data 89 + - maximum overlap length 50 + - minimum gene length. 90 + - threshold score 30 + - linear True + +* output:: + + .predict file + >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7. 
+ orf00001 40137 52 +2 8.68 + orf00004 603 34 -1 2.91 + orf00006 1289 1095 -3 3.16 + orf00007 1555 1391 -2 2.33 + orf00008 1809 1576 -1 1.02 + orf00010 1953 2066 +3 3.09 + orf00011 2182 2304 +1 0.89 + orf00013 2390 2521 +2 0.60 + orf00018 2570 3073 +2 2.54 + orf00020 3196 3747 +1 2.91 + orf00022 3758 4000 +2 0.83 + orf00023 4399 4157 -2 1.31 + orf00025 4463 4759 +2 2.92 + orf00026 4878 5111 +3 0.78 + orf00027 5468 5166 -3 1.64 + orf00029 5590 5832 +1 0.29 + orf00032 6023 6226 +2 6.02 + orf00033 6217 6336 +1 3.09 + ........ + + + .details file + >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7. + Sequence length = 40222 + + ----- Start ----- --- Length ---- ------------- Scores ------------- + ID Frame of Orf of Gene Stop of Orf of Gene Raw InFrm F1 F2 F3 R1 R2 R3 NC + 0001 +2 40137 40137 52 135 135 9.26 96 - 96 - - 3 - 0 + 0002 +1 58 64 180 120 114 5.01 69 69 - - 30 - - 0 + +3 300 309 422 120 111 -0.68 20 - - 20 38 - - 41 + +3 423 432 545 120 111 1.29 21 - 51 21 13 - 8 5 + 0003 +2 401 416 595 192 177 2.51 93 - 93 - 5 - - 1 + 0004 -1 645 552 34 609 516 2.33 99 - - - 99 - - 0 + +1 562 592 762 198 168 -2.54 1 1 - - - - - 98 + +1 763 772 915 150 141 -1.34 1 1 - - - - 86 11 + +3 837 846 1007 168 159 1.35 28 - 50 28 - - 17 3 + 0005 -3 1073 977 654 417 321 0.52 84 - - - - - 84 15 + 0006 -3 1373 1319 1095 276 222 3.80 99 - - - - - 99 0 + 0007 -2 1585 1555 1391 192 162 2.70 98 - - - - 98 - 1 + 0008 -1 1812 1809 1576 234 231 1.26 94 - - - 94 - - 5 + 0009 +2 1721 1730 1945 222 213 0.68 80 - 80 - - - - 19 + ..... + +------- + +**References** + +A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). + + + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_wo_icm.py Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+"""
+Input: DNA Fasta File
+Output: Tabular
+Return Tabular File with predicted ORF's
+Bjoern Gruening
+"""
+import sys, os
+import tempfile
+import subprocess
+import shutil
+from glimmer2seq import glimmer2seq
+
+
+def _run(args, stdin=None, stdout=None):
+    """Run one external program and abort the script if it fails.
+
+    args is the argv list given to subprocess.Popen; stdin and stdout are
+    passed through unchanged.  stderr is captured and only reported when
+    the program exits with a non-zero status.
+    """
+    proc = subprocess.Popen(args, stdin=stdin, stdout=stdout,
+                            stderr=subprocess.PIPE)
+    err = proc.communicate()[1]
+    if proc.returncode != 0:
+        if not isinstance(err, str):
+            err = err.decode('utf-8', 'replace')
+        sys.exit("%s exited with status %d:\n%s"
+                 % (args[0], proc.returncode, err))
+
+
+def main():
+    """Predict ORFs with Glimmer3 using an ICM trained on the genome itself.
+
+    Command line arguments:
+        argv[1]  genome sequence file
+        argv[2]  path for the classic prediction table, or "None"
+        argv[3]  path for the detailed report, or "None"
+        argv[4]  value for glimmer3 -o
+        argv[5]  value for glimmer3 -g
+        argv[6]  value for glimmer3 -t
+        argv[7]  "true" to add glimmer3 -l
+        argv[8]  output path handed to glimmer2seq
+    """
+    genome_seq_file = sys.argv[1]
+    outfile_classic_glimmer = sys.argv[2]
+    outfile_ext_path = sys.argv[3]
+    oufile_genes = sys.argv[8]
+
+    tag = 'glimmer_non_knowlegde_based_prediction'
+
+    # Each option is its own argv element; joined into one string the
+    # tool would receive a single argument instead of separate options.
+    glimmeropts = ["-o%s" % sys.argv[4], "-g%s" % sys.argv[5],
+                   "-t%s" % sys.argv[6]]
+    if sys.argv[7] == "true":
+        glimmeropts.append("-l")
+
+    # glimmer3 output (<prefix>.predict, <prefix>.detail) goes into a
+    # private directory so concurrent runs cannot overwrite each other.
+    workdir = tempfile.mkdtemp()
+    prefix = os.path.join(workdir, tag)
+
+    longorfs = tempfile.NamedTemporaryFile()
+    trainingset = tempfile.NamedTemporaryFile()
+    icm = tempfile.NamedTemporaryFile()
+    try:
+        # 1. Find long, non-overlapping orfs to use as a training set
+        _run(["long-orfs", "-n", "-t", "1.15", genome_seq_file, "-"],
+             stdout=longorfs)
+
+        # 2. Extract the training sequences from the genome file
+        _run(["extract", "-t", genome_seq_file, longorfs.name],
+             stdout=trainingset)
+
+        # 3. Build the icm from the training sequences
+        # the "-" parameter is used to redirect the output to stdout
+        training_in = open(trainingset.name)
+        try:
+            _run(["build-icm", "-r", "-"], stdin=training_in, stdout=icm)
+        finally:
+            training_in.close()
+
+        # 4. Run Glimmer3
+        _run(["glimmer3"] + glimmeropts
+             + [genome_seq_file, icm.name, prefix],
+             stdout=subprocess.PIPE)
+
+        if outfile_classic_glimmer.strip() != 'None':
+            shutil.copyfile(prefix + ".predict", outfile_classic_glimmer)
+        if outfile_ext_path.strip() != 'None':
+            shutil.copyfile(prefix + ".detail", outfile_ext_path)
+
+        glimmer2seq(prefix + ".predict", genome_seq_file, oufile_genes)
+    finally:
+        for handle in (longorfs, trainingset, icm):
+            handle.close()
+        shutil.rmtree(workdir, ignore_errors=True)
+
+
+if __name__ == "__main__" :
+    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer_wo_icm.xml Fri Jun 07 10:02:12 2013 -0400 @@ -0,0 +1,100 @@ +<tool id="glimmer_not-knowlegde-based" name="Glimmer3" version="0.1"> + <description>Predict ORFs in prokaryotic genomes (not knowlegde-based)</description> + <requirements> + <requirement type="package" version="3.02b">glimmer</requirement> + <requirement type="package" version="1.61">biopython</requirement> + </requirements> + <command interpreter="python"> + glimmer_wo_icm.py + $input + #if $report: + $prediction + #else: + "None" + #end if + #if $detailed_report: + $detailed + #else: + "None" + #end if + $overlap + $gene_length + $threshold + $linear + $genes_output + </command> + <inputs> + <param name="input" type="data" format="fasta" label="Genome sequence" /> + <param name="overlap" type="integer" value="0" label="Set maximum overlap length. Overlaps this short or shorter are ignored." /> + <param name="gene_length" type="integer" value="110" label="Set minimum gene length." /> + <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene. If the in-frame score >= N, then the region is given a number and considered a potential gene." 
/> + <param name="linear" type="boolean" truevalue="true" falsevalue="false" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" /> + + <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file" /> + <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output" /> + </inputs> + <outputs> + <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" /> + <data name="prediction" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)"> + <filter>report == True</filter> + </data> + <data name="detailed" format="txt" label="Glimmer3 on ${on_string} (detailed report)"> + <filter>detailed_report == True</filter> + </data> + </outputs> + <tests> + <test> + <param name="input" value="streptomyces_coelicolor.dna" /> + <output name="output" file="fasta_tool_convert_from_dna.out" /> + </test> + </tests> + <help> + +**What it does** + +This tool predicts open reading frames (orfs) from a given DNA Sequence. This tool is not knowledge-based. + +The recommended way is to use Glimmer3 with a trained ICM model. Use the knowledge-based version for that and insert/generate a training set. 
+ +----- + +**Example** + +Suppose you have the following DNA formatted sequences:: + + >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; + cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg + ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag + cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc + cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc + ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg + +Running this tool will produce this:: + + >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; + orf00001 577 699 +1 5.24 + orf00003 800 1123 +2 5.18 + orf00004 1144 3813 +1 10.62 + orf00006 3857 6220 +2 6.07 + orf00007 6226 7173 +1 1.69 + orf00008 7187 9307 +2 8.95 + orf00009 9424 10410 +1 8.29 + orf00010 10515 11363 +3 7.00 + orf00011 11812 11964 +1 2.80 + orf00012 12360 13457 +3 4.80 + orf00013 14379 14044 -1 7.41 + orf00015 15029 14739 -3 12.43 + orf00016 15066 15227 +3 1.91 + orf00020 16061 15351 -3 2.83 + orf00021 17513 17391 -3 2.20 + orf00023 17529 17675 +3 0.11 + + +------- + +**References** + +A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). + + </help> +</tool>
--- a/readme.rst Fri Jun 07 07:51:49 2013 -0400 +++ b/readme.rst Fri Jun 07 10:02:12 2013 -0400 @@ -29,15 +29,9 @@ folder and modify the tools_conf.xml file to make the tool available to Galaxy. For example: -<tool file="gene_prediction/tools/glimmer3/glimmer3-main-wrapper.xml" /> -<tool file="gene_prediction/tools/glimmer3/glimmer_predict.xml" /> -<tool file="gene_prediction/tools/glimmer3/glimmer_orf_to_seq.xml" /> -<tool file="gene_prediction/tools/glimmer3/glimmer2gff.xml" /> -<tool file="gene_prediction/tools/glimmer3/gbktoorfWrapper.xml" /> -<tool file="gene_prediction/tools/glimmer3/glimmer_acgt_content.xml" /> -<tool file="gene_prediction/tools/glimmer3/glimmer3-build-icm-wrapper.xml" /> -<tool file="gene_prediction/tools/glimmer3/glimmer3-extract-wrapper.xml" /> -<tool file="gene_prediction/tools/glimmer3/glimmer3-long-orfs-wrapper.xml" /> +<tool file="gene_prediction/tools/glimmer3/glimmer_w_icm.xml" /> +<tool file="gene_prediction/tools/glimmer3/glimmer_wo_icm.xml" /> +<tool file="gene_prediction/tools/glimmer3/glimmer_build-icm.xml" /> History