# HG changeset patch # User bgruening # Date 1370613732 14400 # Node ID 8ddf54417ade326dda4178a1bbd7643e60cafb3b # Parent 2d0c268856040cc2ca727ced0f03c196b55f2c45 Uploaded diff -r 2d0c26885604 -r 8ddf54417ade glimmer2gff.py --- a/glimmer2gff.py Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -#!/usr/bin/env python - -""" -Input: Glimmer3 prediction -Output: GFF3 file -Return a GFF3 file with the genes predicted by Glimmer3 -Bjoern Gruening - -Note: Its not a full-fledged GFF3 file, its a really simple one. - -""" - -import sys, re - -def __main__(): - input_file = open(sys.argv[1], 'r') - - print '##gff-version 3\n' - for line in input_file: - line = line.strip() - if line[0] == '>': - header = line[1:] - else: - (id, start, end, frame, score) = re.split('\s+', line) - if int(end) > int(start): - strand = '+' - else: - strand = '-' - (start, end) = (end, start) - - rest = 'frame=%s;score=%s' % (frame, score) - print '\t'.join([header, 'glimmer_prediction', 'predicted_gene', start, end, '.', strand, '.', rest]) - - -if __name__ == "__main__" : - __main__() diff -r 2d0c26885604 -r 8ddf54417ade glimmer2gff.xml --- a/glimmer2gff.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - Converts Glimmer Files to GFF Files - - glimmer2gff.py - $input > $output - - - - - - - - - - - - - - -**What it does** - -Converts a Glimmer3 output File to an GFF Annotation File:: - -**Example** - -Input:: - >contig00097 sbe.0.234 - orf00003 2869 497 -2 5.60 - orf00005 3894 2875 -1 7.05 - orf00007 4242 4826 +3 8.04 - orf00010 4846 5403 +1 8.57 - orf00012 6858 5413 -1 10.87 - orf00013 6857 7594 +2 3.61 - orf00014 7751 9232 +2 11.34 - orf00015 9374 10357 +2 10.66 - orf00017 10603 11196 +1 13.39 - orf00021 11303 11911 +2 8.81 - orf00025 14791 12050 -2 13.51 - orf00026 15216 16199 +3 6.37 - orf00028 16333 16935 +1 8.86 - - -Output: - contig00097 sbe.0.234 glimmer gene 497 2869 . - . -2 5.60 - contig00097 sbe.0.234 glimmer gene 2875 3894 . - . -1 7.05 - contig00097 sbe.0.234 glimmer gene 4242 4826 . + . +3 8.04 - contig00097 sbe.0.234 glimmer gene 4846 5403 . + . +1 8.57 - contig00097 sbe.0.234 glimmer gene 5413 6858 . - . -1 10.87 - contig00097 sbe.0.234 glimmer gene 6857 7594 . + . +2 3.61 - contig00097 sbe.0.234 glimmer gene 7751 9232 . + . +2 11.34 - contig00097 sbe.0.234 glimmer gene 9374 10357 . + . +2 10.66 - contig00097 sbe.0.234 glimmer gene 10603 11196 . + . +1 13.39 - contig00097 sbe.0.234 glimmer gene 11303 11911 . + . +2 8.81 - contig00097 sbe.0.234 glimmer gene 12050 14791 . - . -2 13.51 - contig00097 sbe.0.234 glimmer gene 15216 16199 . + . +3 6.37 - contig00097 sbe.0.234 glimmer gene 16333 16935 . + . +1 8.86 - - ------ - - - - diff -r 2d0c26885604 -r 8ddf54417ade glimmer2seq.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer2seq.py Fri Jun 07 10:02:12 2013 -0400 @@ -0,0 +1,44 @@ +#!/usr/bin/env python +""" +Input: DNA FASTA file + Glimmer ORF file +Output: ORF sequences as FASTA file +Author: Bjoern Gruening +""" +import sys, os +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord + +def glimmer2seq( glimmer_prediction = sys.argv [1], genome_sequence = sys.argv[2], outfile = sys.argv[3] ): + if len(sys.argv) >= 4: + glimmerfile = open( glimmer_prediction, "r") + sequence = open( genome_sequence ) + else: + print "Missing input values." + sys.exit() + + fastafile = SeqIO.parse(sequence, "fasta") + + sequences = dict() + seq_records = list() + for entry in fastafile: + sequences[entry.description] = entry + + for line in glimmerfile: + if line.startswith('>'): + entry = sequences[ line[1:].strip() ] + else: + orf_start = int(line[8:17]) + orf_end = int(line[18:26]) + + orf_name = line[0:8] + if orf_start <= orf_end: + seq_records.append( SeqRecord( entry.seq[ orf_start-1 : orf_end ], id = orf_name, description = entry.description ) ) + else: + seq_records.append( SeqRecord( entry.seq[ orf_end-1 : orf_start ].reverse_complement(), id = orf_name, description = entry.description ) ) + + SeqIO.write( seq_records, outfile, "fasta" ) + glimmerfile.close() + sequence.close() + +if __name__ == "__main__" : + glimmer2seq() diff -r 2d0c26885604 -r 8ddf54417ade glimmer3-build-icm-wrapper.xml --- a/glimmer3-build-icm-wrapper.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,119 +0,0 @@ - - (glimmer3) - - glimmer - - - build-icm - --depth $depth - #if $no_stops: - --no_stops - #end if - --period $period - --width $width - - #if $stop_codon_opts.stop_codon_opts_selector == "gb": - --trans_table "${stop_codon_opts.genbank_gencode}" - #else: - --stop_codons "${stop_codon_opts.stop_codons}" - #end if - - $outfile < $infile - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - - This program constructs an interpolated context model (ICM) from an input set of sequences. - This model can be used by Glimmer3 to predict genes. - ------ - - -**Example** - -* input:: - - -Genome Sequence - - >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7 - GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT - GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT - TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT - TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC - GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA - ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG - AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA - CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA - TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC - AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA - GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC - AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC - CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA - AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC - GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT - ..... - -* output: - interpolated context model (ICM) - - -------- - -**References** - -A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). - - - - diff -r 2d0c26885604 -r 8ddf54417ade glimmer3-main-wrapper.xml --- a/glimmer3-main-wrapper.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,232 +0,0 @@ - - Predict ORFs in prokaryotic genomes (knowlegde-based) - - glimmer - biopython - GLIMMER_SCRIPT_PATH - - - #import tempfile, os - #set $temp = tempfile.NamedTemporaryFile( delete=False ) - # $temp.close() - - glimmer3 - --max_olap $max_olap - --gene_len $gene_len - --threshold $threshold - #if float( str($gc_percent) ) > 0.0: - --gc_percent $gc_percent - #end if - - #if $stop_codon_opts.stop_codon_opts_selector == "gb": - --trans_table "${stop_codon_opts.genbank_gencode}" - #else: - --stop_codons "${stop_codon_opts.stop_codons}" - #end if - - $linear - $no_indep - $extend - $seq_input - $icm_input - $temp 2>&1; - - ## convert prediction to FASTA sequences - \$GLIMMER_SCRIPT_PATH/glimmer_orf_to_seq.py $temp".predict" $seq_input $genes_output - - #if $report: - mv $temp".predict" $prediction; - #else: - rm $temp".predict"; - #end if - - #if $detailed_report: - mv $temp".detail" $detailed; - #else: - rm $temp".detail"; - #end if - - rm $temp - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - report == True - - - detailed_report == True - - - - - - - - - - - - - - - - - -**What it does** - - This is the main program that makes gene preditions based on an interpolated context model (ICM). - The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms. - ------ - -**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*. - ------ - -**Glimmer Overview** - -:: - -************** ************** ************** ************** -* * * * * * * * -* long-orfs * ===> * Extract * ===> * build-icm * ===> * glimmer3 * -* * * * * * * * -************** ************** ************** ************** - -**Example** - -* input:: - - -Genome Sequence - - CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7 - GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT - GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT - TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT - TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC - GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA - ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG - AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA - CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA - TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC - AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA - GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC - AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC - CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA - AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC - GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT - ..... - - - - interpolated context model (ICM) 92: glimmer3-build-icm on data 89 - - maximum overlap length 50 - - minimum gene length. 90 - - threshold score 30 - - linear True - -* output:: - - .predict file - >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7. - orf00001 40137 52 +2 8.68 - orf00004 603 34 -1 2.91 - orf00006 1289 1095 -3 3.16 - orf00007 1555 1391 -2 2.33 - orf00008 1809 1576 -1 1.02 - orf00010 1953 2066 +3 3.09 - orf00011 2182 2304 +1 0.89 - orf00013 2390 2521 +2 0.60 - orf00018 2570 3073 +2 2.54 - orf00020 3196 3747 +1 2.91 - orf00022 3758 4000 +2 0.83 - orf00023 4399 4157 -2 1.31 - orf00025 4463 4759 +2 2.92 - orf00026 4878 5111 +3 0.78 - orf00027 5468 5166 -3 1.64 - orf00029 5590 5832 +1 0.29 - orf00032 6023 6226 +2 6.02 - orf00033 6217 6336 +1 3.09 - ........ - - - .details file - >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7. - Sequence length = 40222 - - ----- Start ----- --- Length ---- ------------- Scores ------------- - ID Frame of Orf of Gene Stop of Orf of Gene Raw InFrm F1 F2 F3 R1 R2 R3 NC - 0001 +2 40137 40137 52 135 135 9.26 96 - 96 - - 3 - 0 - 0002 +1 58 64 180 120 114 5.01 69 69 - - 30 - - 0 - +3 300 309 422 120 111 -0.68 20 - - 20 38 - - 41 - +3 423 432 545 120 111 1.29 21 - 51 21 13 - 8 5 - 0003 +2 401 416 595 192 177 2.51 93 - 93 - 5 - - 1 - 0004 -1 645 552 34 609 516 2.33 99 - - - 99 - - 0 - +1 562 592 762 198 168 -2.54 1 1 - - - - - 98 - +1 763 772 915 150 141 -1.34 1 1 - - - - 86 11 - +3 837 846 1007 168 159 1.35 28 - 50 28 - - 17 3 - 0005 -3 1073 977 654 417 321 0.52 84 - - - - - 84 15 - 0006 -3 1373 1319 1095 276 222 3.80 99 - - - - - 99 0 - 0007 -2 1585 1555 1391 192 162 2.70 98 - - - - 98 - 1 - 0008 -1 1812 1809 1576 234 231 1.26 94 - - - 94 - - 5 - 0009 +2 1721 1730 1945 222 213 0.68 80 - 80 - - - - 19 - ..... - -------- - -**References** - -A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). - - - - - diff -r 2d0c26885604 -r 8ddf54417ade glimmer_acgt_content.xml --- a/glimmer_acgt_content.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ - - of windows in each sequence - - glimmer - - - window-acgt - $percentage - $input_win_len - $input_win_skip - < $infile > $output - - ##TODO prettify the output - - - - - - - - - - - - - - - - - - -**What it does** - -This tool calculates the ACGT-Content from a given Sequence, given a sliding window. - -------- - -**Output** - -Output is in the format: - - window-start window-len A's C's G's T's #other %GC - -Note the last window in the sequence can be shorter than *window-len* if the sequence ends prematurely - - - - -**References** - -A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). - - - - diff -r 2d0c26885604 -r 8ddf54417ade glimmer_build-icm.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer_build-icm.xml Fri Jun 07 10:02:12 2013 -0400 @@ -0,0 +1,119 @@ + + (glimmer) + + glimmer + + + build-icm + --depth $depth + #if $no_stops: + --no_stops + #end if + --period $period + --width $width + + #if $stop_codon_opts.stop_codon_opts_selector == "gb": + --trans_table "${stop_codon_opts.genbank_gencode}" + #else: + --stop_codons "${stop_codon_opts.stop_codons}" + #end if + + $outfile < $infile 2>&1; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + + This program constructs an interpolated context model (ICM) from an input set of sequences. + This model can be used by Glimmer3 to predict genes. + +----- + + +**Example** + +* input:: + + -Genome Sequence + + >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7 + GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT + GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT + TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT + TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC + GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA + ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG + AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA + CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA + TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC + AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA + GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC + AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC + CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA + AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC + GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT + ..... + +* output: + interpolated context model (ICM) + + +------- + +**References** + +A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). + + + + diff -r 2d0c26885604 -r 8ddf54417ade glimmer_orf_to_seq.py --- a/glimmer_orf_to_seq.py Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -#!/usr/bin/env python -""" -Input: DNA FASTA file + Glimmer ORF file -Output: ORF sequences as FASTA file -Author: Bjoern Gruening -""" -import sys, os -import Bio.SeqIO -from Bio.SeqRecord import SeqRecord - -def glimmer2seq( glimmer_prediction = sys.argv [1], genome_sequence = sys.argv[2], outfile = sys.argv[3] ): - if len(sys.argv) >= 4: - glimmerfile = open( glimmer_prediction, "r") - sequence = open( genome_sequence ) - else: - print "Missing input values." - sys.exit() - - fastafile = Bio.SeqIO.parse(sequence, "fasta") - - sequences = dict() - seq_records = list() - for entry in fastafile: - sequences[entry.description] = entry - - for line in glimmerfile: - if line.startswith('>'): - entry = sequences[ line[1:].strip() ] - else: - orf_start = int(line[8:17]) - orf_end = int(line[18:26]) - - orf_name = line[0:8] - if orf_start <= orf_end: - seq_records.add( SeqRecord( entry.seq[ orf_start-1 : orf_end ], id = orf_name, description = entry.description ) ) - else: - seq_records.add( SeqRecord( entry.seq[ orf_end-1 : orf_start ].reverse_complement(), id = orf_name, description = entry.description ) ) - - SeqIO.write( seq_records, outfile, "fasta" ) - glimmerfile.close() - sequence.close() - -if __name__ == "__main__" : - glimmer2seq() diff -r 2d0c26885604 -r 8ddf54417ade glimmer_orf_to_seq.xml --- a/glimmer_orf_to_seq.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - assigns ORF to its DNA sequence - - biopython - - - glimmer_orf_to_seq.py - $glimmer_orfs - $input_fasta - $output - - - - - - - - - - - - - - -**What it does** - -This tool extract all gene sequences from a genome, which are predicted with Glimmer3. - - - diff -r 2d0c26885604 -r 8ddf54417ade glimmer_predict.py --- a/glimmer_predict.py Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -#!/usr/bin/env python -""" -Input: DNA Fasta File -Output: Tabular -Return Tabular File with predicted ORF's -Bjoern Gruening -""" -import sys, os -import tempfile -import subprocess -import shutil -from glimmer_orf_to_seq import glimmer2seq - -def main(): - genome_seq_file = sys.argv[1] - outfile_classic_glimmer = sys.argv[2] - outfile_ext_path = sys.argv[3] - oufile_genes = sys.argv[8] - - tag = 'glimmer_non_knowlegde_based_prediction' - tempdir = tempfile.gettempdir() - - trainingset = os.path.join( tempdir, tag + ".train" ) - icm = os.path.join( tempdir, tag + ".icm" ) - - longorfs = tempfile.NamedTemporaryFile() - trainingset = tempfile.NamedTemporaryFile() - icm = tempfile.NamedTemporaryFile() - - #glimmeropts = "-o0 -g110 -t30 -l" - glimmeropts = "-o%s -g%s -t%s" % (sys.argv[4], sys.argv[5], sys.argv[6]) - if sys.argv[7] == "true": - glimmeropts += " -l" - - """ - 1. Find long, non-overlapping orfs to use as a training set - """ - subprocess.Popen(["long-orfs", "-n", "-t", "1.15", - genome_seq_file, "-"], stdout = longorfs, - stderr = subprocess.PIPE).communicate() - - """ - 2. Extract the training sequences from the genome file - """ - subprocess.Popen(["extract", "-t", - genome_seq_file, longorfs.name], stdout=trainingset, - stderr=subprocess.PIPE).communicate() - - """ - 3. Build the icm from the training sequences - """ - - # the "-" parameter is used to redirect the output to stdout - subprocess.Popen(["build-icm", "-r", "-"], - stdin=open(trainingset.name), stdout = icm, - stderr=subprocess.PIPE).communicate() - - """ - Run Glimmer3 - """ - b = subprocess.Popen(["glimmer3", glimmeropts, - genome_seq_file, icm.name, os.path.join(tempdir, tag)], - stdout = subprocess.PIPE, stderr=subprocess.PIPE).communicate() - - shutil.copyfile( os.path.join( tempdir, tag + ".predict" ), outfile_classic_glimmer ) - if outfile_ext_path.strip() != 'None': - shutil.copyfile( os.path.join( tempdir, tag + ".detail" ), outfile_ext_path ) - - glimmer2seq( outfile_classic_glimmer, genome_seq_file, oufile_genes ) - - -if __name__ == "__main__" : - main() diff -r 2d0c26885604 -r 8ddf54417ade glimmer_predict.xml --- a/glimmer_predict.xml Fri Jun 07 07:51:49 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ - - Predict ORFs in prokaryotic genomes (not knowlegde-based) - - glimmer - biopython - - - glimmer_predict.py - $input - $prediction - #if $detailed_report: - $output_ext - #else: - "None" - #end if - $overlap - $gene_length - $threshold - $linear - $genes_output - - - - - - - - - - - - - - - report == True - - - detailed_report == True - - - - - - - - - - -**What it does** - -This tool predicts open reading frames (orfs) from a given DNA Sequence. That tool is not knowlegde-based. - -The recommended way is to use a trained Glimmer3 with ICM model. Use the knowlegde-based version for that and insert/generate a training set. - ------ - -**Example** - -Suppose you have the following DNA formatted sequences:: - - >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; - cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg - ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag - cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc - cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc - ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg - -Running this tool will produce this:: - - >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; - orf00001 577 699 +1 5.24 - orf00003 800 1123 +2 5.18 - orf00004 1144 3813 +1 10.62 - orf00006 3857 6220 +2 6.07 - orf00007 6226 7173 +1 1.69 - orf00008 7187 9307 +2 8.95 - orf00009 9424 10410 +1 8.29 - orf00010 10515 11363 +3 7.00 - orf00011 11812 11964 +1 2.80 - orf00012 12360 13457 +3 4.80 - orf00013 14379 14044 -1 7.41 - orf00015 15029 14739 -3 12.43 - orf00016 15066 15227 +3 1.91 - orf00020 16061 15351 -3 2.83 - orf00021 17513 17391 -3 2.20 - orf00023 17529 17675 +3 0.11 - - -------- - -**References** - -A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). - - - diff -r 2d0c26885604 -r 8ddf54417ade glimmer_w_icm.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer_w_icm.xml Fri Jun 07 10:02:12 2013 -0400 @@ -0,0 +1,232 @@ + + Predict ORFs in prokaryotic genomes (knowlegde-based) + + glimmer + biopython + GLIMMER_SCRIPT_PATH + + + #import tempfile, os + #set $temp = tempfile.NamedTemporaryFile( delete=False ) + # $temp.close() + + glimmer3 + --max_olap $max_olap + --gene_len $gene_len + --threshold $threshold + #if float( str($gc_percent) ) > 0.0: + --gc_percent $gc_percent + #end if + + #if $stop_codon_opts.stop_codon_opts_selector == "gb": + --trans_table "${stop_codon_opts.genbank_gencode}" + #else: + --stop_codons "${stop_codon_opts.stop_codons}" + #end if + + $linear + $no_indep + $extend + $seq_input + $icm_input + $temp 2>&1; + + ## convert prediction to FASTA sequences + \$GLIMMER_SCRIPT_PATH/glimmer2seq.py $temp".predict" $seq_input $genes_output + + #if $report: + mv $temp".predict" $prediction; + #else: + rm $temp".predict"; + #end if + + #if $detailed_report: + mv $temp".detail" $detailed; + #else: + rm $temp".detail"; + #end if + + rm $temp + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + report == True + + + detailed_report == True + + + + + + + + + + + + + + + + + +**What it does** + + This is the main program that makes gene preditions based on an interpolated context model (ICM). + The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms. + +----- + +**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*. + +----- + +**Glimmer Overview** + +:: + +************** ************** ************** ************** +* * * * * * * * +* long-orfs * ===> * Extract * ===> * build-icm * ===> * glimmer3 * +* * * * * * * * +************** ************** ************** ************** + +**Example** + +* input:: + + -Genome Sequence + + CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7 + GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT + GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT + TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT + TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC + GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA + ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG + AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA + CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA + TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC + AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA + GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC + AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC + CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA + AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC + GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT + ..... + + + - interpolated context model (ICM) 92: glimmer3-build-icm on data 89 + - maximum overlap length 50 + - minimum gene length. 90 + - threshold score 30 + - linear True + +* output:: + + .predict file + >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7. + orf00001 40137 52 +2 8.68 + orf00004 603 34 -1 2.91 + orf00006 1289 1095 -3 3.16 + orf00007 1555 1391 -2 2.33 + orf00008 1809 1576 -1 1.02 + orf00010 1953 2066 +3 3.09 + orf00011 2182 2304 +1 0.89 + orf00013 2390 2521 +2 0.60 + orf00018 2570 3073 +2 2.54 + orf00020 3196 3747 +1 2.91 + orf00022 3758 4000 +2 0.83 + orf00023 4399 4157 -2 1.31 + orf00025 4463 4759 +2 2.92 + orf00026 4878 5111 +3 0.78 + orf00027 5468 5166 -3 1.64 + orf00029 5590 5832 +1 0.29 + orf00032 6023 6226 +2 6.02 + orf00033 6217 6336 +1 3.09 + ........ + + + .details file + >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7. + Sequence length = 40222 + + ----- Start ----- --- Length ---- ------------- Scores ------------- + ID Frame of Orf of Gene Stop of Orf of Gene Raw InFrm F1 F2 F3 R1 R2 R3 NC + 0001 +2 40137 40137 52 135 135 9.26 96 - 96 - - 3 - 0 + 0002 +1 58 64 180 120 114 5.01 69 69 - - 30 - - 0 + +3 300 309 422 120 111 -0.68 20 - - 20 38 - - 41 + +3 423 432 545 120 111 1.29 21 - 51 21 13 - 8 5 + 0003 +2 401 416 595 192 177 2.51 93 - 93 - 5 - - 1 + 0004 -1 645 552 34 609 516 2.33 99 - - - 99 - - 0 + +1 562 592 762 198 168 -2.54 1 1 - - - - - 98 + +1 763 772 915 150 141 -1.34 1 1 - - - - 86 11 + +3 837 846 1007 168 159 1.35 28 - 50 28 - - 17 3 + 0005 -3 1073 977 654 417 321 0.52 84 - - - - - 84 15 + 0006 -3 1373 1319 1095 276 222 3.80 99 - - - - - 99 0 + 0007 -2 1585 1555 1391 192 162 2.70 98 - - - - 98 - 1 + 0008 -1 1812 1809 1576 234 231 1.26 94 - - - 94 - - 5 + 0009 +2 1721 1730 1945 222 213 0.68 80 - 80 - - - - 19 + ..... + +------- + +**References** + +A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). + + + + + diff -r 2d0c26885604 -r 8ddf54417ade glimmer_wo_icm.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer_wo_icm.py Fri Jun 07 10:02:12 2013 -0400 @@ -0,0 +1,74 @@ +#!/usr/bin/env python +""" +Input: DNA Fasta File +Output: Tabular +Return Tabular File with predicted ORF's +Bjoern Gruening +""" +import sys, os +import tempfile +import subprocess +import shutil +from glimmer2seq import glimmer2seq + +def main(): + genome_seq_file = sys.argv[1] + outfile_classic_glimmer = sys.argv[2] + outfile_ext_path = sys.argv[3] + oufile_genes = sys.argv[8] + + tag = 'glimmer_non_knowlegde_based_prediction' + tempdir = tempfile.gettempdir() + + trainingset = os.path.join( tempdir, tag + ".train" ) + icm = os.path.join( tempdir, tag + ".icm" ) + + longorfs = tempfile.NamedTemporaryFile() + trainingset = tempfile.NamedTemporaryFile() + icm = tempfile.NamedTemporaryFile() + + #glimmeropts = "-o0 -g110 -t30 -l" + glimmeropts = "-o%s -g%s -t%s" % (sys.argv[4], sys.argv[5], sys.argv[6]) + if sys.argv[7] == "true": + glimmeropts += " -l" + + """ + 1. Find long, non-overlapping orfs to use as a training set + """ + subprocess.Popen(["long-orfs", "-n", "-t", "1.15", + genome_seq_file, "-"], stdout = longorfs, + stderr = subprocess.PIPE).communicate() + + """ + 2. Extract the training sequences from the genome file + """ + subprocess.Popen(["extract", "-t", + genome_seq_file, longorfs.name], stdout=trainingset, + stderr=subprocess.PIPE).communicate() + + """ + 3. Build the icm from the training sequences + """ + + # the "-" parameter is used to redirect the output to stdout + subprocess.Popen(["build-icm", "-r", "-"], + stdin=open(trainingset.name), stdout = icm, + stderr=subprocess.PIPE).communicate() + + """ + Run Glimmer3 + """ + b = subprocess.Popen(["glimmer3", glimmeropts, + genome_seq_file, icm.name, os.path.join(tempdir, tag)], + stdout = subprocess.PIPE, stderr=subprocess.PIPE).communicate() + + if outfile_classic_glimmer.strip() != 'None': + shutil.copyfile( os.path.join( tempdir, tag + ".predict" ), outfile_classic_glimmer ) + if outfile_ext_path.strip() != 'None': + shutil.copyfile( os.path.join( tempdir, tag + ".detail" ), outfile_ext_path ) + + glimmer2seq( os.path.join( tempdir, tag + ".predict" ), genome_seq_file, oufile_genes ) + + +if __name__ == "__main__" : + main() diff -r 2d0c26885604 -r 8ddf54417ade glimmer_wo_icm.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer_wo_icm.xml Fri Jun 07 10:02:12 2013 -0400 @@ -0,0 +1,100 @@ + + Predict ORFs in prokaryotic genomes (not knowlegde-based) + + glimmer + biopython + + + glimmer_wo_icm.py + $input + #if $report: + $prediction + #else: + "None" + #end if + #if $detailed_report: + $detailed + #else: + "None" + #end if + $overlap + $gene_length + $threshold + $linear + $genes_output + + + + + + + + + + + + + + + report == True + + + detailed_report == True + + + + + + + + + + +**What it does** + +This tool predicts open reading frames (orfs) from a given DNA Sequence. That tool is not knowlegde-based. + +The recommended way is to use a trained Glimmer3 with ICM model. Use the knowlegde-based version for that and insert/generate a training set. + +----- + +**Example** + +Suppose you have the following DNA formatted sequences:: + + >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; + cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg + ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag + cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc + cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc + ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg + +Running this tool will produce this:: + + >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; + orf00001 577 699 +1 5.24 + orf00003 800 1123 +2 5.18 + orf00004 1144 3813 +1 10.62 + orf00006 3857 6220 +2 6.07 + orf00007 6226 7173 +1 1.69 + orf00008 7187 9307 +2 8.95 + orf00009 9424 10410 +1 8.29 + orf00010 10515 11363 +3 7.00 + orf00011 11812 11964 +1 2.80 + orf00012 12360 13457 +3 4.80 + orf00013 14379 14044 -1 7.41 + orf00015 15029 14739 -3 12.43 + orf00016 15066 15227 +3 1.91 + orf00020 16061 15351 -3 2.83 + orf00021 17513 17391 -3 2.20 + orf00023 17529 17675 +3 0.11 + + +------- + +**References** + +A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007). + + + diff -r 2d0c26885604 -r 8ddf54417ade readme.rst --- a/readme.rst Fri Jun 07 07:51:49 2013 -0400 +++ b/readme.rst Fri Jun 07 10:02:12 2013 -0400 @@ -29,15 +29,9 @@ folder and modify the tools_conf.xml file to make the tool available to Galaxy. For example: - - - - - - - - - + + + History