# HG changeset patch
# User bgruening
# Date 1370613732 14400
# Node ID 8ddf54417ade326dda4178a1bbd7643e60cafb3b
# Parent 2d0c268856040cc2ca727ced0f03c196b55f2c45
Uploaded
diff -r 2d0c26885604 -r 8ddf54417ade glimmer2gff.py
--- a/glimmer2gff.py Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Input: Glimmer3 prediction
-Output: GFF3 file
-Return a GFF3 file with the genes predicted by Glimmer3
-Bjoern Gruening
-
-Note: Its not a full-fledged GFF3 file, its a really simple one.
-
-"""
-
-import sys, re
-
-def __main__():
- input_file = open(sys.argv[1], 'r')
-
- print '##gff-version 3\n'
- for line in input_file:
- line = line.strip()
- if line[0] == '>':
- header = line[1:]
- else:
- (id, start, end, frame, score) = re.split('\s+', line)
- if int(end) > int(start):
- strand = '+'
- else:
- strand = '-'
- (start, end) = (end, start)
-
- rest = 'frame=%s;score=%s' % (frame, score)
- print '\t'.join([header, 'glimmer_prediction', 'predicted_gene', start, end, '.', strand, '.', rest])
-
-
-if __name__ == "__main__" :
- __main__()
diff -r 2d0c26885604 -r 8ddf54417ade glimmer2gff.xml
--- a/glimmer2gff.xml Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-
- Converts Glimmer Files to GFF Files
-
- glimmer2gff.py
- $input > $output
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-Converts a Glimmer3 output File to an GFF Annotation File::
-
-**Example**
-
-Input::
- >contig00097 sbe.0.234
- orf00003 2869 497 -2 5.60
- orf00005 3894 2875 -1 7.05
- orf00007 4242 4826 +3 8.04
- orf00010 4846 5403 +1 8.57
- orf00012 6858 5413 -1 10.87
- orf00013 6857 7594 +2 3.61
- orf00014 7751 9232 +2 11.34
- orf00015 9374 10357 +2 10.66
- orf00017 10603 11196 +1 13.39
- orf00021 11303 11911 +2 8.81
- orf00025 14791 12050 -2 13.51
- orf00026 15216 16199 +3 6.37
- orf00028 16333 16935 +1 8.86
-
-
-Output:
- contig00097 sbe.0.234 glimmer gene 497 2869 . - . -2 5.60
- contig00097 sbe.0.234 glimmer gene 2875 3894 . - . -1 7.05
- contig00097 sbe.0.234 glimmer gene 4242 4826 . + . +3 8.04
- contig00097 sbe.0.234 glimmer gene 4846 5403 . + . +1 8.57
- contig00097 sbe.0.234 glimmer gene 5413 6858 . - . -1 10.87
- contig00097 sbe.0.234 glimmer gene 6857 7594 . + . +2 3.61
- contig00097 sbe.0.234 glimmer gene 7751 9232 . + . +2 11.34
- contig00097 sbe.0.234 glimmer gene 9374 10357 . + . +2 10.66
- contig00097 sbe.0.234 glimmer gene 10603 11196 . + . +1 13.39
- contig00097 sbe.0.234 glimmer gene 11303 11911 . + . +2 8.81
- contig00097 sbe.0.234 glimmer gene 12050 14791 . - . -2 13.51
- contig00097 sbe.0.234 glimmer gene 15216 16199 . + . +3 6.37
- contig00097 sbe.0.234 glimmer gene 16333 16935 . + . +1 8.86
-
-
------
-
-
-
-
diff -r 2d0c26885604 -r 8ddf54417ade glimmer2seq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer2seq.py Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+"""
+Input: DNA FASTA file + Glimmer ORF file
+Output: ORF sequences as FASTA file
+Author: Bjoern Gruening
+"""
+import sys, os
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+def glimmer2seq( glimmer_prediction = None, genome_sequence = None, outfile = None ):
+    """
+    Write the DNA sequence of every ORF in a Glimmer prediction file as FASTA.
+
+    glimmer_prediction -- path of the Glimmer ORF (.predict) file
+    genome_sequence -- path of the DNA FASTA file the ORFs refer to
+    outfile -- output target handed to SeqIO.write
+
+    Arguments left as None are read from sys.argv[1], sys.argv[2] and
+    sys.argv[3]; the script exits with an error if one is missing there too.
+    """
+    # Resolve the command-line fallbacks at call time: sys.argv[n] used as a
+    # default value is evaluated on import and raises IndexError there.
+    values = [ glimmer_prediction, genome_sequence, outfile ]
+    for position, value in enumerate( values ):
+        if value is None:
+            if len( sys.argv ) <= position + 1:
+                sys.exit( "Missing input values." )
+            values[ position ] = sys.argv[ position + 1 ]
+    glimmer_prediction, genome_sequence, outfile = values
+
+    # Index the genome records by their full FASTA description line.
+    sequences = dict()
+    sequence = open( genome_sequence )
+    try:
+        for record in SeqIO.parse( sequence, "fasta" ):
+            sequences[ record.description ] = record
+    finally:
+        sequence.close()
+
+    seq_records = list()
+    entry = None
+    glimmerfile = open( glimmer_prediction, "r" )
+    try:
+        for line in glimmerfile:
+            if line.startswith( '>' ):
+                # header line: select the genome record the next ORFs belong to
+                entry = sequences[ line[1:].strip() ]
+                continue
+            fields = line.split()
+            if not fields:
+                continue
+            if entry is None:
+                sys.exit( "ORF line found before any '>' header line." )
+            # whitespace-separated columns: ORF id, start, end, ...
+            orf_name = fields[0]
+            orf_start = int( fields[1] )
+            orf_end = int( fields[2] )
+            # NOTE(review): start > end is treated as a reverse-strand ORF, as
+            # before; ORFs wrapping the origin of a circular genome are not
+            # handled separately -- confirm against the Glimmer documentation.
+            if orf_start <= orf_end:
+                orf_seq = entry.seq[ orf_start-1 : orf_end ]
+            else:
+                orf_seq = entry.seq[ orf_end-1 : orf_start ].reverse_complement()
+            seq_records.append( SeqRecord( orf_seq, id = orf_name, description = entry.description ) )
+    finally:
+        glimmerfile.close()
+
+    SeqIO.write( seq_records, outfile, "fasta" )
+
+if __name__ == "__main__" :
+    glimmer2seq()
diff -r 2d0c26885604 -r 8ddf54417ade glimmer3-build-icm-wrapper.xml
--- a/glimmer3-build-icm-wrapper.xml Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,119 +0,0 @@
-
- (glimmer3)
-
- glimmer
-
-
- build-icm
- --depth $depth
- #if $no_stops:
- --no_stops
- #end if
- --period $period
- --width $width
-
- #if $stop_codon_opts.stop_codon_opts_selector == "gb":
- --trans_table "${stop_codon_opts.genbank_gencode}"
- #else:
- --stop_codons "${stop_codon_opts.stop_codons}"
- #end if
-
- $outfile < $infile
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
- This program constructs an interpolated context model (ICM) from an input set of sequences.
- This model can be used by Glimmer3 to predict genes.
-
------
-
-
-**Example**
-
-* input::
-
- -Genome Sequence
-
- >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7
- GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
- GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
- TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
- TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
- GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
- ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
- AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
- CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
- TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
- AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
- GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
- AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
- CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
- AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
- GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
- .....
-
-* output:
- interpolated context model (ICM)
-
-
--------
-
-**References**
-
-A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
-
-
-
-
diff -r 2d0c26885604 -r 8ddf54417ade glimmer3-main-wrapper.xml
--- a/glimmer3-main-wrapper.xml Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,232 +0,0 @@
-
- Predict ORFs in prokaryotic genomes (knowlegde-based)
-
- glimmer
- biopython
- GLIMMER_SCRIPT_PATH
-
-
- #import tempfile, os
- #set $temp = tempfile.NamedTemporaryFile( delete=False )
- # $temp.close()
-
- glimmer3
- --max_olap $max_olap
- --gene_len $gene_len
- --threshold $threshold
- #if float( str($gc_percent) ) > 0.0:
- --gc_percent $gc_percent
- #end if
-
- #if $stop_codon_opts.stop_codon_opts_selector == "gb":
- --trans_table "${stop_codon_opts.genbank_gencode}"
- #else:
- --stop_codons "${stop_codon_opts.stop_codons}"
- #end if
-
- $linear
- $no_indep
- $extend
- $seq_input
- $icm_input
- $temp 2>&1;
-
- ## convert prediction to FASTA sequences
- \$GLIMMER_SCRIPT_PATH/glimmer_orf_to_seq.py $temp".predict" $seq_input $genes_output
-
- #if $report:
- mv $temp".predict" $prediction;
- #else:
- rm $temp".predict";
- #end if
-
- #if $detailed_report:
- mv $temp".detail" $detailed;
- #else:
- rm $temp".detail";
- #end if
-
- rm $temp
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- report == True
-
-
- detailed_report == True
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
- This is the main program that makes gene preditions based on an interpolated context model (ICM).
- The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms.
-
------
-
-**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*.
-
------
-
-**Glimmer Overview**
-
-::
-
-************** ************** ************** **************
-* * * * * * * *
-* long-orfs * ===> * Extract * ===> * build-icm * ===> * glimmer3 *
-* * * * * * * *
-************** ************** ************** **************
-
-**Example**
-
-* input::
-
- -Genome Sequence
-
- CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7
- GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
- GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
- TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
- TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
- GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
- ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
- AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
- CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
- TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
- AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
- GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
- AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
- CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
- AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
- GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
- .....
-
-
- - interpolated context model (ICM) 92: glimmer3-build-icm on data 89
- - maximum overlap length 50
- - minimum gene length. 90
- - threshold score 30
- - linear True
-
-* output::
-
- .predict file
- >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
- orf00001 40137 52 +2 8.68
- orf00004 603 34 -1 2.91
- orf00006 1289 1095 -3 3.16
- orf00007 1555 1391 -2 2.33
- orf00008 1809 1576 -1 1.02
- orf00010 1953 2066 +3 3.09
- orf00011 2182 2304 +1 0.89
- orf00013 2390 2521 +2 0.60
- orf00018 2570 3073 +2 2.54
- orf00020 3196 3747 +1 2.91
- orf00022 3758 4000 +2 0.83
- orf00023 4399 4157 -2 1.31
- orf00025 4463 4759 +2 2.92
- orf00026 4878 5111 +3 0.78
- orf00027 5468 5166 -3 1.64
- orf00029 5590 5832 +1 0.29
- orf00032 6023 6226 +2 6.02
- orf00033 6217 6336 +1 3.09
- ........
-
-
- .details file
- >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
- Sequence length = 40222
-
- ----- Start ----- --- Length ---- ------------- Scores -------------
- ID Frame of Orf of Gene Stop of Orf of Gene Raw InFrm F1 F2 F3 R1 R2 R3 NC
- 0001 +2 40137 40137 52 135 135 9.26 96 - 96 - - 3 - 0
- 0002 +1 58 64 180 120 114 5.01 69 69 - - 30 - - 0
- +3 300 309 422 120 111 -0.68 20 - - 20 38 - - 41
- +3 423 432 545 120 111 1.29 21 - 51 21 13 - 8 5
- 0003 +2 401 416 595 192 177 2.51 93 - 93 - 5 - - 1
- 0004 -1 645 552 34 609 516 2.33 99 - - - 99 - - 0
- +1 562 592 762 198 168 -2.54 1 1 - - - - - 98
- +1 763 772 915 150 141 -1.34 1 1 - - - - 86 11
- +3 837 846 1007 168 159 1.35 28 - 50 28 - - 17 3
- 0005 -3 1073 977 654 417 321 0.52 84 - - - - - 84 15
- 0006 -3 1373 1319 1095 276 222 3.80 99 - - - - - 99 0
- 0007 -2 1585 1555 1391 192 162 2.70 98 - - - - 98 - 1
- 0008 -1 1812 1809 1576 234 231 1.26 94 - - - 94 - - 5
- 0009 +2 1721 1730 1945 222 213 0.68 80 - 80 - - - - 19
- .....
-
--------
-
-**References**
-
-A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
-
-
-
-
-
diff -r 2d0c26885604 -r 8ddf54417ade glimmer_acgt_content.xml
--- a/glimmer_acgt_content.xml Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-
- of windows in each sequence
-
- glimmer
-
-
- window-acgt
- $percentage
- $input_win_len
- $input_win_skip
- < $infile > $output
-
- ##TODO prettify the output
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-This tool calculates the ACGT-Content from a given Sequence, given a sliding window.
-
--------
-
-**Output**
-
-Output is in the format:
-
- window-start window-len A's C's G's T's #other %GC
-
-Note the last window in the sequence can be shorter than *window-len* if the sequence ends prematurely
-
-
-
-
-**References**
-
-A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
-
-
-
-
diff -r 2d0c26885604 -r 8ddf54417ade glimmer_build-icm.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_build-icm.xml Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,119 @@
+
+ (glimmer)
+
+ glimmer
+
+
+ build-icm
+ --depth $depth
+ #if $no_stops:
+ --no_stops
+ #end if
+ --period $period
+ --width $width
+
+ #if $stop_codon_opts.stop_codon_opts_selector == "gb":
+ --trans_table "${stop_codon_opts.genbank_gencode}"
+ #else:
+ --stop_codons "${stop_codon_opts.stop_codons}"
+ #end if
+
+ $outfile < $infile 2>&1;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+ This program constructs an interpolated context model (ICM) from an input set of sequences.
+ This model can be used by Glimmer3 to predict genes.
+
+-----
+
+
+**Example**
+
+* input::
+
+ -Genome Sequence
+
+ >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7
+ GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
+ GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
+ TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
+ TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
+ GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
+ ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
+ AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
+ CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
+ TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
+ AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
+ GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
+ AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
+ CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
+ AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
+ GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
+ .....
+
+* output:
+ interpolated context model (ICM)
+
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+
+
diff -r 2d0c26885604 -r 8ddf54417ade glimmer_orf_to_seq.py
--- a/glimmer_orf_to_seq.py Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-"""
-Input: DNA FASTA file + Glimmer ORF file
-Output: ORF sequences as FASTA file
-Author: Bjoern Gruening
-"""
-import sys, os
-import Bio.SeqIO
-from Bio.SeqRecord import SeqRecord
-
-def glimmer2seq( glimmer_prediction = sys.argv [1], genome_sequence = sys.argv[2], outfile = sys.argv[3] ):
- if len(sys.argv) >= 4:
- glimmerfile = open( glimmer_prediction, "r")
- sequence = open( genome_sequence )
- else:
- print "Missing input values."
- sys.exit()
-
- fastafile = Bio.SeqIO.parse(sequence, "fasta")
-
- sequences = dict()
- seq_records = list()
- for entry in fastafile:
- sequences[entry.description] = entry
-
- for line in glimmerfile:
- if line.startswith('>'):
- entry = sequences[ line[1:].strip() ]
- else:
- orf_start = int(line[8:17])
- orf_end = int(line[18:26])
-
- orf_name = line[0:8]
- if orf_start <= orf_end:
- seq_records.add( SeqRecord( entry.seq[ orf_start-1 : orf_end ], id = orf_name, description = entry.description ) )
- else:
- seq_records.add( SeqRecord( entry.seq[ orf_end-1 : orf_start ].reverse_complement(), id = orf_name, description = entry.description ) )
-
- SeqIO.write( seq_records, outfile, "fasta" )
- glimmerfile.close()
- sequence.close()
-
-if __name__ == "__main__" :
- glimmer2seq()
diff -r 2d0c26885604 -r 8ddf54417ade glimmer_orf_to_seq.xml
--- a/glimmer_orf_to_seq.xml Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,30 +0,0 @@
-
- assigns ORF to its DNA sequence
-
- biopython
-
-
- glimmer_orf_to_seq.py
- $glimmer_orfs
- $input_fasta
- $output
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-This tool extract all gene sequences from a genome, which are predicted with Glimmer3.
-
-
-
diff -r 2d0c26885604 -r 8ddf54417ade glimmer_predict.py
--- a/glimmer_predict.py Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-#!/usr/bin/env python
-"""
-Input: DNA Fasta File
-Output: Tabular
-Return Tabular File with predicted ORF's
-Bjoern Gruening
-"""
-import sys, os
-import tempfile
-import subprocess
-import shutil
-from glimmer_orf_to_seq import glimmer2seq
-
-def main():
- genome_seq_file = sys.argv[1]
- outfile_classic_glimmer = sys.argv[2]
- outfile_ext_path = sys.argv[3]
- oufile_genes = sys.argv[8]
-
- tag = 'glimmer_non_knowlegde_based_prediction'
- tempdir = tempfile.gettempdir()
-
- trainingset = os.path.join( tempdir, tag + ".train" )
- icm = os.path.join( tempdir, tag + ".icm" )
-
- longorfs = tempfile.NamedTemporaryFile()
- trainingset = tempfile.NamedTemporaryFile()
- icm = tempfile.NamedTemporaryFile()
-
- #glimmeropts = "-o0 -g110 -t30 -l"
- glimmeropts = "-o%s -g%s -t%s" % (sys.argv[4], sys.argv[5], sys.argv[6])
- if sys.argv[7] == "true":
- glimmeropts += " -l"
-
- """
- 1. Find long, non-overlapping orfs to use as a training set
- """
- subprocess.Popen(["long-orfs", "-n", "-t", "1.15",
- genome_seq_file, "-"], stdout = longorfs,
- stderr = subprocess.PIPE).communicate()
-
- """
- 2. Extract the training sequences from the genome file
- """
- subprocess.Popen(["extract", "-t",
- genome_seq_file, longorfs.name], stdout=trainingset,
- stderr=subprocess.PIPE).communicate()
-
- """
- 3. Build the icm from the training sequences
- """
-
- # the "-" parameter is used to redirect the output to stdout
- subprocess.Popen(["build-icm", "-r", "-"],
- stdin=open(trainingset.name), stdout = icm,
- stderr=subprocess.PIPE).communicate()
-
- """
- Run Glimmer3
- """
- b = subprocess.Popen(["glimmer3", glimmeropts,
- genome_seq_file, icm.name, os.path.join(tempdir, tag)],
- stdout = subprocess.PIPE, stderr=subprocess.PIPE).communicate()
-
- shutil.copyfile( os.path.join( tempdir, tag + ".predict" ), outfile_classic_glimmer )
- if outfile_ext_path.strip() != 'None':
- shutil.copyfile( os.path.join( tempdir, tag + ".detail" ), outfile_ext_path )
-
- glimmer2seq( outfile_classic_glimmer, genome_seq_file, oufile_genes )
-
-
-if __name__ == "__main__" :
- main()
diff -r 2d0c26885604 -r 8ddf54417ade glimmer_predict.xml
--- a/glimmer_predict.xml Fri Jun 07 07:51:49 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,96 +0,0 @@
-
- Predict ORFs in prokaryotic genomes (not knowlegde-based)
-
- glimmer
- biopython
-
-
- glimmer_predict.py
- $input
- $prediction
- #if $detailed_report:
- $output_ext
- #else:
- "None"
- #end if
- $overlap
- $gene_length
- $threshold
- $linear
- $genes_output
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- report == True
-
-
- detailed_report == True
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-This tool predicts open reading frames (orfs) from a given DNA Sequence. That tool is not knowlegde-based.
-
-The recommended way is to use a trained Glimmer3 with ICM model. Use the knowlegde-based version for that and insert/generate a training set.
-
------
-
-**Example**
-
-Suppose you have the following DNA formatted sequences::
-
- >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
- cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
- ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
- cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
- cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
- ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg
-
-Running this tool will produce this::
-
- >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
- orf00001 577 699 +1 5.24
- orf00003 800 1123 +2 5.18
- orf00004 1144 3813 +1 10.62
- orf00006 3857 6220 +2 6.07
- orf00007 6226 7173 +1 1.69
- orf00008 7187 9307 +2 8.95
- orf00009 9424 10410 +1 8.29
- orf00010 10515 11363 +3 7.00
- orf00011 11812 11964 +1 2.80
- orf00012 12360 13457 +3 4.80
- orf00013 14379 14044 -1 7.41
- orf00015 15029 14739 -3 12.43
- orf00016 15066 15227 +3 1.91
- orf00020 16061 15351 -3 2.83
- orf00021 17513 17391 -3 2.20
- orf00023 17529 17675 +3 0.11
-
-
--------
-
-**References**
-
-A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
-
-
-
diff -r 2d0c26885604 -r 8ddf54417ade glimmer_w_icm.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_w_icm.xml Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,232 @@
+
+ Predict ORFs in prokaryotic genomes (knowledge-based)
+
+ glimmer
+ biopython
+ GLIMMER_SCRIPT_PATH
+
+
+ #import tempfile, os
+ #set $temp = tempfile.NamedTemporaryFile( delete=False )
+ # $temp.close()
+
+ glimmer3
+ --max_olap $max_olap
+ --gene_len $gene_len
+ --threshold $threshold
+ #if float( str($gc_percent) ) > 0.0:
+ --gc_percent $gc_percent
+ #end if
+
+ #if $stop_codon_opts.stop_codon_opts_selector == "gb":
+ --trans_table "${stop_codon_opts.genbank_gencode}"
+ #else:
+ --stop_codons "${stop_codon_opts.stop_codons}"
+ #end if
+
+ $linear
+ $no_indep
+ $extend
+ $seq_input
+ $icm_input
+ $temp 2>&1;
+
+ ## convert prediction to FASTA sequences; the closing ';' ends this command so the mv/rm below run as separate commands
+ \$GLIMMER_SCRIPT_PATH/glimmer2seq.py $temp".predict" $seq_input $genes_output;
+
+ #if $report:
+ mv $temp".predict" $prediction;
+ #else:
+ rm $temp".predict";
+ #end if
+
+ #if $detailed_report:
+ mv $temp".detail" $detailed;
+ #else:
+ rm $temp".detail";
+ #end if
+
+ rm $temp
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ report == True
+
+
+ detailed_report == True
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+ This is the main program that makes gene predictions based on an interpolated context model (ICM).
+ The ICM can be generated either with a de novo prediction (see glimmer Overview) or with extracted CDS from related organisms.
+
+-----
+
+**TIP** To extract CDS from a GenBank file use the tool *Extract ORF from a GenBank file*.
+
+-----
+
+**Glimmer Overview**
+
+::
+
+************** ************** ************** **************
+* * * * * * * *
+* long-orfs * ===> * Extract * ===> * build-icm * ===> * glimmer3 *
+* * * * * * * *
+************** ************** ************** **************
+
+**Example**
+
+* input::
+
+ -Genome Sequence
+
+ >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7
+ GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
+ GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
+ TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
+ TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
+ GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
+ ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
+ AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
+ CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
+ TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
+ AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
+ GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
+ AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
+ CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
+ AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
+ GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
+ .....
+
+
+ - interpolated context model (ICM) 92: glimmer3-build-icm on data 89
+ - maximum overlap length 50
+ - minimum gene length. 90
+ - threshold score 30
+ - linear True
+
+* output::
+
+ .predict file
+ >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
+ orf00001 40137 52 +2 8.68
+ orf00004 603 34 -1 2.91
+ orf00006 1289 1095 -3 3.16
+ orf00007 1555 1391 -2 2.33
+ orf00008 1809 1576 -1 1.02
+ orf00010 1953 2066 +3 3.09
+ orf00011 2182 2304 +1 0.89
+ orf00013 2390 2521 +2 0.60
+ orf00018 2570 3073 +2 2.54
+ orf00020 3196 3747 +1 2.91
+ orf00022 3758 4000 +2 0.83
+ orf00023 4399 4157 -2 1.31
+ orf00025 4463 4759 +2 2.92
+ orf00026 4878 5111 +3 0.78
+ orf00027 5468 5166 -3 1.64
+ orf00029 5590 5832 +1 0.29
+ orf00032 6023 6226 +2 6.02
+ orf00033 6217 6336 +1 3.09
+ ........
+
+
+ .detail file
+ >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
+ Sequence length = 40222
+
+ ----- Start ----- --- Length ---- ------------- Scores -------------
+ ID Frame of Orf of Gene Stop of Orf of Gene Raw InFrm F1 F2 F3 R1 R2 R3 NC
+ 0001 +2 40137 40137 52 135 135 9.26 96 - 96 - - 3 - 0
+ 0002 +1 58 64 180 120 114 5.01 69 69 - - 30 - - 0
+ +3 300 309 422 120 111 -0.68 20 - - 20 38 - - 41
+ +3 423 432 545 120 111 1.29 21 - 51 21 13 - 8 5
+ 0003 +2 401 416 595 192 177 2.51 93 - 93 - 5 - - 1
+ 0004 -1 645 552 34 609 516 2.33 99 - - - 99 - - 0
+ +1 562 592 762 198 168 -2.54 1 1 - - - - - 98
+ +1 763 772 915 150 141 -1.34 1 1 - - - - 86 11
+ +3 837 846 1007 168 159 1.35 28 - 50 28 - - 17 3
+ 0005 -3 1073 977 654 417 321 0.52 84 - - - - - 84 15
+ 0006 -3 1373 1319 1095 276 222 3.80 99 - - - - - 99 0
+ 0007 -2 1585 1555 1391 192 162 2.70 98 - - - - 98 - 1
+ 0008 -1 1812 1809 1576 234 231 1.26 94 - - - 94 - - 5
+ 0009 +2 1721 1730 1945 222 213 0.68 80 - 80 - - - - 19
+ .....
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+
+
+
diff -r 2d0c26885604 -r 8ddf54417ade glimmer_wo_icm.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_wo_icm.py Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+"""
+Input: DNA Fasta File
+Output: Tabular
+Return Tabular File with predicted ORF's
+Bjoern Gruening
+"""
+import sys, os
+import tempfile
+import subprocess
+import shutil
+from glimmer2seq import glimmer2seq
+
+def _run_step( command, stdin = None, stdout = subprocess.PIPE ):
+    """
+    Run one external program and abort the script if it fails.
+
+    command -- program name and its arguments as a list
+    stdin -- open file the program reads from (optional)
+    stdout -- open file receiving the program's output; captured and dropped if omitted
+    """
+    process = subprocess.Popen( command, stdin = stdin, stdout = stdout, stderr = subprocess.PIPE )
+    errors = process.communicate()[1]
+    if process.returncode != 0:
+        sys.exit( "%s failed with exit code %s:\n%s" % ( command[0], process.returncode, errors ) )
+
+def main():
+    """
+    Train an ICM on the long ORFs of the genome itself and run glimmer3 with it.
+
+    Command line arguments:
+    1 -- genome sequence file
+    2 -- output path for the glimmer3 .predict file, or 'None' to skip it
+    3 -- output path for the glimmer3 .detail file, or 'None' to skip it
+    4, 5, 6 -- values for the glimmer3 options -o, -g and -t
+    7 -- 'true' to add the glimmer3 option -l
+    8 -- output path for the FASTA file written by glimmer2seq
+    """
+    if len( sys.argv ) < 9:
+        sys.exit( "Missing input values." )
+
+    genome_seq_file = sys.argv[1]
+    outfile_classic_glimmer = sys.argv[2]
+    outfile_ext_path = sys.argv[3]
+    oufile_genes = sys.argv[8]
+
+    # Every option has to be its own argv element; joined into one string
+    # they would reach glimmer3 as a single argument.
+    glimmeropts = [ "-o%s" % sys.argv[4], "-g%s" % sys.argv[5], "-t%s" % sys.argv[6] ]
+    if sys.argv[7] == "true":
+        glimmeropts.append( "-l" )
+
+    # glimmer3 writes <tag>.predict and <tag>.detail; a private directory
+    # keeps parallel runs from overwriting each other's files.
+    tempdir = tempfile.mkdtemp()
+    tag = os.path.join( tempdir, 'glimmer_non_knowlegde_based_prediction' )
+
+    longorfs = tempfile.NamedTemporaryFile()
+    trainingset = tempfile.NamedTemporaryFile()
+    icm = tempfile.NamedTemporaryFile()
+    try:
+        # 1. Find long, non-overlapping orfs to use as a training set
+        _run_step( ["long-orfs", "-n", "-t", "1.15", genome_seq_file, "-"], stdout = longorfs )
+
+        # 2. Extract the training sequences from the genome file
+        _run_step( ["extract", "-t", genome_seq_file, longorfs.name], stdout = trainingset )
+
+        # 3. Build the icm from the training sequences
+        # the "-" parameter is used to redirect the output to stdout
+        training_input = open( trainingset.name )
+        try:
+            _run_step( ["build-icm", "-r", "-"], stdin = training_input, stdout = icm )
+        finally:
+            training_input.close()
+
+        # 4. Run Glimmer3
+        _run_step( ["glimmer3"] + glimmeropts + [genome_seq_file, icm.name, tag] )
+
+        if outfile_classic_glimmer.strip() != 'None':
+            shutil.copyfile( tag + ".predict", outfile_classic_glimmer )
+        if outfile_ext_path.strip() != 'None':
+            shutil.copyfile( tag + ".detail", outfile_ext_path )
+
+        glimmer2seq( tag + ".predict", genome_seq_file, oufile_genes )
+    finally:
+        longorfs.close()
+        trainingset.close()
+        icm.close()
+        shutil.rmtree( tempdir, ignore_errors = True )
+
+
+if __name__ == "__main__" :
+    main()
diff -r 2d0c26885604 -r 8ddf54417ade glimmer_wo_icm.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_wo_icm.xml Fri Jun 07 10:02:12 2013 -0400
@@ -0,0 +1,100 @@
+
+ Predict ORFs in prokaryotic genomes (not knowledge-based)
+
+ glimmer
+ biopython
+
+
+ glimmer_wo_icm.py
+ $input
+ #if $report:
+ $prediction
+ #else:
+ "None"
+ #end if
+ #if $detailed_report:
+ $detailed
+ #else:
+ "None"
+ #end if
+ $overlap
+ $gene_length
+ $threshold
+ $linear
+ $genes_output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ report == True
+
+
+ detailed_report == True
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool predicts open reading frames (ORFs) from a given DNA sequence. It is not knowledge-based.
+
+The recommended way is to use a trained Glimmer3 with an ICM model. Use the knowledge-based version for that and insert/generate a training set.
+
+-----
+
+**Example**
+
+Suppose you have the following DNA formatted sequences::
+
+ >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
+ cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg
+ ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag
+ cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc
+ cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc
+ ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg
+
+Running this tool will produce this::
+
+ >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other;
+ orf00001 577 699 +1 5.24
+ orf00003 800 1123 +2 5.18
+ orf00004 1144 3813 +1 10.62
+ orf00006 3857 6220 +2 6.07
+ orf00007 6226 7173 +1 1.69
+ orf00008 7187 9307 +2 8.95
+ orf00009 9424 10410 +1 8.29
+ orf00010 10515 11363 +3 7.00
+ orf00011 11812 11964 +1 2.80
+ orf00012 12360 13457 +3 4.80
+ orf00013 14379 14044 -1 7.41
+ orf00015 15029 14739 -3 12.43
+ orf00016 15066 15227 +3 1.91
+ orf00020 16061 15351 -3 2.83
+ orf00021 17513 17391 -3 2.20
+ orf00023 17529 17675 +3 0.11
+
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+
diff -r 2d0c26885604 -r 8ddf54417ade readme.rst
--- a/readme.rst Fri Jun 07 07:51:49 2013 -0400
+++ b/readme.rst Fri Jun 07 10:02:12 2013 -0400
@@ -29,15 +29,9 @@
folder and modify the tools_conf.xml file to make the tool available to Galaxy.
For example:
-
-
-
-
-
-
-
-
-
+
+
+
History