0
|
1 # $Id: Genemark.pm,v 1.11.2.1 2003/04/24 08:51:48 heikki Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Tools::Genemark
|
|
4 #
|
|
5 # Cared for by Mark Fiers <hlapp@gmx.net>
|
|
6 #
|
|
7 # Copyright Hilmar Lapp, Mark Fiers
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Tools::Genemark - Results of one Genemark run
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
|
|
20 # filehandle:
|
|
21 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
|
|
22
|
|
23 # parse the results
|
|
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
|
|
25 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
|
|
26 while($gene = $Genemark->next_prediction()) {
|
|
27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
|
|
28 # off Bio::SeqFeature::Gene::Transcript.
|
|
29 #
|
|
30 # $gene->exons() returns an array of
|
|
31 # Bio::Tools::Prediction::Exon objects
|
|
32 # all exons:
|
|
33 @exon_arr = $gene->exons();
|
|
34
|
|
35 # initial exons only
|
|
36 @init_exons = $gene->exons('Initial');
|
|
37 # internal exons only
|
|
38 @intrl_exons = $gene->exons('Internal');
|
|
39 # terminal exons only
|
|
40 @term_exons = $gene->exons('Terminal');
|
|
41 # singleton exons:
|
|
42 ($single_exon) = $gene->exons();
|
|
43 }
|
|
44
|
|
45 # essential if you gave a filename at initialization (otherwise the file
|
|
46 # will stay open)
|
|
47 $Genemark->close();
|
|
48
|
|
49 =head1 DESCRIPTION
|
|
50
|
|
51 The Genemark module provides a parser for Genemark gene structure prediction
|
|
52 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
|
|
53 derived object.
|
|
54
|
|
55 This module has been developed around genemark.hmm for eukaryots v2.2a and will
|
|
56 probably not work with other versions.
|
|
57
|
|
58
|
|
59 This module also implements the Bio::SeqAnalysisParserI interface, and thus
|
|
60 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
|
|
61
|
|
62 =head1 FEEDBACK
|
|
63
|
|
64 =head2 Mailing Lists
|
|
65
|
|
66 User feedback is an integral part of the evolution of this and other
|
|
67 Bioperl modules. Send your comments and suggestions preferably to one
|
|
68 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
69
|
|
70 bioperl-l@bioperl.org - General discussion
|
|
71 http://bio.perl.org/MailList.html - About the mailing lists
|
|
72
|
|
73 =head2 Reporting Bugs
|
|
74
|
|
75 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
76 the bugs and their resolution. Bug reports can be submitted via email
|
|
77 or the web:
|
|
78
|
|
79 bioperl-bugs@bio.perl.org
|
|
80 http://bugzilla.bioperl.org/
|
|
81
|
|
82 =head1 AUTHOR - Hilmar Lapp, Mark Fiers
|
|
83
|
|
84 Email hlapp@gmx.net
|
|
85 m.w.e.j.fiers@plant.wag-ur.nl
|
|
86
|
|
87 Describe contact details here
|
|
88
|
|
89 =head1 APPENDIX
|
|
90
|
|
91 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
|
|
92
|
|
93 =cut
|
|
94
|
|
95
|
|
96 # Let the code begin...
|
|
97
|
|
98
|
|
99 package Bio::Tools::Genemark;
|
|
100 use vars qw(@ISA);
|
|
101 use strict;
|
|
102 use Symbol;
|
|
103
|
|
104 use Bio::Root::Root;
|
|
105 use Bio::Tools::AnalysisResult;
|
|
106 use Bio::Tools::Prediction::Gene;
|
|
107 use Bio::Tools::Prediction::Exon;
|
|
108 use Bio::Seq;
|
|
109
|
|
110 @ISA = qw(Bio::Tools::AnalysisResult);
|
|
111
|
|
112 sub _initialize_state {
|
|
113 my ($self,@args) = @_;
|
|
114
|
|
115 # first call the inherited method!
|
|
116 $self->SUPER::_initialize_state(@args);
|
|
117
|
|
118 # our private state variables
|
|
119 $self->{'_preds_parsed'} = 0;
|
|
120 $self->{'_has_cds'} = 0;
|
|
121 # array of pre-parsed predictions
|
|
122 $self->{'_preds'} = [];
|
|
123 # seq stack
|
|
124 $self->{'_seqstack'} = [];
|
|
125 }
|
|
126
|
|
127 =head2 analysis_method
|
|
128
|
|
129 Usage : $Genemark->analysis_method();
|
|
130 Purpose : Inherited method. Overridden to ensure that the name matches
|
|
131 /GeneMark.hmm/i.
|
|
132 Returns : String
|
|
133 Argument : n/a
|
|
134
|
|
135 =cut
|
|
136
|
|
137 #-------------
|
|
138 sub analysis_method {
|
|
139 #-------------
|
|
140 my ($self, $method) = @_;
|
|
141 if($method && ($method !~ /Genemark\.hmm/i)) {
|
|
142 $self->throw("method $method not supported in " . ref($self));
|
|
143 }
|
|
144 return $self->SUPER::analysis_method($method);
|
|
145 }
|
|
146
|
|
147 =head2 next_feature
|
|
148
|
|
149 Title : next_feature
|
|
150 Usage : while($gene = $Genemark->next_feature()) {
|
|
151 # do something
|
|
152 }
|
|
153 Function: Returns the next gene structure prediction of the Genemark result
|
|
154 file. Call this method repeatedly until FALSE is returned.
|
|
155
|
|
156 The returned object is actually a SeqFeatureI implementing object.
|
|
157 This method is required for classes implementing the
|
|
158 SeqAnalysisParserI interface, and is merely an alias for
|
|
159 next_prediction() at present.
|
|
160
|
|
161 Example :
|
|
162 Returns : A Bio::Tools::Prediction::Gene object.
|
|
163 Args :
|
|
164
|
|
165 =cut
|
|
166
|
|
167 sub next_feature {
|
|
168 my ($self,@args) = @_;
|
|
169 # even though next_prediction doesn't expect any args (and this method
|
|
170 # does neither), we pass on args in order to be prepared if this changes
|
|
171 # ever
|
|
172 return $self->next_prediction(@args);
|
|
173 }
|
|
174
|
|
175 =head2 next_prediction
|
|
176
|
|
177 Title : next_prediction
|
|
178 Usage : while($gene = $Genemark->next_prediction()) {
|
|
179 # do something
|
|
180 }
|
|
181 Function: Returns the next gene structure prediction of the Genemark result
|
|
182 file. Call this method repeatedly until FALSE is returned.
|
|
183
|
|
184 Example :
|
|
185 Returns : A Bio::Tools::Prediction::Gene object.
|
|
186 Args :
|
|
187
|
|
188 =cut
|
|
189
|
|
190 sub next_prediction {
|
|
191 my ($self) = @_;
|
|
192 my $gene;
|
|
193
|
|
194 # if the prediction section hasn't been parsed yet, we do this now
|
|
195 $self->_parse_predictions() unless $self->_predictions_parsed();
|
|
196
|
|
197 # get next gene structure
|
|
198 $gene = $self->_prediction();
|
|
199
|
|
200 return $gene;
|
|
201 }
|
|
202
|
|
203 =head2 _parse_predictions
|
|
204
|
|
205 Title : _parse_predictions()
|
|
206 Usage : $obj->_parse_predictions()
|
|
207 Function: Parses the prediction section. Automatically called by
|
|
208 next_prediction() if not yet done.
|
|
209 Example :
|
|
210 Returns :
|
|
211
|
|
212 =cut
|
|
213
|
|
214 sub _parse_predictions {
|
|
215 my ($self) = @_;
|
|
216 my %exontags = ('Initial' => 'Initial',
|
|
217 'Internal' => 'Internal',
|
|
218 'Terminal' => 'Terminal',
|
|
219 'Single' => '',
|
|
220 '_na_' => '');
|
|
221 my $exontag;
|
|
222 my $gene;
|
|
223 my $seqname;
|
|
224 my $exontype;
|
|
225 my $current_gene_no = -1;
|
|
226
|
|
227 while(defined($_ = $self->_readline())) {
|
|
228
|
|
229 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
|
|
230
|
|
231 # this is an exon, Genemark doesn't predict anything else
|
|
232 # $prednr corresponds to geneno.
|
|
233 my $prednr = $1;
|
|
234
|
|
235 #exon no:
|
|
236 my $signalnr = 0;
|
|
237 if ($2) { my $signalnr = $2; } # used in tag: exon_no
|
|
238
|
|
239 # split into fields
|
|
240 chomp();
|
|
241 my @flds = split(' ', $_);
|
|
242
|
|
243 # create the feature (an exon) object
|
|
244 my $predobj = Bio::Tools::Prediction::Exon->new();
|
|
245
|
|
246
|
|
247 # define info depending on it being eu- or prokaryot
|
|
248 my ($start, $end, $orientation, $prediction_source);
|
|
249
|
|
250 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
|
|
251 $prediction_source = "Genemark.hmm.pro";
|
|
252 $orientation = ($flds[1] eq '+') ? 1 : -1;
|
|
253 ($start, $end) = @flds[(2,3)];
|
|
254 $exontag = "_na_";
|
|
255
|
|
256 } else {
|
|
257 $prediction_source = "Genemark.hmm.eu";
|
|
258 $orientation = ($flds[2] eq '+') ? 1 : -1;
|
|
259 ($start, $end) = @flds[(4,5)];
|
|
260 $exontag = $flds[3];
|
|
261 }
|
|
262
|
|
263 #store the data in the exon object
|
|
264 $predobj->source_tag($prediction_source);
|
|
265 $predobj->start($start);
|
|
266 $predobj->end($end);
|
|
267 $predobj->strand($orientation);
|
|
268
|
|
269 $predobj->primary_tag($exontags{$exontag} . "Exon");
|
|
270
|
|
271 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
|
|
272
|
|
273 $predobj->is_coding(1);
|
|
274
|
|
275
|
|
276 # frame calculation as in the genscan module
|
|
277 # is to be implemented...
|
|
278
|
|
279 #If the $prednr is not equal to the current gene, we
|
|
280 #need to make a new gene and close the old one
|
|
281 if($prednr != $current_gene_no) {
|
|
282 # a new gene, store the old one if it exists
|
|
283 if (defined ($gene)) {
|
|
284 $gene->seq_id($seqname);
|
|
285 $gene = undef ;
|
|
286 }
|
|
287 #and make a new one
|
|
288 $gene = Bio::Tools::Prediction::Gene->new
|
|
289 (
|
|
290 '-primary' => "GenePrediction$prednr",
|
|
291 '-source' => $prediction_source);
|
|
292 $self->_add_prediction($gene);
|
|
293 $current_gene_no = $prednr;
|
|
294 }
|
|
295
|
|
296 # Add the exon to the gene
|
|
297 $gene->add_exon($predobj, ($exontag eq "_na_" ?
|
|
298 undef : $exontags{$exontag}));
|
|
299
|
|
300 }
|
|
301
|
|
302 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
|
|
303 $self->analysis_method($1);
|
|
304
|
|
305 my $gm_version = $2;
|
|
306
|
|
307 $self->analysis_method_version($gm_version);
|
|
308 next;
|
|
309 }
|
|
310
|
|
311 #Matrix file for eukaryot version
|
|
312 if (/^Matrices file:\s+(\S+)?/i) {
|
|
313 $self->analysis_subject($1);
|
|
314 # since the line after the matrix file is always the date
|
|
315 # (in the output file's I have seen!) extract and store this
|
|
316 # here
|
|
317 if (defined(my $_date = $self->_readline())) {
|
|
318 chomp ($_date);
|
|
319 $self->analysis_date($_date);
|
|
320 }
|
|
321 }
|
|
322
|
|
323 #Matrix file for prokaryot version
|
|
324 if (/^Model file name:\s+(\S+)/) {
|
|
325 $self->analysis_subject($1);
|
|
326 # since the line after the matrix file is always the date
|
|
327 # (in the output file's I have seen!) extract and store this
|
|
328 # here
|
|
329 my $_date = $self->_readline() ;
|
|
330 if (defined($_date = $self->_readline())) {
|
|
331 chomp ($_date);
|
|
332 $self->analysis_date($_date);
|
|
333 }
|
|
334 }
|
|
335
|
|
336 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
|
|
337 $seqname = $1;
|
|
338 # $self->analysis_subject($seqname);
|
|
339 next;
|
|
340 }
|
|
341
|
|
342
|
|
343 /^>/ && do {
|
|
344 $self->_pushback($_);
|
|
345
|
|
346 # section of predicted aa sequences on recognition
|
|
347 # of a fasta start, read all sequences and find the
|
|
348 # appropriate gene
|
|
349 while (1) {
|
|
350 my ($aa_id, $seq) = $self->_read_fasta_seq();
|
|
351 last unless ($aa_id);
|
|
352
|
|
353 #now parse through the predictions to add the pred. protein
|
|
354 FINDPRED: foreach my $gene (@{$self->{'_preds'}}) {
|
|
355 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
|
|
356 my $geneno = $1;
|
|
357 if ($aa_id =~ /\|gene.$geneno\|/) {
|
|
358 #print "x SEQ : \n $seq \nXXXX\n";
|
|
359 my $seqobj = Bio::Seq->new('-seq' => $seq,
|
|
360 '-display_id' => $aa_id,
|
|
361 '-alphabet' => "protein");
|
|
362 $gene->predicted_protein($seqobj);
|
|
363 last FINDPRED;
|
|
364 }
|
|
365
|
|
366 }
|
|
367 }
|
|
368
|
|
369 last;
|
|
370 };
|
|
371 }
|
|
372
|
|
373 # if the analysis query object contains a ref to a Seq of PrimarySeq
|
|
374 # object, then extract the predicted sequences and add it to the gene
|
|
375 # object.
|
|
376 if (defined $self->analysis_query()) {
|
|
377 my $orig_seq = $self->analysis_query();
|
|
378 FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) {
|
|
379 my $predseq = "";
|
|
380 foreach my $exon ($gene->exons()) {
|
|
381 #print $exon->start() . " " . $exon->end () . "\n";
|
|
382 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
|
|
383 }
|
|
384
|
|
385 my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq,
|
|
386 '-display_id' => "transl");
|
|
387 $gene->predicted_cds($seqobj);
|
|
388 }
|
|
389 }
|
|
390
|
|
391
|
|
392 $self->_predictions_parsed(1);
|
|
393 }
|
|
394
|
|
395 =head2 _prediction
|
|
396
|
|
397 Title : _prediction()
|
|
398 Usage : $gene = $obj->_prediction()
|
|
399 Function: internal
|
|
400 Example :
|
|
401 Returns :
|
|
402
|
|
403 =cut
|
|
404
|
|
405 sub _prediction {
|
|
406 my ($self) = @_;
|
|
407
|
|
408 return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
|
|
409 return shift(@{$self->{'_preds'}});
|
|
410 }
|
|
411
|
|
412 =head2 _add_prediction
|
|
413
|
|
414 Title : _add_prediction()
|
|
415 Usage : $obj->_add_prediction($gene)
|
|
416 Function: internal
|
|
417 Example :
|
|
418 Returns :
|
|
419
|
|
420 =cut
|
|
421
|
|
422 sub _add_prediction {
|
|
423 my ($self, $gene) = @_;
|
|
424
|
|
425 if(! exists($self->{'_preds'})) {
|
|
426 $self->{'_preds'} = [];
|
|
427 }
|
|
428 push(@{$self->{'_preds'}}, $gene);
|
|
429 }
|
|
430
|
|
431 =head2 _predictions_parsed
|
|
432
|
|
433 Title : _predictions_parsed
|
|
434 Usage : $obj->_predictions_parsed
|
|
435 Function: internal
|
|
436 Example :
|
|
437 Returns : TRUE or FALSE
|
|
438
|
|
439 =cut
|
|
440
|
|
441 sub _predictions_parsed {
|
|
442 my ($self, $val) = @_;
|
|
443
|
|
444 $self->{'_preds_parsed'} = $val if $val;
|
|
445 if(! exists($self->{'_preds_parsed'})) {
|
|
446 $self->{'_preds_parsed'} = 0;
|
|
447 }
|
|
448 return $self->{'_preds_parsed'};
|
|
449 }
|
|
450
|
|
451 =head2 _has_cds
|
|
452
|
|
453 Title : _has_cds()
|
|
454 Usage : $obj->_has_cds()
|
|
455 Function: Whether or not the result contains the predicted CDSs, too.
|
|
456 Example :
|
|
457 Returns : TRUE or FALSE
|
|
458
|
|
459 =cut
|
|
460
|
|
461 sub _has_cds {
|
|
462 my ($self, $val) = @_;
|
|
463
|
|
464 $self->{'_has_cds'} = $val if $val;
|
|
465 if(! exists($self->{'_has_cds'})) {
|
|
466 $self->{'_has_cds'} = 0;
|
|
467 }
|
|
468 return $self->{'_has_cds'};
|
|
469 }
|
|
470
|
|
471 =head2 _read_fasta_seq
|
|
472
|
|
473 Title : _read_fasta_seq()
|
|
474 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
|
|
475 Function: Simple but specialised FASTA format sequence reader. Uses
|
|
476 $self->_readline() to retrieve input, and is able to strip off
|
|
477 the traling description lines.
|
|
478 Example :
|
|
479 Returns : An array of two elements.
|
|
480
|
|
481 =cut
|
|
482
|
|
483 sub _read_fasta_seq {
|
|
484 my ($self) = @_;
|
|
485 my ($id, $seq);
|
|
486 local $/ = ">";
|
|
487
|
|
488 return 0 unless (my $entry = $self->_readline());
|
|
489
|
|
490 $entry =~ s/^>//;
|
|
491 # complete the entry if the first line came from a pushback buffer
|
|
492 while(! ($entry =~ />$/)) {
|
|
493 last unless ($_ = $self->_readline());
|
|
494 $entry .= $_;
|
|
495 }
|
|
496
|
|
497 # delete everything onwards from an new fasta start (>)
|
|
498 $entry =~ s/\n>.*$//s;
|
|
499 # id and sequence
|
|
500
|
|
501 if($entry =~ s/^(.+)\n//) {
|
|
502 $id = $1;
|
|
503 $id =~ s/ /_/g;
|
|
504 $seq = $entry;
|
|
505 $seq =~ s/\s//g;
|
|
506 #print "\n@@ $id \n@@ $seq \n##\n";
|
|
507 } else {
|
|
508 $self->throw("Can't parse Genemark predicted sequence entry");
|
|
509 }
|
|
510 $seq =~ s/\s//g; # Remove whitespace
|
|
511 return ($id, $seq);
|
|
512 }
|
|
513
|
|
514 1;
|
|
515
|