comparison variant_effect_predictor/Bio/Tools/Genemark.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: Genemark.pm,v 1.11.2.1 2003/04/24 08:51:48 heikki Exp $
2 #
3 # BioPerl module for Bio::Tools::Genemark
4 #
5 # Cared for by Mark Fiers <hlapp@gmx.net>
6 #
7 # Copyright Hilmar Lapp, Mark Fiers
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Tools::Genemark - Results of one Genemark run
16
17 =head1 SYNOPSIS
18
19 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark');
20 # filehandle:
21 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT );
22
23 # parse the results
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
25 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same
26 while($gene = $Genemark->next_prediction()) {
27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
28 # off Bio::SeqFeature::Gene::Transcript.
29 #
30 # $gene->exons() returns an array of
31 # Bio::Tools::Prediction::Exon objects
32 # all exons:
33 @exon_arr = $gene->exons();
34
35 # initial exons only
36 @init_exons = $gene->exons('Initial');
37 # internal exons only
38 @intrl_exons = $gene->exons('Internal');
39 # terminal exons only
40 @term_exons = $gene->exons('Terminal');
41 # singleton exons:
42 ($single_exon) = $gene->exons();
43 }
44
45 # essential if you gave a filename at initialization (otherwise the file
46 # will stay open)
47 $Genemark->close();
48
49 =head1 DESCRIPTION
50
51 The Genemark module provides a parser for Genemark gene structure prediction
52 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
53 derived object.
54
55 This module has been developed around genemark.hmm for eukaryots v2.2a and will
56 probably not work with other versions.
57
58
59 This module also implements the Bio::SeqAnalysisParserI interface, and thus
60 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
61
62 =head1 FEEDBACK
63
64 =head2 Mailing Lists
65
66 User feedback is an integral part of the evolution of this and other
67 Bioperl modules. Send your comments and suggestions preferably to one
68 of the Bioperl mailing lists. Your participation is much appreciated.
69
70 bioperl-l@bioperl.org - General discussion
71 http://bio.perl.org/MailList.html - About the mailing lists
72
73 =head2 Reporting Bugs
74
75 Report bugs to the Bioperl bug tracking system to help us keep track
76 the bugs and their resolution. Bug reports can be submitted via email
77 or the web:
78
79 bioperl-bugs@bio.perl.org
80 http://bugzilla.bioperl.org/
81
82 =head1 AUTHOR - Hilmar Lapp, Mark Fiers
83
84 Email hlapp@gmx.net
85 m.w.e.j.fiers@plant.wag-ur.nl
86
87 Describe contact details here
88
89 =head1 APPENDIX
90
91 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
92
93 =cut
94
95
96 # Let the code begin...
97
98
99 package Bio::Tools::Genemark;
100 use vars qw(@ISA);
101 use strict;
102 use Symbol;
103
104 use Bio::Root::Root;
105 use Bio::Tools::AnalysisResult;
106 use Bio::Tools::Prediction::Gene;
107 use Bio::Tools::Prediction::Exon;
108 use Bio::Seq;
109
110 @ISA = qw(Bio::Tools::AnalysisResult);
111
112 sub _initialize_state {
113 my ($self,@args) = @_;
114
115 # first call the inherited method!
116 $self->SUPER::_initialize_state(@args);
117
118 # our private state variables
119 $self->{'_preds_parsed'} = 0;
120 $self->{'_has_cds'} = 0;
121 # array of pre-parsed predictions
122 $self->{'_preds'} = [];
123 # seq stack
124 $self->{'_seqstack'} = [];
125 }
126
127 =head2 analysis_method
128
129 Usage : $Genemark->analysis_method();
130 Purpose : Inherited method. Overridden to ensure that the name matches
131 /GeneMark.hmm/i.
132 Returns : String
133 Argument : n/a
134
135 =cut
136
137 #-------------
138 sub analysis_method {
139 #-------------
140 my ($self, $method) = @_;
141 if($method && ($method !~ /Genemark\.hmm/i)) {
142 $self->throw("method $method not supported in " . ref($self));
143 }
144 return $self->SUPER::analysis_method($method);
145 }
146
147 =head2 next_feature
148
149 Title : next_feature
150 Usage : while($gene = $Genemark->next_feature()) {
151 # do something
152 }
153 Function: Returns the next gene structure prediction of the Genemark result
154 file. Call this method repeatedly until FALSE is returned.
155
156 The returned object is actually a SeqFeatureI implementing object.
157 This method is required for classes implementing the
158 SeqAnalysisParserI interface, and is merely an alias for
159 next_prediction() at present.
160
161 Example :
162 Returns : A Bio::Tools::Prediction::Gene object.
163 Args :
164
165 =cut
166
167 sub next_feature {
168 my ($self,@args) = @_;
169 # even though next_prediction doesn't expect any args (and this method
170 # does neither), we pass on args in order to be prepared if this changes
171 # ever
172 return $self->next_prediction(@args);
173 }
174
175 =head2 next_prediction
176
177 Title : next_prediction
178 Usage : while($gene = $Genemark->next_prediction()) {
179 # do something
180 }
181 Function: Returns the next gene structure prediction of the Genemark result
182 file. Call this method repeatedly until FALSE is returned.
183
184 Example :
185 Returns : A Bio::Tools::Prediction::Gene object.
186 Args :
187
188 =cut
189
190 sub next_prediction {
191 my ($self) = @_;
192 my $gene;
193
194 # if the prediction section hasn't been parsed yet, we do this now
195 $self->_parse_predictions() unless $self->_predictions_parsed();
196
197 # get next gene structure
198 $gene = $self->_prediction();
199
200 return $gene;
201 }
202
203 =head2 _parse_predictions
204
205 Title : _parse_predictions()
206 Usage : $obj->_parse_predictions()
207 Function: Parses the prediction section. Automatically called by
208 next_prediction() if not yet done.
209 Example :
210 Returns :
211
212 =cut
213
214 sub _parse_predictions {
215 my ($self) = @_;
216 my %exontags = ('Initial' => 'Initial',
217 'Internal' => 'Internal',
218 'Terminal' => 'Terminal',
219 'Single' => '',
220 '_na_' => '');
221 my $exontag;
222 my $gene;
223 my $seqname;
224 my $exontype;
225 my $current_gene_no = -1;
226
227 while(defined($_ = $self->_readline())) {
228
229 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) {
230
231 # this is an exon, Genemark doesn't predict anything else
232 # $prednr corresponds to geneno.
233 my $prednr = $1;
234
235 #exon no:
236 my $signalnr = 0;
237 if ($2) { my $signalnr = $2; } # used in tag: exon_no
238
239 # split into fields
240 chomp();
241 my @flds = split(' ', $_);
242
243 # create the feature (an exon) object
244 my $predobj = Bio::Tools::Prediction::Exon->new();
245
246
247 # define info depending on it being eu- or prokaryot
248 my ($start, $end, $orientation, $prediction_source);
249
250 if ($self->analysis_method() =~ /PROKARYOTIC/i) {
251 $prediction_source = "Genemark.hmm.pro";
252 $orientation = ($flds[1] eq '+') ? 1 : -1;
253 ($start, $end) = @flds[(2,3)];
254 $exontag = "_na_";
255
256 } else {
257 $prediction_source = "Genemark.hmm.eu";
258 $orientation = ($flds[2] eq '+') ? 1 : -1;
259 ($start, $end) = @flds[(4,5)];
260 $exontag = $flds[3];
261 }
262
263 #store the data in the exon object
264 $predobj->source_tag($prediction_source);
265 $predobj->start($start);
266 $predobj->end($end);
267 $predobj->strand($orientation);
268
269 $predobj->primary_tag($exontags{$exontag} . "Exon");
270
271 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr);
272
273 $predobj->is_coding(1);
274
275
276 # frame calculation as in the genscan module
277 # is to be implemented...
278
279 #If the $prednr is not equal to the current gene, we
280 #need to make a new gene and close the old one
281 if($prednr != $current_gene_no) {
282 # a new gene, store the old one if it exists
283 if (defined ($gene)) {
284 $gene->seq_id($seqname);
285 $gene = undef ;
286 }
287 #and make a new one
288 $gene = Bio::Tools::Prediction::Gene->new
289 (
290 '-primary' => "GenePrediction$prednr",
291 '-source' => $prediction_source);
292 $self->_add_prediction($gene);
293 $current_gene_no = $prednr;
294 }
295
296 # Add the exon to the gene
297 $gene->add_exon($predobj, ($exontag eq "_na_" ?
298 undef : $exontags{$exontag}));
299
300 }
301
302 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) {
303 $self->analysis_method($1);
304
305 my $gm_version = $2;
306
307 $self->analysis_method_version($gm_version);
308 next;
309 }
310
311 #Matrix file for eukaryot version
312 if (/^Matrices file:\s+(\S+)?/i) {
313 $self->analysis_subject($1);
314 # since the line after the matrix file is always the date
315 # (in the output file's I have seen!) extract and store this
316 # here
317 if (defined(my $_date = $self->_readline())) {
318 chomp ($_date);
319 $self->analysis_date($_date);
320 }
321 }
322
323 #Matrix file for prokaryot version
324 if (/^Model file name:\s+(\S+)/) {
325 $self->analysis_subject($1);
326 # since the line after the matrix file is always the date
327 # (in the output file's I have seen!) extract and store this
328 # here
329 my $_date = $self->_readline() ;
330 if (defined($_date = $self->_readline())) {
331 chomp ($_date);
332 $self->analysis_date($_date);
333 }
334 }
335
336 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) {
337 $seqname = $1;
338 # $self->analysis_subject($seqname);
339 next;
340 }
341
342
343 /^>/ && do {
344 $self->_pushback($_);
345
346 # section of predicted aa sequences on recognition
347 # of a fasta start, read all sequences and find the
348 # appropriate gene
349 while (1) {
350 my ($aa_id, $seq) = $self->_read_fasta_seq();
351 last unless ($aa_id);
352
353 #now parse through the predictions to add the pred. protein
354 FINDPRED: foreach my $gene (@{$self->{'_preds'}}) {
355 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
356 my $geneno = $1;
357 if ($aa_id =~ /\|gene.$geneno\|/) {
358 #print "x SEQ : \n $seq \nXXXX\n";
359 my $seqobj = Bio::Seq->new('-seq' => $seq,
360 '-display_id' => $aa_id,
361 '-alphabet' => "protein");
362 $gene->predicted_protein($seqobj);
363 last FINDPRED;
364 }
365
366 }
367 }
368
369 last;
370 };
371 }
372
373 # if the analysis query object contains a ref to a Seq of PrimarySeq
374 # object, then extract the predicted sequences and add it to the gene
375 # object.
376 if (defined $self->analysis_query()) {
377 my $orig_seq = $self->analysis_query();
378 FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) {
379 my $predseq = "";
380 foreach my $exon ($gene->exons()) {
381 #print $exon->start() . " " . $exon->end () . "\n";
382 $predseq .= $orig_seq->subseq($exon->start(), $exon->end());
383 }
384
385 my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq,
386 '-display_id' => "transl");
387 $gene->predicted_cds($seqobj);
388 }
389 }
390
391
392 $self->_predictions_parsed(1);
393 }
394
395 =head2 _prediction
396
397 Title : _prediction()
398 Usage : $gene = $obj->_prediction()
399 Function: internal
400 Example :
401 Returns :
402
403 =cut
404
405 sub _prediction {
406 my ($self) = @_;
407
408 return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
409 return shift(@{$self->{'_preds'}});
410 }
411
412 =head2 _add_prediction
413
414 Title : _add_prediction()
415 Usage : $obj->_add_prediction($gene)
416 Function: internal
417 Example :
418 Returns :
419
420 =cut
421
422 sub _add_prediction {
423 my ($self, $gene) = @_;
424
425 if(! exists($self->{'_preds'})) {
426 $self->{'_preds'} = [];
427 }
428 push(@{$self->{'_preds'}}, $gene);
429 }
430
431 =head2 _predictions_parsed
432
433 Title : _predictions_parsed
434 Usage : $obj->_predictions_parsed
435 Function: internal
436 Example :
437 Returns : TRUE or FALSE
438
439 =cut
440
441 sub _predictions_parsed {
442 my ($self, $val) = @_;
443
444 $self->{'_preds_parsed'} = $val if $val;
445 if(! exists($self->{'_preds_parsed'})) {
446 $self->{'_preds_parsed'} = 0;
447 }
448 return $self->{'_preds_parsed'};
449 }
450
451 =head2 _has_cds
452
453 Title : _has_cds()
454 Usage : $obj->_has_cds()
455 Function: Whether or not the result contains the predicted CDSs, too.
456 Example :
457 Returns : TRUE or FALSE
458
459 =cut
460
461 sub _has_cds {
462 my ($self, $val) = @_;
463
464 $self->{'_has_cds'} = $val if $val;
465 if(! exists($self->{'_has_cds'})) {
466 $self->{'_has_cds'} = 0;
467 }
468 return $self->{'_has_cds'};
469 }
470
471 =head2 _read_fasta_seq
472
473 Title : _read_fasta_seq()
474 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
475 Function: Simple but specialised FASTA format sequence reader. Uses
476 $self->_readline() to retrieve input, and is able to strip off
477 the traling description lines.
478 Example :
479 Returns : An array of two elements.
480
481 =cut
482
483 sub _read_fasta_seq {
484 my ($self) = @_;
485 my ($id, $seq);
486 local $/ = ">";
487
488 return 0 unless (my $entry = $self->_readline());
489
490 $entry =~ s/^>//;
491 # complete the entry if the first line came from a pushback buffer
492 while(! ($entry =~ />$/)) {
493 last unless ($_ = $self->_readline());
494 $entry .= $_;
495 }
496
497 # delete everything onwards from an new fasta start (>)
498 $entry =~ s/\n>.*$//s;
499 # id and sequence
500
501 if($entry =~ s/^(.+)\n//) {
502 $id = $1;
503 $id =~ s/ /_/g;
504 $seq = $entry;
505 $seq =~ s/\s//g;
506 #print "\n@@ $id \n@@ $seq \n##\n";
507 } else {
508 $self->throw("Can't parse Genemark predicted sequence entry");
509 }
510 $seq =~ s/\s//g; # Remove whitespace
511 return ($id, $seq);
512 }
513
514 1;
515