Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Genemark.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: Genemark.pm,v 1.11.2.1 2003/04/24 08:51:48 heikki Exp $ | |
2 # | |
3 # BioPerl module for Bio::Tools::Genemark | |
4 # | |
5 # Cared for by Mark Fiers <hlapp@gmx.net> | |
6 # | |
7 # Copyright Hilmar Lapp, Mark Fiers | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Tools::Genemark - Results of one Genemark run | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark'); | |
20 # filehandle: | |
21 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT ); | |
22 | |
23 # parse the results | |
24 # note: this class is-a Bio::Tools::AnalysisResult which implements | |
25 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same | |
26 while($gene = $Genemark->next_prediction()) { | |
27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits | |
28 # off Bio::SeqFeature::Gene::Transcript. | |
29 # | |
30 # $gene->exons() returns an array of | |
31 # Bio::Tools::Prediction::Exon objects | |
32 # all exons: | |
33 @exon_arr = $gene->exons(); | |
34 | |
35 # initial exons only | |
36 @init_exons = $gene->exons('Initial'); | |
37 # internal exons only | |
38 @intrl_exons = $gene->exons('Internal'); | |
39 # terminal exons only | |
40 @term_exons = $gene->exons('Terminal'); | |
41 # singleton exons: | |
42 ($single_exon) = $gene->exons(); | |
43 } | |
44 | |
45 # essential if you gave a filename at initialization (otherwise the file | |
46 # will stay open) | |
47 $Genemark->close(); | |
48 | |
49 =head1 DESCRIPTION | |
50 | |
51 The Genemark module provides a parser for Genemark gene structure prediction | |
52 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript- | |
53 derived object. | |
54 | |
55 This module has been developed around genemark.hmm for eukaryots v2.2a and will | |
56 probably not work with other versions. | |
57 | |
58 | |
59 This module also implements the Bio::SeqAnalysisParserI interface, and thus | |
60 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>. | |
61 | |
62 =head1 FEEDBACK | |
63 | |
64 =head2 Mailing Lists | |
65 | |
66 User feedback is an integral part of the evolution of this and other | |
67 Bioperl modules. Send your comments and suggestions preferably to one | |
68 of the Bioperl mailing lists. Your participation is much appreciated. | |
69 | |
70 bioperl-l@bioperl.org - General discussion | |
71 http://bio.perl.org/MailList.html - About the mailing lists | |
72 | |
73 =head2 Reporting Bugs | |
74 | |
75 Report bugs to the Bioperl bug tracking system to help us keep track | |
76 the bugs and their resolution. Bug reports can be submitted via email | |
77 or the web: | |
78 | |
79 bioperl-bugs@bio.perl.org | |
80 http://bugzilla.bioperl.org/ | |
81 | |
82 =head1 AUTHOR - Hilmar Lapp, Mark Fiers | |
83 | |
84 Email hlapp@gmx.net | |
85 m.w.e.j.fiers@plant.wag-ur.nl | |
86 | |
87 Describe contact details here | |
88 | |
89 =head1 APPENDIX | |
90 | |
91 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
92 | |
93 =cut | |
94 | |
95 | |
96 # Let the code begin... | |
97 | |
98 | |
99 package Bio::Tools::Genemark; | |
100 use vars qw(@ISA); | |
101 use strict; | |
102 use Symbol; | |
103 | |
104 use Bio::Root::Root; | |
105 use Bio::Tools::AnalysisResult; | |
106 use Bio::Tools::Prediction::Gene; | |
107 use Bio::Tools::Prediction::Exon; | |
108 use Bio::Seq; | |
109 | |
110 @ISA = qw(Bio::Tools::AnalysisResult); | |
111 | |
112 sub _initialize_state { | |
113 my ($self,@args) = @_; | |
114 | |
115 # first call the inherited method! | |
116 $self->SUPER::_initialize_state(@args); | |
117 | |
118 # our private state variables | |
119 $self->{'_preds_parsed'} = 0; | |
120 $self->{'_has_cds'} = 0; | |
121 # array of pre-parsed predictions | |
122 $self->{'_preds'} = []; | |
123 # seq stack | |
124 $self->{'_seqstack'} = []; | |
125 } | |
126 | |
127 =head2 analysis_method | |
128 | |
129 Usage : $Genemark->analysis_method(); | |
130 Purpose : Inherited method. Overridden to ensure that the name matches | |
131 /GeneMark.hmm/i. | |
132 Returns : String | |
133 Argument : n/a | |
134 | |
135 =cut | |
136 | |
137 #------------- | |
138 sub analysis_method { | |
139 #------------- | |
140 my ($self, $method) = @_; | |
141 if($method && ($method !~ /Genemark\.hmm/i)) { | |
142 $self->throw("method $method not supported in " . ref($self)); | |
143 } | |
144 return $self->SUPER::analysis_method($method); | |
145 } | |
146 | |
147 =head2 next_feature | |
148 | |
149 Title : next_feature | |
150 Usage : while($gene = $Genemark->next_feature()) { | |
151 # do something | |
152 } | |
153 Function: Returns the next gene structure prediction of the Genemark result | |
154 file. Call this method repeatedly until FALSE is returned. | |
155 | |
156 The returned object is actually a SeqFeatureI implementing object. | |
157 This method is required for classes implementing the | |
158 SeqAnalysisParserI interface, and is merely an alias for | |
159 next_prediction() at present. | |
160 | |
161 Example : | |
162 Returns : A Bio::Tools::Prediction::Gene object. | |
163 Args : | |
164 | |
165 =cut | |
166 | |
167 sub next_feature { | |
168 my ($self,@args) = @_; | |
169 # even though next_prediction doesn't expect any args (and this method | |
170 # does neither), we pass on args in order to be prepared if this changes | |
171 # ever | |
172 return $self->next_prediction(@args); | |
173 } | |
174 | |
175 =head2 next_prediction | |
176 | |
177 Title : next_prediction | |
178 Usage : while($gene = $Genemark->next_prediction()) { | |
179 # do something | |
180 } | |
181 Function: Returns the next gene structure prediction of the Genemark result | |
182 file. Call this method repeatedly until FALSE is returned. | |
183 | |
184 Example : | |
185 Returns : A Bio::Tools::Prediction::Gene object. | |
186 Args : | |
187 | |
188 =cut | |
189 | |
190 sub next_prediction { | |
191 my ($self) = @_; | |
192 my $gene; | |
193 | |
194 # if the prediction section hasn't been parsed yet, we do this now | |
195 $self->_parse_predictions() unless $self->_predictions_parsed(); | |
196 | |
197 # get next gene structure | |
198 $gene = $self->_prediction(); | |
199 | |
200 return $gene; | |
201 } | |
202 | |
203 =head2 _parse_predictions | |
204 | |
205 Title : _parse_predictions() | |
206 Usage : $obj->_parse_predictions() | |
207 Function: Parses the prediction section. Automatically called by | |
208 next_prediction() if not yet done. | |
209 Example : | |
210 Returns : | |
211 | |
212 =cut | |
213 | |
214 sub _parse_predictions { | |
215 my ($self) = @_; | |
216 my %exontags = ('Initial' => 'Initial', | |
217 'Internal' => 'Internal', | |
218 'Terminal' => 'Terminal', | |
219 'Single' => '', | |
220 '_na_' => ''); | |
221 my $exontag; | |
222 my $gene; | |
223 my $seqname; | |
224 my $exontype; | |
225 my $current_gene_no = -1; | |
226 | |
227 while(defined($_ = $self->_readline())) { | |
228 | |
229 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) { | |
230 | |
231 # this is an exon, Genemark doesn't predict anything else | |
232 # $prednr corresponds to geneno. | |
233 my $prednr = $1; | |
234 | |
235 #exon no: | |
236 my $signalnr = 0; | |
237 if ($2) { my $signalnr = $2; } # used in tag: exon_no | |
238 | |
239 # split into fields | |
240 chomp(); | |
241 my @flds = split(' ', $_); | |
242 | |
243 # create the feature (an exon) object | |
244 my $predobj = Bio::Tools::Prediction::Exon->new(); | |
245 | |
246 | |
247 # define info depending on it being eu- or prokaryot | |
248 my ($start, $end, $orientation, $prediction_source); | |
249 | |
250 if ($self->analysis_method() =~ /PROKARYOTIC/i) { | |
251 $prediction_source = "Genemark.hmm.pro"; | |
252 $orientation = ($flds[1] eq '+') ? 1 : -1; | |
253 ($start, $end) = @flds[(2,3)]; | |
254 $exontag = "_na_"; | |
255 | |
256 } else { | |
257 $prediction_source = "Genemark.hmm.eu"; | |
258 $orientation = ($flds[2] eq '+') ? 1 : -1; | |
259 ($start, $end) = @flds[(4,5)]; | |
260 $exontag = $flds[3]; | |
261 } | |
262 | |
263 #store the data in the exon object | |
264 $predobj->source_tag($prediction_source); | |
265 $predobj->start($start); | |
266 $predobj->end($end); | |
267 $predobj->strand($orientation); | |
268 | |
269 $predobj->primary_tag($exontags{$exontag} . "Exon"); | |
270 | |
271 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr); | |
272 | |
273 $predobj->is_coding(1); | |
274 | |
275 | |
276 # frame calculation as in the genscan module | |
277 # is to be implemented... | |
278 | |
279 #If the $prednr is not equal to the current gene, we | |
280 #need to make a new gene and close the old one | |
281 if($prednr != $current_gene_no) { | |
282 # a new gene, store the old one if it exists | |
283 if (defined ($gene)) { | |
284 $gene->seq_id($seqname); | |
285 $gene = undef ; | |
286 } | |
287 #and make a new one | |
288 $gene = Bio::Tools::Prediction::Gene->new | |
289 ( | |
290 '-primary' => "GenePrediction$prednr", | |
291 '-source' => $prediction_source); | |
292 $self->_add_prediction($gene); | |
293 $current_gene_no = $prednr; | |
294 } | |
295 | |
296 # Add the exon to the gene | |
297 $gene->add_exon($predobj, ($exontag eq "_na_" ? | |
298 undef : $exontags{$exontag})); | |
299 | |
300 } | |
301 | |
302 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) { | |
303 $self->analysis_method($1); | |
304 | |
305 my $gm_version = $2; | |
306 | |
307 $self->analysis_method_version($gm_version); | |
308 next; | |
309 } | |
310 | |
311 #Matrix file for eukaryot version | |
312 if (/^Matrices file:\s+(\S+)?/i) { | |
313 $self->analysis_subject($1); | |
314 # since the line after the matrix file is always the date | |
315 # (in the output file's I have seen!) extract and store this | |
316 # here | |
317 if (defined(my $_date = $self->_readline())) { | |
318 chomp ($_date); | |
319 $self->analysis_date($_date); | |
320 } | |
321 } | |
322 | |
323 #Matrix file for prokaryot version | |
324 if (/^Model file name:\s+(\S+)/) { | |
325 $self->analysis_subject($1); | |
326 # since the line after the matrix file is always the date | |
327 # (in the output file's I have seen!) extract and store this | |
328 # here | |
329 my $_date = $self->_readline() ; | |
330 if (defined($_date = $self->_readline())) { | |
331 chomp ($_date); | |
332 $self->analysis_date($_date); | |
333 } | |
334 } | |
335 | |
336 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) { | |
337 $seqname = $1; | |
338 # $self->analysis_subject($seqname); | |
339 next; | |
340 } | |
341 | |
342 | |
343 /^>/ && do { | |
344 $self->_pushback($_); | |
345 | |
346 # section of predicted aa sequences on recognition | |
347 # of a fasta start, read all sequences and find the | |
348 # appropriate gene | |
349 while (1) { | |
350 my ($aa_id, $seq) = $self->_read_fasta_seq(); | |
351 last unless ($aa_id); | |
352 | |
353 #now parse through the predictions to add the pred. protein | |
354 FINDPRED: foreach my $gene (@{$self->{'_preds'}}) { | |
355 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/; | |
356 my $geneno = $1; | |
357 if ($aa_id =~ /\|gene.$geneno\|/) { | |
358 #print "x SEQ : \n $seq \nXXXX\n"; | |
359 my $seqobj = Bio::Seq->new('-seq' => $seq, | |
360 '-display_id' => $aa_id, | |
361 '-alphabet' => "protein"); | |
362 $gene->predicted_protein($seqobj); | |
363 last FINDPRED; | |
364 } | |
365 | |
366 } | |
367 } | |
368 | |
369 last; | |
370 }; | |
371 } | |
372 | |
373 # if the analysis query object contains a ref to a Seq of PrimarySeq | |
374 # object, then extract the predicted sequences and add it to the gene | |
375 # object. | |
376 if (defined $self->analysis_query()) { | |
377 my $orig_seq = $self->analysis_query(); | |
378 FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) { | |
379 my $predseq = ""; | |
380 foreach my $exon ($gene->exons()) { | |
381 #print $exon->start() . " " . $exon->end () . "\n"; | |
382 $predseq .= $orig_seq->subseq($exon->start(), $exon->end()); | |
383 } | |
384 | |
385 my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq, | |
386 '-display_id' => "transl"); | |
387 $gene->predicted_cds($seqobj); | |
388 } | |
389 } | |
390 | |
391 | |
392 $self->_predictions_parsed(1); | |
393 } | |
394 | |
395 =head2 _prediction | |
396 | |
397 Title : _prediction() | |
398 Usage : $gene = $obj->_prediction() | |
399 Function: internal | |
400 Example : | |
401 Returns : | |
402 | |
403 =cut | |
404 | |
405 sub _prediction { | |
406 my ($self) = @_; | |
407 | |
408 return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}}); | |
409 return shift(@{$self->{'_preds'}}); | |
410 } | |
411 | |
412 =head2 _add_prediction | |
413 | |
414 Title : _add_prediction() | |
415 Usage : $obj->_add_prediction($gene) | |
416 Function: internal | |
417 Example : | |
418 Returns : | |
419 | |
420 =cut | |
421 | |
422 sub _add_prediction { | |
423 my ($self, $gene) = @_; | |
424 | |
425 if(! exists($self->{'_preds'})) { | |
426 $self->{'_preds'} = []; | |
427 } | |
428 push(@{$self->{'_preds'}}, $gene); | |
429 } | |
430 | |
431 =head2 _predictions_parsed | |
432 | |
433 Title : _predictions_parsed | |
434 Usage : $obj->_predictions_parsed | |
435 Function: internal | |
436 Example : | |
437 Returns : TRUE or FALSE | |
438 | |
439 =cut | |
440 | |
441 sub _predictions_parsed { | |
442 my ($self, $val) = @_; | |
443 | |
444 $self->{'_preds_parsed'} = $val if $val; | |
445 if(! exists($self->{'_preds_parsed'})) { | |
446 $self->{'_preds_parsed'} = 0; | |
447 } | |
448 return $self->{'_preds_parsed'}; | |
449 } | |
450 | |
451 =head2 _has_cds | |
452 | |
453 Title : _has_cds() | |
454 Usage : $obj->_has_cds() | |
455 Function: Whether or not the result contains the predicted CDSs, too. | |
456 Example : | |
457 Returns : TRUE or FALSE | |
458 | |
459 =cut | |
460 | |
461 sub _has_cds { | |
462 my ($self, $val) = @_; | |
463 | |
464 $self->{'_has_cds'} = $val if $val; | |
465 if(! exists($self->{'_has_cds'})) { | |
466 $self->{'_has_cds'} = 0; | |
467 } | |
468 return $self->{'_has_cds'}; | |
469 } | |
470 | |
471 =head2 _read_fasta_seq | |
472 | |
473 Title : _read_fasta_seq() | |
474 Usage : ($id,$seqstr) = $obj->_read_fasta_seq(); | |
475 Function: Simple but specialised FASTA format sequence reader. Uses | |
476 $self->_readline() to retrieve input, and is able to strip off | |
477 the traling description lines. | |
478 Example : | |
479 Returns : An array of two elements. | |
480 | |
481 =cut | |
482 | |
483 sub _read_fasta_seq { | |
484 my ($self) = @_; | |
485 my ($id, $seq); | |
486 local $/ = ">"; | |
487 | |
488 return 0 unless (my $entry = $self->_readline()); | |
489 | |
490 $entry =~ s/^>//; | |
491 # complete the entry if the first line came from a pushback buffer | |
492 while(! ($entry =~ />$/)) { | |
493 last unless ($_ = $self->_readline()); | |
494 $entry .= $_; | |
495 } | |
496 | |
497 # delete everything onwards from an new fasta start (>) | |
498 $entry =~ s/\n>.*$//s; | |
499 # id and sequence | |
500 | |
501 if($entry =~ s/^(.+)\n//) { | |
502 $id = $1; | |
503 $id =~ s/ /_/g; | |
504 $seq = $entry; | |
505 $seq =~ s/\s//g; | |
506 #print "\n@@ $id \n@@ $seq \n##\n"; | |
507 } else { | |
508 $self->throw("Can't parse Genemark predicted sequence entry"); | |
509 } | |
510 $seq =~ s/\s//g; # Remove whitespace | |
511 return ($id, $seq); | |
512 } | |
513 | |
514 1; | |
515 |