Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/Tools/Genscan.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: Genscan.pm,v 1.22 2002/10/22 07:38:46 lapp Exp $ | |
2 # | |
3 # BioPerl module for Bio::Tools::Genscan | |
4 # | |
5 # Cared for by Hilmar Lapp <hlapp@gmx.net> | |
6 # | |
7 # Copyright Hilmar Lapp | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Tools::Genscan - Results of one Genscan run | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan'); | |
20 # filehandle: | |
21 $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT ); | |
22 | |
23 # parse the results | |
24 # note: this class is-a Bio::Tools::AnalysisResult which implements | |
25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same | |
26 while($gene = $genscan->next_prediction()) { | |
27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits | |
28 # off Bio::SeqFeature::Gene::Transcript. | |
29 # | |
30 # $gene->exons() returns an array of | |
31 # Bio::Tools::Prediction::Exon objects | |
32 # all exons: | |
33 @exon_arr = $gene->exons(); | |
34 | |
35 # initial exons only | |
36 @init_exons = $gene->exons('Initial'); | |
37 # internal exons only | |
38 @intrl_exons = $gene->exons('Internal'); | |
39 # terminal exons only | |
40 @term_exons = $gene->exons('Terminal'); | |
41 # singleton exons: | |
42 ($single_exon) = $gene->exons(); | |
43 } | |
44 | |
45 # essential if you gave a filename at initialization (otherwise the file | |
46 # will stay open) | |
47 $genscan->close(); | |
48 | |
49 =head1 DESCRIPTION | |
50 | |
51 The Genscan module provides a parser for Genscan gene structure prediction | |
52 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript- | |
53 derived object. | |
54 | |
55 This module also implements the Bio::SeqAnalysisParserI interface, and thus | |
56 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>. | |
57 | |
58 =head1 FEEDBACK | |
59 | |
60 =head2 Mailing Lists | |
61 | |
62 User feedback is an integral part of the evolution of this and other | |
63 Bioperl modules. Send your comments and suggestions preferably to one | |
64 of the Bioperl mailing lists. Your participation is much appreciated. | |
65 | |
66 bioperl-l@bioperl.org - General discussion | |
67 http://bio.perl.org/MailList.html - About the mailing lists | |
68 | |
69 =head2 Reporting Bugs | |
70 | |
71 Report bugs to the Bioperl bug tracking system to help us keep track | |
72 the bugs and their resolution. Bug reports can be submitted via email | |
73 or the web: | |
74 | |
75 bioperl-bugs@bio.perl.org | |
76 http://bugzilla.bioperl.org/ | |
77 | |
78 =head1 AUTHOR - Hilmar Lapp | |
79 | |
80 Email hlapp@gmx.net | |
81 | |
82 Describe contact details here | |
83 | |
84 =head1 APPENDIX | |
85 | |
86 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
87 | |
88 =cut | |
89 | |
90 | |
91 # Let the code begin... | |
92 | |
93 | |
94 package Bio::Tools::Genscan; | |
95 use vars qw(@ISA); | |
96 use strict; | |
97 use Symbol; | |
98 | |
99 use Bio::Root::Root; | |
100 use Bio::Tools::AnalysisResult; | |
101 use Bio::Tools::Prediction::Gene; | |
102 use Bio::Tools::Prediction::Exon; | |
103 | |
104 @ISA = qw(Bio::Tools::AnalysisResult); | |
105 | |
106 sub _initialize_state { | |
107 my ($self,@args) = @_; | |
108 | |
109 # first call the inherited method! | |
110 $self->SUPER::_initialize_state(@args); | |
111 | |
112 # our private state variables | |
113 $self->{'_preds_parsed'} = 0; | |
114 $self->{'_has_cds'} = 0; | |
115 # array of pre-parsed predictions | |
116 $self->{'_preds'} = []; | |
117 # seq stack | |
118 $self->{'_seqstack'} = []; | |
119 } | |
120 | |
121 =head2 analysis_method | |
122 | |
123 Usage : $genscan->analysis_method(); | |
124 Purpose : Inherited method. Overridden to ensure that the name matches | |
125 /genscan/i. | |
126 Returns : String | |
127 Argument : n/a | |
128 | |
129 =cut | |
130 | |
131 #------------- | |
132 sub analysis_method { | |
133 #------------- | |
134 my ($self, $method) = @_; | |
135 if($method && ($method !~ /genscan/i)) { | |
136 $self->throw("method $method not supported in " . ref($self)); | |
137 } | |
138 return $self->SUPER::analysis_method($method); | |
139 } | |
140 | |
141 =head2 next_feature | |
142 | |
143 Title : next_feature | |
144 Usage : while($gene = $genscan->next_feature()) { | |
145 # do something | |
146 } | |
147 Function: Returns the next gene structure prediction of the Genscan result | |
148 file. Call this method repeatedly until FALSE is returned. | |
149 | |
150 The returned object is actually a SeqFeatureI implementing object. | |
151 This method is required for classes implementing the | |
152 SeqAnalysisParserI interface, and is merely an alias for | |
153 next_prediction() at present. | |
154 | |
155 Example : | |
156 Returns : A Bio::Tools::Prediction::Gene object. | |
157 Args : | |
158 | |
159 =cut | |
160 | |
161 sub next_feature { | |
162 my ($self,@args) = @_; | |
163 # even though next_prediction doesn't expect any args (and this method | |
164 # does neither), we pass on args in order to be prepared if this changes | |
165 # ever | |
166 return $self->next_prediction(@args); | |
167 } | |
168 | |
169 =head2 next_prediction | |
170 | |
171 Title : next_prediction | |
172 Usage : while($gene = $genscan->next_prediction()) { | |
173 # do something | |
174 } | |
175 Function: Returns the next gene structure prediction of the Genscan result | |
176 file. Call this method repeatedly until FALSE is returned. | |
177 | |
178 Example : | |
179 Returns : A Bio::Tools::Prediction::Gene object. | |
180 Args : | |
181 | |
182 =cut | |
183 | |
184 sub next_prediction { | |
185 my ($self) = @_; | |
186 my $gene; | |
187 | |
188 # if the prediction section hasn't been parsed yet, we do this now | |
189 $self->_parse_predictions() unless $self->_predictions_parsed(); | |
190 | |
191 # get next gene structure | |
192 $gene = $self->_prediction(); | |
193 | |
194 if($gene) { | |
195 # fill in predicted protein, and if available the predicted CDS | |
196 # | |
197 my ($id, $seq); | |
198 # use the seq stack if there's a seq on it | |
199 my $seqobj = pop(@{$self->{'_seqstack'}}); | |
200 if(! $seqobj) { | |
201 # otherwise read from input stream | |
202 ($id, $seq) = $self->_read_fasta_seq(); | |
203 # there may be no sequence at all, or none any more | |
204 if($id && $seq) { | |
205 $seqobj = Bio::PrimarySeq->new('-seq' => $seq, | |
206 '-display_id' => $id, | |
207 '-alphabet' => "protein"); | |
208 } | |
209 } | |
210 if($seqobj) { | |
211 # check that prediction number matches the prediction number | |
212 # indicated in the sequence id (there may be incomplete gene | |
213 # predictions that contain only signals with no associated protein | |
214 # and CDS, like promoters, poly-A sites etc) | |
215 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/; | |
216 my $prednr = $1; | |
217 if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) { | |
218 # this is not our sequence, so push back for next prediction | |
219 push(@{$self->{'_seqstack'}}, $seqobj); | |
220 } else { | |
221 $gene->predicted_protein($seqobj); | |
222 # CDS prediction, too? | |
223 if($self->_has_cds()) { | |
224 ($id, $seq) = $self->_read_fasta_seq(); | |
225 $seqobj = Bio::PrimarySeq->new('-seq' => $seq, | |
226 '-display_id' => $id, | |
227 '-alphabet' => "dna"); | |
228 $gene->predicted_cds($seqobj); | |
229 } | |
230 } | |
231 } | |
232 } | |
233 | |
234 return $gene; | |
235 } | |
236 | |
237 =head2 _parse_predictions | |
238 | |
239 Title : _parse_predictions() | |
240 Usage : $obj->_parse_predictions() | |
241 Function: Parses the prediction section. Automatically called by | |
242 next_prediction() if not yet done. | |
243 Example : | |
244 Returns : | |
245 | |
246 =cut | |
247 | |
248 sub _parse_predictions { | |
249 my ($self) = @_; | |
250 my %exontags = ('Init' => 'Initial', | |
251 'Intr' => 'Internal', | |
252 'Term' => 'Terminal', | |
253 'Sngl' => ''); | |
254 my $gene; | |
255 my $seqname; | |
256 | |
257 while(defined($_ = $self->_readline())) { | |
258 if(/^\s*(\d+)\.(\d+)/) { | |
259 # exon or signal | |
260 my $prednr = $1; | |
261 my $signalnr = $2; # not used presently | |
262 if(! defined($gene)) { | |
263 $gene = Bio::Tools::Prediction::Gene->new( | |
264 '-primary' => "GenePrediction$prednr", | |
265 '-source' => 'Genscan'); | |
266 } | |
267 # split into fields | |
268 chomp(); | |
269 my @flds = split(' ', $_); | |
270 # create the feature object depending on the type of signal | |
271 my $predobj; | |
272 my $is_exon = grep {$_ eq $flds[1];} (keys(%exontags)); | |
273 if($is_exon) { | |
274 $predobj = Bio::Tools::Prediction::Exon->new(); | |
275 } else { | |
276 # PolyA site, or Promoter | |
277 $predobj = Bio::SeqFeature::Generic->new(); | |
278 } | |
279 # set common fields | |
280 $predobj->source_tag('Genscan'); | |
281 $predobj->score($flds[$#flds]); | |
282 $predobj->strand((($flds[2] eq '+') ? 1 : -1)); | |
283 my ($start, $end) = @flds[(3,4)]; | |
284 if($predobj->strand() == 1) { | |
285 $predobj->start($start); | |
286 $predobj->end($end); | |
287 } else { | |
288 $predobj->end($start); | |
289 $predobj->start($end); | |
290 } | |
291 # add to gene structure (should be done only when start and end | |
292 # are set, in order to allow for proper expansion of the range) | |
293 if($is_exon) { | |
294 # first, set fields unique to exons | |
295 $predobj->start_signal_score($flds[8]); | |
296 $predobj->end_signal_score($flds[9]); | |
297 $predobj->coding_signal_score($flds[10]); | |
298 $predobj->significance($flds[11]); | |
299 $predobj->primary_tag($exontags{$flds[1]} . 'Exon'); | |
300 $predobj->is_coding(1); | |
301 # Figure out the frame of this exon. This is NOT the frame | |
302 # given by Genscan, which is the absolute frame of the base | |
303 # starting the first predicted complete codon. By comparing | |
304 # to the absolute frame of the first base we can compute the | |
305 # offset of the first complete codon to the first base of the | |
306 # exon, which determines the frame of the exon. | |
307 my $cod_offset; | |
308 if($predobj->strand() == 1) { | |
309 $cod_offset = $flds[6] - (($predobj->start()-1) % 3); | |
310 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond | |
311 # to offsets 2 and 1, resp. Offset 3 is the same as 0. | |
312 $cod_offset += 3 if($cod_offset < 1); | |
313 } else { | |
314 # On the reverse strand the Genscan frame also refers to | |
315 # the first base of the first complete codon, but viewed | |
316 # from forward, which is the third base viewed from | |
317 # reverse. | |
318 $cod_offset = $flds[6] - (($predobj->end()-3) % 3); | |
319 # Possible values are -2, -1, 0, 1, 2. Due to the reverse | |
320 # situation, {2,-1} and {1,-2} correspond to offsets | |
321 # 1 and 2, resp. Offset 3 is the same as 0. | |
322 $cod_offset -= 3 if($cod_offset >= 0); | |
323 $cod_offset = -$cod_offset; | |
324 } | |
325 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon | |
326 # is the frame of the first base relative to the exon, or the | |
327 # number of bases the first codon is missing). | |
328 $predobj->frame(3 - $cod_offset); | |
329 # then add to gene structure object | |
330 $gene->add_exon($predobj, $exontags{$flds[1]}); | |
331 } elsif($flds[1] eq 'PlyA') { | |
332 $predobj->primary_tag("PolyAsite"); | |
333 $gene->poly_A_site($predobj); | |
334 } elsif($flds[1] eq 'Prom') { | |
335 $predobj->primary_tag("Promoter"); | |
336 $gene->add_promoter($predobj); | |
337 } | |
338 next; | |
339 } | |
340 if(/^\s*$/ && defined($gene)) { | |
341 # current gene is completed | |
342 $gene->seq_id($seqname); | |
343 $self->_add_prediction($gene); | |
344 $gene = undef; | |
345 next; | |
346 } | |
347 if(/^(GENSCAN)\s+(\S+)/) { | |
348 $self->analysis_method($1); | |
349 $self->analysis_method_version($2); | |
350 next; | |
351 } | |
352 if(/^Sequence\s+(\S+)\s*:/) { | |
353 $seqname = $1; | |
354 next; | |
355 } | |
356 | |
357 if(/^Parameter matrix:\s+(\S+)/i) { | |
358 $self->analysis_subject($1); | |
359 next; | |
360 } | |
361 | |
362 if(/^Predicted coding/) { | |
363 $self->_has_cds(1); | |
364 next; | |
365 } | |
366 /^>/ && do { | |
367 # section of predicted sequences | |
368 $self->_pushback($_); | |
369 last; | |
370 }; | |
371 } | |
372 $self->_predictions_parsed(1); | |
373 } | |
374 | |
375 =head2 _prediction | |
376 | |
377 Title : _prediction() | |
378 Usage : $gene = $obj->_prediction() | |
379 Function: internal | |
380 Example : | |
381 Returns : | |
382 | |
383 =cut | |
384 | |
385 sub _prediction { | |
386 my ($self) = @_; | |
387 | |
388 return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}}); | |
389 return shift(@{$self->{'_preds'}}); | |
390 } | |
391 | |
392 =head2 _add_prediction | |
393 | |
394 Title : _add_prediction() | |
395 Usage : $obj->_add_prediction($gene) | |
396 Function: internal | |
397 Example : | |
398 Returns : | |
399 | |
400 =cut | |
401 | |
402 sub _add_prediction { | |
403 my ($self, $gene) = @_; | |
404 | |
405 if(! exists($self->{'_preds'})) { | |
406 $self->{'_preds'} = []; | |
407 } | |
408 push(@{$self->{'_preds'}}, $gene); | |
409 } | |
410 | |
411 =head2 _predictions_parsed | |
412 | |
413 Title : _predictions_parsed | |
414 Usage : $obj->_predictions_parsed | |
415 Function: internal | |
416 Example : | |
417 Returns : TRUE or FALSE | |
418 | |
419 =cut | |
420 | |
421 sub _predictions_parsed { | |
422 my ($self, $val) = @_; | |
423 | |
424 $self->{'_preds_parsed'} = $val if $val; | |
425 if(! exists($self->{'_preds_parsed'})) { | |
426 $self->{'_preds_parsed'} = 0; | |
427 } | |
428 return $self->{'_preds_parsed'}; | |
429 } | |
430 | |
431 =head2 _has_cds | |
432 | |
433 Title : _has_cds() | |
434 Usage : $obj->_has_cds() | |
435 Function: Whether or not the result contains the predicted CDSs, too. | |
436 Example : | |
437 Returns : TRUE or FALSE | |
438 | |
439 =cut | |
440 | |
441 sub _has_cds { | |
442 my ($self, $val) = @_; | |
443 | |
444 $self->{'_has_cds'} = $val if $val; | |
445 if(! exists($self->{'_has_cds'})) { | |
446 $self->{'_has_cds'} = 0; | |
447 } | |
448 return $self->{'_has_cds'}; | |
449 } | |
450 | |
451 =head2 _read_fasta_seq | |
452 | |
453 Title : _read_fasta_seq() | |
454 Usage : ($id,$seqstr) = $obj->_read_fasta_seq(); | |
455 Function: Simple but specialised FASTA format sequence reader. Uses | |
456 $self->_readline() to retrieve input, and is able to strip off | |
457 the traling description lines. | |
458 Example : | |
459 Returns : An array of two elements. | |
460 | |
461 =cut | |
462 | |
463 sub _read_fasta_seq { | |
464 my ($self) = @_; | |
465 my ($id, $seq); | |
466 local $/ = ">"; | |
467 | |
468 my $entry = $self->_readline(); | |
469 if($entry) { | |
470 $entry =~ s/^>//; | |
471 # complete the entry if the first line came from a pushback buffer | |
472 while($entry !~ />$/) { | |
473 last unless $_ = $self->_readline(); | |
474 $entry .= $_; | |
475 } | |
476 # delete everything onwards from an intervening empty line (at the | |
477 # end there might be statistics stuff) | |
478 $entry =~ s/\n\n.*$//s; | |
479 # id and sequence | |
480 if($entry =~ /^(\S+)\n([^>]+)/) { | |
481 $id = $1; | |
482 $seq = $2; | |
483 } else { | |
484 $self->throw("Can't parse Genscan predicted sequence entry"); | |
485 } | |
486 $seq =~ s/\s//g; # Remove whitespace | |
487 } | |
488 return ($id, $seq); | |
489 } | |
490 | |
491 1; |