0
|
1 # $Id: Genscan.pm,v 1.22 2002/10/22 07:38:46 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Tools::Genscan
|
|
4 #
|
|
5 # Cared for by Hilmar Lapp <hlapp@gmx.net>
|
|
6 #
|
|
7 # Copyright Hilmar Lapp
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Tools::Genscan - Results of one Genscan run
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan');
|
|
20 # filehandle:
|
|
21 $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT );
|
|
22
|
|
23 # parse the results
|
|
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
|
|
25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
|
|
26 while($gene = $genscan->next_prediction()) {
|
|
27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
|
|
28 # off Bio::SeqFeature::Gene::Transcript.
|
|
29 #
|
|
30 # $gene->exons() returns an array of
|
|
31 # Bio::Tools::Prediction::Exon objects
|
|
32 # all exons:
|
|
33 @exon_arr = $gene->exons();
|
|
34
|
|
35 # initial exons only
|
|
36 @init_exons = $gene->exons('Initial');
|
|
37 # internal exons only
|
|
38 @intrl_exons = $gene->exons('Internal');
|
|
39 # terminal exons only
|
|
40 @term_exons = $gene->exons('Terminal');
|
|
41 # singleton exons:
|
|
42 ($single_exon) = $gene->exons();
|
|
43 }
|
|
44
|
|
45 # essential if you gave a filename at initialization (otherwise the file
|
|
46 # will stay open)
|
|
47 $genscan->close();
|
|
48
|
|
49 =head1 DESCRIPTION
|
|
50
|
|
51 The Genscan module provides a parser for Genscan gene structure prediction
|
|
52 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
|
|
53 derived object.
|
|
54
|
|
55 This module also implements the Bio::SeqAnalysisParserI interface, and thus
|
|
56 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
|
|
57
|
|
58 =head1 FEEDBACK
|
|
59
|
|
60 =head2 Mailing Lists
|
|
61
|
|
62 User feedback is an integral part of the evolution of this and other
|
|
63 Bioperl modules. Send your comments and suggestions preferably to one
|
|
64 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
65
|
|
66 bioperl-l@bioperl.org - General discussion
|
|
67 http://bio.perl.org/MailList.html - About the mailing lists
|
|
68
|
|
69 =head2 Reporting Bugs
|
|
70
|
|
71 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
72 the bugs and their resolution. Bug reports can be submitted via email
|
|
73 or the web:
|
|
74
|
|
75 bioperl-bugs@bio.perl.org
|
|
76 http://bugzilla.bioperl.org/
|
|
77
|
|
78 =head1 AUTHOR - Hilmar Lapp
|
|
79
|
|
80 Email hlapp@gmx.net
|
|
81
|
|
82 Describe contact details here
|
|
83
|
|
84 =head1 APPENDIX
|
|
85
|
|
86 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
|
|
87
|
|
88 =cut
|
|
89
|
|
90
|
|
91 # Let the code begin...
|
|
92
|
|
93
|
|
94 package Bio::Tools::Genscan;
|
|
95 use vars qw(@ISA);
|
|
96 use strict;
|
|
97 use Symbol;
|
|
98
|
|
99 use Bio::Root::Root;
|
|
100 use Bio::Tools::AnalysisResult;
|
|
101 use Bio::Tools::Prediction::Gene;
|
|
102 use Bio::Tools::Prediction::Exon;
|
|
103
|
|
104 @ISA = qw(Bio::Tools::AnalysisResult);
|
|
105
|
|
106 sub _initialize_state {
|
|
107 my ($self,@args) = @_;
|
|
108
|
|
109 # first call the inherited method!
|
|
110 $self->SUPER::_initialize_state(@args);
|
|
111
|
|
112 # our private state variables
|
|
113 $self->{'_preds_parsed'} = 0;
|
|
114 $self->{'_has_cds'} = 0;
|
|
115 # array of pre-parsed predictions
|
|
116 $self->{'_preds'} = [];
|
|
117 # seq stack
|
|
118 $self->{'_seqstack'} = [];
|
|
119 }
|
|
120
|
|
121 =head2 analysis_method
|
|
122
|
|
123 Usage : $genscan->analysis_method();
|
|
124 Purpose : Inherited method. Overridden to ensure that the name matches
|
|
125 /genscan/i.
|
|
126 Returns : String
|
|
127 Argument : n/a
|
|
128
|
|
129 =cut
|
|
130
|
|
131 #-------------
|
|
132 sub analysis_method {
|
|
133 #-------------
|
|
134 my ($self, $method) = @_;
|
|
135 if($method && ($method !~ /genscan/i)) {
|
|
136 $self->throw("method $method not supported in " . ref($self));
|
|
137 }
|
|
138 return $self->SUPER::analysis_method($method);
|
|
139 }
|
|
140
|
|
141 =head2 next_feature
|
|
142
|
|
143 Title : next_feature
|
|
144 Usage : while($gene = $genscan->next_feature()) {
|
|
145 # do something
|
|
146 }
|
|
147 Function: Returns the next gene structure prediction of the Genscan result
|
|
148 file. Call this method repeatedly until FALSE is returned.
|
|
149
|
|
150 The returned object is actually a SeqFeatureI implementing object.
|
|
151 This method is required for classes implementing the
|
|
152 SeqAnalysisParserI interface, and is merely an alias for
|
|
153 next_prediction() at present.
|
|
154
|
|
155 Example :
|
|
156 Returns : A Bio::Tools::Prediction::Gene object.
|
|
157 Args :
|
|
158
|
|
159 =cut
|
|
160
|
|
161 sub next_feature {
|
|
162 my ($self,@args) = @_;
|
|
163 # even though next_prediction doesn't expect any args (and this method
|
|
164 # does neither), we pass on args in order to be prepared if this changes
|
|
165 # ever
|
|
166 return $self->next_prediction(@args);
|
|
167 }
|
|
168
|
|
169 =head2 next_prediction
|
|
170
|
|
171 Title : next_prediction
|
|
172 Usage : while($gene = $genscan->next_prediction()) {
|
|
173 # do something
|
|
174 }
|
|
175 Function: Returns the next gene structure prediction of the Genscan result
|
|
176 file. Call this method repeatedly until FALSE is returned.
|
|
177
|
|
178 Example :
|
|
179 Returns : A Bio::Tools::Prediction::Gene object.
|
|
180 Args :
|
|
181
|
|
182 =cut
|
|
183
|
|
184 sub next_prediction {
|
|
185 my ($self) = @_;
|
|
186 my $gene;
|
|
187
|
|
188 # if the prediction section hasn't been parsed yet, we do this now
|
|
189 $self->_parse_predictions() unless $self->_predictions_parsed();
|
|
190
|
|
191 # get next gene structure
|
|
192 $gene = $self->_prediction();
|
|
193
|
|
194 if($gene) {
|
|
195 # fill in predicted protein, and if available the predicted CDS
|
|
196 #
|
|
197 my ($id, $seq);
|
|
198 # use the seq stack if there's a seq on it
|
|
199 my $seqobj = pop(@{$self->{'_seqstack'}});
|
|
200 if(! $seqobj) {
|
|
201 # otherwise read from input stream
|
|
202 ($id, $seq) = $self->_read_fasta_seq();
|
|
203 # there may be no sequence at all, or none any more
|
|
204 if($id && $seq) {
|
|
205 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
|
|
206 '-display_id' => $id,
|
|
207 '-alphabet' => "protein");
|
|
208 }
|
|
209 }
|
|
210 if($seqobj) {
|
|
211 # check that prediction number matches the prediction number
|
|
212 # indicated in the sequence id (there may be incomplete gene
|
|
213 # predictions that contain only signals with no associated protein
|
|
214 # and CDS, like promoters, poly-A sites etc)
|
|
215 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
|
|
216 my $prednr = $1;
|
|
217 if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) {
|
|
218 # this is not our sequence, so push back for next prediction
|
|
219 push(@{$self->{'_seqstack'}}, $seqobj);
|
|
220 } else {
|
|
221 $gene->predicted_protein($seqobj);
|
|
222 # CDS prediction, too?
|
|
223 if($self->_has_cds()) {
|
|
224 ($id, $seq) = $self->_read_fasta_seq();
|
|
225 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
|
|
226 '-display_id' => $id,
|
|
227 '-alphabet' => "dna");
|
|
228 $gene->predicted_cds($seqobj);
|
|
229 }
|
|
230 }
|
|
231 }
|
|
232 }
|
|
233
|
|
234 return $gene;
|
|
235 }
|
|
236
|
|
237 =head2 _parse_predictions
|
|
238
|
|
239 Title : _parse_predictions()
|
|
240 Usage : $obj->_parse_predictions()
|
|
241 Function: Parses the prediction section. Automatically called by
|
|
242 next_prediction() if not yet done.
|
|
243 Example :
|
|
244 Returns :
|
|
245
|
|
246 =cut
|
|
247
|
|
248 sub _parse_predictions {
|
|
249 my ($self) = @_;
|
|
250 my %exontags = ('Init' => 'Initial',
|
|
251 'Intr' => 'Internal',
|
|
252 'Term' => 'Terminal',
|
|
253 'Sngl' => '');
|
|
254 my $gene;
|
|
255 my $seqname;
|
|
256
|
|
257 while(defined($_ = $self->_readline())) {
|
|
258 if(/^\s*(\d+)\.(\d+)/) {
|
|
259 # exon or signal
|
|
260 my $prednr = $1;
|
|
261 my $signalnr = $2; # not used presently
|
|
262 if(! defined($gene)) {
|
|
263 $gene = Bio::Tools::Prediction::Gene->new(
|
|
264 '-primary' => "GenePrediction$prednr",
|
|
265 '-source' => 'Genscan');
|
|
266 }
|
|
267 # split into fields
|
|
268 chomp();
|
|
269 my @flds = split(' ', $_);
|
|
270 # create the feature object depending on the type of signal
|
|
271 my $predobj;
|
|
272 my $is_exon = grep {$_ eq $flds[1];} (keys(%exontags));
|
|
273 if($is_exon) {
|
|
274 $predobj = Bio::Tools::Prediction::Exon->new();
|
|
275 } else {
|
|
276 # PolyA site, or Promoter
|
|
277 $predobj = Bio::SeqFeature::Generic->new();
|
|
278 }
|
|
279 # set common fields
|
|
280 $predobj->source_tag('Genscan');
|
|
281 $predobj->score($flds[$#flds]);
|
|
282 $predobj->strand((($flds[2] eq '+') ? 1 : -1));
|
|
283 my ($start, $end) = @flds[(3,4)];
|
|
284 if($predobj->strand() == 1) {
|
|
285 $predobj->start($start);
|
|
286 $predobj->end($end);
|
|
287 } else {
|
|
288 $predobj->end($start);
|
|
289 $predobj->start($end);
|
|
290 }
|
|
291 # add to gene structure (should be done only when start and end
|
|
292 # are set, in order to allow for proper expansion of the range)
|
|
293 if($is_exon) {
|
|
294 # first, set fields unique to exons
|
|
295 $predobj->start_signal_score($flds[8]);
|
|
296 $predobj->end_signal_score($flds[9]);
|
|
297 $predobj->coding_signal_score($flds[10]);
|
|
298 $predobj->significance($flds[11]);
|
|
299 $predobj->primary_tag($exontags{$flds[1]} . 'Exon');
|
|
300 $predobj->is_coding(1);
|
|
301 # Figure out the frame of this exon. This is NOT the frame
|
|
302 # given by Genscan, which is the absolute frame of the base
|
|
303 # starting the first predicted complete codon. By comparing
|
|
304 # to the absolute frame of the first base we can compute the
|
|
305 # offset of the first complete codon to the first base of the
|
|
306 # exon, which determines the frame of the exon.
|
|
307 my $cod_offset;
|
|
308 if($predobj->strand() == 1) {
|
|
309 $cod_offset = $flds[6] - (($predobj->start()-1) % 3);
|
|
310 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
|
|
311 # to offsets 2 and 1, resp. Offset 3 is the same as 0.
|
|
312 $cod_offset += 3 if($cod_offset < 1);
|
|
313 } else {
|
|
314 # On the reverse strand the Genscan frame also refers to
|
|
315 # the first base of the first complete codon, but viewed
|
|
316 # from forward, which is the third base viewed from
|
|
317 # reverse.
|
|
318 $cod_offset = $flds[6] - (($predobj->end()-3) % 3);
|
|
319 # Possible values are -2, -1, 0, 1, 2. Due to the reverse
|
|
320 # situation, {2,-1} and {1,-2} correspond to offsets
|
|
321 # 1 and 2, resp. Offset 3 is the same as 0.
|
|
322 $cod_offset -= 3 if($cod_offset >= 0);
|
|
323 $cod_offset = -$cod_offset;
|
|
324 }
|
|
325 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
|
|
326 # is the frame of the first base relative to the exon, or the
|
|
327 # number of bases the first codon is missing).
|
|
328 $predobj->frame(3 - $cod_offset);
|
|
329 # then add to gene structure object
|
|
330 $gene->add_exon($predobj, $exontags{$flds[1]});
|
|
331 } elsif($flds[1] eq 'PlyA') {
|
|
332 $predobj->primary_tag("PolyAsite");
|
|
333 $gene->poly_A_site($predobj);
|
|
334 } elsif($flds[1] eq 'Prom') {
|
|
335 $predobj->primary_tag("Promoter");
|
|
336 $gene->add_promoter($predobj);
|
|
337 }
|
|
338 next;
|
|
339 }
|
|
340 if(/^\s*$/ && defined($gene)) {
|
|
341 # current gene is completed
|
|
342 $gene->seq_id($seqname);
|
|
343 $self->_add_prediction($gene);
|
|
344 $gene = undef;
|
|
345 next;
|
|
346 }
|
|
347 if(/^(GENSCAN)\s+(\S+)/) {
|
|
348 $self->analysis_method($1);
|
|
349 $self->analysis_method_version($2);
|
|
350 next;
|
|
351 }
|
|
352 if(/^Sequence\s+(\S+)\s*:/) {
|
|
353 $seqname = $1;
|
|
354 next;
|
|
355 }
|
|
356
|
|
357 if(/^Parameter matrix:\s+(\S+)/i) {
|
|
358 $self->analysis_subject($1);
|
|
359 next;
|
|
360 }
|
|
361
|
|
362 if(/^Predicted coding/) {
|
|
363 $self->_has_cds(1);
|
|
364 next;
|
|
365 }
|
|
366 /^>/ && do {
|
|
367 # section of predicted sequences
|
|
368 $self->_pushback($_);
|
|
369 last;
|
|
370 };
|
|
371 }
|
|
372 $self->_predictions_parsed(1);
|
|
373 }
|
|
374
|
|
375 =head2 _prediction
|
|
376
|
|
377 Title : _prediction()
|
|
378 Usage : $gene = $obj->_prediction()
|
|
379 Function: internal
|
|
380 Example :
|
|
381 Returns :
|
|
382
|
|
383 =cut
|
|
384
|
|
385 sub _prediction {
|
|
386 my ($self) = @_;
|
|
387
|
|
388 return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
|
|
389 return shift(@{$self->{'_preds'}});
|
|
390 }
|
|
391
|
|
392 =head2 _add_prediction
|
|
393
|
|
394 Title : _add_prediction()
|
|
395 Usage : $obj->_add_prediction($gene)
|
|
396 Function: internal
|
|
397 Example :
|
|
398 Returns :
|
|
399
|
|
400 =cut
|
|
401
|
|
402 sub _add_prediction {
|
|
403 my ($self, $gene) = @_;
|
|
404
|
|
405 if(! exists($self->{'_preds'})) {
|
|
406 $self->{'_preds'} = [];
|
|
407 }
|
|
408 push(@{$self->{'_preds'}}, $gene);
|
|
409 }
|
|
410
|
|
411 =head2 _predictions_parsed
|
|
412
|
|
413 Title : _predictions_parsed
|
|
414 Usage : $obj->_predictions_parsed
|
|
415 Function: internal
|
|
416 Example :
|
|
417 Returns : TRUE or FALSE
|
|
418
|
|
419 =cut
|
|
420
|
|
421 sub _predictions_parsed {
|
|
422 my ($self, $val) = @_;
|
|
423
|
|
424 $self->{'_preds_parsed'} = $val if $val;
|
|
425 if(! exists($self->{'_preds_parsed'})) {
|
|
426 $self->{'_preds_parsed'} = 0;
|
|
427 }
|
|
428 return $self->{'_preds_parsed'};
|
|
429 }
|
|
430
|
|
431 =head2 _has_cds
|
|
432
|
|
433 Title : _has_cds()
|
|
434 Usage : $obj->_has_cds()
|
|
435 Function: Whether or not the result contains the predicted CDSs, too.
|
|
436 Example :
|
|
437 Returns : TRUE or FALSE
|
|
438
|
|
439 =cut
|
|
440
|
|
441 sub _has_cds {
|
|
442 my ($self, $val) = @_;
|
|
443
|
|
444 $self->{'_has_cds'} = $val if $val;
|
|
445 if(! exists($self->{'_has_cds'})) {
|
|
446 $self->{'_has_cds'} = 0;
|
|
447 }
|
|
448 return $self->{'_has_cds'};
|
|
449 }
|
|
450
|
|
451 =head2 _read_fasta_seq
|
|
452
|
|
453 Title : _read_fasta_seq()
|
|
454 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
|
|
455 Function: Simple but specialised FASTA format sequence reader. Uses
|
|
456 $self->_readline() to retrieve input, and is able to strip off
|
|
457 the traling description lines.
|
|
458 Example :
|
|
459 Returns : An array of two elements.
|
|
460
|
|
461 =cut
|
|
462
|
|
463 sub _read_fasta_seq {
|
|
464 my ($self) = @_;
|
|
465 my ($id, $seq);
|
|
466 local $/ = ">";
|
|
467
|
|
468 my $entry = $self->_readline();
|
|
469 if($entry) {
|
|
470 $entry =~ s/^>//;
|
|
471 # complete the entry if the first line came from a pushback buffer
|
|
472 while($entry !~ />$/) {
|
|
473 last unless $_ = $self->_readline();
|
|
474 $entry .= $_;
|
|
475 }
|
|
476 # delete everything onwards from an intervening empty line (at the
|
|
477 # end there might be statistics stuff)
|
|
478 $entry =~ s/\n\n.*$//s;
|
|
479 # id and sequence
|
|
480 if($entry =~ /^(\S+)\n([^>]+)/) {
|
|
481 $id = $1;
|
|
482 $seq = $2;
|
|
483 } else {
|
|
484 $self->throw("Can't parse Genscan predicted sequence entry");
|
|
485 }
|
|
486 $seq =~ s/\s//g; # Remove whitespace
|
|
487 }
|
|
488 return ($id, $seq);
|
|
489 }
|
|
490
|
|
491 1;
|