comparison variant_effect_predictor/Bio/Tools/Genscan.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: Genscan.pm,v 1.22 2002/10/22 07:38:46 lapp Exp $
2 #
3 # BioPerl module for Bio::Tools::Genscan
4 #
5 # Cared for by Hilmar Lapp <hlapp@gmx.net>
6 #
7 # Copyright Hilmar Lapp
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Tools::Genscan - Results of one Genscan run
16
17 =head1 SYNOPSIS
18
19 $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan');
20 # filehandle:
21 $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT );
22
23 # parse the results
24 # note: this class is-a Bio::Tools::AnalysisResult which implements
25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same
26 while($gene = $genscan->next_prediction()) {
27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits
28 # off Bio::SeqFeature::Gene::Transcript.
29 #
30 # $gene->exons() returns an array of
31 # Bio::Tools::Prediction::Exon objects
32 # all exons:
33 @exon_arr = $gene->exons();
34
35 # initial exons only
36 @init_exons = $gene->exons('Initial');
37 # internal exons only
38 @intrl_exons = $gene->exons('Internal');
39 # terminal exons only
40 @term_exons = $gene->exons('Terminal');
41 # singleton exons:
42 ($single_exon) = $gene->exons();
43 }
44
45 # essential if you gave a filename at initialization (otherwise the file
46 # will stay open)
47 $genscan->close();
48
49 =head1 DESCRIPTION
50
51 The Genscan module provides a parser for Genscan gene structure prediction
52 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript-
53 derived object.
54
55 This module also implements the Bio::SeqAnalysisParserI interface, and thus
56 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>.
57
58 =head1 FEEDBACK
59
60 =head2 Mailing Lists
61
62 User feedback is an integral part of the evolution of this and other
63 Bioperl modules. Send your comments and suggestions preferably to one
64 of the Bioperl mailing lists. Your participation is much appreciated.
65
66 bioperl-l@bioperl.org - General discussion
67 http://bio.perl.org/MailList.html - About the mailing lists
68
69 =head2 Reporting Bugs
70
71 Report bugs to the Bioperl bug tracking system to help us keep track
72 the bugs and their resolution. Bug reports can be submitted via email
73 or the web:
74
75 bioperl-bugs@bio.perl.org
76 http://bugzilla.bioperl.org/
77
78 =head1 AUTHOR - Hilmar Lapp
79
80 Email hlapp@gmx.net
81
82 Describe contact details here
83
84 =head1 APPENDIX
85
86 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
87
88 =cut
89
90
91 # Let the code begin...
92
93
94 package Bio::Tools::Genscan;
95 use vars qw(@ISA);
96 use strict;
97 use Symbol;
98
99 use Bio::Root::Root;
100 use Bio::Tools::AnalysisResult;
101 use Bio::Tools::Prediction::Gene;
102 use Bio::Tools::Prediction::Exon;
103
104 @ISA = qw(Bio::Tools::AnalysisResult);
105
106 sub _initialize_state {
107 my ($self,@args) = @_;
108
109 # first call the inherited method!
110 $self->SUPER::_initialize_state(@args);
111
112 # our private state variables
113 $self->{'_preds_parsed'} = 0;
114 $self->{'_has_cds'} = 0;
115 # array of pre-parsed predictions
116 $self->{'_preds'} = [];
117 # seq stack
118 $self->{'_seqstack'} = [];
119 }
120
121 =head2 analysis_method
122
123 Usage : $genscan->analysis_method();
124 Purpose : Inherited method. Overridden to ensure that the name matches
125 /genscan/i.
126 Returns : String
127 Argument : n/a
128
129 =cut
130
131 #-------------
132 sub analysis_method {
133 #-------------
134 my ($self, $method) = @_;
135 if($method && ($method !~ /genscan/i)) {
136 $self->throw("method $method not supported in " . ref($self));
137 }
138 return $self->SUPER::analysis_method($method);
139 }
140
141 =head2 next_feature
142
143 Title : next_feature
144 Usage : while($gene = $genscan->next_feature()) {
145 # do something
146 }
147 Function: Returns the next gene structure prediction of the Genscan result
148 file. Call this method repeatedly until FALSE is returned.
149
150 The returned object is actually a SeqFeatureI implementing object.
151 This method is required for classes implementing the
152 SeqAnalysisParserI interface, and is merely an alias for
153 next_prediction() at present.
154
155 Example :
156 Returns : A Bio::Tools::Prediction::Gene object.
157 Args :
158
159 =cut
160
161 sub next_feature {
162 my ($self,@args) = @_;
163 # even though next_prediction doesn't expect any args (and this method
164 # does neither), we pass on args in order to be prepared if this changes
165 # ever
166 return $self->next_prediction(@args);
167 }
168
169 =head2 next_prediction
170
171 Title : next_prediction
172 Usage : while($gene = $genscan->next_prediction()) {
173 # do something
174 }
175 Function: Returns the next gene structure prediction of the Genscan result
176 file. Call this method repeatedly until FALSE is returned.
177
178 Example :
179 Returns : A Bio::Tools::Prediction::Gene object.
180 Args :
181
182 =cut
183
184 sub next_prediction {
185 my ($self) = @_;
186 my $gene;
187
188 # if the prediction section hasn't been parsed yet, we do this now
189 $self->_parse_predictions() unless $self->_predictions_parsed();
190
191 # get next gene structure
192 $gene = $self->_prediction();
193
194 if($gene) {
195 # fill in predicted protein, and if available the predicted CDS
196 #
197 my ($id, $seq);
198 # use the seq stack if there's a seq on it
199 my $seqobj = pop(@{$self->{'_seqstack'}});
200 if(! $seqobj) {
201 # otherwise read from input stream
202 ($id, $seq) = $self->_read_fasta_seq();
203 # there may be no sequence at all, or none any more
204 if($id && $seq) {
205 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
206 '-display_id' => $id,
207 '-alphabet' => "protein");
208 }
209 }
210 if($seqobj) {
211 # check that prediction number matches the prediction number
212 # indicated in the sequence id (there may be incomplete gene
213 # predictions that contain only signals with no associated protein
214 # and CDS, like promoters, poly-A sites etc)
215 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/;
216 my $prednr = $1;
217 if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) {
218 # this is not our sequence, so push back for next prediction
219 push(@{$self->{'_seqstack'}}, $seqobj);
220 } else {
221 $gene->predicted_protein($seqobj);
222 # CDS prediction, too?
223 if($self->_has_cds()) {
224 ($id, $seq) = $self->_read_fasta_seq();
225 $seqobj = Bio::PrimarySeq->new('-seq' => $seq,
226 '-display_id' => $id,
227 '-alphabet' => "dna");
228 $gene->predicted_cds($seqobj);
229 }
230 }
231 }
232 }
233
234 return $gene;
235 }
236
237 =head2 _parse_predictions
238
239 Title : _parse_predictions()
240 Usage : $obj->_parse_predictions()
241 Function: Parses the prediction section. Automatically called by
242 next_prediction() if not yet done.
243 Example :
244 Returns :
245
246 =cut
247
248 sub _parse_predictions {
249 my ($self) = @_;
250 my %exontags = ('Init' => 'Initial',
251 'Intr' => 'Internal',
252 'Term' => 'Terminal',
253 'Sngl' => '');
254 my $gene;
255 my $seqname;
256
257 while(defined($_ = $self->_readline())) {
258 if(/^\s*(\d+)\.(\d+)/) {
259 # exon or signal
260 my $prednr = $1;
261 my $signalnr = $2; # not used presently
262 if(! defined($gene)) {
263 $gene = Bio::Tools::Prediction::Gene->new(
264 '-primary' => "GenePrediction$prednr",
265 '-source' => 'Genscan');
266 }
267 # split into fields
268 chomp();
269 my @flds = split(' ', $_);
270 # create the feature object depending on the type of signal
271 my $predobj;
272 my $is_exon = grep {$_ eq $flds[1];} (keys(%exontags));
273 if($is_exon) {
274 $predobj = Bio::Tools::Prediction::Exon->new();
275 } else {
276 # PolyA site, or Promoter
277 $predobj = Bio::SeqFeature::Generic->new();
278 }
279 # set common fields
280 $predobj->source_tag('Genscan');
281 $predobj->score($flds[$#flds]);
282 $predobj->strand((($flds[2] eq '+') ? 1 : -1));
283 my ($start, $end) = @flds[(3,4)];
284 if($predobj->strand() == 1) {
285 $predobj->start($start);
286 $predobj->end($end);
287 } else {
288 $predobj->end($start);
289 $predobj->start($end);
290 }
291 # add to gene structure (should be done only when start and end
292 # are set, in order to allow for proper expansion of the range)
293 if($is_exon) {
294 # first, set fields unique to exons
295 $predobj->start_signal_score($flds[8]);
296 $predobj->end_signal_score($flds[9]);
297 $predobj->coding_signal_score($flds[10]);
298 $predobj->significance($flds[11]);
299 $predobj->primary_tag($exontags{$flds[1]} . 'Exon');
300 $predobj->is_coding(1);
301 # Figure out the frame of this exon. This is NOT the frame
302 # given by Genscan, which is the absolute frame of the base
303 # starting the first predicted complete codon. By comparing
304 # to the absolute frame of the first base we can compute the
305 # offset of the first complete codon to the first base of the
306 # exon, which determines the frame of the exon.
307 my $cod_offset;
308 if($predobj->strand() == 1) {
309 $cod_offset = $flds[6] - (($predobj->start()-1) % 3);
310 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond
311 # to offsets 2 and 1, resp. Offset 3 is the same as 0.
312 $cod_offset += 3 if($cod_offset < 1);
313 } else {
314 # On the reverse strand the Genscan frame also refers to
315 # the first base of the first complete codon, but viewed
316 # from forward, which is the third base viewed from
317 # reverse.
318 $cod_offset = $flds[6] - (($predobj->end()-3) % 3);
319 # Possible values are -2, -1, 0, 1, 2. Due to the reverse
320 # situation, {2,-1} and {1,-2} correspond to offsets
321 # 1 and 2, resp. Offset 3 is the same as 0.
322 $cod_offset -= 3 if($cod_offset >= 0);
323 $cod_offset = -$cod_offset;
324 }
325 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon
326 # is the frame of the first base relative to the exon, or the
327 # number of bases the first codon is missing).
328 $predobj->frame(3 - $cod_offset);
329 # then add to gene structure object
330 $gene->add_exon($predobj, $exontags{$flds[1]});
331 } elsif($flds[1] eq 'PlyA') {
332 $predobj->primary_tag("PolyAsite");
333 $gene->poly_A_site($predobj);
334 } elsif($flds[1] eq 'Prom') {
335 $predobj->primary_tag("Promoter");
336 $gene->add_promoter($predobj);
337 }
338 next;
339 }
340 if(/^\s*$/ && defined($gene)) {
341 # current gene is completed
342 $gene->seq_id($seqname);
343 $self->_add_prediction($gene);
344 $gene = undef;
345 next;
346 }
347 if(/^(GENSCAN)\s+(\S+)/) {
348 $self->analysis_method($1);
349 $self->analysis_method_version($2);
350 next;
351 }
352 if(/^Sequence\s+(\S+)\s*:/) {
353 $seqname = $1;
354 next;
355 }
356
357 if(/^Parameter matrix:\s+(\S+)/i) {
358 $self->analysis_subject($1);
359 next;
360 }
361
362 if(/^Predicted coding/) {
363 $self->_has_cds(1);
364 next;
365 }
366 /^>/ && do {
367 # section of predicted sequences
368 $self->_pushback($_);
369 last;
370 };
371 }
372 $self->_predictions_parsed(1);
373 }
374
375 =head2 _prediction
376
377 Title : _prediction()
378 Usage : $gene = $obj->_prediction()
379 Function: internal
380 Example :
381 Returns :
382
383 =cut
384
385 sub _prediction {
386 my ($self) = @_;
387
388 return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}});
389 return shift(@{$self->{'_preds'}});
390 }
391
392 =head2 _add_prediction
393
394 Title : _add_prediction()
395 Usage : $obj->_add_prediction($gene)
396 Function: internal
397 Example :
398 Returns :
399
400 =cut
401
402 sub _add_prediction {
403 my ($self, $gene) = @_;
404
405 if(! exists($self->{'_preds'})) {
406 $self->{'_preds'} = [];
407 }
408 push(@{$self->{'_preds'}}, $gene);
409 }
410
411 =head2 _predictions_parsed
412
413 Title : _predictions_parsed
414 Usage : $obj->_predictions_parsed
415 Function: internal
416 Example :
417 Returns : TRUE or FALSE
418
419 =cut
420
421 sub _predictions_parsed {
422 my ($self, $val) = @_;
423
424 $self->{'_preds_parsed'} = $val if $val;
425 if(! exists($self->{'_preds_parsed'})) {
426 $self->{'_preds_parsed'} = 0;
427 }
428 return $self->{'_preds_parsed'};
429 }
430
431 =head2 _has_cds
432
433 Title : _has_cds()
434 Usage : $obj->_has_cds()
435 Function: Whether or not the result contains the predicted CDSs, too.
436 Example :
437 Returns : TRUE or FALSE
438
439 =cut
440
441 sub _has_cds {
442 my ($self, $val) = @_;
443
444 $self->{'_has_cds'} = $val if $val;
445 if(! exists($self->{'_has_cds'})) {
446 $self->{'_has_cds'} = 0;
447 }
448 return $self->{'_has_cds'};
449 }
450
451 =head2 _read_fasta_seq
452
453 Title : _read_fasta_seq()
454 Usage : ($id,$seqstr) = $obj->_read_fasta_seq();
455 Function: Simple but specialised FASTA format sequence reader. Uses
456 $self->_readline() to retrieve input, and is able to strip off
457 the traling description lines.
458 Example :
459 Returns : An array of two elements.
460
461 =cut
462
463 sub _read_fasta_seq {
464 my ($self) = @_;
465 my ($id, $seq);
466 local $/ = ">";
467
468 my $entry = $self->_readline();
469 if($entry) {
470 $entry =~ s/^>//;
471 # complete the entry if the first line came from a pushback buffer
472 while($entry !~ />$/) {
473 last unless $_ = $self->_readline();
474 $entry .= $_;
475 }
476 # delete everything onwards from an intervening empty line (at the
477 # end there might be statistics stuff)
478 $entry =~ s/\n\n.*$//s;
479 # id and sequence
480 if($entry =~ /^(\S+)\n([^>]+)/) {
481 $id = $1;
482 $seq = $2;
483 } else {
484 $self->throw("Can't parse Genscan predicted sequence entry");
485 }
486 $seq =~ s/\s//g; # Remove whitespace
487 }
488 return ($id, $seq);
489 }
490
491 1;