Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Genscan.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: Genscan.pm,v 1.22 2002/10/22 07:38:46 lapp Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Tools::Genscan | |
| 4 # | |
| 5 # Cared for by Hilmar Lapp <hlapp@gmx.net> | |
| 6 # | |
| 7 # Copyright Hilmar Lapp | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::Tools::Genscan - Results of one Genscan run | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 $genscan = Bio::Tools::Genscan->new(-file => 'result.genscan'); | |
| 20 # filehandle: | |
| 21 $genscan = Bio::Tools::Genscan->new( -fh => \*INPUT ); | |
| 22 | |
| 23 # parse the results | |
| 24 # note: this class is-a Bio::Tools::AnalysisResult which implements | |
| 25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same | |
| 26 while($gene = $genscan->next_prediction()) { | |
| 27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits | |
| 28 # off Bio::SeqFeature::Gene::Transcript. | |
| 29 # | |
| 30 # $gene->exons() returns an array of | |
| 31 # Bio::Tools::Prediction::Exon objects | |
| 32 # all exons: | |
| 33 @exon_arr = $gene->exons(); | |
| 34 | |
| 35 # initial exons only | |
| 36 @init_exons = $gene->exons('Initial'); | |
| 37 # internal exons only | |
| 38 @intrl_exons = $gene->exons('Internal'); | |
| 39 # terminal exons only | |
| 40 @term_exons = $gene->exons('Terminal'); | |
| 41 # singleton exons: | |
| 42 ($single_exon) = $gene->exons(); | |
| 43 } | |
| 44 | |
| 45 # essential if you gave a filename at initialization (otherwise the file | |
| 46 # will stay open) | |
| 47 $genscan->close(); | |
| 48 | |
| 49 =head1 DESCRIPTION | |
| 50 | |
| 51 The Genscan module provides a parser for Genscan gene structure prediction | |
| 52 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript- | |
| 53 derived object. | |
| 54 | |
| 55 This module also implements the Bio::SeqAnalysisParserI interface, and thus | |
| 56 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>. | |
| 57 | |
| 58 =head1 FEEDBACK | |
| 59 | |
| 60 =head2 Mailing Lists | |
| 61 | |
| 62 User feedback is an integral part of the evolution of this and other | |
| 63 Bioperl modules. Send your comments and suggestions preferably to one | |
| 64 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 65 | |
| 66 bioperl-l@bioperl.org - General discussion | |
| 67 http://bio.perl.org/MailList.html - About the mailing lists | |
| 68 | |
| 69 =head2 Reporting Bugs | |
| 70 | |
| 71 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 72 the bugs and their resolution. Bug reports can be submitted via email | |
| 73 or the web: | |
| 74 | |
| 75 bioperl-bugs@bio.perl.org | |
| 76 http://bugzilla.bioperl.org/ | |
| 77 | |
| 78 =head1 AUTHOR - Hilmar Lapp | |
| 79 | |
| 80 Email hlapp@gmx.net | |
| 81 | |
| 82 Describe contact details here | |
| 83 | |
| 84 =head1 APPENDIX | |
| 85 | |
| 86 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
| 87 | |
| 88 =cut | |
| 89 | |
| 90 | |
| 91 # Let the code begin... | |
| 92 | |
| 93 | |
| 94 package Bio::Tools::Genscan; | |
| 95 use vars qw(@ISA); | |
| 96 use strict; | |
| 97 use Symbol; | |
| 98 | |
| 99 use Bio::Root::Root; | |
| 100 use Bio::Tools::AnalysisResult; | |
| 101 use Bio::Tools::Prediction::Gene; | |
| 102 use Bio::Tools::Prediction::Exon; | |
| 103 | |
| 104 @ISA = qw(Bio::Tools::AnalysisResult); | |
| 105 | |
| 106 sub _initialize_state { | |
| 107 my ($self,@args) = @_; | |
| 108 | |
| 109 # first call the inherited method! | |
| 110 $self->SUPER::_initialize_state(@args); | |
| 111 | |
| 112 # our private state variables | |
| 113 $self->{'_preds_parsed'} = 0; | |
| 114 $self->{'_has_cds'} = 0; | |
| 115 # array of pre-parsed predictions | |
| 116 $self->{'_preds'} = []; | |
| 117 # seq stack | |
| 118 $self->{'_seqstack'} = []; | |
| 119 } | |
| 120 | |
| 121 =head2 analysis_method | |
| 122 | |
| 123 Usage : $genscan->analysis_method(); | |
| 124 Purpose : Inherited method. Overridden to ensure that the name matches | |
| 125 /genscan/i. | |
| 126 Returns : String | |
| 127 Argument : n/a | |
| 128 | |
| 129 =cut | |
| 130 | |
| 131 #------------- | |
| 132 sub analysis_method { | |
| 133 #------------- | |
| 134 my ($self, $method) = @_; | |
| 135 if($method && ($method !~ /genscan/i)) { | |
| 136 $self->throw("method $method not supported in " . ref($self)); | |
| 137 } | |
| 138 return $self->SUPER::analysis_method($method); | |
| 139 } | |
| 140 | |
| 141 =head2 next_feature | |
| 142 | |
| 143 Title : next_feature | |
| 144 Usage : while($gene = $genscan->next_feature()) { | |
| 145 # do something | |
| 146 } | |
| 147 Function: Returns the next gene structure prediction of the Genscan result | |
| 148 file. Call this method repeatedly until FALSE is returned. | |
| 149 | |
| 150 The returned object is actually a SeqFeatureI implementing object. | |
| 151 This method is required for classes implementing the | |
| 152 SeqAnalysisParserI interface, and is merely an alias for | |
| 153 next_prediction() at present. | |
| 154 | |
| 155 Example : | |
| 156 Returns : A Bio::Tools::Prediction::Gene object. | |
| 157 Args : | |
| 158 | |
| 159 =cut | |
| 160 | |
| 161 sub next_feature { | |
| 162 my ($self,@args) = @_; | |
| 163 # even though next_prediction doesn't expect any args (and this method | |
| 164 # does neither), we pass on args in order to be prepared if this changes | |
| 165 # ever | |
| 166 return $self->next_prediction(@args); | |
| 167 } | |
| 168 | |
| 169 =head2 next_prediction | |
| 170 | |
| 171 Title : next_prediction | |
| 172 Usage : while($gene = $genscan->next_prediction()) { | |
| 173 # do something | |
| 174 } | |
| 175 Function: Returns the next gene structure prediction of the Genscan result | |
| 176 file. Call this method repeatedly until FALSE is returned. | |
| 177 | |
| 178 Example : | |
| 179 Returns : A Bio::Tools::Prediction::Gene object. | |
| 180 Args : | |
| 181 | |
| 182 =cut | |
| 183 | |
| 184 sub next_prediction { | |
| 185 my ($self) = @_; | |
| 186 my $gene; | |
| 187 | |
| 188 # if the prediction section hasn't been parsed yet, we do this now | |
| 189 $self->_parse_predictions() unless $self->_predictions_parsed(); | |
| 190 | |
| 191 # get next gene structure | |
| 192 $gene = $self->_prediction(); | |
| 193 | |
| 194 if($gene) { | |
| 195 # fill in predicted protein, and if available the predicted CDS | |
| 196 # | |
| 197 my ($id, $seq); | |
| 198 # use the seq stack if there's a seq on it | |
| 199 my $seqobj = pop(@{$self->{'_seqstack'}}); | |
| 200 if(! $seqobj) { | |
| 201 # otherwise read from input stream | |
| 202 ($id, $seq) = $self->_read_fasta_seq(); | |
| 203 # there may be no sequence at all, or none any more | |
| 204 if($id && $seq) { | |
| 205 $seqobj = Bio::PrimarySeq->new('-seq' => $seq, | |
| 206 '-display_id' => $id, | |
| 207 '-alphabet' => "protein"); | |
| 208 } | |
| 209 } | |
| 210 if($seqobj) { | |
| 211 # check that prediction number matches the prediction number | |
| 212 # indicated in the sequence id (there may be incomplete gene | |
| 213 # predictions that contain only signals with no associated protein | |
| 214 # and CDS, like promoters, poly-A sites etc) | |
| 215 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/; | |
| 216 my $prednr = $1; | |
| 217 if($seqobj->display_id() !~ /_predicted_\w+_$prednr\|/) { | |
| 218 # this is not our sequence, so push back for next prediction | |
| 219 push(@{$self->{'_seqstack'}}, $seqobj); | |
| 220 } else { | |
| 221 $gene->predicted_protein($seqobj); | |
| 222 # CDS prediction, too? | |
| 223 if($self->_has_cds()) { | |
| 224 ($id, $seq) = $self->_read_fasta_seq(); | |
| 225 $seqobj = Bio::PrimarySeq->new('-seq' => $seq, | |
| 226 '-display_id' => $id, | |
| 227 '-alphabet' => "dna"); | |
| 228 $gene->predicted_cds($seqobj); | |
| 229 } | |
| 230 } | |
| 231 } | |
| 232 } | |
| 233 | |
| 234 return $gene; | |
| 235 } | |
| 236 | |
| 237 =head2 _parse_predictions | |
| 238 | |
| 239 Title : _parse_predictions() | |
| 240 Usage : $obj->_parse_predictions() | |
| 241 Function: Parses the prediction section. Automatically called by | |
| 242 next_prediction() if not yet done. | |
| 243 Example : | |
| 244 Returns : | |
| 245 | |
| 246 =cut | |
| 247 | |
| 248 sub _parse_predictions { | |
| 249 my ($self) = @_; | |
| 250 my %exontags = ('Init' => 'Initial', | |
| 251 'Intr' => 'Internal', | |
| 252 'Term' => 'Terminal', | |
| 253 'Sngl' => ''); | |
| 254 my $gene; | |
| 255 my $seqname; | |
| 256 | |
| 257 while(defined($_ = $self->_readline())) { | |
| 258 if(/^\s*(\d+)\.(\d+)/) { | |
| 259 # exon or signal | |
| 260 my $prednr = $1; | |
| 261 my $signalnr = $2; # not used presently | |
| 262 if(! defined($gene)) { | |
| 263 $gene = Bio::Tools::Prediction::Gene->new( | |
| 264 '-primary' => "GenePrediction$prednr", | |
| 265 '-source' => 'Genscan'); | |
| 266 } | |
| 267 # split into fields | |
| 268 chomp(); | |
| 269 my @flds = split(' ', $_); | |
| 270 # create the feature object depending on the type of signal | |
| 271 my $predobj; | |
| 272 my $is_exon = grep {$_ eq $flds[1];} (keys(%exontags)); | |
| 273 if($is_exon) { | |
| 274 $predobj = Bio::Tools::Prediction::Exon->new(); | |
| 275 } else { | |
| 276 # PolyA site, or Promoter | |
| 277 $predobj = Bio::SeqFeature::Generic->new(); | |
| 278 } | |
| 279 # set common fields | |
| 280 $predobj->source_tag('Genscan'); | |
| 281 $predobj->score($flds[$#flds]); | |
| 282 $predobj->strand((($flds[2] eq '+') ? 1 : -1)); | |
| 283 my ($start, $end) = @flds[(3,4)]; | |
| 284 if($predobj->strand() == 1) { | |
| 285 $predobj->start($start); | |
| 286 $predobj->end($end); | |
| 287 } else { | |
| 288 $predobj->end($start); | |
| 289 $predobj->start($end); | |
| 290 } | |
| 291 # add to gene structure (should be done only when start and end | |
| 292 # are set, in order to allow for proper expansion of the range) | |
| 293 if($is_exon) { | |
| 294 # first, set fields unique to exons | |
| 295 $predobj->start_signal_score($flds[8]); | |
| 296 $predobj->end_signal_score($flds[9]); | |
| 297 $predobj->coding_signal_score($flds[10]); | |
| 298 $predobj->significance($flds[11]); | |
| 299 $predobj->primary_tag($exontags{$flds[1]} . 'Exon'); | |
| 300 $predobj->is_coding(1); | |
| 301 # Figure out the frame of this exon. This is NOT the frame | |
| 302 # given by Genscan, which is the absolute frame of the base | |
| 303 # starting the first predicted complete codon. By comparing | |
| 304 # to the absolute frame of the first base we can compute the | |
| 305 # offset of the first complete codon to the first base of the | |
| 306 # exon, which determines the frame of the exon. | |
| 307 my $cod_offset; | |
| 308 if($predobj->strand() == 1) { | |
| 309 $cod_offset = $flds[6] - (($predobj->start()-1) % 3); | |
| 310 # Possible values are -2, -1, 0, 1, 2. -1 and -2 correspond | |
| 311 # to offsets 2 and 1, resp. Offset 3 is the same as 0. | |
| 312 $cod_offset += 3 if($cod_offset < 1); | |
| 313 } else { | |
| 314 # On the reverse strand the Genscan frame also refers to | |
| 315 # the first base of the first complete codon, but viewed | |
| 316 # from forward, which is the third base viewed from | |
| 317 # reverse. | |
| 318 $cod_offset = $flds[6] - (($predobj->end()-3) % 3); | |
| 319 # Possible values are -2, -1, 0, 1, 2. Due to the reverse | |
| 320 # situation, {2,-1} and {1,-2} correspond to offsets | |
| 321 # 1 and 2, resp. Offset 3 is the same as 0. | |
| 322 $cod_offset -= 3 if($cod_offset >= 0); | |
| 323 $cod_offset = -$cod_offset; | |
| 324 } | |
| 325 # Offsets 2 and 1 correspond to frame 1 and 2 (frame of exon | |
| 326 # is the frame of the first base relative to the exon, or the | |
| 327 # number of bases the first codon is missing). | |
| 328 $predobj->frame(3 - $cod_offset); | |
| 329 # then add to gene structure object | |
| 330 $gene->add_exon($predobj, $exontags{$flds[1]}); | |
| 331 } elsif($flds[1] eq 'PlyA') { | |
| 332 $predobj->primary_tag("PolyAsite"); | |
| 333 $gene->poly_A_site($predobj); | |
| 334 } elsif($flds[1] eq 'Prom') { | |
| 335 $predobj->primary_tag("Promoter"); | |
| 336 $gene->add_promoter($predobj); | |
| 337 } | |
| 338 next; | |
| 339 } | |
| 340 if(/^\s*$/ && defined($gene)) { | |
| 341 # current gene is completed | |
| 342 $gene->seq_id($seqname); | |
| 343 $self->_add_prediction($gene); | |
| 344 $gene = undef; | |
| 345 next; | |
| 346 } | |
| 347 if(/^(GENSCAN)\s+(\S+)/) { | |
| 348 $self->analysis_method($1); | |
| 349 $self->analysis_method_version($2); | |
| 350 next; | |
| 351 } | |
| 352 if(/^Sequence\s+(\S+)\s*:/) { | |
| 353 $seqname = $1; | |
| 354 next; | |
| 355 } | |
| 356 | |
| 357 if(/^Parameter matrix:\s+(\S+)/i) { | |
| 358 $self->analysis_subject($1); | |
| 359 next; | |
| 360 } | |
| 361 | |
| 362 if(/^Predicted coding/) { | |
| 363 $self->_has_cds(1); | |
| 364 next; | |
| 365 } | |
| 366 /^>/ && do { | |
| 367 # section of predicted sequences | |
| 368 $self->_pushback($_); | |
| 369 last; | |
| 370 }; | |
| 371 } | |
| 372 $self->_predictions_parsed(1); | |
| 373 } | |
| 374 | |
| 375 =head2 _prediction | |
| 376 | |
| 377 Title : _prediction() | |
| 378 Usage : $gene = $obj->_prediction() | |
| 379 Function: internal | |
| 380 Example : | |
| 381 Returns : | |
| 382 | |
| 383 =cut | |
| 384 | |
| 385 sub _prediction { | |
| 386 my ($self) = @_; | |
| 387 | |
| 388 return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}}); | |
| 389 return shift(@{$self->{'_preds'}}); | |
| 390 } | |
| 391 | |
| 392 =head2 _add_prediction | |
| 393 | |
| 394 Title : _add_prediction() | |
| 395 Usage : $obj->_add_prediction($gene) | |
| 396 Function: internal | |
| 397 Example : | |
| 398 Returns : | |
| 399 | |
| 400 =cut | |
| 401 | |
| 402 sub _add_prediction { | |
| 403 my ($self, $gene) = @_; | |
| 404 | |
| 405 if(! exists($self->{'_preds'})) { | |
| 406 $self->{'_preds'} = []; | |
| 407 } | |
| 408 push(@{$self->{'_preds'}}, $gene); | |
| 409 } | |
| 410 | |
| 411 =head2 _predictions_parsed | |
| 412 | |
| 413 Title : _predictions_parsed | |
| 414 Usage : $obj->_predictions_parsed | |
| 415 Function: internal | |
| 416 Example : | |
| 417 Returns : TRUE or FALSE | |
| 418 | |
| 419 =cut | |
| 420 | |
| 421 sub _predictions_parsed { | |
| 422 my ($self, $val) = @_; | |
| 423 | |
| 424 $self->{'_preds_parsed'} = $val if $val; | |
| 425 if(! exists($self->{'_preds_parsed'})) { | |
| 426 $self->{'_preds_parsed'} = 0; | |
| 427 } | |
| 428 return $self->{'_preds_parsed'}; | |
| 429 } | |
| 430 | |
| 431 =head2 _has_cds | |
| 432 | |
| 433 Title : _has_cds() | |
| 434 Usage : $obj->_has_cds() | |
| 435 Function: Whether or not the result contains the predicted CDSs, too. | |
| 436 Example : | |
| 437 Returns : TRUE or FALSE | |
| 438 | |
| 439 =cut | |
| 440 | |
| 441 sub _has_cds { | |
| 442 my ($self, $val) = @_; | |
| 443 | |
| 444 $self->{'_has_cds'} = $val if $val; | |
| 445 if(! exists($self->{'_has_cds'})) { | |
| 446 $self->{'_has_cds'} = 0; | |
| 447 } | |
| 448 return $self->{'_has_cds'}; | |
| 449 } | |
| 450 | |
| 451 =head2 _read_fasta_seq | |
| 452 | |
| 453 Title : _read_fasta_seq() | |
| 454 Usage : ($id,$seqstr) = $obj->_read_fasta_seq(); | |
| 455 Function: Simple but specialised FASTA format sequence reader. Uses | |
| 456 $self->_readline() to retrieve input, and is able to strip off | |
| 457 the traling description lines. | |
| 458 Example : | |
| 459 Returns : An array of two elements. | |
| 460 | |
| 461 =cut | |
| 462 | |
| 463 sub _read_fasta_seq { | |
| 464 my ($self) = @_; | |
| 465 my ($id, $seq); | |
| 466 local $/ = ">"; | |
| 467 | |
| 468 my $entry = $self->_readline(); | |
| 469 if($entry) { | |
| 470 $entry =~ s/^>//; | |
| 471 # complete the entry if the first line came from a pushback buffer | |
| 472 while($entry !~ />$/) { | |
| 473 last unless $_ = $self->_readline(); | |
| 474 $entry .= $_; | |
| 475 } | |
| 476 # delete everything onwards from an intervening empty line (at the | |
| 477 # end there might be statistics stuff) | |
| 478 $entry =~ s/\n\n.*$//s; | |
| 479 # id and sequence | |
| 480 if($entry =~ /^(\S+)\n([^>]+)/) { | |
| 481 $id = $1; | |
| 482 $seq = $2; | |
| 483 } else { | |
| 484 $self->throw("Can't parse Genscan predicted sequence entry"); | |
| 485 } | |
| 486 $seq =~ s/\s//g; # Remove whitespace | |
| 487 } | |
| 488 return ($id, $seq); | |
| 489 } | |
| 490 | |
| 491 1; |
