Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Genemark.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: Genemark.pm,v 1.11.2.1 2003/04/24 08:51:48 heikki Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Tools::Genemark | |
| 4 # | |
| 5 # Cared for by Mark Fiers <hlapp@gmx.net> | |
| 6 # | |
| 7 # Copyright Hilmar Lapp, Mark Fiers | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::Tools::Genemark - Results of one Genemark run | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 $Genemark = Bio::Tools::Genemark->new(-file => 'result.Genemark'); | |
| 20 # filehandle: | |
| 21 $Genemark = Bio::Tools::Genemark->new( -fh => \*INPUT ); | |
| 22 | |
| 23 # parse the results | |
| 24 # note: this class is-a Bio::Tools::AnalysisResult which implements | |
| 25 # Bio::SeqAnalysisParserI, i.e., $Genemark->next_feature() is the same | |
| 26 while($gene = $Genemark->next_prediction()) { | |
| 27 # $gene is an instance of Bio::Tools::Prediction::Gene, which inherits | |
| 28 # off Bio::SeqFeature::Gene::Transcript. | |
| 29 # | |
| 30 # $gene->exons() returns an array of | |
| 31 # Bio::Tools::Prediction::Exon objects | |
| 32 # all exons: | |
| 33 @exon_arr = $gene->exons(); | |
| 34 | |
| 35 # initial exons only | |
| 36 @init_exons = $gene->exons('Initial'); | |
| 37 # internal exons only | |
| 38 @intrl_exons = $gene->exons('Internal'); | |
| 39 # terminal exons only | |
| 40 @term_exons = $gene->exons('Terminal'); | |
| 41 # singleton exons: | |
| 42 ($single_exon) = $gene->exons(); | |
| 43 } | |
| 44 | |
| 45 # essential if you gave a filename at initialization (otherwise the file | |
| 46 # will stay open) | |
| 47 $Genemark->close(); | |
| 48 | |
| 49 =head1 DESCRIPTION | |
| 50 | |
| 51 The Genemark module provides a parser for Genemark gene structure prediction | |
| 52 output. It parses one gene prediction into a Bio::SeqFeature::Gene::Transcript- | |
| 53 derived object. | |
| 54 | |
| 55 This module has been developed around genemark.hmm for eukaryots v2.2a and will | |
| 56 probably not work with other versions. | |
| 57 | |
| 58 | |
| 59 This module also implements the Bio::SeqAnalysisParserI interface, and thus | |
| 60 can be used wherever such an object fits. See L<Bio::SeqAnalysisParserI>. | |
| 61 | |
| 62 =head1 FEEDBACK | |
| 63 | |
| 64 =head2 Mailing Lists | |
| 65 | |
| 66 User feedback is an integral part of the evolution of this and other | |
| 67 Bioperl modules. Send your comments and suggestions preferably to one | |
| 68 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 69 | |
| 70 bioperl-l@bioperl.org - General discussion | |
| 71 http://bio.perl.org/MailList.html - About the mailing lists | |
| 72 | |
| 73 =head2 Reporting Bugs | |
| 74 | |
| 75 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 76 the bugs and their resolution. Bug reports can be submitted via email | |
| 77 or the web: | |
| 78 | |
| 79 bioperl-bugs@bio.perl.org | |
| 80 http://bugzilla.bioperl.org/ | |
| 81 | |
| 82 =head1 AUTHOR - Hilmar Lapp, Mark Fiers | |
| 83 | |
| 84 Email hlapp@gmx.net | |
| 85 m.w.e.j.fiers@plant.wag-ur.nl | |
| 86 | |
| 87 Describe contact details here | |
| 88 | |
| 89 =head1 APPENDIX | |
| 90 | |
| 91 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
| 92 | |
| 93 =cut | |
| 94 | |
| 95 | |
| 96 # Let the code begin... | |
| 97 | |
| 98 | |
| 99 package Bio::Tools::Genemark; | |
| 100 use vars qw(@ISA); | |
| 101 use strict; | |
| 102 use Symbol; | |
| 103 | |
| 104 use Bio::Root::Root; | |
| 105 use Bio::Tools::AnalysisResult; | |
| 106 use Bio::Tools::Prediction::Gene; | |
| 107 use Bio::Tools::Prediction::Exon; | |
| 108 use Bio::Seq; | |
| 109 | |
| 110 @ISA = qw(Bio::Tools::AnalysisResult); | |
| 111 | |
| 112 sub _initialize_state { | |
| 113 my ($self,@args) = @_; | |
| 114 | |
| 115 # first call the inherited method! | |
| 116 $self->SUPER::_initialize_state(@args); | |
| 117 | |
| 118 # our private state variables | |
| 119 $self->{'_preds_parsed'} = 0; | |
| 120 $self->{'_has_cds'} = 0; | |
| 121 # array of pre-parsed predictions | |
| 122 $self->{'_preds'} = []; | |
| 123 # seq stack | |
| 124 $self->{'_seqstack'} = []; | |
| 125 } | |
| 126 | |
| 127 =head2 analysis_method | |
| 128 | |
| 129 Usage : $Genemark->analysis_method(); | |
| 130 Purpose : Inherited method. Overridden to ensure that the name matches | |
| 131 /GeneMark.hmm/i. | |
| 132 Returns : String | |
| 133 Argument : n/a | |
| 134 | |
| 135 =cut | |
| 136 | |
| 137 #------------- | |
| 138 sub analysis_method { | |
| 139 #------------- | |
| 140 my ($self, $method) = @_; | |
| 141 if($method && ($method !~ /Genemark\.hmm/i)) { | |
| 142 $self->throw("method $method not supported in " . ref($self)); | |
| 143 } | |
| 144 return $self->SUPER::analysis_method($method); | |
| 145 } | |
| 146 | |
| 147 =head2 next_feature | |
| 148 | |
| 149 Title : next_feature | |
| 150 Usage : while($gene = $Genemark->next_feature()) { | |
| 151 # do something | |
| 152 } | |
| 153 Function: Returns the next gene structure prediction of the Genemark result | |
| 154 file. Call this method repeatedly until FALSE is returned. | |
| 155 | |
| 156 The returned object is actually a SeqFeatureI implementing object. | |
| 157 This method is required for classes implementing the | |
| 158 SeqAnalysisParserI interface, and is merely an alias for | |
| 159 next_prediction() at present. | |
| 160 | |
| 161 Example : | |
| 162 Returns : A Bio::Tools::Prediction::Gene object. | |
| 163 Args : | |
| 164 | |
| 165 =cut | |
| 166 | |
| 167 sub next_feature { | |
| 168 my ($self,@args) = @_; | |
| 169 # even though next_prediction doesn't expect any args (and this method | |
| 170 # does neither), we pass on args in order to be prepared if this changes | |
| 171 # ever | |
| 172 return $self->next_prediction(@args); | |
| 173 } | |
| 174 | |
| 175 =head2 next_prediction | |
| 176 | |
| 177 Title : next_prediction | |
| 178 Usage : while($gene = $Genemark->next_prediction()) { | |
| 179 # do something | |
| 180 } | |
| 181 Function: Returns the next gene structure prediction of the Genemark result | |
| 182 file. Call this method repeatedly until FALSE is returned. | |
| 183 | |
| 184 Example : | |
| 185 Returns : A Bio::Tools::Prediction::Gene object. | |
| 186 Args : | |
| 187 | |
| 188 =cut | |
| 189 | |
| 190 sub next_prediction { | |
| 191 my ($self) = @_; | |
| 192 my $gene; | |
| 193 | |
| 194 # if the prediction section hasn't been parsed yet, we do this now | |
| 195 $self->_parse_predictions() unless $self->_predictions_parsed(); | |
| 196 | |
| 197 # get next gene structure | |
| 198 $gene = $self->_prediction(); | |
| 199 | |
| 200 return $gene; | |
| 201 } | |
| 202 | |
| 203 =head2 _parse_predictions | |
| 204 | |
| 205 Title : _parse_predictions() | |
| 206 Usage : $obj->_parse_predictions() | |
| 207 Function: Parses the prediction section. Automatically called by | |
| 208 next_prediction() if not yet done. | |
| 209 Example : | |
| 210 Returns : | |
| 211 | |
| 212 =cut | |
| 213 | |
| 214 sub _parse_predictions { | |
| 215 my ($self) = @_; | |
| 216 my %exontags = ('Initial' => 'Initial', | |
| 217 'Internal' => 'Internal', | |
| 218 'Terminal' => 'Terminal', | |
| 219 'Single' => '', | |
| 220 '_na_' => ''); | |
| 221 my $exontag; | |
| 222 my $gene; | |
| 223 my $seqname; | |
| 224 my $exontype; | |
| 225 my $current_gene_no = -1; | |
| 226 | |
| 227 while(defined($_ = $self->_readline())) { | |
| 228 | |
| 229 if( (/^\s*(\d+)\s+(\d+)/) || (/^\s*(\d+)\s+[\+\-]/)) { | |
| 230 | |
| 231 # this is an exon, Genemark doesn't predict anything else | |
| 232 # $prednr corresponds to geneno. | |
| 233 my $prednr = $1; | |
| 234 | |
| 235 #exon no: | |
| 236 my $signalnr = 0; | |
| 237 if ($2) { my $signalnr = $2; } # used in tag: exon_no | |
| 238 | |
| 239 # split into fields | |
| 240 chomp(); | |
| 241 my @flds = split(' ', $_); | |
| 242 | |
| 243 # create the feature (an exon) object | |
| 244 my $predobj = Bio::Tools::Prediction::Exon->new(); | |
| 245 | |
| 246 | |
| 247 # define info depending on it being eu- or prokaryot | |
| 248 my ($start, $end, $orientation, $prediction_source); | |
| 249 | |
| 250 if ($self->analysis_method() =~ /PROKARYOTIC/i) { | |
| 251 $prediction_source = "Genemark.hmm.pro"; | |
| 252 $orientation = ($flds[1] eq '+') ? 1 : -1; | |
| 253 ($start, $end) = @flds[(2,3)]; | |
| 254 $exontag = "_na_"; | |
| 255 | |
| 256 } else { | |
| 257 $prediction_source = "Genemark.hmm.eu"; | |
| 258 $orientation = ($flds[2] eq '+') ? 1 : -1; | |
| 259 ($start, $end) = @flds[(4,5)]; | |
| 260 $exontag = $flds[3]; | |
| 261 } | |
| 262 | |
| 263 #store the data in the exon object | |
| 264 $predobj->source_tag($prediction_source); | |
| 265 $predobj->start($start); | |
| 266 $predobj->end($end); | |
| 267 $predobj->strand($orientation); | |
| 268 | |
| 269 $predobj->primary_tag($exontags{$exontag} . "Exon"); | |
| 270 | |
| 271 $predobj->add_tag_value('exon_no',"$signalnr") if ($signalnr); | |
| 272 | |
| 273 $predobj->is_coding(1); | |
| 274 | |
| 275 | |
| 276 # frame calculation as in the genscan module | |
| 277 # is to be implemented... | |
| 278 | |
| 279 #If the $prednr is not equal to the current gene, we | |
| 280 #need to make a new gene and close the old one | |
| 281 if($prednr != $current_gene_no) { | |
| 282 # a new gene, store the old one if it exists | |
| 283 if (defined ($gene)) { | |
| 284 $gene->seq_id($seqname); | |
| 285 $gene = undef ; | |
| 286 } | |
| 287 #and make a new one | |
| 288 $gene = Bio::Tools::Prediction::Gene->new | |
| 289 ( | |
| 290 '-primary' => "GenePrediction$prednr", | |
| 291 '-source' => $prediction_source); | |
| 292 $self->_add_prediction($gene); | |
| 293 $current_gene_no = $prednr; | |
| 294 } | |
| 295 | |
| 296 # Add the exon to the gene | |
| 297 $gene->add_exon($predobj, ($exontag eq "_na_" ? | |
| 298 undef : $exontags{$exontag})); | |
| 299 | |
| 300 } | |
| 301 | |
| 302 if(/^(Genemark\.hmm\s*[PROKARYOTIC]*)\s+\(Version (.*)\)$/i) { | |
| 303 $self->analysis_method($1); | |
| 304 | |
| 305 my $gm_version = $2; | |
| 306 | |
| 307 $self->analysis_method_version($gm_version); | |
| 308 next; | |
| 309 } | |
| 310 | |
| 311 #Matrix file for eukaryot version | |
| 312 if (/^Matrices file:\s+(\S+)?/i) { | |
| 313 $self->analysis_subject($1); | |
| 314 # since the line after the matrix file is always the date | |
| 315 # (in the output file's I have seen!) extract and store this | |
| 316 # here | |
| 317 if (defined(my $_date = $self->_readline())) { | |
| 318 chomp ($_date); | |
| 319 $self->analysis_date($_date); | |
| 320 } | |
| 321 } | |
| 322 | |
| 323 #Matrix file for prokaryot version | |
| 324 if (/^Model file name:\s+(\S+)/) { | |
| 325 $self->analysis_subject($1); | |
| 326 # since the line after the matrix file is always the date | |
| 327 # (in the output file's I have seen!) extract and store this | |
| 328 # here | |
| 329 my $_date = $self->_readline() ; | |
| 330 if (defined($_date = $self->_readline())) { | |
| 331 chomp ($_date); | |
| 332 $self->analysis_date($_date); | |
| 333 } | |
| 334 } | |
| 335 | |
| 336 if(/^Sequence[ file]? name:\s+(.+)\s*$/i) { | |
| 337 $seqname = $1; | |
| 338 # $self->analysis_subject($seqname); | |
| 339 next; | |
| 340 } | |
| 341 | |
| 342 | |
| 343 /^>/ && do { | |
| 344 $self->_pushback($_); | |
| 345 | |
| 346 # section of predicted aa sequences on recognition | |
| 347 # of a fasta start, read all sequences and find the | |
| 348 # appropriate gene | |
| 349 while (1) { | |
| 350 my ($aa_id, $seq) = $self->_read_fasta_seq(); | |
| 351 last unless ($aa_id); | |
| 352 | |
| 353 #now parse through the predictions to add the pred. protein | |
| 354 FINDPRED: foreach my $gene (@{$self->{'_preds'}}) { | |
| 355 $gene->primary_tag() =~ /[^0-9]([0-9]+)$/; | |
| 356 my $geneno = $1; | |
| 357 if ($aa_id =~ /\|gene.$geneno\|/) { | |
| 358 #print "x SEQ : \n $seq \nXXXX\n"; | |
| 359 my $seqobj = Bio::Seq->new('-seq' => $seq, | |
| 360 '-display_id' => $aa_id, | |
| 361 '-alphabet' => "protein"); | |
| 362 $gene->predicted_protein($seqobj); | |
| 363 last FINDPRED; | |
| 364 } | |
| 365 | |
| 366 } | |
| 367 } | |
| 368 | |
| 369 last; | |
| 370 }; | |
| 371 } | |
| 372 | |
| 373 # if the analysis query object contains a ref to a Seq of PrimarySeq | |
| 374 # object, then extract the predicted sequences and add it to the gene | |
| 375 # object. | |
| 376 if (defined $self->analysis_query()) { | |
| 377 my $orig_seq = $self->analysis_query(); | |
| 378 FINDPREDSEQ: foreach my $gene (@{$self->{'_preds'}}) { | |
| 379 my $predseq = ""; | |
| 380 foreach my $exon ($gene->exons()) { | |
| 381 #print $exon->start() . " " . $exon->end () . "\n"; | |
| 382 $predseq .= $orig_seq->subseq($exon->start(), $exon->end()); | |
| 383 } | |
| 384 | |
| 385 my $seqobj = Bio::PrimarySeq->new('-seq' => $predseq, | |
| 386 '-display_id' => "transl"); | |
| 387 $gene->predicted_cds($seqobj); | |
| 388 } | |
| 389 } | |
| 390 | |
| 391 | |
| 392 $self->_predictions_parsed(1); | |
| 393 } | |
| 394 | |
| 395 =head2 _prediction | |
| 396 | |
| 397 Title : _prediction() | |
| 398 Usage : $gene = $obj->_prediction() | |
| 399 Function: internal | |
| 400 Example : | |
| 401 Returns : | |
| 402 | |
| 403 =cut | |
| 404 | |
| 405 sub _prediction { | |
| 406 my ($self) = @_; | |
| 407 | |
| 408 return undef unless(exists($self->{'_preds'}) && @{$self->{'_preds'}}); | |
| 409 return shift(@{$self->{'_preds'}}); | |
| 410 } | |
| 411 | |
| 412 =head2 _add_prediction | |
| 413 | |
| 414 Title : _add_prediction() | |
| 415 Usage : $obj->_add_prediction($gene) | |
| 416 Function: internal | |
| 417 Example : | |
| 418 Returns : | |
| 419 | |
| 420 =cut | |
| 421 | |
| 422 sub _add_prediction { | |
| 423 my ($self, $gene) = @_; | |
| 424 | |
| 425 if(! exists($self->{'_preds'})) { | |
| 426 $self->{'_preds'} = []; | |
| 427 } | |
| 428 push(@{$self->{'_preds'}}, $gene); | |
| 429 } | |
| 430 | |
| 431 =head2 _predictions_parsed | |
| 432 | |
| 433 Title : _predictions_parsed | |
| 434 Usage : $obj->_predictions_parsed | |
| 435 Function: internal | |
| 436 Example : | |
| 437 Returns : TRUE or FALSE | |
| 438 | |
| 439 =cut | |
| 440 | |
| 441 sub _predictions_parsed { | |
| 442 my ($self, $val) = @_; | |
| 443 | |
| 444 $self->{'_preds_parsed'} = $val if $val; | |
| 445 if(! exists($self->{'_preds_parsed'})) { | |
| 446 $self->{'_preds_parsed'} = 0; | |
| 447 } | |
| 448 return $self->{'_preds_parsed'}; | |
| 449 } | |
| 450 | |
| 451 =head2 _has_cds | |
| 452 | |
| 453 Title : _has_cds() | |
| 454 Usage : $obj->_has_cds() | |
| 455 Function: Whether or not the result contains the predicted CDSs, too. | |
| 456 Example : | |
| 457 Returns : TRUE or FALSE | |
| 458 | |
| 459 =cut | |
| 460 | |
| 461 sub _has_cds { | |
| 462 my ($self, $val) = @_; | |
| 463 | |
| 464 $self->{'_has_cds'} = $val if $val; | |
| 465 if(! exists($self->{'_has_cds'})) { | |
| 466 $self->{'_has_cds'} = 0; | |
| 467 } | |
| 468 return $self->{'_has_cds'}; | |
| 469 } | |
| 470 | |
| 471 =head2 _read_fasta_seq | |
| 472 | |
| 473 Title : _read_fasta_seq() | |
| 474 Usage : ($id,$seqstr) = $obj->_read_fasta_seq(); | |
| 475 Function: Simple but specialised FASTA format sequence reader. Uses | |
| 476 $self->_readline() to retrieve input, and is able to strip off | |
| 477 the traling description lines. | |
| 478 Example : | |
| 479 Returns : An array of two elements. | |
| 480 | |
| 481 =cut | |
| 482 | |
| 483 sub _read_fasta_seq { | |
| 484 my ($self) = @_; | |
| 485 my ($id, $seq); | |
| 486 local $/ = ">"; | |
| 487 | |
| 488 return 0 unless (my $entry = $self->_readline()); | |
| 489 | |
| 490 $entry =~ s/^>//; | |
| 491 # complete the entry if the first line came from a pushback buffer | |
| 492 while(! ($entry =~ />$/)) { | |
| 493 last unless ($_ = $self->_readline()); | |
| 494 $entry .= $_; | |
| 495 } | |
| 496 | |
| 497 # delete everything onwards from an new fasta start (>) | |
| 498 $entry =~ s/\n>.*$//s; | |
| 499 # id and sequence | |
| 500 | |
| 501 if($entry =~ s/^(.+)\n//) { | |
| 502 $id = $1; | |
| 503 $id =~ s/ /_/g; | |
| 504 $seq = $entry; | |
| 505 $seq =~ s/\s//g; | |
| 506 #print "\n@@ $id \n@@ $seq \n##\n"; | |
| 507 } else { | |
| 508 $self->throw("Can't parse Genemark predicted sequence entry"); | |
| 509 } | |
| 510 $seq =~ s/\s//g; # Remove whitespace | |
| 511 return ($id, $seq); | |
| 512 } | |
| 513 | |
| 514 1; | |
| 515 |
