Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/ESTScan.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: ESTScan.pm,v 1.10 2002/10/22 07:38:45 lapp Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Tools::ESTScan | |
| 4 # | |
| 5 # Cared for by Hilmar Lapp <hlapp@gmx.net> | |
| 6 # | |
| 7 # Copyright Hilmar Lapp | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::Tools::ESTScan - Results of one ESTScan run | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan'); | |
| 20 # filehandle: | |
| 21 $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT ); | |
| 22 | |
| 23 # parse the results | |
| 24 # note: this class is-a Bio::Tools::AnalysisResult which implements | |
| 25 # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same | |
| 26 while($gene = $estscan->next_prediction()) { | |
| 27 # $gene is an instance of Bio::Tools::Prediction::Gene | |
| 28 foreach my $orf ($gene->exons()) { | |
| 29 # $orf is an instance of Bio::Tools::Prediction::Exon | |
| 30 $cds_str = $orf->predicted_cds(); | |
| 31 } | |
| 32 } | |
| 33 | |
| 34 # essential if you gave a filename at initialization (otherwise the file | |
| 35 # will stay open) | |
| 36 $estscan->close(); | |
| 37 | |
| 38 =head1 DESCRIPTION | |
| 39 | |
| 40 The ESTScan module provides a parser for ESTScan coding region prediction | |
| 41 output. | |
| 42 | |
| 43 This module inherits off L<Bio::Tools::AnalysisResult> and therefore | |
| 44 implements the L<Bio::SeqAnalysisParserI> interface. | |
| 45 See L<Bio::SeqAnalysisParserI>. | |
| 46 | |
| 47 =head1 FEEDBACK | |
| 48 | |
| 49 =head2 Mailing Lists | |
| 50 | |
| 51 User feedback is an integral part of the evolution of this and other | |
| 52 Bioperl modules. Send your comments and suggestions preferably to one | |
| 53 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 54 | |
| 55 bioperl-l@bioperl.org - General discussion | |
| 56 http://bio.perl.org/MailList.html - About the mailing lists | |
| 57 | |
| 58 =head2 Reporting Bugs | |
| 59 | |
| 60 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 61 the bugs and their resolution. Bug reports can be submitted via email | |
| 62 or the web: | |
| 63 | |
| 64 bioperl-bugs@bio.perl.org | |
| 65 http://bugzilla.bioperl.org/ | |
| 66 | |
| 67 =head1 AUTHOR - Hilmar Lapp | |
| 68 | |
| 69 Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com) | |
| 70 | |
| 71 Describe contact details here | |
| 72 | |
| 73 =head1 APPENDIX | |
| 74 | |
| 75 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
| 76 | |
| 77 =cut | |
| 78 | |
| 79 # Let the code begin... | |
| 80 | |
| 81 package Bio::Tools::ESTScan; | |
| 82 use vars qw(@ISA); | |
| 83 use strict; | |
| 84 use Symbol; | |
| 85 | |
| 86 use Bio::Root::Root; | |
| 87 use Bio::Tools::AnalysisResult; | |
| 88 use Bio::Tools::Prediction::Exon; | |
| 89 | |
| 90 @ISA = qw(Bio::Tools::AnalysisResult); | |
| 91 | |
| 92 sub _initialize_state { | |
| 93 my ($self,@args) = @_; | |
| 94 | |
| 95 # first call the inherited method! | |
| 96 my $make = $self->SUPER::_initialize_state(@args); | |
| 97 | |
| 98 if(! $self->analysis_method()) { | |
| 99 $self->analysis_method('ESTScan'); | |
| 100 } | |
| 101 } | |
| 102 | |
| 103 =head2 analysis_method | |
| 104 | |
| 105 Usage : $estscan->analysis_method(); | |
| 106 Purpose : Inherited method. Overridden to ensure that the name matches | |
| 107 /estscan/i. | |
| 108 Returns : String | |
| 109 Argument : n/a | |
| 110 | |
| 111 =cut | |
| 112 | |
| 113 #------------- | |
| 114 sub analysis_method { | |
| 115 #------------- | |
| 116 my ($self, $method) = @_; | |
| 117 if($method && ($method !~ /estscan/i)) { | |
| 118 $self->throw("method $method not supported in " . ref($self)); | |
| 119 } | |
| 120 return $self->SUPER::analysis_method($method); | |
| 121 } | |
| 122 | |
| 123 =head2 next_feature | |
| 124 | |
| 125 Title : next_feature | |
| 126 Usage : while($orf = $estscan->next_feature()) { | |
| 127 # do something | |
| 128 } | |
| 129 Function: Returns the next gene structure prediction of the ESTScan result | |
| 130 file. Call this method repeatedly until FALSE is returned. | |
| 131 | |
| 132 The returned object is actually a SeqFeatureI implementing object. | |
| 133 This method is required for classes implementing the | |
| 134 SeqAnalysisParserI interface, and is merely an alias for | |
| 135 next_prediction() at present. | |
| 136 | |
| 137 Example : | |
| 138 Returns : A Bio::Tools::Prediction::Gene object. | |
| 139 Args : | |
| 140 | |
| 141 =cut | |
| 142 | |
| 143 sub next_feature { | |
| 144 my ($self,@args) = @_; | |
| 145 # even though next_prediction doesn't expect any args (and this method | |
| 146 # does neither), we pass on args in order to be prepared if this changes | |
| 147 # ever | |
| 148 return $self->next_prediction(@args); | |
| 149 } | |
| 150 | |
| 151 =head2 next_prediction | |
| 152 | |
| 153 Title : next_prediction | |
| 154 Usage : while($gene = $estscan->next_prediction()) { | |
| 155 # do something | |
| 156 } | |
| 157 Function: Returns the next gene structure prediction of the ESTScan result | |
| 158 file. Call this method repeatedly until FALSE is returned. | |
| 159 | |
| 160 So far, this method DOES NOT work for reverse strand predictions, | |
| 161 even though the code looks like. | |
| 162 Example : | |
| 163 Returns : A Bio::Tools::Prediction::Gene object. | |
| 164 Args : | |
| 165 | |
| 166 =cut | |
| 167 | |
| 168 sub next_prediction { | |
| 169 my ($self) = @_; | |
| 170 my ($gene, $seq, $cds, $predobj); | |
| 171 my $numins = 0; | |
| 172 | |
| 173 # predictions are in the format of FASTA sequences and can be parsed one | |
| 174 # at a time | |
| 175 $seq = $self->_fasta_stream()->next_seq(); | |
| 176 return unless $seq; | |
| 177 # there is a new prediction | |
| 178 $gene = Bio::Tools::Prediction::Gene->new('-primary' => "ORFprediction", | |
| 179 '-source' => "ESTScan"); | |
| 180 # score starts the description | |
| 181 $seq->desc() =~ /^([\d.]+)\s*(.*)/ or | |
| 182 $self->throw("unexpected format of description: no score in " . | |
| 183 $seq->desc()); | |
| 184 $gene->score($1); | |
| 185 $seq->desc($2); | |
| 186 # strand may end the description | |
| 187 if($seq->desc() =~ /(.*)minus strand$/) { | |
| 188 my $desc = $1; | |
| 189 $desc =~ s/;\s+$//; | |
| 190 $seq->desc($desc); | |
| 191 $gene->strand(-1); | |
| 192 } else { | |
| 193 $gene->strand(1); | |
| 194 } | |
| 195 # check for the format: default or 'all-in-one' (option -a) | |
| 196 if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) { | |
| 197 # default format | |
| 198 $seq->desc($5); | |
| 199 $predobj = Bio::Tools::Prediction::Exon->new('-source' => "ESTScan", | |
| 200 '-start' => $3, | |
| 201 '-end' => $4); | |
| 202 $predobj->strand($gene->strand()); | |
| 203 $predobj->score($gene->score()); # FIXME or $1, or $2 ? | |
| 204 $predobj->primary_tag("InternalExon"); | |
| 205 $predobj->seq_id($seq->display_id()); | |
| 206 # add to gene structure object | |
| 207 $gene->add_exon($predobj); | |
| 208 # add predicted CDS | |
| 209 $cds = $seq->seq(); | |
| 210 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions | |
| 211 $cds = Bio::PrimarySeq->new('-seq' => $cds, | |
| 212 '-display_id' => $seq->display_id(), | |
| 213 '-desc' => $seq->desc(), | |
| 214 '-alphabet' => "dna"); | |
| 215 $gene->predicted_cds($cds); | |
| 216 $predobj->predicted_cds($cds); | |
| 217 if($gene->strand() == -1) { | |
| 218 $self->warn("reverse strand ORF, but unable to reverse coordinates!"); | |
| 219 } | |
| 220 } else { | |
| 221 # | |
| 222 # All-in-one format (hopefully). This encodes the following information | |
| 223 # into the sequence: | |
| 224 # 1) untranslated regions: stretches of lower-case letters | |
| 225 # 2) translated regions: stretches of upper-case letters | |
| 226 # 3) insertions in the translated regions: capital X | |
| 227 # 4) deletions in the translated regions: a single lower-case letter | |
| 228 # | |
| 229 # if reverse strand ORF, save a lot of hassle by reversing the sequence | |
| 230 if($gene->strand() == -1) { | |
| 231 $seq = $seq->revcom(); | |
| 232 } | |
| 233 my $seqstr = $seq->seq(); | |
| 234 while($seqstr =~ /^([a-z]*)([A-Z].*)$/) { | |
| 235 # leading 5'UTR | |
| 236 my $utr5 = $1; | |
| 237 # exon + 3'UTR | |
| 238 my $exonseq = $2; | |
| 239 # strip 3'UTR and following exons | |
| 240 if($exonseq =~ s/([a-z]{2,}.*)$//) { | |
| 241 $seqstr = $1; | |
| 242 } else { | |
| 243 $seqstr = ""; | |
| 244 } | |
| 245 # start: take care of yielding the absolute coordinate | |
| 246 my $start = CORE::length($utr5) + 1; | |
| 247 if($predobj) { | |
| 248 $start += $predobj->end() + $numins; | |
| 249 } | |
| 250 # for the end coordinate, we need to subtract the insertions | |
| 251 $cds = $exonseq; | |
| 252 $cds =~ s/[X]//g; | |
| 253 my $end = $start + CORE::length($cds) - 1; | |
| 254 # construct next exon object | |
| 255 $predobj = Bio::Tools::Prediction::Exon->new('-start' => $start, | |
| 256 '-end' => $end); | |
| 257 $predobj->source_tag("ESTScan"); | |
| 258 $predobj->primary_tag("InternalExon"); | |
| 259 $predobj->seq_id($seq->display_id()); | |
| 260 $predobj->strand($gene->strand()); | |
| 261 $predobj->score($gene->score()); | |
| 262 # add the exon to the gene structure object | |
| 263 $gene->add_exon($predobj); | |
| 264 # add the predicted CDS | |
| 265 $cds = $exonseq; | |
| 266 $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions | |
| 267 $cds = Bio::PrimarySeq->new('-seq' => $cds, | |
| 268 '-display_id' => $seq->display_id(), | |
| 269 '-desc' => $seq->desc(), | |
| 270 '-alphabet' => "dna"); | |
| 271 # only store the first one in the overall prediction | |
| 272 $gene->predicted_cds($cds) unless $gene->predicted_cds(); | |
| 273 $predobj->predicted_cds($cds); | |
| 274 # add the predicted insertions and deletions as subfeatures | |
| 275 # of the exon | |
| 276 my $fea = undef; | |
| 277 while($exonseq =~ /([a-zX])/g) { | |
| 278 my $indel = $1; | |
| 279 # start and end: start looking at the position after the | |
| 280 # previous feature | |
| 281 if($fea) { | |
| 282 $start = $fea->start()+$numins; | |
| 283 $start -= 1 if($fea->primary_tag() eq 'insertion'); | |
| 284 } else { | |
| 285 $start = $predobj->start()+$numins-1; | |
| 286 } | |
| 287 #print "# numins = $numins, indel = $indel, start = $start\n"; | |
| 288 $start = index($seq->seq(), $indel, $start) + 1 - $numins; | |
| 289 $fea = Bio::SeqFeature::Generic->new('-start' => $start, | |
| 290 '-end' => $start); | |
| 291 $fea->source_tag("ESTScan"); | |
| 292 $fea->seq_id($seq->display_id()); | |
| 293 $fea->strand($predobj->strand()); | |
| 294 if($indel eq 'X') { | |
| 295 # an insertion (depends on viewpoint: to get the 'real' | |
| 296 # CDS, a base has to be inserted, i.e., the HMMER model | |
| 297 # inserted a base; however, the sequencing process deleted | |
| 298 # a base that was there). | |
| 299 $fea->primary_tag("insertion"); | |
| 300 # we need to count insertions because these are left out | |
| 301 # of any coordinates saved in the objects (which is correct | |
| 302 # because insertions change the original sequence, so | |
| 303 # coordinates wouldn't match) | |
| 304 $numins++; | |
| 305 } else { | |
| 306 # a deletion (depends on viewpoint: to get the 'real' | |
| 307 # CDS, a base has to be deleted, i.e., the HMMER model | |
| 308 # deleted a base; however, the sequencing process inserted | |
| 309 # a base that wasn't there). | |
| 310 $fea->primary_tag("deletion"); | |
| 311 $fea->add_tag_value('base', $indel); | |
| 312 } | |
| 313 $predobj->add_sub_SeqFeature($fea); | |
| 314 } | |
| 315 } | |
| 316 } | |
| 317 | |
| 318 return $gene; | |
| 319 } | |
| 320 | |
| 321 =head2 close | |
| 322 | |
| 323 Title : close | |
| 324 Usage : $result->close() | |
| 325 Function: Closes the file handle associated with this result file. | |
| 326 Inherited method, overridden. | |
| 327 Example : | |
| 328 Returns : | |
| 329 Args : | |
| 330 | |
| 331 =cut | |
| 332 | |
| 333 sub close { | |
| 334 my ($self, @args) = @_; | |
| 335 | |
| 336 delete($self->{'_fastastream'}); | |
| 337 $self->SUPER::close(@args); | |
| 338 } | |
| 339 | |
| 340 =head2 _fasta_stream | |
| 341 | |
| 342 Title : _fasta_stream | |
| 343 Usage : $result->_fasta_stream() | |
| 344 Function: Gets/Sets the FASTA sequence IO stream for reading the contents of | |
| 345 the file associated with this MZEF result object. | |
| 346 | |
| 347 If called for the first time, creates the stream from the filehandle | |
| 348 if necessary. | |
| 349 Example : | |
| 350 Returns : | |
| 351 Args : | |
| 352 | |
| 353 =cut | |
| 354 | |
| 355 sub _fasta_stream { | |
| 356 my ($self, $stream) = @_; | |
| 357 | |
| 358 if($stream || (! exists($self->{'_fastastream'}))) { | |
| 359 if(! $stream) { | |
| 360 $stream = Bio::SeqIO->new('-fh' => $self->_fh(), | |
| 361 '-format' => "fasta"); | |
| 362 } | |
| 363 $self->{'_fastastream'} = $stream; | |
| 364 } | |
| 365 return $self->{'_fastastream'}; | |
| 366 } | |
| 367 | |
| 368 1; | |
| 369 |
