Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Tools/ESTScan.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/ESTScan.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,369 @@ +# $Id: ESTScan.pm,v 1.10 2002/10/22 07:38:45 lapp Exp $ +# +# BioPerl module for Bio::Tools::ESTScan +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::ESTScan - Results of one ESTScan run + +=head1 SYNOPSIS + + $estscan = Bio::Tools::ESTScan->new(-file => 'result.estscan'); + # filehandle: + $estscan = Bio::Tools::ESTScan->new( -fh => \*INPUT ); + + # parse the results + # note: this class is-a Bio::Tools::AnalysisResult which implements + # Bio::SeqAnalysisParserI, i.e., $genscan->next_feature() is the same + while($gene = $estscan->next_prediction()) { + # $gene is an instance of Bio::Tools::Prediction::Gene + foreach my $orf ($gene->exons()) { + # $orf is an instance of Bio::Tools::Prediction::Exon + $cds_str = $orf->predicted_cds(); + } + } + + # essential if you gave a filename at initialization (otherwise the file + # will stay open) + $estscan->close(); + +=head1 DESCRIPTION + +The ESTScan module provides a parser for ESTScan coding region prediction +output. + +This module inherits off L<Bio::Tools::AnalysisResult> and therefore +implements the L<Bio::SeqAnalysisParserI> interface. +See L<Bio::SeqAnalysisParserI>. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net (or hilmar.lapp@pharma.novartis.com) + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tools::ESTScan; +use vars qw(@ISA); +use strict; +use Symbol; + +use Bio::Root::Root; +use Bio::Tools::AnalysisResult; +use Bio::Tools::Prediction::Exon; + +@ISA = qw(Bio::Tools::AnalysisResult); + +sub _initialize_state { + my ($self,@args) = @_; + + # first call the inherited method! + my $make = $self->SUPER::_initialize_state(@args); + + if(! $self->analysis_method()) { + $self->analysis_method('ESTScan'); + } +} + +=head2 analysis_method + + Usage : $estscan->analysis_method(); + Purpose : Inherited method. Overridden to ensure that the name matches + /estscan/i. + Returns : String + Argument : n/a + +=cut + +#------------- +sub analysis_method { +#------------- + my ($self, $method) = @_; + if($method && ($method !~ /estscan/i)) { + $self->throw("method $method not supported in " . ref($self)); + } + return $self->SUPER::analysis_method($method); +} + +=head2 next_feature + + Title : next_feature + Usage : while($orf = $estscan->next_feature()) { + # do something + } + Function: Returns the next gene structure prediction of the ESTScan result + file. Call this method repeatedly until FALSE is returned. + + The returned object is actually a SeqFeatureI implementing object. + This method is required for classes implementing the + SeqAnalysisParserI interface, and is merely an alias for + next_prediction() at present. + + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_feature { + my ($self,@args) = @_; + # even though next_prediction doesn't expect any args (and this method + # does neither), we pass on args in order to be prepared if this changes + # ever + return $self->next_prediction(@args); +} + +=head2 next_prediction + + Title : next_prediction + Usage : while($gene = $estscan->next_prediction()) { + # do something + } + Function: Returns the next gene structure prediction of the ESTScan result + file. Call this method repeatedly until FALSE is returned. + + So far, this method DOES NOT work for reverse strand predictions, + even though the code looks like. + Example : + Returns : A Bio::Tools::Prediction::Gene object. + Args : + +=cut + +sub next_prediction { + my ($self) = @_; + my ($gene, $seq, $cds, $predobj); + my $numins = 0; + + # predictions are in the format of FASTA sequences and can be parsed one + # at a time + $seq = $self->_fasta_stream()->next_seq(); + return unless $seq; + # there is a new prediction + $gene = Bio::Tools::Prediction::Gene->new('-primary' => "ORFprediction", + '-source' => "ESTScan"); + # score starts the description + $seq->desc() =~ /^([\d.]+)\s*(.*)/ or + $self->throw("unexpected format of description: no score in " . + $seq->desc()); + $gene->score($1); + $seq->desc($2); + # strand may end the description + if($seq->desc() =~ /(.*)minus strand$/) { + my $desc = $1; + $desc =~ s/;\s+$//; + $seq->desc($desc); + $gene->strand(-1); + } else { + $gene->strand(1); + } + # check for the format: default or 'all-in-one' (option -a) + if($seq->desc() =~ /^(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(.*)/) { + # default format + $seq->desc($5); + $predobj = Bio::Tools::Prediction::Exon->new('-source' => "ESTScan", + '-start' => $3, + '-end' => $4); + $predobj->strand($gene->strand()); + $predobj->score($gene->score()); # FIXME or $1, or $2 ? + $predobj->primary_tag("InternalExon"); + $predobj->seq_id($seq->display_id()); + # add to gene structure object + $gene->add_exon($predobj); + # add predicted CDS + $cds = $seq->seq(); + $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions + $cds = Bio::PrimarySeq->new('-seq' => $cds, + '-display_id' => $seq->display_id(), + '-desc' => $seq->desc(), + '-alphabet' => "dna"); + $gene->predicted_cds($cds); + $predobj->predicted_cds($cds); + if($gene->strand() == -1) { + $self->warn("reverse strand ORF, but unable to reverse coordinates!"); + } + } else { + # + # All-in-one format (hopefully). This encodes the following information + # into the sequence: + # 1) untranslated regions: stretches of lower-case letters + # 2) translated regions: stretches of upper-case letters + # 3) insertions in the translated regions: capital X + # 4) deletions in the translated regions: a single lower-case letter + # + # if reverse strand ORF, save a lot of hassle by reversing the sequence + if($gene->strand() == -1) { + $seq = $seq->revcom(); + } + my $seqstr = $seq->seq(); + while($seqstr =~ /^([a-z]*)([A-Z].*)$/) { + # leading 5'UTR + my $utr5 = $1; + # exon + 3'UTR + my $exonseq = $2; + # strip 3'UTR and following exons + if($exonseq =~ s/([a-z]{2,}.*)$//) { + $seqstr = $1; + } else { + $seqstr = ""; + } + # start: take care of yielding the absolute coordinate + my $start = CORE::length($utr5) + 1; + if($predobj) { + $start += $predobj->end() + $numins; + } + # for the end coordinate, we need to subtract the insertions + $cds = $exonseq; + $cds =~ s/[X]//g; + my $end = $start + CORE::length($cds) - 1; + # construct next exon object + $predobj = Bio::Tools::Prediction::Exon->new('-start' => $start, + '-end' => $end); + $predobj->source_tag("ESTScan"); + $predobj->primary_tag("InternalExon"); + $predobj->seq_id($seq->display_id()); + $predobj->strand($gene->strand()); + $predobj->score($gene->score()); + # add the exon to the gene structure object + $gene->add_exon($predobj); + # add the predicted CDS + $cds = $exonseq; + $cds =~ s/[a-z]//g; # remove the deletions, but keep the insertions + $cds = Bio::PrimarySeq->new('-seq' => $cds, + '-display_id' => $seq->display_id(), + '-desc' => $seq->desc(), + '-alphabet' => "dna"); + # only store the first one in the overall prediction + $gene->predicted_cds($cds) unless $gene->predicted_cds(); + $predobj->predicted_cds($cds); + # add the predicted insertions and deletions as subfeatures + # of the exon + my $fea = undef; + while($exonseq =~ /([a-zX])/g) { + my $indel = $1; + # start and end: start looking at the position after the + # previous feature + if($fea) { + $start = $fea->start()+$numins; + $start -= 1 if($fea->primary_tag() eq 'insertion'); + } else { + $start = $predobj->start()+$numins-1; + } + #print "# numins = $numins, indel = $indel, start = $start\n"; + $start = index($seq->seq(), $indel, $start) + 1 - $numins; + $fea = Bio::SeqFeature::Generic->new('-start' => $start, + '-end' => $start); + $fea->source_tag("ESTScan"); + $fea->seq_id($seq->display_id()); + $fea->strand($predobj->strand()); + if($indel eq 'X') { + # an insertion (depends on viewpoint: to get the 'real' + # CDS, a base has to be inserted, i.e., the HMMER model + # inserted a base; however, the sequencing process deleted + # a base that was there). + $fea->primary_tag("insertion"); + # we need to count insertions because these are left out + # of any coordinates saved in the objects (which is correct + # because insertions change the original sequence, so + # coordinates wouldn't match) + $numins++; + } else { + # a deletion (depends on viewpoint: to get the 'real' + # CDS, a base has to be deleted, i.e., the HMMER model + # deleted a base; however, the sequencing process inserted + # a base that wasn't there). + $fea->primary_tag("deletion"); + $fea->add_tag_value('base', $indel); + } + $predobj->add_sub_SeqFeature($fea); + } + } + } + + return $gene; +} + +=head2 close + + Title : close + Usage : $result->close() + Function: Closes the file handle associated with this result file. + Inherited method, overridden. + Example : + Returns : + Args : + +=cut + +sub close { + my ($self, @args) = @_; + + delete($self->{'_fastastream'}); + $self->SUPER::close(@args); +} + +=head2 _fasta_stream + + Title : _fasta_stream + Usage : $result->_fasta_stream() + Function: Gets/Sets the FASTA sequence IO stream for reading the contents of + the file associated with this MZEF result object. + + If called for the first time, creates the stream from the filehandle + if necessary. + Example : + Returns : + Args : + +=cut + +sub _fasta_stream { + my ($self, $stream) = @_; + + if($stream || (! exists($self->{'_fastastream'}))) { + if(! $stream) { + $stream = Bio::SeqIO->new('-fh' => $self->_fh(), + '-format' => "fasta"); + } + $self->{'_fastastream'} = $stream; + } + return $self->{'_fastastream'}; +} + +1; +