Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/SeqFeature/Gene/Transcript.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Gene/Transcript.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,799 @@ +# $Id: Transcript.pm,v 1.25 2002/12/29 09:37:51 lapp Exp $ +# +# BioPerl module for Bio::SeqFeature::Gene::Transcript +# +# Cared for by Hilmar Lapp <hlapp@gmx.net> +# +# Copyright Hilmar Lapp +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Gene::Transcript - A feature representing a transcript + +=head1 SYNOPSIS + +See documentation of methods. + +=head1 DESCRIPTION + +A feature representing a transcript. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Hilmar Lapp + +Email hlapp@gmx.net + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::SeqFeature::Gene::Transcript; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqFeature::Gene::TranscriptI; +use Bio::SeqFeature::Generic; +use Bio::PrimarySeq; + +@ISA = qw(Bio::SeqFeature::Generic Bio::SeqFeature::Gene::TranscriptI); + +sub new { + my ($caller, @args) = @_; + my $self = $caller->SUPER::new(@args); + my ($primary) = $self->_rearrange([qw(PRIMARY)],@args); + + $primary = 'transcript' unless $primary; + $self->primary_tag($primary); + $self->strand(0) if(! defined($self->strand())); + return $self; +} + + +=head2 promoters + + Title : promoters() + Usage : @proms = $transcript->promoters(); + Function: Get the promoter features/sites of this transcript. + + Note that OO-modeling of regulatory elements is not stable yet. + This means that this method might change or even disappear in a + future release. Be aware of this if you use it. + + Returns : An array of Bio::SeqFeatureI implementing objects representing the + promoter regions or sites. + Args : + + +=cut + +sub promoters { + my ($self) = @_; + return $self->get_feature_type('Bio::SeqFeature::Gene::Promoter'); +} + +=head2 add_promoter + + Title : add_promoter() + Usage : $transcript->add_promoter($feature); + Function: Add a promoter feature/site to this transcript. + + + Note that OO-modeling of regulatory elements is not stable yet. + This means that this method might change or even disappear in a + future release. Be aware of this if you use it. + + Returns : + Args : A Bio::SeqFeatureI implementing object. + + +=cut + +sub add_promoter { + my ($self, $fea) = @_; + $self->_add($fea,'Bio::SeqFeature::Gene::Promoter'); +} + +=head2 flush_promoters + + Title : flush_promoters() + Usage : $transcript->flush_promoters(); + Function: Remove all promoter features/sites from this transcript. + + Note that OO-modeling of regulatory elements is not stable yet. + This means that this method might change or even disappear in a + future release. Be aware of this if you use it. + + Returns : the removed features as a list + Args : none + + +=cut + +sub flush_promoters { + my ($self) = @_; + return $self->_flush('Bio::SeqFeature::Gene::Promoter'); +} + +=head2 exons + + Title : exons() + Usage : @exons = $gene->exons(); + ($inital_exon) = $gene->exons('Initial'); + Function: Get all exon features or all exons of specified type of this + transcript. + + Exon type is treated as a case-insensitive regular expression and + is optional. For consistency, use only the following types: + initial, internal, terminal. + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects. + Args : An optional string specifying the primary_tag of the feature. + + +=cut + +sub exons { + my ($self, $type) = @_; + return $self->get_unordered_feature_type('Bio::SeqFeature::Gene::ExonI', + $type); +} + +=head2 exons_ordered + + Title : exons_ordered + Usage : @exons = $gene->exons_ordered(); + @exons = $gene->exons_ordered("Internal"); + Function: Get an ordered list of all exon features or all exons of specified + type of this transcript. + + Exon type is treated as a case-insensitive regular expression and + is optional. For consistency, use only the following types: + + Returns : An array of Bio::SeqFeature::Gene::ExonI implementing objects. + Args : An optional string specifying the primary_tag of the feature. + +=cut + +sub exons_ordered { + my ($self,$type) = @_; + return $self->get_feature_type('Bio::SeqFeature::Gene::ExonI', $type); +} + +=head2 add_exon + + Title : add_exon() + Usage : $transcript->add_exon($exon,'initial'); + Function: Add a exon feature to this transcript. + + The second argument denotes the type of exon. Mixing exons with and + without a type is likely to cause trouble in exons(). Either + leave out the type for all exons or for none. + + Presently, the following types are known: initial, internal, + terminal, utr, utr5prime, and utr3prime (all case-insensitive). + UTR should better be added through utrs()/add_utr(). + + If you wish to use other or additional types, you will almost + certainly have to call exon_type_sortorder() in order to replace + the default sort order, or mrna(), cds(), protein(), and exons() + may yield unexpected results. + + Returns : + Args : A Bio::SeqFeature::Gene::ExonI implementing object. + A string indicating the type of the exon (optional). + + +=cut + +sub add_exon { + my ($self, $fea) = @_; + if(! $fea->isa('Bio::SeqFeature::Gene::ExonI') ) { + $self->throw("$fea does not implement Bio::SeqFeature::Gene::ExonI"); + } + $self->_add($fea,'Bio::SeqFeature::Gene::Exon'); +} + +=head2 flush_exons + + Title : flush_exons() + Usage : $transcript->flush_exons(); + $transcript->flush_exons('terminal'); + Function: Remove all or a certain type of exon features from this transcript. + + See add_exon() for documentation about types. + + Calling without a type will not flush UTRs. Call flush_utrs() for + this purpose. + Returns : the deleted features as a list + Args : A string indicating the type of the exon (optional). + + +=cut + +sub flush_exons { + my ($self, $type) = @_; + return $self->_flush('Bio::SeqFeature::Gene::Exon',$type); +} + +=head2 introns + + Title : introns() + Usage : @introns = $gene->introns(); + Function: Get all intron features this gene structure. + + Note that this implementation generates these features + on-the-fly, that is, it simply treats all regions between + exons as introns, assuming that exons do not overlap. A + consequence is that a consistent correspondence between the + elements in the returned array and the array that exons() + returns will exist only if the exons are properly sorted + within their types (forward for plus- strand and reverse + for minus-strand transcripts). To ensure correctness the + elements in the array returned will always be sorted. + + Returns : An array of Bio::SeqFeature::Gene::Intron objects representing + the intron regions. + Args : + + +=cut + +sub introns { + my ($self) = @_; + my @introns = (); + my @exons = $self->exons(); + my ($strand, $rev_order); + + # if there's 1 or less exons we're done + return () unless($#exons > 0); + # record strand and order (a minus-strand transcript is likely to have + # the exons stacked in reverse order) + foreach my $exon (@exons) { + $strand = $exon->strand(); + last if $strand; # we're done if we've got 1 or -1 + } + $rev_order = ($exons[0]->end() < $exons[1]->start() ? 0 : 1); + + # Make sure exons are sorted. Because we assume they don't overlap, we + # simply sort by start position. + if((! defined($strand)) || ($strand != -1) || (! $rev_order)) { + # always sort forward for plus-strand transcripts, and for negative- + # strand transcripts that appear to be unsorted or forward sorted + @exons = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [ $_, $_->start()] } @exons; + } else { + # sort in reverse order for transcripts on the negative strand and + # found to be in reverse order + @exons = map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->start()] } @exons; + } + # loop over all intervening gaps + for(my $i = 0; $i < $#exons; $i++) { + my ($start, $end); + my $intron; + + if(defined($exons[$i]->strand()) && + (($exons[$i]->strand() * $strand) < 0)) { + $self->throw("Transcript mixes plus and minus strand exons. ". + "Computing introns makes no sense then."); + } + $start = $exons[$i+$rev_order]->end() + 1; # $i or $i+1 + $end = $exons[$i+1-$rev_order]->start() - 1; # $i+1 or $i + $intron = Bio::SeqFeature::Gene::Intron->new( + '-start' => $start, + '-end' => $end, + '-strand' => $strand, + '-primary' => 'intron', + '-source' => ref($self)); + my $seq = $self->entire_seq(); + $intron->attach_seq($seq) if $seq; + $intron->seq_id($self->seq_id()); + push(@introns, $intron); + } + return @introns; +} + +=head2 poly_A_site + + Title : poly_A_site() + Usage : $polyAsite = $transcript->poly_A_site(); + Function: Get/set the poly-adenylation feature/site of this transcript. + Returns : A Bio::SeqFeatureI implementing object representing the + poly-adenylation region. + Args : A Bio::SeqFeatureI implementing object on set, or FALSE to flush + a previously set object. + + +=cut + +sub poly_A_site { + my ($self, $fea) = @_; + if ($fea) { + $self->_add($fea,'Bio::SeqFeature::Gene::Poly_A_site'); + } + return ($self->get_feature_type('Bio::SeqFeature::Gene::Poly_A_site'))[0]; +} + +=head2 utrs + + Title : utrs() + Usage : @utr_sites = $transcript->utrs('utr3prime'); + @utr_sites = $transcript->utrs('utr5prime'); + @utr_sites = $transcript->utrs(); + Function: Get the features representing untranslated regions (UTR) of this + transcript. + + You may provide an argument specifying the type of UTR. Currently + the following types are recognized: utr5prime utr3prime for UTR on the + 5' and 3' end of the CDS, respectively. + + Returns : An array of Bio::SeqFeature::Gene::UTR objects + representing the UTR regions or sites. + Args : Optionally, either utr3prime, or utr5prime for the the type of UTR + feature. + + +=cut + +sub utrs { + my ($self, $type) = @_; + return $self->get_feature_type('Bio::SeqFeature::Gene::UTR',$type); + +} + +=head2 add_utr + + Title : add_utr() + Usage : $transcript->add_utr($utrobj, 'utr3prime'); + $transcript->add_utr($utrobj); + Function: Add a UTR feature/site to this transcript. + + The second parameter is optional and denotes the type of the UTR + feature. Presently recognized types include 'utr5prime' and 'utr3prime' + for UTR on the 5' and 3' end of a gene, respectively. + + Calling this method is the same as calling + add_exon($utrobj, 'utr'.$type). In this sense a UTR object is a + special exon object, which is transcribed, not spliced out, but + not translated. + + Note that the object supplied should return FALSE for is_coding(). + Otherwise cds() and friends will become confused. + + Returns : + Args : A Bio::SeqFeature::Gene::UTR implementing object. + + +=cut + +sub add_utr { + my ($self, $fea, $type) = @_; + $self->_add($fea,'Bio::SeqFeature::Gene::UTR',$type); +} + +=head2 flush_utrs + + Title : flush_utrs() + Usage : $transcript->flush_utrs(); + $transcript->flush_utrs('utr3prime'); + Function: Remove all or a specific type of UTR features/sites from this + transcript. + + Cf. add_utr() for documentation about recognized types. + Returns : a list of the removed features + Args : Optionally a string denoting the type of UTR feature. + + +=cut + +sub flush_utrs { + my ($self, $type) = @_; + return $self->_flush('Bio::SeqFeature::Gene::UTR',$type); +} + +=head2 sub_SeqFeature + + Title : sub_SeqFeature + Usage : @feats = $transcript->sub_SeqFeature(); + Function: Returns an array of all subfeatures. + + This method is defined in Bio::SeqFeatureI. We override this here + to include the exon etc features. + + Returns : An array Bio::SeqFeatureI implementing objects. + Args : none + + +=cut + +sub sub_SeqFeature { + my ($self) = @_; + my @feas; + + # get what the parent already has + @feas = $self->SUPER::sub_SeqFeature(); + # add the features we have in addition + push(@feas, $self->exons()); # this includes UTR features + push(@feas, $self->promoters()); + push(@feas, $self->poly_A_site()) if($self->poly_A_site()); + return @feas; +} + +=head2 flush_sub_SeqFeature + + Title : flush_sub_SeqFeature + Usage : $transcript->flush_sub_SeqFeature(); + $transcript->flush_sub_SeqFeature(1); + Function: Removes all subfeatures. + + This method is overridden from Bio::SeqFeature::Generic to flush + all additional subfeatures like exons, promoters, etc., which is + almost certainly not what you want. To remove only features added + through $transcript->add_sub_SeqFeature($feature) pass any + argument evaluating to TRUE. + + Example : + Returns : none + Args : Optionally, an argument evaluating to TRUE will suppress flushing + of all transcript-specific subfeatures (exons etc.). + + +=cut + +sub flush_sub_SeqFeature { + my ($self,$fea_only) = @_; + + $self->SUPER::flush_sub_SeqFeature(); + if(! $fea_only) { + $self->flush_promoters(); + $self->flush_exons(); + $self->flush_utrs(); + $self->poly_A_site(0); + } +} + + +=head2 cds + + Title : cds + Usage : $seq = $transcript->cds(); + Function: Returns the CDS (coding sequence) as defined by the exons + of this transcript and the attached sequence. + + If no sequence is attached this method will return undef. + + Note that the implementation provided here returns a + concatenation of all coding exons, thereby assuming that + exons do not overlap. + + Note also that you cannot set the CDS via this method. Set + a single CDS feature as a single exon, or derive your own + class if you want to store a predicted CDS. + + Example : + Returns : A Bio::PrimarySeqI implementing object. + Args : + +=cut + +sub cds { + my ($self) = @_; + my @exons = $self->exons_ordered(); #this is always sorted properly according to strand + my $strand; + + return undef unless(@exons); + # record strand (a minus-strand transcript must have the exons sorted in + # reverse order) + foreach my $exon (@exons) { + if(defined($exon->strand()) && (! $strand)) { + $strand = $exon->strand(); + } + if($exon->strand() && (($exon->strand() * $strand) < 0)) { + $self->throw("Transcript mixes coding exons on plus and minus ". + "strand. This makes no sense."); + } + } + my $cds = $self->_make_cds(@exons); + return undef unless $cds; + return Bio::PrimarySeq->new('-id' => $self->seq_id(), + '-seq' => $cds, + '-alphabet' => "dna"); +} + +=head2 protein + + Title : protein() + Usage : $protein = $transcript->protein(); + Function: Get the protein encoded by the transcript as a sequence object. + + The implementation provided here simply calls translate() on the + object returned by cds(). + + Returns : A Bio::PrimarySeqI implementing object. + Args : + + +=cut + +sub protein { + my ($self) = @_; + my $seq; + + $seq = $self->cds(); + return $seq->translate() if $seq; + return undef; +} + +=head2 mrna + + Title : mrna() + Usage : $mrna = $transcript->mrna(); + Function: Get the mRNA of the transcript as a sequence object. + + The difference to cds() is that the sequence object returned by + this methods will also include UTR and the poly-adenylation site, + but not promoter sequence (TBD). + + HL: do we really need this method? + + Returns : A Bio::PrimarySeqI implementing object. + Args : + + +=cut + +sub mrna { + my ($self) = @_; + my ($seq, $mrna, $elem); + + # get the coding part + $seq = $self->cds(); + if(! $seq) { + $seq = Bio::PrimarySeq->new('-id' => $self->seq_id(), + '-alphabet' => "rna", + '-seq' => ""); + } + # get and add UTR sequences + $mrna = ""; + foreach $elem ($self->utrs('utr5prime')) { + $mrna .= $elem->seq()->seq(); + } + $seq->seq($mrna . $seq->seq()); + $mrna = ""; + foreach $elem ($self->utrs('utr3prime')) { + $mrna .= $elem->seq()->seq(); + } + $seq->seq($seq->seq() . $mrna); + if($self->poly_A_site()) { + $seq->seq($seq->seq() . $self->poly_A_site()->seq()->seq()); + } + return undef if($seq->length() == 0); + return $seq; +} + +sub _get_typed_keys { + my ($self, $keyprefix, $type) = @_; + my @keys = (); + my @feas = (); + + # make case-insensitive + $type = ($type ? lc($type) : ""); + # pull out all feature types that exist and match + @keys = grep { /^_$keyprefix$type/i; } (keys(%{$self})); + return @keys; +} + +sub _make_cds { + my ($self,@exons) = @_; + my $cds = ""; + + foreach my $exon (@exons) { + next if((! defined($exon->seq())) || (! $exon->is_coding())); + my $phase = length($cds) % 3; + # let's check the simple case + if((! defined($exon->frame())) || ($phase == $exon->frame())) { + # this one fits exactly, or frame of the exon is undefined (should + # we warn about that?); we bypass the $exon->cds() here (hmm, + # not very clean style, but I don't see where this screws up) + $cds .= $exon->seq()->seq(); + } else { + # this one is probably from exon shuffling and needs some work + my $seq = $exon->cds(); # now $seq is guaranteed to be in frame 0 + next if(! $seq); + $seq = $seq->seq(); + # adjustment needed? + if($phase > 0) { + # how many Ns can we chop off the piece to be added? + my $n_crop = 0; + if($seq =~ /^(n+)/i) { + $n_crop = length($1); + } + if($n_crop >= $phase) { + # chop off to match the phase + $seq = substr($seq, $phase); + } else { + # fill in Ns + $seq = ("n" x (3-$phase)) . $seq; + } + } + $cds .= $seq; + } + } + return $cds; +} + +=head2 features + + Title : features + Usage : my @features=$transcript->features; + Function: returns all the features associated with this transcript + Returns : a list of SeqFeatureI implementing objects + Args : none + + +=cut + + +sub features { + my ($self) = shift; + $self->{'_features'} = [] unless defined $self->{'_features'}; + return @{$self->{'_features'}}; +} + +=head2 features_ordered + + Title : features_ordered + Usage : my @features=$transcript->features_ordered; + Function: returns all the features associated with this transcript, + in order by feature start, according to strand + Returns : a list of SeqFeatureI implementing objects + Args : none + + +=cut + +sub features_ordered{ + my ($self) = @_; + return $self->_stranded_sort(@{$self->{'_features'}}); +} + + +sub get_unordered_feature_type{ + my ($self, $type, $pri)=@_; + my @list; + foreach ($self->features) { + if ($_->isa($type)) { + if ($pri && $_->primary_tag !~ /$pri/i) { + next; + } + push @list,$_; + } + } + return @list; + +} + +sub get_feature_type { + my ($self)=shift; + return $self->_stranded_sort($self->get_unordered_feature_type(@_)); +} + +#This was fixed by Gene Cutler - the indexing on the list being reversed +#fixed a bad bug. Thanks Gene! +sub _flush { + my ($self, $type, $pri)=@_; + my @list=$self->features; + my @cut; + for (reverse (0..$#list)) { + if ($list[$_]->isa($type)) { + if ($pri && $list[$_]->primary_tag !~ /$pri/i) { + next; + } + push @cut, splice @list, $_, 1; #remove the element of $type from @list + #and return each of them in @cut + } + } + $self->{'_features'}=\@list; + return reverse @cut; +} + +sub _add { + my ($self, $fea, $type)=@_; + require Bio::SeqFeature::Gene::Promoter; + require Bio::SeqFeature::Gene::UTR; + require Bio::SeqFeature::Gene::Exon; + require Bio::SeqFeature::Gene::Intron; + require Bio::SeqFeature::Gene::Poly_A_site; + + if(! $fea->isa('Bio::SeqFeatureI') ) { + $self->throw("$fea does not implement Bio::SeqFeatureI"); + } + if(! $fea->isa($type) ) { + $fea=$self->_new_of_type($fea,$type); + } + if (! $self->strand) { + $self->strand($fea->strand); + } else { + if ($self->strand * $fea->strand == -1) { + $self->throw("$fea is on opposite strand from $self"); + } + } + + $self->_expand_region($fea); + if(defined($self->entire_seq()) && (! defined($fea->entire_seq())) && + $fea->can('attach_seq')) { + $fea->attach_seq($self->entire_seq()); + } + if (defined $self->parent) { + $self->parent->_expand_region($fea); + } + push(@{$self->{'_features'}}, $fea); + 1; +} + +sub _stranded_sort { + my ($self,@list)=@_; + my $strand; + foreach my $fea (@list) { + if($fea->strand()) { + # defined and != 0 + $strand = $fea->strand() if(! $strand); + if(($fea->strand() * $strand) < 0) { + $strand = undef; + last; + } + } + } + if (defined $strand && $strand == - 1) { #reverse strand + return map { $_->[0] } sort {$b->[1] <=> $a->[1]} map { [$_, $_->start] } @list; + } else { #undef or forward strand + return map { $_->[0] } sort {$a->[1] <=> $b->[1]} map { [$_, $_->start] } @list; + } +} + +sub _new_of_type { + my ($self, $fea, $type, $pri)= @_; + my $primary; + if ($pri) { + $primary = $pri; #can set new primary tag if desired + } else { + ($primary) = $type =~ /.*::(.+)/; #or else primary is just end of type string + } + bless $fea,$type; + $fea->primary_tag($primary); + return $fea; +} + +1;