Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/LocatableSeq.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LocatableSeq.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,432 @@ +# $Id: LocatableSeq.pm,v 1.22.2.1 2003/03/31 11:49:51 heikki Exp $ +# +# BioPerl module for Bio::LocatableSeq +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LocatableSeq - A Sequence object with start/end points on it + that can be projected into a MSA or have coordinates relative to another seq. + +=head1 SYNOPSIS + + + use Bio::LocatableSeq; + my $seq = new Bio::LocatableSeq(-seq => "CAGT-GGT", + -id => "seq1", + -start => 1, + -end => 7); + + +=head1 DESCRIPTION + + + # a normal sequence object + $locseq->seq(); + $locseq->id(); + + # has start,end points + $locseq->start(); + $locseq->end(); + + # inheriets off RangeI, so range operations possible + +=head1 FEEDBACK + + +=head2 Mailing Lists + + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + + +The locatable sequence object was developed mainly because the +SimpleAlign object requires this functionality, and in the rewrite +of the Sequence object we had to decide what to do with this. + +It is, to be honest, not well integrated with the rest of bioperl, for +example, the trunc() function does not return a LocatableSeq object, +as some might have thought. There are all sorts of nasty gotcha's +about interactions between coordinate systems when these sort of +objects are used. + + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::LocatableSeq; +use vars qw(@ISA); +use strict; + +use Bio::PrimarySeq; +use Bio::RangeI; +use Bio::Location::Simple; +use Bio::Location::Fuzzy; + + +@ISA = qw(Bio::PrimarySeq Bio::RangeI); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($start,$end,$strand) = + $self->_rearrange( [qw(START END STRAND)], + @args); + + defined $start && $self->start($start); + defined $end && $self->end($end); + defined $strand && $self->strand($strand); + + return $self; # success - we hope! +} + +=head2 start + + Title : start + Usage : $obj->start($newval) + Function: + Returns : value of start + Args : newvalue (optional) + +=cut + +sub start{ + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'start'} = $value; + } + return $self->{'start'}; + +} + +=head2 end + + Title : end + Usage : $obj->end($newval) + Function: + Returns : value of end + Args : newvalue (optional) + +=cut + +sub end { + my $self = shift; + if( @_ ) { + my $value = shift; + my $string = $self->seq; + if ($string and $self->start) { + my $s2 = $string; + $string =~ s/[.-]+//g; + my $len = CORE::length $string; + my $new_end = $self->start + $len - 1 ; + my $id = $self->id; + $self->warn("In sequence $id residue count gives value $len. +Overriding value [$value] with value $new_end for Bio::LocatableSeq::end().") + and $value = $new_end if $new_end != $value and $self->verbose > 0; + } + + $self->{'end'} = $value; + } + return $self->{'end'}; + +} + +=head2 strand + + Title : strand + Usage : $obj->strand($newval) + Function: + Returns : value of strand + Args : newvalue (optional) + +=cut + +sub strand{ + my $self = shift; + if( @_ ) { + my $value = shift; + $self->{'strand'} = $value; + } + return $self->{'strand'}; +} + +=head2 get_nse + + Title : get_nse + Usage : + Function: read-only name of form id/start-end + Example : + Returns : + Args : + +=cut + +sub get_nse{ + my ($self,$char1,$char2) = @_; + + $char1 ||= "/"; + $char2 ||= "-"; + + $self->throw("Attribute id not set") unless $self->id(); + $self->throw("Attribute start not set") unless $self->start(); + $self->throw("Attribute end not set") unless $self->end(); + + return $self->id() . $char1 . $self->start . $char2 . $self->end ; + +} + + +=head2 no_gap + + Title : no_gaps + Usage :$self->no_gaps('.') + Function: + + Gets number of gaps in the sequence. The count excludes + leading or trailing gap characters. + + Valid bioperl sequence characters are [A-Za-z\-\.\*]. Of + these, '.' and '-' are counted as gap characters unless an + optional argument specifies one of them. + + Returns : number of internal gaps in the sequnce. + Args : a gap character (optional) + +=cut + +sub no_gaps { + my ($self,$char) = @_; + my ($seq, $count) = (undef, 0); + + # default gap characters + $char ||= '-.'; + + $self->warn("I hope you know what you are doing setting gap to [$char]") + unless $char =~ /[-.]/; + + $seq = $self->seq; + return 0 unless $seq; # empty sequence does not have gaps + + $seq =~ s/^([$char]+)//; + $seq =~ s/([$char]+)$//; + $count++ while $seq =~ /[$char]+/g; + + return $count; + +} + + +=head2 column_from_residue_number + + Title : column_from_residue_number + Usage : $col = $seq->column_from_residue_number($resnumber) + Function: + + This function gives the position in the alignment + (i.e. column number) of the given residue number in the + sequence. For example, for the sequence + + Seq1/91-97 AC..DEF.GH + + column_from_residue_number(94) returns 5. + + An exception is thrown if the residue number would lie + outside the length of the aligment + (e.g. column_from_residue_number( "Seq2", 22 ) + + Returns : A column number for the position of the + given residue in the given sequence (1 = first column) + Args : A residue number in the whole sequence (not just that + segment of it in the alignment) + +=cut + +sub column_from_residue_number { + my ($self, $resnumber) = @_; + + $self->throw("Residue number has to be a positive integer, not [$resnumber]") + unless $resnumber =~ /^\d+$/ and $resnumber > 0; + + if ($resnumber >= $self->start() and $resnumber <= $self->end()) { + my @residues = split //, $self->seq; + my $count = $self->start(); + my $i; + for ($i=0; $i < @residues; $i++) { + if ($residues[$i] ne '.' and $residues[$i] ne '-') { + $count == $resnumber and last; + $count++; + } + } + # $i now holds the index of the column. + # The actual column number is this index + 1 + + return $i+1; + } + + $self->throw("Could not find residue number $resnumber"); + +} + + +=head2 location_from_column + + Title : location_from_column + Usage : $loc = $ali->location_from_column( $seq_number, $column_number) + Function: + + This function gives the residue number in the sequence with + the given name for a given position in the alignment + (i.e. column number) of the given. Gaps complicate this + process and force the output to be a L<Bio::Range> where + values can be undefined. For example, for the alignment + + Seq1/91-97 AC..DEF.G. + Seq2/1-9 .CYHDEFKGK + + location_from_column( Seq1/91-97, 3 ) position 93 + location_from_column( Seq1/91-97, 2 ) position 92^93 + location_from_column( Seq1/91-97, 10) position 97^98 + location_from_column( Seq2/1-9, 1 ) position undef + + An exact position returns a Bio::Location::Simple object + where where location_type() returns 'EXACT', if a position + is between bases location_type() returns 'IN-BETWEEN'. + Column before the first residue returns undef. Note that if + the position is after the last residue in the alignment, + that there is no guarantee that the original sequence has + residues after that position. + + An exception is thrown if the column number is not within + the sequence. + + Returns : Bio::Location::Simple or undef + Args : A column number + Throws : If column is not within the sequence + +See L<Bio::Location::Simple> for more. + +=cut + +sub location_from_column { + my ($self, $column) = @_; + + $self->throw("Column number has to be a positive integer, not [$column]") + unless $column =~ /^\d+$/ and $column > 0; + $self->throw("Column number [column] is larger than". + " sequence length [". $self->length. "]") + unless $column <= $self->length; + + my ($loc); + my $s = $self->subseq(1,$column); + $s =~ s/\W//g; + my $pos = CORE::length $s; + + my $start = $self->start || 0 ; + if ($self->subseq($column, $column) =~ /[a-zA-Z]/ ) { + $loc = new Bio::Location::Simple + (-start => $pos + $start - 1, + -end => $pos + $start - 1, + -strand => 1 + ); + } + elsif ($pos == 0 and $self->start == 1) { + } else { + $loc = new Bio::Location::Simple + (-start => $pos + $start - 1, + -end => $pos +1 + $start - 1, + -strand => 1, + -location_type => 'IN-BETWEEN' + ); + } + return $loc; +} + +=head2 revcom + + Title : revcom + Usage : $rev = $seq->revcom() + Function: Produces a new Bio::LocatableSeq object which + has the reversed complement of the sequence. For protein + sequences this throws an exception of "Sequence is a + protein. Cannot revcom" + + Returns : A new Bio::LocatableSeq object + Args : none + +=cut + +sub revcom { + my ($self) = @_; + + my $new = $self->SUPER::revcom; + $new->strand($self->strand * -1); + $new->start($self->start) if $self->start; + $new->end($self->end) if $self->end; + return $new; +} + + +=head2 trunc + + Title : trunc + Usage : $subseq = $myseq->trunc(10,100); + Function: Provides a truncation of a sequence, + + Example : + Returns : a fresh Bio::PrimarySeqI implementing object + Args : Two integers denoting first and last columns of the + sequence to be included into sub-sequence. + + +=cut + +sub trunc { + + my ($self, $start, $end) = @_; + my $new = $self->SUPER::trunc($start, $end); + + $start = $self->location_from_column($start); + $start ? ($start = $start->start) : ($start = 1); + + $end = $self->location_from_column($end); + $end = $end->start if $end; + + $new->strand($self->strand); + $new->start($start) if $start; + $new->end($end) if $end; + + return $new; +} + +1;