Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Location/Fuzzy.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Location/Fuzzy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,560 @@ +# $Id: Fuzzy.pm,v 1.24 2002/12/01 00:05:20 jason Exp $ +# +# BioPerl module for Bio::Location::Fuzzy +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Location::Fuzzy - Implementation of a Location on a Sequence +which has unclear start and/or end locations + +=head1 SYNOPSIS + + use Bio::Location::Fuzzy; + my $fuzzylocation = new Bio::Location::Fuzzy(-start => '<30', + -end => 90, + -location_type => '.'); + + print "location string is ", $fuzzylocation->to_FTstring(), "\n"; + print "location is of the type ", $fuzzylocation->location_type, "\n"; + +=head1 DESCRIPTION + +This module contains the necessary methods for representing a +Fuzzy Location, one that does not have clear start and/or end points. +This will initially serve to handle features from Genbank/EMBL feature +tables that are written as 1^100 meaning between bases 1 and 100 or +E<lt>100..300 meaning it starts somewhere before 100. Advanced +implementations of this interface may be able to handle the necessary +logic of overlaps/intersection/contains/union. It was constructed to +handle fuzzy locations that can be represented in Genbank/EMBL. + +=head1 FEEDBACK + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Location::Fuzzy; +use vars qw(@ISA ); +use strict; + +use Bio::Location::FuzzyLocationI; +use Bio::Location::Atomic; + +@ISA = qw(Bio::Location::Atomic Bio::Location::FuzzyLocationI ); + +BEGIN { + use vars qw( %FUZZYCODES %FUZZYPOINTENCODE %FUZZYRANGEENCODE + @LOCATIONCODESBSANE ); + + @LOCATIONCODESBSANE = (undef, 'EXACT', 'WITHIN', 'BETWEEN', + 'BEFORE', 'AFTER'); + + %FUZZYCODES = ( 'EXACT' => '..', # Position is 'exact + # Exact position is unknown, but is within the range specified, ((1.2)..100) + 'WITHIN' => '.', + # 1^2 + 'BETWEEN' => '^', + # <100 + 'BEFORE' => '<', + # >10 + 'AFTER' => '>'); + + # The following regular expressions map to fuzzy location types. Every + # expression must match the complete encoded point string, and must + # contain two groups identifying min and max. Empty matches are automatic. + # converted to undef, except for 'EXACT', for which max is set to equal + # min. + %FUZZYPOINTENCODE = ( + '\>(\d+)(.{0})' => 'AFTER', + '\<(.{0})(\d+)' => 'BEFORE', + '(\d+)' => 'EXACT', + '(\d+)(.{0})\>' => 'AFTER', + '(.{0})(\d+)\<' => 'BEFORE', + '(\d+)\.(\d+)' => 'WITHIN', + '(\d+)\^(\d+)' => 'BETWEEN', + ); + + %FUZZYRANGEENCODE = ( '\.' => 'WITHIN', + '\.\.' => 'EXACT', + '\^' => 'BETWEEN' ); + +} + +=head2 new + + Title : new + Usage : my $fuzzyloc = new Bio::Location::Fuzzy( @args); + Function: + Returns : + Args : -start => value for start (initialize by superclass) + -end => value for end (initialize by superclass) + -strand => value for strand (initialize by superclass) + -location_type => either ('EXACT', 'WITHIN', 'BETWEEN') OR + ( 1,2,3) + -start_ext=> extension for start - defaults to 0, + -start_fuz= fuzzy code for start can be + ( 'EXACT', 'WITHIN', 'BETWEEN', 'BEFORE', 'AFTER') OR + a value 1 - 5 corresponding to index+1 above + -end_ext=> extension for end - defaults to 0, + -end_fuz= fuzzy code for end can be + ( 'EXACT', 'WITHIN', 'BETWEEN', 'BEFORE', 'AFTER') OR + a value 1 - 5 corresponding to index+1 above + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + my ($location_type, $start_ext, $start_fuz, $end_ext, $end_fuz) = + $self->_rearrange([ qw(LOCATION_TYPE START_EXT START_FUZ + END_EXT END_FUZ ) + ], @args); + + $location_type && $self->location_type($location_type); + $start_ext && $self->max_start($self->min_start + $start_ext); + $end_ext && $self->max_end($self->min_end + $end_ext); + $start_fuz && $self->start_pos_type($start_fuz); + $end_fuz && $self->end_pos_type($end_fuz); + + return $self; +} + +=head2 location_type + + Title : location_type + Usage : my $location_type = $location->location_type(); + Function: Get location type encoded as text + Returns : string ('EXACT', 'WITHIN', 'BETWEEN') + Args : none + +=cut + +sub location_type { + my ($self,$value) = @_; + if( defined $value || ! defined $self->{'_location_type'} ) { + $value = 'EXACT' unless defined $value; + if(! defined $FUZZYCODES{$value}) { + $value = uc($value); + if( $value =~ /\.\./ ) { + $value = 'EXACT'; + } elsif( $value =~ /^\.$/ ) { + $value = 'WITHIN'; + } elsif( $value =~ /\^/ ) { + $value = 'BETWEEN'; + + + $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [". $self->start. "] and [". $self->end. "]") + if defined $self->start && defined $self->end && ($self->end - 1 == $self->start); + + + } elsif( $value ne 'EXACT' && $value ne 'WITHIN' && + $value ne 'BETWEEN' ) { + $self->throw("Did not specify a valid location type"); + } + } + $self->{'_location_type'} = $value; + } + return $self->{'_location_type'}; +} + +=head1 LocationI methods + +=head2 length + + Title : length + Usage : $length = $fuzzy_loc->length(); + Function: Get the length of this location. + + Note that the length of a fuzzy location will always depend + on the currently active interpretation of start and end. The + result will therefore vary for different CoordinatePolicy objects. + + Returns : an integer + Args : none + +=cut + +#sub length { +# my($self) = @_; +# return $self->SUPER::length() if( !$self->start || !$self->end); +# $self->warn('Length is not valid for a FuzzyLocation'); +# return 0; +#} + +=head2 start + + Title : start + Usage : $start = $fuzzy->start(); + Function: get/set start of this range, handling fuzzy_starts + Returns : a positive integer representing the start of the location + Args : start location on set (can be fuzzy point string) + +=cut + +sub start { + my($self,$value) = @_; + if( defined $value ) { + my ($encode,$min,$max) = $self->_fuzzypointdecode($value); + $self->start_pos_type($encode); + $self->min_start($min); + $self->max_start($max); + } + + $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [". $self->SUPER::start. "] and [". $self->SUPER::end. "]") + if $self->location_type eq 'BETWEEN' && defined $self->SUPER::end && ($self->SUPER::end - 1 == $self->SUPER::start); + + return $self->SUPER::start(); +} + +=head2 end + + Title : end + Usage : $end = $fuzzy->end(); + Function: get/set end of this range, handling fuzzy_ends + Returns : a positive integer representing the end of the range + Args : end location on set (can be fuzzy string) + +=cut + +sub end { + my($self,$value) = @_; + if( defined $value ) { + my ($encode,$min,$max) = $self->_fuzzypointdecode($value); + $self->end_pos_type($encode); + $self->min_end($min); + $self->max_end($max); + } + + $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [". $self->SUPER::start. "] and [". $self->SUPER::end. "]") + if $self->location_type eq 'BETWEEN' && defined $self->SUPER::start && ($self->SUPER::end - 1 == $self->SUPER::start); + + return $self->SUPER::end(); +} + +=head2 min_start + + Title : min_start + Usage : $min_start = $fuzzy->min_start(); + Function: get/set the minimum starting point + Returns : the minimum starting point from the contained sublocations + Args : integer or undef on set + +=cut + +sub min_start { + my ($self,@args) = @_; + + if(@args) { + $self->{'_min_start'} = $args[0]; # the value may be undef! + } + return $self->{'_min_start'}; +} + +=head2 max_start + + Title : max_start + Usage : my $maxstart = $location->max_start(); + Function: Get/set maximum starting location of feature startpoint + Returns : integer or undef if no maximum starting point. + Args : integer or undef on set + +=cut + +sub max_start { + my ($self,@args) = @_; + + if(@args) { + $self->{'_max_start'} = $args[0]; # the value may be undef! + } + return $self->{'_max_start'}; +} + +=head2 start_pos_type + + Title : start_pos_type + Usage : my $start_pos_type = $location->start_pos_type(); + Function: Get/set start position type. + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : a string on set + +=cut + +sub start_pos_type { + my ($self,$value) = @_; + if(defined $value && $value =~ /^\d+$/ ) { + if( $value == 0 ) { $value = 'EXACT'; } + else { + my $v = $LOCATIONCODESBSANE[$value]; + if( ! defined $v ) { + $self->warn("Provided value $value which I don't understand, reverting to 'EXACT'"); + $v = 'EXACT'; + } + $value = $v; + } + } + if(defined($value)) { + $self->{'_start_pos_type'} = $value; + } + return $self->{'_start_pos_type'}; +} + +=head2 min_end + + Title : min_end + Usage : my $minend = $location->min_end(); + Function: Get/set minimum ending location of feature endpoint + Returns : integer or undef if no minimum ending point. + Args : integer or undef on set + +=cut + +sub min_end { + my ($self,@args) = @_; + + if(@args) { + $self->{'_min_end'} = $args[0]; # the value may be undef! + } + return $self->{'_min_end'}; +} + +=head2 max_end + + Title : max_end + Usage : my $maxend = $location->max_end(); + Function: Get/set maximum ending location of feature endpoint + Returns : integer or undef if no maximum ending point. + Args : integer or undef on set + +=cut + +sub max_end { + my ($self,@args) = @_; + + if(@args) { + $self->{'_max_end'} = $args[0]; # the value may be undef! + } + return $self->{'_max_end'}; +} + +=head2 end_pos_type + + Title : end_pos_type + Usage : my $end_pos_type = $location->end_pos_type(); + Function: Get/set end position type. + Returns : type of position coded as text + ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') + Args : a string on set + +=cut + +sub end_pos_type { + my ($self,$value) = @_; + if( defined $value && $value =~ /^\d+$/ ) { + if( $value == 0 ) { $value = 'EXACT'; } + else { + my $v = $LOCATIONCODESBSANE[$value]; + if( ! defined $v ) { + $self->warn("Provided value $value which I don't understand, reverting to 'EXACT'"); + $v = 'EXACT'; + } + $value = $v; + } + } + + if(defined($value)) { + $self->{'_end_pos_type'} = $value; + } + return $self->{'_end_pos_type'}; +} + +=head2 seq_id + + Title : seq_id + Usage : my $seqid = $location->seq_id(); + Function: Get/Set seq_id that location refers to + Returns : seq_id + Args : [optional] seq_id value to set + +=cut + +=head2 coordinate_policy + + Title : coordinate_policy + Usage : $policy = $location->coordinate_policy(); + $location->coordinate_policy($mypolicy); # set may not be possible + Function: Get the coordinate computing policy employed by this object. + + See Bio::Location::CoordinatePolicyI for documentation about + the policy object and its use. + + The interface *does not* require implementing classes to accept + setting of a different policy. The implementation provided here + does, however, allow to do so. + + Implementors of this interface are expected to initialize every + new instance with a CoordinatePolicyI object. The implementation + provided here will return a default policy object if none has + been set yet. To change this default policy object call this + method as a class method with an appropriate argument. Note that + in this case only subsequently created Location objects will be + affected. + + Returns : A Bio::Location::CoordinatePolicyI implementing object. + Args : On set, a Bio::Location::CoordinatePolicyI implementing object. + +=cut + +=head2 to_FTstring + + Title : to_FTstring + Usage : my $locstr = $location->to_FTstring() + Function: Get/Set seq_id that location refers to + Returns : seq_id + Args : [optional] seq_id value to set + +=cut + +sub to_FTstring { + my ($self) = @_; + my (%vals) = ( 'start' => $self->start, + 'min_start' => $self->min_start, + 'max_start' => $self->max_start, + 'start_code' => $self->start_pos_type, + 'end' => $self->end, + 'min_end' => $self->min_end, + 'max_end' => $self->max_end, + 'end_code' => $self->end_pos_type ); + + my (%strs) = ( 'start' => '', + 'end' => ''); + my ($delimiter) = $FUZZYCODES{$self->location_type}; + # I'm lazy, lets do this in a loop since behaviour will be the same for + # start and end + foreach my $point ( qw(start end) ) { + if( $vals{$point."_code"} ne 'EXACT' ) { + + if( (!defined $vals{"min_$point"} || + !defined $vals{"max_$point"}) + && ( $vals{$point."_code"} eq 'WITHIN' || + $vals{$point."_code"} eq 'BETWEEN') + ) { + $vals{"min_$point"} = '' unless defined $vals{"min_$point"}; + $vals{"max_$point"} = '' unless defined $vals{"max_$point"}; + + $self->warn("Fuzzy codes for start are in a strange state, (". + join(",", ($vals{"min_$point"}, + $vals{"max_$point"}, + $vals{$point."_code"})). ")"); + return ''; + } + if( defined $vals{$point."_code"} && + ($vals{$point."_code"} eq 'BEFORE' || + $vals{$point."_code"} eq 'AFTER') + ) { + $strs{$point} .= $FUZZYCODES{$vals{$point."_code"}}; + } + if( defined $vals{"min_$point"} ) { + $strs{$point} .= $vals{"min_$point"}; + } + if( defined $vals{$point."_code"} && + ($vals{$point."_code"} eq 'WITHIN' || + $vals{$point."_code"} eq 'BETWEEN') + ) { + $strs{$point} .= $FUZZYCODES{$vals{$point."_code"}}; + } + if( defined $vals{"max_$point"} ) { + $strs{$point} .= $vals{"max_$point"}; + } + if(($vals{$point."_code"} eq 'WITHIN') || + ($vals{$point."_code"} eq 'BETWEEN')) { + $strs{$point} = "(".$strs{$point}.")"; + } + } else { + $strs{$point} = $vals{$point}; + } + + } + my $str = $strs{'start'} . $delimiter . $strs{'end'}; + if($self->is_remote() && $self->seq_id()) { + $str = $self->seq_id() . ":" . $str; + } + if( $self->strand == -1 ) { + $str = "complement(" . $str . ")"; + } elsif($self->location_type() eq "WITHIN") { + $str = "(".$str.")"; + } + return $str; +} + +=head2 _fuzzypointdecode + + Title : _fuzzypointdecode + Usage : ($type,$min,$max) = $self->_fuzzypointdecode('<5'); + Function: Decode a fuzzy string. + Returns : A 3-element array consisting of the type of location, the + minimum integer, and the maximum integer describing the range + of coordinates this start or endpoint refers to. Minimum or + maximum coordinate may be undefined. + : Returns empty array on fail. + Args : fuzzypoint string + +=cut + +sub _fuzzypointdecode { + my ($self, $string) = @_; + return () if( !defined $string); + # strip off leading and trailing space + $string =~ s/^\s*(\S+)\s*/$1/; + foreach my $pattern ( keys %FUZZYPOINTENCODE ) { + if( $string =~ /^$pattern$/ ) { + my ($min,$max) = ($1,$2); + if($FUZZYPOINTENCODE{$pattern} eq 'EXACT') { + $max = $min; + } else { + $max = undef if(length($max) == 0); + $min = undef if(length($min) == 0); + } + return ($FUZZYPOINTENCODE{$pattern},$min,$max); + } + } + if( $self->verbose >= 1 ) { + $self->warn("could not find a valid fuzzy encoding for $string"); + } + return (); +} + +1; +