Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/EnsEMBL/Mapper/RangeRegistry.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper/RangeRegistry.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,376 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Mapper::RangeRegistry + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Mapper::RangeRegistry; + + $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + + # Get a fixed width chunk around the range of intereset. This + # will be used if any registration is actually necessary. + $chunk_start = ( $start >> 20 ) << 20 + 1; + $chunk_end = ( ( $end >> 20 ) + 1 ) << 20; + + # Check if any registration is necessary for the range. If it is + # register a large chunked area instead and return a listref of + # unregistered areas that need to be loaded. + if ( + $pairs = $rr->check_and_register( + $id, $start, $end, $chunk_start, $chunk_end + ) ) + { + foreach my $pair (@$pairs) { + my ( $pair_start, $pair_end ) = @$pair; + # Fetch mappings for these regions from the assembly table and + # load them into the mapper. + ...; + } + } else { + # The range ($start - $end) is already registered + ...; + } + + # Check if any registration is necessary. If it is register the + # region and return a listref of pairs that need to be loaded. + if ( $pairs = $rr->check_and_register( $id, $start, $end ) ) { + ...; + } + +=head1 DESCRIPTION + +This module maintains an internal list of registered regions and is +used to quickly ascertain if and what regions of a provided range need +registration. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Mapper::RangeRegistry; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use integer; + +=head2 new + + Arg [1] : none + Example : my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); + Description: Creates a new RangeRegistry object + Returntype : Bio::EnsEMBL::Mapper::RangeRegistry + Exceptions : none + Caller : AssemblyMapperAdaptor + Status : Stable + +=cut + +sub new { + my ($proto) = @_; + + my $class = ref($proto) || $proto; + + return bless( { 'registry' => {} }, $class ); +} + +sub flush { + my ($self) = @_; + $self->{'registry'} = {}; +} + +=head2 check_and_register + + Arg [1] : string $id + The id of the range to be checked/registered (e.g. a sequenceid) + Arg [2] : int $start + The start of the range to be checked + Arg [3] : int $end + The end of the range to be checked + Arg [4] : (optional) int $rstart + The start of the range to be registered + if the checked range was not fully registered + Arg [5] : (optional) int $rend + The end of the range to be registerd + if the checked range was not fully registered + Example : $ranges=$rr->check_and_register('X',500,600, 1,1_000_000)); + Description: Checks the range registry to see if the entire range + denoted by ($id : $start-$end) is already registered. If + it already is, then undef is returned. If it is not then + the range specified by $rstart and $rend is registered, and + a list of regions that were required to completely register + $rstart-$rend is returned. If $rstart and $rend are not + defined they default to $start and $end respectively. + + The reason there is a single call to do both the checking and + registering is to reduce the overhead. Much of the work to + check if a range is registered is the same as registering a + region around that range. + Returntype : undef or listref of [start,end] range pairs + Exceptions : throw if rstart is greater than start + throw if rend is less than end + throw if end is less than start + throw if id, start, or end are not defined + Caller : AssemblyMapperAdaptor + Status : Stable + +=cut + +#"constants" +my $START = 0; +my $END = 1; + +sub check_and_register { + my ( $self, $id, $start, $end, $rstart, $rend ) = @_; + + $rstart = $start if ( !defined($rstart) ); + $rend = $end if ( !defined($rend) ); + + # + # Sanity checks + # + if ( !defined($id) || !defined($start) || !defined($end) ) { + throw("ID, start, end arguments are required"); + } + + # The following was commented out due to Ensembl Genomes requirements + # for bacterial genomes. + # + ## if ( $start > $end ) { + ## throw( "start argument [$start] must be less than " + ## . "(or equal to) end argument [$end]" ); + ## } + ## + ## if ( $rstart > $rend ) { + ## throw( "rstart argument [$rstart] must be less than " + ## . "(or equal to) rend argument [$rend] argument" ); + ## } + + if ( $rstart > $start ) { + throw("rstart [$rstart] must be less than or equal to start [$start]"); + } + + if ( $rend < $end ) { + throw("rend [$rend] must be greater than or equal to end [$end]"); + } + + my $reg = $self->{'registry'}; + my $list = $reg->{$id} ||= []; + + my @gap_pairs; + + my $len = scalar(@$list); + + if ( $len == 0 ) { + # this is the first request for this id, return a gap pair for the + # entire range and register it as seen + $list->[0] = [ $rstart, $rend ]; + return [ [ $rstart, $rend ] ]; + } + + ##### + # loop through the list of existing ranges recording any "gaps" where + # the existing range does not cover part of the requested range + # + + my $start_idx = 0; + my $end_idx = $#$list; + my ( $mid_idx, $range ); + + # binary search the relevant pairs + # helps if the list is big + while ( ( $end_idx - $start_idx ) > 1 ) { + $mid_idx = ( $start_idx + $end_idx ) >> 1; + $range = $list->[$mid_idx]; + + if ( $range->[1] < $rstart ) { + $start_idx = $mid_idx; + } else { + $end_idx = $mid_idx; + } + } + + my ( $gap_start, $gap_end, $r_idx, $rstart_idx, $rend_idx ); + $gap_start = $rstart; + + for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) { + my ( $pstart, $pend ) = @{ $list->[$CUR] }; + + # no work needs to be done at all if we find a range pair that + # entirely overlaps the requested region + if ( $pstart <= $start && $pend >= $end ) { + return undef; + } + + # find adjacent or overlapping regions already registered + if ( $pend >= ( $rstart - 1 ) && $pstart <= ( $rend + 1 ) ) { + if ( !defined($rstart_idx) ) { + $rstart_idx = $CUR; + } + $rend_idx = $CUR; + } + + if ( $pstart > $rstart ) { + $gap_end = ( $rend < $pstart ) ? $rend : $pstart - 1; + push @gap_pairs, [ $gap_start, $gap_end ]; + } + + $gap_start = ( $rstart > $pend ) ? $rstart : $pend + 1; + + # if($pstart > $rend && !defined($r_idx)) { + if ( $pend >= $rend && !defined($r_idx) ) { + $r_idx = $CUR; + last; + } + } ## end for ( my $CUR = $start_idx... + + # do we have to make another gap? + if ( $gap_start <= $rend ) { + push @gap_pairs, [ $gap_start, $rend ]; + } + + # + # Merge the new range into the registered list + # + if ( defined($rstart_idx) ) { + my ( $new_start, $new_end ); + + if ( $rstart < $list->[$rstart_idx]->[0] ) { + $new_start = $rstart; + } else { + $new_start = $list->[$rstart_idx]->[0]; + } + + if ( $rend > $list->[$rend_idx]->[1] ) { + $new_end = $rend; + } else { + $new_end = $list->[$rend_idx]->[1]; + } + + splice( @$list, $rstart_idx, + $rend_idx - $rstart_idx + 1, + [ $new_start, $new_end ] ); + + } elsif ( defined($r_idx) ) { + splice( @$list, $r_idx, 0, [ $rstart, $rend ] ); + } else { + push( @$list, [ $rstart, $rend ] ); + } + + return \@gap_pairs; +} ## end sub check_and_register + +# overlap size is just added to make RangeRegistry generally more useful + +=head2 overlap_size + + Arg [1] : string $id + Arg [2] : int $start + Arg [3] : int $end + Example : my $overlap_size = $rr->( "chr1", 1, 100_000_000 ) + Description: finds out how many bases of the given range are registered + Returntype : int + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub overlap_size { + my ( $self, $id, $start, $end ) = @_; + + my $overlap = 0; + + if ( $start > $end ) { return 0 } + + my $reg = $self->{'registry'}; + my $list = $reg->{$id} ||= []; + + my $len = scalar(@$list); + + if ( $len == 0 ) { + # this is the first request for this id, return a gap pair for the + # entire range and register it as seen + return 0; + } + + ##### + # loop through the list of existing ranges recording any "gaps" where + # the existing range does not cover part of the requested range + # + + my $start_idx = 0; + my $end_idx = $#$list; + my ( $mid_idx, $range ); + + # binary search the relevant pairs + # helps if the list is big + while ( ( $end_idx - $start_idx ) > 1 ) { + $mid_idx = ( $start_idx + $end_idx ) >> 1; + $range = $list->[$mid_idx]; + if ( $range->[1] < $start ) { + $start_idx = $mid_idx; + } else { + $end_idx = $mid_idx; + } + } + + for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) { + my ( $pstart, $pend ) = @{ $list->[$CUR] }; + + if ( $pstart > $end ) { + # No more interesting ranges here. + last; + } + + # no work needs to be done at all if we find a range pair that + # entirely overlaps the requested region + if ( $pstart <= $start && $pend >= $end ) { + $overlap = $end - $start + 1; + last; + } + + my $mstart = ( $start < $pstart ? $pstart : $start ); + my $mend = ( $end < $pend ? $end : $pend ); + + if ( $mend - $mstart >= 0 ) { + $overlap += ( $mend - $mstart + 1 ); + } + } + + return $overlap; +} ## end sub overlap_size + + +# low level function to access the ranges +# only use for read access + +sub get_ranges { + my ( $self, $id ) = @_; + + return $self->{'registry'}->{$id}; +} + +1; +