comparison variant_effect_predictor/Bio/EnsEMBL/Mapper/RangeRegistry.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Mapper::RangeRegistry
24
25 =head1 SYNOPSIS
26
27 use Bio::EnsEMBL::Mapper::RangeRegistry;
28
29 $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new();
30
31 # Get a fixed width chunk around the range of intereset. This
32 # will be used if any registration is actually necessary.
33 $chunk_start = ( $start >> 20 ) << 20 + 1;
34 $chunk_end = ( ( $end >> 20 ) + 1 ) << 20;
35
36 # Check if any registration is necessary for the range. If it is
37 # register a large chunked area instead and return a listref of
38 # unregistered areas that need to be loaded.
39 if (
40 $pairs = $rr->check_and_register(
41 $id, $start, $end, $chunk_start, $chunk_end
42 ) )
43 {
44 foreach my $pair (@$pairs) {
45 my ( $pair_start, $pair_end ) = @$pair;
46 # Fetch mappings for these regions from the assembly table and
47 # load them into the mapper.
48 ...;
49 }
50 } else {
51 # The range ($start - $end) is already registered
52 ...;
53 }
54
55 # Check if any registration is necessary. If it is register the
56 # region and return a listref of pairs that need to be loaded.
57 if ( $pairs = $rr->check_and_register( $id, $start, $end ) ) {
58 ...;
59 }
60
61 =head1 DESCRIPTION
62
63 This module maintains an internal list of registered regions and is
64 used to quickly ascertain if and what regions of a provided range need
65 registration.
66
67 =head1 METHODS
68
69 =cut
70
71 package Bio::EnsEMBL::Mapper::RangeRegistry;
72
73 use strict;
74
75 use Bio::EnsEMBL::Utils::Exception qw(throw);
76 use integer;
77
78 =head2 new
79
80 Arg [1] : none
81 Example : my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new();
82 Description: Creates a new RangeRegistry object
83 Returntype : Bio::EnsEMBL::Mapper::RangeRegistry
84 Exceptions : none
85 Caller : AssemblyMapperAdaptor
86 Status : Stable
87
88 =cut
89
90 sub new {
91 my ($proto) = @_;
92
93 my $class = ref($proto) || $proto;
94
95 return bless( { 'registry' => {} }, $class );
96 }
97
98 sub flush {
99 my ($self) = @_;
100 $self->{'registry'} = {};
101 }
102
103 =head2 check_and_register
104
105 Arg [1] : string $id
106 The id of the range to be checked/registered (e.g. a sequenceid)
107 Arg [2] : int $start
108 The start of the range to be checked
109 Arg [3] : int $end
110 The end of the range to be checked
111 Arg [4] : (optional) int $rstart
112 The start of the range to be registered
113 if the checked range was not fully registered
114 Arg [5] : (optional) int $rend
115 The end of the range to be registerd
116 if the checked range was not fully registered
117 Example : $ranges=$rr->check_and_register('X',500,600, 1,1_000_000));
118 Description: Checks the range registry to see if the entire range
119 denoted by ($id : $start-$end) is already registered. If
120 it already is, then undef is returned. If it is not then
121 the range specified by $rstart and $rend is registered, and
122 a list of regions that were required to completely register
123 $rstart-$rend is returned. If $rstart and $rend are not
124 defined they default to $start and $end respectively.
125
126 The reason there is a single call to do both the checking and
127 registering is to reduce the overhead. Much of the work to
128 check if a range is registered is the same as registering a
129 region around that range.
130 Returntype : undef or listref of [start,end] range pairs
131 Exceptions : throw if rstart is greater than start
132 throw if rend is less than end
133 throw if end is less than start
134 throw if id, start, or end are not defined
135 Caller : AssemblyMapperAdaptor
136 Status : Stable
137
138 =cut
139
140 #"constants"
141 my $START = 0;
142 my $END = 1;
143
144 sub check_and_register {
145 my ( $self, $id, $start, $end, $rstart, $rend ) = @_;
146
147 $rstart = $start if ( !defined($rstart) );
148 $rend = $end if ( !defined($rend) );
149
150 #
151 # Sanity checks
152 #
153 if ( !defined($id) || !defined($start) || !defined($end) ) {
154 throw("ID, start, end arguments are required");
155 }
156
157 # The following was commented out due to Ensembl Genomes requirements
158 # for bacterial genomes.
159 #
160 ## if ( $start > $end ) {
161 ## throw( "start argument [$start] must be less than "
162 ## . "(or equal to) end argument [$end]" );
163 ## }
164 ##
165 ## if ( $rstart > $rend ) {
166 ## throw( "rstart argument [$rstart] must be less than "
167 ## . "(or equal to) rend argument [$rend] argument" );
168 ## }
169
170 if ( $rstart > $start ) {
171 throw("rstart [$rstart] must be less than or equal to start [$start]");
172 }
173
174 if ( $rend < $end ) {
175 throw("rend [$rend] must be greater than or equal to end [$end]");
176 }
177
178 my $reg = $self->{'registry'};
179 my $list = $reg->{$id} ||= [];
180
181 my @gap_pairs;
182
183 my $len = scalar(@$list);
184
185 if ( $len == 0 ) {
186 # this is the first request for this id, return a gap pair for the
187 # entire range and register it as seen
188 $list->[0] = [ $rstart, $rend ];
189 return [ [ $rstart, $rend ] ];
190 }
191
192 #####
193 # loop through the list of existing ranges recording any "gaps" where
194 # the existing range does not cover part of the requested range
195 #
196
197 my $start_idx = 0;
198 my $end_idx = $#$list;
199 my ( $mid_idx, $range );
200
201 # binary search the relevant pairs
202 # helps if the list is big
203 while ( ( $end_idx - $start_idx ) > 1 ) {
204 $mid_idx = ( $start_idx + $end_idx ) >> 1;
205 $range = $list->[$mid_idx];
206
207 if ( $range->[1] < $rstart ) {
208 $start_idx = $mid_idx;
209 } else {
210 $end_idx = $mid_idx;
211 }
212 }
213
214 my ( $gap_start, $gap_end, $r_idx, $rstart_idx, $rend_idx );
215 $gap_start = $rstart;
216
217 for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) {
218 my ( $pstart, $pend ) = @{ $list->[$CUR] };
219
220 # no work needs to be done at all if we find a range pair that
221 # entirely overlaps the requested region
222 if ( $pstart <= $start && $pend >= $end ) {
223 return undef;
224 }
225
226 # find adjacent or overlapping regions already registered
227 if ( $pend >= ( $rstart - 1 ) && $pstart <= ( $rend + 1 ) ) {
228 if ( !defined($rstart_idx) ) {
229 $rstart_idx = $CUR;
230 }
231 $rend_idx = $CUR;
232 }
233
234 if ( $pstart > $rstart ) {
235 $gap_end = ( $rend < $pstart ) ? $rend : $pstart - 1;
236 push @gap_pairs, [ $gap_start, $gap_end ];
237 }
238
239 $gap_start = ( $rstart > $pend ) ? $rstart : $pend + 1;
240
241 # if($pstart > $rend && !defined($r_idx)) {
242 if ( $pend >= $rend && !defined($r_idx) ) {
243 $r_idx = $CUR;
244 last;
245 }
246 } ## end for ( my $CUR = $start_idx...
247
248 # do we have to make another gap?
249 if ( $gap_start <= $rend ) {
250 push @gap_pairs, [ $gap_start, $rend ];
251 }
252
253 #
254 # Merge the new range into the registered list
255 #
256 if ( defined($rstart_idx) ) {
257 my ( $new_start, $new_end );
258
259 if ( $rstart < $list->[$rstart_idx]->[0] ) {
260 $new_start = $rstart;
261 } else {
262 $new_start = $list->[$rstart_idx]->[0];
263 }
264
265 if ( $rend > $list->[$rend_idx]->[1] ) {
266 $new_end = $rend;
267 } else {
268 $new_end = $list->[$rend_idx]->[1];
269 }
270
271 splice( @$list, $rstart_idx,
272 $rend_idx - $rstart_idx + 1,
273 [ $new_start, $new_end ] );
274
275 } elsif ( defined($r_idx) ) {
276 splice( @$list, $r_idx, 0, [ $rstart, $rend ] );
277 } else {
278 push( @$list, [ $rstart, $rend ] );
279 }
280
281 return \@gap_pairs;
282 } ## end sub check_and_register
283
284 # overlap size is just added to make RangeRegistry generally more useful
285
286 =head2 overlap_size
287
288 Arg [1] : string $id
289 Arg [2] : int $start
290 Arg [3] : int $end
291 Example : my $overlap_size = $rr->( "chr1", 1, 100_000_000 )
292 Description: finds out how many bases of the given range are registered
293 Returntype : int
294 Exceptions : none
295 Caller : general
296 Status : Stable
297
298 =cut
299
300 sub overlap_size {
301 my ( $self, $id, $start, $end ) = @_;
302
303 my $overlap = 0;
304
305 if ( $start > $end ) { return 0 }
306
307 my $reg = $self->{'registry'};
308 my $list = $reg->{$id} ||= [];
309
310 my $len = scalar(@$list);
311
312 if ( $len == 0 ) {
313 # this is the first request for this id, return a gap pair for the
314 # entire range and register it as seen
315 return 0;
316 }
317
318 #####
319 # loop through the list of existing ranges recording any "gaps" where
320 # the existing range does not cover part of the requested range
321 #
322
323 my $start_idx = 0;
324 my $end_idx = $#$list;
325 my ( $mid_idx, $range );
326
327 # binary search the relevant pairs
328 # helps if the list is big
329 while ( ( $end_idx - $start_idx ) > 1 ) {
330 $mid_idx = ( $start_idx + $end_idx ) >> 1;
331 $range = $list->[$mid_idx];
332 if ( $range->[1] < $start ) {
333 $start_idx = $mid_idx;
334 } else {
335 $end_idx = $mid_idx;
336 }
337 }
338
339 for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) {
340 my ( $pstart, $pend ) = @{ $list->[$CUR] };
341
342 if ( $pstart > $end ) {
343 # No more interesting ranges here.
344 last;
345 }
346
347 # no work needs to be done at all if we find a range pair that
348 # entirely overlaps the requested region
349 if ( $pstart <= $start && $pend >= $end ) {
350 $overlap = $end - $start + 1;
351 last;
352 }
353
354 my $mstart = ( $start < $pstart ? $pstart : $start );
355 my $mend = ( $end < $pend ? $end : $pend );
356
357 if ( $mend - $mstart >= 0 ) {
358 $overlap += ( $mend - $mstart + 1 );
359 }
360 }
361
362 return $overlap;
363 } ## end sub overlap_size
364
365
366 # low level function to access the ranges
367 # only use for read access
368
369 sub get_ranges {
370 my ( $self, $id ) = @_;
371
372 return $self->{'registry'}->{$id};
373 }
374
375 1;
376