Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Mapper/RangeRegistry.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Mapper::RangeRegistry | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 use Bio::EnsEMBL::Mapper::RangeRegistry; | |
28 | |
29 $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); | |
30 | |
31 # Get a fixed width chunk around the range of intereset. This | |
32 # will be used if any registration is actually necessary. | |
33 $chunk_start = ( $start >> 20 ) << 20 + 1; | |
34 $chunk_end = ( ( $end >> 20 ) + 1 ) << 20; | |
35 | |
36 # Check if any registration is necessary for the range. If it is | |
37 # register a large chunked area instead and return a listref of | |
38 # unregistered areas that need to be loaded. | |
39 if ( | |
40 $pairs = $rr->check_and_register( | |
41 $id, $start, $end, $chunk_start, $chunk_end | |
42 ) ) | |
43 { | |
44 foreach my $pair (@$pairs) { | |
45 my ( $pair_start, $pair_end ) = @$pair; | |
46 # Fetch mappings for these regions from the assembly table and | |
47 # load them into the mapper. | |
48 ...; | |
49 } | |
50 } else { | |
51 # The range ($start - $end) is already registered | |
52 ...; | |
53 } | |
54 | |
55 # Check if any registration is necessary. If it is register the | |
56 # region and return a listref of pairs that need to be loaded. | |
57 if ( $pairs = $rr->check_and_register( $id, $start, $end ) ) { | |
58 ...; | |
59 } | |
60 | |
61 =head1 DESCRIPTION | |
62 | |
63 This module maintains an internal list of registered regions and is | |
64 used to quickly ascertain if and what regions of a provided range need | |
65 registration. | |
66 | |
67 =head1 METHODS | |
68 | |
69 =cut | |
70 | |
71 package Bio::EnsEMBL::Mapper::RangeRegistry; | |
72 | |
73 use strict; | |
74 | |
75 use Bio::EnsEMBL::Utils::Exception qw(throw); | |
76 use integer; | |
77 | |
78 =head2 new | |
79 | |
80 Arg [1] : none | |
81 Example : my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new(); | |
82 Description: Creates a new RangeRegistry object | |
83 Returntype : Bio::EnsEMBL::Mapper::RangeRegistry | |
84 Exceptions : none | |
85 Caller : AssemblyMapperAdaptor | |
86 Status : Stable | |
87 | |
88 =cut | |
89 | |
90 sub new { | |
91 my ($proto) = @_; | |
92 | |
93 my $class = ref($proto) || $proto; | |
94 | |
95 return bless( { 'registry' => {} }, $class ); | |
96 } | |
97 | |
98 sub flush { | |
99 my ($self) = @_; | |
100 $self->{'registry'} = {}; | |
101 } | |
102 | |
103 =head2 check_and_register | |
104 | |
105 Arg [1] : string $id | |
106 The id of the range to be checked/registered (e.g. a sequenceid) | |
107 Arg [2] : int $start | |
108 The start of the range to be checked | |
109 Arg [3] : int $end | |
110 The end of the range to be checked | |
111 Arg [4] : (optional) int $rstart | |
112 The start of the range to be registered | |
113 if the checked range was not fully registered | |
114 Arg [5] : (optional) int $rend | |
115 The end of the range to be registerd | |
116 if the checked range was not fully registered | |
117 Example : $ranges=$rr->check_and_register('X',500,600, 1,1_000_000)); | |
118 Description: Checks the range registry to see if the entire range | |
119 denoted by ($id : $start-$end) is already registered. If | |
120 it already is, then undef is returned. If it is not then | |
121 the range specified by $rstart and $rend is registered, and | |
122 a list of regions that were required to completely register | |
123 $rstart-$rend is returned. If $rstart and $rend are not | |
124 defined they default to $start and $end respectively. | |
125 | |
126 The reason there is a single call to do both the checking and | |
127 registering is to reduce the overhead. Much of the work to | |
128 check if a range is registered is the same as registering a | |
129 region around that range. | |
130 Returntype : undef or listref of [start,end] range pairs | |
131 Exceptions : throw if rstart is greater than start | |
132 throw if rend is less than end | |
133 throw if end is less than start | |
134 throw if id, start, or end are not defined | |
135 Caller : AssemblyMapperAdaptor | |
136 Status : Stable | |
137 | |
138 =cut | |
139 | |
140 #"constants" | |
141 my $START = 0; | |
142 my $END = 1; | |
143 | |
144 sub check_and_register { | |
145 my ( $self, $id, $start, $end, $rstart, $rend ) = @_; | |
146 | |
147 $rstart = $start if ( !defined($rstart) ); | |
148 $rend = $end if ( !defined($rend) ); | |
149 | |
150 # | |
151 # Sanity checks | |
152 # | |
153 if ( !defined($id) || !defined($start) || !defined($end) ) { | |
154 throw("ID, start, end arguments are required"); | |
155 } | |
156 | |
157 # The following was commented out due to Ensembl Genomes requirements | |
158 # for bacterial genomes. | |
159 # | |
160 ## if ( $start > $end ) { | |
161 ## throw( "start argument [$start] must be less than " | |
162 ## . "(or equal to) end argument [$end]" ); | |
163 ## } | |
164 ## | |
165 ## if ( $rstart > $rend ) { | |
166 ## throw( "rstart argument [$rstart] must be less than " | |
167 ## . "(or equal to) rend argument [$rend] argument" ); | |
168 ## } | |
169 | |
170 if ( $rstart > $start ) { | |
171 throw("rstart [$rstart] must be less than or equal to start [$start]"); | |
172 } | |
173 | |
174 if ( $rend < $end ) { | |
175 throw("rend [$rend] must be greater than or equal to end [$end]"); | |
176 } | |
177 | |
178 my $reg = $self->{'registry'}; | |
179 my $list = $reg->{$id} ||= []; | |
180 | |
181 my @gap_pairs; | |
182 | |
183 my $len = scalar(@$list); | |
184 | |
185 if ( $len == 0 ) { | |
186 # this is the first request for this id, return a gap pair for the | |
187 # entire range and register it as seen | |
188 $list->[0] = [ $rstart, $rend ]; | |
189 return [ [ $rstart, $rend ] ]; | |
190 } | |
191 | |
192 ##### | |
193 # loop through the list of existing ranges recording any "gaps" where | |
194 # the existing range does not cover part of the requested range | |
195 # | |
196 | |
197 my $start_idx = 0; | |
198 my $end_idx = $#$list; | |
199 my ( $mid_idx, $range ); | |
200 | |
201 # binary search the relevant pairs | |
202 # helps if the list is big | |
203 while ( ( $end_idx - $start_idx ) > 1 ) { | |
204 $mid_idx = ( $start_idx + $end_idx ) >> 1; | |
205 $range = $list->[$mid_idx]; | |
206 | |
207 if ( $range->[1] < $rstart ) { | |
208 $start_idx = $mid_idx; | |
209 } else { | |
210 $end_idx = $mid_idx; | |
211 } | |
212 } | |
213 | |
214 my ( $gap_start, $gap_end, $r_idx, $rstart_idx, $rend_idx ); | |
215 $gap_start = $rstart; | |
216 | |
217 for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) { | |
218 my ( $pstart, $pend ) = @{ $list->[$CUR] }; | |
219 | |
220 # no work needs to be done at all if we find a range pair that | |
221 # entirely overlaps the requested region | |
222 if ( $pstart <= $start && $pend >= $end ) { | |
223 return undef; | |
224 } | |
225 | |
226 # find adjacent or overlapping regions already registered | |
227 if ( $pend >= ( $rstart - 1 ) && $pstart <= ( $rend + 1 ) ) { | |
228 if ( !defined($rstart_idx) ) { | |
229 $rstart_idx = $CUR; | |
230 } | |
231 $rend_idx = $CUR; | |
232 } | |
233 | |
234 if ( $pstart > $rstart ) { | |
235 $gap_end = ( $rend < $pstart ) ? $rend : $pstart - 1; | |
236 push @gap_pairs, [ $gap_start, $gap_end ]; | |
237 } | |
238 | |
239 $gap_start = ( $rstart > $pend ) ? $rstart : $pend + 1; | |
240 | |
241 # if($pstart > $rend && !defined($r_idx)) { | |
242 if ( $pend >= $rend && !defined($r_idx) ) { | |
243 $r_idx = $CUR; | |
244 last; | |
245 } | |
246 } ## end for ( my $CUR = $start_idx... | |
247 | |
248 # do we have to make another gap? | |
249 if ( $gap_start <= $rend ) { | |
250 push @gap_pairs, [ $gap_start, $rend ]; | |
251 } | |
252 | |
253 # | |
254 # Merge the new range into the registered list | |
255 # | |
256 if ( defined($rstart_idx) ) { | |
257 my ( $new_start, $new_end ); | |
258 | |
259 if ( $rstart < $list->[$rstart_idx]->[0] ) { | |
260 $new_start = $rstart; | |
261 } else { | |
262 $new_start = $list->[$rstart_idx]->[0]; | |
263 } | |
264 | |
265 if ( $rend > $list->[$rend_idx]->[1] ) { | |
266 $new_end = $rend; | |
267 } else { | |
268 $new_end = $list->[$rend_idx]->[1]; | |
269 } | |
270 | |
271 splice( @$list, $rstart_idx, | |
272 $rend_idx - $rstart_idx + 1, | |
273 [ $new_start, $new_end ] ); | |
274 | |
275 } elsif ( defined($r_idx) ) { | |
276 splice( @$list, $r_idx, 0, [ $rstart, $rend ] ); | |
277 } else { | |
278 push( @$list, [ $rstart, $rend ] ); | |
279 } | |
280 | |
281 return \@gap_pairs; | |
282 } ## end sub check_and_register | |
283 | |
284 # overlap size is just added to make RangeRegistry generally more useful | |
285 | |
286 =head2 overlap_size | |
287 | |
288 Arg [1] : string $id | |
289 Arg [2] : int $start | |
290 Arg [3] : int $end | |
291 Example : my $overlap_size = $rr->( "chr1", 1, 100_000_000 ) | |
292 Description: finds out how many bases of the given range are registered | |
293 Returntype : int | |
294 Exceptions : none | |
295 Caller : general | |
296 Status : Stable | |
297 | |
298 =cut | |
299 | |
300 sub overlap_size { | |
301 my ( $self, $id, $start, $end ) = @_; | |
302 | |
303 my $overlap = 0; | |
304 | |
305 if ( $start > $end ) { return 0 } | |
306 | |
307 my $reg = $self->{'registry'}; | |
308 my $list = $reg->{$id} ||= []; | |
309 | |
310 my $len = scalar(@$list); | |
311 | |
312 if ( $len == 0 ) { | |
313 # this is the first request for this id, return a gap pair for the | |
314 # entire range and register it as seen | |
315 return 0; | |
316 } | |
317 | |
318 ##### | |
319 # loop through the list of existing ranges recording any "gaps" where | |
320 # the existing range does not cover part of the requested range | |
321 # | |
322 | |
323 my $start_idx = 0; | |
324 my $end_idx = $#$list; | |
325 my ( $mid_idx, $range ); | |
326 | |
327 # binary search the relevant pairs | |
328 # helps if the list is big | |
329 while ( ( $end_idx - $start_idx ) > 1 ) { | |
330 $mid_idx = ( $start_idx + $end_idx ) >> 1; | |
331 $range = $list->[$mid_idx]; | |
332 if ( $range->[1] < $start ) { | |
333 $start_idx = $mid_idx; | |
334 } else { | |
335 $end_idx = $mid_idx; | |
336 } | |
337 } | |
338 | |
339 for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) { | |
340 my ( $pstart, $pend ) = @{ $list->[$CUR] }; | |
341 | |
342 if ( $pstart > $end ) { | |
343 # No more interesting ranges here. | |
344 last; | |
345 } | |
346 | |
347 # no work needs to be done at all if we find a range pair that | |
348 # entirely overlaps the requested region | |
349 if ( $pstart <= $start && $pend >= $end ) { | |
350 $overlap = $end - $start + 1; | |
351 last; | |
352 } | |
353 | |
354 my $mstart = ( $start < $pstart ? $pstart : $start ); | |
355 my $mend = ( $end < $pend ? $end : $pend ); | |
356 | |
357 if ( $mend - $mstart >= 0 ) { | |
358 $overlap += ( $mend - $mstart + 1 ); | |
359 } | |
360 } | |
361 | |
362 return $overlap; | |
363 } ## end sub overlap_size | |
364 | |
365 | |
366 # low level function to access the ranges | |
367 # only use for read access | |
368 | |
369 sub get_ranges { | |
370 my ( $self, $id ) = @_; | |
371 | |
372 return $self->{'registry'}->{$id}; | |
373 } | |
374 | |
375 1; | |
376 |