0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Mapper::RangeRegistry
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 use Bio::EnsEMBL::Mapper::RangeRegistry;
|
|
28
|
|
29 $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new();
|
|
30
|
|
31 # Get a fixed width chunk around the range of intereset. This
|
|
32 # will be used if any registration is actually necessary.
|
|
33 $chunk_start = ( $start >> 20 ) << 20 + 1;
|
|
34 $chunk_end = ( ( $end >> 20 ) + 1 ) << 20;
|
|
35
|
|
36 # Check if any registration is necessary for the range. If it is
|
|
37 # register a large chunked area instead and return a listref of
|
|
38 # unregistered areas that need to be loaded.
|
|
39 if (
|
|
40 $pairs = $rr->check_and_register(
|
|
41 $id, $start, $end, $chunk_start, $chunk_end
|
|
42 ) )
|
|
43 {
|
|
44 foreach my $pair (@$pairs) {
|
|
45 my ( $pair_start, $pair_end ) = @$pair;
|
|
46 # Fetch mappings for these regions from the assembly table and
|
|
47 # load them into the mapper.
|
|
48 ...;
|
|
49 }
|
|
50 } else {
|
|
51 # The range ($start - $end) is already registered
|
|
52 ...;
|
|
53 }
|
|
54
|
|
55 # Check if any registration is necessary. If it is register the
|
|
56 # region and return a listref of pairs that need to be loaded.
|
|
57 if ( $pairs = $rr->check_and_register( $id, $start, $end ) ) {
|
|
58 ...;
|
|
59 }
|
|
60
|
|
61 =head1 DESCRIPTION
|
|
62
|
|
63 This module maintains an internal list of registered regions and is
|
|
64 used to quickly ascertain if and what regions of a provided range need
|
|
65 registration.
|
|
66
|
|
67 =head1 METHODS
|
|
68
|
|
69 =cut
|
|
70
|
|
71 package Bio::EnsEMBL::Mapper::RangeRegistry;
|
|
72
|
|
73 use strict;
|
|
74
|
|
75 use Bio::EnsEMBL::Utils::Exception qw(throw);
|
|
76 use integer;
|
|
77
|
|
78 =head2 new
|
|
79
|
|
80 Arg [1] : none
|
|
81 Example : my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new();
|
|
82 Description: Creates a new RangeRegistry object
|
|
83 Returntype : Bio::EnsEMBL::Mapper::RangeRegistry
|
|
84 Exceptions : none
|
|
85 Caller : AssemblyMapperAdaptor
|
|
86 Status : Stable
|
|
87
|
|
88 =cut
|
|
89
|
|
90 sub new {
|
|
91 my ($proto) = @_;
|
|
92
|
|
93 my $class = ref($proto) || $proto;
|
|
94
|
|
95 return bless( { 'registry' => {} }, $class );
|
|
96 }
|
|
97
|
|
98 sub flush {
|
|
99 my ($self) = @_;
|
|
100 $self->{'registry'} = {};
|
|
101 }
|
|
102
|
|
103 =head2 check_and_register
|
|
104
|
|
105 Arg [1] : string $id
|
|
106 The id of the range to be checked/registered (e.g. a sequenceid)
|
|
107 Arg [2] : int $start
|
|
108 The start of the range to be checked
|
|
109 Arg [3] : int $end
|
|
110 The end of the range to be checked
|
|
111 Arg [4] : (optional) int $rstart
|
|
112 The start of the range to be registered
|
|
113 if the checked range was not fully registered
|
|
114 Arg [5] : (optional) int $rend
|
|
115 The end of the range to be registerd
|
|
116 if the checked range was not fully registered
|
|
117 Example : $ranges=$rr->check_and_register('X',500,600, 1,1_000_000));
|
|
118 Description: Checks the range registry to see if the entire range
|
|
119 denoted by ($id : $start-$end) is already registered. If
|
|
120 it already is, then undef is returned. If it is not then
|
|
121 the range specified by $rstart and $rend is registered, and
|
|
122 a list of regions that were required to completely register
|
|
123 $rstart-$rend is returned. If $rstart and $rend are not
|
|
124 defined they default to $start and $end respectively.
|
|
125
|
|
126 The reason there is a single call to do both the checking and
|
|
127 registering is to reduce the overhead. Much of the work to
|
|
128 check if a range is registered is the same as registering a
|
|
129 region around that range.
|
|
130 Returntype : undef or listref of [start,end] range pairs
|
|
131 Exceptions : throw if rstart is greater than start
|
|
132 throw if rend is less than end
|
|
133 throw if end is less than start
|
|
134 throw if id, start, or end are not defined
|
|
135 Caller : AssemblyMapperAdaptor
|
|
136 Status : Stable
|
|
137
|
|
138 =cut
|
|
139
|
|
140 #"constants"
|
|
141 my $START = 0;
|
|
142 my $END = 1;
|
|
143
|
|
144 sub check_and_register {
|
|
145 my ( $self, $id, $start, $end, $rstart, $rend ) = @_;
|
|
146
|
|
147 $rstart = $start if ( !defined($rstart) );
|
|
148 $rend = $end if ( !defined($rend) );
|
|
149
|
|
150 #
|
|
151 # Sanity checks
|
|
152 #
|
|
153 if ( !defined($id) || !defined($start) || !defined($end) ) {
|
|
154 throw("ID, start, end arguments are required");
|
|
155 }
|
|
156
|
|
157 # The following was commented out due to Ensembl Genomes requirements
|
|
158 # for bacterial genomes.
|
|
159 #
|
|
160 ## if ( $start > $end ) {
|
|
161 ## throw( "start argument [$start] must be less than "
|
|
162 ## . "(or equal to) end argument [$end]" );
|
|
163 ## }
|
|
164 ##
|
|
165 ## if ( $rstart > $rend ) {
|
|
166 ## throw( "rstart argument [$rstart] must be less than "
|
|
167 ## . "(or equal to) rend argument [$rend] argument" );
|
|
168 ## }
|
|
169
|
|
170 if ( $rstart > $start ) {
|
|
171 throw("rstart [$rstart] must be less than or equal to start [$start]");
|
|
172 }
|
|
173
|
|
174 if ( $rend < $end ) {
|
|
175 throw("rend [$rend] must be greater than or equal to end [$end]");
|
|
176 }
|
|
177
|
|
178 my $reg = $self->{'registry'};
|
|
179 my $list = $reg->{$id} ||= [];
|
|
180
|
|
181 my @gap_pairs;
|
|
182
|
|
183 my $len = scalar(@$list);
|
|
184
|
|
185 if ( $len == 0 ) {
|
|
186 # this is the first request for this id, return a gap pair for the
|
|
187 # entire range and register it as seen
|
|
188 $list->[0] = [ $rstart, $rend ];
|
|
189 return [ [ $rstart, $rend ] ];
|
|
190 }
|
|
191
|
|
192 #####
|
|
193 # loop through the list of existing ranges recording any "gaps" where
|
|
194 # the existing range does not cover part of the requested range
|
|
195 #
|
|
196
|
|
197 my $start_idx = 0;
|
|
198 my $end_idx = $#$list;
|
|
199 my ( $mid_idx, $range );
|
|
200
|
|
201 # binary search the relevant pairs
|
|
202 # helps if the list is big
|
|
203 while ( ( $end_idx - $start_idx ) > 1 ) {
|
|
204 $mid_idx = ( $start_idx + $end_idx ) >> 1;
|
|
205 $range = $list->[$mid_idx];
|
|
206
|
|
207 if ( $range->[1] < $rstart ) {
|
|
208 $start_idx = $mid_idx;
|
|
209 } else {
|
|
210 $end_idx = $mid_idx;
|
|
211 }
|
|
212 }
|
|
213
|
|
214 my ( $gap_start, $gap_end, $r_idx, $rstart_idx, $rend_idx );
|
|
215 $gap_start = $rstart;
|
|
216
|
|
217 for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) {
|
|
218 my ( $pstart, $pend ) = @{ $list->[$CUR] };
|
|
219
|
|
220 # no work needs to be done at all if we find a range pair that
|
|
221 # entirely overlaps the requested region
|
|
222 if ( $pstart <= $start && $pend >= $end ) {
|
|
223 return undef;
|
|
224 }
|
|
225
|
|
226 # find adjacent or overlapping regions already registered
|
|
227 if ( $pend >= ( $rstart - 1 ) && $pstart <= ( $rend + 1 ) ) {
|
|
228 if ( !defined($rstart_idx) ) {
|
|
229 $rstart_idx = $CUR;
|
|
230 }
|
|
231 $rend_idx = $CUR;
|
|
232 }
|
|
233
|
|
234 if ( $pstart > $rstart ) {
|
|
235 $gap_end = ( $rend < $pstart ) ? $rend : $pstart - 1;
|
|
236 push @gap_pairs, [ $gap_start, $gap_end ];
|
|
237 }
|
|
238
|
|
239 $gap_start = ( $rstart > $pend ) ? $rstart : $pend + 1;
|
|
240
|
|
241 # if($pstart > $rend && !defined($r_idx)) {
|
|
242 if ( $pend >= $rend && !defined($r_idx) ) {
|
|
243 $r_idx = $CUR;
|
|
244 last;
|
|
245 }
|
|
246 } ## end for ( my $CUR = $start_idx...
|
|
247
|
|
248 # do we have to make another gap?
|
|
249 if ( $gap_start <= $rend ) {
|
|
250 push @gap_pairs, [ $gap_start, $rend ];
|
|
251 }
|
|
252
|
|
253 #
|
|
254 # Merge the new range into the registered list
|
|
255 #
|
|
256 if ( defined($rstart_idx) ) {
|
|
257 my ( $new_start, $new_end );
|
|
258
|
|
259 if ( $rstart < $list->[$rstart_idx]->[0] ) {
|
|
260 $new_start = $rstart;
|
|
261 } else {
|
|
262 $new_start = $list->[$rstart_idx]->[0];
|
|
263 }
|
|
264
|
|
265 if ( $rend > $list->[$rend_idx]->[1] ) {
|
|
266 $new_end = $rend;
|
|
267 } else {
|
|
268 $new_end = $list->[$rend_idx]->[1];
|
|
269 }
|
|
270
|
|
271 splice( @$list, $rstart_idx,
|
|
272 $rend_idx - $rstart_idx + 1,
|
|
273 [ $new_start, $new_end ] );
|
|
274
|
|
275 } elsif ( defined($r_idx) ) {
|
|
276 splice( @$list, $r_idx, 0, [ $rstart, $rend ] );
|
|
277 } else {
|
|
278 push( @$list, [ $rstart, $rend ] );
|
|
279 }
|
|
280
|
|
281 return \@gap_pairs;
|
|
282 } ## end sub check_and_register
|
|
283
|
|
284 # overlap size is just added to make RangeRegistry generally more useful
|
|
285
|
|
286 =head2 overlap_size
|
|
287
|
|
288 Arg [1] : string $id
|
|
289 Arg [2] : int $start
|
|
290 Arg [3] : int $end
|
|
291 Example : my $overlap_size = $rr->( "chr1", 1, 100_000_000 )
|
|
292 Description: finds out how many bases of the given range are registered
|
|
293 Returntype : int
|
|
294 Exceptions : none
|
|
295 Caller : general
|
|
296 Status : Stable
|
|
297
|
|
298 =cut
|
|
299
|
|
300 sub overlap_size {
|
|
301 my ( $self, $id, $start, $end ) = @_;
|
|
302
|
|
303 my $overlap = 0;
|
|
304
|
|
305 if ( $start > $end ) { return 0 }
|
|
306
|
|
307 my $reg = $self->{'registry'};
|
|
308 my $list = $reg->{$id} ||= [];
|
|
309
|
|
310 my $len = scalar(@$list);
|
|
311
|
|
312 if ( $len == 0 ) {
|
|
313 # this is the first request for this id, return a gap pair for the
|
|
314 # entire range and register it as seen
|
|
315 return 0;
|
|
316 }
|
|
317
|
|
318 #####
|
|
319 # loop through the list of existing ranges recording any "gaps" where
|
|
320 # the existing range does not cover part of the requested range
|
|
321 #
|
|
322
|
|
323 my $start_idx = 0;
|
|
324 my $end_idx = $#$list;
|
|
325 my ( $mid_idx, $range );
|
|
326
|
|
327 # binary search the relevant pairs
|
|
328 # helps if the list is big
|
|
329 while ( ( $end_idx - $start_idx ) > 1 ) {
|
|
330 $mid_idx = ( $start_idx + $end_idx ) >> 1;
|
|
331 $range = $list->[$mid_idx];
|
|
332 if ( $range->[1] < $start ) {
|
|
333 $start_idx = $mid_idx;
|
|
334 } else {
|
|
335 $end_idx = $mid_idx;
|
|
336 }
|
|
337 }
|
|
338
|
|
339 for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) {
|
|
340 my ( $pstart, $pend ) = @{ $list->[$CUR] };
|
|
341
|
|
342 if ( $pstart > $end ) {
|
|
343 # No more interesting ranges here.
|
|
344 last;
|
|
345 }
|
|
346
|
|
347 # no work needs to be done at all if we find a range pair that
|
|
348 # entirely overlaps the requested region
|
|
349 if ( $pstart <= $start && $pend >= $end ) {
|
|
350 $overlap = $end - $start + 1;
|
|
351 last;
|
|
352 }
|
|
353
|
|
354 my $mstart = ( $start < $pstart ? $pstart : $start );
|
|
355 my $mend = ( $end < $pend ? $end : $pend );
|
|
356
|
|
357 if ( $mend - $mstart >= 0 ) {
|
|
358 $overlap += ( $mend - $mstart + 1 );
|
|
359 }
|
|
360 }
|
|
361
|
|
362 return $overlap;
|
|
363 } ## end sub overlap_size
|
|
364
|
|
365
|
|
366 # low level function to access the ranges
|
|
367 # only use for read access
|
|
368
|
|
369 sub get_ranges {
|
|
370 my ( $self, $id ) = @_;
|
|
371
|
|
372 return $self->{'registry'}->{$id};
|
|
373 }
|
|
374
|
|
375 1;
|
|
376
|