0
|
1 package Bio::EnsEMBL::Utils::Iterator;
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
6 Genome Research Limited. All rights reserved.
|
|
7
|
|
8 This software is distributed under a modified Apache license.
|
|
9 For license details, please see
|
|
10
|
|
11 http://www.ensembl.org/info/about/code_licence.html
|
|
12
|
|
13 =head1 CONTACT
|
|
14
|
|
15 Please email comments or questions to the public Ensembl
|
|
16 developers list at <dev@ensembl.org>.
|
|
17
|
|
18 Questions may also be sent to the Ensembl help desk at
|
|
19 <helpdesk@ensembl.org>.
|
|
20
|
|
21 =cut
|
|
22
|
|
23 =head1 NAME
|
|
24
|
|
25 Bio::EnsEMBL::Utils::Iterator
|
|
26
|
|
27 =head1 SYNOPSIS
|
|
28
|
|
29 my $variation_iterator =
|
|
30 $variation_adaptor->fetch_Iterator_by_VariationSet($set_1kg);
|
|
31
|
|
32 while ( my $variation = $variation_iterator->next ) {
|
|
33 # operate on variation object
|
|
34 print $variation->name, "\n";
|
|
35 }
|
|
36
|
|
37 =head1 DESCRIPTION
|
|
38
|
|
39 Some adaptor methods may return more objects than can fit in memory at once, in these cases
|
|
40 you can fetch an iterator object instead of the usual array reference. The iterator object
|
|
41 allows you to iterate over the set of objects (using the next() method) without loading the
|
|
42 entire set into memory at once. You can tell if an iterator is exhausted with the has_next()
|
|
43 method. The peek() method allows you to fetch the next object from the iterator without
|
|
44 advancing the iterator - this is useful if you want to check some property of an element in
|
|
45 the set while leaving the iterator unchanged.
|
|
46
|
|
47 You can filter and transform an iterator in an analogous way to using map and grep on arrays
|
|
48 using the provided map() and grep() methods. These methods return another iterator, and only
|
|
49 perform the filtering and transformation on each element as it is requested, so again these
|
|
50 can be used without loading the entire set into memory.
|
|
51
|
|
52 Iterators can be combined together with the append() method which merges together the
|
|
53 iterator it is called on with the list of iterators passed in as arguments. This is
|
|
54 somewhat analogous to concatenating arrays with the push function. append() returns a new
|
|
55 iterator which iterates over each component iterator until it is exhausted before moving
|
|
56 on to the next iterator, in the order in which they are supplied to the method.
|
|
57
|
|
58 An iterator can be converted to an array (reference) containing all the elements in the
|
|
59 set with the to_arrayref() method, but note that this array may consume a lot of memory if
|
|
60 the set the iterator is iterating over is large and it is recommended that you do not call
|
|
61 this method unless there is no way of working with each element at a time.
|
|
62
|
|
63 =head1 METHODS
|
|
64
|
|
65 =cut
|
|
66
|
|
67 use strict;
|
|
68 use warnings;
|
|
69
|
|
70 use Bio::EnsEMBL::Utils::Exception qw(throw);
|
|
71
|
|
72 =head2 new
|
|
73
|
|
74 Argument : either a coderef representing the iterator, in which case this
|
|
75 anonymous subroutine is assumed to return the next object in the
|
|
76 set when called and to return undef when the set is exhausted,
|
|
77 or an arrayref, in which case we return an iterator over this
|
|
78 array. If the argument is not defined then we return an 'empty'
|
|
79 iterator that immediately returns undef
|
|
80
|
|
81 Example :
|
|
82
|
|
83 my @dbIDs = fetch_relevant_dbIDs();
|
|
84
|
|
85 my $iterator = Bio::EnsEMBL::Utils::Iterator->new(
|
|
86 sub { return $self->fetch_by_dbID(shift @dbIDs) }
|
|
87 );
|
|
88
|
|
89 NB: this is a very simple example showing how to call the constructor
|
|
90 that would be rather inefficient in practice, real examples should
|
|
91 probably be smarter about batching up queries to minimise trips to
|
|
92 the database. See examples in the Variation API.
|
|
93
|
|
94 Description: Constructor, creates a new iterator object
|
|
95 Returntype : Bio::EnsEMBL::Utils::Iterator instance
|
|
96 Exceptions : thrown if the supplied argument is neither an arrayref nor a coderef
|
|
97 Caller : general
|
|
98 Status : Experimental
|
|
99
|
|
100 =cut
|
|
101
|
|
sub new {
    my ($class, $arg) = @_;

    # Decide which generator closure will back this iterator. The closure
    # must return the next element on each call, and undef once exhausted.
    my $arg_type = ref $arg;
    my $generator;

    if (!defined $arg) {
        # No argument at all: build an 'empty' iterator whose generator
        # yields undef on the very first call.
        $generator = sub { return undef };
    }
    elsif ($arg_type eq 'ARRAY') {
        # Arrayref: hand out elements from the front of the supplied array.
        # Note that the array itself is consumed as iteration proceeds.
        $generator = sub { return shift @{$arg} };
    }
    elsif ($arg_type eq 'CODE') {
        # Coderef: the caller already supplied a generator, use it as is.
        $generator = $arg;
    }
    else {
        throw("The supplied argument does not look like an arrayref or a coderef ".$arg_type);
    }

    # The object is a plain hash; the generator lives under the 'sub' key.
    return bless { sub => $generator }, $class;
}
|
|
132
|
|
133
|
|
134 =head2 next
|
|
135
|
|
136 Example : $obj = $iterator->next
|
|
137 Description: returns the next object from this iterator, or undef if the iterator is exhausted
|
|
138 Returntype : Object type will depend on what this iterator is iterating over
|
|
139 Exceptions : none
|
|
140 Caller : general
|
|
141 Status : Experimental
|
|
142
|
|
143 =cut
|
|
144
|
|
sub next {
    my ($self) = @_;

    # has_next() and peek() may already have buffered the upcoming element
    # under the 'next' key; only call the generator if that buffer is empty.
    if (!defined $self->{next}) {
        $self->{next} = $self->{sub}->();
    }

    # Hand the buffered element back and clear the buffer in one step, so
    # the following call fetches a fresh element.
    return delete $self->{next};
}
|
|
152
|
|
153 =head2 has_next
|
|
154
|
|
155 Example : if ($iterator->has_next) { my $obj = $iterator->next }
|
|
156 Description: Boolean - true if this iterator has more elements to fetch, false when
|
|
157 it is exhausted
|
|
158 Returntype : boolean
|
|
159 Exceptions : none
|
|
160 Caller : general
|
|
161 Status : Experimental
|
|
162
|
|
163 =cut
|
|
164
|
|
sub has_next {
    my ($self) = @_;

    # Pre-fetch the upcoming element into the 'next' buffer when the buffer
    # is empty; a later next() or peek() will then reuse it.
    unless (defined $self->{next}) {
        $self->{next} = $self->{sub}->();
    }

    # A defined buffered element means there is still something to return.
    return defined $self->{next};
}
|
|
172
|
|
173 =head2 peek
|
|
174
|
|
175 Example : $obj = $iterator->peek
|
|
176 Description: returns the next object from this iterator, or undef if the iterator is exhausted,
|
|
177 much like next but does not advance the iterator (so the same object will be
|
|
178 returned on the following call to next or peek)
|
|
179 Returntype : Object type will depend on what this iterator is iterating over
|
|
180 Exceptions : none
|
|
181 Caller : general
|
|
182 Status : Experimental
|
|
183
|
|
184 =cut
|
|
185
|
|
sub peek {
    my ($self) = @_;

    # Fill the look-ahead buffer from the generator if nothing is buffered.
    if (not defined $self->{next}) {
        $self->{next} = $self->{sub}->();
    }

    # Unlike next(), leave the element in the buffer so the iterator does
    # not advance.
    return $self->{next};
}
|
|
193
|
|
194 =head2 grep
|
|
195
|
|
196 Example : my $filtered_iterator = $original_iterator->grep(sub {$_->name =~ /^rs/});
|
|
197 Description: filter this iterator, returning another iterator
|
|
198 Argument : a coderef which returns true if the element should be included in the
|
|
199 filtered set, or false if the element should be filtered out. $_ will be
|
|
200 set locally to each element in turn so you should be able to write a block
|
|
201 in a similar way as for the perl grep function (although it will need to be
|
|
202 preceded with the sub keyword). Otherwise you can pass in a reference to a
|
|
203 subroutine which expects a single argument with the same behaviour.
|
|
204 Returntype : Bio::EnsEMBL::Utils::Iterator
|
|
205 Exceptions : thrown if the argument is not a coderef
|
|
206 Caller : general
|
|
207 Status : Experimental
|
|
208
|
|
209 =cut
|
|
210
|
|
sub grep {
    my ($self, $coderef) = @_;

    ref $coderef eq 'CODE' or throw('Argument should be a coderef');

    # The filtered iterator pulls from this iterator lazily: each request
    # discards elements until the predicate accepts one, or until this
    # iterator runs dry.
    my $filter = sub {
        while ($self->has_next) {
            # Expose the candidate as $_ (and as the first argument) so the
            # predicate can be written like a perl grep block.
            local $_ = $self->next;
            if ($coderef->($_)) {
                return $_;
            }
        }

        # Nothing left that satisfies the predicate.
        return undef;
    };

    return Bio::EnsEMBL::Utils::Iterator->new($filter);
}
|
|
224
|
|
225 =head2 map
|
|
226
|
|
227 Example : my $transformed_iterator = $original_iterator->map(sub {$_->name});
|
|
228 Description: transform the elements of this iterator, returning another iterator
|
|
229 Argument : a coderef which returns the desired transformation of each element.
|
|
230 $_ will be set locally to each original element in turn so you
|
|
231 should be able to write a block in a similar way as for the perl map
|
|
232 function (although it will need to be preceded with the sub keyword).
|
|
233 Otherwise you can pass in a reference to a subroutine which expects a
|
|
234 single argument with the same behaviour.
|
|
235 Returntype : Bio::EnsEMBL::Utils::Iterator
|
|
236 Exceptions : thrown if the argument is not a coderef
|
|
237 Caller : general
|
|
238 Status : Experimental
|
|
239
|
|
240 =cut
|
|
241
|
|
242
|
|
sub map {
    my ($self, $coderef) = @_;

    ref $coderef eq 'CODE' or throw('Argument should be a coderef');

    # The transforming iterator fetches one element from this iterator per
    # request and passes it through the supplied coderef.
    my $transform = sub {
        # Expose the element as $_ (and as the first argument) so the
        # coderef can be written like a perl map block.
        local $_ = $self->next;

        # An undefined element means this iterator is exhausted; do not
        # call the transformation in that case.
        return undef unless defined $_;

        return $coderef->($_);
    };

    return Bio::EnsEMBL::Utils::Iterator->new($transform);
}
|
|
253
|
|
254
|
|
255 =head2 each
|
|
256
|
|
257 Example : $iterator->each(sub { print $_->name, "\n"; });
|
|
258 Description: Performs a full iteration of the current iterator instance.
|
|
259 Argument : a coderef which is called once for each element; its return value is ignored.
|
|
260 $_ will be set locally to each element.
|
|
261 Returntype : None
|
|
262 Exceptions : thrown if the argument is not a coderef
|
|
263 Caller : general
|
|
264 Status : Experimental
|
|
265
|
|
266 =cut
|
|
267
|
|
268
|
|
sub each {
    my ($self, $coderef) = @_;

    if (ref $coderef ne 'CODE') {
        throw('Argument should be a coderef');
    }

    # Drain the iterator, invoking the callback once per element. The
    # element is available to the callback both as $_ and as its first
    # argument; whatever the callback returns is discarded.
    while ($self->has_next()) {
        local $_ = $self->next();
        $coderef->($_);
    }

    return;
}
|
|
278
|
|
279
|
|
280 =head2 to_arrayref
|
|
281
|
|
282 Example : my $arrayref = $iterator->to_arrayref;
|
|
283 Description: return a reference to an array containing all elements from the
|
|
284 iterator. This is created by simply iterating over the iterator
|
|
285 until it is exhausted and adding each element in turn to an array.
|
|
286 Note that this may consume a lot of memory for iterators over
|
|
287 large collections
|
|
288 Returntype : arrayref
|
|
289 Exceptions : none
|
|
290 Caller : general
|
|
291 Status : Experimental
|
|
292
|
|
293 =cut
|
|
294
|
|
sub to_arrayref {
    my $self = shift;

    # Collect every remaining element, in iteration order, into a fresh
    # array. This exhausts the iterator and holds the whole set in memory.
    my $elements = [];
    push @{$elements}, $self->next while $self->has_next;

    return $elements;
}
|
|
306
|
|
307 =head2 append
|
|
308
|
|
309 Example : my $combined_iterator = $iterator1->append($iterator2, $iterator3);
|
|
310 Description: return a new iterator that combines this iterator with the others
|
|
311 passed as arguments, this new iterator will iterate over each
|
|
312 component iterator (in the order supplied here) until it is
|
|
313 exhausted and then move on to the next iterator until all are
|
|
314 exhausted
|
|
315 Argument : an array of Bio::EnsEMBL::Utils::Iterator objects
|
|
316 Returntype : Bio::EnsEMBL::Utils::Iterator
|
|
317 Exceptions : thrown if any of the arguments are not iterators
|
|
318 Caller : general
|
|
319 Status : Experimental
|
|
320
|
|
321 =cut
|
|
322
|
|
sub append {
    my ($self, @queue) = @_;

    # Validate every argument up front: anything that cannot has_next()
    # and next() is rejected before the combined iterator is built.
    foreach my $candidate (@queue) {
        my $looks_like_iterator = UNIVERSAL::can($candidate, 'has_next')
                               && UNIVERSAL::can($candidate, 'next');
        throw("Argument to append doesn't look like an iterator")
            unless $looks_like_iterator;
    }

    # This iterator goes first, followed by the arguments in the order
    # they were supplied.
    my @pending = ($self, @queue);

    my $chained = sub {
        # Drop iterators from the head of the list once they are exhausted.
        shift @pending while @pending && !$pending[0]->has_next;

        # Every component iterator is exhausted.
        return undef unless @pending;

        # Otherwise serve the next element of the current head iterator.
        return $pending[0]->next;
    };

    return Bio::EnsEMBL::Utils::Iterator->new($chained);
}
|
|
345
|
|
346 =head2 take
|
|
347
|
|
348 Example : my $limited_iterator = $iterator->take(5);
|
|
349 Description: return a new iterator that only iterates over the
|
|
350 first n elements of this iterator
|
|
351 Argument : a non-negative integer
|
|
352 Returntype : Bio::EnsEMBL::Utils::Iterator
|
|
353 Exceptions : thrown if the argument is negative
|
|
354 Caller : general
|
|
355 Status : Experimental
|
|
356
|
|
357 =cut
|
|
358
|
|
sub take {
    my ($self, $n) = @_;

    if ($n < 0) {
        throw("Argument cannot be negative");
    }

    # Number of requests made so far against the limited iterator.
    my $requested = 0;

    my $limited = sub {
        # Once $n requests have been served, report exhaustion without
        # touching the underlying iterator again. The counter is bumped on
        # every call, whether or not an element is returned.
        if ($requested++ >= $n) {
            return undef;
        }

        return $self->next;
    };

    return Bio::EnsEMBL::Utils::Iterator->new($limited);
}
|
|
370
|
|
371 =head2 skip
|
|
372
|
|
373 Example : my $limited_iterator = $iterator->skip(5);
|
|
374 Description: skip over the first n elements of this iterator (and then return
|
|
375 the same iterator for your method chaining convenience)
|
|
376 Argument : a non-negative integer
|
|
377 Returntype : Bio::EnsEMBL::Utils::Iterator
|
|
378 Exceptions : thrown if the argument is negative
|
|
379 Caller : general
|
|
380 Status : Experimental
|
|
381
|
|
382 =cut
|
|
383
|
|
sub skip {
    my ($self, $n) = @_;

    if ($n < 0) {
        throw("Argument cannot be negative");
    }

    # Discard one element per step. next() is called $n times even if the
    # iterator runs out earlier; the extra calls simply return undef.
    foreach (0 .. $n - 1) {
        $self->next;
    }

    # Return the same (now advanced) iterator so calls can be chained.
    return $self;
}
|
|
393
|
|
394 =head2 reduce
|
|
395
|
|
396 Example : my $tot_length = $iterator->reduce(sub { $_[0] + $_[1]->length }, 0);
|
|
397 Description: reduce this iterator with the provided coderef, using the (optional)
|
|
398 second argument as the initial value of the accumulator
|
|
399 Argument[1]: a coderef that expects 2 arguments, the current accumulator
|
|
400 value and the next element in the set, and returns the next
|
|
401 accumulator value. Unless the optional second argument is
|
|
402 provided the first accumulator value passed in will be the
|
|
403 first element in the set
|
|
404 Argument[2]: (optional) an initial value to use for the accumulator instead
|
|
405 of the first value of the set
|
|
406 Returntype : returntype of the coderef
|
|
407 Exceptions : thrown if the argument is not a coderef
|
|
408 Caller : general
|
|
409 Status : Experimental
|
|
410
|
|
411 =cut
|
|
412
|
|
sub reduce {
    my ($self, $coderef, $init_val) = @_;

    ref $coderef eq 'CODE' or throw('Argument should be a coderef');

    # Seed the accumulator with the supplied initial value, or with the
    # first element of the iterator when no (defined) value was supplied.
    my $accumulator;
    if (defined $init_val) {
        $accumulator = $init_val;
    }
    else {
        $accumulator = $self->next;
    }

    # Fold each remaining element into the accumulator: the coderef gets
    # the current accumulator and the element, and returns the new value.
    while ($self->has_next) {
        my $element = $self->next;
        $accumulator = $coderef->($accumulator, $element);
    }

    return $accumulator;
}
|
|
426
|
|
427 1;
|
|
428
|