Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqFeature/Collection.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: Collection.pm,v 1.9.2.1 2003/02/21 03:07:19 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::SeqFeature::Collection | |
4 # | |
5 # Cared for by Jason Stajich <jason@bioperl.org> | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SeqFeature::Collection - A container class for SeqFeatures | |
16 suitable for performing operations such as finding features within a | |
17 range, that match a certain feature type, etc. | |
18 | |
19 =head1 SYNOPSIS | |
20 | |
21 use Bio::SeqFeature::Collection; | |
22 use Bio::Location::Simple; | |
23 use Bio::Tools::GFF; | |
24 use Bio::Root::IO; | |
25 # let's first input some features | |
26 my $gffio = Bio::Tools::GFF->new(-file => Bio::Root::IO->catfile | |
27 ("t","data","myco_sites.gff"), | |
28 -gff_version => 2); | |
29 my @features = (); | |
30 # loop over the input stream | |
31 while(my $feature = $gffio->next_feature()) { | |
32 # do something with feature | |
33 push @features, $feature; | |
34 } | |
35 $gffio->close(); | |
36 # build the Collection object | |
37 my $col = new Bio::SeqFeature::Collection(); | |
38 # add these features to the object | |
39 my $totaladded = $col->add_features(\@features); | |
40 | |
41 my @subset = $col->features_in_range(-start => 1, | |
42 -end => 25000, | |
43 -strand => 1, | |
44 -contain => 0); | |
45 # subset should have 18 entries for this dataset | |
46 print "size is ", scalar @subset, "\n"; | |
47 @subset = $col->features_in_range(-range => Bio::Location::Simple->new | |
48 (-start => 70000, | |
49 -end => 150000, | |
50 -strand => -1), | |
51 -contain => 1, | |
52 -strandmatch => 'strong'); | |
53 | |
54 # subset should have 22 entries for this dataset | |
55 print "size is ", scalar @subset, "\n"; | |
56 print "total number of features in collection is ", | |
57 $col->feature_count(),"\n"; | |
58 | |
59 =head1 DESCRIPTION | |
60 | |
61 This object will efficiently allow one for query subsets of ranges | |
62 within a large collection of sequence features (in fact the objects | |
63 just have to be Bio::RangeI compliant). This is done by the creation | |
64 of bins which are stored in order in a B-Tree data structure as | |
65 provided by the DB_File interface to the Berkeley DB. | |
66 | |
67 This is based on work done by Lincoln for storage in a mysql instance | |
68 - this is intended to be an embedded in-memory implementation for | |
69 easily quering for subsets of a large range set. All features are | |
70 held in memory even if the -usefile flag is provided. | |
71 | |
72 =head1 FEEDBACK | |
73 | |
74 =head2 Mailing Lists | |
75 | |
76 User feedback is an integral part of the evolution of this and other | |
77 Bioperl modules. Send your comments and suggestions preferably to | |
78 the Bioperl mailing list. Your participation is much appreciated. | |
79 | |
80 bioperl-l@bioperl.org - General discussion | |
81 http://bioperl.org/MailList.shtml - About the mailing lists | |
82 | |
83 =head2 Reporting Bugs | |
84 | |
85 Report bugs to the Bioperl bug tracking system to help us keep track | |
86 of the bugs and their resolution. Bug reports can be submitted via | |
87 email or the web: | |
88 | |
89 bioperl-bugs@bioperl.org | |
90 http://bugzilla.bioperl.org/ | |
91 | |
92 =head1 AUTHOR - Jason Stajich | |
93 | |
94 Email jason@bioperl.org | |
95 | |
96 =head1 CONTRIBUTORS | |
97 | |
98 Using code and strategy developed by Lincoln Stein (lstein@cshl.org) | |
99 in Bio::DB::GFF implementation. | |
100 | |
101 =head1 APPENDIX | |
102 | |
103 The rest of the documentation details each of the object methods. | |
104 Internal methods are usually preceded with a _ | |
105 | |
106 =cut | |
107 | |
108 | |
109 # Let the code begin... | |
110 | |
111 | |
112 package Bio::SeqFeature::Collection; | |
113 use vars qw(@ISA); | |
114 use strict; | |
115 | |
116 # Object preamble - inherits from Bio::Root::Root | |
117 | |
118 use Bio::Root::Root; | |
119 use Bio::Root::IO; | |
120 use Bio::DB::GFF::Util::Binning; | |
121 use DB_File; | |
122 use Bio::Location::Simple; | |
123 | |
124 @ISA = qw(Bio::Root::Root ); | |
125 | |
126 | |
127 # This may need to get re-optimized for BDB usage as these | |
128 # numbers were derived empirically by Lincoln on a mysql srv | |
129 # running on his laptop | |
130 | |
131 # this is the largest that any reference sequence can be (100 megabases) | |
132 use constant MAX_BIN => 100_000_000; | |
133 | |
134 # this is the smallest bin (1 K) | |
135 use constant MIN_BIN => 1_000; | |
136 | |
137 =head2 new | |
138 | |
139 Title : new | |
140 Usage : my $obj = new Bio::SeqFeature::Collection(); | |
141 Function: Builds a new Bio::SeqFeature::Collection object | |
142 Returns : Bio::SeqFeature::Collection | |
143 Args : | |
144 | |
145 -minbin minimum value to use for binning | |
146 (default is 100,000,000) | |
147 -maxbin maximum value to use for binning | |
148 (default is 1,000) | |
149 -usefile boolean to use a file to store | |
150 BTREE rather than an in-memory structure | |
151 (default is false or in-memory). | |
152 | |
153 -features Array ref of features to add initially | |
154 | |
155 =cut | |
156 | |
157 sub new { | |
158 my($class,@args) = @_; | |
159 | |
160 my $self = $class->SUPER::new(@args); | |
161 my ($maxbin,$minbin,$usefile,$features) = $self->_rearrange([qw(MAXBIN MINBIN | |
162 USEFILE | |
163 FEATURES)],@args); | |
164 | |
165 defined $maxbin && $self->max_bin($maxbin); | |
166 defined $minbin && $self->min_bin($minbin); | |
167 | |
168 defined $features && $self->add_features($features); | |
169 $DB_BTREE->{'flags'} = R_DUP ; | |
170 $DB_BTREE->{'compare'} = \&_compare; | |
171 # $DB_BTREE->{'compare'} = \&_comparepack; | |
172 $self->{'_btreehash'} = {}; | |
173 | |
174 my $tmpname = undef; | |
175 if( $usefile ) { | |
176 $self->{'_io'} = new Bio::Root::IO; | |
177 (undef,$tmpname) = $self->{'_io'}->tempfile(); | |
178 unlink($tmpname); | |
179 $self->debug("tmpfile is $tmpname"); | |
180 } | |
181 $self->{'_btree'} = tie %{$self->{'_btreehash'}}, | |
182 'DB_File', $tmpname, O_RDWR|O_CREAT, 0640, $DB_BTREE; | |
183 | |
184 # possibly storing/retrieving as floats for speed improvement? | |
185 # $self->{'_btree'}->filter_store_key ( sub { $_ = pack ("d", $_) } ); | |
186 # $self->{'_btree'}->filter_fetch_key ( sub { $_ = unpack("d", $_) } ); | |
187 | |
188 $self->{'_features'} = []; | |
189 return $self; | |
190 } | |
191 | |
192 | |
193 =head2 add_features | |
194 | |
195 Title : add_features | |
196 Usage : $collection->add_features(\@features); | |
197 Function: | |
198 Returns : number of features added | |
199 Args : arrayref of Bio::SeqFeatureI objects to index | |
200 | |
201 | |
202 =cut | |
203 | |
204 sub add_features{ | |
205 my ($self,$feats) = @_; | |
206 if( ref($feats) !~ /ARRAY/i ) { | |
207 $self->warn("Must provide a valid Array reference to add_features"); | |
208 return 0; | |
209 } | |
210 my $count = 0; | |
211 foreach my $f ( @$feats ) { | |
212 if( ! $f || ! ref($f) || ! $f->isa('Bio::RangeI') ) { | |
213 $self->warn("Must provide valid Bio::RangeI objects to add_features, skipping object '$f'\n"); | |
214 next; | |
215 } | |
216 my $bin = bin($f->start,$f->end,$self->min_bin); | |
217 | |
218 push @{$self->{'_features'}}, $f; | |
219 $self->{'_btreehash'}->{$bin} = $#{$self->{'_features'}}; | |
220 $self->debug( "$bin for ". $f->location->to_FTstring(). " matches ".$#{$self->{'_features'}}. "\n"); | |
221 $count++; | |
222 } | |
223 return $count; | |
224 } | |
225 | |
226 | |
227 =head2 features_in_range | |
228 | |
229 Title : features_in_range | |
230 Usage : my @features = $collection->features_in_range($range) | |
231 Function: Retrieves a list of features which were contained or overlap the | |
232 the requested range (see Args for way to specify overlap or | |
233 only those containe)d | |
234 Returns : List of Bio::SeqFeatureI objects | |
235 Args : -range => Bio::RangeI object defining range to search, | |
236 OR | |
237 -start => start, | |
238 -end => end, | |
239 -strand => strand | |
240 | |
241 -contain => boolean - true if feature must be completely | |
242 contained with range | |
243 OR false if should include features that simply overlap | |
244 the range. Default: true. | |
245 -strandmatch => 'strong', ranges must have the same strand | |
246 'weak', ranges must have the same | |
247 strand or no strand | |
248 'ignore', ignore strand information | |
249 Default. 'ignore'. | |
250 | |
251 =cut | |
252 | |
253 sub features_in_range{ | |
254 my $self = shift; | |
255 my (@args) = @_; | |
256 my ($range, $contain, $strandmatch,$start,$end,$strand); | |
257 if( @args == 1 ) { | |
258 $range = shift @args; | |
259 } else { | |
260 ($start,$end,$strand,$range, | |
261 $contain,$strandmatch) = $self->_rearrange([qw(START END | |
262 STRAND | |
263 RANGE CONTAIN | |
264 STRANDMATCH)], | |
265 @args); | |
266 $contain = 1 unless defined $contain; | |
267 } | |
268 $strand = 1 unless defined $strand; | |
269 if( $strand !~ /^([\-\+])$/ && | |
270 $strand !~ /^[\-\+]?1$/ ) { | |
271 $self->warn("must provide a valid numeric or +/- for strand"); | |
272 return (); | |
273 } | |
274 if( defined $1 ) { $strand .= 1; } | |
275 | |
276 if( !defined $start && !defined $end ) { | |
277 if( ! defined $range || !ref($range) || ! $range->isa("Bio::RangeI") ) | |
278 { | |
279 $self->warn("Must defined a valid Range for the method feature_in_range"); | |
280 return (); | |
281 } | |
282 ($start,$end,$strand) = ($range->start,$range->end,$range->strand); | |
283 } | |
284 my $r = new Bio::Location::Simple(-start => $start, | |
285 -end => $end, | |
286 -strand => $strand); | |
287 | |
288 my @features; | |
289 my $maxbin = $self->max_bin; | |
290 my $minbin = $self->min_bin; | |
291 my $tier = $maxbin; | |
292 my ($k,$v,@bins) = ("",undef); | |
293 while ($tier >= $minbin) { | |
294 my ($tier_start,$tier_stop) = (bin_bot($tier,$start), | |
295 bin_top($tier,$end)); | |
296 if( $tier_start == $tier_stop ) { | |
297 my @vals = $self->{'_btree'}->get_dup($tier_start); | |
298 if( scalar @vals > 0 ) { | |
299 push @bins, map { $self->{'_features'}->[$_] } @vals; | |
300 } | |
301 } else { | |
302 $k = $tier_start; | |
303 my @vals; | |
304 for( my $rc = $self->{'_btree'}->seq($k,$v,R_CURSOR); | |
305 $rc == 0; | |
306 $rc = $self->{'_btree'}->seq($k,$v, R_NEXT) ) { | |
307 last if( $k > $tier_stop || $k < $tier_start); | |
308 push @vals, $v; | |
309 } | |
310 foreach my $v ( @vals ) { | |
311 if( defined $self->{'_features'}->[$v] ) { | |
312 push @bins, $self->{'_features'}->[$v] ; | |
313 } else { | |
314 | |
315 } | |
316 | |
317 } | |
318 } | |
319 $tier /= 10; | |
320 } | |
321 | |
322 $strandmatch = 'ignore' unless defined $strandmatch; | |
323 return ( $contain ) ? grep { $r->contains($_,$strandmatch) } @bins : | |
324 grep { $r->overlaps($_,$strandmatch)} @bins; | |
325 } | |
326 | |
327 =head2 remove_features | |
328 | |
329 Title : remove_features | |
330 Usage : $collection->remove_features(\@array) | |
331 Function: Removes the requested sequence features (based on features | |
332 which have the same location) | |
333 Returns : Number of features removed | |
334 Args : Arrayref of Bio::RangeI objects | |
335 | |
336 | |
337 =cut | |
338 | |
339 sub remove_features{ | |
340 my ($self,$feats) = @_; | |
341 if( ref($feats) !~ /ARRAY/i ) { | |
342 $self->warn("Must provide a valid Array reference to remove_features"); | |
343 return 0; | |
344 } | |
345 my $countprocessed = 0; | |
346 foreach my $f ( @$feats ) { | |
347 next if ! ref($f) || ! $f->isa('Bio::RangeI'); | |
348 my $bin = bin($f->start,$f->end,$self->min_bin); | |
349 my @vals = $self->{'_btree'}->get_dup($bin); | |
350 my $vcount = scalar @vals; | |
351 foreach my $v ( @vals ) { | |
352 # eventually this array will become sparse... | |
353 if( $self->{'_features'}->[$v] == $f ) { | |
354 $self->{'_features'}->[$v] = undef; | |
355 $self->{'_btree'}->del_dup($bin,$v); | |
356 $vcount--; | |
357 } | |
358 } | |
359 if( $vcount == 0 ) { | |
360 $self->{'_btree'}->del($bin); | |
361 } | |
362 } | |
363 } | |
364 | |
365 =head2 get_all_features | |
366 | |
367 Title : get_all_features | |
368 Usage : my @f = $col->get_all_features() | |
369 Function: Return all the features stored in this collection (Could be large) | |
370 Returns : Array of Bio::RangeI objects | |
371 Args : None | |
372 | |
373 | |
374 =cut | |
375 | |
376 sub get_all_features{ | |
377 my ($self) = @_; | |
378 return grep {defined $_} @{ $self->{'_features'} }; | |
379 } | |
380 | |
381 | |
382 =head2 min_bin | |
383 | |
384 Title : min_bin | |
385 Usage : my $minbin= $self->min_bin; | |
386 Function: Get/Set the minimum value to use for binning | |
387 Returns : integer | |
388 Args : [optional] minimum bin value | |
389 | |
390 | |
391 =cut | |
392 | |
393 sub min_bin { | |
394 my ($self,$min) = @_; | |
395 if( defined $min ) { | |
396 $self->{'_min_bin'} = $min; | |
397 } | |
398 return $self->{'_min_bin'} || MIN_BIN; | |
399 } | |
400 | |
401 =head2 max_bin | |
402 | |
403 Title : max_bin | |
404 Usage : my $maxbin= $self->max_bin; | |
405 Function: Get/Set the maximum value to use for binning | |
406 Returns : integer | |
407 Args : [optional] maximum bin value | |
408 | |
409 | |
410 =cut | |
411 | |
412 sub max_bin { | |
413 my ($self,$max) = @_; | |
414 if( defined $max ) { | |
415 $self->{'_max_bin'} = $max; | |
416 } | |
417 return $self->{'max_bin'} || MAX_BIN; | |
418 } | |
419 | |
420 =head2 feature_count | |
421 | |
422 Title : feature_count | |
423 Usage : my $c = $col->feature_count() | |
424 Function: Retrieve the total number of features in the collection | |
425 Returns : integer | |
426 Args : none | |
427 | |
428 | |
429 =cut | |
430 | |
431 sub feature_count{ | |
432 my ($self) = @_; | |
433 return scalar ( grep {defined $_} @{ $self->{'_features'} }); | |
434 } | |
435 | |
436 sub _compare{ if( defined $_[0] && ! defined $_[1] ) { return -1 } | |
437 elsif ( defined $_[1] && ! defined $_[0] ) { return 1} | |
438 $_[0] <=> $_[1]} | |
439 | |
440 sub _comparepack { unpack("d", $_[0]) <=> unpack("d", $_[1]) ;} | |
441 | |
442 sub DESTROY { | |
443 my $self = shift; | |
444 $self->SUPER::DESTROY(); | |
445 if( defined $self->{'_io'} ) { | |
446 $self->{'_io'}->_io_cleanup(); | |
447 $self->{'_io'} = undef; | |
448 } | |
449 $self->{'_btree'} = undef; | |
450 } | |
451 | |
452 1; |