0
|
1 # $Id: Collection.pm,v 1.9.2.1 2003/02/21 03:07:19 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::SeqFeature::Collection
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SeqFeature::Collection - A container class for SeqFeatures
|
|
16 suitable for performing operations such as finding features within a
|
|
17 range, that match a certain feature type, etc.
|
|
18
|
|
19 =head1 SYNOPSIS
|
|
20
|
|
21 use Bio::SeqFeature::Collection;
|
|
22 use Bio::Location::Simple;
|
|
23 use Bio::Tools::GFF;
|
|
24 use Bio::Root::IO;
|
|
25 # let's first input some features
|
|
26 my $gffio = Bio::Tools::GFF->new(-file => Bio::Root::IO->catfile
|
|
27 ("t","data","myco_sites.gff"),
|
|
28 -gff_version => 2);
|
|
29 my @features = ();
|
|
30 # loop over the input stream
|
|
31 while(my $feature = $gffio->next_feature()) {
|
|
32 # do something with feature
|
|
33 push @features, $feature;
|
|
34 }
|
|
35 $gffio->close();
|
|
36 # build the Collection object
|
|
37 my $col = new Bio::SeqFeature::Collection();
|
|
38 # add these features to the object
|
|
39 my $totaladded = $col->add_features(\@features);
|
|
40
|
|
41 my @subset = $col->features_in_range(-start => 1,
|
|
42 -end => 25000,
|
|
43 -strand => 1,
|
|
44 -contain => 0);
|
|
45 # subset should have 18 entries for this dataset
|
|
46 print "size is ", scalar @subset, "\n";
|
|
47 @subset = $col->features_in_range(-range => Bio::Location::Simple->new
|
|
48 (-start => 70000,
|
|
49 -end => 150000,
|
|
50 -strand => -1),
|
|
51 -contain => 1,
|
|
52 -strandmatch => 'strong');
|
|
53
|
|
54 # subset should have 22 entries for this dataset
|
|
55 print "size is ", scalar @subset, "\n";
|
|
56 print "total number of features in collection is ",
|
|
57 $col->feature_count(),"\n";
|
|
58
|
|
59 =head1 DESCRIPTION
|
|
60
|
|
61 This object will efficiently allow one for query subsets of ranges
|
|
62 within a large collection of sequence features (in fact the objects
|
|
63 just have to be Bio::RangeI compliant). This is done by the creation
|
|
64 of bins which are stored in order in a B-Tree data structure as
|
|
65 provided by the DB_File interface to the Berkeley DB.
|
|
66
|
|
67 This is based on work done by Lincoln for storage in a mysql instance
|
|
68 - this is intended to be an embedded in-memory implementation for
|
|
69 easily quering for subsets of a large range set. All features are
|
|
70 held in memory even if the -usefile flag is provided.
|
|
71
|
|
72 =head1 FEEDBACK
|
|
73
|
|
74 =head2 Mailing Lists
|
|
75
|
|
76 User feedback is an integral part of the evolution of this and other
|
|
77 Bioperl modules. Send your comments and suggestions preferably to
|
|
78 the Bioperl mailing list. Your participation is much appreciated.
|
|
79
|
|
80 bioperl-l@bioperl.org - General discussion
|
|
81 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
82
|
|
83 =head2 Reporting Bugs
|
|
84
|
|
85 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
86 of the bugs and their resolution. Bug reports can be submitted via
|
|
87 email or the web:
|
|
88
|
|
89 bioperl-bugs@bioperl.org
|
|
90 http://bugzilla.bioperl.org/
|
|
91
|
|
92 =head1 AUTHOR - Jason Stajich
|
|
93
|
|
94 Email jason@bioperl.org
|
|
95
|
|
96 =head1 CONTRIBUTORS
|
|
97
|
|
98 Using code and strategy developed by Lincoln Stein (lstein@cshl.org)
|
|
99 in Bio::DB::GFF implementation.
|
|
100
|
|
101 =head1 APPENDIX
|
|
102
|
|
103 The rest of the documentation details each of the object methods.
|
|
104 Internal methods are usually preceded with a _
|
|
105
|
|
106 =cut
|
|
107
|
|
108
|
|
109 # Let the code begin...
|
|
110
|
|
111
|
|
112 package Bio::SeqFeature::Collection;
|
|
113 use vars qw(@ISA);
|
|
114 use strict;
|
|
115
|
|
116 # Object preamble - inherits from Bio::Root::Root
|
|
117
|
|
118 use Bio::Root::Root;
|
|
119 use Bio::Root::IO;
|
|
120 use Bio::DB::GFF::Util::Binning;
|
|
121 use DB_File;
|
|
122 use Bio::Location::Simple;
|
|
123
|
|
124 @ISA = qw(Bio::Root::Root );
|
|
125
|
|
126
|
|
127 # This may need to get re-optimized for BDB usage as these
|
|
128 # numbers were derived empirically by Lincoln on a mysql srv
|
|
129 # running on his laptop
|
|
130
|
|
131 # this is the largest that any reference sequence can be (100 megabases)
|
|
132 use constant MAX_BIN => 100_000_000;
|
|
133
|
|
134 # this is the smallest bin (1 K)
|
|
135 use constant MIN_BIN => 1_000;
|
|
136
|
|
137 =head2 new
|
|
138
|
|
139 Title : new
|
|
140 Usage : my $obj = new Bio::SeqFeature::Collection();
|
|
141 Function: Builds a new Bio::SeqFeature::Collection object
|
|
142 Returns : Bio::SeqFeature::Collection
|
|
143 Args :
|
|
144
|
|
145 -minbin minimum value to use for binning
|
|
146 (default is 100,000,000)
|
|
147 -maxbin maximum value to use for binning
|
|
148 (default is 1,000)
|
|
149 -usefile boolean to use a file to store
|
|
150 BTREE rather than an in-memory structure
|
|
151 (default is false or in-memory).
|
|
152
|
|
153 -features Array ref of features to add initially
|
|
154
|
|
155 =cut
|
|
156
|
|
157 sub new {
|
|
158 my($class,@args) = @_;
|
|
159
|
|
160 my $self = $class->SUPER::new(@args);
|
|
161 my ($maxbin,$minbin,$usefile,$features) = $self->_rearrange([qw(MAXBIN MINBIN
|
|
162 USEFILE
|
|
163 FEATURES)],@args);
|
|
164
|
|
165 defined $maxbin && $self->max_bin($maxbin);
|
|
166 defined $minbin && $self->min_bin($minbin);
|
|
167
|
|
168 defined $features && $self->add_features($features);
|
|
169 $DB_BTREE->{'flags'} = R_DUP ;
|
|
170 $DB_BTREE->{'compare'} = \&_compare;
|
|
171 # $DB_BTREE->{'compare'} = \&_comparepack;
|
|
172 $self->{'_btreehash'} = {};
|
|
173
|
|
174 my $tmpname = undef;
|
|
175 if( $usefile ) {
|
|
176 $self->{'_io'} = new Bio::Root::IO;
|
|
177 (undef,$tmpname) = $self->{'_io'}->tempfile();
|
|
178 unlink($tmpname);
|
|
179 $self->debug("tmpfile is $tmpname");
|
|
180 }
|
|
181 $self->{'_btree'} = tie %{$self->{'_btreehash'}},
|
|
182 'DB_File', $tmpname, O_RDWR|O_CREAT, 0640, $DB_BTREE;
|
|
183
|
|
184 # possibly storing/retrieving as floats for speed improvement?
|
|
185 # $self->{'_btree'}->filter_store_key ( sub { $_ = pack ("d", $_) } );
|
|
186 # $self->{'_btree'}->filter_fetch_key ( sub { $_ = unpack("d", $_) } );
|
|
187
|
|
188 $self->{'_features'} = [];
|
|
189 return $self;
|
|
190 }
|
|
191
|
|
192
|
|
193 =head2 add_features
|
|
194
|
|
195 Title : add_features
|
|
196 Usage : $collection->add_features(\@features);
|
|
197 Function:
|
|
198 Returns : number of features added
|
|
199 Args : arrayref of Bio::SeqFeatureI objects to index
|
|
200
|
|
201
|
|
202 =cut
|
|
203
|
|
204 sub add_features{
|
|
205 my ($self,$feats) = @_;
|
|
206 if( ref($feats) !~ /ARRAY/i ) {
|
|
207 $self->warn("Must provide a valid Array reference to add_features");
|
|
208 return 0;
|
|
209 }
|
|
210 my $count = 0;
|
|
211 foreach my $f ( @$feats ) {
|
|
212 if( ! $f || ! ref($f) || ! $f->isa('Bio::RangeI') ) {
|
|
213 $self->warn("Must provide valid Bio::RangeI objects to add_features, skipping object '$f'\n");
|
|
214 next;
|
|
215 }
|
|
216 my $bin = bin($f->start,$f->end,$self->min_bin);
|
|
217
|
|
218 push @{$self->{'_features'}}, $f;
|
|
219 $self->{'_btreehash'}->{$bin} = $#{$self->{'_features'}};
|
|
220 $self->debug( "$bin for ". $f->location->to_FTstring(). " matches ".$#{$self->{'_features'}}. "\n");
|
|
221 $count++;
|
|
222 }
|
|
223 return $count;
|
|
224 }
|
|
225
|
|
226
|
|
227 =head2 features_in_range
|
|
228
|
|
229 Title : features_in_range
|
|
230 Usage : my @features = $collection->features_in_range($range)
|
|
231 Function: Retrieves a list of features which were contained or overlap the
|
|
232 the requested range (see Args for way to specify overlap or
|
|
233 only those containe)d
|
|
234 Returns : List of Bio::SeqFeatureI objects
|
|
235 Args : -range => Bio::RangeI object defining range to search,
|
|
236 OR
|
|
237 -start => start,
|
|
238 -end => end,
|
|
239 -strand => strand
|
|
240
|
|
241 -contain => boolean - true if feature must be completely
|
|
242 contained with range
|
|
243 OR false if should include features that simply overlap
|
|
244 the range. Default: true.
|
|
245 -strandmatch => 'strong', ranges must have the same strand
|
|
246 'weak', ranges must have the same
|
|
247 strand or no strand
|
|
248 'ignore', ignore strand information
|
|
249 Default. 'ignore'.
|
|
250
|
|
251 =cut
|
|
252
|
|
253 sub features_in_range{
|
|
254 my $self = shift;
|
|
255 my (@args) = @_;
|
|
256 my ($range, $contain, $strandmatch,$start,$end,$strand);
|
|
257 if( @args == 1 ) {
|
|
258 $range = shift @args;
|
|
259 } else {
|
|
260 ($start,$end,$strand,$range,
|
|
261 $contain,$strandmatch) = $self->_rearrange([qw(START END
|
|
262 STRAND
|
|
263 RANGE CONTAIN
|
|
264 STRANDMATCH)],
|
|
265 @args);
|
|
266 $contain = 1 unless defined $contain;
|
|
267 }
|
|
268 $strand = 1 unless defined $strand;
|
|
269 if( $strand !~ /^([\-\+])$/ &&
|
|
270 $strand !~ /^[\-\+]?1$/ ) {
|
|
271 $self->warn("must provide a valid numeric or +/- for strand");
|
|
272 return ();
|
|
273 }
|
|
274 if( defined $1 ) { $strand .= 1; }
|
|
275
|
|
276 if( !defined $start && !defined $end ) {
|
|
277 if( ! defined $range || !ref($range) || ! $range->isa("Bio::RangeI") )
|
|
278 {
|
|
279 $self->warn("Must defined a valid Range for the method feature_in_range");
|
|
280 return ();
|
|
281 }
|
|
282 ($start,$end,$strand) = ($range->start,$range->end,$range->strand);
|
|
283 }
|
|
284 my $r = new Bio::Location::Simple(-start => $start,
|
|
285 -end => $end,
|
|
286 -strand => $strand);
|
|
287
|
|
288 my @features;
|
|
289 my $maxbin = $self->max_bin;
|
|
290 my $minbin = $self->min_bin;
|
|
291 my $tier = $maxbin;
|
|
292 my ($k,$v,@bins) = ("",undef);
|
|
293 while ($tier >= $minbin) {
|
|
294 my ($tier_start,$tier_stop) = (bin_bot($tier,$start),
|
|
295 bin_top($tier,$end));
|
|
296 if( $tier_start == $tier_stop ) {
|
|
297 my @vals = $self->{'_btree'}->get_dup($tier_start);
|
|
298 if( scalar @vals > 0 ) {
|
|
299 push @bins, map { $self->{'_features'}->[$_] } @vals;
|
|
300 }
|
|
301 } else {
|
|
302 $k = $tier_start;
|
|
303 my @vals;
|
|
304 for( my $rc = $self->{'_btree'}->seq($k,$v,R_CURSOR);
|
|
305 $rc == 0;
|
|
306 $rc = $self->{'_btree'}->seq($k,$v, R_NEXT) ) {
|
|
307 last if( $k > $tier_stop || $k < $tier_start);
|
|
308 push @vals, $v;
|
|
309 }
|
|
310 foreach my $v ( @vals ) {
|
|
311 if( defined $self->{'_features'}->[$v] ) {
|
|
312 push @bins, $self->{'_features'}->[$v] ;
|
|
313 } else {
|
|
314
|
|
315 }
|
|
316
|
|
317 }
|
|
318 }
|
|
319 $tier /= 10;
|
|
320 }
|
|
321
|
|
322 $strandmatch = 'ignore' unless defined $strandmatch;
|
|
323 return ( $contain ) ? grep { $r->contains($_,$strandmatch) } @bins :
|
|
324 grep { $r->overlaps($_,$strandmatch)} @bins;
|
|
325 }
|
|
326
|
|
327 =head2 remove_features
|
|
328
|
|
329 Title : remove_features
|
|
330 Usage : $collection->remove_features(\@array)
|
|
331 Function: Removes the requested sequence features (based on features
|
|
332 which have the same location)
|
|
333 Returns : Number of features removed
|
|
334 Args : Arrayref of Bio::RangeI objects
|
|
335
|
|
336
|
|
337 =cut
|
|
338
|
|
339 sub remove_features{
|
|
340 my ($self,$feats) = @_;
|
|
341 if( ref($feats) !~ /ARRAY/i ) {
|
|
342 $self->warn("Must provide a valid Array reference to remove_features");
|
|
343 return 0;
|
|
344 }
|
|
345 my $countprocessed = 0;
|
|
346 foreach my $f ( @$feats ) {
|
|
347 next if ! ref($f) || ! $f->isa('Bio::RangeI');
|
|
348 my $bin = bin($f->start,$f->end,$self->min_bin);
|
|
349 my @vals = $self->{'_btree'}->get_dup($bin);
|
|
350 my $vcount = scalar @vals;
|
|
351 foreach my $v ( @vals ) {
|
|
352 # eventually this array will become sparse...
|
|
353 if( $self->{'_features'}->[$v] == $f ) {
|
|
354 $self->{'_features'}->[$v] = undef;
|
|
355 $self->{'_btree'}->del_dup($bin,$v);
|
|
356 $vcount--;
|
|
357 }
|
|
358 }
|
|
359 if( $vcount == 0 ) {
|
|
360 $self->{'_btree'}->del($bin);
|
|
361 }
|
|
362 }
|
|
363 }
|
|
364
|
|
365 =head2 get_all_features
|
|
366
|
|
367 Title : get_all_features
|
|
368 Usage : my @f = $col->get_all_features()
|
|
369 Function: Return all the features stored in this collection (Could be large)
|
|
370 Returns : Array of Bio::RangeI objects
|
|
371 Args : None
|
|
372
|
|
373
|
|
374 =cut
|
|
375
|
|
376 sub get_all_features{
|
|
377 my ($self) = @_;
|
|
378 return grep {defined $_} @{ $self->{'_features'} };
|
|
379 }
|
|
380
|
|
381
|
|
382 =head2 min_bin
|
|
383
|
|
384 Title : min_bin
|
|
385 Usage : my $minbin= $self->min_bin;
|
|
386 Function: Get/Set the minimum value to use for binning
|
|
387 Returns : integer
|
|
388 Args : [optional] minimum bin value
|
|
389
|
|
390
|
|
391 =cut
|
|
392
|
|
393 sub min_bin {
|
|
394 my ($self,$min) = @_;
|
|
395 if( defined $min ) {
|
|
396 $self->{'_min_bin'} = $min;
|
|
397 }
|
|
398 return $self->{'_min_bin'} || MIN_BIN;
|
|
399 }
|
|
400
|
|
401 =head2 max_bin
|
|
402
|
|
403 Title : max_bin
|
|
404 Usage : my $maxbin= $self->max_bin;
|
|
405 Function: Get/Set the maximum value to use for binning
|
|
406 Returns : integer
|
|
407 Args : [optional] maximum bin value
|
|
408
|
|
409
|
|
410 =cut
|
|
411
|
|
412 sub max_bin {
|
|
413 my ($self,$max) = @_;
|
|
414 if( defined $max ) {
|
|
415 $self->{'_max_bin'} = $max;
|
|
416 }
|
|
417 return $self->{'max_bin'} || MAX_BIN;
|
|
418 }
|
|
419
|
|
420 =head2 feature_count
|
|
421
|
|
422 Title : feature_count
|
|
423 Usage : my $c = $col->feature_count()
|
|
424 Function: Retrieve the total number of features in the collection
|
|
425 Returns : integer
|
|
426 Args : none
|
|
427
|
|
428
|
|
429 =cut
|
|
430
|
|
431 sub feature_count{
|
|
432 my ($self) = @_;
|
|
433 return scalar ( grep {defined $_} @{ $self->{'_features'} });
|
|
434 }
|
|
435
|
|
436 sub _compare{ if( defined $_[0] && ! defined $_[1] ) { return -1 }
|
|
437 elsif ( defined $_[1] && ! defined $_[0] ) { return 1}
|
|
438 $_[0] <=> $_[1]}
|
|
439
|
|
440 sub _comparepack { unpack("d", $_[0]) <=> unpack("d", $_[1]) ;}
|
|
441
|
|
442 sub DESTROY {
|
|
443 my $self = shift;
|
|
444 $self->SUPER::DESTROY();
|
|
445 if( defined $self->{'_io'} ) {
|
|
446 $self->{'_io'}->_io_cleanup();
|
|
447 $self->{'_io'} = undef;
|
|
448 }
|
|
449 $self->{'_btree'} = undef;
|
|
450 }
|
|
451
|
|
452 1;
|