comparison variant_effect_predictor/Bio/SeqFeature/Collection.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: Collection.pm,v 1.9.2.1 2003/02/21 03:07:19 jason Exp $
2 #
3 # BioPerl module for Bio::SeqFeature::Collection
4 #
5 # Cared for by Jason Stajich <jason@bioperl.org>
6 #
7 # Copyright Jason Stajich
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::SeqFeature::Collection - A container class for SeqFeatures
16 suitable for performing operations such as finding features within a
17 range, that match a certain feature type, etc.
18
19 =head1 SYNOPSIS
20
21 use Bio::SeqFeature::Collection;
22 use Bio::Location::Simple;
23 use Bio::Tools::GFF;
24 use Bio::Root::IO;
25 # let's first input some features
26 my $gffio = Bio::Tools::GFF->new(-file => Bio::Root::IO->catfile
27 ("t","data","myco_sites.gff"),
28 -gff_version => 2);
29 my @features = ();
30 # loop over the input stream
31 while(my $feature = $gffio->next_feature()) {
32 # do something with feature
33 push @features, $feature;
34 }
35 $gffio->close();
36 # build the Collection object
37 my $col = new Bio::SeqFeature::Collection();
38 # add these features to the object
39 my $totaladded = $col->add_features(\@features);
40
41 my @subset = $col->features_in_range(-start => 1,
42 -end => 25000,
43 -strand => 1,
44 -contain => 0);
45 # subset should have 18 entries for this dataset
46 print "size is ", scalar @subset, "\n";
47 @subset = $col->features_in_range(-range => Bio::Location::Simple->new
48 (-start => 70000,
49 -end => 150000,
50 -strand => -1),
51 -contain => 1,
52 -strandmatch => 'strong');
53
54 # subset should have 22 entries for this dataset
55 print "size is ", scalar @subset, "\n";
56 print "total number of features in collection is ",
57 $col->feature_count(),"\n";
58
59 =head1 DESCRIPTION
60
61 This object will efficiently allow one for query subsets of ranges
62 within a large collection of sequence features (in fact the objects
63 just have to be Bio::RangeI compliant). This is done by the creation
64 of bins which are stored in order in a B-Tree data structure as
65 provided by the DB_File interface to the Berkeley DB.
66
67 This is based on work done by Lincoln for storage in a mysql instance
68 - this is intended to be an embedded in-memory implementation for
69 easily quering for subsets of a large range set. All features are
70 held in memory even if the -usefile flag is provided.
71
72 =head1 FEEDBACK
73
74 =head2 Mailing Lists
75
76 User feedback is an integral part of the evolution of this and other
77 Bioperl modules. Send your comments and suggestions preferably to
78 the Bioperl mailing list. Your participation is much appreciated.
79
80 bioperl-l@bioperl.org - General discussion
81 http://bioperl.org/MailList.shtml - About the mailing lists
82
83 =head2 Reporting Bugs
84
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 of the bugs and their resolution. Bug reports can be submitted via
87 email or the web:
88
89 bioperl-bugs@bioperl.org
90 http://bugzilla.bioperl.org/
91
92 =head1 AUTHOR - Jason Stajich
93
94 Email jason@bioperl.org
95
96 =head1 CONTRIBUTORS
97
98 Using code and strategy developed by Lincoln Stein (lstein@cshl.org)
99 in Bio::DB::GFF implementation.
100
101 =head1 APPENDIX
102
103 The rest of the documentation details each of the object methods.
104 Internal methods are usually preceded with a _
105
106 =cut
107
108
109 # Let the code begin...
110
111
112 package Bio::SeqFeature::Collection;
113 use vars qw(@ISA);
114 use strict;
115
116 # Object preamble - inherits from Bio::Root::Root
117
118 use Bio::Root::Root;
119 use Bio::Root::IO;
120 use Bio::DB::GFF::Util::Binning;
121 use DB_File;
122 use Bio::Location::Simple;
123
124 @ISA = qw(Bio::Root::Root );
125
126
127 # This may need to get re-optimized for BDB usage as these
128 # numbers were derived empirically by Lincoln on a mysql srv
129 # running on his laptop
130
131 # this is the largest that any reference sequence can be (100 megabases)
132 use constant MAX_BIN => 100_000_000;
133
134 # this is the smallest bin (1 K)
135 use constant MIN_BIN => 1_000;
136
137 =head2 new
138
139 Title : new
140 Usage : my $obj = new Bio::SeqFeature::Collection();
141 Function: Builds a new Bio::SeqFeature::Collection object
142 Returns : Bio::SeqFeature::Collection
143 Args :
144
145 -minbin minimum value to use for binning
146 (default is 100,000,000)
147 -maxbin maximum value to use for binning
148 (default is 1,000)
149 -usefile boolean to use a file to store
150 BTREE rather than an in-memory structure
151 (default is false or in-memory).
152
153 -features Array ref of features to add initially
154
155 =cut
156
157 sub new {
158 my($class,@args) = @_;
159
160 my $self = $class->SUPER::new(@args);
161 my ($maxbin,$minbin,$usefile,$features) = $self->_rearrange([qw(MAXBIN MINBIN
162 USEFILE
163 FEATURES)],@args);
164
165 defined $maxbin && $self->max_bin($maxbin);
166 defined $minbin && $self->min_bin($minbin);
167
168 defined $features && $self->add_features($features);
169 $DB_BTREE->{'flags'} = R_DUP ;
170 $DB_BTREE->{'compare'} = \&_compare;
171 # $DB_BTREE->{'compare'} = \&_comparepack;
172 $self->{'_btreehash'} = {};
173
174 my $tmpname = undef;
175 if( $usefile ) {
176 $self->{'_io'} = new Bio::Root::IO;
177 (undef,$tmpname) = $self->{'_io'}->tempfile();
178 unlink($tmpname);
179 $self->debug("tmpfile is $tmpname");
180 }
181 $self->{'_btree'} = tie %{$self->{'_btreehash'}},
182 'DB_File', $tmpname, O_RDWR|O_CREAT, 0640, $DB_BTREE;
183
184 # possibly storing/retrieving as floats for speed improvement?
185 # $self->{'_btree'}->filter_store_key ( sub { $_ = pack ("d", $_) } );
186 # $self->{'_btree'}->filter_fetch_key ( sub { $_ = unpack("d", $_) } );
187
188 $self->{'_features'} = [];
189 return $self;
190 }
191
192
193 =head2 add_features
194
195 Title : add_features
196 Usage : $collection->add_features(\@features);
197 Function:
198 Returns : number of features added
199 Args : arrayref of Bio::SeqFeatureI objects to index
200
201
202 =cut
203
204 sub add_features{
205 my ($self,$feats) = @_;
206 if( ref($feats) !~ /ARRAY/i ) {
207 $self->warn("Must provide a valid Array reference to add_features");
208 return 0;
209 }
210 my $count = 0;
211 foreach my $f ( @$feats ) {
212 if( ! $f || ! ref($f) || ! $f->isa('Bio::RangeI') ) {
213 $self->warn("Must provide valid Bio::RangeI objects to add_features, skipping object '$f'\n");
214 next;
215 }
216 my $bin = bin($f->start,$f->end,$self->min_bin);
217
218 push @{$self->{'_features'}}, $f;
219 $self->{'_btreehash'}->{$bin} = $#{$self->{'_features'}};
220 $self->debug( "$bin for ". $f->location->to_FTstring(). " matches ".$#{$self->{'_features'}}. "\n");
221 $count++;
222 }
223 return $count;
224 }
225
226
227 =head2 features_in_range
228
229 Title : features_in_range
230 Usage : my @features = $collection->features_in_range($range)
231 Function: Retrieves a list of features which were contained or overlap the
232 the requested range (see Args for way to specify overlap or
233 only those containe)d
234 Returns : List of Bio::SeqFeatureI objects
235 Args : -range => Bio::RangeI object defining range to search,
236 OR
237 -start => start,
238 -end => end,
239 -strand => strand
240
241 -contain => boolean - true if feature must be completely
242 contained with range
243 OR false if should include features that simply overlap
244 the range. Default: true.
245 -strandmatch => 'strong', ranges must have the same strand
246 'weak', ranges must have the same
247 strand or no strand
248 'ignore', ignore strand information
249 Default. 'ignore'.
250
251 =cut
252
253 sub features_in_range{
254 my $self = shift;
255 my (@args) = @_;
256 my ($range, $contain, $strandmatch,$start,$end,$strand);
257 if( @args == 1 ) {
258 $range = shift @args;
259 } else {
260 ($start,$end,$strand,$range,
261 $contain,$strandmatch) = $self->_rearrange([qw(START END
262 STRAND
263 RANGE CONTAIN
264 STRANDMATCH)],
265 @args);
266 $contain = 1 unless defined $contain;
267 }
268 $strand = 1 unless defined $strand;
269 if( $strand !~ /^([\-\+])$/ &&
270 $strand !~ /^[\-\+]?1$/ ) {
271 $self->warn("must provide a valid numeric or +/- for strand");
272 return ();
273 }
274 if( defined $1 ) { $strand .= 1; }
275
276 if( !defined $start && !defined $end ) {
277 if( ! defined $range || !ref($range) || ! $range->isa("Bio::RangeI") )
278 {
279 $self->warn("Must defined a valid Range for the method feature_in_range");
280 return ();
281 }
282 ($start,$end,$strand) = ($range->start,$range->end,$range->strand);
283 }
284 my $r = new Bio::Location::Simple(-start => $start,
285 -end => $end,
286 -strand => $strand);
287
288 my @features;
289 my $maxbin = $self->max_bin;
290 my $minbin = $self->min_bin;
291 my $tier = $maxbin;
292 my ($k,$v,@bins) = ("",undef);
293 while ($tier >= $minbin) {
294 my ($tier_start,$tier_stop) = (bin_bot($tier,$start),
295 bin_top($tier,$end));
296 if( $tier_start == $tier_stop ) {
297 my @vals = $self->{'_btree'}->get_dup($tier_start);
298 if( scalar @vals > 0 ) {
299 push @bins, map { $self->{'_features'}->[$_] } @vals;
300 }
301 } else {
302 $k = $tier_start;
303 my @vals;
304 for( my $rc = $self->{'_btree'}->seq($k,$v,R_CURSOR);
305 $rc == 0;
306 $rc = $self->{'_btree'}->seq($k,$v, R_NEXT) ) {
307 last if( $k > $tier_stop || $k < $tier_start);
308 push @vals, $v;
309 }
310 foreach my $v ( @vals ) {
311 if( defined $self->{'_features'}->[$v] ) {
312 push @bins, $self->{'_features'}->[$v] ;
313 } else {
314
315 }
316
317 }
318 }
319 $tier /= 10;
320 }
321
322 $strandmatch = 'ignore' unless defined $strandmatch;
323 return ( $contain ) ? grep { $r->contains($_,$strandmatch) } @bins :
324 grep { $r->overlaps($_,$strandmatch)} @bins;
325 }
326
327 =head2 remove_features
328
329 Title : remove_features
330 Usage : $collection->remove_features(\@array)
331 Function: Removes the requested sequence features (based on features
332 which have the same location)
333 Returns : Number of features removed
334 Args : Arrayref of Bio::RangeI objects
335
336
337 =cut
338
339 sub remove_features{
340 my ($self,$feats) = @_;
341 if( ref($feats) !~ /ARRAY/i ) {
342 $self->warn("Must provide a valid Array reference to remove_features");
343 return 0;
344 }
345 my $countprocessed = 0;
346 foreach my $f ( @$feats ) {
347 next if ! ref($f) || ! $f->isa('Bio::RangeI');
348 my $bin = bin($f->start,$f->end,$self->min_bin);
349 my @vals = $self->{'_btree'}->get_dup($bin);
350 my $vcount = scalar @vals;
351 foreach my $v ( @vals ) {
352 # eventually this array will become sparse...
353 if( $self->{'_features'}->[$v] == $f ) {
354 $self->{'_features'}->[$v] = undef;
355 $self->{'_btree'}->del_dup($bin,$v);
356 $vcount--;
357 }
358 }
359 if( $vcount == 0 ) {
360 $self->{'_btree'}->del($bin);
361 }
362 }
363 }
364
365 =head2 get_all_features
366
367 Title : get_all_features
368 Usage : my @f = $col->get_all_features()
369 Function: Return all the features stored in this collection (Could be large)
370 Returns : Array of Bio::RangeI objects
371 Args : None
372
373
374 =cut
375
376 sub get_all_features{
377 my ($self) = @_;
378 return grep {defined $_} @{ $self->{'_features'} };
379 }
380
381
382 =head2 min_bin
383
384 Title : min_bin
385 Usage : my $minbin= $self->min_bin;
386 Function: Get/Set the minimum value to use for binning
387 Returns : integer
388 Args : [optional] minimum bin value
389
390
391 =cut
392
393 sub min_bin {
394 my ($self,$min) = @_;
395 if( defined $min ) {
396 $self->{'_min_bin'} = $min;
397 }
398 return $self->{'_min_bin'} || MIN_BIN;
399 }
400
401 =head2 max_bin
402
403 Title : max_bin
404 Usage : my $maxbin= $self->max_bin;
405 Function: Get/Set the maximum value to use for binning
406 Returns : integer
407 Args : [optional] maximum bin value
408
409
410 =cut
411
412 sub max_bin {
413 my ($self,$max) = @_;
414 if( defined $max ) {
415 $self->{'_max_bin'} = $max;
416 }
417 return $self->{'max_bin'} || MAX_BIN;
418 }
419
420 =head2 feature_count
421
422 Title : feature_count
423 Usage : my $c = $col->feature_count()
424 Function: Retrieve the total number of features in the collection
425 Returns : integer
426 Args : none
427
428
429 =cut
430
431 sub feature_count{
432 my ($self) = @_;
433 return scalar ( grep {defined $_} @{ $self->{'_features'} });
434 }
435
436 sub _compare{ if( defined $_[0] && ! defined $_[1] ) { return -1 }
437 elsif ( defined $_[1] && ! defined $_[0] ) { return 1}
438 $_[0] <=> $_[1]}
439
440 sub _comparepack { unpack("d", $_[0]) <=> unpack("d", $_[1]) ;}
441
442 sub DESTROY {
443 my $self = shift;
444 $self->SUPER::DESTROY();
445 if( defined $self->{'_io'} ) {
446 $self->{'_io'}->_io_cleanup();
447 $self->{'_io'} = undef;
448 }
449 $self->{'_btree'} = undef;
450 }
451
452 1;