Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqFeature/Collection.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: Collection.pm,v 1.9.2.1 2003/02/21 03:07:19 jason Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::SeqFeature::Collection | |
| 4 # | |
| 5 # Cared for by Jason Stajich <jason@bioperl.org> | |
| 6 # | |
| 7 # Copyright Jason Stajich | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::SeqFeature::Collection - A container class for SeqFeatures | |
| 16 suitable for performing operations such as finding features within a | |
| 17 range, that match a certain feature type, etc. | |
| 18 | |
| 19 =head1 SYNOPSIS | |
| 20 | |
| 21 use Bio::SeqFeature::Collection; | |
| 22 use Bio::Location::Simple; | |
| 23 use Bio::Tools::GFF; | |
| 24 use Bio::Root::IO; | |
| 25 # let's first input some features | |
| 26 my $gffio = Bio::Tools::GFF->new(-file => Bio::Root::IO->catfile | |
| 27 ("t","data","myco_sites.gff"), | |
| 28 -gff_version => 2); | |
| 29 my @features = (); | |
| 30 # loop over the input stream | |
| 31 while(my $feature = $gffio->next_feature()) { | |
| 32 # do something with feature | |
| 33 push @features, $feature; | |
| 34 } | |
| 35 $gffio->close(); | |
| 36 # build the Collection object | |
| 37 my $col = new Bio::SeqFeature::Collection(); | |
| 38 # add these features to the object | |
| 39 my $totaladded = $col->add_features(\@features); | |
| 40 | |
| 41 my @subset = $col->features_in_range(-start => 1, | |
| 42 -end => 25000, | |
| 43 -strand => 1, | |
| 44 -contain => 0); | |
| 45 # subset should have 18 entries for this dataset | |
| 46 print "size is ", scalar @subset, "\n"; | |
| 47 @subset = $col->features_in_range(-range => Bio::Location::Simple->new | |
| 48 (-start => 70000, | |
| 49 -end => 150000, | |
| 50 -strand => -1), | |
| 51 -contain => 1, | |
| 52 -strandmatch => 'strong'); | |
| 53 | |
| 54 # subset should have 22 entries for this dataset | |
| 55 print "size is ", scalar @subset, "\n"; | |
| 56 print "total number of features in collection is ", | |
| 57 $col->feature_count(),"\n"; | |
| 58 | |
| 59 =head1 DESCRIPTION | |
| 60 | |
| 61 This object will efficiently allow one for query subsets of ranges | |
| 62 within a large collection of sequence features (in fact the objects | |
| 63 just have to be Bio::RangeI compliant). This is done by the creation | |
| 64 of bins which are stored in order in a B-Tree data structure as | |
| 65 provided by the DB_File interface to the Berkeley DB. | |
| 66 | |
| 67 This is based on work done by Lincoln for storage in a mysql instance | |
| 68 - this is intended to be an embedded in-memory implementation for | |
| 69 easily quering for subsets of a large range set. All features are | |
| 70 held in memory even if the -usefile flag is provided. | |
| 71 | |
| 72 =head1 FEEDBACK | |
| 73 | |
| 74 =head2 Mailing Lists | |
| 75 | |
| 76 User feedback is an integral part of the evolution of this and other | |
| 77 Bioperl modules. Send your comments and suggestions preferably to | |
| 78 the Bioperl mailing list. Your participation is much appreciated. | |
| 79 | |
| 80 bioperl-l@bioperl.org - General discussion | |
| 81 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 82 | |
| 83 =head2 Reporting Bugs | |
| 84 | |
| 85 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 86 of the bugs and their resolution. Bug reports can be submitted via | |
| 87 email or the web: | |
| 88 | |
| 89 bioperl-bugs@bioperl.org | |
| 90 http://bugzilla.bioperl.org/ | |
| 91 | |
| 92 =head1 AUTHOR - Jason Stajich | |
| 93 | |
| 94 Email jason@bioperl.org | |
| 95 | |
| 96 =head1 CONTRIBUTORS | |
| 97 | |
| 98 Using code and strategy developed by Lincoln Stein (lstein@cshl.org) | |
| 99 in Bio::DB::GFF implementation. | |
| 100 | |
| 101 =head1 APPENDIX | |
| 102 | |
| 103 The rest of the documentation details each of the object methods. | |
| 104 Internal methods are usually preceded with a _ | |
| 105 | |
| 106 =cut | |
| 107 | |
| 108 | |
| 109 # Let the code begin... | |
| 110 | |
| 111 | |
| 112 package Bio::SeqFeature::Collection; | |
| 113 use vars qw(@ISA); | |
| 114 use strict; | |
| 115 | |
| 116 # Object preamble - inherits from Bio::Root::Root | |
| 117 | |
| 118 use Bio::Root::Root; | |
| 119 use Bio::Root::IO; | |
| 120 use Bio::DB::GFF::Util::Binning; | |
| 121 use DB_File; | |
| 122 use Bio::Location::Simple; | |
| 123 | |
| 124 @ISA = qw(Bio::Root::Root ); | |
| 125 | |
| 126 | |
| 127 # This may need to get re-optimized for BDB usage as these | |
| 128 # numbers were derived empirically by Lincoln on a mysql srv | |
| 129 # running on his laptop | |
| 130 | |
| 131 # this is the largest that any reference sequence can be (100 megabases) | |
| 132 use constant MAX_BIN => 100_000_000; | |
| 133 | |
| 134 # this is the smallest bin (1 K) | |
| 135 use constant MIN_BIN => 1_000; | |
| 136 | |
| 137 =head2 new | |
| 138 | |
| 139 Title : new | |
| 140 Usage : my $obj = new Bio::SeqFeature::Collection(); | |
| 141 Function: Builds a new Bio::SeqFeature::Collection object | |
| 142 Returns : Bio::SeqFeature::Collection | |
| 143 Args : | |
| 144 | |
| 145 -minbin minimum value to use for binning | |
| 146 (default is 100,000,000) | |
| 147 -maxbin maximum value to use for binning | |
| 148 (default is 1,000) | |
| 149 -usefile boolean to use a file to store | |
| 150 BTREE rather than an in-memory structure | |
| 151 (default is false or in-memory). | |
| 152 | |
| 153 -features Array ref of features to add initially | |
| 154 | |
| 155 =cut | |
| 156 | |
| 157 sub new { | |
| 158 my($class,@args) = @_; | |
| 159 | |
| 160 my $self = $class->SUPER::new(@args); | |
| 161 my ($maxbin,$minbin,$usefile,$features) = $self->_rearrange([qw(MAXBIN MINBIN | |
| 162 USEFILE | |
| 163 FEATURES)],@args); | |
| 164 | |
| 165 defined $maxbin && $self->max_bin($maxbin); | |
| 166 defined $minbin && $self->min_bin($minbin); | |
| 167 | |
| 168 defined $features && $self->add_features($features); | |
| 169 $DB_BTREE->{'flags'} = R_DUP ; | |
| 170 $DB_BTREE->{'compare'} = \&_compare; | |
| 171 # $DB_BTREE->{'compare'} = \&_comparepack; | |
| 172 $self->{'_btreehash'} = {}; | |
| 173 | |
| 174 my $tmpname = undef; | |
| 175 if( $usefile ) { | |
| 176 $self->{'_io'} = new Bio::Root::IO; | |
| 177 (undef,$tmpname) = $self->{'_io'}->tempfile(); | |
| 178 unlink($tmpname); | |
| 179 $self->debug("tmpfile is $tmpname"); | |
| 180 } | |
| 181 $self->{'_btree'} = tie %{$self->{'_btreehash'}}, | |
| 182 'DB_File', $tmpname, O_RDWR|O_CREAT, 0640, $DB_BTREE; | |
| 183 | |
| 184 # possibly storing/retrieving as floats for speed improvement? | |
| 185 # $self->{'_btree'}->filter_store_key ( sub { $_ = pack ("d", $_) } ); | |
| 186 # $self->{'_btree'}->filter_fetch_key ( sub { $_ = unpack("d", $_) } ); | |
| 187 | |
| 188 $self->{'_features'} = []; | |
| 189 return $self; | |
| 190 } | |
| 191 | |
| 192 | |
| 193 =head2 add_features | |
| 194 | |
| 195 Title : add_features | |
| 196 Usage : $collection->add_features(\@features); | |
| 197 Function: | |
| 198 Returns : number of features added | |
| 199 Args : arrayref of Bio::SeqFeatureI objects to index | |
| 200 | |
| 201 | |
| 202 =cut | |
| 203 | |
| 204 sub add_features{ | |
| 205 my ($self,$feats) = @_; | |
| 206 if( ref($feats) !~ /ARRAY/i ) { | |
| 207 $self->warn("Must provide a valid Array reference to add_features"); | |
| 208 return 0; | |
| 209 } | |
| 210 my $count = 0; | |
| 211 foreach my $f ( @$feats ) { | |
| 212 if( ! $f || ! ref($f) || ! $f->isa('Bio::RangeI') ) { | |
| 213 $self->warn("Must provide valid Bio::RangeI objects to add_features, skipping object '$f'\n"); | |
| 214 next; | |
| 215 } | |
| 216 my $bin = bin($f->start,$f->end,$self->min_bin); | |
| 217 | |
| 218 push @{$self->{'_features'}}, $f; | |
| 219 $self->{'_btreehash'}->{$bin} = $#{$self->{'_features'}}; | |
| 220 $self->debug( "$bin for ". $f->location->to_FTstring(). " matches ".$#{$self->{'_features'}}. "\n"); | |
| 221 $count++; | |
| 222 } | |
| 223 return $count; | |
| 224 } | |
| 225 | |
| 226 | |
| 227 =head2 features_in_range | |
| 228 | |
| 229 Title : features_in_range | |
| 230 Usage : my @features = $collection->features_in_range($range) | |
| 231 Function: Retrieves a list of features which were contained or overlap the | |
| 232 the requested range (see Args for way to specify overlap or | |
| 233 only those containe)d | |
| 234 Returns : List of Bio::SeqFeatureI objects | |
| 235 Args : -range => Bio::RangeI object defining range to search, | |
| 236 OR | |
| 237 -start => start, | |
| 238 -end => end, | |
| 239 -strand => strand | |
| 240 | |
| 241 -contain => boolean - true if feature must be completely | |
| 242 contained with range | |
| 243 OR false if should include features that simply overlap | |
| 244 the range. Default: true. | |
| 245 -strandmatch => 'strong', ranges must have the same strand | |
| 246 'weak', ranges must have the same | |
| 247 strand or no strand | |
| 248 'ignore', ignore strand information | |
| 249 Default. 'ignore'. | |
| 250 | |
| 251 =cut | |
| 252 | |
| 253 sub features_in_range{ | |
| 254 my $self = shift; | |
| 255 my (@args) = @_; | |
| 256 my ($range, $contain, $strandmatch,$start,$end,$strand); | |
| 257 if( @args == 1 ) { | |
| 258 $range = shift @args; | |
| 259 } else { | |
| 260 ($start,$end,$strand,$range, | |
| 261 $contain,$strandmatch) = $self->_rearrange([qw(START END | |
| 262 STRAND | |
| 263 RANGE CONTAIN | |
| 264 STRANDMATCH)], | |
| 265 @args); | |
| 266 $contain = 1 unless defined $contain; | |
| 267 } | |
| 268 $strand = 1 unless defined $strand; | |
| 269 if( $strand !~ /^([\-\+])$/ && | |
| 270 $strand !~ /^[\-\+]?1$/ ) { | |
| 271 $self->warn("must provide a valid numeric or +/- for strand"); | |
| 272 return (); | |
| 273 } | |
| 274 if( defined $1 ) { $strand .= 1; } | |
| 275 | |
| 276 if( !defined $start && !defined $end ) { | |
| 277 if( ! defined $range || !ref($range) || ! $range->isa("Bio::RangeI") ) | |
| 278 { | |
| 279 $self->warn("Must defined a valid Range for the method feature_in_range"); | |
| 280 return (); | |
| 281 } | |
| 282 ($start,$end,$strand) = ($range->start,$range->end,$range->strand); | |
| 283 } | |
| 284 my $r = new Bio::Location::Simple(-start => $start, | |
| 285 -end => $end, | |
| 286 -strand => $strand); | |
| 287 | |
| 288 my @features; | |
| 289 my $maxbin = $self->max_bin; | |
| 290 my $minbin = $self->min_bin; | |
| 291 my $tier = $maxbin; | |
| 292 my ($k,$v,@bins) = ("",undef); | |
| 293 while ($tier >= $minbin) { | |
| 294 my ($tier_start,$tier_stop) = (bin_bot($tier,$start), | |
| 295 bin_top($tier,$end)); | |
| 296 if( $tier_start == $tier_stop ) { | |
| 297 my @vals = $self->{'_btree'}->get_dup($tier_start); | |
| 298 if( scalar @vals > 0 ) { | |
| 299 push @bins, map { $self->{'_features'}->[$_] } @vals; | |
| 300 } | |
| 301 } else { | |
| 302 $k = $tier_start; | |
| 303 my @vals; | |
| 304 for( my $rc = $self->{'_btree'}->seq($k,$v,R_CURSOR); | |
| 305 $rc == 0; | |
| 306 $rc = $self->{'_btree'}->seq($k,$v, R_NEXT) ) { | |
| 307 last if( $k > $tier_stop || $k < $tier_start); | |
| 308 push @vals, $v; | |
| 309 } | |
| 310 foreach my $v ( @vals ) { | |
| 311 if( defined $self->{'_features'}->[$v] ) { | |
| 312 push @bins, $self->{'_features'}->[$v] ; | |
| 313 } else { | |
| 314 | |
| 315 } | |
| 316 | |
| 317 } | |
| 318 } | |
| 319 $tier /= 10; | |
| 320 } | |
| 321 | |
| 322 $strandmatch = 'ignore' unless defined $strandmatch; | |
| 323 return ( $contain ) ? grep { $r->contains($_,$strandmatch) } @bins : | |
| 324 grep { $r->overlaps($_,$strandmatch)} @bins; | |
| 325 } | |
| 326 | |
| 327 =head2 remove_features | |
| 328 | |
| 329 Title : remove_features | |
| 330 Usage : $collection->remove_features(\@array) | |
| 331 Function: Removes the requested sequence features (based on features | |
| 332 which have the same location) | |
| 333 Returns : Number of features removed | |
| 334 Args : Arrayref of Bio::RangeI objects | |
| 335 | |
| 336 | |
| 337 =cut | |
| 338 | |
| 339 sub remove_features{ | |
| 340 my ($self,$feats) = @_; | |
| 341 if( ref($feats) !~ /ARRAY/i ) { | |
| 342 $self->warn("Must provide a valid Array reference to remove_features"); | |
| 343 return 0; | |
| 344 } | |
| 345 my $countprocessed = 0; | |
| 346 foreach my $f ( @$feats ) { | |
| 347 next if ! ref($f) || ! $f->isa('Bio::RangeI'); | |
| 348 my $bin = bin($f->start,$f->end,$self->min_bin); | |
| 349 my @vals = $self->{'_btree'}->get_dup($bin); | |
| 350 my $vcount = scalar @vals; | |
| 351 foreach my $v ( @vals ) { | |
| 352 # eventually this array will become sparse... | |
| 353 if( $self->{'_features'}->[$v] == $f ) { | |
| 354 $self->{'_features'}->[$v] = undef; | |
| 355 $self->{'_btree'}->del_dup($bin,$v); | |
| 356 $vcount--; | |
| 357 } | |
| 358 } | |
| 359 if( $vcount == 0 ) { | |
| 360 $self->{'_btree'}->del($bin); | |
| 361 } | |
| 362 } | |
| 363 } | |
| 364 | |
| 365 =head2 get_all_features | |
| 366 | |
| 367 Title : get_all_features | |
| 368 Usage : my @f = $col->get_all_features() | |
| 369 Function: Return all the features stored in this collection (Could be large) | |
| 370 Returns : Array of Bio::RangeI objects | |
| 371 Args : None | |
| 372 | |
| 373 | |
| 374 =cut | |
| 375 | |
| 376 sub get_all_features{ | |
| 377 my ($self) = @_; | |
| 378 return grep {defined $_} @{ $self->{'_features'} }; | |
| 379 } | |
| 380 | |
| 381 | |
| 382 =head2 min_bin | |
| 383 | |
| 384 Title : min_bin | |
| 385 Usage : my $minbin= $self->min_bin; | |
| 386 Function: Get/Set the minimum value to use for binning | |
| 387 Returns : integer | |
| 388 Args : [optional] minimum bin value | |
| 389 | |
| 390 | |
| 391 =cut | |
| 392 | |
| 393 sub min_bin { | |
| 394 my ($self,$min) = @_; | |
| 395 if( defined $min ) { | |
| 396 $self->{'_min_bin'} = $min; | |
| 397 } | |
| 398 return $self->{'_min_bin'} || MIN_BIN; | |
| 399 } | |
| 400 | |
| 401 =head2 max_bin | |
| 402 | |
| 403 Title : max_bin | |
| 404 Usage : my $maxbin= $self->max_bin; | |
| 405 Function: Get/Set the maximum value to use for binning | |
| 406 Returns : integer | |
| 407 Args : [optional] maximum bin value | |
| 408 | |
| 409 | |
| 410 =cut | |
| 411 | |
| 412 sub max_bin { | |
| 413 my ($self,$max) = @_; | |
| 414 if( defined $max ) { | |
| 415 $self->{'_max_bin'} = $max; | |
| 416 } | |
| 417 return $self->{'max_bin'} || MAX_BIN; | |
| 418 } | |
| 419 | |
| 420 =head2 feature_count | |
| 421 | |
| 422 Title : feature_count | |
| 423 Usage : my $c = $col->feature_count() | |
| 424 Function: Retrieve the total number of features in the collection | |
| 425 Returns : integer | |
| 426 Args : none | |
| 427 | |
| 428 | |
| 429 =cut | |
| 430 | |
| 431 sub feature_count{ | |
| 432 my ($self) = @_; | |
| 433 return scalar ( grep {defined $_} @{ $self->{'_features'} }); | |
| 434 } | |
| 435 | |
| 436 sub _compare{ if( defined $_[0] && ! defined $_[1] ) { return -1 } | |
| 437 elsif ( defined $_[1] && ! defined $_[0] ) { return 1} | |
| 438 $_[0] <=> $_[1]} | |
| 439 | |
| 440 sub _comparepack { unpack("d", $_[0]) <=> unpack("d", $_[1]) ;} | |
| 441 | |
| 442 sub DESTROY { | |
| 443 my $self = shift; | |
| 444 $self->SUPER::DESTROY(); | |
| 445 if( defined $self->{'_io'} ) { | |
| 446 $self->{'_io'}->_io_cleanup(); | |
| 447 $self->{'_io'} = undef; | |
| 448 } | |
| 449 $self->{'_btree'} = undef; | |
| 450 } | |
| 451 | |
| 452 1; |
