Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/SeqFeature/Collection.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqFeature/Collection.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,452 @@ +# $Id: Collection.pm,v 1.9.2.1 2003/02/21 03:07:19 jason Exp $ +# +# BioPerl module for Bio::SeqFeature::Collection +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqFeature::Collection - A container class for SeqFeatures +suitable for performing operations such as finding features within a +range, that match a certain feature type, etc. + +=head1 SYNOPSIS + + use Bio::SeqFeature::Collection; + use Bio::Location::Simple; + use Bio::Tools::GFF; + use Bio::Root::IO; + # let's first input some features + my $gffio = Bio::Tools::GFF->new(-file => Bio::Root::IO->catfile + ("t","data","myco_sites.gff"), + -gff_version => 2); + my @features = (); + # loop over the input stream + while(my $feature = $gffio->next_feature()) { + # do something with feature + push @features, $feature; + } + $gffio->close(); + # build the Collection object + my $col = new Bio::SeqFeature::Collection(); + # add these features to the object + my $totaladded = $col->add_features(\@features); + + my @subset = $col->features_in_range(-start => 1, + -end => 25000, + -strand => 1, + -contain => 0); + # subset should have 18 entries for this dataset + print "size is ", scalar @subset, "\n"; + @subset = $col->features_in_range(-range => Bio::Location::Simple->new + (-start => 70000, + -end => 150000, + -strand => -1), + -contain => 1, + -strandmatch => 'strong'); + + # subset should have 22 entries for this dataset + print "size is ", scalar @subset, "\n"; + print "total number of features in collection is ", + $col->feature_count(),"\n"; + +=head1 DESCRIPTION + +This object will efficiently allow one for query subsets of ranges +within a large collection of sequence features (in fact the objects +just have to be Bio::RangeI compliant). This is done by the creation +of bins which are stored in order in a B-Tree data structure as +provided by the DB_File interface to the Berkeley DB. + +This is based on work done by Lincoln for storage in a mysql instance +- this is intended to be an embedded in-memory implementation for +easily quering for subsets of a large range set. All features are +held in memory even if the -usefile flag is provided. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 CONTRIBUTORS + +Using code and strategy developed by Lincoln Stein (lstein@cshl.org) +in Bio::DB::GFF implementation. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SeqFeature::Collection; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::DB::GFF::Util::Binning; +use DB_File; +use Bio::Location::Simple; + +@ISA = qw(Bio::Root::Root ); + + +# This may need to get re-optimized for BDB usage as these +# numbers were derived empirically by Lincoln on a mysql srv +# running on his laptop + +# this is the largest that any reference sequence can be (100 megabases) +use constant MAX_BIN => 100_000_000; + +# this is the smallest bin (1 K) +use constant MIN_BIN => 1_000; + +=head2 new + + Title : new + Usage : my $obj = new Bio::SeqFeature::Collection(); + Function: Builds a new Bio::SeqFeature::Collection object + Returns : Bio::SeqFeature::Collection + Args : + + -minbin minimum value to use for binning + (default is 100,000,000) + -maxbin maximum value to use for binning + (default is 1,000) + -usefile boolean to use a file to store + BTREE rather than an in-memory structure + (default is false or in-memory). + + -features Array ref of features to add initially + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + my ($maxbin,$minbin,$usefile,$features) = $self->_rearrange([qw(MAXBIN MINBIN + USEFILE + FEATURES)],@args); + + defined $maxbin && $self->max_bin($maxbin); + defined $minbin && $self->min_bin($minbin); + + defined $features && $self->add_features($features); + $DB_BTREE->{'flags'} = R_DUP ; + $DB_BTREE->{'compare'} = \&_compare; +# $DB_BTREE->{'compare'} = \&_comparepack; + $self->{'_btreehash'} = {}; + + my $tmpname = undef; + if( $usefile ) { + $self->{'_io'} = new Bio::Root::IO; + (undef,$tmpname) = $self->{'_io'}->tempfile(); + unlink($tmpname); + $self->debug("tmpfile is $tmpname"); + } + $self->{'_btree'} = tie %{$self->{'_btreehash'}}, + 'DB_File', $tmpname, O_RDWR|O_CREAT, 0640, $DB_BTREE; + +# possibly storing/retrieving as floats for speed improvement? +# $self->{'_btree'}->filter_store_key ( sub { $_ = pack ("d", $_) } ); +# $self->{'_btree'}->filter_fetch_key ( sub { $_ = unpack("d", $_) } ); + + $self->{'_features'} = []; + return $self; +} + + +=head2 add_features + + Title : add_features + Usage : $collection->add_features(\@features); + Function: + Returns : number of features added + Args : arrayref of Bio::SeqFeatureI objects to index + + +=cut + +sub add_features{ + my ($self,$feats) = @_; + if( ref($feats) !~ /ARRAY/i ) { + $self->warn("Must provide a valid Array reference to add_features"); + return 0; + } + my $count = 0; + foreach my $f ( @$feats ) { + if( ! $f || ! ref($f) || ! $f->isa('Bio::RangeI') ) { + $self->warn("Must provide valid Bio::RangeI objects to add_features, skipping object '$f'\n"); + next; + } + my $bin = bin($f->start,$f->end,$self->min_bin); + + push @{$self->{'_features'}}, $f; + $self->{'_btreehash'}->{$bin} = $#{$self->{'_features'}}; + $self->debug( "$bin for ". $f->location->to_FTstring(). " matches ".$#{$self->{'_features'}}. "\n"); + $count++; + } + return $count; +} + + +=head2 features_in_range + + Title : features_in_range + Usage : my @features = $collection->features_in_range($range) + Function: Retrieves a list of features which were contained or overlap the + the requested range (see Args for way to specify overlap or + only those containe)d + Returns : List of Bio::SeqFeatureI objects + Args : -range => Bio::RangeI object defining range to search, + OR + -start => start, + -end => end, + -strand => strand + + -contain => boolean - true if feature must be completely + contained with range + OR false if should include features that simply overlap + the range. Default: true. + -strandmatch => 'strong', ranges must have the same strand + 'weak', ranges must have the same + strand or no strand + 'ignore', ignore strand information + Default. 'ignore'. + +=cut + +sub features_in_range{ + my $self = shift; + my (@args) = @_; + my ($range, $contain, $strandmatch,$start,$end,$strand); + if( @args == 1 ) { + $range = shift @args; + } else { + ($start,$end,$strand,$range, + $contain,$strandmatch) = $self->_rearrange([qw(START END + STRAND + RANGE CONTAIN + STRANDMATCH)], + @args); + $contain = 1 unless defined $contain; + } + $strand = 1 unless defined $strand; + if( $strand !~ /^([\-\+])$/ && + $strand !~ /^[\-\+]?1$/ ) { + $self->warn("must provide a valid numeric or +/- for strand"); + return (); + } + if( defined $1 ) { $strand .= 1; } + + if( !defined $start && !defined $end ) { + if( ! defined $range || !ref($range) || ! $range->isa("Bio::RangeI") ) + { + $self->warn("Must defined a valid Range for the method feature_in_range"); + return (); + } + ($start,$end,$strand) = ($range->start,$range->end,$range->strand); + } + my $r = new Bio::Location::Simple(-start => $start, + -end => $end, + -strand => $strand); + + my @features; + my $maxbin = $self->max_bin; + my $minbin = $self->min_bin; + my $tier = $maxbin; + my ($k,$v,@bins) = ("",undef); + while ($tier >= $minbin) { + my ($tier_start,$tier_stop) = (bin_bot($tier,$start), + bin_top($tier,$end)); + if( $tier_start == $tier_stop ) { + my @vals = $self->{'_btree'}->get_dup($tier_start); + if( scalar @vals > 0 ) { + push @bins, map { $self->{'_features'}->[$_] } @vals; + } + } else { + $k = $tier_start; + my @vals; + for( my $rc = $self->{'_btree'}->seq($k,$v,R_CURSOR); + $rc == 0; + $rc = $self->{'_btree'}->seq($k,$v, R_NEXT) ) { + last if( $k > $tier_stop || $k < $tier_start); + push @vals, $v; + } + foreach my $v ( @vals ) { + if( defined $self->{'_features'}->[$v] ) { + push @bins, $self->{'_features'}->[$v] ; + } else { + + } + + } + } + $tier /= 10; + } + + $strandmatch = 'ignore' unless defined $strandmatch; + return ( $contain ) ? grep { $r->contains($_,$strandmatch) } @bins : + grep { $r->overlaps($_,$strandmatch)} @bins; +} + +=head2 remove_features + + Title : remove_features + Usage : $collection->remove_features(\@array) + Function: Removes the requested sequence features (based on features + which have the same location) + Returns : Number of features removed + Args : Arrayref of Bio::RangeI objects + + +=cut + +sub remove_features{ + my ($self,$feats) = @_; + if( ref($feats) !~ /ARRAY/i ) { + $self->warn("Must provide a valid Array reference to remove_features"); + return 0; + } + my $countprocessed = 0; + foreach my $f ( @$feats ) { + next if ! ref($f) || ! $f->isa('Bio::RangeI'); + my $bin = bin($f->start,$f->end,$self->min_bin); + my @vals = $self->{'_btree'}->get_dup($bin); + my $vcount = scalar @vals; + foreach my $v ( @vals ) { + # eventually this array will become sparse... + if( $self->{'_features'}->[$v] == $f ) { + $self->{'_features'}->[$v] = undef; + $self->{'_btree'}->del_dup($bin,$v); + $vcount--; + } + } + if( $vcount == 0 ) { + $self->{'_btree'}->del($bin); + } + } +} + +=head2 get_all_features + + Title : get_all_features + Usage : my @f = $col->get_all_features() + Function: Return all the features stored in this collection (Could be large) + Returns : Array of Bio::RangeI objects + Args : None + + +=cut + +sub get_all_features{ + my ($self) = @_; + return grep {defined $_} @{ $self->{'_features'} }; +} + + +=head2 min_bin + + Title : min_bin + Usage : my $minbin= $self->min_bin; + Function: Get/Set the minimum value to use for binning + Returns : integer + Args : [optional] minimum bin value + + +=cut + +sub min_bin { + my ($self,$min) = @_; + if( defined $min ) { + $self->{'_min_bin'} = $min; + } + return $self->{'_min_bin'} || MIN_BIN; +} + +=head2 max_bin + + Title : max_bin + Usage : my $maxbin= $self->max_bin; + Function: Get/Set the maximum value to use for binning + Returns : integer + Args : [optional] maximum bin value + + +=cut + +sub max_bin { + my ($self,$max) = @_; + if( defined $max ) { + $self->{'_max_bin'} = $max; + } + return $self->{'max_bin'} || MAX_BIN; +} + +=head2 feature_count + + Title : feature_count + Usage : my $c = $col->feature_count() + Function: Retrieve the total number of features in the collection + Returns : integer + Args : none + + +=cut + +sub feature_count{ + my ($self) = @_; + return scalar ( grep {defined $_} @{ $self->{'_features'} }); +} + +sub _compare{ if( defined $_[0] && ! defined $_[1] ) { return -1 } + elsif ( defined $_[1] && ! defined $_[0] ) { return 1} + $_[0] <=> $_[1]} + +sub _comparepack { unpack("d", $_[0]) <=> unpack("d", $_[1]) ;} + +sub DESTROY { + my $self = shift; + $self->SUPER::DESTROY(); + if( defined $self->{'_io'} ) { + $self->{'_io'}->_io_cleanup(); + $self->{'_io'} = undef; + } + $self->{'_btree'} = undef; +} + +1;