annotate variant_effect_predictor/Bio/SeqFeature/Collection.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1 # $Id: Collection.pm,v 1.9.2.1 2003/02/21 03:07:19 jason Exp $
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 # BioPerl module for Bio::SeqFeature::Collection
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5 # Cared for by Jason Stajich <jason@bioperl.org>
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 # Copyright Jason Stajich
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 # You may distribute this module under the same terms as perl itself
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 # POD documentation - main docs before the code
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15 Bio::SeqFeature::Collection - A container class for SeqFeatures
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16 suitable for performing operations such as finding features within a
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17 range, that match a certain feature type, etc.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19 =head1 SYNOPSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 use Bio::SeqFeature::Collection;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22 use Bio::Location::Simple;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 use Bio::Tools::GFF;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24 use Bio::Root::IO;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25 # let's first input some features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26 my $gffio = Bio::Tools::GFF->new(-file => Bio::Root::IO->catfile
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27 ("t","data","myco_sites.gff"),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28 -gff_version => 2);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29 my @features = ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30 # loop over the input stream
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31 while(my $feature = $gffio->next_feature()) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32 # do something with feature
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33 push @features, $feature;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35 $gffio->close();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 # build the Collection object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 my $col = new Bio::SeqFeature::Collection();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38 # add these features to the object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 my $totaladded = $col->add_features(\@features);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41 my @subset = $col->features_in_range(-start => 1,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42 -end => 25000,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43 -strand => 1,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44 -contain => 0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45 # subset should have 18 entries for this dataset
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46 print "size is ", scalar @subset, "\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47 @subset = $col->features_in_range(-range => Bio::Location::Simple->new
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48 (-start => 70000,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49 -end => 150000,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 -strand => -1),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51 -contain => 1,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 -strandmatch => 'strong');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54 # subset should have 22 entries for this dataset
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55 print "size is ", scalar @subset, "\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56 print "total number of features in collection is ",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57 $col->feature_count(),"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59 =head1 DESCRIPTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61 This object will efficiently allow one for query subsets of ranges
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62 within a large collection of sequence features (in fact the objects
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63 just have to be Bio::RangeI compliant). This is done by the creation
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64 of bins which are stored in order in a B-Tree data structure as
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65 provided by the DB_File interface to the Berkeley DB.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67 This is based on work done by Lincoln for storage in a mysql instance
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68 - this is intended to be an embedded in-memory implementation for
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69 easily quering for subsets of a large range set. All features are
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70 held in memory even if the -usefile flag is provided.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72 =head1 FEEDBACK
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74 =head2 Mailing Lists
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76 User feedback is an integral part of the evolution of this and other
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77 Bioperl modules. Send your comments and suggestions preferably to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78 the Bioperl mailing list. Your participation is much appreciated.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80 bioperl-l@bioperl.org - General discussion
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81 http://bioperl.org/MailList.shtml - About the mailing lists
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
82
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
83 =head2 Reporting Bugs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
85 Report bugs to the Bioperl bug tracking system to help us keep track
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
86 of the bugs and their resolution. Bug reports can be submitted via
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
87 email or the web:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
88
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
89 bioperl-bugs@bioperl.org
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
90 http://bugzilla.bioperl.org/
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
91
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
92 =head1 AUTHOR - Jason Stajich
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
93
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94 Email jason@bioperl.org
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96 =head1 CONTRIBUTORS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98 Using code and strategy developed by Lincoln Stein (lstein@cshl.org)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99 in Bio::DB::GFF implementation.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 =head1 APPENDIX
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103 The rest of the documentation details each of the object methods.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104 Internal methods are usually preceded with a _
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
105
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
106 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
107
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
108
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
109 # Let the code begin...
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
110
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
111
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
112 package Bio::SeqFeature::Collection;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
113 use vars qw(@ISA);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
114 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
115
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
116 # Object preamble - inherits from Bio::Root::Root
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
117
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
118 use Bio::Root::Root;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
119 use Bio::Root::IO;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
120 use Bio::DB::GFF::Util::Binning;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
121 use DB_File;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
122 use Bio::Location::Simple;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
123
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
124 @ISA = qw(Bio::Root::Root );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
125
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
126
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
127 # This may need to get re-optimized for BDB usage as these
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
128 # numbers were derived empirically by Lincoln on a mysql srv
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
129 # running on his laptop
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
130
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
131 # this is the largest that any reference sequence can be (100 megabases)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
132 use constant MAX_BIN => 100_000_000;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
133
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
134 # this is the smallest bin (1 K)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135 use constant MIN_BIN => 1_000;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137 =head2 new
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139 Title : new
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140 Usage : my $obj = new Bio::SeqFeature::Collection();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141 Function: Builds a new Bio::SeqFeature::Collection object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142 Returns : Bio::SeqFeature::Collection
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143 Args :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 -minbin minimum value to use for binning
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146 (default is 100,000,000)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
147 -maxbin maximum value to use for binning
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
148 (default is 1,000)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
149 -usefile boolean to use a file to store
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
150 BTREE rather than an in-memory structure
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
151 (default is false or in-memory).
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
152
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
153 -features Array ref of features to add initially
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
154
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
155 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
156
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
157 sub new {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
158 my($class,@args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
159
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
160 my $self = $class->SUPER::new(@args);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
161 my ($maxbin,$minbin,$usefile,$features) = $self->_rearrange([qw(MAXBIN MINBIN
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
162 USEFILE
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
163 FEATURES)],@args);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
164
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
165 defined $maxbin && $self->max_bin($maxbin);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
166 defined $minbin && $self->min_bin($minbin);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
167
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
168 defined $features && $self->add_features($features);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
169 $DB_BTREE->{'flags'} = R_DUP ;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
170 $DB_BTREE->{'compare'} = \&_compare;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
171 # $DB_BTREE->{'compare'} = \&_comparepack;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
172 $self->{'_btreehash'} = {};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
173
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
174 my $tmpname = undef;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
175 if( $usefile ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
176 $self->{'_io'} = new Bio::Root::IO;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
177 (undef,$tmpname) = $self->{'_io'}->tempfile();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
178 unlink($tmpname);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
179 $self->debug("tmpfile is $tmpname");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
180 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
181 $self->{'_btree'} = tie %{$self->{'_btreehash'}},
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
182 'DB_File', $tmpname, O_RDWR|O_CREAT, 0640, $DB_BTREE;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
183
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
184 # possibly storing/retrieving as floats for speed improvement?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
185 # $self->{'_btree'}->filter_store_key ( sub { $_ = pack ("d", $_) } );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
186 # $self->{'_btree'}->filter_fetch_key ( sub { $_ = unpack("d", $_) } );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
187
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
188 $self->{'_features'} = [];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
189 return $self;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
190 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
191
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
192
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
193 =head2 add_features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
194
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
195 Title : add_features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
196 Usage : $collection->add_features(\@features);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
197 Function:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
198 Returns : number of features added
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
199 Args : arrayref of Bio::SeqFeatureI objects to index
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
200
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
201
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
202 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
203
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
204 sub add_features{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
205 my ($self,$feats) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
206 if( ref($feats) !~ /ARRAY/i ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
207 $self->warn("Must provide a valid Array reference to add_features");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
208 return 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
209 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
210 my $count = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
211 foreach my $f ( @$feats ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
212 if( ! $f || ! ref($f) || ! $f->isa('Bio::RangeI') ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
213 $self->warn("Must provide valid Bio::RangeI objects to add_features, skipping object '$f'\n");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
214 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
215 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
216 my $bin = bin($f->start,$f->end,$self->min_bin);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
217
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
218 push @{$self->{'_features'}}, $f;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
219 $self->{'_btreehash'}->{$bin} = $#{$self->{'_features'}};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
220 $self->debug( "$bin for ". $f->location->to_FTstring(). " matches ".$#{$self->{'_features'}}. "\n");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
221 $count++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
222 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
223 return $count;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
224 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
225
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
226
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
227 =head2 features_in_range
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
228
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
229 Title : features_in_range
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
230 Usage : my @features = $collection->features_in_range($range)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
231 Function: Retrieves a list of features which were contained or overlap the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
232 the requested range (see Args for way to specify overlap or
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
233 only those containe)d
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
234 Returns : List of Bio::SeqFeatureI objects
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
235 Args : -range => Bio::RangeI object defining range to search,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
236 OR
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
237 -start => start,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
238 -end => end,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
239 -strand => strand
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
240
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
241 -contain => boolean - true if feature must be completely
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
242 contained with range
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
243 OR false if should include features that simply overlap
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
244 the range. Default: true.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
245 -strandmatch => 'strong', ranges must have the same strand
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
246 'weak', ranges must have the same
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
247 strand or no strand
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
248 'ignore', ignore strand information
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
249 Default. 'ignore'.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
250
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
251 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
252
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
253 sub features_in_range{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
254 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
255 my (@args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
256 my ($range, $contain, $strandmatch,$start,$end,$strand);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
257 if( @args == 1 ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
258 $range = shift @args;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
259 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
260 ($start,$end,$strand,$range,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
261 $contain,$strandmatch) = $self->_rearrange([qw(START END
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
262 STRAND
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
263 RANGE CONTAIN
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
264 STRANDMATCH)],
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
265 @args);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
266 $contain = 1 unless defined $contain;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
267 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
268 $strand = 1 unless defined $strand;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
269 if( $strand !~ /^([\-\+])$/ &&
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
270 $strand !~ /^[\-\+]?1$/ ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
271 $self->warn("must provide a valid numeric or +/- for strand");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
272 return ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
273 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
274 if( defined $1 ) { $strand .= 1; }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
275
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
276 if( !defined $start && !defined $end ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
277 if( ! defined $range || !ref($range) || ! $range->isa("Bio::RangeI") )
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
278 {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
279 $self->warn("Must defined a valid Range for the method feature_in_range");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
280 return ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
281 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
282 ($start,$end,$strand) = ($range->start,$range->end,$range->strand);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
283 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
284 my $r = new Bio::Location::Simple(-start => $start,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
285 -end => $end,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
286 -strand => $strand);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
287
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
288 my @features;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
289 my $maxbin = $self->max_bin;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
290 my $minbin = $self->min_bin;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
291 my $tier = $maxbin;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
292 my ($k,$v,@bins) = ("",undef);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
293 while ($tier >= $minbin) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
294 my ($tier_start,$tier_stop) = (bin_bot($tier,$start),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
295 bin_top($tier,$end));
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
296 if( $tier_start == $tier_stop ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
297 my @vals = $self->{'_btree'}->get_dup($tier_start);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
298 if( scalar @vals > 0 ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
299 push @bins, map { $self->{'_features'}->[$_] } @vals;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
300 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
301 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
302 $k = $tier_start;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
303 my @vals;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
304 for( my $rc = $self->{'_btree'}->seq($k,$v,R_CURSOR);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
305 $rc == 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
306 $rc = $self->{'_btree'}->seq($k,$v, R_NEXT) ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
307 last if( $k > $tier_stop || $k < $tier_start);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
308 push @vals, $v;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
309 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
310 foreach my $v ( @vals ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
311 if( defined $self->{'_features'}->[$v] ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
312 push @bins, $self->{'_features'}->[$v] ;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
313 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
314
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
315 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
316
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
317 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
318 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
319 $tier /= 10;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
320 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
321
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
322 $strandmatch = 'ignore' unless defined $strandmatch;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
323 return ( $contain ) ? grep { $r->contains($_,$strandmatch) } @bins :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
324 grep { $r->overlaps($_,$strandmatch)} @bins;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
325 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
326
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
327 =head2 remove_features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
328
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
329 Title : remove_features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
330 Usage : $collection->remove_features(\@array)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
331 Function: Removes the requested sequence features (based on features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
332 which have the same location)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
333 Returns : Number of features removed
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
334 Args : Arrayref of Bio::RangeI objects
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
335
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
336
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
337 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
338
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
339 sub remove_features{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
340 my ($self,$feats) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
341 if( ref($feats) !~ /ARRAY/i ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
342 $self->warn("Must provide a valid Array reference to remove_features");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
343 return 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
344 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
345 my $countprocessed = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
346 foreach my $f ( @$feats ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
347 next if ! ref($f) || ! $f->isa('Bio::RangeI');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
348 my $bin = bin($f->start,$f->end,$self->min_bin);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
349 my @vals = $self->{'_btree'}->get_dup($bin);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
350 my $vcount = scalar @vals;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
351 foreach my $v ( @vals ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
352 # eventually this array will become sparse...
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
353 if( $self->{'_features'}->[$v] == $f ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
354 $self->{'_features'}->[$v] = undef;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
355 $self->{'_btree'}->del_dup($bin,$v);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
356 $vcount--;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
357 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
358 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
359 if( $vcount == 0 ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
360 $self->{'_btree'}->del($bin);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
361 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
362 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
363 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
364
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
365 =head2 get_all_features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
366
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
367 Title : get_all_features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
368 Usage : my @f = $col->get_all_features()
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
369 Function: Return all the features stored in this collection (Could be large)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
370 Returns : Array of Bio::RangeI objects
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
371 Args : None
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
372
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
373
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
374 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
375
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
376 sub get_all_features{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
377 my ($self) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
378 return grep {defined $_} @{ $self->{'_features'} };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
379 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
380
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
381
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
382 =head2 min_bin
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
383
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
384 Title : min_bin
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
385 Usage : my $minbin= $self->min_bin;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
386 Function: Get/Set the minimum value to use for binning
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
387 Returns : integer
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
388 Args : [optional] minimum bin value
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
389
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
390
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
391 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
392
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
393 sub min_bin {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
394 my ($self,$min) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
395 if( defined $min ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
396 $self->{'_min_bin'} = $min;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
397 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
398 return $self->{'_min_bin'} || MIN_BIN;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
399 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
400
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
401 =head2 max_bin
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
402
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
403 Title : max_bin
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
404 Usage : my $maxbin= $self->max_bin;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
405 Function: Get/Set the maximum value to use for binning
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
406 Returns : integer
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
407 Args : [optional] maximum bin value
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
408
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
409
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
410 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
411
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
412 sub max_bin {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
413 my ($self,$max) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
414 if( defined $max ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
415 $self->{'_max_bin'} = $max;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
416 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
417 return $self->{'max_bin'} || MAX_BIN;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
418 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
419
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
420 =head2 feature_count
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
421
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
422 Title : feature_count
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
423 Usage : my $c = $col->feature_count()
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
424 Function: Retrieve the total number of features in the collection
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
425 Returns : integer
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
426 Args : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
427
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
428
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
429 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
430
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
431 sub feature_count{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
432 my ($self) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
433 return scalar ( grep {defined $_} @{ $self->{'_features'} });
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
434 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
435
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
436 sub _compare{ if( defined $_[0] && ! defined $_[1] ) { return -1 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
437 elsif ( defined $_[1] && ! defined $_[0] ) { return 1}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
438 $_[0] <=> $_[1]}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
439
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
440 sub _comparepack { unpack("d", $_[0]) <=> unpack("d", $_[1]) ;}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
441
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
442 sub DESTROY {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
443 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
444 $self->SUPER::DESTROY();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
445 if( defined $self->{'_io'} ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
446 $self->{'_io'}->_io_cleanup();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
447 $self->{'_io'} = undef;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
448 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
449 $self->{'_btree'} = undef;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
450 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
451
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
452 1;