0
|
1 #
|
|
2 # $Id: BDB.pm,v 1.6.2.1 2003/03/25 18:46:10 jason Exp $
|
|
3 #
|
|
4 # BioPerl module for Bio::DB::Flat::BDB
|
|
5 #
|
|
6 # Cared for by Lincoln Stein <lstein@cshl.org>
|
|
7 #
|
|
8 # You may distribute this module under the same terms as perl itself
|
|
9
|
|
10 # POD documentation - main docs before the code
|
|
11
|
|
12 =head1 NAME
|
|
13
|
|
14 Bio::DB::Flat::BDB - Interface for BioHackathon standard BDB-indexed flat file
|
|
15
|
|
16 =head1 SYNOPSIS
|
|
17
|
|
18 You should not be using this module directly. See Bio::DB::Flat.
|
|
19
|
|
20 =head1 DESCRIPTION
|
|
21
|
|
22 This object provides the basic mechanism to associate positions in
|
|
23 files with primary and secondary name spaces. Unlike
|
|
24 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
|
|
25 to work with the BerkeleyDB-indexed "common" flat file format worked
|
|
26 out at the 2002 BioHackathon.
|
|
27
|
|
28 This object is the guts to the mechanism, which will be used by the
|
|
29 specific objects inheriting from it.
|
|
30
|
|
31 =head1 FEEDBACK
|
|
32
|
|
33 =head2 Mailing Lists
|
|
34
|
|
35 User feedback is an integral part of the evolution of this and other
|
|
36 Bioperl modules. Send your comments and suggestions preferably to one
|
|
37 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
38
|
|
39 bioperl-l@bioperl.org - General discussion
|
|
40 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
41
|
|
42 =head2 Reporting Bugs
|
|
43
|
|
44 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
45 the bugs and their resolution. Bug reports can be submitted via
|
|
46 email or the web:
|
|
47
|
|
48 http://bugzilla.bioperl.org/
|
|
49
|
|
50 =head1 AUTHOR - Lincoln Stein
|
|
51
|
|
52 Email - lstein@cshl.org
|
|
53
|
|
54 =head1 SEE ALSO
|
|
55
|
|
56 L<Bio::DB::Flat>,
|
|
57
|
|
58 =head1 APPENDIX
|
|
59
|
|
60 The rest of the documentation details each of the object methods. Internal
|
|
61 methods are usually preceded with an "_" (underscore).
|
|
62
|
|
63 =cut
|
|
64
|
|
65
|
|
66 # Let the code begin...
|
|
67
|
|
68 package Bio::DB::Flat::BDB;
|
|
69
|
|
70 use strict;
|
|
71 use DB_File;
|
|
72 use IO::File;
|
|
73 use Fcntl qw(O_CREAT O_RDWR O_RDONLY);
|
|
74 use File::Spec;
|
|
75 use Bio::DB::Flat;
|
|
76 use Bio::SeqIO;
|
|
77 use Bio::DB::RandomAccessI;
|
|
78 use Bio::Root::Root;
|
|
79 use Bio::Root::IO;
|
|
80 use vars '@ISA';
|
|
81
|
|
82 @ISA = qw(Bio::DB::Flat);
|
|
83
|
|
84 sub _initialize {
|
|
85 my $self = shift;
|
|
86 my ($max_open) = $self->_rearrange(['MAXOPEN'],@_);
|
|
87 $self->{bdb_maxopen} = $max_open || 32;
|
|
88 }
|
|
89
|
|
90 # return a filehandle seeked to the appropriate place
|
|
91 # this only works with the primary namespace
|
|
92 sub _get_stream {
|
|
93 my ($self,$id) = @_;
|
|
94 my ($filepath,$offset,$length) = $self->_lookup_primary($id)
|
|
95 or $self->throw("Unable to find a record for $id in the flat file index");
|
|
96 my $fh = $self->_fhcache($filepath)
|
|
97 or $self->throw("couldn't open $filepath: $!");
|
|
98 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
|
|
99 $fh;
|
|
100 }
|
|
101
|
|
102 # return records corresponding to the indicated index
|
|
103 # if there are multiple hits will return a list in list context,
|
|
104 # otherwise will throw an exception
|
|
105 sub fetch_raw {
|
|
106 my ($self,$id,$namespace) = @_;
|
|
107
|
|
108 # secondary lookup
|
|
109 if (defined $namespace && $namespace ne $self->primary_namespace) {
|
|
110 my @hits = $self->_lookup_secondary($namespace,$id);
|
|
111 $self->throw("Multiple records correspond to $namespace=>$id but function called in a scalar context")
|
|
112 unless wantarray;
|
|
113 return map {$self->_read_record(@$_)} @hits;
|
|
114 }
|
|
115
|
|
116 # primary lookup
|
|
117 my @args = $self->_lookup_primary($id)
|
|
118 or $self->throw("Unable to find a record for $id in the flat file index");
|
|
119 return $self->_read_record(@args);
|
|
120 }
|
|
121
|
|
122 # create real live Bio::Seq object
|
|
123 sub get_Seq_by_id {
|
|
124 my $self = shift;
|
|
125 my $id = shift;
|
|
126 my $fh = eval {$self->_get_stream($id)} or return;
|
|
127 my $seqio =
|
|
128 $self->{bdb_cached_parsers}{fileno $fh} ||= Bio::SeqIO->new( -Format => $self->file_format,
|
|
129 -fh => $fh);
|
|
130 return $seqio->next_seq;
|
|
131 }
|
|
132
|
|
133 # fetch array of Bio::Seq objects
|
|
134 sub get_Seq_by_acc {
|
|
135 my $self = shift;
|
|
136 unshift @_,'ACC' if @_==1;
|
|
137 my ($ns,$key) = @_;
|
|
138 my @primary_ids = $self->expand_ids($ns => $key);
|
|
139 $self->throw("more than one sequences correspond to this accession")
|
|
140 if @primary_ids > 1 && ! wantarray;
|
|
141 my @rc = map {$self->get_Seq_by_id($_)} @primary_ids;
|
|
142 return wantarray ? @rc : $rc[0];
|
|
143 }
|
|
144
|
|
145 # fetch array of Bio::Seq objects
|
|
146 sub get_Seq_by_version {
|
|
147 my $self = shift;
|
|
148 unshift @_,'VERSION' if @_==1;
|
|
149 my ($ns,$key) = @_;
|
|
150 my @primary_ids = $self->expand_ids($ns => $key);
|
|
151 $self->throw("more than one sequences correspond to this accession")
|
|
152 if @primary_ids > 1 && !wantarray;
|
|
153 return map {$self->get_Seq_by_id($_)} @primary_ids;
|
|
154 }
|
|
155
|
|
156 =head2 get_PrimarySeq_stream
|
|
157
|
|
158 Title : get_PrimarySeq_stream
|
|
159 Usage : $stream = get_PrimarySeq_stream
|
|
160 Function: Makes a Bio::DB::SeqStreamI compliant object
|
|
161 which provides a single method, next_primary_seq
|
|
162 Returns : Bio::DB::SeqStreamI
|
|
163 Args : none
|
|
164
|
|
165
|
|
166 =cut
|
|
167
|
|
168 sub get_PrimarySeq_stream {
|
|
169 my $self = shift;
|
|
170 my @files = $self->files || 0;
|
|
171 my $out = Bio::SeqIO::MultiFile->new( -format => $self->file_format ,
|
|
172 -files => \@files);
|
|
173 return $out;
|
|
174 }
|
|
175
|
|
176 sub get_all_primary_ids {
|
|
177 my $self = shift;
|
|
178 my $db = $self->primary_db;
|
|
179 return keys %$db;
|
|
180 }
|
|
181
|
|
182 =head2 get_all_primary_ids
|
|
183
|
|
184 Title : get_all_primary_ids
|
|
185 Usage : @ids = $seqdb->get_all_primary_ids()
|
|
186 Function: gives an array of all the primary_ids of the
|
|
187 sequence objects in the database.
|
|
188 Example :
|
|
189 Returns : an array of strings
|
|
190 Args : none
|
|
191
|
|
192 =cut
|
|
193
|
|
194 # this will perform an ID lookup on a (possibly secondary)
|
|
195 # id, returning all the corresponding ids
|
|
196 sub expand_ids {
|
|
197 my $self = shift;
|
|
198 my ($ns,$key) = @_;
|
|
199 return $key unless defined $ns;
|
|
200 return $key if $ns eq $self->primary_namespace;
|
|
201 my $db = $self->secondary_db($ns)
|
|
202 or $self->throw("invalid secondary namespace $ns");
|
|
203 my $record = $db->{$key} or return; # nothing there
|
|
204 return $self->unpack_secondary($record);
|
|
205 }
|
|
206
|
|
207 # build index from files listed
|
|
208 sub build_index {
|
|
209 my $self = shift;
|
|
210 my @files = @_;
|
|
211 my $count = 0;
|
|
212 for my $file (@files) {
|
|
213 $file = File::Spec->rel2abs($file)
|
|
214 unless File::Spec->file_name_is_absolute($file);
|
|
215 $count += $self->_index_file($file);
|
|
216 }
|
|
217 $self->write_config;
|
|
218 $count;
|
|
219 }
|
|
220
|
|
221 sub _index_file {
|
|
222 my $self = shift;
|
|
223 my $file = shift;
|
|
224
|
|
225 my $fileno = $self->_path2fileno($file);
|
|
226 defined $fileno or $self->throw("could not create a file number for $file");
|
|
227
|
|
228 my $fh = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!");
|
|
229 my $offset = 0;
|
|
230 my $count = 0;
|
|
231 while (!eof($fh)) {
|
|
232 my ($ids,$adjustment) = $self->parse_one_record($fh) or next;
|
|
233 $adjustment ||= 0; # prevent uninit variable warning
|
|
234 my $pos = tell($fh) + $adjustment;
|
|
235 $self->_store_index($ids,$file,$offset,$pos-$offset);
|
|
236 $offset = $pos;
|
|
237 $count++;
|
|
238 }
|
|
239 $count;
|
|
240 }
|
|
241
|
|
242 =head2 To Be Implemented in Subclasses
|
|
243
|
|
244 The following methods MUST be implemented by subclasses.
|
|
245
|
|
246 =cut
|
|
247
|
|
248 =head2 May Be Overridden in Subclasses
|
|
249
|
|
250 The following methods MAY be overridden by subclasses.
|
|
251
|
|
252 =cut
|
|
253
|
|
254 sub default_primary_namespace {
|
|
255 return "ACC";
|
|
256 }
|
|
257
|
|
258 sub default_secondary_namespaces {
|
|
259 return;
|
|
260 }
|
|
261
|
|
262 sub _read_record {
|
|
263 my $self = shift;
|
|
264 my ($filepath,$offset,$length) = @_;
|
|
265 my $fh = $self->_fhcache($filepath)
|
|
266 or $self->throw("couldn't open $filepath: $!");
|
|
267 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
|
|
268 my $record;
|
|
269 read($fh,$record,$length) or $self->throw("can't read $filepath: $!");
|
|
270 $record
|
|
271 }
|
|
272
|
|
273 # return a list in the form ($filepath,$offset,$length)
|
|
274 sub _lookup_primary {
|
|
275 my $self = shift;
|
|
276 my $primary = shift;
|
|
277 my $db = $self->primary_db
|
|
278 or $self->throw("no primary namespace database is open");
|
|
279
|
|
280 my $record = $db->{$primary} or return; # nothing here
|
|
281
|
|
282 my($fileid,$offset,$length) = $self->unpack_primary($record);
|
|
283 my $filepath = $self->_fileno2path($fileid)
|
|
284 or $self->throw("no file path entry for fileid $fileid");
|
|
285 return ($filepath,$offset,$length);
|
|
286 }
|
|
287
|
|
288 # return a list of array refs in the form [$filepath,$offset,$length]
|
|
289 sub _lookup_secondary {
|
|
290 my $self = shift;
|
|
291 my ($namespace,$secondary) = @_;
|
|
292 my @primary = $self->expand_ids($namespace=>$secondary);
|
|
293 return map {[$self->_lookup_primary($_)]} @primary;
|
|
294 }
|
|
295
|
|
296 # store indexing information into a primary & secondary record
|
|
297 # $namespaces is one of:
|
|
298 # 1. a scalar corresponding to the primary name
|
|
299 # 2. a hashref corresponding to namespace=>id identifiers
|
|
300 # it is valid for secondary id to be an arrayref
|
|
301 sub _store_index {
|
|
302 my $self = shift;
|
|
303 my ($keys,$filepath,$offset,$length) = @_;
|
|
304 my ($primary,%secondary);
|
|
305
|
|
306 if (ref $keys eq 'HASH') {
|
|
307 my %valid_secondary = map {$_=>1} $self->secondary_namespaces;
|
|
308 while (my($ns,$value) = each %$keys) {
|
|
309 if ($ns eq $self->primary_namespace) {
|
|
310 $primary = $value;
|
|
311 } else {
|
|
312 $valid_secondary{$ns} or $self->throw("invalid secondary namespace $ns");
|
|
313 push @{$secondary{$ns}},$value;
|
|
314 }
|
|
315 }
|
|
316 $primary or $self->throw("no primary namespace ID provided");
|
|
317 } else {
|
|
318 $primary = $keys;
|
|
319 }
|
|
320
|
|
321 $self->throw("invalid primary ID; must be a scalar")
|
|
322 if ref($primary) =~ /^(ARRAY|HASH)$/; # but allow stringified objects
|
|
323
|
|
324 $self->_store_primary($primary,$filepath,$offset,$length);
|
|
325 for my $ns (keys %secondary) {
|
|
326 my @ids = ref $secondary{$ns} ? @{$secondary{$ns}} : $secondary{$ns};
|
|
327 $self->_store_secondary($ns,$_,$primary) foreach @ids;
|
|
328 }
|
|
329
|
|
330 1;
|
|
331 }
|
|
332
|
|
333 # store primary index
|
|
334 sub _store_primary {
|
|
335 my $self = shift;
|
|
336 my ($id,$filepath,$offset,$length) = @_;
|
|
337
|
|
338 my $db = $self->primary_db
|
|
339 or $self->throw("no primary namespace database is open");
|
|
340 my $fileno = $self->_path2fileno($filepath);
|
|
341 defined $fileno or $self->throw("could not create a file number for $filepath");
|
|
342
|
|
343 my $record = $self->pack_primary($fileno,$offset,$length);
|
|
344 $db->{$id} = $record or return; # nothing here
|
|
345 1;
|
|
346 }
|
|
347
|
|
348 # store a primary index name under a secondary index
|
|
349 sub _store_secondary {
|
|
350 my $self = shift;
|
|
351 my ($secondary_ns,$secondary_id,$primary_id) = @_;
|
|
352
|
|
353 my $db = $self->secondary_db($secondary_ns)
|
|
354 or $self->throw("invalid secondary namespace $secondary_ns");
|
|
355
|
|
356 # first get whatever secondary ids are already stored there
|
|
357 my @primary = $self->unpack_secondary($db->{$secondary_id});
|
|
358 # uniqueify
|
|
359 my %unique = map {$_=>undef} @primary,$primary_id;
|
|
360
|
|
361 my $record = $self->pack_secondary(keys %unique);
|
|
362 $db->{$secondary_id} = $record;
|
|
363 }
|
|
364
|
|
365 # get output file handle
|
|
366 sub _outfh {
|
|
367 my $self = shift;
|
|
368 #### XXXXX FINISH #####
|
|
369 # my $
|
|
370 }
|
|
371
|
|
372 # unpack a primary record into fileid,offset,length
|
|
373 sub unpack_primary {
|
|
374 my $self = shift;
|
|
375 my $index_record = shift;
|
|
376 return split "\t",$index_record;
|
|
377 }
|
|
378
|
|
379 # unpack a secondary record into a list of primary ids
|
|
380 sub unpack_secondary {
|
|
381 my $self = shift;
|
|
382 my $index_record = shift or return;
|
|
383 return split "\t",$index_record;
|
|
384 }
|
|
385
|
|
386 # pack a list of fileid,offset,length into a primary id record
|
|
387 sub pack_primary {
|
|
388 my $self = shift;
|
|
389 my ($fileid,$offset,$length) = @_;
|
|
390 return join "\t",($fileid,$offset,$length);
|
|
391 }
|
|
392
|
|
393 # pack a list of primary ids into a secondary id record
|
|
394 sub pack_secondary {
|
|
395 my $self = shift;
|
|
396 my @secondaries = @_;
|
|
397 return join "\t",@secondaries;
|
|
398 }
|
|
399
|
|
400 sub primary_db {
|
|
401 my $self = shift;
|
|
402 # lazy opening
|
|
403 $self->_open_bdb unless exists $self->{bdb_primary_db};
|
|
404 return $self->{bdb_primary_db};
|
|
405 }
|
|
406
|
|
407 sub secondary_db {
|
|
408 my $self = shift;
|
|
409 my $secondary_namespace = shift
|
|
410 or $self->throw("usage: secondary_db(\$secondary_namespace)");
|
|
411 $self->_open_bdb unless exists $self->{bdb_primary_db};
|
|
412 return $self->{bdb_secondary_db}{$secondary_namespace};
|
|
413 }
|
|
414
|
|
415 sub _open_bdb {
|
|
416 my $self = shift;
|
|
417
|
|
418 my $flags = $self->write_flag ? O_CREAT|O_RDWR : O_RDONLY;
|
|
419
|
|
420 my $primary_db = {};
|
|
421 tie(%$primary_db,'DB_File',$self->_catfile($self->_primary_db_name),$flags,0666,$DB_BTREE)
|
|
422 or $self->throw("Could not open primary index file: $! (did you remember to use -write_flag=>1?)");
|
|
423 $self->{bdb_primary_db} = $primary_db;
|
|
424
|
|
425 for my $secondary ($self->secondary_namespaces) {
|
|
426 my $secondary_db = {};
|
|
427 tie(%$secondary_db,'DB_File',$self->_catfile($self->_secondary_db_name($secondary)),$flags,0666,$DB_BTREE)
|
|
428 or $self->throw("Could not open primary index file");
|
|
429 $self->{bdb_secondary_db}{$secondary} = $secondary_db;
|
|
430 }
|
|
431
|
|
432 1;
|
|
433 }
|
|
434
|
|
435 sub _primary_db_name {
|
|
436 my $self = shift;
|
|
437 my $pns = $self->primary_namespace or $self->throw('no primary namespace defined');
|
|
438 return "key_$pns";
|
|
439 }
|
|
440
|
|
441 sub _secondary_db_name {
|
|
442 my $self = shift;
|
|
443 my $sns = shift;
|
|
444 return "id_$sns";
|
|
445 }
|
|
446
|
|
447 sub _fhcache {
|
|
448 my $self = shift;
|
|
449 my $path = shift;
|
|
450 my $write = shift;
|
|
451
|
|
452 if (!$self->{bdb_fhcache}{$path}) {
|
|
453 $self->{bdb_curopen} ||= 0;
|
|
454 if ($self->{bdb_curopen} >= $self->{bdb_maxopen}) {
|
|
455 my @lru = sort {$self->{bdb_cacheseq}{$a} <=> $self->{bdb_cacheseq}{$b};} keys %{$self->{bdb_fhcache}};
|
|
456 splice(@lru, $self->{bdb_maxopen} / 3);
|
|
457 $self->{bdb_curopen} -= @lru;
|
|
458 for (@lru) { delete $self->{bdb_fhcache}{$_} }
|
|
459 }
|
|
460 if ($write) {
|
|
461 my $modifier = $self->{bdb_fhcache_seenit}{$path}++ ? '>' : '>>';
|
|
462 $self->{bdb_fhcache}{$path} = IO::File->new("${modifier}${path}") or return;
|
|
463 } else {
|
|
464 $self->{bdb_fhcache}{$path} = IO::File->new($path) or return;
|
|
465 }
|
|
466 $self->{bdb_curopen}++;
|
|
467 }
|
|
468 $self->{bdb_cacheseq}{$path}++;
|
|
469 $self->{bdb_fhcache}{$path}
|
|
470 }
|
|
471
|
|
472 1;
|