comparison variant_effect_predictor/Bio/DB/Flat/BDB.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 #
2 # $Id: BDB.pm,v 1.6.2.1 2003/03/25 18:46:10 jason Exp $
3 #
4 # BioPerl module for Bio::DB::Flat::BDB
5 #
6 # Cared for by Lincoln Stein <lstein@cshl.org>
7 #
8 # You may distribute this module under the same terms as perl itself
9
10 # POD documentation - main docs before the code
11
12 =head1 NAME
13
14 Bio::DB::Flat::BDB - Interface for BioHackathon standard BDB-indexed flat file
15
16 =head1 SYNOPSIS
17
18 You should not be using this module directly. See Bio::DB::Flat.
19
20 =head1 DESCRIPTION
21
22 This object provides the basic mechanism to associate positions in
23 files with primary and secondary name spaces. Unlike
24 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
25 to work with the BerkeleyDB-indexed "common" flat file format worked
26 out at the 2002 BioHackathon.
27
28 This object is the guts to the mechanism, which will be used by the
29 specific objects inheriting from it.
30
31 =head1 FEEDBACK
32
33 =head2 Mailing Lists
34
35 User feedback is an integral part of the evolution of this and other
36 Bioperl modules. Send your comments and suggestions preferably to one
37 of the Bioperl mailing lists. Your participation is much appreciated.
38
39 bioperl-l@bioperl.org - General discussion
40 http://bioperl.org/MailList.shtml - About the mailing lists
41
42 =head2 Reporting Bugs
43
44 Report bugs to the Bioperl bug tracking system to help us keep track
45 the bugs and their resolution. Bug reports can be submitted via
46 email or the web:
47
48 http://bugzilla.bioperl.org/
49
50 =head1 AUTHOR - Lincoln Stein
51
52 Email - lstein@cshl.org
53
54 =head1 SEE ALSO
55
56 L<Bio::DB::Flat>,
57
58 =head1 APPENDIX
59
60 The rest of the documentation details each of the object methods. Internal
61 methods are usually preceded with an "_" (underscore).
62
63 =cut
64
65
66 # Let the code begin...
67
68 package Bio::DB::Flat::BDB;
69
70 use strict;
71 use DB_File;
72 use IO::File;
73 use Fcntl qw(O_CREAT O_RDWR O_RDONLY);
74 use File::Spec;
75 use Bio::DB::Flat;
76 use Bio::SeqIO;
77 use Bio::DB::RandomAccessI;
78 use Bio::Root::Root;
79 use Bio::Root::IO;
80 use vars '@ISA';
81
82 @ISA = qw(Bio::DB::Flat);
83
84 sub _initialize {
85 my $self = shift;
86 my ($max_open) = $self->_rearrange(['MAXOPEN'],@_);
87 $self->{bdb_maxopen} = $max_open || 32;
88 }
89
90 # return a filehandle seeked to the appropriate place
91 # this only works with the primary namespace
92 sub _get_stream {
93 my ($self,$id) = @_;
94 my ($filepath,$offset,$length) = $self->_lookup_primary($id)
95 or $self->throw("Unable to find a record for $id in the flat file index");
96 my $fh = $self->_fhcache($filepath)
97 or $self->throw("couldn't open $filepath: $!");
98 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
99 $fh;
100 }
101
102 # return records corresponding to the indicated index
103 # if there are multiple hits will return a list in list context,
104 # otherwise will throw an exception
105 sub fetch_raw {
106 my ($self,$id,$namespace) = @_;
107
108 # secondary lookup
109 if (defined $namespace && $namespace ne $self->primary_namespace) {
110 my @hits = $self->_lookup_secondary($namespace,$id);
111 $self->throw("Multiple records correspond to $namespace=>$id but function called in a scalar context")
112 unless wantarray;
113 return map {$self->_read_record(@$_)} @hits;
114 }
115
116 # primary lookup
117 my @args = $self->_lookup_primary($id)
118 or $self->throw("Unable to find a record for $id in the flat file index");
119 return $self->_read_record(@args);
120 }
121
122 # create real live Bio::Seq object
123 sub get_Seq_by_id {
124 my $self = shift;
125 my $id = shift;
126 my $fh = eval {$self->_get_stream($id)} or return;
127 my $seqio =
128 $self->{bdb_cached_parsers}{fileno $fh} ||= Bio::SeqIO->new( -Format => $self->file_format,
129 -fh => $fh);
130 return $seqio->next_seq;
131 }
132
133 # fetch array of Bio::Seq objects
134 sub get_Seq_by_acc {
135 my $self = shift;
136 unshift @_,'ACC' if @_==1;
137 my ($ns,$key) = @_;
138 my @primary_ids = $self->expand_ids($ns => $key);
139 $self->throw("more than one sequences correspond to this accession")
140 if @primary_ids > 1 && ! wantarray;
141 my @rc = map {$self->get_Seq_by_id($_)} @primary_ids;
142 return wantarray ? @rc : $rc[0];
143 }
144
145 # fetch array of Bio::Seq objects
146 sub get_Seq_by_version {
147 my $self = shift;
148 unshift @_,'VERSION' if @_==1;
149 my ($ns,$key) = @_;
150 my @primary_ids = $self->expand_ids($ns => $key);
151 $self->throw("more than one sequences correspond to this accession")
152 if @primary_ids > 1 && !wantarray;
153 return map {$self->get_Seq_by_id($_)} @primary_ids;
154 }
155
156 =head2 get_PrimarySeq_stream
157
158 Title : get_PrimarySeq_stream
159 Usage : $stream = get_PrimarySeq_stream
160 Function: Makes a Bio::DB::SeqStreamI compliant object
161 which provides a single method, next_primary_seq
162 Returns : Bio::DB::SeqStreamI
163 Args : none
164
165
166 =cut
167
168 sub get_PrimarySeq_stream {
169 my $self = shift;
170 my @files = $self->files || 0;
171 my $out = Bio::SeqIO::MultiFile->new( -format => $self->file_format ,
172 -files => \@files);
173 return $out;
174 }
175
176 sub get_all_primary_ids {
177 my $self = shift;
178 my $db = $self->primary_db;
179 return keys %$db;
180 }
181
182 =head2 get_all_primary_ids
183
184 Title : get_all_primary_ids
185 Usage : @ids = $seqdb->get_all_primary_ids()
186 Function: gives an array of all the primary_ids of the
187 sequence objects in the database.
188 Example :
189 Returns : an array of strings
190 Args : none
191
192 =cut
193
194 # this will perform an ID lookup on a (possibly secondary)
195 # id, returning all the corresponding ids
196 sub expand_ids {
197 my $self = shift;
198 my ($ns,$key) = @_;
199 return $key unless defined $ns;
200 return $key if $ns eq $self->primary_namespace;
201 my $db = $self->secondary_db($ns)
202 or $self->throw("invalid secondary namespace $ns");
203 my $record = $db->{$key} or return; # nothing there
204 return $self->unpack_secondary($record);
205 }
206
207 # build index from files listed
208 sub build_index {
209 my $self = shift;
210 my @files = @_;
211 my $count = 0;
212 for my $file (@files) {
213 $file = File::Spec->rel2abs($file)
214 unless File::Spec->file_name_is_absolute($file);
215 $count += $self->_index_file($file);
216 }
217 $self->write_config;
218 $count;
219 }
220
221 sub _index_file {
222 my $self = shift;
223 my $file = shift;
224
225 my $fileno = $self->_path2fileno($file);
226 defined $fileno or $self->throw("could not create a file number for $file");
227
228 my $fh = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!");
229 my $offset = 0;
230 my $count = 0;
231 while (!eof($fh)) {
232 my ($ids,$adjustment) = $self->parse_one_record($fh) or next;
233 $adjustment ||= 0; # prevent uninit variable warning
234 my $pos = tell($fh) + $adjustment;
235 $self->_store_index($ids,$file,$offset,$pos-$offset);
236 $offset = $pos;
237 $count++;
238 }
239 $count;
240 }
241
242 =head2 To Be Implemented in Subclasses
243
244 The following methods MUST be implemented by subclasses.
245
246 =cut
247
248 =head2 May Be Overridden in Subclasses
249
250 The following methods MAY be overridden by subclasses.
251
252 =cut
253
254 sub default_primary_namespace {
255 return "ACC";
256 }
257
258 sub default_secondary_namespaces {
259 return;
260 }
261
262 sub _read_record {
263 my $self = shift;
264 my ($filepath,$offset,$length) = @_;
265 my $fh = $self->_fhcache($filepath)
266 or $self->throw("couldn't open $filepath: $!");
267 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!");
268 my $record;
269 read($fh,$record,$length) or $self->throw("can't read $filepath: $!");
270 $record
271 }
272
273 # return a list in the form ($filepath,$offset,$length)
274 sub _lookup_primary {
275 my $self = shift;
276 my $primary = shift;
277 my $db = $self->primary_db
278 or $self->throw("no primary namespace database is open");
279
280 my $record = $db->{$primary} or return; # nothing here
281
282 my($fileid,$offset,$length) = $self->unpack_primary($record);
283 my $filepath = $self->_fileno2path($fileid)
284 or $self->throw("no file path entry for fileid $fileid");
285 return ($filepath,$offset,$length);
286 }
287
288 # return a list of array refs in the form [$filepath,$offset,$length]
289 sub _lookup_secondary {
290 my $self = shift;
291 my ($namespace,$secondary) = @_;
292 my @primary = $self->expand_ids($namespace=>$secondary);
293 return map {[$self->_lookup_primary($_)]} @primary;
294 }
295
296 # store indexing information into a primary & secondary record
297 # $namespaces is one of:
298 # 1. a scalar corresponding to the primary name
299 # 2. a hashref corresponding to namespace=>id identifiers
300 # it is valid for secondary id to be an arrayref
301 sub _store_index {
302 my $self = shift;
303 my ($keys,$filepath,$offset,$length) = @_;
304 my ($primary,%secondary);
305
306 if (ref $keys eq 'HASH') {
307 my %valid_secondary = map {$_=>1} $self->secondary_namespaces;
308 while (my($ns,$value) = each %$keys) {
309 if ($ns eq $self->primary_namespace) {
310 $primary = $value;
311 } else {
312 $valid_secondary{$ns} or $self->throw("invalid secondary namespace $ns");
313 push @{$secondary{$ns}},$value;
314 }
315 }
316 $primary or $self->throw("no primary namespace ID provided");
317 } else {
318 $primary = $keys;
319 }
320
321 $self->throw("invalid primary ID; must be a scalar")
322 if ref($primary) =~ /^(ARRAY|HASH)$/; # but allow stringified objects
323
324 $self->_store_primary($primary,$filepath,$offset,$length);
325 for my $ns (keys %secondary) {
326 my @ids = ref $secondary{$ns} ? @{$secondary{$ns}} : $secondary{$ns};
327 $self->_store_secondary($ns,$_,$primary) foreach @ids;
328 }
329
330 1;
331 }
332
333 # store primary index
334 sub _store_primary {
335 my $self = shift;
336 my ($id,$filepath,$offset,$length) = @_;
337
338 my $db = $self->primary_db
339 or $self->throw("no primary namespace database is open");
340 my $fileno = $self->_path2fileno($filepath);
341 defined $fileno or $self->throw("could not create a file number for $filepath");
342
343 my $record = $self->pack_primary($fileno,$offset,$length);
344 $db->{$id} = $record or return; # nothing here
345 1;
346 }
347
348 # store a primary index name under a secondary index
349 sub _store_secondary {
350 my $self = shift;
351 my ($secondary_ns,$secondary_id,$primary_id) = @_;
352
353 my $db = $self->secondary_db($secondary_ns)
354 or $self->throw("invalid secondary namespace $secondary_ns");
355
356 # first get whatever secondary ids are already stored there
357 my @primary = $self->unpack_secondary($db->{$secondary_id});
358 # uniqueify
359 my %unique = map {$_=>undef} @primary,$primary_id;
360
361 my $record = $self->pack_secondary(keys %unique);
362 $db->{$secondary_id} = $record;
363 }
364
365 # get output file handle
366 sub _outfh {
367 my $self = shift;
368 #### XXXXX FINISH #####
369 # my $
370 }
371
372 # unpack a primary record into fileid,offset,length
373 sub unpack_primary {
374 my $self = shift;
375 my $index_record = shift;
376 return split "\t",$index_record;
377 }
378
379 # unpack a secondary record into a list of primary ids
380 sub unpack_secondary {
381 my $self = shift;
382 my $index_record = shift or return;
383 return split "\t",$index_record;
384 }
385
386 # pack a list of fileid,offset,length into a primary id record
387 sub pack_primary {
388 my $self = shift;
389 my ($fileid,$offset,$length) = @_;
390 return join "\t",($fileid,$offset,$length);
391 }
392
393 # pack a list of primary ids into a secondary id record
394 sub pack_secondary {
395 my $self = shift;
396 my @secondaries = @_;
397 return join "\t",@secondaries;
398 }
399
400 sub primary_db {
401 my $self = shift;
402 # lazy opening
403 $self->_open_bdb unless exists $self->{bdb_primary_db};
404 return $self->{bdb_primary_db};
405 }
406
407 sub secondary_db {
408 my $self = shift;
409 my $secondary_namespace = shift
410 or $self->throw("usage: secondary_db(\$secondary_namespace)");
411 $self->_open_bdb unless exists $self->{bdb_primary_db};
412 return $self->{bdb_secondary_db}{$secondary_namespace};
413 }
414
415 sub _open_bdb {
416 my $self = shift;
417
418 my $flags = $self->write_flag ? O_CREAT|O_RDWR : O_RDONLY;
419
420 my $primary_db = {};
421 tie(%$primary_db,'DB_File',$self->_catfile($self->_primary_db_name),$flags,0666,$DB_BTREE)
422 or $self->throw("Could not open primary index file: $! (did you remember to use -write_flag=>1?)");
423 $self->{bdb_primary_db} = $primary_db;
424
425 for my $secondary ($self->secondary_namespaces) {
426 my $secondary_db = {};
427 tie(%$secondary_db,'DB_File',$self->_catfile($self->_secondary_db_name($secondary)),$flags,0666,$DB_BTREE)
428 or $self->throw("Could not open primary index file");
429 $self->{bdb_secondary_db}{$secondary} = $secondary_db;
430 }
431
432 1;
433 }
434
435 sub _primary_db_name {
436 my $self = shift;
437 my $pns = $self->primary_namespace or $self->throw('no primary namespace defined');
438 return "key_$pns";
439 }
440
441 sub _secondary_db_name {
442 my $self = shift;
443 my $sns = shift;
444 return "id_$sns";
445 }
446
447 sub _fhcache {
448 my $self = shift;
449 my $path = shift;
450 my $write = shift;
451
452 if (!$self->{bdb_fhcache}{$path}) {
453 $self->{bdb_curopen} ||= 0;
454 if ($self->{bdb_curopen} >= $self->{bdb_maxopen}) {
455 my @lru = sort {$self->{bdb_cacheseq}{$a} <=> $self->{bdb_cacheseq}{$b};} keys %{$self->{bdb_fhcache}};
456 splice(@lru, $self->{bdb_maxopen} / 3);
457 $self->{bdb_curopen} -= @lru;
458 for (@lru) { delete $self->{bdb_fhcache}{$_} }
459 }
460 if ($write) {
461 my $modifier = $self->{bdb_fhcache_seenit}{$path}++ ? '>' : '>>';
462 $self->{bdb_fhcache}{$path} = IO::File->new("${modifier}${path}") or return;
463 } else {
464 $self->{bdb_fhcache}{$path} = IO::File->new($path) or return;
465 }
466 $self->{bdb_curopen}++;
467 }
468 $self->{bdb_cacheseq}{$path}++;
469 $self->{bdb_fhcache}{$path}
470 }
471
472 1;