Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/Flat/BDB.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # | |
2 # $Id: BDB.pm,v 1.6.2.1 2003/03/25 18:46:10 jason Exp $ | |
3 # | |
4 # BioPerl module for Bio::DB::Flat::BDB | |
5 # | |
6 # Cared for by Lincoln Stein <lstein@cshl.org> | |
7 # | |
8 # You may distribute this module under the same terms as perl itself | |
9 | |
10 # POD documentation - main docs before the code | |
11 | |
12 =head1 NAME | |
13 | |
14 Bio::DB::Flat::BDB - Interface for BioHackathon standard BDB-indexed flat file | |
15 | |
16 =head1 SYNOPSIS | |
17 | |
18 You should not be using this module directly. See Bio::DB::Flat. | |
19 | |
20 =head1 DESCRIPTION | |
21 | |
22 This object provides the basic mechanism to associate positions in | |
23 files with primary and secondary name spaces. Unlike | |
24 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized | |
25 to work with the BerkeleyDB-indexed "common" flat file format worked | |
26 out at the 2002 BioHackathon. | |
27 | |
28 This object is the guts to the mechanism, which will be used by the | |
29 specific objects inheriting from it. | |
30 | |
31 =head1 FEEDBACK | |
32 | |
33 =head2 Mailing Lists | |
34 | |
35 User feedback is an integral part of the evolution of this and other | |
36 Bioperl modules. Send your comments and suggestions preferably to one | |
37 of the Bioperl mailing lists. Your participation is much appreciated. | |
38 | |
39 bioperl-l@bioperl.org - General discussion | |
40 http://bioperl.org/MailList.shtml - About the mailing lists | |
41 | |
42 =head2 Reporting Bugs | |
43 | |
44 Report bugs to the Bioperl bug tracking system to help us keep track | |
45 the bugs and their resolution. Bug reports can be submitted via | |
46 email or the web: | |
47 | |
48 http://bugzilla.bioperl.org/ | |
49 | |
50 =head1 AUTHOR - Lincoln Stein | |
51 | |
52 Email - lstein@cshl.org | |
53 | |
54 =head1 SEE ALSO | |
55 | |
56 L<Bio::DB::Flat>, | |
57 | |
58 =head1 APPENDIX | |
59 | |
60 The rest of the documentation details each of the object methods. Internal | |
61 methods are usually preceded with an "_" (underscore). | |
62 | |
63 =cut | |
64 | |
65 | |
66 # Let the code begin... | |
67 | |
68 package Bio::DB::Flat::BDB; | |
69 | |
70 use strict; | |
71 use DB_File; | |
72 use IO::File; | |
73 use Fcntl qw(O_CREAT O_RDWR O_RDONLY); | |
74 use File::Spec; | |
75 use Bio::DB::Flat; | |
76 use Bio::SeqIO; | |
77 use Bio::DB::RandomAccessI; | |
78 use Bio::Root::Root; | |
79 use Bio::Root::IO; | |
80 use vars '@ISA'; | |
81 | |
82 @ISA = qw(Bio::DB::Flat); | |
83 | |
84 sub _initialize { | |
85 my $self = shift; | |
86 my ($max_open) = $self->_rearrange(['MAXOPEN'],@_); | |
87 $self->{bdb_maxopen} = $max_open || 32; | |
88 } | |
89 | |
90 # return a filehandle seeked to the appropriate place | |
91 # this only works with the primary namespace | |
92 sub _get_stream { | |
93 my ($self,$id) = @_; | |
94 my ($filepath,$offset,$length) = $self->_lookup_primary($id) | |
95 or $self->throw("Unable to find a record for $id in the flat file index"); | |
96 my $fh = $self->_fhcache($filepath) | |
97 or $self->throw("couldn't open $filepath: $!"); | |
98 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!"); | |
99 $fh; | |
100 } | |
101 | |
102 # return records corresponding to the indicated index | |
103 # if there are multiple hits will return a list in list context, | |
104 # otherwise will throw an exception | |
105 sub fetch_raw { | |
106 my ($self,$id,$namespace) = @_; | |
107 | |
108 # secondary lookup | |
109 if (defined $namespace && $namespace ne $self->primary_namespace) { | |
110 my @hits = $self->_lookup_secondary($namespace,$id); | |
111 $self->throw("Multiple records correspond to $namespace=>$id but function called in a scalar context") | |
112 unless wantarray; | |
113 return map {$self->_read_record(@$_)} @hits; | |
114 } | |
115 | |
116 # primary lookup | |
117 my @args = $self->_lookup_primary($id) | |
118 or $self->throw("Unable to find a record for $id in the flat file index"); | |
119 return $self->_read_record(@args); | |
120 } | |
121 | |
122 # create real live Bio::Seq object | |
123 sub get_Seq_by_id { | |
124 my $self = shift; | |
125 my $id = shift; | |
126 my $fh = eval {$self->_get_stream($id)} or return; | |
127 my $seqio = | |
128 $self->{bdb_cached_parsers}{fileno $fh} ||= Bio::SeqIO->new( -Format => $self->file_format, | |
129 -fh => $fh); | |
130 return $seqio->next_seq; | |
131 } | |
132 | |
133 # fetch array of Bio::Seq objects | |
134 sub get_Seq_by_acc { | |
135 my $self = shift; | |
136 unshift @_,'ACC' if @_==1; | |
137 my ($ns,$key) = @_; | |
138 my @primary_ids = $self->expand_ids($ns => $key); | |
139 $self->throw("more than one sequences correspond to this accession") | |
140 if @primary_ids > 1 && ! wantarray; | |
141 my @rc = map {$self->get_Seq_by_id($_)} @primary_ids; | |
142 return wantarray ? @rc : $rc[0]; | |
143 } | |
144 | |
145 # fetch array of Bio::Seq objects | |
146 sub get_Seq_by_version { | |
147 my $self = shift; | |
148 unshift @_,'VERSION' if @_==1; | |
149 my ($ns,$key) = @_; | |
150 my @primary_ids = $self->expand_ids($ns => $key); | |
151 $self->throw("more than one sequences correspond to this accession") | |
152 if @primary_ids > 1 && !wantarray; | |
153 return map {$self->get_Seq_by_id($_)} @primary_ids; | |
154 } | |
155 | |
156 =head2 get_PrimarySeq_stream | |
157 | |
158 Title : get_PrimarySeq_stream | |
159 Usage : $stream = get_PrimarySeq_stream | |
160 Function: Makes a Bio::DB::SeqStreamI compliant object | |
161 which provides a single method, next_primary_seq | |
162 Returns : Bio::DB::SeqStreamI | |
163 Args : none | |
164 | |
165 | |
166 =cut | |
167 | |
168 sub get_PrimarySeq_stream { | |
169 my $self = shift; | |
170 my @files = $self->files || 0; | |
171 my $out = Bio::SeqIO::MultiFile->new( -format => $self->file_format , | |
172 -files => \@files); | |
173 return $out; | |
174 } | |
175 | |
176 sub get_all_primary_ids { | |
177 my $self = shift; | |
178 my $db = $self->primary_db; | |
179 return keys %$db; | |
180 } | |
181 | |
182 =head2 get_all_primary_ids | |
183 | |
184 Title : get_all_primary_ids | |
185 Usage : @ids = $seqdb->get_all_primary_ids() | |
186 Function: gives an array of all the primary_ids of the | |
187 sequence objects in the database. | |
188 Example : | |
189 Returns : an array of strings | |
190 Args : none | |
191 | |
192 =cut | |
193 | |
194 # this will perform an ID lookup on a (possibly secondary) | |
195 # id, returning all the corresponding ids | |
196 sub expand_ids { | |
197 my $self = shift; | |
198 my ($ns,$key) = @_; | |
199 return $key unless defined $ns; | |
200 return $key if $ns eq $self->primary_namespace; | |
201 my $db = $self->secondary_db($ns) | |
202 or $self->throw("invalid secondary namespace $ns"); | |
203 my $record = $db->{$key} or return; # nothing there | |
204 return $self->unpack_secondary($record); | |
205 } | |
206 | |
207 # build index from files listed | |
208 sub build_index { | |
209 my $self = shift; | |
210 my @files = @_; | |
211 my $count = 0; | |
212 for my $file (@files) { | |
213 $file = File::Spec->rel2abs($file) | |
214 unless File::Spec->file_name_is_absolute($file); | |
215 $count += $self->_index_file($file); | |
216 } | |
217 $self->write_config; | |
218 $count; | |
219 } | |
220 | |
221 sub _index_file { | |
222 my $self = shift; | |
223 my $file = shift; | |
224 | |
225 my $fileno = $self->_path2fileno($file); | |
226 defined $fileno or $self->throw("could not create a file number for $file"); | |
227 | |
228 my $fh = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!"); | |
229 my $offset = 0; | |
230 my $count = 0; | |
231 while (!eof($fh)) { | |
232 my ($ids,$adjustment) = $self->parse_one_record($fh) or next; | |
233 $adjustment ||= 0; # prevent uninit variable warning | |
234 my $pos = tell($fh) + $adjustment; | |
235 $self->_store_index($ids,$file,$offset,$pos-$offset); | |
236 $offset = $pos; | |
237 $count++; | |
238 } | |
239 $count; | |
240 } | |
241 | |
242 =head2 To Be Implemented in Subclasses | |
243 | |
244 The following methods MUST be implemented by subclasses. | |
245 | |
246 =cut | |
247 | |
248 =head2 May Be Overridden in Subclasses | |
249 | |
250 The following methods MAY be overridden by subclasses. | |
251 | |
252 =cut | |
253 | |
254 sub default_primary_namespace { | |
255 return "ACC"; | |
256 } | |
257 | |
258 sub default_secondary_namespaces { | |
259 return; | |
260 } | |
261 | |
262 sub _read_record { | |
263 my $self = shift; | |
264 my ($filepath,$offset,$length) = @_; | |
265 my $fh = $self->_fhcache($filepath) | |
266 or $self->throw("couldn't open $filepath: $!"); | |
267 seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!"); | |
268 my $record; | |
269 read($fh,$record,$length) or $self->throw("can't read $filepath: $!"); | |
270 $record | |
271 } | |
272 | |
273 # return a list in the form ($filepath,$offset,$length) | |
274 sub _lookup_primary { | |
275 my $self = shift; | |
276 my $primary = shift; | |
277 my $db = $self->primary_db | |
278 or $self->throw("no primary namespace database is open"); | |
279 | |
280 my $record = $db->{$primary} or return; # nothing here | |
281 | |
282 my($fileid,$offset,$length) = $self->unpack_primary($record); | |
283 my $filepath = $self->_fileno2path($fileid) | |
284 or $self->throw("no file path entry for fileid $fileid"); | |
285 return ($filepath,$offset,$length); | |
286 } | |
287 | |
288 # return a list of array refs in the form [$filepath,$offset,$length] | |
289 sub _lookup_secondary { | |
290 my $self = shift; | |
291 my ($namespace,$secondary) = @_; | |
292 my @primary = $self->expand_ids($namespace=>$secondary); | |
293 return map {[$self->_lookup_primary($_)]} @primary; | |
294 } | |
295 | |
296 # store indexing information into a primary & secondary record | |
297 # $namespaces is one of: | |
298 # 1. a scalar corresponding to the primary name | |
299 # 2. a hashref corresponding to namespace=>id identifiers | |
300 # it is valid for secondary id to be an arrayref | |
301 sub _store_index { | |
302 my $self = shift; | |
303 my ($keys,$filepath,$offset,$length) = @_; | |
304 my ($primary,%secondary); | |
305 | |
306 if (ref $keys eq 'HASH') { | |
307 my %valid_secondary = map {$_=>1} $self->secondary_namespaces; | |
308 while (my($ns,$value) = each %$keys) { | |
309 if ($ns eq $self->primary_namespace) { | |
310 $primary = $value; | |
311 } else { | |
312 $valid_secondary{$ns} or $self->throw("invalid secondary namespace $ns"); | |
313 push @{$secondary{$ns}},$value; | |
314 } | |
315 } | |
316 $primary or $self->throw("no primary namespace ID provided"); | |
317 } else { | |
318 $primary = $keys; | |
319 } | |
320 | |
321 $self->throw("invalid primary ID; must be a scalar") | |
322 if ref($primary) =~ /^(ARRAY|HASH)$/; # but allow stringified objects | |
323 | |
324 $self->_store_primary($primary,$filepath,$offset,$length); | |
325 for my $ns (keys %secondary) { | |
326 my @ids = ref $secondary{$ns} ? @{$secondary{$ns}} : $secondary{$ns}; | |
327 $self->_store_secondary($ns,$_,$primary) foreach @ids; | |
328 } | |
329 | |
330 1; | |
331 } | |
332 | |
333 # store primary index | |
334 sub _store_primary { | |
335 my $self = shift; | |
336 my ($id,$filepath,$offset,$length) = @_; | |
337 | |
338 my $db = $self->primary_db | |
339 or $self->throw("no primary namespace database is open"); | |
340 my $fileno = $self->_path2fileno($filepath); | |
341 defined $fileno or $self->throw("could not create a file number for $filepath"); | |
342 | |
343 my $record = $self->pack_primary($fileno,$offset,$length); | |
344 $db->{$id} = $record or return; # nothing here | |
345 1; | |
346 } | |
347 | |
348 # store a primary index name under a secondary index | |
349 sub _store_secondary { | |
350 my $self = shift; | |
351 my ($secondary_ns,$secondary_id,$primary_id) = @_; | |
352 | |
353 my $db = $self->secondary_db($secondary_ns) | |
354 or $self->throw("invalid secondary namespace $secondary_ns"); | |
355 | |
356 # first get whatever secondary ids are already stored there | |
357 my @primary = $self->unpack_secondary($db->{$secondary_id}); | |
358 # uniqueify | |
359 my %unique = map {$_=>undef} @primary,$primary_id; | |
360 | |
361 my $record = $self->pack_secondary(keys %unique); | |
362 $db->{$secondary_id} = $record; | |
363 } | |
364 | |
365 # get output file handle | |
366 sub _outfh { | |
367 my $self = shift; | |
368 #### XXXXX FINISH ##### | |
369 # my $ | |
370 } | |
371 | |
372 # unpack a primary record into fileid,offset,length | |
373 sub unpack_primary { | |
374 my $self = shift; | |
375 my $index_record = shift; | |
376 return split "\t",$index_record; | |
377 } | |
378 | |
379 # unpack a secondary record into a list of primary ids | |
380 sub unpack_secondary { | |
381 my $self = shift; | |
382 my $index_record = shift or return; | |
383 return split "\t",$index_record; | |
384 } | |
385 | |
386 # pack a list of fileid,offset,length into a primary id record | |
387 sub pack_primary { | |
388 my $self = shift; | |
389 my ($fileid,$offset,$length) = @_; | |
390 return join "\t",($fileid,$offset,$length); | |
391 } | |
392 | |
393 # pack a list of primary ids into a secondary id record | |
394 sub pack_secondary { | |
395 my $self = shift; | |
396 my @secondaries = @_; | |
397 return join "\t",@secondaries; | |
398 } | |
399 | |
400 sub primary_db { | |
401 my $self = shift; | |
402 # lazy opening | |
403 $self->_open_bdb unless exists $self->{bdb_primary_db}; | |
404 return $self->{bdb_primary_db}; | |
405 } | |
406 | |
407 sub secondary_db { | |
408 my $self = shift; | |
409 my $secondary_namespace = shift | |
410 or $self->throw("usage: secondary_db(\$secondary_namespace)"); | |
411 $self->_open_bdb unless exists $self->{bdb_primary_db}; | |
412 return $self->{bdb_secondary_db}{$secondary_namespace}; | |
413 } | |
414 | |
415 sub _open_bdb { | |
416 my $self = shift; | |
417 | |
418 my $flags = $self->write_flag ? O_CREAT|O_RDWR : O_RDONLY; | |
419 | |
420 my $primary_db = {}; | |
421 tie(%$primary_db,'DB_File',$self->_catfile($self->_primary_db_name),$flags,0666,$DB_BTREE) | |
422 or $self->throw("Could not open primary index file: $! (did you remember to use -write_flag=>1?)"); | |
423 $self->{bdb_primary_db} = $primary_db; | |
424 | |
425 for my $secondary ($self->secondary_namespaces) { | |
426 my $secondary_db = {}; | |
427 tie(%$secondary_db,'DB_File',$self->_catfile($self->_secondary_db_name($secondary)),$flags,0666,$DB_BTREE) | |
428 or $self->throw("Could not open primary index file"); | |
429 $self->{bdb_secondary_db}{$secondary} = $secondary_db; | |
430 } | |
431 | |
432 1; | |
433 } | |
434 | |
435 sub _primary_db_name { | |
436 my $self = shift; | |
437 my $pns = $self->primary_namespace or $self->throw('no primary namespace defined'); | |
438 return "key_$pns"; | |
439 } | |
440 | |
441 sub _secondary_db_name { | |
442 my $self = shift; | |
443 my $sns = shift; | |
444 return "id_$sns"; | |
445 } | |
446 | |
447 sub _fhcache { | |
448 my $self = shift; | |
449 my $path = shift; | |
450 my $write = shift; | |
451 | |
452 if (!$self->{bdb_fhcache}{$path}) { | |
453 $self->{bdb_curopen} ||= 0; | |
454 if ($self->{bdb_curopen} >= $self->{bdb_maxopen}) { | |
455 my @lru = sort {$self->{bdb_cacheseq}{$a} <=> $self->{bdb_cacheseq}{$b};} keys %{$self->{bdb_fhcache}}; | |
456 splice(@lru, $self->{bdb_maxopen} / 3); | |
457 $self->{bdb_curopen} -= @lru; | |
458 for (@lru) { delete $self->{bdb_fhcache}{$_} } | |
459 } | |
460 if ($write) { | |
461 my $modifier = $self->{bdb_fhcache_seenit}{$path}++ ? '>' : '>>'; | |
462 $self->{bdb_fhcache}{$path} = IO::File->new("${modifier}${path}") or return; | |
463 } else { | |
464 $self->{bdb_fhcache}{$path} = IO::File->new($path) or return; | |
465 } | |
466 $self->{bdb_curopen}++; | |
467 } | |
468 $self->{bdb_cacheseq}{$path}++; | |
469 $self->{bdb_fhcache}{$path} | |
470 } | |
471 | |
472 1; |