comparison variant_effect_predictor/Bio/DB/Flat.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 #
2 # $Id: Flat.pm,v 1.6 2002/12/22 22:02:13 lstein Exp $
3 #
4 # BioPerl module for Bio::DB::Flat
5 #
6 # Cared for by Lincoln Stein <lstein@cshl.org>
7 #
8 # You may distribute this module under the same terms as perl itself
9
10 # POD documentation - main docs before the code
11
12 =head1 NAME
13
14 Bio::DB::Flat - Interface for indexed flat files
15
16 =head1 SYNOPSIS
17
18 $db = Bio::DB::Flat->new(-directory => '/usr/share/embl',
19 -format => 'embl',
20 -write_flag => 1);
21 $db->build_index('/usr/share/embl/primate.embl','/usr/share/embl/protists.embl');
22 $seq = $db->get_Seq_by_id('BUM');
23 @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
24 $raw = $db->fetch_raw('BUM');
25
26 =head1 DESCRIPTION
27
28 This object provides the basic mechanism to associate positions in
29 files with primary and secondary name spaces. Unlike
30 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
31 to work with the "flat index" and BerkeleyDB indexed flat file formats
32 worked out at the 2002 BioHackathon.
33
34 This object is a general front end to the underlying databases.
35
36 =head1 FEEDBACK
37
38 =head2 Mailing Lists
39
40 User feedback is an integral part of the evolution of this and other
41 Bioperl modules. Send your comments and suggestions preferably to one
42 of the Bioperl mailing lists. Your participation is much appreciated.
43
44 bioperl-l@bioperl.org - General discussion
45 http://bioperl.org/MailList.shtml - About the mailing lists
46
47 =head2 Reporting Bugs
48
49 Report bugs to the Bioperl bug tracking system to help us keep track
50 the bugs and their resolution. Bug reports can be submitted via
51 email or the web:
52
53 bioperl-bugs@bio.perl.org
54 http://bugzilla.bioperl.org/
55
56 =head1 AUTHOR - Lincoln Stein
57
58 Email - lstein@cshl.org
59
60 =head1 APPENDIX
61
62 The rest of the documentation details each of the object methods. Internal
63 methods are usually preceded with an "_" (underscore).
64
65 =cut
66
67
68 # Let the code begin...
69 package Bio::DB::Flat;
70
71 use Bio::DB::RandomAccessI;
72 use Bio::Root::Root;
73 use Bio::Root::IO;
74 use vars '@ISA';
75
76 @ISA = qw(Bio::Root::Root Bio::DB::RandomAccessI);
77
78 use constant CONFIG_FILE_NAME => 'config.dat';
79
80 =head2 new
81
82 Title : new
83 Usage : my $db = new Bio::Flat->new(
84 -directory => $root_directory,
85 -write_flag => 0,
86 -index => 'bdb'|'flat',
87 -verbose => 0,
88 -out => 'outputfile',
89 -format => 'genbank');
90 Function: create a new Bio::Index::BDB object
91 Returns : new Bio::Index::BDB object
92 Args : -directory Root directory containing "config.dat"
93 -write_flag If true, allows reindexing.
94 -verbose Verbose messages
95 -out File to write to when write_seq invoked
96 Status : Public
97
98 The root -directory indicates where the flat file indexes will be
99 stored. The build_index() and write_seq() methods will automatically
100 create a human-readable configuration file named "config.dat" in this
101 file.
102
103 The -write_flag enables writing new entries into the database as well
104 as the creation of the indexes. By default the indexes will be opened
105 read only.
106
107 -index is one of "bdb" or "flat" and indicates the type of index to
108 generate. "bdb" corresponds to Berkeley DB. You *must* be using
109 BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB extension
110 installed (DB_File will *not* work).
111
112 The -out argument species the output file for writing objects created
113 with write_seq().
114
115 =cut
116
117 sub new {
118 my $class = shift;
119 $class = ref($class) if ref($class);
120 my $self = $class->SUPER::new(@_);
121
122 # first we initialize ourselves
123 my ($flat_directory) = @_ == 1 ? shift
124 : $self->_rearrange([qw(DIRECTORY)],@_);
125
126 # set values from configuration file
127 $self->directory($flat_directory);
128 $self->_read_config() if -e $flat_directory;
129
130 # but override with initialization values
131 $self->_initialize(@_);
132
133 # now we figure out what subclass to instantiate
134 my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ? 'BDB'
135 :$self->indexing_scheme eq 'flat/1' ? 'Flat'
136 :$self->throw("unknown indexing scheme: ".$self->indexing_scheme);
137 my $format = $self->file_format;
138 my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format";
139 eval "use $child_class";
140 $self->throw($@) if $@;
141
142 # rebless & reinitialize with the new class
143 # (this prevents subclasses from forgetting to call our own initialization)
144 bless $self,$child_class;
145 $self->_initialize(@_);
146 $self->_set_namespaces(@_);
147
148 $self;
149 }
150
151 sub _initialize {
152 my $self = shift;
153
154 my ($flat_write_flag,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format)
155 = $self->_rearrange([qw(WRITE_FLAG INDEX VERBOSE OUT FORMAT)],@_);
156
157 $self->write_flag($flat_write_flag) if defined $flat_write_flag;
158
159 if (defined $flat_indexing) {
160 # very permissive
161 $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb/;
162 $flat_indexing = 'flat/1' if $flat_indexing =~ /flat/;
163 $self->indexing_scheme($flat_indexing);
164 }
165
166 $self->verbose($flat_verbose) if defined $flat_verbose;
167 $self->out_file($flat_outfile) if defined $flat_outfile;
168 $self->file_format($flat_format) if defined $flat_format;
169 }
170
171 sub _set_namespaces {
172 my $self = shift;
173
174 $self->primary_namespace($self->default_primary_namespace)
175 unless defined $self->{flat_primary_namespace};
176
177 $self->secondary_namespaces($self->default_secondary_namespaces)
178 unless defined $self->{flat_secondary_namespaces};
179
180 $self->file_format($self->default_file_format)
181 unless defined $self->{flat_format};
182 }
183
184 # accessors
185 sub directory {
186 my $self = shift;
187 my $d = $self->{flat_directory};
188 $self->{flat_directory} = shift if @_;
189 $d;
190 }
191 sub write_flag {
192 my $self = shift;
193 my $d = $self->{flat_write_flag};
194 $self->{flat_write_flag} = shift if @_;
195 $d;
196 }
197 sub verbose {
198 my $self = shift;
199 my $d = $self->{flat_verbose};
200 $self->{flat_verbose} = shift if @_;
201 $d;
202 }
203 sub out_file {
204 my $self = shift;
205 my $d = $self->{flat_outfile};
206 $self->{flat_outfile} = shift if @_;
207 $d;
208 }
209
210 sub primary_namespace {
211 my $self = shift;
212 my $d = $self->{flat_primary_namespace};
213 $self->{flat_primary_namespace} = shift if @_;
214 $d;
215 }
216
217 # get/set secondary namespace(s)
218 # pass an array ref.
219 # get an array ref in scalar context, list in list context.
220 sub secondary_namespaces {
221 my $self = shift;
222 my $d = $self->{flat_secondary_namespaces};
223 $self->{flat_secondary_namespaces} = (ref($_[0]) eq 'ARRAY' ? shift : [@_]) if @_;
224 return unless $d;
225 $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia
226 return wantarray ? @$d : $d;
227 }
228
229 # return the file format
230 sub file_format {
231 my $self = shift;
232 my $d = $self->{flat_format};
233 $self->{flat_format} = shift if @_;
234 $d;
235 }
236
237 # return the indexing scheme
238 sub indexing_scheme {
239 my $self = shift;
240 my $d = $self->{flat_indexing};
241 $self->{flat_indexing} = shift if @_;
242 $d;
243 }
244
245 sub add_flat_file {
246 my $self = shift;
247 my ($file_path,$file_length,$nf) = @_;
248
249 # check that file_path is absolute
250 File::Spec->file_name_is_absolute($file_path)
251 or $self->throw("the flat file path $file_path must be absolute");
252
253 -r $file_path or $self->throw("flat file $file_path cannot be read: $!");
254
255 my $current_size = -s _;
256 if (defined $file_length) {
257 $current_size == $file_length
258 or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size");
259 } else {
260 $file_length = $current_size;
261 }
262
263 unless (defined $nf) {
264 $self->{flat_file_index} = 0 unless exists $self->{flat_file_index};
265 $nf = $self->{flat_file_index}++;
266 }
267 $self->{flat_flat_file_path}{$nf} = $file_path;
268 $self->{flat_flat_file_no}{$file_path} = $nf;
269 $nf;
270 }
271
272 sub write_config {
273 my $self = shift;
274 $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set");
275 my $path = $self->_config_path;
276
277 open (F,">$path") or $self->throw("open error on $path: $!");
278
279 my $index_type = $self->indexing_scheme;
280 print F "index\t$index_type\n";
281
282 my $format = $self->file_format;
283 print F "format\t$format\n";
284
285 my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined");
286 for my $nf (@filenos) {
287 my $path = $self->{flat_flat_file_path}{$nf};
288 my $size = -s $path;
289 print F join("\t","fileid_$nf",$path,$size),"\n";
290 }
291
292 # write primary namespace
293 my $primary_ns = $self->primary_namespace
294 or $self->throw('cannot write config file because no primary namespace defined');
295
296 print F join("\t",'primary_namespace',$primary_ns),"\n";
297
298 # write secondary namespaces
299 my @secondary = $self->secondary_namespaces;
300 print F join("\t",'secondary_namespaces',@secondary),"\n";
301
302 close F or $self->throw("close error on $path: $!");
303 }
304
305 sub files {
306 my $self = shift;
307 return unless $self->{flat_flat_file_no};
308 return keys %{$self->{flat_flat_file_no}};
309 }
310
311 sub write_seq {
312 my $self = shift;
313 my $seq = shift;
314
315 $self->write_flag or $self->throw("cannot write sequences because write_flag is not set");
316
317 my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()');
318 my $seqio = $self->{flat_cached_parsers}{$file}
319 ||= Bio::SeqIO->new(-Format => $self->file_format,
320 -file => ">$file")
321 or $self->throw("couldn't create Bio::SeqIO object");
322
323 my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object");
324 my $offset = tell($fh);
325 $seqio->write_seq($seq);
326 my $length = tell($fh)-$offset;
327 my $ids = $self->seq_to_ids($seq);
328 $self->_store_index($ids,$file,$offset,$length);
329
330 $self->{flat_outfile_dirty}++;
331 }
332
333 sub close {
334 my $self = shift;
335 return unless $self->{flat_outfile_dirty};
336 $self->write_config;
337 delete $self->{flat_outfile_dirty};
338 delete $self->{flat_cached_parsers}{$self->out_file};
339 }
340
341
342 sub _filenos {
343 my $self = shift;
344 return unless $self->{flat_flat_file_path};
345 return keys %{$self->{flat_flat_file_path}};
346 }
347
348 # read the configuration file
349 sub _read_config {
350 my $self = shift;
351 my $config = shift;
352
353 my $path = defined $config ? Bio::Root::IO->catfile($config,CONFIG_FILE_NAME)
354 : $self->_config_path;
355 return unless -e $path;
356
357 open (F,$path) or $self->throw("open error on $path: $!");
358 my %config;
359 while (<F>) {
360 chomp;
361 my ($tag,@values) = split "\t";
362 $config{$tag} = \@values;
363 }
364 CORE::close F or $self->throw("close error on $path: $!");
365
366 $config{index}[0] =~ m~(flat/1|BerkeleyDB/1)~
367 or $self->throw("invalid configuration file $path: no index line");
368
369 $self->indexing_scheme($1);
370
371 $self->file_format($config{format}[0]) if $config{format};
372
373 # set up primary namespace
374 my $primary_namespace = $config{primary_namespace}[0]
375 or $self->throw("invalid configuration file $path: no primary namespace defined");
376 $self->primary_namespace($primary_namespace);
377
378 # set up secondary namespaces (may be empty)
379 $self->secondary_namespaces($config{secondary_namespaces});
380
381 # get file paths and their normalization information
382 my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config;
383 for my $nf (@normalized_files) {
384 my ($file_path,$file_length) = @{$config{"fileid_${nf}"}};
385 $self->add_flat_file($file_path,$file_length,$nf);
386 }
387 1;
388 }
389
390
391 sub _config_path {
392 my $self = shift;
393 $self->_catfile($self->_config_name);
394 }
395
396 sub _catfile {
397 my $self = shift;
398 my $component = shift;
399 Bio::Root::IO->catfile($self->directory,$component);
400 }
401
402 sub _config_name { CONFIG_FILE_NAME }
403
404 sub _path2fileno {
405 my $self = shift;
406 my $path = shift;
407 return $self->add_flat_file($path)
408 unless exists $self->{flat_flat_file_no}{$path};
409 $self->{flat_flat_file_no}{$path};
410 }
411
412 sub _fileno2path {
413 my $self = shift;
414 my $fileno = shift;
415 $self->{flat_flat_file_path}{$fileno};
416 }
417
418 sub _files {
419 my $self = shift;
420 my $paths = $self->{flat_flat_file_no};
421 return keys %$paths;
422 }
423
424 =head2 fetch
425
426 Title : fetch
427 Usage : $index->fetch( $id )
428 Function: Returns a Bio::Seq object from the index
429 Example : $seq = $index->fetch( 'dJ67B12' )
430 Returns : Bio::Seq object
431 Args : ID
432
433 Deprecated. Use get_Seq_by_id instead.
434
435 =cut
436
437 sub fetch { shift->get_Seq_by_id(@_) }
438
439
440 =head2 To Be Implemented in Subclasses
441
442 The following methods MUST be implemented by subclasses.
443
444 =cut
445
446 # create real live Bio::Seq object
447 sub get_Seq_by_id {
448 my $self = shift;
449 my $id = shift;
450 $self->throw_not_implemented;
451 }
452
453
454 # fetch array of Bio::Seq objects
455 sub get_Seq_by_acc {
456 my $self = shift;
457 return $self->get_Seq_by_id(shift) if @_ == 1;
458 my ($ns,$key) = @_;
459
460 $self->throw_not_implemented;
461 }
462
463 sub fetch_raw {
464 my ($self,$id,$namespace) = @_;
465 $self->throw_not_implemented;
466 }
467
468 # This is the method that must be implemented in
469 # child classes. It is passed a filehandle which should
470 # point to the next record to be indexed in the file,
471 # and returns a two element list
472 # consisting of a key and an adjustment value.
473 # The key can be a scalar, in which case it is treated
474 # as the primary ID, or a hashref containing namespace=>[id] pairs,
475 # one of which MUST correspond to the primary namespace.
476 # The adjustment value is normally zero, but can be a positive or
477 # negative integer which will be added to the current file position
478 # in order to calculate the correct end of the record.
479 sub parse_one_record {
480 my $self = shift;
481 my $fh = shift;
482 $self->throw_not_implemented;
483 # here's what you would implement
484 my (%keys,$offset);
485 return (\%keys,$offset);
486 }
487
488 sub default_file_format {
489 my $self = shift;
490 $self->throw_not_implemented;
491 }
492
493 sub _store_index {
494 my ($ids,$file,$offset,$length) = @_;
495 $self->throw_not_implemented;
496 }
497
498 =head2 May Be Overridden in Subclasses
499
500 The following methods MAY be overridden by subclasses.
501
502 =cut
503
504 sub default_primary_namespace {
505 return "ACC";
506 }
507
508 sub default_secondary_namespaces {
509 return;
510 }
511
512 sub seq_to_ids {
513 my $self = shift;
514 my $seq = shift;
515 my %ids;
516 $ids{$self->primary_namespace} = $seq->accession_number;
517 \%ids;
518 }
519
520 sub DESTROY {
521 my $self = shift;
522 $self->close;
523 }
524
525
526 1;