Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/DB/Flat.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # | |
2 # $Id: Flat.pm,v 1.6 2002/12/22 22:02:13 lstein Exp $ | |
3 # | |
4 # BioPerl module for Bio::DB::Flat | |
5 # | |
6 # Cared for by Lincoln Stein <lstein@cshl.org> | |
7 # | |
8 # You may distribute this module under the same terms as perl itself | |
9 | |
10 # POD documentation - main docs before the code | |
11 | |
12 =head1 NAME | |
13 | |
14 Bio::DB::Flat - Interface for indexed flat files | |
15 | |
16 =head1 SYNOPSIS | |
17 | |
18 $db = Bio::DB::Flat->new(-directory => '/usr/share/embl', | |
19 -format => 'embl', | |
20 -write_flag => 1); | |
21 $db->build_index('/usr/share/embl/primate.embl','/usr/share/embl/protists.embl'); | |
22 $seq = $db->get_Seq_by_id('BUM'); | |
23 @sequences = $db->get_Seq_by_acc('DIV' => 'primate'); | |
24 $raw = $db->fetch_raw('BUM'); | |
25 | |
26 =head1 DESCRIPTION | |
27 | |
28 This object provides the basic mechanism to associate positions in | |
29 files with primary and secondary name spaces. Unlike | |
30 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized | |
31 to work with the "flat index" and BerkeleyDB indexed flat file formats | |
32 worked out at the 2002 BioHackathon. | |
33 | |
34 This object is a general front end to the underlying databases. | |
35 | |
36 =head1 FEEDBACK | |
37 | |
38 =head2 Mailing Lists | |
39 | |
40 User feedback is an integral part of the evolution of this and other | |
41 Bioperl modules. Send your comments and suggestions preferably to one | |
42 of the Bioperl mailing lists. Your participation is much appreciated. | |
43 | |
44 bioperl-l@bioperl.org - General discussion | |
45 http://bioperl.org/MailList.shtml - About the mailing lists | |
46 | |
47 =head2 Reporting Bugs | |
48 | |
49 Report bugs to the Bioperl bug tracking system to help us keep track | |
50 the bugs and their resolution. Bug reports can be submitted via | |
51 email or the web: | |
52 | |
53 bioperl-bugs@bio.perl.org | |
54 http://bugzilla.bioperl.org/ | |
55 | |
56 =head1 AUTHOR - Lincoln Stein | |
57 | |
58 Email - lstein@cshl.org | |
59 | |
60 =head1 APPENDIX | |
61 | |
62 The rest of the documentation details each of the object methods. Internal | |
63 methods are usually preceded with an "_" (underscore). | |
64 | |
65 =cut | |
66 | |
67 | |
68 # Let the code begin... | |
69 package Bio::DB::Flat; | |
70 | |
71 use Bio::DB::RandomAccessI; | |
72 use Bio::Root::Root; | |
73 use Bio::Root::IO; | |
74 use vars '@ISA'; | |
75 | |
76 @ISA = qw(Bio::Root::Root Bio::DB::RandomAccessI); | |
77 | |
78 use constant CONFIG_FILE_NAME => 'config.dat'; | |
79 | |
80 =head2 new | |
81 | |
82 Title : new | |
83 Usage : my $db = new Bio::Flat->new( | |
84 -directory => $root_directory, | |
85 -write_flag => 0, | |
86 -index => 'bdb'|'flat', | |
87 -verbose => 0, | |
88 -out => 'outputfile', | |
89 -format => 'genbank'); | |
90 Function: create a new Bio::Index::BDB object | |
91 Returns : new Bio::Index::BDB object | |
92 Args : -directory Root directory containing "config.dat" | |
93 -write_flag If true, allows reindexing. | |
94 -verbose Verbose messages | |
95 -out File to write to when write_seq invoked | |
96 Status : Public | |
97 | |
98 The root -directory indicates where the flat file indexes will be | |
99 stored. The build_index() and write_seq() methods will automatically | |
100 create a human-readable configuration file named "config.dat" in this | |
101 file. | |
102 | |
103 The -write_flag enables writing new entries into the database as well | |
104 as the creation of the indexes. By default the indexes will be opened | |
105 read only. | |
106 | |
107 -index is one of "bdb" or "flat" and indicates the type of index to | |
108 generate. "bdb" corresponds to Berkeley DB. You *must* be using | |
109 BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB extension | |
110 installed (DB_File will *not* work). | |
111 | |
112 The -out argument species the output file for writing objects created | |
113 with write_seq(). | |
114 | |
115 =cut | |
116 | |
117 sub new { | |
118 my $class = shift; | |
119 $class = ref($class) if ref($class); | |
120 my $self = $class->SUPER::new(@_); | |
121 | |
122 # first we initialize ourselves | |
123 my ($flat_directory) = @_ == 1 ? shift | |
124 : $self->_rearrange([qw(DIRECTORY)],@_); | |
125 | |
126 # set values from configuration file | |
127 $self->directory($flat_directory); | |
128 $self->_read_config() if -e $flat_directory; | |
129 | |
130 # but override with initialization values | |
131 $self->_initialize(@_); | |
132 | |
133 # now we figure out what subclass to instantiate | |
134 my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ? 'BDB' | |
135 :$self->indexing_scheme eq 'flat/1' ? 'Flat' | |
136 :$self->throw("unknown indexing scheme: ".$self->indexing_scheme); | |
137 my $format = $self->file_format; | |
138 my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format"; | |
139 eval "use $child_class"; | |
140 $self->throw($@) if $@; | |
141 | |
142 # rebless & reinitialize with the new class | |
143 # (this prevents subclasses from forgetting to call our own initialization) | |
144 bless $self,$child_class; | |
145 $self->_initialize(@_); | |
146 $self->_set_namespaces(@_); | |
147 | |
148 $self; | |
149 } | |
150 | |
151 sub _initialize { | |
152 my $self = shift; | |
153 | |
154 my ($flat_write_flag,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format) | |
155 = $self->_rearrange([qw(WRITE_FLAG INDEX VERBOSE OUT FORMAT)],@_); | |
156 | |
157 $self->write_flag($flat_write_flag) if defined $flat_write_flag; | |
158 | |
159 if (defined $flat_indexing) { | |
160 # very permissive | |
161 $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb/; | |
162 $flat_indexing = 'flat/1' if $flat_indexing =~ /flat/; | |
163 $self->indexing_scheme($flat_indexing); | |
164 } | |
165 | |
166 $self->verbose($flat_verbose) if defined $flat_verbose; | |
167 $self->out_file($flat_outfile) if defined $flat_outfile; | |
168 $self->file_format($flat_format) if defined $flat_format; | |
169 } | |
170 | |
171 sub _set_namespaces { | |
172 my $self = shift; | |
173 | |
174 $self->primary_namespace($self->default_primary_namespace) | |
175 unless defined $self->{flat_primary_namespace}; | |
176 | |
177 $self->secondary_namespaces($self->default_secondary_namespaces) | |
178 unless defined $self->{flat_secondary_namespaces}; | |
179 | |
180 $self->file_format($self->default_file_format) | |
181 unless defined $self->{flat_format}; | |
182 } | |
183 | |
184 # accessors | |
185 sub directory { | |
186 my $self = shift; | |
187 my $d = $self->{flat_directory}; | |
188 $self->{flat_directory} = shift if @_; | |
189 $d; | |
190 } | |
191 sub write_flag { | |
192 my $self = shift; | |
193 my $d = $self->{flat_write_flag}; | |
194 $self->{flat_write_flag} = shift if @_; | |
195 $d; | |
196 } | |
197 sub verbose { | |
198 my $self = shift; | |
199 my $d = $self->{flat_verbose}; | |
200 $self->{flat_verbose} = shift if @_; | |
201 $d; | |
202 } | |
203 sub out_file { | |
204 my $self = shift; | |
205 my $d = $self->{flat_outfile}; | |
206 $self->{flat_outfile} = shift if @_; | |
207 $d; | |
208 } | |
209 | |
210 sub primary_namespace { | |
211 my $self = shift; | |
212 my $d = $self->{flat_primary_namespace}; | |
213 $self->{flat_primary_namespace} = shift if @_; | |
214 $d; | |
215 } | |
216 | |
217 # get/set secondary namespace(s) | |
218 # pass an array ref. | |
219 # get an array ref in scalar context, list in list context. | |
220 sub secondary_namespaces { | |
221 my $self = shift; | |
222 my $d = $self->{flat_secondary_namespaces}; | |
223 $self->{flat_secondary_namespaces} = (ref($_[0]) eq 'ARRAY' ? shift : [@_]) if @_; | |
224 return unless $d; | |
225 $d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia | |
226 return wantarray ? @$d : $d; | |
227 } | |
228 | |
229 # return the file format | |
230 sub file_format { | |
231 my $self = shift; | |
232 my $d = $self->{flat_format}; | |
233 $self->{flat_format} = shift if @_; | |
234 $d; | |
235 } | |
236 | |
237 # return the indexing scheme | |
238 sub indexing_scheme { | |
239 my $self = shift; | |
240 my $d = $self->{flat_indexing}; | |
241 $self->{flat_indexing} = shift if @_; | |
242 $d; | |
243 } | |
244 | |
245 sub add_flat_file { | |
246 my $self = shift; | |
247 my ($file_path,$file_length,$nf) = @_; | |
248 | |
249 # check that file_path is absolute | |
250 File::Spec->file_name_is_absolute($file_path) | |
251 or $self->throw("the flat file path $file_path must be absolute"); | |
252 | |
253 -r $file_path or $self->throw("flat file $file_path cannot be read: $!"); | |
254 | |
255 my $current_size = -s _; | |
256 if (defined $file_length) { | |
257 $current_size == $file_length | |
258 or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size"); | |
259 } else { | |
260 $file_length = $current_size; | |
261 } | |
262 | |
263 unless (defined $nf) { | |
264 $self->{flat_file_index} = 0 unless exists $self->{flat_file_index}; | |
265 $nf = $self->{flat_file_index}++; | |
266 } | |
267 $self->{flat_flat_file_path}{$nf} = $file_path; | |
268 $self->{flat_flat_file_no}{$file_path} = $nf; | |
269 $nf; | |
270 } | |
271 | |
272 sub write_config { | |
273 my $self = shift; | |
274 $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set"); | |
275 my $path = $self->_config_path; | |
276 | |
277 open (F,">$path") or $self->throw("open error on $path: $!"); | |
278 | |
279 my $index_type = $self->indexing_scheme; | |
280 print F "index\t$index_type\n"; | |
281 | |
282 my $format = $self->file_format; | |
283 print F "format\t$format\n"; | |
284 | |
285 my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined"); | |
286 for my $nf (@filenos) { | |
287 my $path = $self->{flat_flat_file_path}{$nf}; | |
288 my $size = -s $path; | |
289 print F join("\t","fileid_$nf",$path,$size),"\n"; | |
290 } | |
291 | |
292 # write primary namespace | |
293 my $primary_ns = $self->primary_namespace | |
294 or $self->throw('cannot write config file because no primary namespace defined'); | |
295 | |
296 print F join("\t",'primary_namespace',$primary_ns),"\n"; | |
297 | |
298 # write secondary namespaces | |
299 my @secondary = $self->secondary_namespaces; | |
300 print F join("\t",'secondary_namespaces',@secondary),"\n"; | |
301 | |
302 close F or $self->throw("close error on $path: $!"); | |
303 } | |
304 | |
305 sub files { | |
306 my $self = shift; | |
307 return unless $self->{flat_flat_file_no}; | |
308 return keys %{$self->{flat_flat_file_no}}; | |
309 } | |
310 | |
311 sub write_seq { | |
312 my $self = shift; | |
313 my $seq = shift; | |
314 | |
315 $self->write_flag or $self->throw("cannot write sequences because write_flag is not set"); | |
316 | |
317 my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()'); | |
318 my $seqio = $self->{flat_cached_parsers}{$file} | |
319 ||= Bio::SeqIO->new(-Format => $self->file_format, | |
320 -file => ">$file") | |
321 or $self->throw("couldn't create Bio::SeqIO object"); | |
322 | |
323 my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object"); | |
324 my $offset = tell($fh); | |
325 $seqio->write_seq($seq); | |
326 my $length = tell($fh)-$offset; | |
327 my $ids = $self->seq_to_ids($seq); | |
328 $self->_store_index($ids,$file,$offset,$length); | |
329 | |
330 $self->{flat_outfile_dirty}++; | |
331 } | |
332 | |
333 sub close { | |
334 my $self = shift; | |
335 return unless $self->{flat_outfile_dirty}; | |
336 $self->write_config; | |
337 delete $self->{flat_outfile_dirty}; | |
338 delete $self->{flat_cached_parsers}{$self->out_file}; | |
339 } | |
340 | |
341 | |
342 sub _filenos { | |
343 my $self = shift; | |
344 return unless $self->{flat_flat_file_path}; | |
345 return keys %{$self->{flat_flat_file_path}}; | |
346 } | |
347 | |
348 # read the configuration file | |
349 sub _read_config { | |
350 my $self = shift; | |
351 my $config = shift; | |
352 | |
353 my $path = defined $config ? Bio::Root::IO->catfile($config,CONFIG_FILE_NAME) | |
354 : $self->_config_path; | |
355 return unless -e $path; | |
356 | |
357 open (F,$path) or $self->throw("open error on $path: $!"); | |
358 my %config; | |
359 while (<F>) { | |
360 chomp; | |
361 my ($tag,@values) = split "\t"; | |
362 $config{$tag} = \@values; | |
363 } | |
364 CORE::close F or $self->throw("close error on $path: $!"); | |
365 | |
366 $config{index}[0] =~ m~(flat/1|BerkeleyDB/1)~ | |
367 or $self->throw("invalid configuration file $path: no index line"); | |
368 | |
369 $self->indexing_scheme($1); | |
370 | |
371 $self->file_format($config{format}[0]) if $config{format}; | |
372 | |
373 # set up primary namespace | |
374 my $primary_namespace = $config{primary_namespace}[0] | |
375 or $self->throw("invalid configuration file $path: no primary namespace defined"); | |
376 $self->primary_namespace($primary_namespace); | |
377 | |
378 # set up secondary namespaces (may be empty) | |
379 $self->secondary_namespaces($config{secondary_namespaces}); | |
380 | |
381 # get file paths and their normalization information | |
382 my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config; | |
383 for my $nf (@normalized_files) { | |
384 my ($file_path,$file_length) = @{$config{"fileid_${nf}"}}; | |
385 $self->add_flat_file($file_path,$file_length,$nf); | |
386 } | |
387 1; | |
388 } | |
389 | |
390 | |
391 sub _config_path { | |
392 my $self = shift; | |
393 $self->_catfile($self->_config_name); | |
394 } | |
395 | |
396 sub _catfile { | |
397 my $self = shift; | |
398 my $component = shift; | |
399 Bio::Root::IO->catfile($self->directory,$component); | |
400 } | |
401 | |
402 sub _config_name { CONFIG_FILE_NAME } | |
403 | |
404 sub _path2fileno { | |
405 my $self = shift; | |
406 my $path = shift; | |
407 return $self->add_flat_file($path) | |
408 unless exists $self->{flat_flat_file_no}{$path}; | |
409 $self->{flat_flat_file_no}{$path}; | |
410 } | |
411 | |
412 sub _fileno2path { | |
413 my $self = shift; | |
414 my $fileno = shift; | |
415 $self->{flat_flat_file_path}{$fileno}; | |
416 } | |
417 | |
418 sub _files { | |
419 my $self = shift; | |
420 my $paths = $self->{flat_flat_file_no}; | |
421 return keys %$paths; | |
422 } | |
423 | |
424 =head2 fetch | |
425 | |
426 Title : fetch | |
427 Usage : $index->fetch( $id ) | |
428 Function: Returns a Bio::Seq object from the index | |
429 Example : $seq = $index->fetch( 'dJ67B12' ) | |
430 Returns : Bio::Seq object | |
431 Args : ID | |
432 | |
433 Deprecated. Use get_Seq_by_id instead. | |
434 | |
435 =cut | |
436 | |
437 sub fetch { shift->get_Seq_by_id(@_) } | |
438 | |
439 | |
440 =head2 To Be Implemented in Subclasses | |
441 | |
442 The following methods MUST be implemented by subclasses. | |
443 | |
444 =cut | |
445 | |
446 # create real live Bio::Seq object | |
447 sub get_Seq_by_id { | |
448 my $self = shift; | |
449 my $id = shift; | |
450 $self->throw_not_implemented; | |
451 } | |
452 | |
453 | |
454 # fetch array of Bio::Seq objects | |
455 sub get_Seq_by_acc { | |
456 my $self = shift; | |
457 return $self->get_Seq_by_id(shift) if @_ == 1; | |
458 my ($ns,$key) = @_; | |
459 | |
460 $self->throw_not_implemented; | |
461 } | |
462 | |
463 sub fetch_raw { | |
464 my ($self,$id,$namespace) = @_; | |
465 $self->throw_not_implemented; | |
466 } | |
467 | |
468 # This is the method that must be implemented in | |
469 # child classes. It is passed a filehandle which should | |
470 # point to the next record to be indexed in the file, | |
471 # and returns a two element list | |
472 # consisting of a key and an adjustment value. | |
473 # The key can be a scalar, in which case it is treated | |
474 # as the primary ID, or a hashref containing namespace=>[id] pairs, | |
475 # one of which MUST correspond to the primary namespace. | |
476 # The adjustment value is normally zero, but can be a positive or | |
477 # negative integer which will be added to the current file position | |
478 # in order to calculate the correct end of the record. | |
479 sub parse_one_record { | |
480 my $self = shift; | |
481 my $fh = shift; | |
482 $self->throw_not_implemented; | |
483 # here's what you would implement | |
484 my (%keys,$offset); | |
485 return (\%keys,$offset); | |
486 } | |
487 | |
488 sub default_file_format { | |
489 my $self = shift; | |
490 $self->throw_not_implemented; | |
491 } | |
492 | |
493 sub _store_index { | |
494 my ($ids,$file,$offset,$length) = @_; | |
495 $self->throw_not_implemented; | |
496 } | |
497 | |
498 =head2 May Be Overridden in Subclasses | |
499 | |
500 The following methods MAY be overridden by subclasses. | |
501 | |
502 =cut | |
503 | |
504 sub default_primary_namespace { | |
505 return "ACC"; | |
506 } | |
507 | |
508 sub default_secondary_namespaces { | |
509 return; | |
510 } | |
511 | |
512 sub seq_to_ids { | |
513 my $self = shift; | |
514 my $seq = shift; | |
515 my %ids; | |
516 $ids{$self->primary_namespace} = $seq->accession_number; | |
517 \%ids; | |
518 } | |
519 | |
520 sub DESTROY { | |
521 my $self = shift; | |
522 $self->close; | |
523 } | |
524 | |
525 | |
526 1; |