Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/Flat.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # | |
| 2 # $Id: Flat.pm,v 1.6 2002/12/22 22:02:13 lstein Exp $ | |
| 3 # | |
| 4 # BioPerl module for Bio::DB::Flat | |
| 5 # | |
| 6 # Cared for by Lincoln Stein <lstein@cshl.org> | |
| 7 # | |
| 8 # You may distribute this module under the same terms as perl itself | |
| 9 | |
| 10 # POD documentation - main docs before the code | |
| 11 | |
| 12 =head1 NAME | |
| 13 | |
| 14 Bio::DB::Flat - Interface for indexed flat files | |
| 15 | |
| 16 =head1 SYNOPSIS | |
| 17 | |
| 18 $db = Bio::DB::Flat->new(-directory => '/usr/share/embl', | |
| 19 -format => 'embl', | |
| 20 -write_flag => 1); | |
| 21 $db->build_index('/usr/share/embl/primate.embl','/usr/share/embl/protists.embl'); | |
| 22 $seq = $db->get_Seq_by_id('BUM'); | |
| 23 @sequences = $db->get_Seq_by_acc('DIV' => 'primate'); | |
| 24 $raw = $db->fetch_raw('BUM'); | |
| 25 | |
| 26 =head1 DESCRIPTION | |
| 27 | |
| 28 This object provides the basic mechanism to associate positions in | |
| 29 files with primary and secondary name spaces. Unlike | |
| 30 Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized | |
| 31 to work with the "flat index" and BerkeleyDB indexed flat file formats | |
| 32 worked out at the 2002 BioHackathon. | |
| 33 | |
| 34 This object is a general front end to the underlying databases. | |
| 35 | |
| 36 =head1 FEEDBACK | |
| 37 | |
| 38 =head2 Mailing Lists | |
| 39 | |
| 40 User feedback is an integral part of the evolution of this and other | |
| 41 Bioperl modules. Send your comments and suggestions preferably to one | |
| 42 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 43 | |
| 44 bioperl-l@bioperl.org - General discussion | |
| 45 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 46 | |
| 47 =head2 Reporting Bugs | |
| 48 | |
| 49 Report bugs to the Bioperl bug tracking system to help us keep track of | |
| 50 the bugs and their resolution. Bug reports can be submitted via | |
| 51 email or the web: | |
| 52 | |
| 53 bioperl-bugs@bio.perl.org | |
| 54 http://bugzilla.bioperl.org/ | |
| 55 | |
| 56 =head1 AUTHOR - Lincoln Stein | |
| 57 | |
| 58 Email - lstein@cshl.org | |
| 59 | |
| 60 =head1 APPENDIX | |
| 61 | |
| 62 The rest of the documentation details each of the object methods. Internal | |
| 63 methods are usually preceded with an "_" (underscore). | |
| 64 | |
| 65 =cut | |
| 66 | |
| 67 | |
| 68 # Let the code begin... | |
| 69 package Bio::DB::Flat; | |
| 70 | |
| 71 use Bio::DB::RandomAccessI; | |
| 72 use Bio::Root::Root; | |
| 73 use Bio::Root::IO; | |
| 74 use vars '@ISA'; | |
| 75 | |
| 76 @ISA = qw(Bio::Root::Root Bio::DB::RandomAccessI); | |
| 77 | |
| 78 use constant CONFIG_FILE_NAME => 'config.dat'; | |
| 79 | |
| 80 =head2 new | |
| 81 | |
| 82 Title : new | |
| 83 Usage : my $db = Bio::DB::Flat->new( | |
| 84 -directory => $root_directory, | |
| 85 -write_flag => 0, | |
| 86 -index => 'bdb'|'flat', | |
| 87 -verbose => 0, | |
| 88 -out => 'outputfile', | |
| 89 -format => 'genbank'); | |
| 90 Function: create a new Bio::DB::Flat object | |
| 91 Returns : new Bio::DB::Flat object (blessed into an index/format-specific subclass) | |
| 92 Args : -directory Root directory containing "config.dat" | |
| 93 -write_flag If true, allows reindexing. | |
| 94 -verbose Verbose messages | |
| 95 -out File to write to when write_seq invoked | |
| 96 Status : Public | |
| 97 | |
| 98 The root -directory indicates where the flat file indexes will be | |
| 99 stored. The build_index() and write_seq() methods will automatically | |
| 100 create a human-readable configuration file named "config.dat" in this | |
| 101 directory. | |
| 102 | |
| 103 The -write_flag enables writing new entries into the database as well | |
| 104 as the creation of the indexes. By default the indexes will be opened | |
| 105 read only. | |
| 106 | |
| 107 -index is one of "bdb" or "flat" and indicates the type of index to | |
| 108 generate. "bdb" corresponds to Berkeley DB. You *must* be using | |
| 109 BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB extension | |
| 110 installed (DB_File will *not* work). | |
| 111 | |
| 112 The -out argument specifies the output file for writing objects created | |
| 113 with write_seq(). | |
| 114 | |
| 115 =cut | |
| 116 | |
# Constructor.  Builds a generic object, loads any existing configuration
# from the index directory, applies the caller's arguments on top, and then
# reblesses the object into the concrete class
#   Bio::DB::Flat::<BDB|Flat>::<lowercased format>
# chosen from the indexing scheme and file format.
#
# Args   : either a single directory path, or the named arguments described
#          in the POD for new() (-directory, -write_flag, -index, ...).
# Returns: the object, blessed into the selected child class.
# Throws : if the indexing scheme is unknown, if no usable file format is
#          known, or if the child class cannot be loaded.
sub new {
    my $class = shift;
    $class = ref($class) if ref($class);
    my $self = $class->SUPER::new(@_);

    # first we initialize ourselves
    my ($flat_directory) = @_ == 1 ? shift
                                   : $self->_rearrange([qw(DIRECTORY)],@_);

    # set values from configuration file
    $self->directory($flat_directory);
    $self->_read_config() if defined $flat_directory && -e $flat_directory;

    # but override with initialization values
    $self->_initialize(@_);

    # now we figure out what subclass to instantiate
    my $scheme = $self->indexing_scheme;
    $scheme = '' unless defined $scheme;
    my $index_type = $scheme eq 'BerkeleyDB/1' ? 'BDB'
                   : $scheme eq 'flat/1'       ? 'Flat'
                   : $self->throw("unknown indexing scheme: $scheme");

    # The format ends up inside a string eval below, so insist that it looks
    # like a (possibly nested) package name component before using it.
    my $format = $self->file_format;
    defined $format && $format =~ /\A\w+(?:::\w+)*\z/
        or $self->throw("invalid or missing file format: "
                        . (defined $format ? "'$format'" : 'undef'));

    my $child_class = 'Bio::DB::Flat::' . $index_type . '::' . lc $format;
    eval "use $child_class";
    $self->throw($@) if $@;

    # rebless & reinitialize with the new class
    # (this prevents subclasses from forgetting to call our own initialization)
    bless $self,$child_class;
    $self->_initialize(@_);
    $self->_set_namespaces(@_);

    $self;
}
| 150 | |
# Apply the named constructor arguments (-write_flag, -index, -verbose,
# -out, -format) to the object.  Arguments that were not supplied leave the
# corresponding attribute untouched.
sub _initialize {
    my ($self, @args) = @_;

    my ($write_flag, $scheme, $verbosity, $outfile, $format)
        = $self->_rearrange([qw(WRITE_FLAG INDEX VERBOSE OUT FORMAT)], @args);

    if (defined $write_flag) {
        $self->write_flag($write_flag);
    }

    if (defined $scheme) {
        # very permissive: anything mentioning "bdb" or "flat" is mapped to
        # the canonical scheme name
        if ($scheme =~ /bdb/) {
            $scheme = 'BerkeleyDB/1';
        }
        if ($scheme =~ /flat/) {
            $scheme = 'flat/1';
        }
        $self->indexing_scheme($scheme);
    }

    if (defined $verbosity) {
        $self->verbose($verbosity);
    }
    if (defined $outfile) {
        $self->out_file($outfile);
    }
    $self->file_format($format) if defined $format;
}
| 170 | |
# Fill in the primary namespace, secondary namespaces and file format from
# the class defaults, but only for those that have not been set already.
sub _set_namespaces {
    my ($self) = @_;

    if (!defined $self->{flat_primary_namespace}) {
        $self->primary_namespace($self->default_primary_namespace);
    }

    if (!defined $self->{flat_secondary_namespaces}) {
        $self->secondary_namespaces($self->default_secondary_namespaces);
    }

    $self->file_format($self->default_file_format)
        unless defined $self->{flat_format};
}
| 183 | |
| 184 # accessors | |
# Get/set the root directory of the index.
# Returns the value held *before* the call, even when a new one is supplied.
sub directory {
    my ($self, @new_value) = @_;
    my $previous = $self->{flat_directory};
    if (@new_value) {
        $self->{flat_directory} = $new_value[0];
    }
    return $previous;
}
# Get/set the flag that permits writing to the index.
# Returns the value held *before* the call, even when a new one is supplied.
sub write_flag {
    my ($self, @new_value) = @_;
    my $previous = $self->{flat_write_flag};
    if (@new_value) {
        $self->{flat_write_flag} = $new_value[0];
    }
    return $previous;
}
# Get/set the verbosity setting stored on this object.
# Returns the value held *before* the call, even when a new one is supplied.
sub verbose {
    my ($self, @new_value) = @_;
    my $previous = $self->{flat_verbose};
    if (@new_value) {
        $self->{flat_verbose} = $new_value[0];
    }
    return $previous;
}
# Get/set the path of the file that write_seq() writes to.
# Returns the value held *before* the call, even when a new one is supplied.
sub out_file {
    my ($self, @new_value) = @_;
    my $previous = $self->{flat_outfile};
    if (@new_value) {
        $self->{flat_outfile} = $new_value[0];
    }
    return $previous;
}
| 209 | |
# Get/set the name of the primary namespace.
# Returns the value held *before* the call, even when a new one is supplied.
sub primary_namespace {
    my ($self, @new_value) = @_;
    my $previous = $self->{flat_primary_namespace};
    if (@new_value) {
        $self->{flat_primary_namespace} = $new_value[0];
    }
    return $previous;
}
| 216 | |
# Get/set the secondary namespace(s).
# Setting: pass either one array ref or a plain list of names.
# Getting: returns nothing when no namespaces were stored; otherwise an
#          array ref in scalar context and a list in list context.
# As with the other accessors, the value returned is the one held *before*
# any new value was stored.
sub secondary_namespaces {
    my ($self, @names) = @_;
    my $previous = $self->{flat_secondary_namespaces};

    if (@names) {
        $self->{flat_secondary_namespaces}
            = ref($names[0]) eq 'ARRAY' ? $names[0] : [@names];
    }

    return unless $previous;

    # just paranoia: tolerate a bare scalar stored directly in the slot
    unless (ref($previous) eq 'ARRAY') {
        $previous = [$previous];
    }
    return wantarray ? @$previous : $previous;
}
| 228 | |
# Get/set the file format name.
# Returns the value held *before* the call, even when a new one is supplied.
sub file_format {
    my ($self, @new_value) = @_;
    my $previous = $self->{flat_format};
    if (@new_value) {
        $self->{flat_format} = $new_value[0];
    }
    return $previous;
}
| 236 | |
# Get/set the indexing scheme name.
# Returns the value held *before* the call, even when a new one is supplied.
sub indexing_scheme {
    my ($self, @new_value) = @_;
    my $previous = $self->{flat_indexing};
    if (@new_value) {
        $self->{flat_indexing} = $new_value[0];
    }
    return $previous;
}
| 244 | |
# Register a flat file with this index and return its file number.
#
# Args   : $file_path   - absolute path of a readable flat file
#          $file_length - (optional) expected size in bytes; when given, the
#                         file's current size must match it
#          $nf          - (optional) file number to register the path under;
#                         when omitted the next unused number is assigned
# Returns: the file number the path was registered under
# Throws : if the path is not absolute, the file is not readable, or its
#          size differs from $file_length
sub add_flat_file {
    my $self = shift;
    my ($file_path,$file_length,$nf) = @_;

    # This file never loads File::Spec itself; do not rely on another module
    # having pulled it in.
    require File::Spec;

    # check that file_path is absolute
    File::Spec->file_name_is_absolute($file_path)
        or $self->throw("the flat file path $file_path must be absolute");

    -r $file_path or $self->throw("flat file $file_path cannot be read: $!");

    # "_" reuses the stat buffer filled by the -r test above
    my $current_size = -s _;
    if (defined $file_length) {
        $current_size == $file_length
            or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size");
    }
    else {
        $file_length = $current_size;
    }

    unless (defined $nf) {
        $self->{flat_file_index} = 0 unless exists $self->{flat_file_index};
        # Skip numbers that were registered explicitly (e.g. from the
        # configuration file) so that an existing entry is never overwritten.
        do {
            $nf = $self->{flat_file_index}++;
        } while (exists $self->{flat_flat_file_path}{$nf});
    }
    $self->{flat_flat_file_path}{$nf}      = $file_path;
    $self->{flat_flat_file_no}{$file_path} = $nf;
    $nf;
}
| 271 | |
# Write the configuration file (index type, file format, registered flat
# files with their sizes, primary and secondary namespaces) into the index
# directory.
#
# Everything that can fail validation is gathered *before* the file is
# opened, so that an error does not leave a truncated configuration behind.
#
# Returns: true on success
# Throws : if the write flag is not set, no flat files are registered, no
#          primary namespace is defined, or the file cannot be opened/closed
sub write_config {
    my $self = shift;
    $self->write_flag
        or $self->throw("cannot write configuration file because write_flag is not set");
    my $path = $self->_config_path;

    my @filenos = $self->_filenos
        or $self->throw("cannot write config file because no flat files defined");

    my $primary_ns = $self->primary_namespace
        or $self->throw('cannot write config file because no primary namespace defined');

    my @secondary  = $self->secondary_namespaces;
    my $index_type = $self->indexing_scheme;
    my $format     = $self->file_format;

    open(my $fh, '>', $path) or $self->throw("open error on $path: $!");

    print {$fh} "index\t$index_type\n";
    print {$fh} "format\t$format\n";

    for my $nf (@filenos) {
        my $flat_path = $self->{flat_flat_file_path}{$nf};
        my $size      = -s $flat_path;
        print {$fh} join("\t","fileid_$nf",$flat_path,$size),"\n";
    }

    # write primary namespace
    print {$fh} join("\t",'primary_namespace',$primary_ns),"\n";

    # write secondary namespaces
    print {$fh} join("\t",'secondary_namespaces',@secondary),"\n";

    # qualified because this package defines its own close() method
    CORE::close($fh) or $self->throw("close error on $path: $!");
}
| 304 | |
# Return the paths of all registered flat files (in no particular order).
# Returns nothing when no file has been registered yet.
sub files {
    my ($self) = @_;
    my $number_of = $self->{flat_flat_file_no} or return;
    return keys %{$number_of};
}
| 310 | |
# Append a sequence object to the output file and index it.
# The Bio::SeqIO writer for the output file is created on first use and
# cached on the object.  The record's byte offset and length are measured
# with tell() on the writer's filehandle around the write.
#
# Throws : if the write flag is not set, no output file is defined, the
#          writer cannot be created, or its filehandle cannot be obtained
sub write_seq {
    my ($self, $seq) = @_;

    $self->write_flag
        or $self->throw("cannot write sequences because write_flag is not set");

    my $file = $self->out_file
        or $self->throw('no outfile defined; use the -out argument to new()');

    my $seqio = $self->{flat_cached_parsers}{$file};
    unless ($seqio) {
        $seqio = $self->{flat_cached_parsers}{$file}
            = Bio::SeqIO->new(-Format => $self->file_format,
                              -file   => ">$file");
    }
    $seqio or $self->throw("couldn't create Bio::SeqIO object");

    my $fh = $seqio->_fh
        or $self->throw("couldn't get filehandle from Bio::SeqIO object");

    my $offset = tell($fh);
    $seqio->write_seq($seq);
    my $length = tell($fh) - $offset;

    my $ids = $self->seq_to_ids($seq);
    $self->_store_index($ids, $file, $offset, $length);

    # remember that close() has a configuration file to write
    $self->{flat_outfile_dirty}++;
}
| 332 | |
# Finish writing: if write_seq() has marked the object dirty, write the
# configuration file, clear the dirty marker and drop the cached writer for
# the output file.  Does nothing otherwise.
sub close {
    my ($self) = @_;
    if (!$self->{flat_outfile_dirty}) {
        return;
    }
    $self->write_config;
    delete $self->{flat_outfile_dirty};
    delete $self->{flat_cached_parsers}{$self->out_file};
}
| 340 | |
| 341 | |
# Return all registered file numbers (in no particular order).
# Returns nothing when no file has been registered yet.
sub _filenos {
    my ($self) = @_;
    my $path_of = $self->{flat_flat_file_path} or return;
    return keys %{$path_of};
}
| 347 | |
# Read the configuration file and apply it to this object.
#
# Args   : $config - (optional) directory containing the configuration
#                    file; defaults to this object's own directory
# Returns: nothing if the file does not exist, 1 after a successful read
# Throws : if the file cannot be opened/closed, has no valid index line, or
#          has no primary namespace
#
# Each line is "tag<TAB>value[<TAB>value...]".  Blank lines are ignored.
sub _read_config {
    my $self   = shift;
    my $config = shift;

    my $path = defined $config ? Bio::Root::IO->catfile($config,CONFIG_FILE_NAME)
                               : $self->_config_path;
    return unless -e $path;

    open(my $fh, '<', $path) or $self->throw("open error on $path: $!");
    my %config;
    while (my $line = <$fh>) {
        chomp $line;
        my ($tag,@values) = split /\t/, $line;
        next unless defined $tag && length $tag;
        $config{$tag} = \@values;
    }
    CORE::close($fh) or $self->throw("close error on $path: $!");

    my $index_line = $config{index} ? $config{index}[0] : undef;
    defined $index_line && $index_line =~ m~(flat/1|BerkeleyDB/1)~
        or $self->throw("invalid configuration file $path: no index line");

    $self->indexing_scheme($1);

    $self->file_format($config{format}[0]) if $config{format};

    # set up primary namespace
    my $primary_namespace = $config{primary_namespace} && $config{primary_namespace}[0]
        or $self->throw("invalid configuration file $path: no primary namespace defined");
    $self->primary_namespace($primary_namespace);

    # set up secondary namespaces (may be empty); a missing line leaves them
    # unset instead of storing a list holding a single undef
    $self->secondary_namespaces($config{secondary_namespaces})
        if $config{secondary_namespaces};

    # get file paths and their normalization information
    for my $tag (keys %config) {
        next unless $tag =~ /^fileid_(\S+)/;
        my $nf = $1;
        my ($file_path,$file_length) = @{$config{$tag}};
        $self->add_flat_file($file_path,$file_length,$nf);
    }
    1;
}
| 389 | |
| 390 | |
# Return the path of the configuration file inside this object's directory.
sub _config_path {
    my ($self) = @_;
    return $self->_catfile($self->_config_name);
}
| 395 | |
# Join this object's directory and the given path component using
# Bio::Root::IO->catfile.
sub _catfile {
    my ($self, $component) = @_;
    return Bio::Root::IO->catfile($self->directory, $component);
}
| 401 | |
# Return the file name (not the full path) of the configuration file.
sub _config_name {
    return CONFIG_FILE_NAME;
}
| 403 | |
# Map a flat file path to its file number.  A path that has not been seen
# before is registered through add_flat_file() and its new number returned.
sub _path2fileno {
    my ($self, $path) = @_;
    if (exists $self->{flat_flat_file_no}{$path}) {
        return $self->{flat_flat_file_no}{$path};
    }
    return $self->add_flat_file($path);
}
| 411 | |
# Map a file number back to the flat file path registered under it.
# Yields undef for an unknown number.
sub _fileno2path {
    my ($self, $fileno) = @_;
    return $self->{flat_flat_file_path}{$fileno};
}
| 417 | |
# Return the paths of all registered flat files (in no particular order).
sub _files {
    my ($self) = @_;
    my $number_of = $self->{flat_flat_file_no};
    return keys %{$number_of};
}
| 423 | |
| 424 =head2 fetch | |
| 425 | |
| 426 Title : fetch | |
| 427 Usage : $index->fetch( $id ) | |
| 428 Function: Returns a Bio::Seq object from the index | |
| 429 Example : $seq = $index->fetch( 'dJ67B12' ) | |
| 430 Returns : Bio::Seq object | |
| 431 Args : ID | |
| 432 | |
| 433 Deprecated. Use get_Seq_by_id instead. | |
| 434 | |
| 435 =cut | |
| 436 | |
# Deprecated alias: forwards all arguments to get_Seq_by_id().
sub fetch {
    my $self = shift;
    return $self->get_Seq_by_id(@_);
}
| 438 | |
| 439 | |
| 440 =head2 To Be Implemented in Subclasses | |
| 441 | |
| 442 The following methods MUST be implemented by subclasses. | |
| 443 | |
| 444 =cut | |
| 445 | |
# Create a real live Bio::Seq object for the given ID.
# Placeholder: this version only raises the "not implemented" error.
sub get_Seq_by_id {
    my ($self, $id) = @_;
    $self->throw_not_implemented;
}
| 452 | |
| 453 | |
# Fetch an array of Bio::Seq objects.
# With a single argument the call is forwarded to get_Seq_by_id().
# With (namespace, key) this version only raises the "not implemented"
# error.
sub get_Seq_by_acc {
    my ($self, @query) = @_;
    if (@query == 1) {
        return $self->get_Seq_by_id($query[0]);
    }
    my ($ns, $key) = @query;

    $self->throw_not_implemented;
}
| 462 | |
# Fetch the raw record text for an ID, optionally within a namespace.
# Placeholder: this version only raises the "not implemented" error.
sub fetch_raw {
    my $self = shift;
    my ($id, $namespace) = @_;
    $self->throw_not_implemented;
}
| 467 | |
# This is the method that must be implemented in child classes.
#
# It is passed a filehandle which should point to the next record to be
# indexed in the file, and returns a two element list consisting of a key
# and an adjustment value.
#
# The key can be a scalar, in which case it is treated as the primary ID,
# or a hashref containing namespace=>[id] pairs, one of which MUST
# correspond to the primary namespace.
#
# The adjustment value is normally zero, but can be a positive or negative
# integer which will be added to the current file position in order to
# calculate the correct end of the record.
sub parse_one_record {
    my ($self, $fh) = @_;
    $self->throw_not_implemented;

    # here's what you would implement
    my %keys;
    my $offset;
    return (\%keys, $offset);
}
| 487 | |
# Return the default file format name.
# Placeholder: this version only raises the "not implemented" error.
sub default_file_format {
    my ($self) = @_;
    $self->throw_not_implemented;
}
| 492 | |
# Record where a freshly written record lives.
# Placeholder: this version only raises the "not implemented" error.
#
# Args   : $ids    - IDs of the record, as returned by seq_to_ids()
#          $file   - path of the file the record was written to
#          $offset - byte offset of the record within the file
#          $length - length of the record in bytes
sub _store_index {
    # $self must be unpacked as well: write_seq() calls this as a method
    my ($self,$ids,$file,$offset,$length) = @_;
    $self->throw_not_implemented;
}
| 497 | |
| 498 =head2 May Be Overridden in Subclasses | |
| 499 | |
| 500 The following methods MAY be overridden by subclasses. | |
| 501 | |
| 502 =cut | |
| 503 | |
# Name of the primary namespace used when none has been configured.
sub default_primary_namespace { 'ACC' }
| 507 | |
# Secondary namespaces used when none have been configured: none at all
# (empty list in list context, undef in scalar context).
sub default_secondary_namespaces {
    return ();
}
| 511 | |
# Build the ID hash for a sequence object: a hash ref holding one entry,
# keyed by the primary namespace, whose value is the sequence's
# accession_number.
sub seq_to_ids {
    my ($self, $seq) = @_;
    my $accession = $seq->accession_number;
    my $namespace = $self->primary_namespace;
    return { $namespace => $accession };
}
| 519 | |
# Destructor: calls close() on the object before it goes away.
sub DESTROY {
    my ($self) = @_;
    $self->close;
}
| 524 | |
| 525 | |
| 526 1; |
