comparison variant_effect_predictor/Bio/Index/Abstract.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1
2 #
3 # $Id: Abstract.pm,v 1.41 2002/12/17 02:08:36 jason Exp $
4 #
5 # BioPerl module for Bio::Index::Abstract
6 #
7 # Cared for by Ewan Birney <birney@sanger.ac.uk>
8 # and James Gilbert <jgrg@sanger.ac.uk>
9 #
10 # You may distribute this module under the same terms as perl itself
11
12 # POD documentation - main docs before the code
13
14 =head1 NAME
15
16 Bio::Index::Abstract - Abstract interface for indexing a flat file
17
18 =head1 SYNOPSIS
19
20 You should not be using this module directly
21
22 =head1 USING DB_FILE
23
24 To use DB_File and not SDBM for this index, pass the value:
25
26 -dbm_package => 'DB_File'
27
28 to new (see below).
29
30 =head1 DESCRIPTION
31
32 This object provides the basic mechanism to associate positions
33 in files with names. The position and filenames are stored in DBM
34 which can then be accessed later on. It is the equivalent of flat
35 file indexing (eg, SRS or efetch).
36
37 This object is the guts to the mechanism, which will be used by the
38 specific objects inheriting from it.
39
40 =head1 FEEDBACK
41
42 =head2 Mailing Lists
43
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to one
46 of the Bioperl mailing lists. Your participation is much appreciated.
47
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/MailList.shtml - About the mailing lists
50
51 =head2 Reporting Bugs
52
53 Report bugs to the Bioperl bug tracking system to help us keep track
54 the bugs and their resolution. Bug reports can be submitted via
55 email or the web:
56
57 bioperl-bugs@bio.perl.org
58 http://bugzilla.bioperl.org/
59
60 =head1 AUTHOR - Ewan Birney, James Gilbert
61
62 Email - birney@sanger.ac.uk, jgrg@sanger.ac.uk
63
64 =head1 APPENDIX
65
66 The rest of the documentation details each of the object methods. Internal
67 methods are usually preceded with an "_" (underscore).
68
69 =cut
70
71
72 # Let the code begin...
73
74 package Bio::Index::Abstract;
75
76 use strict;
77 use Fcntl qw( O_RDWR O_CREAT O_RDONLY );
78 use vars qw( $TYPE_AND_VERSION_KEY
79 @ISA $USE_DBM_TYPE $DB_HASH );
80
81 # Object preamble - inheriets from Bio::Root::Object
82
83 use Bio::Root::Root;
84 use Bio::Root::IO;
85 use Symbol();
86
87 @ISA = qw(Bio::Root::Root);
88
89 # Generate accessor methods for simple object fields
90 BEGIN {
91 foreach my $func (qw(filename write_flag)) {
92 no strict 'refs';
93 my $field = "_$func";
94
95 *$func = sub {
96 my( $self, $value ) = @_;
97
98 if (defined $value) {
99 $self->{$field} = $value;
100 }
101 return $self->{$field};
102 }
103 }
104 }
105
106 =head2 new
107
108 Usage : $index = Bio::Index::Abstract->new(
109 -filename => $dbm_file,
110 -write_flag => 0,
111 -dbm_package => 'DB_File',
112 -verbose => 0);
113 Function: Returns a new index object. If filename is
114 specified, then open_dbm() is immediately called.
115 Bio::Index::Abstract->new() will usually be called
116 directly only when opening an existing index.
117 Returns : A new index object
118 Args : -filename The name of the dbm index file.
119 -write_flag TRUE if write access to the dbm file is
120 needed.
121 -dbm_package The Perl dbm module to use for the
122 index.
123 -verbose Print debugging output to STDERR if
124 TRUE.
125
126 =cut
127
128 sub new {
129 my($class, @args) = @_;
130 my $self = $class->SUPER::new(@args);
131 my( $filename, $write_flag, $dbm_package, $cachesize, $ffactor ) =
132 $self->_rearrange([qw(FILENAME
133 WRITE_FLAG
134 DBM_PACKAGE
135 CACHESIZE
136 FFACTOR
137 )], @args);
138
139 # Store any parameters passed
140 $self->filename($filename) if $filename;
141 $self->cachesize($cachesize) if $cachesize;
142 $self->ffactor($ffactor) if $ffactor;
143 $self->write_flag($write_flag) if $write_flag;
144 $self->dbm_package($dbm_package) if $dbm_package;
145
146 $self->{'_filehandle'} = []; # Array in which to cache SeqIO objects
147 $self->{'_DB'} = {}; # Gets tied to the DBM file
148
149 # Open database
150 $self->open_dbm() if $filename;
151 return $self;
152 }
153
154 =pod
155
156 =head2 filename
157
158 Title : filename
159 Usage : $value = $self->filename();
160 $self->filename($value);
161 Function: Gets or sets the name of the dbm index file.
162 Returns : The current value of filename
163 Args : Value of filename if setting, or none if
164 getting the value.
165
166 =head2 write_flag
167
168 Title : write_flag
169 Usage : $value = $self->write_flag();
170 $self->write_flag($value);
171 Function: Gets or sets the value of write_flag, which
172 is wether the dbm file should be opened with
173 write access.
174 Returns : The current value of write_flag (default 0)
175 Args : Value of write_flag if setting, or none if
176 getting the value.
177
178 =head2 dbm_package
179
180 Usage : $value = $self->dbm_package();
181 $self->dbm_package($value);
182
183 Function: Gets or sets the name of the Perl dbm module used.
184 If the value is unset, then it returns the value of
185 the package variable $USE_DBM_TYPE or if that is
186 unset, then it chooses the best available dbm type,
187 choosing 'DB_File' in preference to 'SDBM_File'.
188 Bio::Abstract::Index may work with other dbm file
189 types.
190
191 Returns : The current value of dbm_package
192 Args : Value of dbm_package if setting, or none if
193 getting the value.
194
195 =cut
196
197 sub dbm_package {
198 my( $self, $value ) = @_;
199 my $to_require = 0;
200 if( $value || ! $self->{'_dbm_package'} ) {
201 my $type = $value || $USE_DBM_TYPE || 'DB_File';
202 if( $type =~ /DB_File/i ) {
203 eval {
204 require DB_File;
205 };
206 $type = ( $@ ) ? 'SDBM_File' : 'DB_File';
207 }
208 if( $type ne 'DB_File' ) {
209 eval { require "$type.pm"; };
210 $self->throw($@) if( $@ );
211 }
212 $self->{'_dbm_package'} = $type;
213 if( ! defined $USE_DBM_TYPE ) {
214 $USE_DBM_TYPE = $self->{'_dbm_package'};
215 }
216 }
217 return $self->{'_dbm_package'};
218 }
219
220 =head2 db
221
222 Title : db
223 Usage : $index->db
224 Function: Returns a ref to the hash which is tied to the dbm
225 file. Used internally when adding and retrieving
226 data from the database.
227 Example : $db = $index->db();
228 $db->{ $some_key } = $data
229 $data = $index->db->{ $some_key };
230 Returns : ref to HASH
231 Args : NONE
232
233 =cut
234
235 sub db {
236 return $_[0]->{'_DB'};
237 }
238
239
240 =head2 get_stream
241
242 Title : get_stream
243 Usage : $stream = $index->get_stream( $id );
244 Function: Returns a file handle with the file pointer
245 at the approprite place
246
247 This provides for a way to get the actual
248 file contents and not an object
249
250 WARNING: you must parse the record deliminter
251 *yourself*. Abstract wont do this for you
252 So this code
253
254 $fh = $index->get_stream($myid);
255 while( <$fh> ) {
256 # do something
257 }
258 will parse the entire file if you don't put in
259 a last statement in, like
260
261 while( <$fh> ) {
262 /^\/\// && last; # end of record
263 # do something
264 }
265
266 Returns : A filehandle object
267 Args : string represents the accession number
268 Notes : This method should not be used without forethought
269
270 =cut
271
272 #'
273
274 sub get_stream {
275 my ($self,$id) = @_;
276
277 my ($desc,$acc,$out);
278 my $db = $self->db();
279
280 if (my $rec = $db->{ $id }) {
281 my( @record );
282
283 my ($file, $begin, $end) = $self->unpack_record( $rec );
284
285 # Get the (possibly cached) filehandle
286 my $fh = $self->_file_handle( $file );
287
288 # move to start
289 seek($fh, $begin, 0);
290
291 return $fh;
292 }
293 else {
294 $self->throw("Unable to find a record for $id in the flat file index");
295 }
296 }
297
298
299 =head2 cachesize
300
301 Usage : $index->cachesize(1000000)
302 Function: Sets the dbm file cache size for the index.
303 Needs to be set before the DBM file gets opened.
304 Example : $index->cachesize(1000000)
305 Returns : size of the curent cache
306
307 =cut
308
309 sub cachesize {
310 my( $self, $size ) = @_;
311
312 if(defined $size){
313 $self->{'_cachesize'} = $size;
314 }
315 return ( $self->{'_cachesize'} );
316
317 }
318
319
320 =head2 ffactor
321
322 Usage : $index->ffactor(1000000)
323 Function: Sets the dbm file fill factor.
324 Needs to be set before the DBM file gets opened.
325
326 Example : $index->ffactor(1000000)
327 Returns : size of the curent cache
328
329 =cut
330
331 sub ffactor {
332 my( $self, $size ) = @_;
333
334 if(defined $size){
335 $self->{'_ffactor'} = $size;
336 }
337 return ( $self->{'_ffactor'} );
338
339 }
340
341
342 =head2 open_dbm
343
344 Usage : $index->open_dbm()
345 Function: Opens the dbm file associated with the index
346 object. Write access is only given if explicitly
347 asked for by calling new(-write => 1) or having set
348 the write_flag(1) on the index object. The type of
349 dbm file opened is that returned by dbm_package().
350 The name of the file to be is opened is obtained by
351 calling the filename() method.
352
353 Example : $index->_open_dbm()
354 Returns : 1 on success
355
356 =cut
357
358 sub open_dbm {
359 my( $self ) = @_;
360
361 my $filename = $self->filename()
362 or $self->throw("filename() not set");
363
364 my $db = $self->db();
365
366 # Close the dbm file if already open (maybe we're getting
367 # or dropping write access
368 if (ref($db) ne 'HASH') {
369 untie($db);
370 }
371
372 # What kind of DBM file are we going to open?
373 my $dbm_type = $self->dbm_package;
374
375 # Choose mode for opening dbm file (read/write+create or read-only).
376 my $mode_flags = $self->write_flag ? O_RDWR|O_CREAT : O_RDONLY;
377
378 # Open the dbm file
379 if ($dbm_type eq 'DB_File') {
380 my $hash_inf = DB_File::HASHINFO->new();
381 my $cache = $self->cachesize();
382 my $ffactor = $self->ffactor();
383 if ($cache){
384 $hash_inf->{'cachesize'} = $cache;
385 }
386 if ($ffactor){
387 $hash_inf->{'ffactor'} = $ffactor;
388 }
389 tie( %$db, $dbm_type, $filename, $mode_flags, 0644, $hash_inf )
390 or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!");
391 } else {
392 tie( %$db, $dbm_type, $filename, $mode_flags, 0644 )
393 or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!");
394 }
395
396 # The following methods access data in the dbm file:
397
398 # Now, if we're a Bio::Index::Abstract caterpillar, then we
399 # transform ourselves into a Bio::Index::<something> butterfly!
400 if( ref($self) eq "Bio::Index::Abstract" ) {
401 my $pkg = $self->_code_base();
402 bless $self, $pkg;
403 }
404
405 # Check or set this is the right kind and version of index
406 $self->_type_and_version();
407
408 # Check files haven't changed size since they were indexed
409 $self->_check_file_sizes();
410
411 return 1;
412 }
413
414 =head2 _version
415
416 Title : _version
417 Usage : $type = $index->_version()
418 Function: Returns a string which identifes the version of an
419 index module. Used to permanently identify an index
420 file as having been created by a particular version
421 of the index module. Must be provided by the sub class
422 Example :
423 Returns :
424 Args : none
425
426 =cut
427
428 sub _version {
429 my $self = shift;
430
431 $self->throw("In Bio::Index::Abstract, no _version method in sub class");
432 }
433
434 =head2 _code_base
435
436 Title : _code_base
437 Usage : $code = $db->_code_base();
438 Function:
439 Example :
440 Returns : Code package to be used with this
441 Args :
442
443
444 =cut
445
446 sub _code_base {
447 my ($self) = @_;
448 my $code_key = '__TYPE_AND_VERSION';
449 my $record;
450
451 $record = $self->db->{$code_key};
452
453 my($code,$version) = $self->unpack_record($record);
454 if( wantarray ) {
455 return ($code,$version);
456 } else {
457 return $code;
458 }
459 }
460
461
462 =head2 _type_and_version
463
464 Title : _type_and_version
465 Usage : Called by _initalize
466 Function: Checks that the index opened is made by the same index
467 module and version of that module that made it. If the
468 index is empty, then it adds the information to the
469 database.
470 Example :
471 Returns : 1 or exception
472 Args : none
473
474 =cut
475
476 sub _type_and_version {
477 my $self = shift;
478 my $key = '__TYPE_AND_VERSION';
479 my $version = $self->_version();
480 my $type = ref $self;
481
482 # Run check or add type and version key if missing
483 if (my $rec = $self->db->{ $key }) {
484 my( $db_type, $db_version ) = $self->unpack_record($rec);
485 $self->throw("This index file is from version [$db_version] - You need to rebuild it to use module version [$version]")
486 unless $db_version == $version;
487 $self->throw("This index file is type [$db_type] - Can't access it with module for [$type]")
488 unless $db_type eq $type;
489 } else {
490 $self->add_record( $key, $type, $version )
491 or $self->throw("Can't add Type and Version record");
492 }
493 return 1;
494 }
495
496
497 =head2 _check_file_sizes
498
499 Title : _check_file_sizes
500 Usage : $index->_check_file_sizes()
501 Function: Verifies that the files listed in the database
502 are the same size as when the database was built,
503 or throws an exception. Called by the new()
504 function.
505 Example :
506 Returns : 1 or exception
507 Args :
508
509 =cut
510
511 sub _check_file_sizes {
512 my $self = shift;
513 my $num = $self->_file_count() || 0;
514
515 for (my $i = 0; $i < $num; $i++) {
516 my( $file, $stored_size ) = $self->unpack_record( $self->db->{"__FILE_$i"} );
517 my $size = -s $file;
518 unless ($size == $stored_size) {
519 $self->throw("file $i [ $file ] has changed size $stored_size -> $size. This probably means you need to rebuild the index.");
520 }
521 }
522 return 1;
523 }
524
525
526 =head2 make_index
527
528 Title : make_index
529 Usage : $index->make_index( FILE_LIST )
530 Function: Takes a list of file names, checks that they are
531 all fully qualified, and then calls _filename() on
532 each. It supplies _filename() with the name of the
533 file, and an integer which is stored with each record
534 created by _filename(). Can be called multiple times,
535 and can be used to add to an existing index file.
536 Example : $index->make_index( '/home/seqs1', '/home/seqs2', '/nfs/pub/big_db' );
537 Returns : Number of files indexed
538 Args : LIST OF FILES
539
540 =cut
541
542 sub make_index {
543 my($self, @files) = @_;
544 my $count = 0;
545 my $recs = 0;
546 # blow up if write flag is not set. EB fix
547
548 if( !defined $self->write_flag ) {
549 $self->throw("Attempting to make an index on a read-only database. What about a WRITE flag on opening the index?");
550 }
551
552 # We're really fussy/lazy, expecting all file names to be fully qualified
553 $self->throw("No files to index provided") unless @files;
554 for(my $i=0;$i<scalar @files; $i++) {
555 if( $Bio::Root::IO::FILESPECLOADED && File::Spec->can('rel2abs') ) {
556 if( ! File::Spec->file_name_is_absolute($files[$i]) ) {
557 $files[$i] = File::Spec->rel2abs($files[$i]);
558 }
559 } else {
560 if( $^O =~ /MSWin/i ) {
561 ($files[$i] =~ m|^[A-Za-z]:/|) ||
562 $self->throw("Not an absolute file path '$files[$i]'");
563 } else {
564 ($files[$i] =~ m|^/|) ||
565 $self->throw("Not an absolute file path '$files[$i]'");
566 }
567 }
568 $self->throw("File does not exist '$files[$i]'") unless -e $files[$i];
569 }
570
571 # Add each file to the index
572 FILE :
573 foreach my $file (@files) {
574
575 my $i; # index for this file
576
577 # Get new index for this file and increment file count
578 if ( defined(my $count = $self->_file_count) ) {
579 $i = $count;
580 } else {
581 $i = 0; $self->_file_count(0);
582 }
583
584 # see whether this file has been already indexed
585 my ($record,$number,$size);
586
587 if( ($record = $self->db->{"__FILENAME_$file"}) ) {
588 ($number,$size) = $self->unpack_record($record);
589
590 # if it is the same size - fine. Otherwise die
591 if( -s $file == $size ) {
592 warn "File $file already indexed. Skipping...\n";
593 next FILE;
594 } else {
595 $self->throw("In index, $file has changed size ($size). Indicates that the index is out of date");
596 }
597 }
598
599 # index this file
600 warn "Indexing file $file\n" if( $self->verbose > 0);
601
602 # this is supplied by the subclass and does the serious work
603 $recs += $self->_index_file( $file, $i ); # Specific method for each type of index
604
605
606 # Save file name and size for this index
607 $self->add_record("__FILE_$i", $file, -s $file)
608 or $self->throw("Can't add data to file: $file");
609 $self->add_record("__FILENAME_$file", $i, -s $file)
610 or $self->throw("Can't add data to file: $file");
611
612 # increment file lines
613 $i++; $self->_file_count($i);
614 my $temp;
615 $temp = $self->_file_count();
616
617
618 }
619 return ($count, $recs);
620 }
621
622 =head2 _filename
623
624 Title : _filename
625 Usage : $index->_filename( FILE INT )
626 Function: Indexes the file
627 Example :
628 Returns :
629 Args :
630
631 =cut
632
633 sub _index_file {
634 my $self = shift;
635
636 my $pkg = ref($self);
637 $self->throw("Error: '$pkg' does not provide the _index_file() method");
638 }
639
640
641
642 =head2 _file_handle
643
644 Title : _file_handle
645 Usage : $fh = $index->_file_handle( INT )
646 Function: Returns an open filehandle for the file
647 index INT. On opening a new filehandle it
648 caches it in the @{$index->_filehandle} array.
649 If the requested filehandle is already open,
650 it simply returns it from the array.
651 Example : $fist_file_indexed = $index->_file_handle( 0 );
652 Returns : ref to a filehandle
653 Args : INT
654
655 =cut
656
657 sub _file_handle {
658 my( $self, $i ) = @_;
659
660 unless ($self->{'_filehandle'}[$i]) {
661 my $fh = Symbol::gensym();
662 my @rec = $self->unpack_record($self->db->{"__FILE_$i"})
663 or $self->throw("Can't get filename for index : $i");
664 my $file = $rec[0];
665 open $fh, $file or $self->throw("Can't read file '$file' : $!");
666 $self->{'_filehandle'}[$i] = $fh; # Cache filehandle
667 }
668 return $self->{'_filehandle'}[$i];
669 }
670
671
672 =head2 _file_count
673
674 Title : _file_count
675 Usage : $index->_file_count( INT )
676 Function: Used by the index building sub in a sub class to
677 track the number of files indexed. Sets or gets
678 the number of files indexed when called with or
679 without an argument.
680 Example :
681 Returns : INT
682 Args : INT
683
684 =cut
685
686 sub _file_count {
687 my $self = shift;
688 if (@_) {
689 $self->db->{'__FILE_COUNT'} = shift;
690 }
691 return $self->db->{'__FILE_COUNT'};
692 }
693
694
695 =head2 add_record
696
697 Title : add_record
698 Usage : $index->add_record( $id, @stuff );
699 Function: Calls pack_record on @stuff, and adds the result
700 of pack_record to the index database under key $id.
701 If $id is a reference to an array, then a new entry
702 is added under a key corresponding to each element
703 of the array.
704 Example : $index->add_record( $id, $fileNumber, $begin, $end )
705 Returns : TRUE on success or FALSE on failure
706 Args : ID LIST
707
708 =cut
709
710 sub add_record {
711 my( $self, $id, @rec ) = @_;
712 $self->debug( "Adding key $id\n") if( $self->verbose > 0 );
713 $self->db->{$id} = $self->pack_record( @rec );
714 return 1;
715 }
716
717
718 =head2 pack_record
719
720 Title : pack_record
721 Usage : $packed_string = $index->pack_record( LIST )
722 Function: Packs an array of scalars into a single string
723 joined by ASCII 034 (which is unlikely to be used
724 in any of the strings), and returns it.
725 Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end )
726 Returns : STRING or undef
727 Args : LIST
728
729 =cut
730
731 sub pack_record {
732 my( $self, @args ) = @_;
733 return join "\034", @args;
734 }
735
736 =head2 unpack_record
737
738 Title : unpack_record
739 Usage : $index->unpack_record( STRING )
740 Function: Splits the sting provided into an array,
741 splitting on ASCII 034.
742 Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} )
743 Returns : A 3 element ARRAY
744 Args : STRING containing ASCII 034
745
746 =cut
747
748 sub unpack_record {
749 my( $self, @args ) = @_;
750 return split /\034/, $args[0];
751 }
752
753 =head2 count_records
754
755 Title : count_records
756 Usage : $recs = $seqdb->count_records()
757 Function: return count of all recs in the index
758 Example :
759 Returns : a scalar
760 Args : none
761
762
763 =cut
764
765 sub count_records {
766 my ($self,@args) = @_;
767 my $db = $self->db;
768 my $c = 0;
769 while (my($id, $rec) = each %$db) {
770 if( $id =~ /^__/ ) {
771 # internal info
772 next;
773 }
774 $c++;
775 }
776
777 return ($c);
778 }
779
780
781 =head2 DESTROY
782
783 Title : DESTROY
784 Usage : Called automatically when index goes out of scope
785 Function: Closes connection to database and handles to
786 sequence files
787 Returns : NEVER
788 Args : NONE
789
790
791 =cut
792
793 sub DESTROY {
794 my $self = shift;
795 untie($self->{'_DB'});
796 }
797
798 1;