Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Index/Abstract.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/Abstract.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,798 @@ + +# +# $Id: Abstract.pm,v 1.41 2002/12/17 02:08:36 jason Exp $ +# +# BioPerl module for Bio::Index::Abstract +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# and James Gilbert <jgrg@sanger.ac.uk> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::Abstract - Abstract interface for indexing a flat file + +=head1 SYNOPSIS + +You should not be using this module directly + +=head1 USING DB_FILE + +To use DB_File and not SDBM for this index, pass the value: + + -dbm_package => 'DB_File' + +to new (see below). + +=head1 DESCRIPTION + +This object provides the basic mechanism to associate positions +in files with names. The position and filenames are stored in DBM +which can then be accessed later on. It is the equivalent of flat +file indexing (eg, SRS or efetch). + +This object is the guts to the mechanism, which will be used by the +specific objects inheriting from it. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, James Gilbert + +Email - birney@sanger.ac.uk, jgrg@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal +methods are usually preceded with an "_" (underscore). + +=cut + + +# Let the code begin... + +package Bio::Index::Abstract; + +use strict; +use Fcntl qw( O_RDWR O_CREAT O_RDONLY ); +use vars qw( $TYPE_AND_VERSION_KEY + @ISA $USE_DBM_TYPE $DB_HASH ); + +# Object preamble - inheriets from Bio::Root::Object + +use Bio::Root::Root; +use Bio::Root::IO; +use Symbol(); + +@ISA = qw(Bio::Root::Root); + +# Generate accessor methods for simple object fields +BEGIN { + foreach my $func (qw(filename write_flag)) { + no strict 'refs'; + my $field = "_$func"; + + *$func = sub { + my( $self, $value ) = @_; + + if (defined $value) { + $self->{$field} = $value; + } + return $self->{$field}; + } + } +} + +=head2 new + + Usage : $index = Bio::Index::Abstract->new( + -filename => $dbm_file, + -write_flag => 0, + -dbm_package => 'DB_File', + -verbose => 0); + Function: Returns a new index object. If filename is + specified, then open_dbm() is immediately called. + Bio::Index::Abstract->new() will usually be called + directly only when opening an existing index. + Returns : A new index object + Args : -filename The name of the dbm index file. + -write_flag TRUE if write access to the dbm file is + needed. + -dbm_package The Perl dbm module to use for the + index. + -verbose Print debugging output to STDERR if + TRUE. + +=cut + +sub new { + my($class, @args) = @_; + my $self = $class->SUPER::new(@args); + my( $filename, $write_flag, $dbm_package, $cachesize, $ffactor ) = + $self->_rearrange([qw(FILENAME + WRITE_FLAG + DBM_PACKAGE + CACHESIZE + FFACTOR + )], @args); + + # Store any parameters passed + $self->filename($filename) if $filename; + $self->cachesize($cachesize) if $cachesize; + $self->ffactor($ffactor) if $ffactor; + $self->write_flag($write_flag) if $write_flag; + $self->dbm_package($dbm_package) if $dbm_package; + + $self->{'_filehandle'} = []; # Array in which to cache SeqIO objects + $self->{'_DB'} = {}; # Gets tied to the DBM file + + # Open database + $self->open_dbm() if $filename; + return $self; +} + +=pod + +=head2 filename + + Title : filename + Usage : $value = $self->filename(); + $self->filename($value); + Function: Gets or sets the name of the dbm index file. + Returns : The current value of filename + Args : Value of filename if setting, or none if + getting the value. + +=head2 write_flag + + Title : write_flag + Usage : $value = $self->write_flag(); + $self->write_flag($value); + Function: Gets or sets the value of write_flag, which + is wether the dbm file should be opened with + write access. + Returns : The current value of write_flag (default 0) + Args : Value of write_flag if setting, or none if + getting the value. + +=head2 dbm_package + + Usage : $value = $self->dbm_package(); + $self->dbm_package($value); + + Function: Gets or sets the name of the Perl dbm module used. + If the value is unset, then it returns the value of + the package variable $USE_DBM_TYPE or if that is + unset, then it chooses the best available dbm type, + choosing 'DB_File' in preference to 'SDBM_File'. + Bio::Abstract::Index may work with other dbm file + types. + + Returns : The current value of dbm_package + Args : Value of dbm_package if setting, or none if + getting the value. + +=cut + +sub dbm_package { + my( $self, $value ) = @_; + my $to_require = 0; + if( $value || ! $self->{'_dbm_package'} ) { + my $type = $value || $USE_DBM_TYPE || 'DB_File'; + if( $type =~ /DB_File/i ) { + eval { + require DB_File; + }; + $type = ( $@ ) ? 'SDBM_File' : 'DB_File'; + } + if( $type ne 'DB_File' ) { + eval { require "$type.pm"; }; + $self->throw($@) if( $@ ); + } + $self->{'_dbm_package'} = $type; + if( ! defined $USE_DBM_TYPE ) { + $USE_DBM_TYPE = $self->{'_dbm_package'}; + } + } + return $self->{'_dbm_package'}; +} + +=head2 db + + Title : db + Usage : $index->db + Function: Returns a ref to the hash which is tied to the dbm + file. Used internally when adding and retrieving + data from the database. + Example : $db = $index->db(); + $db->{ $some_key } = $data + $data = $index->db->{ $some_key }; + Returns : ref to HASH + Args : NONE + +=cut + +sub db { + return $_[0]->{'_DB'}; +} + + +=head2 get_stream + + Title : get_stream + Usage : $stream = $index->get_stream( $id ); + Function: Returns a file handle with the file pointer + at the approprite place + + This provides for a way to get the actual + file contents and not an object + + WARNING: you must parse the record deliminter + *yourself*. Abstract wont do this for you + So this code + + $fh = $index->get_stream($myid); + while( <$fh> ) { + # do something + } + will parse the entire file if you don't put in + a last statement in, like + + while( <$fh> ) { + /^\/\// && last; # end of record + # do something + } + + Returns : A filehandle object + Args : string represents the accession number + Notes : This method should not be used without forethought + +=cut + +#' + +sub get_stream { + my ($self,$id) = @_; + + my ($desc,$acc,$out); + my $db = $self->db(); + + if (my $rec = $db->{ $id }) { + my( @record ); + + my ($file, $begin, $end) = $self->unpack_record( $rec ); + + # Get the (possibly cached) filehandle + my $fh = $self->_file_handle( $file ); + + # move to start + seek($fh, $begin, 0); + + return $fh; + } + else { + $self->throw("Unable to find a record for $id in the flat file index"); + } +} + + +=head2 cachesize + + Usage : $index->cachesize(1000000) + Function: Sets the dbm file cache size for the index. + Needs to be set before the DBM file gets opened. + Example : $index->cachesize(1000000) + Returns : size of the curent cache + +=cut + +sub cachesize { + my( $self, $size ) = @_; + + if(defined $size){ + $self->{'_cachesize'} = $size; + } + return ( $self->{'_cachesize'} ); + +} + + +=head2 ffactor + + Usage : $index->ffactor(1000000) + Function: Sets the dbm file fill factor. + Needs to be set before the DBM file gets opened. + + Example : $index->ffactor(1000000) + Returns : size of the curent cache + +=cut + +sub ffactor { + my( $self, $size ) = @_; + + if(defined $size){ + $self->{'_ffactor'} = $size; + } + return ( $self->{'_ffactor'} ); + +} + + +=head2 open_dbm + + Usage : $index->open_dbm() + Function: Opens the dbm file associated with the index + object. Write access is only given if explicitly + asked for by calling new(-write => 1) or having set + the write_flag(1) on the index object. The type of + dbm file opened is that returned by dbm_package(). + The name of the file to be is opened is obtained by + calling the filename() method. + + Example : $index->_open_dbm() + Returns : 1 on success + +=cut + +sub open_dbm { + my( $self ) = @_; + + my $filename = $self->filename() + or $self->throw("filename() not set"); + + my $db = $self->db(); + + # Close the dbm file if already open (maybe we're getting + # or dropping write access + if (ref($db) ne 'HASH') { + untie($db); + } + + # What kind of DBM file are we going to open? + my $dbm_type = $self->dbm_package; + + # Choose mode for opening dbm file (read/write+create or read-only). + my $mode_flags = $self->write_flag ? O_RDWR|O_CREAT : O_RDONLY; + + # Open the dbm file + if ($dbm_type eq 'DB_File') { + my $hash_inf = DB_File::HASHINFO->new(); + my $cache = $self->cachesize(); + my $ffactor = $self->ffactor(); + if ($cache){ + $hash_inf->{'cachesize'} = $cache; + } + if ($ffactor){ + $hash_inf->{'ffactor'} = $ffactor; + } + tie( %$db, $dbm_type, $filename, $mode_flags, 0644, $hash_inf ) + or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!"); + } else { + tie( %$db, $dbm_type, $filename, $mode_flags, 0644 ) + or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!"); + } + + # The following methods access data in the dbm file: + + # Now, if we're a Bio::Index::Abstract caterpillar, then we + # transform ourselves into a Bio::Index::<something> butterfly! + if( ref($self) eq "Bio::Index::Abstract" ) { + my $pkg = $self->_code_base(); + bless $self, $pkg; + } + + # Check or set this is the right kind and version of index + $self->_type_and_version(); + + # Check files haven't changed size since they were indexed + $self->_check_file_sizes(); + + return 1; +} + +=head2 _version + + Title : _version + Usage : $type = $index->_version() + Function: Returns a string which identifes the version of an + index module. Used to permanently identify an index + file as having been created by a particular version + of the index module. Must be provided by the sub class + Example : + Returns : + Args : none + +=cut + +sub _version { + my $self = shift; + + $self->throw("In Bio::Index::Abstract, no _version method in sub class"); +} + +=head2 _code_base + + Title : _code_base + Usage : $code = $db->_code_base(); + Function: + Example : + Returns : Code package to be used with this + Args : + + +=cut + +sub _code_base { + my ($self) = @_; + my $code_key = '__TYPE_AND_VERSION'; + my $record; + + $record = $self->db->{$code_key}; + + my($code,$version) = $self->unpack_record($record); + if( wantarray ) { + return ($code,$version); + } else { + return $code; + } +} + + +=head2 _type_and_version + + Title : _type_and_version + Usage : Called by _initalize + Function: Checks that the index opened is made by the same index + module and version of that module that made it. If the + index is empty, then it adds the information to the + database. + Example : + Returns : 1 or exception + Args : none + +=cut + +sub _type_and_version { + my $self = shift; + my $key = '__TYPE_AND_VERSION'; + my $version = $self->_version(); + my $type = ref $self; + + # Run check or add type and version key if missing + if (my $rec = $self->db->{ $key }) { + my( $db_type, $db_version ) = $self->unpack_record($rec); + $self->throw("This index file is from version [$db_version] - You need to rebuild it to use module version [$version]") + unless $db_version == $version; + $self->throw("This index file is type [$db_type] - Can't access it with module for [$type]") + unless $db_type eq $type; + } else { + $self->add_record( $key, $type, $version ) + or $self->throw("Can't add Type and Version record"); + } + return 1; +} + + +=head2 _check_file_sizes + + Title : _check_file_sizes + Usage : $index->_check_file_sizes() + Function: Verifies that the files listed in the database + are the same size as when the database was built, + or throws an exception. Called by the new() + function. + Example : + Returns : 1 or exception + Args : + +=cut + +sub _check_file_sizes { + my $self = shift; + my $num = $self->_file_count() || 0; + + for (my $i = 0; $i < $num; $i++) { + my( $file, $stored_size ) = $self->unpack_record( $self->db->{"__FILE_$i"} ); + my $size = -s $file; + unless ($size == $stored_size) { + $self->throw("file $i [ $file ] has changed size $stored_size -> $size. This probably means you need to rebuild the index."); + } + } + return 1; +} + + +=head2 make_index + + Title : make_index + Usage : $index->make_index( FILE_LIST ) + Function: Takes a list of file names, checks that they are + all fully qualified, and then calls _filename() on + each. It supplies _filename() with the name of the + file, and an integer which is stored with each record + created by _filename(). Can be called multiple times, + and can be used to add to an existing index file. + Example : $index->make_index( '/home/seqs1', '/home/seqs2', '/nfs/pub/big_db' ); + Returns : Number of files indexed + Args : LIST OF FILES + +=cut + +sub make_index { + my($self, @files) = @_; + my $count = 0; + my $recs = 0; + # blow up if write flag is not set. EB fix + + if( !defined $self->write_flag ) { + $self->throw("Attempting to make an index on a read-only database. What about a WRITE flag on opening the index?"); + } + + # We're really fussy/lazy, expecting all file names to be fully qualified + $self->throw("No files to index provided") unless @files; + for(my $i=0;$i<scalar @files; $i++) { + if( $Bio::Root::IO::FILESPECLOADED && File::Spec->can('rel2abs') ) { + if( ! File::Spec->file_name_is_absolute($files[$i]) ) { + $files[$i] = File::Spec->rel2abs($files[$i]); + } + } else { + if( $^O =~ /MSWin/i ) { + ($files[$i] =~ m|^[A-Za-z]:/|) || + $self->throw("Not an absolute file path '$files[$i]'"); + } else { + ($files[$i] =~ m|^/|) || + $self->throw("Not an absolute file path '$files[$i]'"); + } + } + $self->throw("File does not exist '$files[$i]'") unless -e $files[$i]; + } + + # Add each file to the index + FILE : + foreach my $file (@files) { + + my $i; # index for this file + + # Get new index for this file and increment file count + if ( defined(my $count = $self->_file_count) ) { + $i = $count; + } else { + $i = 0; $self->_file_count(0); + } + + # see whether this file has been already indexed + my ($record,$number,$size); + + if( ($record = $self->db->{"__FILENAME_$file"}) ) { + ($number,$size) = $self->unpack_record($record); + + # if it is the same size - fine. Otherwise die + if( -s $file == $size ) { + warn "File $file already indexed. Skipping...\n"; + next FILE; + } else { + $self->throw("In index, $file has changed size ($size). Indicates that the index is out of date"); + } + } + + # index this file + warn "Indexing file $file\n" if( $self->verbose > 0); + + # this is supplied by the subclass and does the serious work + $recs += $self->_index_file( $file, $i ); # Specific method for each type of index + + + # Save file name and size for this index + $self->add_record("__FILE_$i", $file, -s $file) + or $self->throw("Can't add data to file: $file"); + $self->add_record("__FILENAME_$file", $i, -s $file) + or $self->throw("Can't add data to file: $file"); + + # increment file lines + $i++; $self->_file_count($i); + my $temp; + $temp = $self->_file_count(); + + + } + return ($count, $recs); +} + +=head2 _filename + + Title : _filename + Usage : $index->_filename( FILE INT ) + Function: Indexes the file + Example : + Returns : + Args : + +=cut + +sub _index_file { + my $self = shift; + + my $pkg = ref($self); + $self->throw("Error: '$pkg' does not provide the _index_file() method"); +} + + + +=head2 _file_handle + + Title : _file_handle + Usage : $fh = $index->_file_handle( INT ) + Function: Returns an open filehandle for the file + index INT. On opening a new filehandle it + caches it in the @{$index->_filehandle} array. + If the requested filehandle is already open, + it simply returns it from the array. + Example : $fist_file_indexed = $index->_file_handle( 0 ); + Returns : ref to a filehandle + Args : INT + +=cut + +sub _file_handle { + my( $self, $i ) = @_; + + unless ($self->{'_filehandle'}[$i]) { + my $fh = Symbol::gensym(); + my @rec = $self->unpack_record($self->db->{"__FILE_$i"}) + or $self->throw("Can't get filename for index : $i"); + my $file = $rec[0]; + open $fh, $file or $self->throw("Can't read file '$file' : $!"); + $self->{'_filehandle'}[$i] = $fh; # Cache filehandle + } + return $self->{'_filehandle'}[$i]; +} + + +=head2 _file_count + + Title : _file_count + Usage : $index->_file_count( INT ) + Function: Used by the index building sub in a sub class to + track the number of files indexed. Sets or gets + the number of files indexed when called with or + without an argument. + Example : + Returns : INT + Args : INT + +=cut + +sub _file_count { + my $self = shift; + if (@_) { + $self->db->{'__FILE_COUNT'} = shift; + } + return $self->db->{'__FILE_COUNT'}; +} + + +=head2 add_record + + Title : add_record + Usage : $index->add_record( $id, @stuff ); + Function: Calls pack_record on @stuff, and adds the result + of pack_record to the index database under key $id. + If $id is a reference to an array, then a new entry + is added under a key corresponding to each element + of the array. + Example : $index->add_record( $id, $fileNumber, $begin, $end ) + Returns : TRUE on success or FALSE on failure + Args : ID LIST + +=cut + +sub add_record { + my( $self, $id, @rec ) = @_; + $self->debug( "Adding key $id\n") if( $self->verbose > 0 ); + $self->db->{$id} = $self->pack_record( @rec ); + return 1; +} + + +=head2 pack_record + + Title : pack_record + Usage : $packed_string = $index->pack_record( LIST ) + Function: Packs an array of scalars into a single string + joined by ASCII 034 (which is unlikely to be used + in any of the strings), and returns it. + Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end ) + Returns : STRING or undef + Args : LIST + +=cut + +sub pack_record { + my( $self, @args ) = @_; + return join "\034", @args; +} + +=head2 unpack_record + + Title : unpack_record + Usage : $index->unpack_record( STRING ) + Function: Splits the sting provided into an array, + splitting on ASCII 034. + Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} ) + Returns : A 3 element ARRAY + Args : STRING containing ASCII 034 + +=cut + +sub unpack_record { + my( $self, @args ) = @_; + return split /\034/, $args[0]; +} + +=head2 count_records + + Title : count_records + Usage : $recs = $seqdb->count_records() + Function: return count of all recs in the index + Example : + Returns : a scalar + Args : none + + +=cut + +sub count_records { + my ($self,@args) = @_; + my $db = $self->db; + my $c = 0; + while (my($id, $rec) = each %$db) { + if( $id =~ /^__/ ) { + # internal info + next; + } + $c++; + } + + return ($c); +} + + +=head2 DESTROY + + Title : DESTROY + Usage : Called automatically when index goes out of scope + Function: Closes connection to database and handles to + sequence files + Returns : NEVER + Args : NONE + + +=cut + +sub DESTROY { + my $self = shift; + untie($self->{'_DB'}); +} + +1;