diff variant_effect_predictor/Bio/Index/Abstract.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Index/Abstract.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,798 @@
+
+#
+# $Id: Abstract.pm,v 1.41 2002/12/17 02:08:36 jason Exp $
+#
+# BioPerl module for Bio::Index::Abstract
+#
+# Cared for by Ewan Birney <birney@sanger.ac.uk>
+#          and James Gilbert <jgrg@sanger.ac.uk>
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Index::Abstract - Abstract interface for indexing a flat file
+
+=head1 SYNOPSIS
+
+You should not be using this module directly
+
+=head1 USING DB_FILE
+
+To use DB_File and not SDBM for this index, pass the value:
+
+    -dbm_package => 'DB_File'
+
+to new (see below).
+
+=head1 DESCRIPTION
+
+This object provides the basic mechanism to associate positions
+in files with names. The position and filenames are stored in DBM
+which can then be accessed later on. It is the equivalent of flat
+file indexing (eg, SRS or efetch).
+
+This object is the guts to the mechanism, which will be used by the
+specific objects inheriting from it.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org             - General discussion
+  http://bioperl.org/MailList.shtml - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Ewan Birney, James Gilbert
+
+Email - birney@sanger.ac.uk, jgrg@sanger.ac.uk
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. Internal
+methods are usually preceded with an "_" (underscore).
+
+=cut
+
+
+# Let the code begin...
+
+package Bio::Index::Abstract;
+
+use strict;
+use Fcntl qw( O_RDWR O_CREAT O_RDONLY );
+use vars qw( $TYPE_AND_VERSION_KEY
+             @ISA $USE_DBM_TYPE $DB_HASH );
+
+# Object preamble - inheriets from Bio::Root::Object
+
+use Bio::Root::Root;
+use Bio::Root::IO;
+use Symbol();
+
+@ISA = qw(Bio::Root::Root);
+
+# Generate accessor methods for simple object fields
+BEGIN {
+    foreach my $func (qw(filename write_flag)) {
+        no strict 'refs';
+        my $field = "_$func";
+
+        *$func = sub {
+            my( $self, $value ) = @_;
+            
+            if (defined $value) {
+                $self->{$field} = $value;
+            }
+            return $self->{$field};
+        }
+    }
+}
+
+=head2 new
+
+  Usage   : $index = Bio::Index::Abstract->new(
+                -filename    => $dbm_file,
+                -write_flag  => 0,  
+                -dbm_package => 'DB_File',
+                -verbose     => 0);
+  Function: Returns a new index object.  If filename is
+            specified, then open_dbm() is immediately called. 
+            Bio::Index::Abstract->new() will usually be called
+            directly only when opening an existing index.
+  Returns : A new index object
+  Args    : -filename    The name of the dbm index file.
+            -write_flag  TRUE if write access to the dbm file is
+                         needed.
+            -dbm_package The Perl dbm module to use for the
+                         index.
+            -verbose     Print debugging output to STDERR if
+                         TRUE.
+
+=cut
+
+sub new {
+    my($class, @args) = @_;
+    my $self = $class->SUPER::new(@args);
+    my( $filename, $write_flag, $dbm_package, $cachesize, $ffactor ) =
+        $self->_rearrange([qw(FILENAME 
+			      WRITE_FLAG
+			      DBM_PACKAGE
+			      CACHESIZE
+			      FFACTOR
+			      )], @args);
+    
+    # Store any parameters passed
+    $self->filename($filename)       if $filename;
+    $self->cachesize($cachesize)     if $cachesize;
+    $self->ffactor($ffactor)     	 if $ffactor;
+    $self->write_flag($write_flag)   if $write_flag;
+    $self->dbm_package($dbm_package) if $dbm_package;
+
+    $self->{'_filehandle'} = []; # Array in which to cache SeqIO objects
+    $self->{'_DB'}         = {}; # Gets tied to the DBM file
+    
+    # Open database
+    $self->open_dbm() if $filename;
+    return $self;
+}
+
+=pod
+
+=head2 filename
+
+ Title   : filename
+ Usage   : $value = $self->filename();
+           $self->filename($value);
+ Function: Gets or sets the name of the dbm index file.
+ Returns : The current value of filename
+ Args    : Value of filename if setting, or none if
+           getting the value.
+
+=head2 write_flag
+
+ Title   : write_flag
+ Usage   : $value = $self->write_flag();
+           $self->write_flag($value);
+ Function: Gets or sets the value of write_flag, which
+           is wether the dbm file should be opened with
+           write access.
+ Returns : The current value of write_flag (default 0)
+ Args    : Value of write_flag if setting, or none if
+           getting the value.
+
+=head2 dbm_package
+
+ Usage   : $value = $self->dbm_package();
+           $self->dbm_package($value);
+
+ Function: Gets or sets the name of the Perl dbm module used. 
+           If the value is unset, then it returns the value of
+           the package variable $USE_DBM_TYPE or if that is
+           unset, then it chooses the best available dbm type,
+           choosing 'DB_File' in preference to 'SDBM_File'. 
+           Bio::Abstract::Index may work with other dbm file
+           types.
+
+ Returns : The current value of dbm_package
+ Args    : Value of dbm_package if setting, or none if
+           getting the value.
+
+=cut
+
+sub dbm_package {
+    my( $self, $value ) = @_;
+    my $to_require = 0;
+    if( $value || ! $self->{'_dbm_package'} ) {
+	my $type = $value || $USE_DBM_TYPE || 'DB_File';	
+	if( $type =~ /DB_File/i ) {
+	    eval { 
+		require DB_File;
+	    };
+	    $type = ( $@ ) ? 'SDBM_File' : 'DB_File';
+	} 	
+	if( $type ne 'DB_File' ) {
+	    eval { require "$type.pm"; };
+	    $self->throw($@) if( $@ );
+	}
+	$self->{'_dbm_package'} = $type;
+	if( ! defined $USE_DBM_TYPE ) {
+	    $USE_DBM_TYPE = $self->{'_dbm_package'};
+	}	
+    } 
+    return $self->{'_dbm_package'};
+}
+
+=head2 db
+
+  Title   : db
+  Usage   : $index->db
+  Function: Returns a ref to the hash which is tied to the dbm
+            file.  Used internally when adding and retrieving
+            data from the database.
+  Example : $db = $index->db();
+            $db->{ $some_key } = $data
+            $data = $index->db->{ $some_key };
+  Returns : ref to HASH
+  Args    : NONE
+
+=cut
+
+sub db {
+    return $_[0]->{'_DB'};
+}
+
+
+=head2 get_stream
+
+ Title   : get_stream
+ Usage   : $stream = $index->get_stream( $id );
+ Function: Returns a file handle with the file pointer
+           at the approprite place
+
+           This provides for a way to get the actual
+           file contents and not an object 
+
+           WARNING: you must parse the record deliminter
+           *yourself*. Abstract wont do this for you 
+           So this code
+
+           $fh = $index->get_stream($myid);
+           while( <$fh> ) {
+              # do something
+           }
+           will parse the entire file if you don't put in
+           a last statement in, like
+
+           while( <$fh> ) {
+              /^\/\// && last; # end of record
+              # do something
+           }
+
+ Returns : A filehandle object
+ Args    : string represents the accession number
+ Notes   : This method should not be used without forethought 
+
+=cut
+
+#'
+
+sub get_stream {
+   my ($self,$id) = @_;
+
+   my ($desc,$acc,$out);
+   my $db = $self->db();
+
+   if (my $rec = $db->{ $id }) {
+       my( @record );
+       
+       my ($file, $begin, $end) = $self->unpack_record( $rec );
+        
+       # Get the (possibly cached) filehandle
+       my $fh = $self->_file_handle( $file );
+       
+       # move to start
+       seek($fh, $begin, 0);
+       
+       return $fh;
+   }
+   else {
+       $self->throw("Unable to find a record for $id in the flat file index");
+   }
+}
+
+
+=head2 cachesize
+
+  Usage   : $index->cachesize(1000000)
+  Function: Sets the dbm file cache size for the index.
+  	    Needs to be set before the DBM file gets opened.
+  Example : $index->cachesize(1000000)
+  Returns : size of the curent cache
+
+=cut
+
+sub cachesize {
+    my( $self, $size ) = @_;
+
+	if(defined $size){
+		$self->{'_cachesize'} = $size;
+	}
+	return ( $self->{'_cachesize'} );
+	
+}
+
+
+=head2 ffactor
+
+  Usage   : $index->ffactor(1000000)
+  Function: Sets the dbm file fill factor.
+  			Needs to be set before the DBM file gets opened.
+
+  Example : $index->ffactor(1000000)
+  Returns : size of the curent cache
+
+=cut
+
+sub ffactor {
+    my( $self, $size ) = @_;
+
+	if(defined $size){
+		$self->{'_ffactor'} = $size;
+	}
+	return ( $self->{'_ffactor'} );
+	
+}
+
+
+=head2 open_dbm
+
+  Usage   : $index->open_dbm()
+  Function: Opens the dbm file associated with the index
+            object.  Write access is only given if explicitly
+            asked for by calling new(-write => 1) or having set
+            the write_flag(1) on the index object.  The type of
+            dbm file opened is that returned by dbm_package(). 
+            The name of the file to be is opened is obtained by
+            calling the filename() method.
+
+  Example : $index->_open_dbm()
+  Returns : 1 on success
+
+=cut
+
+sub open_dbm {
+    my( $self ) = @_;
+    
+    my $filename = $self->filename()
+        or $self->throw("filename() not set");
+
+    my $db = $self->db();
+    
+    # Close the dbm file if already open (maybe we're getting
+    # or dropping write access
+    if (ref($db) ne 'HASH') {
+        untie($db);
+    }
+    
+    # What kind of DBM file are we going to open?
+    my $dbm_type = $self->dbm_package;
+    
+    # Choose mode for opening dbm file (read/write+create or read-only).
+    my $mode_flags = $self->write_flag ? O_RDWR|O_CREAT : O_RDONLY;
+    
+    # Open the dbm file
+    if ($dbm_type eq 'DB_File') {
+		my $hash_inf = DB_File::HASHINFO->new();
+		my $cache = $self->cachesize();
+		my $ffactor = $self->ffactor();
+		if ($cache){
+			$hash_inf->{'cachesize'} = $cache;
+		}
+		if ($ffactor){
+			$hash_inf->{'ffactor'} = $ffactor;
+		}
+        tie( %$db, $dbm_type, $filename, $mode_flags, 0644, $hash_inf )
+            or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!");
+    } else {
+        tie( %$db, $dbm_type, $filename, $mode_flags, 0644 )
+            or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!");
+    }
+
+    # The following methods access data in the dbm file:
+
+    # Now, if we're a Bio::Index::Abstract caterpillar, then we
+    # transform ourselves into a Bio::Index::<something> butterfly!
+    if( ref($self) eq "Bio::Index::Abstract" ) { 
+	my $pkg = $self->_code_base();
+	bless $self, $pkg;
+    }
+
+    # Check or set this is the right kind and version of index
+    $self->_type_and_version();
+    
+    # Check files haven't changed size since they were indexed
+    $self->_check_file_sizes();
+
+    return 1;
+}
+
+=head2 _version
+
+  Title   : _version
+  Usage   : $type = $index->_version()
+  Function: Returns a string which identifes the version of an
+            index module.  Used to permanently identify an index
+            file as having been created by a particular version
+            of the index module.  Must be provided by the sub class
+  Example : 
+  Returns : 
+  Args    : none
+
+=cut
+
+sub _version {
+    my $self = shift;
+    
+    $self->throw("In Bio::Index::Abstract, no _version method in sub class");
+}
+
+=head2 _code_base
+
+ Title   : _code_base
+ Usage   : $code = $db->_code_base();
+ Function:
+ Example :
+ Returns : Code package to be used with this 
+ Args    :
+
+
+=cut
+
+sub _code_base {
+   my ($self) = @_;
+   my $code_key    = '__TYPE_AND_VERSION';
+   my $record;
+
+   $record = $self->db->{$code_key};
+
+   my($code,$version) = $self->unpack_record($record);
+   if( wantarray ) {
+       return ($code,$version);
+   } else {
+       return $code;
+   }
+}
+
+
+=head2 _type_and_version
+
+  Title   : _type_and_version
+  Usage   : Called by _initalize
+  Function: Checks that the index opened is made by the same index
+            module and version of that module that made it.  If the
+            index is empty, then it adds the information to the
+            database.
+  Example : 
+  Returns : 1 or exception
+  Args    : none
+
+=cut
+
+sub _type_and_version {
+    my $self    = shift;
+    my $key     = '__TYPE_AND_VERSION';
+    my $version = $self->_version();
+    my $type    = ref $self;
+    
+    # Run check or add type and version key if missing
+    if (my $rec = $self->db->{ $key }) {
+        my( $db_type, $db_version ) = $self->unpack_record($rec);
+        $self->throw("This index file is from version [$db_version] - You need to rebuild it to use module version [$version]")
+            unless $db_version == $version;
+        $self->throw("This index file is type [$db_type] - Can't access it with module for [$type]")
+            unless $db_type eq $type;
+    } else {
+        $self->add_record( $key, $type, $version )
+            or $self->throw("Can't add Type and Version record");
+    }
+    return 1;
+}
+
+
+=head2 _check_file_sizes
+
+  Title   : _check_file_sizes
+  Usage   : $index->_check_file_sizes()
+  Function: Verifies that the files listed in the database
+            are the same size as when the database was built,
+            or throws an exception.  Called by the new()
+            function.
+  Example : 
+  Returns : 1 or exception
+  Args    : 
+
+=cut
+
+sub _check_file_sizes {
+    my $self = shift;
+    my $num  = $self->_file_count() || 0;
+    
+    for (my $i = 0; $i < $num; $i++) {
+        my( $file, $stored_size ) = $self->unpack_record( $self->db->{"__FILE_$i"} );
+        my $size = -s $file;
+        unless ($size == $stored_size) {
+            $self->throw("file $i [ $file ] has changed size $stored_size -> $size. This probably means you need to rebuild the index.");
+        }
+    }
+    return 1;
+}
+
+
+=head2 make_index
+
+  Title   : make_index
+  Usage   : $index->make_index( FILE_LIST )
+  Function: Takes a list of file names, checks that they are
+            all fully qualified, and then calls _filename() on
+            each.  It supplies _filename() with the name of the
+            file, and an integer which is stored with each record
+            created by _filename().  Can be called multiple times,
+            and can be used to add to an existing index file.
+  Example : $index->make_index( '/home/seqs1', '/home/seqs2', '/nfs/pub/big_db' );
+  Returns : Number of files indexed
+  Args    : LIST OF FILES
+
+=cut
+
+sub make_index {
+    my($self, @files) = @_;
+    my $count = 0;
+	my $recs = 0;
+    # blow up if write flag is not set. EB fix
+
+    if( !defined $self->write_flag ) {
+	$self->throw("Attempting to make an index on a read-only database. What about a WRITE flag on opening the index?");
+    }
+
+    # We're really fussy/lazy, expecting all file names to be fully qualified
+    $self->throw("No files to index provided") unless @files;
+    for(my $i=0;$i<scalar @files; $i++)  {
+	if( $Bio::Root::IO::FILESPECLOADED && File::Spec->can('rel2abs') ) {	    
+	    if( ! File::Spec->file_name_is_absolute($files[$i]) ) {
+		$files[$i] = File::Spec->rel2abs($files[$i]);
+	    }
+	} else { 
+	    if(  $^O =~ /MSWin/i ) {
+		($files[$i] =~ m|^[A-Za-z]:/|) || 
+		    $self->throw("Not an absolute file path '$files[$i]'");
+	    } else {
+		($files[$i] =~ m|^/|) || 
+		    $self->throw("Not an absolute file path '$files[$i]'"); 
+	    }
+	}
+        $self->throw("File does not exist '$files[$i]'")   unless -e $files[$i];
+    }
+
+    # Add each file to the index
+    FILE :
+    foreach my $file (@files) {
+
+        my $i; # index for this file
+    
+        # Get new index for this file and increment file count
+        if ( defined(my $count = $self->_file_count) ) {
+            $i = $count;
+        } else {
+            $i = 0; $self->_file_count(0);
+        }
+
+	# see whether this file has been already indexed
+	my ($record,$number,$size);
+
+	if( ($record = $self->db->{"__FILENAME_$file"}) ) {
+	    ($number,$size) = $self->unpack_record($record);
+
+	    # if it is the same size - fine. Otherwise die 
+	    if( -s $file == $size ) {
+		warn "File $file already indexed. Skipping...\n";
+		next FILE;
+	    } else {
+		$self->throw("In index, $file has changed size ($size). Indicates that the index is out of date");
+	    }
+	}
+
+	# index this file
+	warn "Indexing file $file\n" if( $self->verbose > 0);
+
+	# this is supplied by the subclass and does the serious work
+        $recs += $self->_index_file( $file, $i ); # Specific method for each type of index
+
+
+        # Save file name and size for this index
+        $self->add_record("__FILE_$i", $file, -s $file)
+            or $self->throw("Can't add data to file: $file");
+        $self->add_record("__FILENAME_$file", $i, -s $file)
+            or $self->throw("Can't add data to file: $file");
+
+        # increment file lines
+	$i++; $self->_file_count($i);
+	my $temp;
+	$temp = $self->_file_count();
+	
+
+    }
+    return ($count, $recs);
+}
+
+=head2 _filename
+
+  Title   : _filename
+  Usage   : $index->_filename( FILE INT )
+  Function: Indexes the file
+  Example : 
+  Returns : 
+  Args    : 
+
+=cut
+
+sub _index_file {
+    my $self = shift;
+    
+    my $pkg = ref($self);
+    $self->throw("Error: '$pkg' does not provide the _index_file() method");
+}
+
+
+
+=head2 _file_handle
+
+  Title   : _file_handle
+  Usage   : $fh = $index->_file_handle( INT )
+  Function: Returns an open filehandle for the file
+            index INT.  On opening a new filehandle it
+            caches it in the @{$index->_filehandle} array.
+            If the requested filehandle is already open,
+            it simply returns it from the array.
+  Example : $fist_file_indexed = $index->_file_handle( 0 );
+  Returns : ref to a filehandle
+  Args    : INT
+
+=cut
+
+sub _file_handle {
+    my( $self, $i ) = @_;
+    
+    unless ($self->{'_filehandle'}[$i]) {
+        my $fh = Symbol::gensym();
+        my @rec = $self->unpack_record($self->db->{"__FILE_$i"})
+            or $self->throw("Can't get filename for index : $i");
+        my $file = $rec[0];
+        open $fh, $file or $self->throw("Can't read file '$file' : $!");
+        $self->{'_filehandle'}[$i] = $fh; # Cache filehandle
+    }
+    return $self->{'_filehandle'}[$i];
+}
+
+
+=head2 _file_count
+
+  Title   : _file_count
+  Usage   : $index->_file_count( INT )
+  Function: Used by the index building sub in a sub class to
+            track the number of files indexed.  Sets or gets
+            the number of files indexed when called with or
+            without an argument.
+  Example : 
+  Returns : INT
+  Args    : INT
+
+=cut
+
+sub _file_count {
+    my $self = shift;
+    if (@_) {
+        $self->db->{'__FILE_COUNT'} = shift;
+    }
+    return $self->db->{'__FILE_COUNT'};
+}
+
+
+=head2 add_record
+
+  Title   : add_record
+  Usage   : $index->add_record( $id, @stuff );
+  Function: Calls pack_record on @stuff, and adds the result
+            of pack_record to the index database under key $id.
+            If $id is a reference to an array, then a new entry
+            is added under a key corresponding to each element
+            of the array.
+  Example : $index->add_record( $id, $fileNumber, $begin, $end )
+  Returns : TRUE on success or FALSE on failure
+  Args    : ID LIST
+
+=cut
+
+sub add_record {
+    my( $self, $id, @rec ) = @_;
+    $self->debug( "Adding key $id\n") if( $self->verbose > 0 );
+    $self->db->{$id} = $self->pack_record( @rec );
+    return 1;
+}
+
+
+=head2 pack_record
+
+  Title   : pack_record
+  Usage   : $packed_string = $index->pack_record( LIST )
+  Function: Packs an array of scalars into a single string
+            joined by ASCII 034 (which is unlikely to be used
+            in any of the strings), and returns it. 
+  Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end )
+  Returns : STRING or undef
+  Args    : LIST
+
+=cut
+
+sub pack_record {
+    my( $self, @args ) = @_;
+    return join "\034", @args;
+}
+
+=head2 unpack_record
+
+  Title   : unpack_record
+  Usage   : $index->unpack_record( STRING )
+  Function: Splits the sting provided into an array,
+            splitting on ASCII 034.
+  Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} )
+  Returns : A 3 element ARRAY
+  Args    : STRING containing ASCII 034
+
+=cut
+
+sub unpack_record {
+    my( $self, @args ) = @_;
+    return split /\034/, $args[0];
+}
+
+=head2 count_records
+
+ Title   : count_records
+ Usage   : $recs = $seqdb->count_records()
+ Function: return count of all recs in the index 
+ Example :
+ Returns : a scalar
+ Args    : none
+
+
+=cut
+
+sub count_records {
+   my ($self,@args) = @_;
+   my $db = $self->db;
+   my $c = 0;
+   while (my($id, $rec) = each %$db) {
+       if( $id =~ /^__/ ) {
+           # internal info
+           next;
+       }
+		$c++;
+   }
+
+   return ($c);
+}
+
+
+=head2 DESTROY
+
+ Title   : DESTROY
+ Usage   : Called automatically when index goes out of scope
+ Function: Closes connection to database and handles to
+           sequence files
+ Returns : NEVER
+ Args    : NONE
+
+
+=cut
+
+sub DESTROY {
+    my $self = shift;
+    untie($self->{'_DB'});
+}
+
+1;