diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/BaseObject.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/BaseObject.pm	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,478 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::IdMapping::BaseObject - base object for IdMapping objects
+
+=head1 SYNOPSIS
+
+  # this object isn't instantiated directly but rather extended
+  use Bio::EnsEMBL::IdMapping::BaseObject;
+  our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
+
+=head1 DESCRIPTION
+
+This is the base object for some of the objects used in the IdMapping
+application. An object that extends BaseObject will have a ConfParser,
+Logger and Cache object. BaseObject also implements some useful utility
+functions related to file and db access.
+
+This isn't very clean OO design but it's efficient and easy to use...
+
+=head1 METHODS
+
+  new
+  get_filehandle
+  file_exists
+  fetch_value_from_db
+  dump_table_to_file
+  upload_file_into_table
+  logger
+  conf
+  cache
+
+=cut
+
+
+package Bio::EnsEMBL::IdMapping::BaseObject;
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::Argument qw(rearrange);
+use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
+
+
+=head2 new
+
+  Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
+  Arg [CONF]  : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
+  Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object
+  Example     : my $object = Bio::EnsEMBL::IdMapping::BaseObjectSubclass->new(
+                  -LOGGER => $logger,
+                  -CONF   => $conf,
+                  -CACHE  => $cache
+                );
+  Description : Constructor
+  Return type : implementing subclass type
+  Exceptions  : thrown on wrong or missing arguments
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub new {
+  my $caller = shift;
+
+  # works both as a class method and as an instance method
+  my $class = ref($caller) || $caller;
+
+  my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_);
+
+  # all three collaborators are mandatory and must be objects of the
+  # expected class (or a subclass)
+  unless ($logger and ref($logger) and
+          $logger->isa('Bio::EnsEMBL::Utils::Logger')) {
+    throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
+  }
+
+  unless ($conf and ref($conf) and
+          $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
+    throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
+  }
+
+  # name the cache explicitly so the caller can tell which argument is
+  # wrong or missing
+  unless ($cache and ref($cache) and
+          $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) {
+    throw("You must provide a cache as a Bio::EnsEMBL::IdMapping::Cache object.");
+  }
+
+  my $self = bless({}, $class);
+
+  # initialise via the accessors so storage stays in one place
+  $self->logger($logger);
+  $self->conf($conf);
+  $self->cache($cache);
+
+  return $self;
+}
+
+
+=head2 get_filehandle 
+
+  Arg[1]      : String $filename - filename for filehandle
+  Arg[2]      : String $path_append - append subdirectory name to basedir
+  Arg[3]      : String $mode - filehandle mode (<|>|>>), defaults to '>'
+  Example     : my $fh = $object->get_filehandle('mapping_stats.txt', 'stats',
+                  '>');
+                print $fh "Stats:\n";
+  Description : Returns a filehandle to a file for reading or writing. The file
+                is qualified with the basedir defined in the configuration and
+                an optional subdirectory name.
+  Return type : filehandle
+  Exceptions  : thrown on missing filename or if the file can't be opened
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub get_filehandle {
+  my ($self, $filename, $path_append, $mode) = @_;
+
+  unless (defined($filename)) {
+    throw("Need a filename for this filehandle.");
+  }
+
+  # start from the configured basedir, optionally extended by a
+  # subdirectory
+  my $path = $self->conf->param('basedir');
+  if (defined($path_append)) {
+    $path = path_append($path, $path_append);
+  }
+
+  # any false mode (undef, empty string, 0) falls back to writing
+  $mode = '>' unless ($mode);
+
+  my $fh;
+  unless (open($fh, $mode, "$path/$filename")) {
+    throw("Unable to open $path/$filename: $!");
+  }
+
+  return $fh;
+}
+
+
+=head2 file_exists
+
+  Arg[1]      : String $filename - filename to test
+  Arg[2]      : String $path_append - append subdirectory name to basedir
+  Example     : unless ($object->file_exists('gene_mappings.ser', 'mapping')) {
+                  $object->do_gene_mapping;
+                }
+  Description : Tests if a file exists and has non-zero size.
+  Return type : Boolean
+  Exceptions  : none
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub file_exists {
+  my ($self, $filename, $path_append) = @_;
+
+  # qualify the filename with basedir and, if given, a subdirectory
+  my $path = $self->conf->param('basedir');
+  if (defined($path_append)) {
+    $path = path_append($path, $path_append);
+  }
+
+  # -s gives the file size when the file exists and is non-empty, and a
+  # false value otherwise
+  my $size = -s "$path/$filename";
+
+  return $size;
+}
+
+
+=head2 fetch_value_from_db 
+
+  Arg[1]      : DBI::db $dbh - a DBI database handle
+  Arg[2]      : String $sql - SQL statement to execute
+  Example     : my $num_genes = $object->fetch_value_from_db($dbh,
+                  'SELECT count(*) FROM gene');
+  Description : Executes an SQL statement on a db handle and returns the first
+                column of the first row returned. Useful for queries returning a
+                single value, like table counts.
+  Return type : Return type of SQL statement
+  Exceptions  : thrown on wrong or missing arguments
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub fetch_value_from_db {
+  my $self = shift;
+  my $dbh = shift;
+  my $sql = shift;
+
+  # Only accept a real DBI database handle. The eval keeps an unblessed
+  # reference from dying inside ->isa, so the caller always gets the
+  # argument exception below.
+  my $is_dbh = ref($dbh) && do {
+    local $@;
+    eval { $dbh->isa('DBI::db') };
+  };
+  throw("Need a db handle.") unless ($is_dbh);
+  throw("Need an SQL query to execute.") unless ($sql);
+
+  my $sth = $dbh->prepare($sql);
+  $sth->execute;
+
+  # only the first column of the first row is of interest
+  my ($retval) = $sth->fetchrow_array;
+
+  # remaining rows (if any) are not fetched, so release the statement
+  # handle explicitly
+  $sth->finish;
+
+  return $retval;
+}
+
+
+=head2 dump_table_to_file 
+
+  Arg[1]      : String $dbtype - db type (source|target)
+  Arg[2]      : String $table - name of table to dump
+  Arg[3]      : String $filename - name of dump file
+  Arg[4]      : Boolean $check_existing - turn on test for existing dump
+  Example     : my $rows_dumped = $object->dump_table_to_file('source',
+                  'stable_id_event', 'stable_id_event_existing.txt');
+  Description : Dumps the contents of a db table to a tab-delimited file. The
+                dump file will be written to a subdirectory called 'tables'
+                under the basedir from your configuration.
+  Return type : Int - the number of rows dumped
+  Exceptions  : thrown on wrong or missing arguments
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub dump_table_to_file {
+  my $self = shift;
+  my $dbtype = shift;
+  my $table = shift;
+  my $filename = shift;
+  my $check_existing = shift;
+
+  # argument check
+  unless (($dbtype eq 'source') or ($dbtype eq 'target')) {
+    throw("Missing or unknown db type: $dbtype.");
+  }
+  throw("Need a table name.") unless ($table);
+  throw("Need a filename.") unless ($filename);
+
+  # conditionally check if table was already dumped; an existing non-empty
+  # dump is kept and reported as zero rows dumped
+  if ($check_existing and $self->file_exists($filename, 'tables')) {
+    $self->logger->info("$filename exists, won't dump again.\n");
+    return 0;
+  }
+
+  # no mode is passed, so get_filehandle() opens the file for writing
+  my $fh = $self->get_filehandle($filename, 'tables');
+
+  my $dba = $self->cache->get_DBAdaptor($dbtype);
+  my $dbh = $dba->dbc->db_handle;
+  my $sth = $dbh->prepare("SELECT * FROM $table");
+  $sth->execute;
+
+  my $i = 0;
+
+  while (my @row = $sth->fetchrow_array) {
+    $i++;
+
+    # one tab-delimited line per row, using '\N' for NULL values
+    print {$fh} join("\t", map { defined($_) ? $_ : '\N' } @row), "\n";
+  }
+
+  $sth->finish;
+
+  # buffered write errors (e.g. a full disk) only surface on close, so
+  # don't report a row count for a dump that didn't make it to disk
+  close($fh) or throw("Unable to close dump file $filename: $!");
+
+  return $i;
+}
+
+
+=head2 upload_file_into_table
+
+  Arg[1]      : String $dbtype - db type (source|target)
+  Arg[2]      : String $table - name of table to upload the data to
+  Arg[3]      : String $filename - name of dump file
+  Arg[4]      : Boolean $no_check_empty - don't check if table is empty
+  Example     : my $rows_uploaded = $object->upload_file_into_table('target',
+                  'stable_id_event', 'stable_id_event_new.txt');
+  Description : Uploads a tab-delimited data file into a db table. The data file
+                will be taken from a subdirectory 'tables' under your configured
+                basedir. If the db table isn't empty and $no_check_empty isn't
+                set, no data is uploaded (and a warning is issued).
+  Return type : Int - the number of rows uploaded
+  Exceptions  : thrown on wrong or missing arguments
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub upload_file_into_table {
+  my $self           = shift;
+  my $dbtype         = shift;
+  my $table          = shift;
+  my $filename       = shift;
+  my $no_check_empty = shift;
+
+  # argument check
+  unless ( ( $dbtype eq 'source' ) or ( $dbtype eq 'target' ) ) {
+    throw("Missing or unknown db type: $dbtype.");
+  }
+  throw("Need a table name.") unless ($table);
+  throw("Need a filename.")   unless ($filename);
+
+  # never touch the database in a dry run; this is a bare return, not a
+  # row count
+  if ( $self->conf->param('dry_run') ) {
+    $self->logger->warning(
+                       "dry_run - skipping db upload for $filename.\n");
+    return;
+  }
+
+  my $file =
+    join( '/', $self->conf->param('basedir'), 'tables', $filename );
+
+  # number of rows uploaded; stays 0 whenever nothing is loaded
+  my $r = 0;
+
+  # nothing to do for a missing or empty data file
+  unless ( -s $file ) {
+    $self->logger->warning( "No data found in file $filename.\n", 1 );
+    return $r;
+  }
+
+  $self->logger->debug( "$file -> $table\n", 1 );
+
+  my $dba = $self->cache->get_DBAdaptor($dbtype);
+  my $dbh = $dba->dbc->db_handle;
+
+  my $idtable = 0;
+  if ( $table =~ /^([^_]+)_stable_id/ ) {
+    # This is a stable_id table we're working with. The data goes into
+    # the stable_id columns of the object table named by the prefix
+    # (e.g. 'gene' for 'gene_stable_id').
+    $idtable = 1;
+    $table   = $1;
+  }
+
+  # check table is empty
+  unless ($no_check_empty) {
+    my $sql;
+    if ($idtable) {
+      $sql =
+        qq(SELECT count(*) FROM $table WHERE stable_id IS NOT NULL);
+    }
+    else {
+      $sql = qq(SELECT count(*) FROM $table);
+    }
+    my $sth = $dbh->prepare($sql);
+    $sth->execute;
+    my ($c) = $sth->fetchrow_array;
+    $sth->finish;
+
+    if ( $c > 0 ) {
+      if ($idtable) {
+        $self->logger->warning(
+                             "Table $table contains $c stable IDs.\n",
+                             1 );
+      }
+      else {
+        $self->logger->warning(
+                        "Table $table not empty: found $c entries.\n",
+                        1 );
+      }
+      $self->logger->info( "Data not uploaded!\n", 1 );
+      return $r;
+    }
+  } ## end unless ($no_check_empty)
+
+  # now upload the data, remembering what the LOAD DATA statement
+  # reports as its row count
+  my $loaded;
+
+  if ($idtable) {
+    # Create a temporary table, upload the data into it, and then
+    # update the main table. The table name carries the process id.
+    my $tmp_table = 'stable_id_' . $$;
+
+    $dbh->do(
+      qq( CREATE TABLE $tmp_table (  object_id INTEGER UNSIGNED,
+                                           stable_id VARCHAR(255),
+                                           version SMALLINT UNSIGNED,
+                                           created_date DATETIME,
+                                           modified_date DATETIME,
+                                           PRIMARY KEY(object_id) ) )
+    );
+
+    $loaded = $dbh->do(
+          qq(LOAD DATA LOCAL INFILE '$file' INTO TABLE $tmp_table));
+
+    $dbh->do(
+      qq(
+    UPDATE $table, $tmp_table
+    SET $table.stable_id=$tmp_table.stable_id,
+        $table.version=$tmp_table.version,
+        $table.created_date=$tmp_table.created_date,
+        $table.modified_date=$tmp_table.modified_date
+    WHERE $table.${table}_id = $tmp_table.object_id )
+    );
+
+    $dbh->do(qq(DROP TABLE $tmp_table));
+  } ## end if ($idtable)
+  else {
+    $loaded =
+      $dbh->do(qq(LOAD DATA LOCAL INFILE '$file' INTO TABLE $table));
+  }
+  $dbh->do(qq(OPTIMIZE TABLE $table));
+
+  # DBI's do() returns '0E0' for zero rows, -1 when the count is unknown
+  # and undef on failure; only a positive count is reported, as a plain
+  # number
+  if ( defined($loaded) and $loaded > 0 ) {
+    $r = $loaded + 0;
+  }
+
+  return $r;
+} ## end sub upload_file_into_table
+
+
+=head2 logger
+
+  Arg[1]      : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set
+  Example     : $object->logger->info("Starting ID mapping.\n");
+  Description : Getter/setter for logger object
+  Return type : Bio::EnsEMBL::Utils::Logger
+  Exceptions  : none
+  Caller      : constructor
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub logger {
+  my ($self, @value) = @_;
+
+  # act as a setter whenever an argument was passed, even an undef one
+  if (@value) {
+    $self->{'_logger'} = $value[0];
+  }
+
+  return $self->{'_logger'};
+}
+
+
+=head2 conf
+
+  Arg[1]      : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration
+                to set
+  Example     : my $basedir = $object->conf->param('basedir');
+  Description : Getter/setter for configuration object
+  Return type : Bio::EnsEMBL::Utils::ConfParser
+  Exceptions  : none
+  Caller      : constructor
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub conf {
+  my ($self, @value) = @_;
+
+  # act as a setter whenever an argument was passed, even an undef one
+  if (@value) {
+    $self->{'_conf'} = $value[0];
+  }
+
+  return $self->{'_conf'};
+}
+
+
+=head2 cache
+
+  Arg[1]      : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set
+  Example     : $object->cache->read_from_file('source');
+  Description : Getter/setter for cache object
+  Return type : Bio::EnsEMBL::IdMapping::Cache
+  Exceptions  : none
+  Caller      : constructor
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub cache {
+  my ($self, @value) = @_;
+
+  # act as a setter whenever an argument was passed, even an undef one
+  if (@value) {
+    $self->{'_cache'} = $value[0];
+  }
+
+  return $self->{'_cache'};
+}
+
+
+1;
+