# view variant_effect_predictor/Bio/EnsEMBL/IdMapping/BaseObject.pm @ 3:d30fa12e4cc5 default tip

# Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
# author devteam <devteam@galaxyproject.org>
# date Mon, 13 Jan 2014 10:38:30 -0500
# parents 1f6dce3d34e0
# children
# line wrap: on
# line source

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

Bio::EnsEMBL::IdMapping::BaseObject - base object for IdMapping objects

=head1 SYNOPSIS

  # this object isn't instantiated directly but rather extended
  use Bio::EnsEMBL::IdMapping::BaseObject;
  our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);

=head1 DESCRIPTION

This is the base object for some of the objects used in the IdMapping
application. An object that extends BaseObject will have a ConfParser,
Logger and Cache object. BaseObject also implements some useful utility
functions related to file and db access.

This isn't very clean OO design but it's efficient and easy to use...

=head1 METHODS

  new
  get_filehandle
  file_exists
  fetch_value_from_db
  dump_table_to_file
  upload_file_into_table
  logger
  conf
  cache

=cut


package Bio::EnsEMBL::IdMapping::BaseObject;

use strict;
use warnings;
no warnings 'uninitialized';

use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);


=head2 new

  Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
  Arg [CONF]  : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
  Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object
  Example     : my $object = Bio::EnsEMBL::IdMapping::BaseObjectSubclass->new(
                  -LOGGER => $logger,
                  -CONF   => $conf,
                  -CACHE  => $cache
                );
  Description : Constructor
  Return type : implementing subclass type
  Exceptions  : thrown on wrong or missing arguments
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub new {
  # Constructor. Validates the logger, configuration and cache objects
  # passed as named arguments and stores them on a new blessed hash.
  # Throws if any of the three is missing or of the wrong class.
  my $caller = shift;

  # allow both Class->new(...) and $object->new(...)
  my $class = ref($caller) || $caller;

  my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_);

  unless ($logger and ref($logger) and
          $logger->isa('Bio::EnsEMBL::Utils::Logger')) {
    throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
  }
  
  unless ($conf and ref($conf) and
          $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
    throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
  }
  
  # the message names the cache (it used to say "configuration", which
  # pointed callers at the wrong argument)
  unless ($cache and ref($cache) and
          $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) {
    throw("You must provide a cache as a Bio::EnsEMBL::IdMapping::Cache object.");
  }
  
  my $self = {};
  bless ($self, $class);

  # initialise via the accessors so subclasses overriding them are honoured
  $self->logger($logger);
  $self->conf($conf);
  $self->cache($cache);
  
  return $self;
}


=head2 get_filehandle 

  Arg[1]      : String $filename - filename for filehandle
  Arg[2]      : String $path_append - append subdirectory name to basedir
  Arg[3]      : String $mode - filehandle mode (<|>|>>)
  Example     : my $fh = $object->get_filehandle('mapping_stats.txt', 'stats',
                  '>');
                print $fh "Stats:\n";
  Description : Returns a filehandle to a file for reading or writing. The file
                is qualified with the basedir defined in the configuration and
                an optional subdirectory name.
  Return type : filehandle
  Exceptions  : thrown on missing filename
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_filehandle {
  # Open "$basedir[/$path_append]/$filename" in the requested mode and
  # return the lexical filehandle. Throws if no filename is given or the
  # file cannot be opened.
  my ($self, $filename, $path_append, $mode) = @_;

  if (!defined($filename)) {
    throw("Need a filename for this filehandle.");
  }

  # start from the configured base directory, optionally descending into
  # a subdirectory
  my $dir = $self->conf->param('basedir');
  if (defined($path_append)) {
    $dir = path_append($dir, $path_append);
  }

  # default to (over)writing when no mode was requested
  $mode = '>' unless ($mode);

  my $target = "$dir/$filename";
  open(my $handle, $mode, $target) or
    throw("Unable to open $target: $!");

  return $handle;
}


=head2 file_exists

  Arg[1]      : String $filename - filename to test
  Arg[2]      : String $path_append - append subdirectory name to basedir
  Example     : unless ($object->file_exists('gene_mappings.ser', 'mapping')) {
                  $object->do_gene_mapping;
                }
  Description : Tests if a file exists and has non-zero size.
  Return type : Boolean
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub file_exists {
  # Return the size of "$basedir[/$path_append]/$filename" as reported by
  # -s, i.e. a true value only if the file exists and is non-empty.
  my ($self, $filename, $path_append) = @_;

  my $dir = $self->conf->param('basedir');
  if (defined($path_append)) {
    $dir = path_append($dir, $path_append);
  }

  my $candidate = "$dir/$filename";
  return (-s $candidate);
}


=head2 fetch_value_from_db 

  Arg[1]      : DBI::db $dbh - a DBI database handle
  Arg[2]      : String $sql - SQL statement to execute
  Example     : my $num_genes = $object->fetch_value_from_db($dbh,
                  'SELECT count(*) FROM gene');
  Description : Executes an SQL statement on a db handle and returns the first
                column of the first row returned. Useful for queries returning a
                single value, like table counts.
  Return type : Return type of SQL statement
  Exceptions  : thrown on wrong or missing arguments
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub fetch_value_from_db {
  # Run $sql on $dbh and return the first column of the first row
  # (undef if the query yields no rows). Throws when the handle or the
  # SQL statement is missing.
  my ($self, $dbh, $sql) = @_;

  if (!$dbh or !$dbh->isa('DBI::db')) {
    throw("Need a db handle.");
  }
  if (!$sql) {
    throw("Need an SQL query to execute.");
  }

  my $sth = $dbh->prepare($sql);
  $sth->execute;

  # only the first row is of interest
  my @first_row = $sth->fetchrow_array;

  return $first_row[0];
}


=head2 dump_table_to_file 

  Arg[1]      : String $dbtype - db type (source|target)
  Arg[2]      : String $table - name of table to dump
  Arg[3]      : String $filename - name of dump file
  Arg[4]      : Boolean $check_existing - turn on test for existing dump
  Example     : my $rows_dumped = $object->dump_table_to_file('source',
                  'stable_id_event', 'stable_id_event_existing.txt');
  Description : Dumps the contents of a db table to a tab-delimited file. The
                dump file will be written to a subdirectory called 'tables'
                under the basedir from your configuration.
  Return type : Int - the number of rows dumped
  Exceptions  : thrown on wrong or missing arguments
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub dump_table_to_file {
  # Dump every row of $table from the source or target db into the
  # tab-delimited file "tables/$filename" under the configured basedir.
  # Returns the number of rows written, or 0 if $check_existing is set
  # and a non-empty dump file is already present. Throws on bad
  # arguments and on any failure to write or close the dump file.
  my $self = shift;
  my $dbtype = shift;
  my $table = shift;
  my $filename = shift;
  my $check_existing = shift;

  # argument check
  unless (($dbtype eq 'source') or ($dbtype eq 'target')) {
    throw("Missing or unknown db type: $dbtype.");
  }
  throw("Need a table name.") unless ($table);
  throw("Need a filename.") unless ($filename);

  # conditionally check if table was already dumped
  if ($check_existing and $self->file_exists($filename, 'tables')) {
    $self->logger->info("$filename exists, won't dump again.\n");
    return 0;
  }
  
  my $fh = $self->get_filehandle($filename, 'tables');

  my $dba = $self->cache->get_DBAdaptor($dbtype);
  my $dbh = $dba->dbc->db_handle;
  my $sth = $dbh->prepare("SELECT * FROM $table");
  $sth->execute;

  my $i = 0;

  while (my @row = $sth->fetchrow_array) {
    $i++;

    # use '\N' for NULL values
    my @fields = map { defined($_) ? $_ : '\N' } @row;

    # a failed write must not be mistaken for a complete dump
    print {$fh} join("\t", @fields), "\n" or
      throw("Unable to write to dump file $filename: $!");
  }

  $sth->finish;

  # buffered write errors (e.g. disk full) only surface when closing
  close($fh) or throw("Unable to close dump file $filename: $!");
  
  return $i;
}


=head2 upload_file_into_table

  Arg[1]      : String $dbtype - db type (source|target)
  Arg[2]      : String $table - name of table to upload the data to
  Arg[3]      : String $filename - name of dump file
  Arg[4]      : Boolean $no_check_empty - don't check if table is empty
  Example     : my $rows_uploaded = $object->upload_file_into_table('target',
                  'stable_id_event', 'stable_id_event_new.txt');
  Description : Uploads a tab-delimited data file into a db table. The data file
                will be taken from a subdirectory 'tables' under your configured
                basedir. If the db table isn't empty and $no_check_empty isn't
                set, no data is uploaded (and a warning is issued).
  Return type : Int - the number of rows uploaded
  Exceptions  : thrown on wrong or missing arguments
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub upload_file_into_table {
  # Upload the tab-delimited file "tables/$filename" (under the configured
  # basedir) into $table of the source or target db.
  #
  # Table names of the form "<object>_stable_id" are handled specially:
  # the data is loaded into a temporary table and then used to update the
  # stable ID columns of the "<object>" table.
  #
  # Returns the number of rows uploaded (for stable ID tables: the number
  # of rows updated in the main table), 0 if nothing was uploaded, and
  # nothing at all when running with the 'dry_run' configuration parameter.
  # Throws on wrong or missing arguments.
  my $self           = shift;
  my $dbtype         = shift;
  my $table          = shift;
  my $filename       = shift;
  my $no_check_empty = shift;

  # argument check
  unless ( ( $dbtype eq 'source' ) or ( $dbtype eq 'target' ) ) {
    throw("Missing or unknown db type: $dbtype.");
  }
  throw("Need a table name.") unless ($table);
  throw("Need a filename.")   unless ($filename);

  # sanity check for dry run
  if ( $self->conf->param('dry_run') ) {
    $self->logger->warning(
                       "dry_run - skipping db upload for $filename.\n");
    return;
  }

  my $file =
    join( '/', $self->conf->param('basedir'), 'tables', $filename );
  my $r = 0;

  # DBI's do() returns the affected row count, "0E0" for zero rows, -1
  # when the count is unknown and undef on error; fold all of these into
  # a plain non-negative integer.
  my $to_row_count = sub {
    my $affected = shift;
    return ( defined($affected) and $affected > 0 ) ? $affected + 0 : 0;
  };

  if ( -s $file ) {

    $self->logger->debug( "$file -> $table\n", 1 );

    my $dba = $self->cache->get_DBAdaptor($dbtype);
    my $dbh = $dba->dbc->db_handle;

    my $idtable = 0;
    if ( $table =~ /^([^_]+)_stable_id/ ) {
      # This is a stable_id table we're working with; from here on $table
      # names the main object table the stable IDs belong to.
      $idtable = 1;
      $table   = $1;
    }

    # check table is empty
    my ( $sql, $sth );
    unless ($no_check_empty) {
      if ($idtable) {
        $sql =
          qq(SELECT count(*) FROM $table WHERE stable_id IS NOT NULL);
      }
      else {
        $sql = qq(SELECT count(*) FROM $table);
      }
      $sth = $dbh->prepare($sql);
      $sth->execute;
      my ($c) = $sth->fetchrow_array;
      $sth->finish;

      if ( $c > 0 ) {
        if ($idtable) {
          $self->logger->warning(
                               "Table $table contains $c stable IDs.\n",
                               1 );
        }
        else {
          $self->logger->warning(
                          "Table $table not empty: found $c entries.\n",
                          1 );
        }
        $self->logger->info( "Data not uploaded!\n", 1 );
        return $r;
      }
    } ## end unless ($no_check_empty)

    # now upload the data
    if ($idtable) {
      # Create a temporary table (named after the process ID), upload the
      # data into it, and then update the main table.
      $dbh->do(
        qq( CREATE TABLE stable_id_$$ (  object_id INTEGER UNSIGNED,
                                             stable_id VARCHAR(255),
                                             version SMALLINT UNSIGNED,
                                             created_date DATETIME,
                                             modified_date DATETIME,
                                             PRIMARY KEY(object_id) ) )
      );

      $dbh->do(
            qq(LOAD DATA LOCAL INFILE '$file' INTO TABLE stable_id_$$));

      # the rows changed in the main table are what was actually uploaded
      my $updated = $dbh->do(
        qq(
      UPDATE $table, stable_id_$$
      SET $table.stable_id=stable_id_$$.stable_id,
          $table.version=stable_id_$$.version,
          $table.created_date=stable_id_$$.created_date,
          $table.modified_date=stable_id_$$.modified_date
      WHERE $table.${table}_id = stable_id_$$.object_id )
      );
      $r = $to_row_count->($updated);

      $dbh->do(qq(DROP TABLE stable_id_$$));
    } ## end if ($idtable)
    else {
      my $loaded =
        $dbh->do(qq(LOAD DATA LOCAL INFILE '$file' INTO TABLE $table));
      $r = $to_row_count->($loaded);
    }
    $dbh->do(qq(OPTIMIZE TABLE $table));

  } ## end if ( -s $file )
  else {
    $self->logger->warning( "No data found in file $filename.\n", 1 );
  }

  return $r;
} ## end sub upload_file_into_table


=head2 logger

  Arg[1]      : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set
  Example     : $object->logger->info("Starting ID mapping.\n");
  Description : Getter/setter for logger object
  Return type : Bio::EnsEMBL::Utils::Logger
  Exceptions  : none
  Caller      : constructor
  Status      : At Risk
              : under development

=cut

sub logger {
  # Combined getter/setter: stores the first argument (if any was passed,
  # even undef) under '_logger' and returns the stored value.
  my ($self, @new_value) = @_;

  if (@new_value) {
    $self->{'_logger'} = $new_value[0];
  }

  return $self->{'_logger'};
}


=head2 conf

  Arg[1]      : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration
                to set
  Example     : my $basedir = $object->conf->param('basedir');
  Description : Getter/setter for configuration object
  Return type : Bio::EnsEMBL::Utils::ConfParser
  Exceptions  : none
  Caller      : constructor
  Status      : At Risk
              : under development

=cut

sub conf {
  # Combined getter/setter: stores the first argument (if any was passed,
  # even undef) under '_conf' and returns the stored value.
  my ($self, @new_value) = @_;

  if (@new_value) {
    $self->{'_conf'} = $new_value[0];
  }

  return $self->{'_conf'};
}


=head2 cache

  Arg[1]      : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set
  Example     : $object->cache->read_from_file('source');
  Description : Getter/setter for cache object
  Return type : Bio::EnsEMBL::IdMapping::Cache
  Exceptions  : none
  Caller      : constructor
  Status      : At Risk
              : under development

=cut

sub cache {
  # Combined getter/setter: stores the first argument (if any was passed,
  # even undef) under '_cache' and returns the stored value.
  my ($self, @new_value) = @_;

  if (@new_value) {
    $self->{'_cache'} = $new_value[0];
  }

  return $self->{'_cache'};
}


1;