Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/BaseObject.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/BaseObject.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,478 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::BaseObject - base object for IdMapping objects + +=head1 SYNOPSIS + + # this object isn't instantiated directly but rather extended + use Bio::EnsEMBL::IdMapping::BaseObject; + our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +=head1 DESCRIPTION + +This is the base object for some of the objects used in the IdMapping +application. An object that extends BaseObject will have a ConfParser, +Logger and Cache object. BaseObject also implements some useful utility +functions related to file and db access. + +This isn't very clean OO design but it's efficient and easy to use... + +=head1 METHODS + + new + get_filehandle + file_exists + fetch_value_from_db + dump_table_to_file + upload_file_into_table + logger + conf + cache + +=cut + + +package Bio::EnsEMBL::IdMapping::BaseObject; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); + + +=head2 new + + Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object + Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object + Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object + Example : my $object = Bio::EnsEMBL::IdMapping::BaseObjectSubclass->new( + -LOGGER => $logger, + -CONF => $conf, + -CACHE => $cache + ); + Description : Constructor + Return type : implementing subclass type + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_); + + unless ($logger and ref($logger) and + $logger->isa('Bio::EnsEMBL::Utils::Logger')) { + throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging."); + } + + unless ($conf and ref($conf) and + $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) { + throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object."); + } + + unless ($cache and ref($cache) and + $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) { + throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object."); + } + + my $self = {}; + bless ($self, $class); + + # initialise + $self->logger($logger); + $self->conf($conf); + $self->cache($cache); + + return $self; +} + + +=head2 get_filehandle + + Arg[1] : String $filename - filename for filehandle + Arg[2] : String $path_append - append subdirectory name to basedir + Arg[3] : String $mode - filehandle mode (<|>|>>) + Example : my $fh = $object->get_filehandle('mapping_stats.txt', 'stats', + '>'); + print $fh "Stats:\n"; + Description : Returns a filehandle to a file for reading or writing. The file + is qualified with the basedir defined in the configuration and + an optional subdirectory name. + Return type : filehandle + Exceptions : thrown on missing filename + Caller : general + Status : At Risk + : under development + +=cut + +sub get_filehandle { + my $self = shift; + my $filename = shift; + my $path_append = shift; + my $mode = shift; + + throw("Need a filename for this filehandle.") unless (defined($filename)); + + my $path = $self->conf->param('basedir'); + $path = path_append($path, $path_append) if (defined($path_append)); + + $mode ||= '>'; + + open(my $fh, $mode, "$path/$filename") or + throw("Unable to open $path/$filename: $!"); + + return $fh; +} + + +=head2 file_exists + + Arg[1] : String $filename - filename to test + Arg[2] : Boolean $path_append - turn on pre-pending of basedir + Example : unless ($object->file_exists('gene_mappings.ser', 1)) { + $object->do_gene_mapping; + } + Description : Tests if a file exists and has non-zero size. + Return type : Boolean + Exceptions : none + Caller : general + Status : At Risk + : under development + +=cut + +sub file_exists { + my $self = shift; + my $filename = shift; + my $path_append = shift; + + my $path = $self->conf->param('basedir'); + $path = path_append($path, $path_append) if (defined($path_append)); + + return (-s "$path/$filename"); +} + + +=head2 fetch_value_from_db + + Arg[1] : DBI::db $dbh - a DBI database handle + Arg[2] : String $sql - SQL statement to execute + Example : my $num_genes = $object->fetch_value_from_db($dbh, + 'SELECT count(*) FROM gene'); + Description : Executes an SQL statement on a db handle and returns the first + column of the first row returned. Useful for queries returning a + single value, like table counts. + Return type : Return type of SQL statement + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub fetch_value_from_db { + my $self = shift; + my $dbh = shift; + my $sql = shift; + + throw("Need a db handle.") unless ($dbh and $dbh->isa('DBI::db')); + throw("Need an SQL query to execute.") unless ($sql); + + my $sth = $dbh->prepare($sql); + $sth->execute; + my ($retval) = $sth->fetchrow_array; + + return $retval; +} + + +=head2 dump_table_to_file + + Arg[1] : String $dbtype - db type (source|target) + Arg[2] : String $table - name of table to dump + Arg[3] : String $filename - name of dump file + Arg[4] : Boolean $check_existing - turn on test for existing dump + Example : my $rows_dumped = $object->dump_table_to_file('source', + 'stable_id_event', 'stable_id_event_existing.txt'); + Description : Dumps the contents of a db table to a tab-delimited file. The + dump file will be written to a subdirectory called 'tables' + under the basedir from your configuration. + Return type : Int - the number of rows dumped + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub dump_table_to_file { + my $self = shift; + my $dbtype = shift; + my $table = shift; + my $filename = shift; + my $check_existing = shift; + + # argument check + unless (($dbtype eq 'source') or ($dbtype eq 'target')) { + throw("Missing or unknown db type: $dbtype."); + } + throw("Need a table name.") unless ($table); + throw("Need a filename.") unless ($filename); + + # conditionally check if table was already dumped + if ($check_existing and $self->file_exists($filename, 'tables')) { + $self->logger->info("$filename exists, won't dump again.\n"); + return 0; + } + + my $fh = $self->get_filehandle($filename, 'tables'); + + my $dba = $self->cache->get_DBAdaptor($dbtype); + my $dbh = $dba->dbc->db_handle; + my $sth = $dbh->prepare("SELECT * FROM $table"); + $sth->execute; + + my $i = 0; + + while (my @row = $sth->fetchrow_array) { + $i++; + + # use '\N' for NULL values + for (my $j = 0; $j < scalar(@row); $j++) { + $row[$j] = '\N' unless (defined($row[$j])); + } + + print $fh join("\t", @row); + print $fh "\n"; + } + + $sth->finish; + + return $i; +} + + +=head2 upload_file_into_table + + Arg[1] : String $dbtype - db type (source|target) + Arg[2] : String $table - name of table to upload the data to + Arg[3] : String $filename - name of dump file + Arg[4] : Boolean $no_check_empty - don't check if table is empty + Example : my $rows_uploaded = $object->upload_file_into_table('target', + 'stable_id_event', 'stable_id_event_new.txt'); + Description : Uploads a tab-delimited data file into a db table. The data file + will be taken from a subdirectory 'tables' under your configured + basedir. If the db table isn't empty and $no_check_empty isn't + set, no data is uploaded (and a warning is issued). + Return type : Int - the number of rows uploaded + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub upload_file_into_table { + my $self = shift; + my $dbtype = shift; + my $table = shift; + my $filename = shift; + my $no_check_empty = shift; + + # argument check + unless ( ( $dbtype eq 'source' ) or ( $dbtype eq 'target' ) ) { + throw("Missing or unknown db type: $dbtype."); + } + throw("Need a table name.") unless ($table); + throw("Need a filename.") unless ($filename); + + # sanity check for dry run + if ( $self->conf->param('dry_run') ) { + $self->logger->warning( + "dry_run - skipping db upload for $filename.\n"); + return; + } + + my $file = + join( '/', $self->conf->param('basedir'), 'tables', $filename ); + my $r = 0; + + if ( -s $file ) { + + $self->logger->debug( "$file -> $table\n", 1 ); + + my $dba = $self->cache->get_DBAdaptor($dbtype); + my $dbh = $dba->dbc->db_handle; + + my $idtable = 0; + if ( $table =~ /^([^_]+)_stable_id/ ) { + # This is a stable_id table we're working with. + $idtable = 1; + $table = $1; + } + + # check table is empty + my ( $sql, $sth ); + unless ($no_check_empty) { + if ($idtable) { + $sql = + qq(SELECT count(*) FROM $table WHERE stable_id IS NOT NULL); + } + else { + $sql = qq(SELECT count(*) FROM $table); + } + $sth = $dbh->prepare($sql); + $sth->execute; + my ($c) = $sth->fetchrow_array; + $sth->finish; + + if ( $c > 0 ) { + if ($idtable) { + $self->logger->warning( + "Table $table contains $c stable IDs.\n", + 1 ); + } + else { + $self->logger->warning( + "Table $table not empty: found $c entries.\n", + 1 ); + } + $self->logger->info( "Data not uploaded!\n", 1 ); + return $r; + } + } ## end unless ($no_check_empty) + + # now upload the data + if ($idtable) { + # Create a temporary table, upload the data into it, and then + # update the main table. + $dbh->do( + qq( CREATE TABLE stable_id_$$ ( object_id INTEGER UNSIGNED, + stable_id VARCHAR(255), + version SMALLINT UNSIGNED, + created_date DATETIME, + modified_date DATETIME, + PRIMARY KEY(object_id) ) ) + ); + + $dbh->do( + qq(LOAD DATA LOCAL INFILE '$file' INTO TABLE stable_id_$$)); + + $dbh->do( + qq( + UPDATE $table, stable_id_$$ + SET $table.stable_id=stable_id_$$.stable_id, + $table.version=stable_id_$$.version, + $table.created_date=stable_id_$$.created_date, + $table.modified_date=stable_id_$$.modified_date + WHERE $table.${table}_id = stable_id_$$.object_id ) + ); + + $dbh->do(qq(DROP TABLE stable_id_$$)); + } ## end if ($idtable) + else { + $dbh->do(qq(LOAD DATA LOCAL INFILE '$file' INTO TABLE $table)); + } + $dbh->do(qq(OPTIMIZE TABLE $table)); + + } ## end if ( -s $file ) + else { + $self->logger->warning( "No data found in file $filename.\n", 1 ); + } + + return $r; +} ## end sub upload_file_into_table + + +=head2 logger + + Arg[1] : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set + Example : $object->logger->info("Starting ID mapping.\n"); + Description : Getter/setter for logger object + Return type : Bio::EnsEMBL::Utils::Logger + Exceptions : none + Caller : constructor + Status : At Risk + : under development + +=cut + +sub logger { + my $self = shift; + $self->{'_logger'} = shift if (@_); + return $self->{'_logger'}; +} + + +=head2 conf + + Arg[1] : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration + to set + Example : my $basedir = $object->conf->param('basedir'); + Description : Getter/setter for configuration object + Return type : Bio::EnsEMBL::Utils::ConfParser + Exceptions : none + Caller : constructor + Status : At Risk + : under development + +=cut + +sub conf { + my $self = shift; + $self->{'_conf'} = shift if (@_); + return $self->{'_conf'}; +} + + +=head2 cache + + Arg[1] : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set + Example : $object->cache->read_from_file('source'); + Description : Getter/setter for cache object + Return type : Bio::EnsEMBL::IdMapping::Cache + Exceptions : none + Caller : constructor + Status : At Risk + : under development + +=cut + +sub cache { + my $self = shift; + $self->{'_cache'} = shift if (@_); + return $self->{'_cache'}; +} + + +1; +