Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/Cache.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/Cache.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1356 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::Cache - a cache to hold data objects used by the +IdMapping application + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::IdMapping::Cache; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append); +use Bio::EnsEMBL::Utils::Scalar qw(assert_ref); +use Bio::EnsEMBL::IdMapping::TinyGene; +use Bio::EnsEMBL::IdMapping::TinyTranscript; +use Bio::EnsEMBL::IdMapping::TinyTranslation; +use Bio::EnsEMBL::IdMapping::TinyExon; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Storable qw(nstore retrieve); +use Digest::MD5 qw(md5_hex); + +# define available cache names here +my @cache_names = qw( + exons_by_id + transcripts_by_id + transcripts_by_exon_id + translations_by_id + genes_by_id + genes_by_transcript_id +); + + +=head2 new + + Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object + Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object + Example : my $cache = Bio::EnsEMBL::IdMapping::Cache->new( + -LOGGER => $logger, + -CONF => $conf, + ); + Description : constructor + Return type : Bio::EnsEMBL::IdMapping::Cache object + Exceptions : thrown on wrong or missing arguments + Caller : general + Status : At Risk + : under development + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my ($logger, $conf, $load_instance) = + rearrange(['LOGGER', 'CONF', 'LOAD_INSTANCE'], @_); + + unless ($logger->isa('Bio::EnsEMBL::Utils::Logger')) { + throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging."); + } + + unless ($conf->isa('Bio::EnsEMBL::Utils::ConfParser')) { + throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object."); + } + + my $self = {}; + bless ($self, $class); + + # initialise + $self->logger($logger); + $self->conf($conf); + + if ($load_instance) { + $self->read_instance_from_file; + } + + return $self; +} + + +=head2 build_cache_by_slice + + Arg[1] : String $dbtype - db type (source|target) + Arg[2] : String $slice_name - the name of a slice (format as returned by + Bio::EnsEMBL::Slice->name) + Example : my ($num_genes, $filesize) = $cache->build_cache_by_slice( + 'source', 'chromosome:NCBI36:X:1:1000000:-1'); + Description : Builds a cache of genes, transcripts, translations and exons + needed by the IdMapping application and serialises the resulting + cache object to a file, one slice at a time. + Return type : list of the number of genes processed and the size of the + serialised cache file + Exceptions : thrown on invalid slice name + Caller : general + Status : At Risk + : under development + +=cut + +sub build_cache_by_slice { + my $self = shift; + my $dbtype = shift; + my $slice_name = shift; + + # set cache method (required for loading cache later) + $self->cache_method('BY_SEQ_REGION'); + + my $dba = $self->get_DBAdaptor($dbtype); + my $sa = $dba->get_SliceAdaptor; + + my $slice = $sa->fetch_by_name($slice_name); + unless ($slice) { + throw("Could not retrieve slice $slice_name."); + } + + my $genes = $slice->get_all_Genes( undef, undef, 1 ); + + # find common coord_system + my $common_cs_found = $self->find_common_coord_systems; + + # find out whether native coord_system is a common coord_system. + # if so, you don't need to project. + # also don't project if no common coord_system present + my $need_project = 1; + + my $csid = join( ':', + $slice->coord_system_name, + $slice->coord_system->version ); + + if ( $self->is_common_cs($csid) or !$self->highest_common_cs ) { + $need_project = 0; + } + + # build cache + my $type = "$dbtype.$slice_name"; + my $num_genes = + $self->build_cache_from_genes( $type, $genes, $need_project ); + undef $genes; + + # write cache to file, then flush cache to reclaim memory + my $size = $self->write_all_to_file($type); + + return $num_genes, $size; +} ## end sub build_cache_by_slice + + +=head2 build_cache_all + + Arg[1] : String $dbtype - db type (source|target) + Example : my ($num_genes, $filesize) = $cache->build_cache_all('source'); + Description : Builds a cache of genes, transcripts, translations and exons + needed by the IdMapping application and serialises the + resulting cache object to a file. All genes across the genome + are processed in one go. This method should be used when + build_cache_by_seq_region can't be used due to a large number + of toplevel seq_regions (e.g. 2x genomes). + Return type : list of the number of genes processed and the size of the + serialised cache file + Exceptions : thrown on invalid slice name + Caller : general + Status : At Risk + : under development + +=cut + +sub build_cache_all { + my $self = shift; + my $dbtype = shift; + + # set cache method (required for loading cache later) + $self->cache_method('ALL'); + + my $dba = $self->get_DBAdaptor($dbtype); + my $ga = $dba->get_GeneAdaptor; + + my $genes = $ga->fetch_all; + + # find common coord_system + my $common_cs_found = $self->find_common_coord_systems; + + # Build cache. Setting $need_project to 'CHECK' will cause + # build_cache_from_genes() to check the coordinate system for each + # gene. + my $type = "$dbtype.ALL"; + my $need_project = 'CHECK'; + my $num_genes = + $self->build_cache_from_genes( $type, $genes, $need_project ); + + undef $genes; + + # write cache to file, then flush cache to reclaim memory + my $size = $self->write_all_to_file($type); + + return $num_genes, $size; +} + + +=head2 build_cache_from_genes + + Arg[1] : String $type - cache type + Arg[2] : Listref of Bio::EnsEMBL::Genes $genes - genes to build cache + from + Arg[3] : Boolean $need_project - indicate if we need to project exons to + common coordinate system + Example : $cache->build_cache_from_genes( + 'source.chromosome:NCBI36:X:1:100000:1', \@genes); + Description : Builds the cache by fetching transcripts, translations and exons + for a list of genes from the database, and creating lightweight + Bio::EnsEMBL::IdMapping::TinyFeature objects containing only the + data needed by the IdMapping application. These objects are + attached to a name cache in this cache object. Exons only need + to be projected to a commond coordinate system if their native + coordinate system isn't common to source and target assembly + itself. + Return type : int - number of genes after filtering + Exceptions : thrown on wrong or missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub build_cache_from_genes { + my $self = shift; + my $type = shift; + my $genes = shift; + my $need_project = shift; + + throw("You must provide a type.") unless $type; + throw("You must provide a listref of genes.") + unless ( ref($genes) eq 'ARRAY' ); + + # biotype filter + if ( $self->conf()->param('biotypes') || + $self->conf()->param('biotypes_include') || + $self->conf()->param('biotypes_exclude') ) + { + $genes = $self->filter_biotypes($genes); + } + my $num_genes = scalar(@$genes); + + # initialise cache for the given type. + $self->{'cache'}->{$type} = {}; + + #my $i = 0; + #my $num_genes = scalar(@$genes); + #my $progress_id = $self->logger->init_progress($num_genes); + + # loop over genes sorted by gene location. + # the sort will hopefully improve assembly mapper cache performance and + # therefore speed up exon sequence retrieval + foreach my $gene ( sort { $a->start <=> $b->start } @$genes ) { + #$self->logger->log_progressbar($progress_id, ++$i, 2); + #$self->logger->log_progress($num_genes, ++$i, 20, 3, 1); + + if ( $need_project eq 'CHECK' ) { + # find out whether native coord_system is a common coord_system. + # if so, you don't need to project. + # also don't project if no common coord_system present + if ( $self->highest_common_cs ) { + my $csid = join( ':', + $gene->slice->coord_system_name, + $gene->slice->coord_system->version ); + if ( $self->is_common_cs($csid) ) { + $need_project = 0; + } + } + else { + $need_project = 0; + } + } + + # create lightweigt gene + my $lgene = + Bio::EnsEMBL::IdMapping::TinyGene->new_fast( [ + $gene->dbID, $gene->stable_id, + $gene->version, $gene->created_date, + $gene->modified_date, $gene->start, + $gene->end, $gene->strand, + $gene->slice->seq_region_name, $gene->biotype, + $gene->status, $gene->analysis->logic_name, + ( $gene->is_known ? 1 : 0 ), ] ); + + # build gene caches + $self->add( 'genes_by_id', $type, $gene->dbID, $lgene ); + + # transcripts + foreach my $tr ( @{ $gene->get_all_Transcripts } ) { + my $ltr = + Bio::EnsEMBL::IdMapping::TinyTranscript->new_fast( [ + $tr->dbID, $tr->stable_id, + $tr->version, $tr->created_date, + $tr->modified_date, $tr->start, + $tr->end, $tr->strand, + $tr->length, md5_hex( $tr->spliced_seq ), + ( $tr->is_known ? 1 : 0 ) ] ); + + $ltr->biotype( $tr->biotype() ); + $lgene->add_Transcript($ltr); + + # build transcript caches + $self->add( 'transcripts_by_id', $type, $tr->dbID, $ltr ); + $self->add( 'genes_by_transcript_id', $type, $tr->dbID, $lgene ); + + # translation (if there is one) + if ( my $tl = $tr->translation ) { + my $ltl = + Bio::EnsEMBL::IdMapping::TinyTranslation->new_fast( [ + $tl->dbID, $tl->stable_id, + $tl->version, $tl->created_date, + $tl->modified_date, $tr->dbID, + $tr->translate->seq, ( $tr->is_known ? 1 : 0 ), + ] ); + + $ltr->add_Translation($ltl); + + $self->add( 'translations_by_id', $type, $tl->dbID, $ltl ); + + undef $tl; + } + + # exons + foreach my $exon ( @{ $tr->get_all_Exons } ) { + my $lexon = + Bio::EnsEMBL::IdMapping::TinyExon->new_fast( [ + $exon->dbID, + $exon->stable_id, + $exon->version, + $exon->created_date, + $exon->modified_date, + $exon->start, + $exon->end, + $exon->strand, + $exon->slice->seq_region_name, + $exon->slice->coord_system_name, + $exon->slice->coord_system->version, + $exon->slice->subseq( $exon->start, $exon->end, + $exon->strand ), + $exon->phase, + $need_project, ] ); + + # get coordinates in common coordinate system if needed + if ($need_project) { + my @seg = @{ + $exon->project( $self->highest_common_cs, + $self->highest_common_cs_version ) }; + + if ( scalar(@seg) == 1 ) { + my $sl = $seg[0]->to_Slice; + $lexon->common_start( $sl->start ); + $lexon->common_end( $sl->end ); + $lexon->common_strand( $sl->strand ); + $lexon->common_sr_name( $sl->seq_region_name ); + } + } + + $ltr->add_Exon($lexon); + + $self->add( 'exons_by_id', $type, $exon->dbID, $lexon ); + $self->add_list( 'transcripts_by_exon_id', + $type, $exon->dbID, $ltr ); + + undef $exon; + } ## end foreach my $exon ( @{ $tr->get_all_Exons...}) + + undef $tr; + } ## end foreach my $tr ( @{ $gene->get_all_Transcripts...}) + + undef $gene; + } ## end foreach my $gene ( sort { $a...}) + + return $num_genes; +} ## end sub build_cache_from_genes + + +=head2 filter_biotypes + + Arg[1] : Listref of Bio::EnsEMBL::Genes $genes - the genes to filter + Example : my @filtered = @{ $cache->filter_biotypes(\@genes) }; + + Description : Filters a list of genes by biotype. Biotypes are + taken from the IdMapping configuration parameter + 'biotypes_include' or 'biotypes_exclude'. + + If the configuration parameter 'biotypes_exclude' is + defined, then rather than returning the genes whose + biotype is listed in the configuration parameter + 'biotypes_include' the method will return the genes + whose biotype is *not* listed in the 'biotypes_exclude' + configuration parameter. + + It is an error to define both these configuration + parameters. + + The old parameter 'biotypes' is equivalent to + 'biotypes_include'. + + Return type : Listref of Bio::EnsEMBL::Genes (or empty list) + Exceptions : none + Caller : internal + Status : At Risk + : under development + +=cut + +sub filter_biotypes { + my ( $self, $genes ) = @_; + + my @filtered; + my @biotypes; + my $opt_reverse; + + if ( defined( $self->conf()->param('biotypes_include') ) || + defined( $self->conf()->param('biotypes') ) ) + { + if ( defined( $self->conf()->param('biotypes_exclude') ) ) { + $self->logger() + ->error( "You may not use both " . + "'biotypes_include' and 'biotypes_exclude' " . + "in the configuration" ); + } + + if ( defined( $self->conf()->param('biotypes_include') ) ) { + @biotypes = $self->conf()->param('biotypes_include'); + } + else { + @biotypes = $self->conf()->param('biotypes'); + } + $opt_reverse = 0; + } + else { + @biotypes = $self->conf()->param('biotypes_exclude'); + $opt_reverse = 1; + } + + foreach my $gene ( @{$genes} ) { + my $keep_gene; + + foreach my $biotype (@biotypes) { + if ( $gene->biotype() eq $biotype ) { + if ($opt_reverse) { $keep_gene = 0 } + else { $keep_gene = 1 } + last; + } + } + + if ( defined($keep_gene) ) { + if ($keep_gene) { + push( @filtered, $gene ); + } + } + elsif ($opt_reverse) { + push( @filtered, $gene ); + } + } + + return \@filtered; +} ## end sub filter_biotypes + + +=head2 add + + Arg[1] : String $name - a cache name (e.g. 'genes_by_id') + Arg[2] : String type - a cache type (e.g. "source.$slice_name") + Arg[3] : String $key - key of this entry (e.g. a gene dbID) + Arg[4] : Bio::EnsEMBL::IdMappping::TinyFeature $val - value to cache + Example : $cache->add('genes_by_id', + 'source.chromosome:NCBI36:X:1:1000000:1', '1234', $tiny_gene); + Description : Adds a TinyFeature object to a named cache. + Return type : Bio::EnsEMBL::IdMapping::TinyFeature + Exceptions : thrown on wrong or missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub add { + my $self = shift; + my $name = shift; + my $type = shift; + my $key = shift; + my $val = shift; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + throw("You must provide a cache key (e.g. a gene dbID).") unless $key; + + $self->{'cache'}->{$type}->{$name}->{$key} = $val; + + return $self->{'cache'}->{$type}->{$name}->{$key}; +} + +=head2 add_list + + Arg[1] : String $name - a cache name (e.g. 'genes_by_id') + Arg[2] : String type - a cache type (e.g. "source.$slice_name") + Arg[3] : String $key - key of this entry (e.g. a gene dbID) + Arg[4] : List of Bio::EnsEMBL::IdMappping::TinyFeature @val - values + to cache + Example : $cache->add_list('transcripts_by_exon_id', + 'source.chromosome:NCBI36:X:1:1000000:1', '1234', + $tiny_transcript1, $tiny_transcript2); + Description : Adds a list of TinyFeature objects to a named cache. + Return type : Listref of Bio::EnsEMBL::IdMapping::TinyFeature objects + Exceptions : thrown on wrong or missing arguments + Caller : internal + Status : At Risk + : under development + +=cut + +sub add_list { + my $self = shift; + my $name = shift; + my $type = shift; + my $key = shift; + my @vals = @_; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + throw("You must provide a cache key (e.g. a gene dbID).") unless $key; + + push @{ $self->{'cache'}->{$type}->{$name}->{$key} }, @vals; + + return $self->{'cache'}->{$type}->{$name}->{$key}; +} + +sub get_by_key { + my $self = shift; + my $name = shift; + my $type = shift; + my $key = shift; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + throw("You must provide a cache key (e.g. a gene dbID).") unless $key; + + # transparently load cache from file unless already loaded + unless ($self->{'instance'}->{'loaded'}->{"$type"}) { + $self->read_and_merge($type); + } + + return $self->{'cache'}->{$type}->{$name}->{$key}; +} + +sub get_by_name { + my $self = shift; + my $name = shift; + my $type = shift; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + + # transparently load cache from file unless already loaded + unless ($self->{'instance'}->{'loaded'}->{$type}) { + $self->read_and_merge($type); + } + + return $self->{'cache'}->{$type}->{$name} || {}; +} + + +sub get_count_by_name { + my $self = shift; + my $name = shift; + my $type = shift; + + throw("You must provide a cache name (e.g. genes_by_id.") unless $name; + throw("You must provide a cache type.") unless $type; + + # transparently load cache from file unless already loaded + unless ($self->{'instance'}->{'loaded'}->{$type}) { + $self->read_and_merge($type); + } + + return scalar(keys %{ $self->get_by_name($name, $type) }); +} + + +sub find_common_coord_systems { + my $self = shift; + + # get adaptors for source db + my $s_dba = $self->get_DBAdaptor('source'); + my $s_csa = $s_dba->get_CoordSystemAdaptor; + my $s_sa = $s_dba->get_SliceAdaptor; + + # get adaptors for target db + my $t_dba = $self->get_DBAdaptor('target'); + my $t_csa = $t_dba->get_CoordSystemAdaptor; + my $t_sa = $t_dba->get_SliceAdaptor; + + # find common coord_systems + my @s_coord_systems = @{ $s_csa->fetch_all }; + my @t_coord_systems = @{ $t_csa->fetch_all }; + my $found_highest = 0; + +SOURCE: + foreach my $s_cs (@s_coord_systems) { + if ( !$s_cs->is_default() ) { next SOURCE } + + TARGET: + foreach my $t_cs (@t_coord_systems) { + if ( !$t_cs->is_default() ) { next TARGET } + + if ( $s_cs->name eq $t_cs->name ) { + + # test for identical coord_system version + if ( $s_cs->version and ( $s_cs->version ne $t_cs->version ) ) { + next TARGET; + } + + # test for at least 50% identical seq_regions + if ( $self->seq_regions_compatible( $s_cs, $s_sa, $t_sa ) ) { + $self->add_common_cs($s_cs); + + unless ($found_highest) { + $self->highest_common_cs( $s_cs->name ); + $self->highest_common_cs_version( $s_cs->version ); + } + + $found_highest = 1; + + next SOURCE; + } + } + } ## end foreach my $t_cs (@t_coord_systems) + } ## end foreach my $s_cs (@s_coord_systems) + + return $found_highest; +} ## end sub find_common_coord_systems + + +sub seq_regions_compatible { + my $self = shift; + my $cs = shift; + my $s_sa = shift; + my $t_sa = shift; + + unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) { + throw('You must provide a CoordSystem'); + } + + unless ($s_sa and $t_sa and $s_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor') + and $t_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')) { + throw('You must provide a source and target SliceAdaptor'); + } + + my %sr_match; + my $equal = 0; + + my $s_seq_regions = $s_sa->fetch_all($cs->name, $cs->version); + my $t_seq_regions = $t_sa->fetch_all($cs->name, $cs->version); + + # sanity check to prevent divison by zero + my $s_count = scalar(@$s_seq_regions); + my $t_count = scalar(@$t_seq_regions); + return(0) if ($s_count == 0 or $t_count == 0); + + foreach my $s_sr (@$s_seq_regions) { + $sr_match{$s_sr->seq_region_name} = $s_sr->length; + } + + foreach my $t_sr (@$t_seq_regions) { + if (exists($sr_match{$t_sr->seq_region_name})) { + $equal++; + + # return false if we have a region with same name but different length + return(0) unless ($sr_match{$t_sr->seq_region_name} == $t_sr->length); + } + } + + if ($equal/$s_count > 0.5 and $equal/$t_count > 0.5) { + return(1); + } else { + $self->logger->info("Only $equal seq_regions identical for ".$cs->name." ".$cs->version."\n"); + return(0); + } + +} + + +sub check_db_connection { + my $self = shift; + my $dbtype = shift; + + my $err = 0; + + eval { + my $dba = $self->get_DBAdaptor($dbtype); + $dba->dbc->connect; + }; + + if ($@) { + $self->logger->warning("Can't connect to $dbtype db: $@\n"); + $err++; + } else { + $self->logger->debug("Connection to $dbtype db ok.\n"); + $self->{'_db_conn_ok'}->{$dbtype} = 1; + } + + return $err; +} + + +sub check_db_read_permissions { + my $self = shift; + my $dbtype = shift; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ($self->{'_db_conn_ok'}->{$dbtype}); + + my $err = 0; + my %privs = %{ $self->get_db_privs($dbtype) }; + + unless ($privs{'SELECT'} or $privs{'ALL PRIVILEGES'}) { + $self->logger->warning("User doesn't have read permission on $dbtype db.\n"); + $err++; + } else { + $self->logger->debug("Read permission on $dbtype db ok.\n"); + } + + return $err; +} + + +sub check_db_write_permissions { + my $self = shift; + my $dbtype = shift; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ($self->{'_db_conn_ok'}->{$dbtype}); + + my $err = 0; + + unless ($self->do_upload) { + $self->logger->debug("No uploads, so write permission on $dbtype db not required.\n"); + return $err; + } + + my %privs = %{ $self->get_db_privs($dbtype) }; + + unless ($privs{'INSERT'} or $privs{'ALL PRIVILEGES'}) { + $self->logger->warning("User doesn't have write permission on $dbtype db.\n"); + $err++; + } else { + $self->logger->debug("Write permission on $dbtype db ok.\n"); + } + + return $err; +} + + +sub do_upload { + my $self = shift; + + if ($self->conf->param('dry_run') or + ! ($self->conf->param('upload_events') or + $self->conf->param('upload_stable_ids') or + $self->conf->param('upload_archive'))) { + return 0; + } else { + return 1; + } +} + + +sub get_db_privs { + my ( $self, $dbtype ) = @_; + + my %privs = (); + my $rs; + + # get privileges from mysql db + eval { + my $dbc = $self->get_DBAdaptor($dbtype)->dbc(); + my $sql = qq(SHOW GRANTS FOR ) . $dbc->username(); + my $sth = $dbc->prepare($sql); + $sth->execute(); + $rs = $sth->fetchall_arrayref(); + #$sth->finish(); + }; + + if ($@) { + $self->logger->warning( + "Error obtaining privileges from $dbtype db: $@\n"); + return {}; + } + + # parse the output + foreach my $r ( map { $_->[0] } @{$rs} ) { + $r =~ s/GRANT (.*) ON .*/$1/i; + foreach my $p ( split( ',', $r ) ) { + # trim leading and trailing whitespace + $p =~ s/^\s+//; + $p =~ s/\s+$//; + $privs{ uc($p) } = 1; + } + } + + return \%privs; +} ## end sub get_db_privs + + +sub check_empty_tables { + my $self = shift; + my $dbtype = shift; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ($self->{'_db_conn_ok'}->{$dbtype}); + + my $err = 0; + my $c = 0; + + if ($self->conf->param('no_check_empty_tables') or !$self->do_upload) { + $self->logger->debug("Won't check for empty stable ID and archive tables in $dbtype db.\n"); + return $err; + } + + eval { + my @tables = + qw( + gene_stable_id + transcript_stable_id + translation_stable_id + exon_stable_id + stable_id_event + mapping_session + gene_archive + peptide_archive + ); + + my $dba = $self->get_DBAdaptor($dbtype); + foreach my $table (@tables) { + if ( $table =~ /^([^_]+)_stable_id/ ) { + $table = $1; + if ( $c = + $self->fetch_value_from_db( + $dba, + "SELECT COUNT(*) FROM $table WHERE stable_id IS NOT NULL" + ) ) + { + $self->logger->warning( + "$table table in $dbtype db has $c stable IDs.\n"); + $err++; + } + } + else { + if ( $c = + $self->fetch_value_from_db( + $dba, "SELECT COUNT(*) FROM $table" + ) ) + { + $self->logger->warning( + "$table table in $dbtype db has $c entries.\n"); + $err++; + } + } + } ## end foreach my $table (@tables) + }; + + if ($@) { + $self->logger->warning( +"Error retrieving stable ID and archive table row counts from $dbtype db: $@\n" + ); + $err++; + } + elsif ( !$err ) { + $self->logger->debug( + "All stable ID and archive tables in $dbtype db are empty.\n"); + } + return $err; +} + + +sub check_sequence { + my ( $self, $dbtype ) = @_; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ( $self->{'_db_conn_ok'}->{$dbtype} ); + + my $err = 0; + my $c = 0; + + eval { + my $dba = $self->get_DBAdaptor($dbtype); + unless ( $c = + $self->fetch_value_from_db( + $dba->dnadb(), "SELECT COUNT(*) FROM dna" + ) ) + { + $err++; + } + }; + + if ($@) { + $self->logger->warning( "Error retrieving dna table row count " + . "from $dbtype database: $@\n" ); + $err++; + } elsif ($err) { + $self->logger->warning("No sequence found in $dbtype database.\n"); + } else { + $self->logger->debug( + ucfirst($dbtype) . " db has sequence ($c entries).\n" ); + } + + return $err; +} ## end sub check_sequence + + +sub check_meta_entries { + my $self = shift; + my $dbtype = shift; + + # skip this check if db connection failed (this prevents re-throwing + # exceptions). + return 1 unless ($self->{'_db_conn_ok'}->{$dbtype}); + + my $err = 0; + my $assembly_default; + my $schema_version; + + eval { + my $dba = $self->get_DBAdaptor($dbtype); + $assembly_default = $self->fetch_value_from_db($dba, + qq(SELECT meta_value FROM meta WHERE meta_key = 'assembly.default')); + $schema_version = $self->fetch_value_from_db($dba, + qq(SELECT meta_value FROM meta WHERE meta_key = 'schema_version')); + }; + + if ($@) { + $self->logger->warning("Error retrieving dna table row count from $dbtype db: $@\n"); + return ++$err; + } + + unless ($assembly_default) { + $self->logger->warning("No meta.assembly.default value found in $dbtype db.\n"); + $err++; + } else { + $self->logger->debug("meta.assembly.default value found ($assembly_default).\n"); + } + + unless ($schema_version) { + $self->logger->warning("No meta.schema_version value found in $dbtype db.\n"); + $err++; + } else { + $self->logger->debug("meta.schema_version value found ($schema_version).\n"); + } + + return $err; +} + + +sub fetch_value_from_db { + my ( $self, $dba, $sql ) = @_; + + assert_ref( $dba, 'Bio::EnsEMBL::DBSQL::DBAdaptor' ); + + if ( !defined($sql) ) { + throw("Need an SQL statement to execute.\n"); + } + + my $sth = $dba->dbc->prepare($sql); + $sth->execute(); + + my ($c) = $sth->fetchrow_array; + return $c; +} + +sub get_DBAdaptor { + my ( $self, $prefix ) = @_; + + unless ( $self->{'_dba'}->{$prefix} ) { + # connect to database + my $dba = + new Bio::EnsEMBL::DBSQL::DBAdaptor( + -host => $self->conf->param("${prefix}host"), + -port => $self->conf->param("${prefix}port"), + -user => $self->conf->param("${prefix}user"), + -pass => $self->conf->param("${prefix}pass"), + -dbname => $self->conf->param("${prefix}dbname"), + -group => $prefix, ); + + if ( !defined( $self->conf->param("${prefix}host_dna") ) ) { + # explicitely set the dnadb to itself - by default the Registry + # assumes a group 'core' for this now + $dba->dnadb($dba); + } else { + my $dna_dba = + new Bio::EnsEMBL::DBSQL::DBAdaptor( + -host => $self->conf->param("${prefix}host_dna"), + -port => $self->conf->param("${prefix}port_dna"), + -user => $self->conf->param("${prefix}user_dna"), + -pass => $self->conf->param("${prefix}pass_dna"), + -dbname => $self->conf->param("${prefix}dbname_dna"), + -group => $prefix, ); + $dba->dnadb($dna_dba); + } + + $self->{'_dba'}->{$prefix} = $dba; + } ## end unless ( $self->{'_dba'}->...) + + return $self->{'_dba'}->{$prefix}; +} ## end sub get_DBAdaptor + + +sub cache_file_exists { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + my $cache_file = $self->cache_file($type); + + if (-e $cache_file) { + $self->logger->info("Cache file found for $type.\n", 2); + $self->logger->debug("Will read from $cache_file.\n", 2); + return 1; + } else { + $self->logger->info("No cache file found for $type.\n", 2); + $self->logger->info("Will build cache from db.\n", 2); + return 0; + } +} + + +sub cache_file { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + return $self->dump_path."/$type.object_cache.ser"; +} + + +sub instance_file { + my $self = shift; + + return $self->dump_path."/cache_instance.ser"; +} + + +sub dump_path { + my $self = shift; + + $self->{'dump_path'} ||= path_append($self->conf->param('basedir'), 'cache'); + + return $self->{'dump_path'}; +} + + +sub write_all_to_file { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + my $size = 0; + $size += $self->write_to_file($type); + $size += $self->write_instance_to_file; + + return parse_bytes($size); +} + + +sub write_to_file { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + unless ($self->{'cache'}->{$type}) { + $self->logger->warning("No features found in $type. Won't write cache file.\n"); + return; + } + + my $cache_file = $self->cache_file($type); + + eval { nstore($self->{'cache'}->{$type}, $cache_file) }; + if ($@) { + throw("Unable to store $cache_file: $@\n"); + } + + my $size = -s $cache_file; + return $size; +} + + +sub write_instance_to_file { + my $self = shift; + + my $instance_file = $self->instance_file; + + eval { nstore($self->{'instance'}, $instance_file) }; + if ($@) { + throw("Unable to store $instance_file: $@\n"); + } + + my $size = -s $instance_file; + return $size; +} + + +sub read_from_file { + my $self = shift; + my $type = shift; + + throw("You must provide a cache type.") unless $type; + + my $cache_file = $self->cache_file($type); + + if (-s $cache_file) { + + #$self->logger->info("Reading cache from file...\n", 0, 'stamped'); + #$self->logger->info("Cache file $cache_file.\n", 1); + eval { $self->{'cache'}->{$type} = retrieve($cache_file); }; + if ($@) { + throw("Unable to retrieve cache: $@"); + } + #$self->logger->info("Done.\n", 0, 'stamped'); + + } else { + $self->logger->warning("Cache file $cache_file not found or empty.\n"); + } + + + return $self->{'cache'}->{$type}; +} + + +sub read_and_merge { + my $self = shift; + my $dbtype = shift; + + unless ($dbtype eq 'source' or $dbtype eq 'target') { + throw("Db type must be 'source' or 'target'."); + } + + # read cache from single or multiple files, depending on caching strategy + my $cache_method = $self->cache_method; + if ($cache_method eq 'ALL') { + $self->read_from_file("$dbtype.ALL"); + } elsif ($cache_method eq 'BY_SEQ_REGION') { + foreach my $slice_name (@{ $self->slice_names($dbtype) }) { + $self->read_from_file("$dbtype.$slice_name"); + } + } else { + throw("Unknown cache method: $cache_method."); + } + + $self->merge($dbtype); + + # flag as being loaded + $self->{'instance'}->{'loaded'}->{$dbtype} = 1; +} + + +sub merge { + my $self = shift; + my $dbtype = shift; + + unless ($dbtype eq 'source' or $dbtype eq 'target') { + throw("Db type must be 'source' or 'target'."); + } + + foreach my $type (keys %{ $self->{'cache'} || {} }) { + next unless ($type =~ /^$dbtype/); + + foreach my $name (keys %{ $self->{'cache'}->{$type} || {} }) { + + foreach my $key (keys %{ $self->{'cache'}->{$type}->{$name} || {} }) { + if (defined $self->{'cache'}->{$dbtype}->{$name}->{$key}) { + # warning("Duplicate key in cache: $name|$dbtype|$key. Skipping.\n"); + } else { + $self->{'cache'}->{$dbtype}->{$name}->{$key} = + $self->{'cache'}->{$type}->{$name}->{$key}; + } + + delete $self->{'cache'}->{$type}->{$name}->{$key}; + } + + delete $self->{'cache'}->{$type}->{$name}; + } + + delete $self->{'cache'}->{$type}; + + } +} + + +sub read_instance_from_file { + my $self = shift; + + my $instance_file = $self->instance_file; + + unless (-s $instance_file) { + throw("No valid cache instance file found at $instance_file."); + } + + eval { $self->{'instance'} = retrieve($instance_file); }; + if ($@) { + throw("Unable to retrieve cache instance: $@"); + } + + return $self->{'instance'}; +} + + +sub slice_names { + my $self = shift; + my $dbtype = shift; + + throw("You must provide a db type (source|target).") unless $dbtype; + + my $dba = $self->get_DBAdaptor($dbtype); + my $sa = $dba->get_SliceAdaptor; + + my @slice_names = (); + + if ( $self->conf->param('chromosomes') ) { + # Fetch the specified chromosomes. + foreach my $chr ( $self->conf->param('chromosomes') ) { + my $slice = $sa->fetch_by_region( 'chromosome', $chr ); + push @slice_names, $slice->name; + } + + } + elsif ( $self->conf->param('region') ) { + # Fetch the slices on the specified regions. Don't use + # SliceAdaptor->fetch_by_name() since this will fail if assembly + # versions are different for source and target db. + my ( $cs, $version, $name, $start, $end, $strand ) = + split( /:/, $self->conf->param('region') ); + + my $slice = $sa->fetch_by_region( $cs, $name, $start, $end ); + + push @slice_names, $slice->name; + + } + else { + # Fetch all slices that have genes on them. + my $ga = $dba->get_GeneAdaptor; + + foreach my $srid ( @{ $ga->list_seq_region_ids } ) { + my $slice = $sa->fetch_by_seq_region_id($srid); + + if ( !$slice->is_reference() ) { + my $slices = + $slice->adaptor() + ->fetch_by_region_unique( $slice->coord_system_name(), + $slice->seq_region_name() ); + + push( @slice_names, map { $_->name() } @{$slices} ); + } + else { + push @slice_names, $slice->name(); + } + } + } + + return \@slice_names; +} ## end sub slice_names + + +sub logger { + my $self = shift; + $self->{'logger'} = shift if (@_); + return $self->{'logger'}; +} + +sub conf { + my $self = shift; + $self->{'conf'} = shift if (@_); + return $self->{'conf'}; +} + + +sub cache_method { + my $self = shift; + $self->{'instance'}->{'cache_method'} = shift if (@_); + return $self->{'instance'}->{'cache_method'}; +} + + +sub highest_common_cs { + my $self = shift; + $self->{'instance'}->{'hccs'} = shift if (@_); + return $self->{'instance'}->{'hccs'}; +} + + +sub highest_common_cs_version { + my $self = shift; + $self->{'instance'}->{'hccsv'} = shift if (@_); + return $self->{'instance'}->{'hccsv'}; +} + + +sub add_common_cs { + my $self = shift; + my $cs = shift; + + unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) { + throw('You must provide a CoordSystem'); + } + + my $csid = join(':', $cs->name, $cs->version); + + $self->{'instance'}->{'ccs'}->{$csid} = 1; +} + + +sub is_common_cs { + my $self = shift; + my $csid = shift; + + return $self->{'instance'}->{'ccs'}->{$csid}; +} + + +1; +