comparison variant_effect_predictor/Bio/EnsEMBL/IdMapping/Cache.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::IdMapping::Cache - a cache to hold data objects used by the
24 IdMapping application
25
26 =head1 DESCRIPTION
27
28 =head1 METHODS
29
30 =cut
31
32
33 package Bio::EnsEMBL::IdMapping::Cache;
34
35 use strict;
36 use warnings;
37 no warnings 'uninitialized';
38
39 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
40 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
41 use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append);
42 use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
43 use Bio::EnsEMBL::IdMapping::TinyGene;
44 use Bio::EnsEMBL::IdMapping::TinyTranscript;
45 use Bio::EnsEMBL::IdMapping::TinyTranslation;
46 use Bio::EnsEMBL::IdMapping::TinyExon;
47 use Bio::EnsEMBL::DBSQL::DBAdaptor;
48 use Storable qw(nstore retrieve);
49 use Digest::MD5 qw(md5_hex);
50
51 # define available cache names here
52 my @cache_names = qw(
53 exons_by_id
54 transcripts_by_id
55 transcripts_by_exon_id
56 translations_by_id
57 genes_by_id
58 genes_by_transcript_id
59 );
60
61
62 =head2 new
63
64 Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
65 Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
66 Example : my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
67 -LOGGER => $logger,
68 -CONF => $conf,
69 );
70 Description : constructor
71 Return type : Bio::EnsEMBL::IdMapping::Cache object
72 Exceptions : thrown on wrong or missing arguments
73 Caller : general
74 Status : At Risk
75 : under development
76
77 =cut
78
79 sub new {
80 my $caller = shift;
81 my $class = ref($caller) || $caller;
82
83 my ($logger, $conf, $load_instance) =
84 rearrange(['LOGGER', 'CONF', 'LOAD_INSTANCE'], @_);
85
86 unless ($logger->isa('Bio::EnsEMBL::Utils::Logger')) {
87 throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
88 }
89
90 unless ($conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
91 throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
92 }
93
94 my $self = {};
95 bless ($self, $class);
96
97 # initialise
98 $self->logger($logger);
99 $self->conf($conf);
100
101 if ($load_instance) {
102 $self->read_instance_from_file;
103 }
104
105 return $self;
106 }
107
108
109 =head2 build_cache_by_slice
110
111 Arg[1] : String $dbtype - db type (source|target)
112 Arg[2] : String $slice_name - the name of a slice (format as returned by
113 Bio::EnsEMBL::Slice->name)
114 Example : my ($num_genes, $filesize) = $cache->build_cache_by_slice(
115 'source', 'chromosome:NCBI36:X:1:1000000:-1');
116 Description : Builds a cache of genes, transcripts, translations and exons
117 needed by the IdMapping application and serialises the resulting
118 cache object to a file, one slice at a time.
119 Return type : list of the number of genes processed and the size of the
120 serialised cache file
121 Exceptions : thrown on invalid slice name
122 Caller : general
123 Status : At Risk
124 : under development
125
126 =cut
127
128 sub build_cache_by_slice {
129 my $self = shift;
130 my $dbtype = shift;
131 my $slice_name = shift;
132
133 # set cache method (required for loading cache later)
134 $self->cache_method('BY_SEQ_REGION');
135
136 my $dba = $self->get_DBAdaptor($dbtype);
137 my $sa = $dba->get_SliceAdaptor;
138
139 my $slice = $sa->fetch_by_name($slice_name);
140 unless ($slice) {
141 throw("Could not retrieve slice $slice_name.");
142 }
143
144 my $genes = $slice->get_all_Genes( undef, undef, 1 );
145
146 # find common coord_system
147 my $common_cs_found = $self->find_common_coord_systems;
148
149 # find out whether native coord_system is a common coord_system.
150 # if so, you don't need to project.
151 # also don't project if no common coord_system present
152 my $need_project = 1;
153
154 my $csid = join( ':',
155 $slice->coord_system_name,
156 $slice->coord_system->version );
157
158 if ( $self->is_common_cs($csid) or !$self->highest_common_cs ) {
159 $need_project = 0;
160 }
161
162 # build cache
163 my $type = "$dbtype.$slice_name";
164 my $num_genes =
165 $self->build_cache_from_genes( $type, $genes, $need_project );
166 undef $genes;
167
168 # write cache to file, then flush cache to reclaim memory
169 my $size = $self->write_all_to_file($type);
170
171 return $num_genes, $size;
172 } ## end sub build_cache_by_slice
173
174
175 =head2 build_cache_all
176
177 Arg[1] : String $dbtype - db type (source|target)
178 Example : my ($num_genes, $filesize) = $cache->build_cache_all('source');
179 Description : Builds a cache of genes, transcripts, translations and exons
180 needed by the IdMapping application and serialises the
181 resulting cache object to a file. All genes across the genome
182 are processed in one go. This method should be used when
183 build_cache_by_seq_region can't be used due to a large number
184 of toplevel seq_regions (e.g. 2x genomes).
185 Return type : list of the number of genes processed and the size of the
186 serialised cache file
187 Exceptions : thrown on invalid slice name
188 Caller : general
189 Status : At Risk
190 : under development
191
192 =cut
193
194 sub build_cache_all {
195 my $self = shift;
196 my $dbtype = shift;
197
198 # set cache method (required for loading cache later)
199 $self->cache_method('ALL');
200
201 my $dba = $self->get_DBAdaptor($dbtype);
202 my $ga = $dba->get_GeneAdaptor;
203
204 my $genes = $ga->fetch_all;
205
206 # find common coord_system
207 my $common_cs_found = $self->find_common_coord_systems;
208
209 # Build cache. Setting $need_project to 'CHECK' will cause
210 # build_cache_from_genes() to check the coordinate system for each
211 # gene.
212 my $type = "$dbtype.ALL";
213 my $need_project = 'CHECK';
214 my $num_genes =
215 $self->build_cache_from_genes( $type, $genes, $need_project );
216
217 undef $genes;
218
219 # write cache to file, then flush cache to reclaim memory
220 my $size = $self->write_all_to_file($type);
221
222 return $num_genes, $size;
223 }
224
225
226 =head2 build_cache_from_genes
227
228 Arg[1] : String $type - cache type
229 Arg[2] : Listref of Bio::EnsEMBL::Genes $genes - genes to build cache
230 from
231 Arg[3] : Boolean $need_project - indicate if we need to project exons to
232 common coordinate system
233 Example : $cache->build_cache_from_genes(
234 'source.chromosome:NCBI36:X:1:100000:1', \@genes);
235 Description : Builds the cache by fetching transcripts, translations and exons
236 for a list of genes from the database, and creating lightweight
237 Bio::EnsEMBL::IdMapping::TinyFeature objects containing only the
238 data needed by the IdMapping application. These objects are
239 attached to a name cache in this cache object. Exons only need
240 to be projected to a commond coordinate system if their native
241 coordinate system isn't common to source and target assembly
242 itself.
243 Return type : int - number of genes after filtering
244 Exceptions : thrown on wrong or missing arguments
245 Caller : internal
246 Status : At Risk
247 : under development
248
249 =cut
250
251 sub build_cache_from_genes {
252 my $self = shift;
253 my $type = shift;
254 my $genes = shift;
255 my $need_project = shift;
256
257 throw("You must provide a type.") unless $type;
258 throw("You must provide a listref of genes.")
259 unless ( ref($genes) eq 'ARRAY' );
260
261 # biotype filter
262 if ( $self->conf()->param('biotypes') ||
263 $self->conf()->param('biotypes_include') ||
264 $self->conf()->param('biotypes_exclude') )
265 {
266 $genes = $self->filter_biotypes($genes);
267 }
268 my $num_genes = scalar(@$genes);
269
270 # initialise cache for the given type.
271 $self->{'cache'}->{$type} = {};
272
273 #my $i = 0;
274 #my $num_genes = scalar(@$genes);
275 #my $progress_id = $self->logger->init_progress($num_genes);
276
277 # loop over genes sorted by gene location.
278 # the sort will hopefully improve assembly mapper cache performance and
279 # therefore speed up exon sequence retrieval
280 foreach my $gene ( sort { $a->start <=> $b->start } @$genes ) {
281 #$self->logger->log_progressbar($progress_id, ++$i, 2);
282 #$self->logger->log_progress($num_genes, ++$i, 20, 3, 1);
283
284 if ( $need_project eq 'CHECK' ) {
285 # find out whether native coord_system is a common coord_system.
286 # if so, you don't need to project.
287 # also don't project if no common coord_system present
288 if ( $self->highest_common_cs ) {
289 my $csid = join( ':',
290 $gene->slice->coord_system_name,
291 $gene->slice->coord_system->version );
292 if ( $self->is_common_cs($csid) ) {
293 $need_project = 0;
294 }
295 }
296 else {
297 $need_project = 0;
298 }
299 }
300
301 # create lightweigt gene
302 my $lgene =
303 Bio::EnsEMBL::IdMapping::TinyGene->new_fast( [
304 $gene->dbID, $gene->stable_id,
305 $gene->version, $gene->created_date,
306 $gene->modified_date, $gene->start,
307 $gene->end, $gene->strand,
308 $gene->slice->seq_region_name, $gene->biotype,
309 $gene->status, $gene->analysis->logic_name,
310 ( $gene->is_known ? 1 : 0 ), ] );
311
312 # build gene caches
313 $self->add( 'genes_by_id', $type, $gene->dbID, $lgene );
314
315 # transcripts
316 foreach my $tr ( @{ $gene->get_all_Transcripts } ) {
317 my $ltr =
318 Bio::EnsEMBL::IdMapping::TinyTranscript->new_fast( [
319 $tr->dbID, $tr->stable_id,
320 $tr->version, $tr->created_date,
321 $tr->modified_date, $tr->start,
322 $tr->end, $tr->strand,
323 $tr->length, md5_hex( $tr->spliced_seq ),
324 ( $tr->is_known ? 1 : 0 ) ] );
325
326 $ltr->biotype( $tr->biotype() );
327 $lgene->add_Transcript($ltr);
328
329 # build transcript caches
330 $self->add( 'transcripts_by_id', $type, $tr->dbID, $ltr );
331 $self->add( 'genes_by_transcript_id', $type, $tr->dbID, $lgene );
332
333 # translation (if there is one)
334 if ( my $tl = $tr->translation ) {
335 my $ltl =
336 Bio::EnsEMBL::IdMapping::TinyTranslation->new_fast( [
337 $tl->dbID, $tl->stable_id,
338 $tl->version, $tl->created_date,
339 $tl->modified_date, $tr->dbID,
340 $tr->translate->seq, ( $tr->is_known ? 1 : 0 ),
341 ] );
342
343 $ltr->add_Translation($ltl);
344
345 $self->add( 'translations_by_id', $type, $tl->dbID, $ltl );
346
347 undef $tl;
348 }
349
350 # exons
351 foreach my $exon ( @{ $tr->get_all_Exons } ) {
352 my $lexon =
353 Bio::EnsEMBL::IdMapping::TinyExon->new_fast( [
354 $exon->dbID,
355 $exon->stable_id,
356 $exon->version,
357 $exon->created_date,
358 $exon->modified_date,
359 $exon->start,
360 $exon->end,
361 $exon->strand,
362 $exon->slice->seq_region_name,
363 $exon->slice->coord_system_name,
364 $exon->slice->coord_system->version,
365 $exon->slice->subseq( $exon->start, $exon->end,
366 $exon->strand ),
367 $exon->phase,
368 $need_project, ] );
369
370 # get coordinates in common coordinate system if needed
371 if ($need_project) {
372 my @seg = @{
373 $exon->project( $self->highest_common_cs,
374 $self->highest_common_cs_version ) };
375
376 if ( scalar(@seg) == 1 ) {
377 my $sl = $seg[0]->to_Slice;
378 $lexon->common_start( $sl->start );
379 $lexon->common_end( $sl->end );
380 $lexon->common_strand( $sl->strand );
381 $lexon->common_sr_name( $sl->seq_region_name );
382 }
383 }
384
385 $ltr->add_Exon($lexon);
386
387 $self->add( 'exons_by_id', $type, $exon->dbID, $lexon );
388 $self->add_list( 'transcripts_by_exon_id',
389 $type, $exon->dbID, $ltr );
390
391 undef $exon;
392 } ## end foreach my $exon ( @{ $tr->get_all_Exons...})
393
394 undef $tr;
395 } ## end foreach my $tr ( @{ $gene->get_all_Transcripts...})
396
397 undef $gene;
398 } ## end foreach my $gene ( sort { $a...})
399
400 return $num_genes;
401 } ## end sub build_cache_from_genes
402
403
404 =head2 filter_biotypes
405
406 Arg[1] : Listref of Bio::EnsEMBL::Genes $genes - the genes to filter
407 Example : my @filtered = @{ $cache->filter_biotypes(\@genes) };
408
409 Description : Filters a list of genes by biotype. Biotypes are
410 taken from the IdMapping configuration parameter
411 'biotypes_include' or 'biotypes_exclude'.
412
413 If the configuration parameter 'biotypes_exclude' is
414 defined, then rather than returning the genes whose
415 biotype is listed in the configuration parameter
416 'biotypes_include' the method will return the genes
417 whose biotype is *not* listed in the 'biotypes_exclude'
418 configuration parameter.
419
420 It is an error to define both these configuration
421 parameters.
422
423 The old parameter 'biotypes' is equivalent to
424 'biotypes_include'.
425
426 Return type : Listref of Bio::EnsEMBL::Genes (or empty list)
427 Exceptions : none
428 Caller : internal
429 Status : At Risk
430 : under development
431
432 =cut
433
434 sub filter_biotypes {
435 my ( $self, $genes ) = @_;
436
437 my @filtered;
438 my @biotypes;
439 my $opt_reverse;
440
441 if ( defined( $self->conf()->param('biotypes_include') ) ||
442 defined( $self->conf()->param('biotypes') ) )
443 {
444 if ( defined( $self->conf()->param('biotypes_exclude') ) ) {
445 $self->logger()
446 ->error( "You may not use both " .
447 "'biotypes_include' and 'biotypes_exclude' " .
448 "in the configuration" );
449 }
450
451 if ( defined( $self->conf()->param('biotypes_include') ) ) {
452 @biotypes = $self->conf()->param('biotypes_include');
453 }
454 else {
455 @biotypes = $self->conf()->param('biotypes');
456 }
457 $opt_reverse = 0;
458 }
459 else {
460 @biotypes = $self->conf()->param('biotypes_exclude');
461 $opt_reverse = 1;
462 }
463
464 foreach my $gene ( @{$genes} ) {
465 my $keep_gene;
466
467 foreach my $biotype (@biotypes) {
468 if ( $gene->biotype() eq $biotype ) {
469 if ($opt_reverse) { $keep_gene = 0 }
470 else { $keep_gene = 1 }
471 last;
472 }
473 }
474
475 if ( defined($keep_gene) ) {
476 if ($keep_gene) {
477 push( @filtered, $gene );
478 }
479 }
480 elsif ($opt_reverse) {
481 push( @filtered, $gene );
482 }
483 }
484
485 return \@filtered;
486 } ## end sub filter_biotypes
487
488
489 =head2 add
490
491 Arg[1] : String $name - a cache name (e.g. 'genes_by_id')
492 Arg[2] : String type - a cache type (e.g. "source.$slice_name")
493 Arg[3] : String $key - key of this entry (e.g. a gene dbID)
494 Arg[4] : Bio::EnsEMBL::IdMappping::TinyFeature $val - value to cache
495 Example : $cache->add('genes_by_id',
496 'source.chromosome:NCBI36:X:1:1000000:1', '1234', $tiny_gene);
497 Description : Adds a TinyFeature object to a named cache.
498 Return type : Bio::EnsEMBL::IdMapping::TinyFeature
499 Exceptions : thrown on wrong or missing arguments
500 Caller : internal
501 Status : At Risk
502 : under development
503
504 =cut
505
506 sub add {
507 my $self = shift;
508 my $name = shift;
509 my $type = shift;
510 my $key = shift;
511 my $val = shift;
512
513 throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
514 throw("You must provide a cache type.") unless $type;
515 throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
516
517 $self->{'cache'}->{$type}->{$name}->{$key} = $val;
518
519 return $self->{'cache'}->{$type}->{$name}->{$key};
520 }
521
522 =head2 add_list
523
524 Arg[1] : String $name - a cache name (e.g. 'genes_by_id')
525 Arg[2] : String type - a cache type (e.g. "source.$slice_name")
526 Arg[3] : String $key - key of this entry (e.g. a gene dbID)
527 Arg[4] : List of Bio::EnsEMBL::IdMappping::TinyFeature @val - values
528 to cache
529 Example : $cache->add_list('transcripts_by_exon_id',
530 'source.chromosome:NCBI36:X:1:1000000:1', '1234',
531 $tiny_transcript1, $tiny_transcript2);
532 Description : Adds a list of TinyFeature objects to a named cache.
533 Return type : Listref of Bio::EnsEMBL::IdMapping::TinyFeature objects
534 Exceptions : thrown on wrong or missing arguments
535 Caller : internal
536 Status : At Risk
537 : under development
538
539 =cut
540
541 sub add_list {
542 my $self = shift;
543 my $name = shift;
544 my $type = shift;
545 my $key = shift;
546 my @vals = @_;
547
548 throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
549 throw("You must provide a cache type.") unless $type;
550 throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
551
552 push @{ $self->{'cache'}->{$type}->{$name}->{$key} }, @vals;
553
554 return $self->{'cache'}->{$type}->{$name}->{$key};
555 }
556
557 sub get_by_key {
558 my $self = shift;
559 my $name = shift;
560 my $type = shift;
561 my $key = shift;
562
563 throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
564 throw("You must provide a cache type.") unless $type;
565 throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
566
567 # transparently load cache from file unless already loaded
568 unless ($self->{'instance'}->{'loaded'}->{"$type"}) {
569 $self->read_and_merge($type);
570 }
571
572 return $self->{'cache'}->{$type}->{$name}->{$key};
573 }
574
575 sub get_by_name {
576 my $self = shift;
577 my $name = shift;
578 my $type = shift;
579
580 throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
581 throw("You must provide a cache type.") unless $type;
582
583 # transparently load cache from file unless already loaded
584 unless ($self->{'instance'}->{'loaded'}->{$type}) {
585 $self->read_and_merge($type);
586 }
587
588 return $self->{'cache'}->{$type}->{$name} || {};
589 }
590
591
592 sub get_count_by_name {
593 my $self = shift;
594 my $name = shift;
595 my $type = shift;
596
597 throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
598 throw("You must provide a cache type.") unless $type;
599
600 # transparently load cache from file unless already loaded
601 unless ($self->{'instance'}->{'loaded'}->{$type}) {
602 $self->read_and_merge($type);
603 }
604
605 return scalar(keys %{ $self->get_by_name($name, $type) });
606 }
607
608
609 sub find_common_coord_systems {
610 my $self = shift;
611
612 # get adaptors for source db
613 my $s_dba = $self->get_DBAdaptor('source');
614 my $s_csa = $s_dba->get_CoordSystemAdaptor;
615 my $s_sa = $s_dba->get_SliceAdaptor;
616
617 # get adaptors for target db
618 my $t_dba = $self->get_DBAdaptor('target');
619 my $t_csa = $t_dba->get_CoordSystemAdaptor;
620 my $t_sa = $t_dba->get_SliceAdaptor;
621
622 # find common coord_systems
623 my @s_coord_systems = @{ $s_csa->fetch_all };
624 my @t_coord_systems = @{ $t_csa->fetch_all };
625 my $found_highest = 0;
626
627 SOURCE:
628 foreach my $s_cs (@s_coord_systems) {
629 if ( !$s_cs->is_default() ) { next SOURCE }
630
631 TARGET:
632 foreach my $t_cs (@t_coord_systems) {
633 if ( !$t_cs->is_default() ) { next TARGET }
634
635 if ( $s_cs->name eq $t_cs->name ) {
636
637 # test for identical coord_system version
638 if ( $s_cs->version and ( $s_cs->version ne $t_cs->version ) ) {
639 next TARGET;
640 }
641
642 # test for at least 50% identical seq_regions
643 if ( $self->seq_regions_compatible( $s_cs, $s_sa, $t_sa ) ) {
644 $self->add_common_cs($s_cs);
645
646 unless ($found_highest) {
647 $self->highest_common_cs( $s_cs->name );
648 $self->highest_common_cs_version( $s_cs->version );
649 }
650
651 $found_highest = 1;
652
653 next SOURCE;
654 }
655 }
656 } ## end foreach my $t_cs (@t_coord_systems)
657 } ## end foreach my $s_cs (@s_coord_systems)
658
659 return $found_highest;
660 } ## end sub find_common_coord_systems
661
662
663 sub seq_regions_compatible {
664 my $self = shift;
665 my $cs = shift;
666 my $s_sa = shift;
667 my $t_sa = shift;
668
669 unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) {
670 throw('You must provide a CoordSystem');
671 }
672
673 unless ($s_sa and $t_sa and $s_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')
674 and $t_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')) {
675 throw('You must provide a source and target SliceAdaptor');
676 }
677
678 my %sr_match;
679 my $equal = 0;
680
681 my $s_seq_regions = $s_sa->fetch_all($cs->name, $cs->version);
682 my $t_seq_regions = $t_sa->fetch_all($cs->name, $cs->version);
683
684 # sanity check to prevent divison by zero
685 my $s_count = scalar(@$s_seq_regions);
686 my $t_count = scalar(@$t_seq_regions);
687 return(0) if ($s_count == 0 or $t_count == 0);
688
689 foreach my $s_sr (@$s_seq_regions) {
690 $sr_match{$s_sr->seq_region_name} = $s_sr->length;
691 }
692
693 foreach my $t_sr (@$t_seq_regions) {
694 if (exists($sr_match{$t_sr->seq_region_name})) {
695 $equal++;
696
697 # return false if we have a region with same name but different length
698 return(0) unless ($sr_match{$t_sr->seq_region_name} == $t_sr->length);
699 }
700 }
701
702 if ($equal/$s_count > 0.5 and $equal/$t_count > 0.5) {
703 return(1);
704 } else {
705 $self->logger->info("Only $equal seq_regions identical for ".$cs->name." ".$cs->version."\n");
706 return(0);
707 }
708
709 }
710
711
712 sub check_db_connection {
713 my $self = shift;
714 my $dbtype = shift;
715
716 my $err = 0;
717
718 eval {
719 my $dba = $self->get_DBAdaptor($dbtype);
720 $dba->dbc->connect;
721 };
722
723 if ($@) {
724 $self->logger->warning("Can't connect to $dbtype db: $@\n");
725 $err++;
726 } else {
727 $self->logger->debug("Connection to $dbtype db ok.\n");
728 $self->{'_db_conn_ok'}->{$dbtype} = 1;
729 }
730
731 return $err;
732 }
733
734
735 sub check_db_read_permissions {
736 my $self = shift;
737 my $dbtype = shift;
738
739 # skip this check if db connection failed (this prevents re-throwing
740 # exceptions).
741 return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
742
743 my $err = 0;
744 my %privs = %{ $self->get_db_privs($dbtype) };
745
746 unless ($privs{'SELECT'} or $privs{'ALL PRIVILEGES'}) {
747 $self->logger->warning("User doesn't have read permission on $dbtype db.\n");
748 $err++;
749 } else {
750 $self->logger->debug("Read permission on $dbtype db ok.\n");
751 }
752
753 return $err;
754 }
755
756
757 sub check_db_write_permissions {
758 my $self = shift;
759 my $dbtype = shift;
760
761 # skip this check if db connection failed (this prevents re-throwing
762 # exceptions).
763 return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
764
765 my $err = 0;
766
767 unless ($self->do_upload) {
768 $self->logger->debug("No uploads, so write permission on $dbtype db not required.\n");
769 return $err;
770 }
771
772 my %privs = %{ $self->get_db_privs($dbtype) };
773
774 unless ($privs{'INSERT'} or $privs{'ALL PRIVILEGES'}) {
775 $self->logger->warning("User doesn't have write permission on $dbtype db.\n");
776 $err++;
777 } else {
778 $self->logger->debug("Write permission on $dbtype db ok.\n");
779 }
780
781 return $err;
782 }
783
784
785 sub do_upload {
786 my $self = shift;
787
788 if ($self->conf->param('dry_run') or
789 ! ($self->conf->param('upload_events') or
790 $self->conf->param('upload_stable_ids') or
791 $self->conf->param('upload_archive'))) {
792 return 0;
793 } else {
794 return 1;
795 }
796 }
797
798
799 sub get_db_privs {
800 my ( $self, $dbtype ) = @_;
801
802 my %privs = ();
803 my $rs;
804
805 # get privileges from mysql db
806 eval {
807 my $dbc = $self->get_DBAdaptor($dbtype)->dbc();
808 my $sql = qq(SHOW GRANTS FOR ) . $dbc->username();
809 my $sth = $dbc->prepare($sql);
810 $sth->execute();
811 $rs = $sth->fetchall_arrayref();
812 #$sth->finish();
813 };
814
815 if ($@) {
816 $self->logger->warning(
817 "Error obtaining privileges from $dbtype db: $@\n");
818 return {};
819 }
820
821 # parse the output
822 foreach my $r ( map { $_->[0] } @{$rs} ) {
823 $r =~ s/GRANT (.*) ON .*/$1/i;
824 foreach my $p ( split( ',', $r ) ) {
825 # trim leading and trailing whitespace
826 $p =~ s/^\s+//;
827 $p =~ s/\s+$//;
828 $privs{ uc($p) } = 1;
829 }
830 }
831
832 return \%privs;
833 } ## end sub get_db_privs
834
835
836 sub check_empty_tables {
837 my $self = shift;
838 my $dbtype = shift;
839
840 # skip this check if db connection failed (this prevents re-throwing
841 # exceptions).
842 return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
843
844 my $err = 0;
845 my $c = 0;
846
847 if ($self->conf->param('no_check_empty_tables') or !$self->do_upload) {
848 $self->logger->debug("Won't check for empty stable ID and archive tables in $dbtype db.\n");
849 return $err;
850 }
851
852 eval {
853 my @tables =
854 qw(
855 gene_stable_id
856 transcript_stable_id
857 translation_stable_id
858 exon_stable_id
859 stable_id_event
860 mapping_session
861 gene_archive
862 peptide_archive
863 );
864
865 my $dba = $self->get_DBAdaptor($dbtype);
866 foreach my $table (@tables) {
867 if ( $table =~ /^([^_]+)_stable_id/ ) {
868 $table = $1;
869 if ( $c =
870 $self->fetch_value_from_db(
871 $dba,
872 "SELECT COUNT(*) FROM $table WHERE stable_id IS NOT NULL"
873 ) )
874 {
875 $self->logger->warning(
876 "$table table in $dbtype db has $c stable IDs.\n");
877 $err++;
878 }
879 }
880 else {
881 if ( $c =
882 $self->fetch_value_from_db(
883 $dba, "SELECT COUNT(*) FROM $table"
884 ) )
885 {
886 $self->logger->warning(
887 "$table table in $dbtype db has $c entries.\n");
888 $err++;
889 }
890 }
891 } ## end foreach my $table (@tables)
892 };
893
894 if ($@) {
895 $self->logger->warning(
896 "Error retrieving stable ID and archive table row counts from $dbtype db: $@\n"
897 );
898 $err++;
899 }
900 elsif ( !$err ) {
901 $self->logger->debug(
902 "All stable ID and archive tables in $dbtype db are empty.\n");
903 }
904 return $err;
905 }
906
907
908 sub check_sequence {
909 my ( $self, $dbtype ) = @_;
910
911 # skip this check if db connection failed (this prevents re-throwing
912 # exceptions).
913 return 1 unless ( $self->{'_db_conn_ok'}->{$dbtype} );
914
915 my $err = 0;
916 my $c = 0;
917
918 eval {
919 my $dba = $self->get_DBAdaptor($dbtype);
920 unless ( $c =
921 $self->fetch_value_from_db(
922 $dba->dnadb(), "SELECT COUNT(*) FROM dna"
923 ) )
924 {
925 $err++;
926 }
927 };
928
929 if ($@) {
930 $self->logger->warning( "Error retrieving dna table row count "
931 . "from $dbtype database: $@\n" );
932 $err++;
933 } elsif ($err) {
934 $self->logger->warning("No sequence found in $dbtype database.\n");
935 } else {
936 $self->logger->debug(
937 ucfirst($dbtype) . " db has sequence ($c entries).\n" );
938 }
939
940 return $err;
941 } ## end sub check_sequence
942
943
944 sub check_meta_entries {
945 my $self = shift;
946 my $dbtype = shift;
947
948 # skip this check if db connection failed (this prevents re-throwing
949 # exceptions).
950 return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
951
952 my $err = 0;
953 my $assembly_default;
954 my $schema_version;
955
956 eval {
957 my $dba = $self->get_DBAdaptor($dbtype);
958 $assembly_default = $self->fetch_value_from_db($dba,
959 qq(SELECT meta_value FROM meta WHERE meta_key = 'assembly.default'));
960 $schema_version = $self->fetch_value_from_db($dba,
961 qq(SELECT meta_value FROM meta WHERE meta_key = 'schema_version'));
962 };
963
964 if ($@) {
965 $self->logger->warning("Error retrieving dna table row count from $dbtype db: $@\n");
966 return ++$err;
967 }
968
969 unless ($assembly_default) {
970 $self->logger->warning("No meta.assembly.default value found in $dbtype db.\n");
971 $err++;
972 } else {
973 $self->logger->debug("meta.assembly.default value found ($assembly_default).\n");
974 }
975
976 unless ($schema_version) {
977 $self->logger->warning("No meta.schema_version value found in $dbtype db.\n");
978 $err++;
979 } else {
980 $self->logger->debug("meta.schema_version value found ($schema_version).\n");
981 }
982
983 return $err;
984 }
985
986
987 sub fetch_value_from_db {
988 my ( $self, $dba, $sql ) = @_;
989
990 assert_ref( $dba, 'Bio::EnsEMBL::DBSQL::DBAdaptor' );
991
992 if ( !defined($sql) ) {
993 throw("Need an SQL statement to execute.\n");
994 }
995
996 my $sth = $dba->dbc->prepare($sql);
997 $sth->execute();
998
999 my ($c) = $sth->fetchrow_array;
1000 return $c;
1001 }
1002
1003 sub get_DBAdaptor {
1004 my ( $self, $prefix ) = @_;
1005
1006 unless ( $self->{'_dba'}->{$prefix} ) {
1007 # connect to database
1008 my $dba =
1009 new Bio::EnsEMBL::DBSQL::DBAdaptor(
1010 -host => $self->conf->param("${prefix}host"),
1011 -port => $self->conf->param("${prefix}port"),
1012 -user => $self->conf->param("${prefix}user"),
1013 -pass => $self->conf->param("${prefix}pass"),
1014 -dbname => $self->conf->param("${prefix}dbname"),
1015 -group => $prefix, );
1016
1017 if ( !defined( $self->conf->param("${prefix}host_dna") ) ) {
1018 # explicitely set the dnadb to itself - by default the Registry
1019 # assumes a group 'core' for this now
1020 $dba->dnadb($dba);
1021 } else {
1022 my $dna_dba =
1023 new Bio::EnsEMBL::DBSQL::DBAdaptor(
1024 -host => $self->conf->param("${prefix}host_dna"),
1025 -port => $self->conf->param("${prefix}port_dna"),
1026 -user => $self->conf->param("${prefix}user_dna"),
1027 -pass => $self->conf->param("${prefix}pass_dna"),
1028 -dbname => $self->conf->param("${prefix}dbname_dna"),
1029 -group => $prefix, );
1030 $dba->dnadb($dna_dba);
1031 }
1032
1033 $self->{'_dba'}->{$prefix} = $dba;
1034 } ## end unless ( $self->{'_dba'}->...)
1035
1036 return $self->{'_dba'}->{$prefix};
1037 } ## end sub get_DBAdaptor
1038
1039
1040 sub cache_file_exists {
1041 my $self = shift;
1042 my $type = shift;
1043
1044 throw("You must provide a cache type.") unless $type;
1045
1046 my $cache_file = $self->cache_file($type);
1047
1048 if (-e $cache_file) {
1049 $self->logger->info("Cache file found for $type.\n", 2);
1050 $self->logger->debug("Will read from $cache_file.\n", 2);
1051 return 1;
1052 } else {
1053 $self->logger->info("No cache file found for $type.\n", 2);
1054 $self->logger->info("Will build cache from db.\n", 2);
1055 return 0;
1056 }
1057 }
1058
1059
1060 sub cache_file {
1061 my $self = shift;
1062 my $type = shift;
1063
1064 throw("You must provide a cache type.") unless $type;
1065
1066 return $self->dump_path."/$type.object_cache.ser";
1067 }
1068
1069
1070 sub instance_file {
1071 my $self = shift;
1072
1073 return $self->dump_path."/cache_instance.ser";
1074 }
1075
1076
1077 sub dump_path {
1078 my $self = shift;
1079
1080 $self->{'dump_path'} ||= path_append($self->conf->param('basedir'), 'cache');
1081
1082 return $self->{'dump_path'};
1083 }
1084
1085
1086 sub write_all_to_file {
1087 my $self = shift;
1088 my $type = shift;
1089
1090 throw("You must provide a cache type.") unless $type;
1091
1092 my $size = 0;
1093 $size += $self->write_to_file($type);
1094 $size += $self->write_instance_to_file;
1095
1096 return parse_bytes($size);
1097 }
1098
1099
1100 sub write_to_file {
1101 my $self = shift;
1102 my $type = shift;
1103
1104 throw("You must provide a cache type.") unless $type;
1105
1106 unless ($self->{'cache'}->{$type}) {
1107 $self->logger->warning("No features found in $type. Won't write cache file.\n");
1108 return;
1109 }
1110
1111 my $cache_file = $self->cache_file($type);
1112
1113 eval { nstore($self->{'cache'}->{$type}, $cache_file) };
1114 if ($@) {
1115 throw("Unable to store $cache_file: $@\n");
1116 }
1117
1118 my $size = -s $cache_file;
1119 return $size;
1120 }
1121
1122
1123 sub write_instance_to_file {
1124 my $self = shift;
1125
1126 my $instance_file = $self->instance_file;
1127
1128 eval { nstore($self->{'instance'}, $instance_file) };
1129 if ($@) {
1130 throw("Unable to store $instance_file: $@\n");
1131 }
1132
1133 my $size = -s $instance_file;
1134 return $size;
1135 }
1136
1137
1138 sub read_from_file {
1139 my $self = shift;
1140 my $type = shift;
1141
1142 throw("You must provide a cache type.") unless $type;
1143
1144 my $cache_file = $self->cache_file($type);
1145
1146 if (-s $cache_file) {
1147
1148 #$self->logger->info("Reading cache from file...\n", 0, 'stamped');
1149 #$self->logger->info("Cache file $cache_file.\n", 1);
1150 eval { $self->{'cache'}->{$type} = retrieve($cache_file); };
1151 if ($@) {
1152 throw("Unable to retrieve cache: $@");
1153 }
1154 #$self->logger->info("Done.\n", 0, 'stamped');
1155
1156 } else {
1157 $self->logger->warning("Cache file $cache_file not found or empty.\n");
1158 }
1159
1160
1161 return $self->{'cache'}->{$type};
1162 }
1163
1164
1165 sub read_and_merge {
1166 my $self = shift;
1167 my $dbtype = shift;
1168
1169 unless ($dbtype eq 'source' or $dbtype eq 'target') {
1170 throw("Db type must be 'source' or 'target'.");
1171 }
1172
1173 # read cache from single or multiple files, depending on caching strategy
1174 my $cache_method = $self->cache_method;
1175 if ($cache_method eq 'ALL') {
1176 $self->read_from_file("$dbtype.ALL");
1177 } elsif ($cache_method eq 'BY_SEQ_REGION') {
1178 foreach my $slice_name (@{ $self->slice_names($dbtype) }) {
1179 $self->read_from_file("$dbtype.$slice_name");
1180 }
1181 } else {
1182 throw("Unknown cache method: $cache_method.");
1183 }
1184
1185 $self->merge($dbtype);
1186
1187 # flag as being loaded
1188 $self->{'instance'}->{'loaded'}->{$dbtype} = 1;
1189 }
1190
1191
1192 sub merge {
1193 my $self = shift;
1194 my $dbtype = shift;
1195
1196 unless ($dbtype eq 'source' or $dbtype eq 'target') {
1197 throw("Db type must be 'source' or 'target'.");
1198 }
1199
1200 foreach my $type (keys %{ $self->{'cache'} || {} }) {
1201 next unless ($type =~ /^$dbtype/);
1202
1203 foreach my $name (keys %{ $self->{'cache'}->{$type} || {} }) {
1204
1205 foreach my $key (keys %{ $self->{'cache'}->{$type}->{$name} || {} }) {
1206 if (defined $self->{'cache'}->{$dbtype}->{$name}->{$key}) {
1207 # warning("Duplicate key in cache: $name|$dbtype|$key. Skipping.\n");
1208 } else {
1209 $self->{'cache'}->{$dbtype}->{$name}->{$key} =
1210 $self->{'cache'}->{$type}->{$name}->{$key};
1211 }
1212
1213 delete $self->{'cache'}->{$type}->{$name}->{$key};
1214 }
1215
1216 delete $self->{'cache'}->{$type}->{$name};
1217 }
1218
1219 delete $self->{'cache'}->{$type};
1220
1221 }
1222 }
1223
1224
1225 sub read_instance_from_file {
1226 my $self = shift;
1227
1228 my $instance_file = $self->instance_file;
1229
1230 unless (-s $instance_file) {
1231 throw("No valid cache instance file found at $instance_file.");
1232 }
1233
1234 eval { $self->{'instance'} = retrieve($instance_file); };
1235 if ($@) {
1236 throw("Unable to retrieve cache instance: $@");
1237 }
1238
1239 return $self->{'instance'};
1240 }
1241
1242
1243 sub slice_names {
1244 my $self = shift;
1245 my $dbtype = shift;
1246
1247 throw("You must provide a db type (source|target).") unless $dbtype;
1248
1249 my $dba = $self->get_DBAdaptor($dbtype);
1250 my $sa = $dba->get_SliceAdaptor;
1251
1252 my @slice_names = ();
1253
1254 if ( $self->conf->param('chromosomes') ) {
1255 # Fetch the specified chromosomes.
1256 foreach my $chr ( $self->conf->param('chromosomes') ) {
1257 my $slice = $sa->fetch_by_region( 'chromosome', $chr );
1258 push @slice_names, $slice->name;
1259 }
1260
1261 }
1262 elsif ( $self->conf->param('region') ) {
1263 # Fetch the slices on the specified regions. Don't use
1264 # SliceAdaptor->fetch_by_name() since this will fail if assembly
1265 # versions are different for source and target db.
1266 my ( $cs, $version, $name, $start, $end, $strand ) =
1267 split( /:/, $self->conf->param('region') );
1268
1269 my $slice = $sa->fetch_by_region( $cs, $name, $start, $end );
1270
1271 push @slice_names, $slice->name;
1272
1273 }
1274 else {
1275 # Fetch all slices that have genes on them.
1276 my $ga = $dba->get_GeneAdaptor;
1277
1278 foreach my $srid ( @{ $ga->list_seq_region_ids } ) {
1279 my $slice = $sa->fetch_by_seq_region_id($srid);
1280
1281 if ( !$slice->is_reference() ) {
1282 my $slices =
1283 $slice->adaptor()
1284 ->fetch_by_region_unique( $slice->coord_system_name(),
1285 $slice->seq_region_name() );
1286
1287 push( @slice_names, map { $_->name() } @{$slices} );
1288 }
1289 else {
1290 push @slice_names, $slice->name();
1291 }
1292 }
1293 }
1294
1295 return \@slice_names;
1296 } ## end sub slice_names
1297
1298
1299 sub logger {
1300 my $self = shift;
1301 $self->{'logger'} = shift if (@_);
1302 return $self->{'logger'};
1303 }
1304
1305 sub conf {
1306 my $self = shift;
1307 $self->{'conf'} = shift if (@_);
1308 return $self->{'conf'};
1309 }
1310
1311
1312 sub cache_method {
1313 my $self = shift;
1314 $self->{'instance'}->{'cache_method'} = shift if (@_);
1315 return $self->{'instance'}->{'cache_method'};
1316 }
1317
1318
1319 sub highest_common_cs {
1320 my $self = shift;
1321 $self->{'instance'}->{'hccs'} = shift if (@_);
1322 return $self->{'instance'}->{'hccs'};
1323 }
1324
1325
1326 sub highest_common_cs_version {
1327 my $self = shift;
1328 $self->{'instance'}->{'hccsv'} = shift if (@_);
1329 return $self->{'instance'}->{'hccsv'};
1330 }
1331
1332
1333 sub add_common_cs {
1334 my $self = shift;
1335 my $cs = shift;
1336
1337 unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) {
1338 throw('You must provide a CoordSystem');
1339 }
1340
1341 my $csid = join(':', $cs->name, $cs->version);
1342
1343 $self->{'instance'}->{'ccs'}->{$csid} = 1;
1344 }
1345
1346
1347 sub is_common_cs {
1348 my $self = shift;
1349 my $csid = shift;
1350
1351 return $self->{'instance'}->{'ccs'}->{$csid};
1352 }
1353
1354
1355 1;
1356