comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/HealthChecker.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <ensembl-dev@ebi.ac.uk>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =head1 NAME
20
21 Bio::EnsEMBL::Funcgen::Utils::HealthChecker
22
23 =head1 SYNOPSIS
24
25 =head1 DESCRIPTION
26
27 B<This program> provides several methods to health check and update tables prior to
28 release. Using the updte_DB_for_release method runs the following:
29
30 validate_new_seq_regions - _pre_stores seq_region & coord_system info from new core DB
31 check_regbuild_strings - Validates or inserts regbuild_string entries
32 check_meta_species_version - Validates meta species and version wrt dbname
33 set_current_coord_system - Updates coord_system.is_current to 1 for current schema_build (required for mart)
34 update_meta_coord - Regenerates meta_coord.max_length values (required for Slice range queries)
35 clean_xrefs - Removes old unused xref and external_db records
36 validate_DataSets - Performs various checks on Data/Feature/ResultSets links and states
37 check_stable_ids - Check for any NULL stable IDs
38 log_data_sets - Logs all DISPLAYABLE DataSets
39 analyse_and_optimise_tables - Does what is says
40
41
42 =cut
43
44 ################################################################################
45
46 package Bio::EnsEMBL::Funcgen::Utils::HealthChecker;
47
48 use strict;
49 use Bio::EnsEMBL::Funcgen::Utils::Helper;
50 use Bio::EnsEMBL::Funcgen::ProbeFeature;
51 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
52 use Bio::EnsEMBL::Utils::Exception qw( throw );
53 use Bio::EnsEMBL::Analysis;
54 use vars qw(@ISA);
55 @ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);
56
57
58
59 #TO DO
60 # 1 DONE Print all fails and warnings in summary at end of script.
61 # 2 validate_RegulatoryFeature_Sets
62 # 3 Some of these can be migrated or mirrored in java HCs for safety
63
64
65 ################################################################################
66
67 sub new {
68 my $caller = shift;
69 my $class = ref($caller) || $caller;
70 my $self = $class->SUPER::new(@_);
71
72 #validate and set type, analysis and feature_set here
73 my ($db, $builds, $skip_mc, $check_displayable, $skip_analyse, $meta_coord_tables, $skip_xrefs, $fix) =
74 rearrange(['DB', 'BUILDS', 'SKIP_META_COORD', 'CHECK_DISPLAYABLE', 'SKIP_ANALYSE', 'META_COORD_TABLES', 'SKIP_XREF_CLEANUP', 'FIX'], @_);
75
76
77 if (! ($db && ref($db) &&
78 $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){
79 throw('You must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor');
80 }
81
82 #test connection
83 $db->dbc->db_handle;
84
85 $self->{'db'} = $db;
86 $self->{'mysql_connect_string'} = 'mysql -h'.$db->dbc->host.' -u'.$db->dbc->username.' -p'
87 .$db->dbc->password.' '.$db->dbc->dbname.' -P'.$db->dbc->port;
88 $self->{'dbname'} = $db->dbc->dbname;
89 $self->{'builds'} = (scalar(@$builds)>0) ? $builds : [];
90 $self->{'skip_meta_coord'} = $skip_mc;
91 $self->{'skip_xrefs'} = $skip_xrefs;
92 $self->{'skip_analyse'} = $skip_analyse;
93 $self->{'check_displayable'} = $check_displayable;
94 $self->{fix} = $fix;
95
96 if(defined $meta_coord_tables){
97
98 throw('-skip_meta_coord is set, Cannot build meta_coord entries for tables '.join(', ', @$meta_coord_tables));
99
100 if(! ref($meta_coord_tables) eq 'ARRAY'){
101 throw('-meta_coord_tables parameter must be an array ref');
102 }
103
104 @{$self->{'meta_coord_tables'}} = @$meta_coord_tables;
105 }
106
107 return $self;
108 }
109
110 sub db{ return $_[0]->{db}; }
111
112 sub fix{ return $_[0]->{fix}; }
113
114 =head2 update_db_for_release
115
116 Arg[0] :
117 Example :
118 Description: Wrapper method to perform all common update functions
119 Returntype :
120 Exceptions : None
121 Caller : General
122 Status : at risk
123
124 =cut
125
126 sub update_db_for_release{
127 my ($self, @args) = @_;
128
129 if(@args){
130 }
131
132 #do seq_region_update to validate dnadb first
133 #hence avoiding redoing longer methods
134 $self->validate_new_seq_regions;#($force_srs);
135 #$self->update_meta_schema_version;
136 $self->check_regbuild_strings;
137 $self->check_meta_species_version;
138 $self->set_current_coord_system;
139 $self->update_meta_coord;
140 $self->clean_xrefs;
141 $self->validate_DataSets;
142 $self->check_stable_ids;
143 $self->log_data_sets();
144 $self->analyse_and_optimise_tables;#ALWAYS LAST!!
145
146 $self->log_header('??? Have you dumped/copied GFF dumps ???');
147
148 #Log footer? Pass optional counts hash?
149 $self->log('Finished updating '.$self->{'dbname'}." for release\n\n");
150 }
151
152 sub validate_new_seq_regions{
153 my ($self, $force) = @_;
154
155
156 #We need to add some functionality to handle non-standard schema_build progression here
157 #This should be used before any data is loaded
158 #It should also warn if there any duplicates if it is not run before coord_systems are duplicated
159 #Should this just be handled in the BaseFeatureAdaptor/CoordSystemAdaptor?
160
161
162
163
164 #do we need to add the none default levels here?
165 #or are we only bothered about those which constitute the toplevel?
166
167 #To make sure we have all the correct levels in eFG we need to get all the names.
168 #then get all by name from the core db and set them as the dnadb.
169 # we also need to get all the toplevel seq_regions and store them in the seq_region table
170 #use BaseFeatureAdaptor::_pre_store with and array of pseudo feature on each top level slice
171
172 #Validate the efgdb and dnadb schema version are the same first
173 #This is because we expect the schem_build to be the same for a release
174 #However, this will autoset the dnadb if no defined, so will always match!
175
176 if(! $force){
177 my $efgdb_sm = join('_', @{$self->get_schema_and_build($self->{'dbname'})});
178 my $dnadb_sm = join('_', @{$self->get_schema_and_build($self->db->dnadb->dbc->dbname)});
179
180 if($efgdb_sm ne $dnadb_sm){
181 $self->report("WARNING Skipped validate_new_seq_regions as schema_versions are mismatched:\t".
182 "efgdb $efgdb_sm\tdnadb $dnadb_sm");
183 return 0;
184 }
185 }
186
187 my $pf_adaptor = $self->db->get_ProbeFeatureAdaptor();
188 my $slice_adaptor = $self->db->dnadb->get_SliceAdaptor();
189 my $dnadb_csa = $self->db->dnadb->get_CoordSystemAdaptor;
190
191 $self->log_header('Validating new coord_systems/seq_regions');
192
193 my @slices;
194 my %versioned_levels;
195 my $default_version;
196
197 #Grab unversioned top level slices and versioned levels
198 #May miss some old versioned level if the new assembly no longer has them
199 foreach my $slice(@{$slice_adaptor->fetch_all('toplevel', undef, 1)}){
200
201 if (! $slice->coord_system->version){
202 push @slices, $slice;
203 }
204 else{
205
206 if($default_version &&
207 ($default_version ne $slice->coord_system->version)){
208 throw("Found more than one default CoordSystem version:\t${default_version}\t".$slice->coord_system->version);
209 }
210 else{
211 $default_version = $slice->coord_system->version;
212 }
213 }
214 }
215
216
217 #Get all versioned levels for all builds
218 foreach my $cs(@{$dnadb_csa->fetch_all}){
219
220 if($cs->version){
221 $versioned_levels{$cs->version} ||= [];
222 push @{$versioned_levels{$cs->version}}, $cs->name;
223 }
224 }
225
226 push @{$self->{'builds'}}, $default_version if scalar(@{$self->{'builds'}}) == 0;
227
228
229
230 #Grab slices for each versioned level
231 foreach my $build(@{$self->{'builds'}}){
232
233 if(! exists $versioned_levels{$build}){
234 throw("CoordSystem version $build does not exist in the dnadb ".$self->db->dnadb->dbc->dbname);
235 }
236
237 foreach my $level(@{$versioned_levels{$build}}){
238 $self->log("Getting slices for $level $build");
239 push @slices, @{$slice_adaptor->fetch_all($level, $build)};
240 }
241 }
242
243 $self->log("Importing seq_region/coord_system info for builds:\t".join(',', @{$self->{'builds'}}));
244
245 foreach my $slice(@slices){
246
247 if($slice->start() != 1){
248 $self->log("Reslicing slice:\t".$slice->name());
249 #we must have some sort of PAR linked region i.e. Y
250 $slice = $slice_adaptor->fetch_by_region($slice->coord_system_name(), $slice->seq_region_name());
251 }
252
253
254 #we need test if it needs doing first?
255 #we would need to test for the coord_systems outside of this loop
256 #and then for each seq_region inside the loop if the coord_system is present
257
258 $self->log("_pre_storing seq_region info for slice:\t".$slice->name());
259
260 my $pseudo_feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new
261 (
262 -slice => $slice,
263 -start => 0,
264 -end => 0,
265 -strand => 0,
266 );
267
268 $pf_adaptor->_pre_store($pseudo_feature);
269 #This will create a meta_coord entry of max_length 1 for features which have an absent meta_coord entry
270
271 }
272
273
274 $self->log("Finished validating seq_regions\n");
275
276 return;
277 }
278
279
280
281
282 sub update_meta_coord{
283 my ($self, @table_names) = @_;
284
285 my $species_id = $self->db->species_id;
286
287 if($self->{'skip_meta_coord'}){
288 $self->log("Skipping meta_coord update\n");
289 return;
290 }
291
292
293 $self->log_header('Updating meta_coord table');
294
295
296 #set default table_name
297 if(! @table_names || scalar(@table_names) == 0){
298
299 #Can we do this via DBAdaptor and get all available adaptors which are BaseFeatureAdaptors then grab the first table name
300
301 if(defined $self->{'meta_coord_tables'}){
302 @table_names = @{$self->{'meta_coord_tables'}};
303 }
304 else{#default
305
306 @table_names = qw(
307 regulatory_feature
308 probe_feature
309 external_feature
310 annotated_feature
311 result_feature
312 segmentation_feature
313 );
314 }
315 }
316
317 #backup meta coord
318 if(system($self->{'mysql_connect_string'}." -e 'SELECT * FROM meta_coord'"
319 . '> '.$self->{'dbname'}.'meta_coord.backup'
320 ) != 0 ){
321
322 throw("Can't dump the original meta_coord for back up");#will this get copied to log?
323 }
324 else {
325 $self->log('Original meta_coord table backed up in '. $self->{'dbname'}.'.meta_coord.backup');
326 }
327
328
329 #Update each max_length for table_name and coord_system
330
331 my $sql;
332
333 foreach my $table_name(@table_names){
334
335 $sql = "select distinct(cs.name), mc.coord_system_id, cs.version, mc.max_length from coord_system cs, meta_coord mc where mc.table_name='$table_name' and mc.coord_system_id=cs.coord_system_id and cs.species_id = $species_id";
336
337 $self->log('');
338 $self->log("Updating meta_coord max_length for $table_name:");
339 $self->log("name\tcoord_system_id\tversion\tmax_length");
340
341 #can we test for emtpy array here? Then skip delete.
342
343 my @info = @{$self->db->dbc->db_handle->selectall_arrayref($sql)};
344
345 #log this
346 map {$self->log(join("\t", @{$_}))} @info;
347
348 # Clean old entries
349 $self->log("Deleting old meta_coord entries");
350 $sql = "DELETE mc FROM meta_coord mc, coord_system cs WHERE mc.table_name ='$table_name' and mc.coord_system_id = cs.coord_system_id and cs.species_id = $species_id";
351 $self->db->dbc->db_handle->do($sql);
352
353 # Generate new max_lengths
354 $self->log("Generating new max_lengths");
355
356 #Is this query running for each redundant cs_id?
357 #would it be more efficient to retrieve the NR cs_ids first and loop the query for each cs_id?
358
359 #Can we get the dbID of the largest feature for ease of checking?
360 #This won't work as we're grouping by coord_system
361 #would need to select distinct coord_system_id for table first
362 #This may well slow down quite a bit doing it this way
363
364 $sql = "select distinct s.coord_system_id from coord_system cs, seq_region s, $table_name t WHERE t.seq_region_id = s.seq_region_id and s.coord_system_id = cs.coord_system_id and cs.species_id = $species_id";
365 my @cs_ids = @{$self->db->dbc->db_handle->selectall_arrayref($sql)};
366 #Convert single element arrayrefs to scalars
367 map $_ = ${$_}[0], @cs_ids;
368
369 $self->log("New max_lengths for $table_name are:");
370
371 $self->log(join("\t", ('coord_system_id', 'max_length', 'longest record dbID')));
372
373 foreach my $cs_id(@cs_ids){
374 #This will always give a length of 1 even if there are no features present
375
376 my @cs_lengths;
377
378 #The probe_feature table is now too big to do this in one go
379 #We need to break this down into sr_ids
380
381 $sql = "SELECT distinct t.seq_region_id from $table_name t, seq_region sr where t.seq_region_id=sr.seq_region_id and sr.coord_system_id=$cs_id";
382
383 my @sr_ids = @{$self->db->dbc->db_handle->selectcol_arrayref($sql)};
384
385
386 #Get longest feature for all seq_regions
387 foreach my $sr_id(@sr_ids){
388 $sql = "SELECT (t.seq_region_end - t.seq_region_start + 1 ) as max, t.${table_name}_id "
389 . "FROM $table_name t "
390 . "WHERE t.seq_region_id = $sr_id ";
391 $sql .= ' and t.window_size=0' if $table_name eq 'result_feature';
392 $sql .= " order by max desc limit 1";
393
394
395 #Problem here is that DBs without 0 wsize result_feture entries will not get a meta_coord entry
396 #We need to implement this in the _pre_store method too?
397
398
399 my ($cs_length, $table_id);
400 ($cs_length, $table_id) = $self->db->dbc->db_handle->selectrow_array($sql);
401 push @cs_lengths, [$cs_length, $table_id] if $cs_length;
402 }
403
404
405 if(@cs_lengths){
406 #This will now contain a list of arrays refs contain the max length and feature id for
407 #each seq_region in this coord_system
408 #Now sort to get the longest
409 #Can't sort on 2 day array in the normal way
410 #One list list lists, the comparatee is no longer a list but a reference
411 @cs_lengths = sort { $b->[0] <=> $a->[0] } @cs_lengths;
412 $self->log(join("\t\t", ($cs_id, @{$cs_lengths[0]})));
413 $sql = "INSERT INTO meta_coord values('${table_name}', $cs_id, ".$cs_lengths[0][0].')';
414 $self->db->dbc->db_handle->do($sql);
415 }
416 }
417 }
418
419 $self->log("Finished updating meta_coord max_lengths\n");
420
421 return;
422 }
423
424
425 #change this to check_meta_species_version
426
427 sub check_meta_species_version{
428 my ($self) = @_;
429
430 $self->log_header('Checking meta species.production_name and schema_version against dbname');
431
432 my $dbname = $self->db->dbc->dbname;
433 (my $dbname_species = $dbname) =~ s/_funcgen_.*//;
434 my $mc = $self->db->get_MetaContainer;
435 my $schema_version = $mc->list_value_by_key('schema_version')->[0];
436
437 if(! defined $schema_version){
438 $self->report("FAIL:\tNo meta schema_version defined");
439 }
440 elsif($dbname !~ /funcgen_${schema_version}_/){
441 $self->report("FAIL:\tMeta schema_version ($schema_version) does not match the dbname ($dbname).");
442 }
443
444
445 my @latin_names = @{$mc->list_value_by_key('species.production_name')};
446
447 if(scalar(@latin_names) > 1){
448 $self->report("FAIL:\tFound more than one species.production_name in meta:\t".join(", ", @latin_names));
449 }
450 elsif(scalar(@latin_names) == 1 && ($latin_names[0] ne $dbname_species)){
451 $self->report("FAIL:\tFound mismatch between meta species.production_name and dbname:\t".$latin_names[0]." vs $dbname_species");
452 }
453 elsif(scalar(@latin_names) == 0){
454 $self->report("WARNING:\tFound no meta species.production_name setting as:\t$dbname_species");
455 $self->db->dbc->db_handle->do("INSERT into meta(species_id, meta_key, meta_value) values(1, 'species.production_name', '$dbname_species')");
456 }
457 #else is okay
458
459 return;
460 }
461
462
463
464 #Move to Java HC? Or update if update flag specified
465 #Using same code used by build_reg_feats!
466
467 sub check_regbuild_strings{
468 my ($self) = @_;
469
470 #Removed $update arg as we would always want to do this manually
471
472 $self->log_header('Checking regbuild strings');
473 my $species_id = $self->db()->species_id();
474
475
476 my @regf_fsets;
477 my $passed = 1;
478 my $fset_a = $self->db->get_FeatureSetAdaptor;
479 #my $mc = $self->db->get_MetaContainer;
480 my $regf_a = $self->db->get_RegulatoryFeatureAdaptor;
481 #We now want to chek all build
482 @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')};
483
484
485 if(scalar(@regf_fsets) == 0){
486 $self->report('WARNING:check_regbuild_strings found no regulatory FeatureSets (fine if '.$self->db->species.' your species does not have a regulatory build');
487 }
488 else{
489
490 warn "Need to check/update regbuild.version and regbuild.initial_release_date regbuild.last_annotation_update";
491
492
493 #How do we validate this?
494 #Check all feature_sets exist
495 #Pull back some features from a test slice and check the number of bits match.
496 #Check the feature_type string exists and matches else create.
497
498
499 foreach my $fset(@regf_fsets){
500 $self->log_header("Validating regbuild_string entries for FeatureSets:\t".$fset->name);
501
502 #Fail for old versions as we want to remove these
503 if( $fset->name =~ /_v[0-9]+$/){
504 $self->report("FAIL:\t".$fset->name." is an old RegulatoryFeature set, please remove!");
505 next;
506 }
507
508 my $cell_type = (defined $fset->cell_type) ? $fset->cell_type->name : 'core';
509
510 #This has been lifted from build_regulatory_features.pl store_regbuild_strings
511 #Need to move this to a RegulatoryBuilder module
512 my $dset = $self->db->get_DataSetAdaptor->fetch_by_product_FeatureSet($fset);
513
514 my @ssets = @{$dset->get_supporting_sets};
515
516 if(! @ssets){
517 throw('You must provide a DataSet with associated supporting sets');
518 }
519
520
521
522 my %reg_strings =
523 (
524 "regbuild.${cell_type}.feature_set_ids" => join(',', map {
525 $_->dbID} sort {$a->name cmp $b->name
526 } @ssets),
527
528 "regbuild.${cell_type}.feature_type_ids" => join(',', map {
529 $_->feature_type->dbID} sort {$a->name cmp $b->name
530 } @ssets),
531 );
532
533 my @ffset_ids;
534
535 #Skip this now as we use the ftype classes for defining the focus sets
536 #foreach my $fset(@ssets){
537
538
539 # #This might fail if soem TFs haven't been included as focus i.e. are part of PolII/III
540 # #e.g. TFIIIC-110
541 ##
542 # if( ($fset->feature_type->class eq 'Transcription Factor') ||
543 # ($fset->feature_type->class eq 'Open Chromatin') ){
544 # push @ffset_ids, $fset->dbID;
545 # }
546 # }
547
548
549 my ($sql, %db_reg_string);
550
551 foreach my $string_key(keys %reg_strings){
552 my ($string)= $self->db->dbc->db_handle->selectrow_array("select string from regbuild_string where name='${string_key}' and species_id=$species_id");
553
554 if (! defined $string) {
555 $sql = "insert into regbuild_string (species_id, name, string) values ($species_id, '${string_key}', '$reg_strings{${string_key}}');";
556
557 $self->report("WARNING:\tInserting absent $string_key into regbuild_string table");
558 eval { $self->db->dbc->do($sql) };
559 die("Couldn't store $string_key in regbuild_string table\n$sql\n$@") if $@;
560 }
561 elsif ($string ne $reg_strings{$string_key}){
562 $sql = "update regbuild_string set string='".$reg_strings{$string_key}."' where name='${string_key}';";
563
564 if($self->fix){
565 $self->report("WARNING:\tUpdating mismatched $string_key found in regbuild_string table:\t${string}");#\tUpdate using:\t$sql");
566 eval { $self->db->dbc->do($sql) };
567 die("Couldn't update $string_key in regbuild_string table\n$sql\n$@") if $@;
568
569 }
570 else{
571 $self->report("FAIL:\tMismatched $string_key found in regbuild_string table:\t${string}\n\tUpdate using:\t$sql");
572 }
573 }
574
575 $db_reg_string{$string_key} = $string;
576 }
577
578
579 #Now need to tidy this block wrt new code added above
580 my $fset_string_key = "regbuild.${cell_type}.feature_set_ids";
581 my $ftype_string_key = "regbuild.${cell_type}.feature_type_ids";
582 my $fset_string = $db_reg_string{$fset_string_key};
583 my $ftype_string = $db_reg_string{$ftype_string_key};
584
585 if(! ($fset_string && $ftype_string)){
586 $self->report("FAIL:\tSkipping fset vs ftype string test for $cell_type")
587 }
588 else{
589
590 #This is now effectively handled by the loop above
591
592 $self->log("Validating :\t$fset_string_key vs $ftype_string_key");
593
594 my @fset_ids = split/,/, $fset_string;
595 my @ftype_ids = split/,/, $ftype_string;
596 my @new_ftype_ids;
597 my $ftype_fail = 0;
598
599 #Now need to work backwards through ftypes to remove pseudo ftypes before validating
600 #New string should be A,A,A;S,S,S,S,S,S;P,P,P
601 #Where A is and Anchor/Seed set
602 #S is a supporting set
603 #P is a pseudo feature type e.g. TSS proximal
604
605
606 if(scalar(@fset_ids) != scalar(@ftype_ids)){
607 $self->report("FAIL:\tLength mismatch between:\n\t$fset_string_key(".scalar(@fset_ids).")\t$fset_string\n\tAND\n\t$ftype_string_key(".scalar(@ftype_ids).")\t$ftype_string");
608 }
609
610 foreach my $i(0..$#fset_ids){
611 my $supporting_set_id = $fset_ids[$i];
612 my $sset = $fset_a->fetch_by_dbID($supporting_set_id);
613
614 if(! defined $sset){
615 $self->report("FAIL:\t$fset_string_key $supporting_set_id does not exist in the DB");
616 }
617 else{
618 #test/build ftype string
619
620 if(defined $ftype_string){
621
622 if($sset->feature_type->dbID != $ftype_ids[$i]){
623 $ftype_fail = 1;
624 $self->report("FAIL:\t$fset_string_key $supporting_set_id(".$sset->name.") FeatureType(".$sset->feature_type->name.") does not match $ftype_string_key $ftype_ids[$i]");
625 }
626 }
627
628 push @new_ftype_ids, $sset->feature_type->dbID;
629
630 }
631 }
632
633
634 #Set ftype_string
635 #This will not account for pseudo ftypes? Remove!!!?
636 my $new_ftype_string = join(',', @new_ftype_ids);
637
638 if(! defined $ftype_string){
639 $self->log("Updating $ftype_string_key to:\t$new_ftype_string");
640 $self->db->dbc->db_handle->do("INSERT into regbuild_string(species_id, name, string) values($species_id, '$ftype_string_key', '$new_ftype_string')");
641 }
642 elsif($ftype_fail){
643 $self->report("FAIL:\t$ftype_string_key($ftype_string) does not match $fset_string_key types($new_ftype_string)");
644 }
645
646
647 #Finally validate versus a reg feat
648 #Need to change this to ftype string rather than fset string?
649 my $id_row_ref = $self->db->dbc->db_handle->selectrow_arrayref('select regulatory_feature_id from regulatory_feature where feature_set_id='.$fset->dbID.' limit 1');
650
651 if(! defined $id_row_ref){
652 $self->report("FAIL:\tNo RegulatoryFeatures found for FeatureSet ".$fset->name);
653 }
654 else{
655 my ($regf_dbID) = @$id_row_ref;
656 my $rf_string = $regf_a->fetch_by_dbID($regf_dbID)->binary_string;
657
658 if(length($rf_string) != scalar(@fset_ids)){
659 $self->report("FAIL:\tRegulatory string length mismatch between RegulatoryFeature($regf_dbID) and $fset_string_key:\n$rf_string(".length($rf_string).")\n$fset_string(".scalar(@fset_ids).")");
660 }
661 }
662 }
663 }
664 }
665
666 return;
667 }
668
669
670 #Change this to log sets and incorporate RegFeat FeatureSet as standard
671 #Grab all reg fsets
672 #grab all displayable data sets which aren't reg sets?
673
674 sub log_data_sets{
675 my $self = shift;
676
677 my $dset_adaptor = $self->db->get_DataSetAdaptor;
678 my ($status);
679 my $txt = 'Checking ';
680 $status = 'DISPLAYABLE' if($self->{'check_displayable'});
681 $txt.= $status.' ' if $status;
682 $txt .= 'DataSets';
683 $self->log_header($txt);
684
685 #Check for status first to avoid warning from BaseAdaptor.
686 eval { $dset_adaptor->_get_status_name_id($status) };
687
688 if($@){
689 $self->report("FAIL: You have specified check_displayable, but the DISPLAYABLE status_name is not present in the DB");
690 return;
691 }
692
693
694 my @dsets;
695 my $dsets = $dset_adaptor->fetch_all($status);
696 @dsets = @$dsets if defined $dsets;
697
698
699 $self->log('Found '.scalar(@dsets).' DataSets');
700
701
702 foreach my $dset(@dsets){
703 $self->log_set("DataSet:\t\t", $dset) ;
704
705 my $fset = $dset->product_FeatureSet;
706 $self->log_set("Product FeatureSet:\t", $fset) if $fset;
707
708 my @supporting_sets = @{$dset->get_supporting_sets};
709
710 $self->log('Found '.scalar(@supporting_sets).' supporting sets:');
711
712 if(my @supporting_sets = @{$dset->get_supporting_sets}){
713 #type here could be result, experimental or feature
714 #and feature could be annotated or experimental
715 #Move this to log set?
716
717 map { my $type = ref($_);
718 $type =~ s/.*://;
719 $type .= '('.$_->feature_class.')' if($type eq 'FeatureSet');
720 #Need to sprintf $type here to fixed width
721 $self->log_set($type.":\t", $_)} @supporting_sets;
722 }
723 $self->log();
724 }
725
726 return;
727 }
728
729 sub log_set{
730 my ($self, $text, $set) = @_;
731
732 $text .= $set->display_label.'('.$set->name.')';
733 $text .= "\tDISPLAYABLE" if($set->is_displayable);
734 $self->log($text);
735
736 return;
737 }
738
739
740 sub check_stable_ids{
741 my ($self, @slices) = @_;
742
743 my $species_id = $self->db()->species_id();
744
745 $self->log_header('Checking stable IDs');
746
747 my $fset_a = $self->db->get_FeatureSetAdaptor;
748
749 my @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')};
750
751 if(!@regf_fsets){
752 $self->report('WARNING: No regulatory FeatureSets found (fine if '.$self->db->species.' does not have a regulatory build)');
753 }
754 else{
755
756 foreach my $fset(@regf_fsets){
757
758 if($fset->name =~ /_v[0-9]$/){
759 $self->log("Skipping stable_id test on archived set:\t".$fset->name);
760 next;
761 }
762
763 #Can't count NULL field, so have to count regulatory_feature_id!!!
764
765 #getting SR product here!!
766 my $sql = "select count(rf.regulatory_feature_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.stable_id is NULL and rf.seq_region_id = sr.seq_region_id and sr.coord_system_id = cs.coord_system_id and cs.species_id = $species_id and rf.feature_set_id=".$fset->dbID;
767
768
769
770 my ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)};
771
772 if($null_sids){
773 $self->report("FAIL: Found a total of $null_sids NULL stable IDs for ".$fset->name);
774
775 my $slice_a = $self->db->get_SliceAdaptor;
776
777 if(! @slices){
778 @slices = @{$slice_a->fetch_all('toplevel', 1)};
779 }
780
781 foreach my $slice(@slices){
782 my $sr_name=$slice->seq_region_name;
783 $sql = 'select count(rf.stable_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.seq_region_id=sr.seq_region_id and sr.name="'.$sr_name.'" and sr.coord_system_id = cs.coord_system_id and cs.species_id = '.$species_id.' and rf.stable_id is NULL and rf.feature_set_id='.$fset->dbID;
784 ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)};
785
786 #This is not reporting properly.
787
788 $self->log($fset->name.":\t$null_sids NULL stable IDs on ".$slice->name) if $null_sids;
789 }
790 }
791 else{
792 $self->log($fset->name.":\tNo NULL stable IDs found");
793 }
794 }
795 }
796
797 return;
798
799 }
800
801
802 #This is for mart to enable them to join to the seq_region table without
803 #creating a product from all the reundant seq_region entries for each schema_build
804
805 sub set_current_coord_system{
806 my ($self) = @_;
807
808
809
810 my $schema_build = $self->db->_get_schema_build($self->db->dnadb);
811 $self->log_header("Setting current coord_system on $schema_build");
812
813 my $sql = "update coord_system set is_current=False where schema_build !='$schema_build'";
814 $self->db->dbc->do($sql);
815 $sql = 'update coord_system set is_current=True where schema_build ="'.$schema_build.'" and attrib like "%default_version%"';
816 $self->db->dbc->do($sql);
817
818 return;
819 }
820
821 sub validate_DataSets{
822 my $self = shift;
823
824 $self->log_header('Validating DataSets');
825
826
827 #checks regualtory feature data and supporting sets
828 #links between DataSet and FeatureSet, i.e. correct naming, not linking to old set
829 #Displayable, DAS_DISPLAYABLE, IMPORTED_ASSM, MART_DISPLAYABLE
830 #Naming of result_set should match data/feature_set
831 #warn non-attr feature/result sets which are DISPLAYABL
832 #warn about DISPLAYABLE sets which do not have displayable set in analysis_description.web_data
833
834
835 my $fset_a = $self->db->get_FeatureSetAdaptor;
836 my $dset_a = $self->db->get_DataSetAdaptor;
837 my ($dset_states, $rset_states, $fset_states) = $self->get_regbuild_set_states($self->db);
838
839 my %rf_fsets;
840 my %set_states;
841 my $sql;
842
843 RF_FSET: foreach my $rf_fset(@{$fset_a->fetch_all_by_type('regulatory')}){
844 my $rf_fset_name = $rf_fset->name;
845
846
847 $self->log("Validating $rf_fset_name");
848
849
850 $rf_fsets{$rf_fset_name} = $rf_fset;#Do we only need the name for checking the dsets independantly?
851
852 if($rf_fset_name =~ /_v[0-9]+$/){
853 $self->report("FAIL:\tFound archived regulatory FeatureSet:\t$rf_fset_name");
854 next RF_FSET;
855 }
856
857 foreach my $state(@$fset_states){
858
859 if(! $rf_fset->has_status($state)){
860 $self->report("WARNING:\tUpdating FeatureSet $rf_fset_name with status $state");
861
862 $sql = 'INSERT into status select '. $rf_fset->dbID.
863 ", 'feature_set', status_name_id from status_name where name='$state'";
864
865 $self->db->dbc->db_handle->do($sql);
866 }
867 }
868
869 #Do we need to warn about other states?
870
871
872
873 my $rf_dset = $dset_a->fetch_by_product_FeatureSet($rf_fset);
874
875 if(! $rf_dset){
876 $self->report("FAIL:\tNo DataSet for FeatureSet:\t$rf_fset_name");
877 next RF_FSET;
878 }
879
880
881
882 if($rf_fset_name ne $rf_dset->name){
883 $self->report("FAIL:\tFound Feature/DataSet name mismatch:\t$rf_fset_name vs ".$rf_dset->name);
884 next RF_FSET;
885 }
886
887
888 foreach my $state(@$dset_states){
889
890 if(! $rf_dset->has_status($state)){
891 #Can we just update and warn here?
892 #Or do this separately in case we want some control over this?
893 $self->report("WARNING:\tUpdating DataSet $rf_fset_name with status $state");
894
895 $sql = 'INSERT into status select '.$rf_dset->dbID.
896 ", 'data_set', status_name_id from status_name where name='$state'";
897 $self->db->dbc->db_handle->do($sql);
898 }
899 }
900
901
902
903 #We should check fset ctype matches all attr_set ctypes?
904 #May have problems if we want to merge two lines into the same ctype build
905
906
907
908 foreach my $ra_fset(@{$rf_dset->get_supporting_sets}){
909
910 foreach my $state(@$fset_states){
911
912 if(! $ra_fset->has_status($state)){
913 #Can we just update and warn here?
914 #Or do this separately in case we want some control over this?
915 $self->report("WARNING:\tUpdating FeatureSet ".$ra_fset->name." with status $state");
916
917 $sql = 'INSERT into status select '.$ra_fset->dbID.
918 ", 'feature_set', status_name_id from status_name where name='$state'";
919 $self->db->dbc->db_handle->do($sql);
920 }
921 }
922
923
924
925 my $ra_dset = $dset_a->fetch_by_product_FeatureSet($ra_fset);
926 my @ssets = @{$ra_dset->get_supporting_sets(undef, 'result')};
927 my @displayable_sets;
928
929 foreach my $sset(@ssets){
930
931 if($sset->has_status('DISPLAYABLE')){
932 push @displayable_sets, $sset;
933 }
934 }
935 #Change this to get all then check status
936 #else print update sql
937
938 if(scalar(@displayable_sets) > 1){#There should only be one
939 $self->report("FAIL:\tThere should only be one DISPLAYABLE supporting ResultSet for DataSet:\t".$ra_dset->name);
940 }
941 elsif(scalar(@displayable_sets) == 0){
942
943 my $msg;
944
945 if(scalar(@ssets) == 1){
946
947 #fix here?
948
949 $msg = "Found unique non-DISPLAYABLE ResultSet:\t".$ssets[0]->name.
950 "\n\tinsert into status select ".$ssets[0]->dbID.
951 ", 'result_set', status_name_id from status_name where name='DISPLAYABLE';";
952 }
953 else{
954 $msg = "Found ".scalar(@ssets)." ResultSets ".join("\t", map($_->name, @ssets));
955 }
956
957 $self->report("FAIL:\tThere are no DISPLAYABLE supporting ResultSets for DataSet:\t".
958 $ra_dset->name."\n$msg");
959
960
961
962 next; #$ra_fset
963 }
964
965 my $ra_rset = $ssets[0];
966
967 foreach my $state(@$rset_states){
968
969 if(! $ra_rset->has_status($state)){
970 #Can we just update and warn here?
971 #Or do this separately in case we want some control over this?
972 $self->report("WARNING:\tUpdating ResultSet ".$ra_rset->name." with status $state");
973
974 $sql = 'INSERT into status select '.$ra_rset->dbID.
975 ", 'result_set', status_name_id from status_name where name='$state'";
976 $self->db->dbc->db_handle->do($sql);
977 }
978 }
979 }
980 }
981
982 return;
983 } # End of validate_DataSets
984
985
986
987
988
989
990 sub analyse_and_optimise_tables{
991 my $self = shift;
992
993 #myisamchk --analyze. or analyze statement
994
995 if($self->{'skip_analyse'}){
996 $self->log_header('Skipping analyse/optimise tables');
997 return;
998 }
999
1000 $self->log_header("Analysing and optimising tables");
1001
1002
1003 my $sql = 'show tables;';
1004 my @tables = @{$self->db->dbc->db_handle->selectall_arrayref($sql)};
1005 map $_ = "@{$_}", @tables;
1006 my $analyse_sql = 'analyze table ';
1007 my $optimise_sql = 'optimize table ';
1008
1009
1010
1011 foreach my $table(@tables){
1012 $self->log("Analysing and optimising $table");
1013
1014
1015
1016 #Remove analyse as optimise does everything this does
1017 my @anal_info = @{$self->db->dbc->db_handle->selectall_arrayref($analyse_sql.$table)};
1018
1019 foreach my $line_ref(@anal_info){
1020 my $status = $line_ref->[3];
1021 $self->report("FAIL: analyse $table status $status") if (!($status eq 'OK' || $status eq 'Table is already up to date'));
1022 }
1023
1024 my @opt_info = @{$self->db->dbc->db_handle->selectall_arrayref($optimise_sql.$table)};
1025
1026 foreach my $line_ref(@opt_info){
1027
1028 my $status = $line_ref->[3];
1029 $self->report("FAIL: optimise $table status $status") if (!( $status eq 'OK' || $status eq 'Table is already up to date'));
1030 }
1031
1032 }
1033
1034 return;
1035 }# end of analyse_and_optimise_tables
1036
1037
1038 sub clean_xrefs{
1039 my ($self) = @_;
1040
1041 if($self->{'skip_xrefs'}){
1042 $self->log_header('Skipping clean_xrefs');
1043 return;
1044 }
1045
1046 $self->log_header("Cleaning unlinked xref records");
1047
1048 my $sql = 'DELETE x FROM xref x LEFT JOIN object_xref ox ON ox.xref_id = x.xref_id WHERE ox.xref_id IS NULL';
1049 #Should this also take accoumt of unmapped_objects?
1050 #No, as unmapped_object doesn't use xref, but probably should
1051
1052 my $row_cnt = $self->db->dbc->do($sql);
1053
1054 $self->reset_table_autoinc('xref', 'xref_id', $self->db);
1055 $row_cnt = 0 if $row_cnt eq '0E0';
1056 $self->log("Deleted $row_cnt unlinked xref records");
1057
1058
1059 #Now remove old edbs
1060 $self->log_header("Cleaning unlinked external_db records");
1061
1062 #Need to account for xref and unmapped_object here
1063 $sql = 'DELETE edb FROM external_db edb '.
1064 'LEFT JOIN xref x ON x.external_db_id = edb.external_db_id '.
1065 'LEFT JOIN unmapped_object uo ON uo.external_db_id=edb.external_db_id '.
1066 'WHERE x.external_db_id IS NULL and uo.external_db_id is NULL';
1067 $row_cnt = $self->db->dbc->do($sql);
1068
1069 $self->reset_table_autoinc('external_db', 'external_db_id', $self->db);
1070 $row_cnt = 0 if $row_cnt eq '0E0';
1071 $self->log("Deleted $row_cnt unlinked external_db records");
1072
1073
1074 #Shouldn't clean orphaned oxs here as this means a rollback been done underneath the ox data
1075 #or we have xref_id=0!
1076 #Leave this to HC?
1077
1078
1079
1080
1081
1082 return;
1083 }
1084
1085
1086 1;