Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/HealthChecker.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <ensembl-dev@ebi.ac.uk>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =head1 NAME | |
20 | |
21 Bio::EnsEMBL::Funcgen::Utils::HealthChecker | |
22 | |
23 =head1 SYNOPSIS | |
24 | |
25 =head1 DESCRIPTION | |
26 | |
27 B<This program> provides several methods to health check and update tables prior to | |
28 release. Using the updte_DB_for_release method runs the following: | |
29 | |
30 validate_new_seq_regions - _pre_stores seq_region & coord_system info from new core DB | |
31 check_regbuild_strings - Validates or inserts regbuild_string entries | |
32 check_meta_species_version - Validates meta species and version wrt dbname | |
33 set_current_coord_system - Updates coord_system.is_current to 1 for current schema_build (required for mart) | |
34 update_meta_coord - Regenerates meta_coord.max_length values (required for Slice range queries) | |
35 clean_xrefs - Removes old unused xref and external_db records | |
36 validate_DataSets - Performs various checks on Data/Feature/ResultSets links and states | |
37 check_stable_ids - Check for any NULL stable IDs | |
38 log_data_sets - Logs all DISPLAYABLE DataSets | |
39 analyse_and_optimise_tables - Does what is says | |
40 | |
41 | |
42 =cut | |
43 | |
44 ################################################################################ | |
45 | |
46 package Bio::EnsEMBL::Funcgen::Utils::HealthChecker; | |
47 | |
48 use strict; | |
49 use Bio::EnsEMBL::Funcgen::Utils::Helper; | |
50 use Bio::EnsEMBL::Funcgen::ProbeFeature; | |
51 use Bio::EnsEMBL::Utils::Argument qw( rearrange ); | |
52 use Bio::EnsEMBL::Utils::Exception qw( throw ); | |
53 use Bio::EnsEMBL::Analysis; | |
54 use vars qw(@ISA); | |
55 @ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper); | |
56 | |
57 | |
58 | |
59 #TO DO | |
60 # 1 DONE Print all fails and warnings in summary at end of script. | |
61 # 2 validate_RegulatoryFeature_Sets | |
62 # 3 Some of these can be migrated or mirrored in java HCs for safety | |
63 | |
64 | |
65 ################################################################################ | |
66 | |
67 sub new { | |
68 my $caller = shift; | |
69 my $class = ref($caller) || $caller; | |
70 my $self = $class->SUPER::new(@_); | |
71 | |
72 #validate and set type, analysis and feature_set here | |
73 my ($db, $builds, $skip_mc, $check_displayable, $skip_analyse, $meta_coord_tables, $skip_xrefs, $fix) = | |
74 rearrange(['DB', 'BUILDS', 'SKIP_META_COORD', 'CHECK_DISPLAYABLE', 'SKIP_ANALYSE', 'META_COORD_TABLES', 'SKIP_XREF_CLEANUP', 'FIX'], @_); | |
75 | |
76 | |
77 if (! ($db && ref($db) && | |
78 $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ | |
79 throw('You must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); | |
80 } | |
81 | |
82 #test connection | |
83 $db->dbc->db_handle; | |
84 | |
85 $self->{'db'} = $db; | |
86 $self->{'mysql_connect_string'} = 'mysql -h'.$db->dbc->host.' -u'.$db->dbc->username.' -p' | |
87 .$db->dbc->password.' '.$db->dbc->dbname.' -P'.$db->dbc->port; | |
88 $self->{'dbname'} = $db->dbc->dbname; | |
89 $self->{'builds'} = (scalar(@$builds)>0) ? $builds : []; | |
90 $self->{'skip_meta_coord'} = $skip_mc; | |
91 $self->{'skip_xrefs'} = $skip_xrefs; | |
92 $self->{'skip_analyse'} = $skip_analyse; | |
93 $self->{'check_displayable'} = $check_displayable; | |
94 $self->{fix} = $fix; | |
95 | |
96 if(defined $meta_coord_tables){ | |
97 | |
98 throw('-skip_meta_coord is set, Cannot build meta_coord entries for tables '.join(', ', @$meta_coord_tables)); | |
99 | |
100 if(! ref($meta_coord_tables) eq 'ARRAY'){ | |
101 throw('-meta_coord_tables parameter must be an array ref'); | |
102 } | |
103 | |
104 @{$self->{'meta_coord_tables'}} = @$meta_coord_tables; | |
105 } | |
106 | |
107 return $self; | |
108 } | |
109 | |
110 sub db{ return $_[0]->{db}; } | |
111 | |
112 sub fix{ return $_[0]->{fix}; } | |
113 | |
114 =head2 update_db_for_release | |
115 | |
116 Arg[0] : | |
117 Example : | |
118 Description: Wrapper method to perform all common update functions | |
119 Returntype : | |
120 Exceptions : None | |
121 Caller : General | |
122 Status : at risk | |
123 | |
124 =cut | |
125 | |
126 sub update_db_for_release{ | |
127 my ($self, @args) = @_; | |
128 | |
129 if(@args){ | |
130 } | |
131 | |
132 #do seq_region_update to validate dnadb first | |
133 #hence avoiding redoing longer methods | |
134 $self->validate_new_seq_regions;#($force_srs); | |
135 #$self->update_meta_schema_version; | |
136 $self->check_regbuild_strings; | |
137 $self->check_meta_species_version; | |
138 $self->set_current_coord_system; | |
139 $self->update_meta_coord; | |
140 $self->clean_xrefs; | |
141 $self->validate_DataSets; | |
142 $self->check_stable_ids; | |
143 $self->log_data_sets(); | |
144 $self->analyse_and_optimise_tables;#ALWAYS LAST!! | |
145 | |
146 $self->log_header('??? Have you dumped/copied GFF dumps ???'); | |
147 | |
148 #Log footer? Pass optional counts hash? | |
149 $self->log('Finished updating '.$self->{'dbname'}." for release\n\n"); | |
150 } | |
151 | |
152 sub validate_new_seq_regions{ | |
153 my ($self, $force) = @_; | |
154 | |
155 | |
156 #We need to add some functionality to handle non-standard schema_build progression here | |
157 #This should be used before any data is loaded | |
158 #It should also warn if there any duplicates if it is not run before coord_systems are duplicated | |
159 #Should this just be handled in the BaseFeatureAdaptor/CoordSystemAdaptor? | |
160 | |
161 | |
162 | |
163 | |
164 #do we need to add the none default levels here? | |
165 #or are we only bothered about those which constitute the toplevel? | |
166 | |
167 #To make sure we have all the correct levels in eFG we need to get all the names. | |
168 #then get all by name from the core db and set them as the dnadb. | |
169 # we also need to get all the toplevel seq_regions and store them in the seq_region table | |
170 #use BaseFeatureAdaptor::_pre_store with and array of pseudo feature on each top level slice | |
171 | |
172 #Validate the efgdb and dnadb schema version are the same first | |
173 #This is because we expect the schem_build to be the same for a release | |
174 #However, this will autoset the dnadb if no defined, so will always match! | |
175 | |
176 if(! $force){ | |
177 my $efgdb_sm = join('_', @{$self->get_schema_and_build($self->{'dbname'})}); | |
178 my $dnadb_sm = join('_', @{$self->get_schema_and_build($self->db->dnadb->dbc->dbname)}); | |
179 | |
180 if($efgdb_sm ne $dnadb_sm){ | |
181 $self->report("WARNING Skipped validate_new_seq_regions as schema_versions are mismatched:\t". | |
182 "efgdb $efgdb_sm\tdnadb $dnadb_sm"); | |
183 return 0; | |
184 } | |
185 } | |
186 | |
187 my $pf_adaptor = $self->db->get_ProbeFeatureAdaptor(); | |
188 my $slice_adaptor = $self->db->dnadb->get_SliceAdaptor(); | |
189 my $dnadb_csa = $self->db->dnadb->get_CoordSystemAdaptor; | |
190 | |
191 $self->log_header('Validating new coord_systems/seq_regions'); | |
192 | |
193 my @slices; | |
194 my %versioned_levels; | |
195 my $default_version; | |
196 | |
197 #Grab unversioned top level slices and versioned levels | |
198 #May miss some old versioned level if the new assembly no longer has them | |
199 foreach my $slice(@{$slice_adaptor->fetch_all('toplevel', undef, 1)}){ | |
200 | |
201 if (! $slice->coord_system->version){ | |
202 push @slices, $slice; | |
203 } | |
204 else{ | |
205 | |
206 if($default_version && | |
207 ($default_version ne $slice->coord_system->version)){ | |
208 throw("Found more than one default CoordSystem version:\t${default_version}\t".$slice->coord_system->version); | |
209 } | |
210 else{ | |
211 $default_version = $slice->coord_system->version; | |
212 } | |
213 } | |
214 } | |
215 | |
216 | |
217 #Get all versioned levels for all builds | |
218 foreach my $cs(@{$dnadb_csa->fetch_all}){ | |
219 | |
220 if($cs->version){ | |
221 $versioned_levels{$cs->version} ||= []; | |
222 push @{$versioned_levels{$cs->version}}, $cs->name; | |
223 } | |
224 } | |
225 | |
226 push @{$self->{'builds'}}, $default_version if scalar(@{$self->{'builds'}}) == 0; | |
227 | |
228 | |
229 | |
230 #Grab slices for each versioned level | |
231 foreach my $build(@{$self->{'builds'}}){ | |
232 | |
233 if(! exists $versioned_levels{$build}){ | |
234 throw("CoordSystem version $build does not exist in the dnadb ".$self->db->dnadb->dbc->dbname); | |
235 } | |
236 | |
237 foreach my $level(@{$versioned_levels{$build}}){ | |
238 $self->log("Getting slices for $level $build"); | |
239 push @slices, @{$slice_adaptor->fetch_all($level, $build)}; | |
240 } | |
241 } | |
242 | |
243 $self->log("Importing seq_region/coord_system info for builds:\t".join(',', @{$self->{'builds'}})); | |
244 | |
245 foreach my $slice(@slices){ | |
246 | |
247 if($slice->start() != 1){ | |
248 $self->log("Reslicing slice:\t".$slice->name()); | |
249 #we must have some sort of PAR linked region i.e. Y | |
250 $slice = $slice_adaptor->fetch_by_region($slice->coord_system_name(), $slice->seq_region_name()); | |
251 } | |
252 | |
253 | |
254 #we need test if it needs doing first? | |
255 #we would need to test for the coord_systems outside of this loop | |
256 #and then for each seq_region inside the loop if the coord_system is present | |
257 | |
258 $self->log("_pre_storing seq_region info for slice:\t".$slice->name()); | |
259 | |
260 my $pseudo_feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new | |
261 ( | |
262 -slice => $slice, | |
263 -start => 0, | |
264 -end => 0, | |
265 -strand => 0, | |
266 ); | |
267 | |
268 $pf_adaptor->_pre_store($pseudo_feature); | |
269 #This will create a meta_coord entry of max_length 1 for features which have an absent meta_coord entry | |
270 | |
271 } | |
272 | |
273 | |
274 $self->log("Finished validating seq_regions\n"); | |
275 | |
276 return; | |
277 } | |
278 | |
279 | |
280 | |
281 | |
282 sub update_meta_coord{ | |
283 my ($self, @table_names) = @_; | |
284 | |
285 my $species_id = $self->db->species_id; | |
286 | |
287 if($self->{'skip_meta_coord'}){ | |
288 $self->log("Skipping meta_coord update\n"); | |
289 return; | |
290 } | |
291 | |
292 | |
293 $self->log_header('Updating meta_coord table'); | |
294 | |
295 | |
296 #set default table_name | |
297 if(! @table_names || scalar(@table_names) == 0){ | |
298 | |
299 #Can we do this via DBAdaptor and get all available adaptors which are BaseFeatureAdaptors then grab the first table name | |
300 | |
301 if(defined $self->{'meta_coord_tables'}){ | |
302 @table_names = @{$self->{'meta_coord_tables'}}; | |
303 } | |
304 else{#default | |
305 | |
306 @table_names = qw( | |
307 regulatory_feature | |
308 probe_feature | |
309 external_feature | |
310 annotated_feature | |
311 result_feature | |
312 segmentation_feature | |
313 ); | |
314 } | |
315 } | |
316 | |
317 #backup meta coord | |
318 if(system($self->{'mysql_connect_string'}." -e 'SELECT * FROM meta_coord'" | |
319 . '> '.$self->{'dbname'}.'meta_coord.backup' | |
320 ) != 0 ){ | |
321 | |
322 throw("Can't dump the original meta_coord for back up");#will this get copied to log? | |
323 } | |
324 else { | |
325 $self->log('Original meta_coord table backed up in '. $self->{'dbname'}.'.meta_coord.backup'); | |
326 } | |
327 | |
328 | |
329 #Update each max_length for table_name and coord_system | |
330 | |
331 my $sql; | |
332 | |
333 foreach my $table_name(@table_names){ | |
334 | |
335 $sql = "select distinct(cs.name), mc.coord_system_id, cs.version, mc.max_length from coord_system cs, meta_coord mc where mc.table_name='$table_name' and mc.coord_system_id=cs.coord_system_id and cs.species_id = $species_id"; | |
336 | |
337 $self->log(''); | |
338 $self->log("Updating meta_coord max_length for $table_name:"); | |
339 $self->log("name\tcoord_system_id\tversion\tmax_length"); | |
340 | |
341 #can we test for emtpy array here? Then skip delete. | |
342 | |
343 my @info = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; | |
344 | |
345 #log this | |
346 map {$self->log(join("\t", @{$_}))} @info; | |
347 | |
348 # Clean old entries | |
349 $self->log("Deleting old meta_coord entries"); | |
350 $sql = "DELETE mc FROM meta_coord mc, coord_system cs WHERE mc.table_name ='$table_name' and mc.coord_system_id = cs.coord_system_id and cs.species_id = $species_id"; | |
351 $self->db->dbc->db_handle->do($sql); | |
352 | |
353 # Generate new max_lengths | |
354 $self->log("Generating new max_lengths"); | |
355 | |
356 #Is this query running for each redundant cs_id? | |
357 #would it be more efficient to retrieve the NR cs_ids first and loop the query for each cs_id? | |
358 | |
359 #Can we get the dbID of the largest feature for ease of checking? | |
360 #This won't work as we're grouping by coord_system | |
361 #would need to select distinct coord_system_id for table first | |
362 #This may well slow down quite a bit doing it this way | |
363 | |
364 $sql = "select distinct s.coord_system_id from coord_system cs, seq_region s, $table_name t WHERE t.seq_region_id = s.seq_region_id and s.coord_system_id = cs.coord_system_id and cs.species_id = $species_id"; | |
365 my @cs_ids = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; | |
366 #Convert single element arrayrefs to scalars | |
367 map $_ = ${$_}[0], @cs_ids; | |
368 | |
369 $self->log("New max_lengths for $table_name are:"); | |
370 | |
371 $self->log(join("\t", ('coord_system_id', 'max_length', 'longest record dbID'))); | |
372 | |
373 foreach my $cs_id(@cs_ids){ | |
374 #This will always give a length of 1 even if there are no features present | |
375 | |
376 my @cs_lengths; | |
377 | |
378 #The probe_feature table is now too big to do this in one go | |
379 #We need to break this down into sr_ids | |
380 | |
381 $sql = "SELECT distinct t.seq_region_id from $table_name t, seq_region sr where t.seq_region_id=sr.seq_region_id and sr.coord_system_id=$cs_id"; | |
382 | |
383 my @sr_ids = @{$self->db->dbc->db_handle->selectcol_arrayref($sql)}; | |
384 | |
385 | |
386 #Get longest feature for all seq_regions | |
387 foreach my $sr_id(@sr_ids){ | |
388 $sql = "SELECT (t.seq_region_end - t.seq_region_start + 1 ) as max, t.${table_name}_id " | |
389 . "FROM $table_name t " | |
390 . "WHERE t.seq_region_id = $sr_id "; | |
391 $sql .= ' and t.window_size=0' if $table_name eq 'result_feature'; | |
392 $sql .= " order by max desc limit 1"; | |
393 | |
394 | |
395 #Problem here is that DBs without 0 wsize result_feture entries will not get a meta_coord entry | |
396 #We need to implement this in the _pre_store method too? | |
397 | |
398 | |
399 my ($cs_length, $table_id); | |
400 ($cs_length, $table_id) = $self->db->dbc->db_handle->selectrow_array($sql); | |
401 push @cs_lengths, [$cs_length, $table_id] if $cs_length; | |
402 } | |
403 | |
404 | |
405 if(@cs_lengths){ | |
406 #This will now contain a list of arrays refs contain the max length and feature id for | |
407 #each seq_region in this coord_system | |
408 #Now sort to get the longest | |
409 #Can't sort on 2 day array in the normal way | |
410 #One list list lists, the comparatee is no longer a list but a reference | |
411 @cs_lengths = sort { $b->[0] <=> $a->[0] } @cs_lengths; | |
412 $self->log(join("\t\t", ($cs_id, @{$cs_lengths[0]}))); | |
413 $sql = "INSERT INTO meta_coord values('${table_name}', $cs_id, ".$cs_lengths[0][0].')'; | |
414 $self->db->dbc->db_handle->do($sql); | |
415 } | |
416 } | |
417 } | |
418 | |
419 $self->log("Finished updating meta_coord max_lengths\n"); | |
420 | |
421 return; | |
422 } | |
423 | |
424 | |
425 #change this to check_meta_species_version | |
426 | |
427 sub check_meta_species_version{ | |
428 my ($self) = @_; | |
429 | |
430 $self->log_header('Checking meta species.production_name and schema_version against dbname'); | |
431 | |
432 my $dbname = $self->db->dbc->dbname; | |
433 (my $dbname_species = $dbname) =~ s/_funcgen_.*//; | |
434 my $mc = $self->db->get_MetaContainer; | |
435 my $schema_version = $mc->list_value_by_key('schema_version')->[0]; | |
436 | |
437 if(! defined $schema_version){ | |
438 $self->report("FAIL:\tNo meta schema_version defined"); | |
439 } | |
440 elsif($dbname !~ /funcgen_${schema_version}_/){ | |
441 $self->report("FAIL:\tMeta schema_version ($schema_version) does not match the dbname ($dbname)."); | |
442 } | |
443 | |
444 | |
445 my @latin_names = @{$mc->list_value_by_key('species.production_name')}; | |
446 | |
447 if(scalar(@latin_names) > 1){ | |
448 $self->report("FAIL:\tFound more than one species.production_name in meta:\t".join(", ", @latin_names)); | |
449 } | |
450 elsif(scalar(@latin_names) == 1 && ($latin_names[0] ne $dbname_species)){ | |
451 $self->report("FAIL:\tFound mismatch between meta species.production_name and dbname:\t".$latin_names[0]." vs $dbname_species"); | |
452 } | |
453 elsif(scalar(@latin_names) == 0){ | |
454 $self->report("WARNING:\tFound no meta species.production_name setting as:\t$dbname_species"); | |
455 $self->db->dbc->db_handle->do("INSERT into meta(species_id, meta_key, meta_value) values(1, 'species.production_name', '$dbname_species')"); | |
456 } | |
457 #else is okay | |
458 | |
459 return; | |
460 } | |
461 | |
462 | |
463 | |
464 #Move to Java HC? Or update if update flag specified | |
465 #Using same code used by build_reg_feats! | |
466 | |
467 sub check_regbuild_strings{ | |
468 my ($self) = @_; | |
469 | |
470 #Removed $update arg as we would always want to do this manually | |
471 | |
472 $self->log_header('Checking regbuild strings'); | |
473 my $species_id = $self->db()->species_id(); | |
474 | |
475 | |
476 my @regf_fsets; | |
477 my $passed = 1; | |
478 my $fset_a = $self->db->get_FeatureSetAdaptor; | |
479 #my $mc = $self->db->get_MetaContainer; | |
480 my $regf_a = $self->db->get_RegulatoryFeatureAdaptor; | |
481 #We now want to chek all build | |
482 @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')}; | |
483 | |
484 | |
485 if(scalar(@regf_fsets) == 0){ | |
486 $self->report('WARNING:check_regbuild_strings found no regulatory FeatureSets (fine if '.$self->db->species.' your species does not have a regulatory build'); | |
487 } | |
488 else{ | |
489 | |
490 warn "Need to check/update regbuild.version and regbuild.initial_release_date regbuild.last_annotation_update"; | |
491 | |
492 | |
493 #How do we validate this? | |
494 #Check all feature_sets exist | |
495 #Pull back some features from a test slice and check the number of bits match. | |
496 #Check the feature_type string exists and matches else create. | |
497 | |
498 | |
499 foreach my $fset(@regf_fsets){ | |
500 $self->log_header("Validating regbuild_string entries for FeatureSets:\t".$fset->name); | |
501 | |
502 #Fail for old versions as we want to remove these | |
503 if( $fset->name =~ /_v[0-9]+$/){ | |
504 $self->report("FAIL:\t".$fset->name." is an old RegulatoryFeature set, please remove!"); | |
505 next; | |
506 } | |
507 | |
508 my $cell_type = (defined $fset->cell_type) ? $fset->cell_type->name : 'core'; | |
509 | |
510 #This has been lifted from build_regulatory_features.pl store_regbuild_strings | |
511 #Need to move this to a RegulatoryBuilder module | |
512 my $dset = $self->db->get_DataSetAdaptor->fetch_by_product_FeatureSet($fset); | |
513 | |
514 my @ssets = @{$dset->get_supporting_sets}; | |
515 | |
516 if(! @ssets){ | |
517 throw('You must provide a DataSet with associated supporting sets'); | |
518 } | |
519 | |
520 | |
521 | |
522 my %reg_strings = | |
523 ( | |
524 "regbuild.${cell_type}.feature_set_ids" => join(',', map { | |
525 $_->dbID} sort {$a->name cmp $b->name | |
526 } @ssets), | |
527 | |
528 "regbuild.${cell_type}.feature_type_ids" => join(',', map { | |
529 $_->feature_type->dbID} sort {$a->name cmp $b->name | |
530 } @ssets), | |
531 ); | |
532 | |
533 my @ffset_ids; | |
534 | |
535 #Skip this now as we use the ftype classes for defining the focus sets | |
536 #foreach my $fset(@ssets){ | |
537 | |
538 | |
539 # #This might fail if soem TFs haven't been included as focus i.e. are part of PolII/III | |
540 # #e.g. TFIIIC-110 | |
541 ## | |
542 # if( ($fset->feature_type->class eq 'Transcription Factor') || | |
543 # ($fset->feature_type->class eq 'Open Chromatin') ){ | |
544 # push @ffset_ids, $fset->dbID; | |
545 # } | |
546 # } | |
547 | |
548 | |
549 my ($sql, %db_reg_string); | |
550 | |
551 foreach my $string_key(keys %reg_strings){ | |
552 my ($string)= $self->db->dbc->db_handle->selectrow_array("select string from regbuild_string where name='${string_key}' and species_id=$species_id"); | |
553 | |
554 if (! defined $string) { | |
555 $sql = "insert into regbuild_string (species_id, name, string) values ($species_id, '${string_key}', '$reg_strings{${string_key}}');"; | |
556 | |
557 $self->report("WARNING:\tInserting absent $string_key into regbuild_string table"); | |
558 eval { $self->db->dbc->do($sql) }; | |
559 die("Couldn't store $string_key in regbuild_string table\n$sql\n$@") if $@; | |
560 } | |
561 elsif ($string ne $reg_strings{$string_key}){ | |
562 $sql = "update regbuild_string set string='".$reg_strings{$string_key}."' where name='${string_key}';"; | |
563 | |
564 if($self->fix){ | |
565 $self->report("WARNING:\tUpdating mismatched $string_key found in regbuild_string table:\t${string}");#\tUpdate using:\t$sql"); | |
566 eval { $self->db->dbc->do($sql) }; | |
567 die("Couldn't update $string_key in regbuild_string table\n$sql\n$@") if $@; | |
568 | |
569 } | |
570 else{ | |
571 $self->report("FAIL:\tMismatched $string_key found in regbuild_string table:\t${string}\n\tUpdate using:\t$sql"); | |
572 } | |
573 } | |
574 | |
575 $db_reg_string{$string_key} = $string; | |
576 } | |
577 | |
578 | |
579 #Now need to tidy this block wrt new code added above | |
580 my $fset_string_key = "regbuild.${cell_type}.feature_set_ids"; | |
581 my $ftype_string_key = "regbuild.${cell_type}.feature_type_ids"; | |
582 my $fset_string = $db_reg_string{$fset_string_key}; | |
583 my $ftype_string = $db_reg_string{$ftype_string_key}; | |
584 | |
585 if(! ($fset_string && $ftype_string)){ | |
586 $self->report("FAIL:\tSkipping fset vs ftype string test for $cell_type") | |
587 } | |
588 else{ | |
589 | |
590 #This is now effectively handled by the loop above | |
591 | |
592 $self->log("Validating :\t$fset_string_key vs $ftype_string_key"); | |
593 | |
594 my @fset_ids = split/,/, $fset_string; | |
595 my @ftype_ids = split/,/, $ftype_string; | |
596 my @new_ftype_ids; | |
597 my $ftype_fail = 0; | |
598 | |
599 #Now need to work backwards through ftypes to remove pseudo ftypes before validating | |
600 #New string should be A,A,A;S,S,S,S,S,S;P,P,P | |
601 #Where A is and Anchor/Seed set | |
602 #S is a supporting set | |
603 #P is a pseudo feature type e.g. TSS proximal | |
604 | |
605 | |
606 if(scalar(@fset_ids) != scalar(@ftype_ids)){ | |
607 $self->report("FAIL:\tLength mismatch between:\n\t$fset_string_key(".scalar(@fset_ids).")\t$fset_string\n\tAND\n\t$ftype_string_key(".scalar(@ftype_ids).")\t$ftype_string"); | |
608 } | |
609 | |
610 foreach my $i(0..$#fset_ids){ | |
611 my $supporting_set_id = $fset_ids[$i]; | |
612 my $sset = $fset_a->fetch_by_dbID($supporting_set_id); | |
613 | |
614 if(! defined $sset){ | |
615 $self->report("FAIL:\t$fset_string_key $supporting_set_id does not exist in the DB"); | |
616 } | |
617 else{ | |
618 #test/build ftype string | |
619 | |
620 if(defined $ftype_string){ | |
621 | |
622 if($sset->feature_type->dbID != $ftype_ids[$i]){ | |
623 $ftype_fail = 1; | |
624 $self->report("FAIL:\t$fset_string_key $supporting_set_id(".$sset->name.") FeatureType(".$sset->feature_type->name.") does not match $ftype_string_key $ftype_ids[$i]"); | |
625 } | |
626 } | |
627 | |
628 push @new_ftype_ids, $sset->feature_type->dbID; | |
629 | |
630 } | |
631 } | |
632 | |
633 | |
634 #Set ftype_string | |
635 #This will not account for pseudo ftypes? Remove!!!? | |
636 my $new_ftype_string = join(',', @new_ftype_ids); | |
637 | |
638 if(! defined $ftype_string){ | |
639 $self->log("Updating $ftype_string_key to:\t$new_ftype_string"); | |
640 $self->db->dbc->db_handle->do("INSERT into regbuild_string(species_id, name, string) values($species_id, '$ftype_string_key', '$new_ftype_string')"); | |
641 } | |
642 elsif($ftype_fail){ | |
643 $self->report("FAIL:\t$ftype_string_key($ftype_string) does not match $fset_string_key types($new_ftype_string)"); | |
644 } | |
645 | |
646 | |
647 #Finally validate versus a reg feat | |
648 #Need to change this to ftype string rather than fset string? | |
649 my $id_row_ref = $self->db->dbc->db_handle->selectrow_arrayref('select regulatory_feature_id from regulatory_feature where feature_set_id='.$fset->dbID.' limit 1'); | |
650 | |
651 if(! defined $id_row_ref){ | |
652 $self->report("FAIL:\tNo RegulatoryFeatures found for FeatureSet ".$fset->name); | |
653 } | |
654 else{ | |
655 my ($regf_dbID) = @$id_row_ref; | |
656 my $rf_string = $regf_a->fetch_by_dbID($regf_dbID)->binary_string; | |
657 | |
658 if(length($rf_string) != scalar(@fset_ids)){ | |
659 $self->report("FAIL:\tRegulatory string length mismatch between RegulatoryFeature($regf_dbID) and $fset_string_key:\n$rf_string(".length($rf_string).")\n$fset_string(".scalar(@fset_ids).")"); | |
660 } | |
661 } | |
662 } | |
663 } | |
664 } | |
665 | |
666 return; | |
667 } | |
668 | |
669 | |
670 #Change this to log sets and incorporate RegFeat FeatureSet as standard | |
671 #Grab all reg fsets | |
672 #grab all displayable data sets which aren't reg sets? | |
673 | |
674 sub log_data_sets{ | |
675 my $self = shift; | |
676 | |
677 my $dset_adaptor = $self->db->get_DataSetAdaptor; | |
678 my ($status); | |
679 my $txt = 'Checking '; | |
680 $status = 'DISPLAYABLE' if($self->{'check_displayable'}); | |
681 $txt.= $status.' ' if $status; | |
682 $txt .= 'DataSets'; | |
683 $self->log_header($txt); | |
684 | |
685 #Check for status first to avoid warning from BaseAdaptor. | |
686 eval { $dset_adaptor->_get_status_name_id($status) }; | |
687 | |
688 if($@){ | |
689 $self->report("FAIL: You have specified check_displayable, but the DISPLAYABLE status_name is not present in the DB"); | |
690 return; | |
691 } | |
692 | |
693 | |
694 my @dsets; | |
695 my $dsets = $dset_adaptor->fetch_all($status); | |
696 @dsets = @$dsets if defined $dsets; | |
697 | |
698 | |
699 $self->log('Found '.scalar(@dsets).' DataSets'); | |
700 | |
701 | |
702 foreach my $dset(@dsets){ | |
703 $self->log_set("DataSet:\t\t", $dset) ; | |
704 | |
705 my $fset = $dset->product_FeatureSet; | |
706 $self->log_set("Product FeatureSet:\t", $fset) if $fset; | |
707 | |
708 my @supporting_sets = @{$dset->get_supporting_sets}; | |
709 | |
710 $self->log('Found '.scalar(@supporting_sets).' supporting sets:'); | |
711 | |
712 if(my @supporting_sets = @{$dset->get_supporting_sets}){ | |
713 #type here could be result, experimental or feature | |
714 #and feature could be annotated or experimental | |
715 #Move this to log set? | |
716 | |
717 map { my $type = ref($_); | |
718 $type =~ s/.*://; | |
719 $type .= '('.$_->feature_class.')' if($type eq 'FeatureSet'); | |
720 #Need to sprintf $type here to fixed width | |
721 $self->log_set($type.":\t", $_)} @supporting_sets; | |
722 } | |
723 $self->log(); | |
724 } | |
725 | |
726 return; | |
727 } | |
728 | |
729 sub log_set{ | |
730 my ($self, $text, $set) = @_; | |
731 | |
732 $text .= $set->display_label.'('.$set->name.')'; | |
733 $text .= "\tDISPLAYABLE" if($set->is_displayable); | |
734 $self->log($text); | |
735 | |
736 return; | |
737 } | |
738 | |
739 | |
740 sub check_stable_ids{ | |
741 my ($self, @slices) = @_; | |
742 | |
743 my $species_id = $self->db()->species_id(); | |
744 | |
745 $self->log_header('Checking stable IDs'); | |
746 | |
747 my $fset_a = $self->db->get_FeatureSetAdaptor; | |
748 | |
749 my @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')}; | |
750 | |
751 if(!@regf_fsets){ | |
752 $self->report('WARNING: No regulatory FeatureSets found (fine if '.$self->db->species.' does not have a regulatory build)'); | |
753 } | |
754 else{ | |
755 | |
756 foreach my $fset(@regf_fsets){ | |
757 | |
758 if($fset->name =~ /_v[0-9]$/){ | |
759 $self->log("Skipping stable_id test on archived set:\t".$fset->name); | |
760 next; | |
761 } | |
762 | |
763 #Can't count NULL field, so have to count regulatory_feature_id!!! | |
764 | |
765 #getting SR product here!! | |
766 my $sql = "select count(rf.regulatory_feature_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.stable_id is NULL and rf.seq_region_id = sr.seq_region_id and sr.coord_system_id = cs.coord_system_id and cs.species_id = $species_id and rf.feature_set_id=".$fset->dbID; | |
767 | |
768 | |
769 | |
770 my ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)}; | |
771 | |
772 if($null_sids){ | |
773 $self->report("FAIL: Found a total of $null_sids NULL stable IDs for ".$fset->name); | |
774 | |
775 my $slice_a = $self->db->get_SliceAdaptor; | |
776 | |
777 if(! @slices){ | |
778 @slices = @{$slice_a->fetch_all('toplevel', 1)}; | |
779 } | |
780 | |
781 foreach my $slice(@slices){ | |
782 my $sr_name=$slice->seq_region_name; | |
783 $sql = 'select count(rf.stable_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.seq_region_id=sr.seq_region_id and sr.name="'.$sr_name.'" and sr.coord_system_id = cs.coord_system_id and cs.species_id = '.$species_id.' and rf.stable_id is NULL and rf.feature_set_id='.$fset->dbID; | |
784 ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)}; | |
785 | |
786 #This is not reporting properly. | |
787 | |
788 $self->log($fset->name.":\t$null_sids NULL stable IDs on ".$slice->name) if $null_sids; | |
789 } | |
790 } | |
791 else{ | |
792 $self->log($fset->name.":\tNo NULL stable IDs found"); | |
793 } | |
794 } | |
795 } | |
796 | |
797 return; | |
798 | |
799 } | |
800 | |
801 | |
802 #This is for mart to enable them to join to the seq_region table without | |
803 #creating a product from all the reundant seq_region entries for each schema_build | |
804 | |
805 sub set_current_coord_system{ | |
806 my ($self) = @_; | |
807 | |
808 | |
809 | |
810 my $schema_build = $self->db->_get_schema_build($self->db->dnadb); | |
811 $self->log_header("Setting current coord_system on $schema_build"); | |
812 | |
813 my $sql = "update coord_system set is_current=False where schema_build !='$schema_build'"; | |
814 $self->db->dbc->do($sql); | |
815 $sql = 'update coord_system set is_current=True where schema_build ="'.$schema_build.'" and attrib like "%default_version%"'; | |
816 $self->db->dbc->do($sql); | |
817 | |
818 return; | |
819 } | |
820 | |
821 sub validate_DataSets{ | |
822 my $self = shift; | |
823 | |
824 $self->log_header('Validating DataSets'); | |
825 | |
826 | |
827 #checks regualtory feature data and supporting sets | |
828 #links between DataSet and FeatureSet, i.e. correct naming, not linking to old set | |
829 #Displayable, DAS_DISPLAYABLE, IMPORTED_ASSM, MART_DISPLAYABLE | |
830 #Naming of result_set should match data/feature_set | |
831 #warn non-attr feature/result sets which are DISPLAYABL | |
832 #warn about DISPLAYABLE sets which do not have displayable set in analysis_description.web_data | |
833 | |
834 | |
835 my $fset_a = $self->db->get_FeatureSetAdaptor; | |
836 my $dset_a = $self->db->get_DataSetAdaptor; | |
837 my ($dset_states, $rset_states, $fset_states) = $self->get_regbuild_set_states($self->db); | |
838 | |
839 my %rf_fsets; | |
840 my %set_states; | |
841 my $sql; | |
842 | |
843 RF_FSET: foreach my $rf_fset(@{$fset_a->fetch_all_by_type('regulatory')}){ | |
844 my $rf_fset_name = $rf_fset->name; | |
845 | |
846 | |
847 $self->log("Validating $rf_fset_name"); | |
848 | |
849 | |
850 $rf_fsets{$rf_fset_name} = $rf_fset;#Do we only need the name for checking the dsets independantly? | |
851 | |
852 if($rf_fset_name =~ /_v[0-9]+$/){ | |
853 $self->report("FAIL:\tFound archived regulatory FeatureSet:\t$rf_fset_name"); | |
854 next RF_FSET; | |
855 } | |
856 | |
857 foreach my $state(@$fset_states){ | |
858 | |
859 if(! $rf_fset->has_status($state)){ | |
860 $self->report("WARNING:\tUpdating FeatureSet $rf_fset_name with status $state"); | |
861 | |
862 $sql = 'INSERT into status select '. $rf_fset->dbID. | |
863 ", 'feature_set', status_name_id from status_name where name='$state'"; | |
864 | |
865 $self->db->dbc->db_handle->do($sql); | |
866 } | |
867 } | |
868 | |
869 #Do we need to warn about other states? | |
870 | |
871 | |
872 | |
873 my $rf_dset = $dset_a->fetch_by_product_FeatureSet($rf_fset); | |
874 | |
875 if(! $rf_dset){ | |
876 $self->report("FAIL:\tNo DataSet for FeatureSet:\t$rf_fset_name"); | |
877 next RF_FSET; | |
878 } | |
879 | |
880 | |
881 | |
882 if($rf_fset_name ne $rf_dset->name){ | |
883 $self->report("FAIL:\tFound Feature/DataSet name mismatch:\t$rf_fset_name vs ".$rf_dset->name); | |
884 next RF_FSET; | |
885 } | |
886 | |
887 | |
888 foreach my $state(@$dset_states){ | |
889 | |
890 if(! $rf_dset->has_status($state)){ | |
891 #Can we just update and warn here? | |
892 #Or do this separately in case we want some control over this? | |
893 $self->report("WARNING:\tUpdating DataSet $rf_fset_name with status $state"); | |
894 | |
895 $sql = 'INSERT into status select '.$rf_dset->dbID. | |
896 ", 'data_set', status_name_id from status_name where name='$state'"; | |
897 $self->db->dbc->db_handle->do($sql); | |
898 } | |
899 } | |
900 | |
901 | |
902 | |
903 #We should check fset ctype matches all attr_set ctypes? | |
904 #May have problems if we want to merge two lines into the same ctype build | |
905 | |
906 | |
907 | |
908 foreach my $ra_fset(@{$rf_dset->get_supporting_sets}){ | |
909 | |
910 foreach my $state(@$fset_states){ | |
911 | |
912 if(! $ra_fset->has_status($state)){ | |
913 #Can we just update and warn here? | |
914 #Or do this separately in case we want some control over this? | |
915 $self->report("WARNING:\tUpdating FeatureSet ".$ra_fset->name." with status $state"); | |
916 | |
917 $sql = 'INSERT into status select '.$ra_fset->dbID. | |
918 ", 'feature_set', status_name_id from status_name where name='$state'"; | |
919 $self->db->dbc->db_handle->do($sql); | |
920 } | |
921 } | |
922 | |
923 | |
924 | |
925 my $ra_dset = $dset_a->fetch_by_product_FeatureSet($ra_fset); | |
926 my @ssets = @{$ra_dset->get_supporting_sets(undef, 'result')}; | |
927 my @displayable_sets; | |
928 | |
929 foreach my $sset(@ssets){ | |
930 | |
931 if($sset->has_status('DISPLAYABLE')){ | |
932 push @displayable_sets, $sset; | |
933 } | |
934 } | |
935 #Change this to get all then check status | |
936 #else print update sql | |
937 | |
938 if(scalar(@displayable_sets) > 1){#There should only be one | |
939 $self->report("FAIL:\tThere should only be one DISPLAYABLE supporting ResultSet for DataSet:\t".$ra_dset->name); | |
940 } | |
941 elsif(scalar(@displayable_sets) == 0){ | |
942 | |
943 my $msg; | |
944 | |
945 if(scalar(@ssets) == 1){ | |
946 | |
947 #fix here? | |
948 | |
949 $msg = "Found unique non-DISPLAYABLE ResultSet:\t".$ssets[0]->name. | |
950 "\n\tinsert into status select ".$ssets[0]->dbID. | |
951 ", 'result_set', status_name_id from status_name where name='DISPLAYABLE';"; | |
952 } | |
953 else{ | |
954 $msg = "Found ".scalar(@ssets)." ResultSets ".join("\t", map($_->name, @ssets)); | |
955 } | |
956 | |
957 $self->report("FAIL:\tThere are no DISPLAYABLE supporting ResultSets for DataSet:\t". | |
958 $ra_dset->name."\n$msg"); | |
959 | |
960 | |
961 | |
962 next; #$ra_fset | |
963 } | |
964 | |
965 my $ra_rset = $ssets[0]; | |
966 | |
967 foreach my $state(@$rset_states){ | |
968 | |
969 if(! $ra_rset->has_status($state)){ | |
970 #Can we just update and warn here? | |
971 #Or do this separately in case we want some control over this? | |
972 $self->report("WARNING:\tUpdating ResultSet ".$ra_rset->name." with status $state"); | |
973 | |
974 $sql = 'INSERT into status select '.$ra_rset->dbID. | |
975 ", 'result_set', status_name_id from status_name where name='$state'"; | |
976 $self->db->dbc->db_handle->do($sql); | |
977 } | |
978 } | |
979 } | |
980 } | |
981 | |
982 return; | |
983 } # End of validate_DataSets | |
984 | |
985 | |
986 | |
987 | |
988 | |
989 | |
990 sub analyse_and_optimise_tables{ | |
991 my $self = shift; | |
992 | |
993 #myisamchk --analyze. or analyze statement | |
994 | |
995 if($self->{'skip_analyse'}){ | |
996 $self->log_header('Skipping analyse/optimise tables'); | |
997 return; | |
998 } | |
999 | |
1000 $self->log_header("Analysing and optimising tables"); | |
1001 | |
1002 | |
1003 my $sql = 'show tables;'; | |
1004 my @tables = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; | |
1005 map $_ = "@{$_}", @tables; | |
1006 my $analyse_sql = 'analyze table '; | |
1007 my $optimise_sql = 'optimize table '; | |
1008 | |
1009 | |
1010 | |
1011 foreach my $table(@tables){ | |
1012 $self->log("Analysing and optimising $table"); | |
1013 | |
1014 | |
1015 | |
1016 #Remove analyse as optimise does everything this does | |
1017 my @anal_info = @{$self->db->dbc->db_handle->selectall_arrayref($analyse_sql.$table)}; | |
1018 | |
1019 foreach my $line_ref(@anal_info){ | |
1020 my $status = $line_ref->[3]; | |
1021 $self->report("FAIL: analyse $table status $status") if (!($status eq 'OK' || $status eq 'Table is already up to date')); | |
1022 } | |
1023 | |
1024 my @opt_info = @{$self->db->dbc->db_handle->selectall_arrayref($optimise_sql.$table)}; | |
1025 | |
1026 foreach my $line_ref(@opt_info){ | |
1027 | |
1028 my $status = $line_ref->[3]; | |
1029 $self->report("FAIL: optimise $table status $status") if (!( $status eq 'OK' || $status eq 'Table is already up to date')); | |
1030 } | |
1031 | |
1032 } | |
1033 | |
1034 return; | |
1035 }# end of analyse_and_optimise_tables | |
1036 | |
1037 | |
1038 sub clean_xrefs{ | |
1039 my ($self) = @_; | |
1040 | |
1041 if($self->{'skip_xrefs'}){ | |
1042 $self->log_header('Skipping clean_xrefs'); | |
1043 return; | |
1044 } | |
1045 | |
1046 $self->log_header("Cleaning unlinked xref records"); | |
1047 | |
1048 my $sql = 'DELETE x FROM xref x LEFT JOIN object_xref ox ON ox.xref_id = x.xref_id WHERE ox.xref_id IS NULL'; | |
1049 #Should this also take accoumt of unmapped_objects? | |
1050 #No, as unmapped_object doesn't use xref, but probably should | |
1051 | |
1052 my $row_cnt = $self->db->dbc->do($sql); | |
1053 | |
1054 $self->reset_table_autoinc('xref', 'xref_id', $self->db); | |
1055 $row_cnt = 0 if $row_cnt eq '0E0'; | |
1056 $self->log("Deleted $row_cnt unlinked xref records"); | |
1057 | |
1058 | |
1059 #Now remove old edbs | |
1060 $self->log_header("Cleaning unlinked external_db records"); | |
1061 | |
1062 #Need to account for xref and unmapped_object here | |
1063 $sql = 'DELETE edb FROM external_db edb '. | |
1064 'LEFT JOIN xref x ON x.external_db_id = edb.external_db_id '. | |
1065 'LEFT JOIN unmapped_object uo ON uo.external_db_id=edb.external_db_id '. | |
1066 'WHERE x.external_db_id IS NULL and uo.external_db_id is NULL'; | |
1067 $row_cnt = $self->db->dbc->do($sql); | |
1068 | |
1069 $self->reset_table_autoinc('external_db', 'external_db_id', $self->db); | |
1070 $row_cnt = 0 if $row_cnt eq '0E0'; | |
1071 $self->log("Deleted $row_cnt unlinked external_db records"); | |
1072 | |
1073 | |
1074 #Shouldn't clean orphaned oxs here as this means a rollback been done underneath the ox data | |
1075 #or we have xref_id=0! | |
1076 #Leave this to HC? | |
1077 | |
1078 | |
1079 | |
1080 | |
1081 | |
1082 return; | |
1083 } | |
1084 | |
1085 | |
1086 1; |