Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/HealthChecker.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/HealthChecker.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1086 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <ensembl-dev@ebi.ac.uk>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Utils::HealthChecker + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +B<This program> provides several methods to health check and update tables prior to +release. Using the updte_DB_for_release method runs the following: + + validate_new_seq_regions - _pre_stores seq_region & coord_system info from new core DB + check_regbuild_strings - Validates or inserts regbuild_string entries + check_meta_species_version - Validates meta species and version wrt dbname + set_current_coord_system - Updates coord_system.is_current to 1 for current schema_build (required for mart) + update_meta_coord - Regenerates meta_coord.max_length values (required for Slice range queries) + clean_xrefs - Removes old unused xref and external_db records + validate_DataSets - Performs various checks on Data/Feature/ResultSets links and states + check_stable_ids - Check for any NULL stable IDs + log_data_sets - Logs all DISPLAYABLE DataSets + analyse_and_optimise_tables - Does what is says + + +=cut + +################################################################################ + +package Bio::EnsEMBL::Funcgen::Utils::HealthChecker; + +use strict; +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::Funcgen::ProbeFeature; +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Analysis; +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper); + + + +#TO DO +# 1 DONE Print all fails and warnings in summary at end of script. +# 2 validate_RegulatoryFeature_Sets +# 3 Some of these can be migrated or mirrored in java HCs for safety + + +################################################################################ + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + #validate and set type, analysis and feature_set here + my ($db, $builds, $skip_mc, $check_displayable, $skip_analyse, $meta_coord_tables, $skip_xrefs, $fix) = + rearrange(['DB', 'BUILDS', 'SKIP_META_COORD', 'CHECK_DISPLAYABLE', 'SKIP_ANALYSE', 'META_COORD_TABLES', 'SKIP_XREF_CLEANUP', 'FIX'], @_); + + + if (! ($db && ref($db) && + $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + throw('You must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + } + + #test connection + $db->dbc->db_handle; + + $self->{'db'} = $db; + $self->{'mysql_connect_string'} = 'mysql -h'.$db->dbc->host.' -u'.$db->dbc->username.' -p' + .$db->dbc->password.' '.$db->dbc->dbname.' -P'.$db->dbc->port; + $self->{'dbname'} = $db->dbc->dbname; + $self->{'builds'} = (scalar(@$builds)>0) ? $builds : []; + $self->{'skip_meta_coord'} = $skip_mc; + $self->{'skip_xrefs'} = $skip_xrefs; + $self->{'skip_analyse'} = $skip_analyse; + $self->{'check_displayable'} = $check_displayable; + $self->{fix} = $fix; + + if(defined $meta_coord_tables){ + + throw('-skip_meta_coord is set, Cannot build meta_coord entries for tables '.join(', ', @$meta_coord_tables)); + + if(! ref($meta_coord_tables) eq 'ARRAY'){ + throw('-meta_coord_tables parameter must be an array ref'); + } + + @{$self->{'meta_coord_tables'}} = @$meta_coord_tables; + } + + return $self; +} + +sub db{ return $_[0]->{db}; } + +sub fix{ return $_[0]->{fix}; } + +=head2 update_db_for_release + + Arg[0] : + Example : + Description: Wrapper method to perform all common update functions + Returntype : + Exceptions : None + Caller : General + Status : at risk + +=cut + +sub update_db_for_release{ + my ($self, @args) = @_; + + if(@args){ + } + + #do seq_region_update to validate dnadb first + #hence avoiding redoing longer methods + $self->validate_new_seq_regions;#($force_srs); + #$self->update_meta_schema_version; + $self->check_regbuild_strings; + $self->check_meta_species_version; + $self->set_current_coord_system; + $self->update_meta_coord; + $self->clean_xrefs; + $self->validate_DataSets; + $self->check_stable_ids; + $self->log_data_sets(); + $self->analyse_and_optimise_tables;#ALWAYS LAST!! + + $self->log_header('??? Have you dumped/copied GFF dumps ???'); + + #Log footer? Pass optional counts hash? + $self->log('Finished updating '.$self->{'dbname'}." for release\n\n"); +} + +sub validate_new_seq_regions{ + my ($self, $force) = @_; + + + #We need to add some functionality to handle non-standard schema_build progression here + #This should be used before any data is loaded + #It should also warn if there any duplicates if it is not run before coord_systems are duplicated + #Should this just be handled in the BaseFeatureAdaptor/CoordSystemAdaptor? + + + + + #do we need to add the none default levels here? + #or are we only bothered about those which constitute the toplevel? + + #To make sure we have all the correct levels in eFG we need to get all the names. + #then get all by name from the core db and set them as the dnadb. + # we also need to get all the toplevel seq_regions and store them in the seq_region table + #use BaseFeatureAdaptor::_pre_store with and array of pseudo feature on each top level slice + + #Validate the efgdb and dnadb schema version are the same first + #This is because we expect the schem_build to be the same for a release + #However, this will autoset the dnadb if no defined, so will always match! + + if(! $force){ + my $efgdb_sm = join('_', @{$self->get_schema_and_build($self->{'dbname'})}); + my $dnadb_sm = join('_', @{$self->get_schema_and_build($self->db->dnadb->dbc->dbname)}); + + if($efgdb_sm ne $dnadb_sm){ + $self->report("WARNING Skipped validate_new_seq_regions as schema_versions are mismatched:\t". + "efgdb $efgdb_sm\tdnadb $dnadb_sm"); + return 0; + } + } + + my $pf_adaptor = $self->db->get_ProbeFeatureAdaptor(); + my $slice_adaptor = $self->db->dnadb->get_SliceAdaptor(); + my $dnadb_csa = $self->db->dnadb->get_CoordSystemAdaptor; + + $self->log_header('Validating new coord_systems/seq_regions'); + + my @slices; + my %versioned_levels; + my $default_version; + + #Grab unversioned top level slices and versioned levels + #May miss some old versioned level if the new assembly no longer has them + foreach my $slice(@{$slice_adaptor->fetch_all('toplevel', undef, 1)}){ + + if (! $slice->coord_system->version){ + push @slices, $slice; + } + else{ + + if($default_version && + ($default_version ne $slice->coord_system->version)){ + throw("Found more than one default CoordSystem version:\t${default_version}\t".$slice->coord_system->version); + } + else{ + $default_version = $slice->coord_system->version; + } + } + } + + + #Get all versioned levels for all builds + foreach my $cs(@{$dnadb_csa->fetch_all}){ + + if($cs->version){ + $versioned_levels{$cs->version} ||= []; + push @{$versioned_levels{$cs->version}}, $cs->name; + } + } + + push @{$self->{'builds'}}, $default_version if scalar(@{$self->{'builds'}}) == 0; + + + + #Grab slices for each versioned level + foreach my $build(@{$self->{'builds'}}){ + + if(! exists $versioned_levels{$build}){ + throw("CoordSystem version $build does not exist in the dnadb ".$self->db->dnadb->dbc->dbname); + } + + foreach my $level(@{$versioned_levels{$build}}){ + $self->log("Getting slices for $level $build"); + push @slices, @{$slice_adaptor->fetch_all($level, $build)}; + } + } + + $self->log("Importing seq_region/coord_system info for builds:\t".join(',', @{$self->{'builds'}})); + + foreach my $slice(@slices){ + + if($slice->start() != 1){ + $self->log("Reslicing slice:\t".$slice->name()); + #we must have some sort of PAR linked region i.e. Y + $slice = $slice_adaptor->fetch_by_region($slice->coord_system_name(), $slice->seq_region_name()); + } + + + #we need test if it needs doing first? + #we would need to test for the coord_systems outside of this loop + #and then for each seq_region inside the loop if the coord_system is present + + $self->log("_pre_storing seq_region info for slice:\t".$slice->name()); + + my $pseudo_feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new + ( + -slice => $slice, + -start => 0, + -end => 0, + -strand => 0, + ); + + $pf_adaptor->_pre_store($pseudo_feature); + #This will create a meta_coord entry of max_length 1 for features which have an absent meta_coord entry + + } + + + $self->log("Finished validating seq_regions\n"); + + return; +} + + + + +sub update_meta_coord{ + my ($self, @table_names) = @_; + + my $species_id = $self->db->species_id; + + if($self->{'skip_meta_coord'}){ + $self->log("Skipping meta_coord update\n"); + return; + } + + + $self->log_header('Updating meta_coord table'); + + + #set default table_name + if(! @table_names || scalar(@table_names) == 0){ + + #Can we do this via DBAdaptor and get all available adaptors which are BaseFeatureAdaptors then grab the first table name + + if(defined $self->{'meta_coord_tables'}){ + @table_names = @{$self->{'meta_coord_tables'}}; + } + else{#default + + @table_names = qw( + regulatory_feature + probe_feature + external_feature + annotated_feature + result_feature + segmentation_feature + ); + } + } + + #backup meta coord + if(system($self->{'mysql_connect_string'}." -e 'SELECT * FROM meta_coord'" + . '> '.$self->{'dbname'}.'meta_coord.backup' + ) != 0 ){ + + throw("Can't dump the original meta_coord for back up");#will this get copied to log? + } + else { + $self->log('Original meta_coord table backed up in '. $self->{'dbname'}.'.meta_coord.backup'); + } + + + #Update each max_length for table_name and coord_system + + my $sql; + + foreach my $table_name(@table_names){ + + $sql = "select distinct(cs.name), mc.coord_system_id, cs.version, mc.max_length from coord_system cs, meta_coord mc where mc.table_name='$table_name' and mc.coord_system_id=cs.coord_system_id and cs.species_id = $species_id"; + + $self->log(''); + $self->log("Updating meta_coord max_length for $table_name:"); + $self->log("name\tcoord_system_id\tversion\tmax_length"); + + #can we test for emtpy array here? Then skip delete. + + my @info = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + + #log this + map {$self->log(join("\t", @{$_}))} @info; + + # Clean old entries + $self->log("Deleting old meta_coord entries"); + $sql = "DELETE mc FROM meta_coord mc, coord_system cs WHERE mc.table_name ='$table_name' and mc.coord_system_id = cs.coord_system_id and cs.species_id = $species_id"; + $self->db->dbc->db_handle->do($sql); + + # Generate new max_lengths + $self->log("Generating new max_lengths"); + + #Is this query running for each redundant cs_id? + #would it be more efficient to retrieve the NR cs_ids first and loop the query for each cs_id? + + #Can we get the dbID of the largest feature for ease of checking? + #This won't work as we're grouping by coord_system + #would need to select distinct coord_system_id for table first + #This may well slow down quite a bit doing it this way + + $sql = "select distinct s.coord_system_id from coord_system cs, seq_region s, $table_name t WHERE t.seq_region_id = s.seq_region_id and s.coord_system_id = cs.coord_system_id and cs.species_id = $species_id"; + my @cs_ids = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + #Convert single element arrayrefs to scalars + map $_ = ${$_}[0], @cs_ids; + + $self->log("New max_lengths for $table_name are:"); + + $self->log(join("\t", ('coord_system_id', 'max_length', 'longest record dbID'))); + + foreach my $cs_id(@cs_ids){ + #This will always give a length of 1 even if there are no features present + + my @cs_lengths; + + #The probe_feature table is now too big to do this in one go + #We need to break this down into sr_ids + + $sql = "SELECT distinct t.seq_region_id from $table_name t, seq_region sr where t.seq_region_id=sr.seq_region_id and sr.coord_system_id=$cs_id"; + + my @sr_ids = @{$self->db->dbc->db_handle->selectcol_arrayref($sql)}; + + + #Get longest feature for all seq_regions + foreach my $sr_id(@sr_ids){ + $sql = "SELECT (t.seq_region_end - t.seq_region_start + 1 ) as max, t.${table_name}_id " + . "FROM $table_name t " + . "WHERE t.seq_region_id = $sr_id "; + $sql .= ' and t.window_size=0' if $table_name eq 'result_feature'; + $sql .= " order by max desc limit 1"; + + + #Problem here is that DBs without 0 wsize result_feture entries will not get a meta_coord entry + #We need to implement this in the _pre_store method too? + + + my ($cs_length, $table_id); + ($cs_length, $table_id) = $self->db->dbc->db_handle->selectrow_array($sql); + push @cs_lengths, [$cs_length, $table_id] if $cs_length; + } + + + if(@cs_lengths){ + #This will now contain a list of arrays refs contain the max length and feature id for + #each seq_region in this coord_system + #Now sort to get the longest + #Can't sort on 2 day array in the normal way + #One list list lists, the comparatee is no longer a list but a reference + @cs_lengths = sort { $b->[0] <=> $a->[0] } @cs_lengths; + $self->log(join("\t\t", ($cs_id, @{$cs_lengths[0]}))); + $sql = "INSERT INTO meta_coord values('${table_name}', $cs_id, ".$cs_lengths[0][0].')'; + $self->db->dbc->db_handle->do($sql); + } + } + } + + $self->log("Finished updating meta_coord max_lengths\n"); + + return; +} + + +#change this to check_meta_species_version + +sub check_meta_species_version{ + my ($self) = @_; + + $self->log_header('Checking meta species.production_name and schema_version against dbname'); + + my $dbname = $self->db->dbc->dbname; + (my $dbname_species = $dbname) =~ s/_funcgen_.*//; + my $mc = $self->db->get_MetaContainer; + my $schema_version = $mc->list_value_by_key('schema_version')->[0]; + + if(! defined $schema_version){ + $self->report("FAIL:\tNo meta schema_version defined"); + } + elsif($dbname !~ /funcgen_${schema_version}_/){ + $self->report("FAIL:\tMeta schema_version ($schema_version) does not match the dbname ($dbname)."); + } + + + my @latin_names = @{$mc->list_value_by_key('species.production_name')}; + + if(scalar(@latin_names) > 1){ + $self->report("FAIL:\tFound more than one species.production_name in meta:\t".join(", ", @latin_names)); + } + elsif(scalar(@latin_names) == 1 && ($latin_names[0] ne $dbname_species)){ + $self->report("FAIL:\tFound mismatch between meta species.production_name and dbname:\t".$latin_names[0]." vs $dbname_species"); + } + elsif(scalar(@latin_names) == 0){ + $self->report("WARNING:\tFound no meta species.production_name setting as:\t$dbname_species"); + $self->db->dbc->db_handle->do("INSERT into meta(species_id, meta_key, meta_value) values(1, 'species.production_name', '$dbname_species')"); + } + #else is okay + + return; +} + + + +#Move to Java HC? Or update if update flag specified +#Using same code used by build_reg_feats! + +sub check_regbuild_strings{ + my ($self) = @_; + + #Removed $update arg as we would always want to do this manually + + $self->log_header('Checking regbuild strings'); + my $species_id = $self->db()->species_id(); + + + my @regf_fsets; + my $passed = 1; + my $fset_a = $self->db->get_FeatureSetAdaptor; + #my $mc = $self->db->get_MetaContainer; + my $regf_a = $self->db->get_RegulatoryFeatureAdaptor; + #We now want to chek all build + @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')}; + + + if(scalar(@regf_fsets) == 0){ + $self->report('WARNING:check_regbuild_strings found no regulatory FeatureSets (fine if '.$self->db->species.' your species does not have a regulatory build'); + } + else{ + + warn "Need to check/update regbuild.version and regbuild.initial_release_date regbuild.last_annotation_update"; + + + #How do we validate this? + #Check all feature_sets exist + #Pull back some features from a test slice and check the number of bits match. + #Check the feature_type string exists and matches else create. + + + foreach my $fset(@regf_fsets){ + $self->log_header("Validating regbuild_string entries for FeatureSets:\t".$fset->name); + + #Fail for old versions as we want to remove these + if( $fset->name =~ /_v[0-9]+$/){ + $self->report("FAIL:\t".$fset->name." is an old RegulatoryFeature set, please remove!"); + next; + } + + my $cell_type = (defined $fset->cell_type) ? $fset->cell_type->name : 'core'; + + #This has been lifted from build_regulatory_features.pl store_regbuild_strings + #Need to move this to a RegulatoryBuilder module + my $dset = $self->db->get_DataSetAdaptor->fetch_by_product_FeatureSet($fset); + + my @ssets = @{$dset->get_supporting_sets}; + + if(! @ssets){ + throw('You must provide a DataSet with associated supporting sets'); + } + + + + my %reg_strings = + ( + "regbuild.${cell_type}.feature_set_ids" => join(',', map { + $_->dbID} sort {$a->name cmp $b->name + } @ssets), + + "regbuild.${cell_type}.feature_type_ids" => join(',', map { + $_->feature_type->dbID} sort {$a->name cmp $b->name + } @ssets), + ); + + my @ffset_ids; + + #Skip this now as we use the ftype classes for defining the focus sets + #foreach my $fset(@ssets){ + + + # #This might fail if soem TFs haven't been included as focus i.e. are part of PolII/III + # #e.g. TFIIIC-110 +## +# if( ($fset->feature_type->class eq 'Transcription Factor') || +# ($fset->feature_type->class eq 'Open Chromatin') ){ +# push @ffset_ids, $fset->dbID; +# } +# } + + + my ($sql, %db_reg_string); + + foreach my $string_key(keys %reg_strings){ + my ($string)= $self->db->dbc->db_handle->selectrow_array("select string from regbuild_string where name='${string_key}' and species_id=$species_id"); + + if (! defined $string) { + $sql = "insert into regbuild_string (species_id, name, string) values ($species_id, '${string_key}', '$reg_strings{${string_key}}');"; + + $self->report("WARNING:\tInserting absent $string_key into regbuild_string table"); + eval { $self->db->dbc->do($sql) }; + die("Couldn't store $string_key in regbuild_string table\n$sql\n$@") if $@; + } + elsif ($string ne $reg_strings{$string_key}){ + $sql = "update regbuild_string set string='".$reg_strings{$string_key}."' where name='${string_key}';"; + + if($self->fix){ + $self->report("WARNING:\tUpdating mismatched $string_key found in regbuild_string table:\t${string}");#\tUpdate using:\t$sql"); + eval { $self->db->dbc->do($sql) }; + die("Couldn't update $string_key in regbuild_string table\n$sql\n$@") if $@; + + } + else{ + $self->report("FAIL:\tMismatched $string_key found in regbuild_string table:\t${string}\n\tUpdate using:\t$sql"); + } + } + + $db_reg_string{$string_key} = $string; + } + + + #Now need to tidy this block wrt new code added above + my $fset_string_key = "regbuild.${cell_type}.feature_set_ids"; + my $ftype_string_key = "regbuild.${cell_type}.feature_type_ids"; + my $fset_string = $db_reg_string{$fset_string_key}; + my $ftype_string = $db_reg_string{$ftype_string_key}; + + if(! ($fset_string && $ftype_string)){ + $self->report("FAIL:\tSkipping fset vs ftype string test for $cell_type") + } + else{ + + #This is now effectively handled by the loop above + + $self->log("Validating :\t$fset_string_key vs $ftype_string_key"); + + my @fset_ids = split/,/, $fset_string; + my @ftype_ids = split/,/, $ftype_string; + my @new_ftype_ids; + my $ftype_fail = 0; + + #Now need to work backwards through ftypes to remove pseudo ftypes before validating + #New string should be A,A,A;S,S,S,S,S,S;P,P,P + #Where A is and Anchor/Seed set + #S is a supporting set + #P is a pseudo feature type e.g. TSS proximal + + + if(scalar(@fset_ids) != scalar(@ftype_ids)){ + $self->report("FAIL:\tLength mismatch between:\n\t$fset_string_key(".scalar(@fset_ids).")\t$fset_string\n\tAND\n\t$ftype_string_key(".scalar(@ftype_ids).")\t$ftype_string"); + } + + foreach my $i(0..$#fset_ids){ + my $supporting_set_id = $fset_ids[$i]; + my $sset = $fset_a->fetch_by_dbID($supporting_set_id); + + if(! defined $sset){ + $self->report("FAIL:\t$fset_string_key $supporting_set_id does not exist in the DB"); + } + else{ + #test/build ftype string + + if(defined $ftype_string){ + + if($sset->feature_type->dbID != $ftype_ids[$i]){ + $ftype_fail = 1; + $self->report("FAIL:\t$fset_string_key $supporting_set_id(".$sset->name.") FeatureType(".$sset->feature_type->name.") does not match $ftype_string_key $ftype_ids[$i]"); + } + } + + push @new_ftype_ids, $sset->feature_type->dbID; + + } + } + + + #Set ftype_string + #This will not account for pseudo ftypes? Remove!!!? + my $new_ftype_string = join(',', @new_ftype_ids); + + if(! defined $ftype_string){ + $self->log("Updating $ftype_string_key to:\t$new_ftype_string"); + $self->db->dbc->db_handle->do("INSERT into regbuild_string(species_id, name, string) values($species_id, '$ftype_string_key', '$new_ftype_string')"); + } + elsif($ftype_fail){ + $self->report("FAIL:\t$ftype_string_key($ftype_string) does not match $fset_string_key types($new_ftype_string)"); + } + + + #Finally validate versus a reg feat + #Need to change this to ftype string rather than fset string? + my $id_row_ref = $self->db->dbc->db_handle->selectrow_arrayref('select regulatory_feature_id from regulatory_feature where feature_set_id='.$fset->dbID.' limit 1'); + + if(! defined $id_row_ref){ + $self->report("FAIL:\tNo RegulatoryFeatures found for FeatureSet ".$fset->name); + } + else{ + my ($regf_dbID) = @$id_row_ref; + my $rf_string = $regf_a->fetch_by_dbID($regf_dbID)->binary_string; + + if(length($rf_string) != scalar(@fset_ids)){ + $self->report("FAIL:\tRegulatory string length mismatch between RegulatoryFeature($regf_dbID) and $fset_string_key:\n$rf_string(".length($rf_string).")\n$fset_string(".scalar(@fset_ids).")"); + } + } + } + } + } + + return; +} + + +#Change this to log sets and incorporate RegFeat FeatureSet as standard +#Grab all reg fsets +#grab all displayable data sets which aren't reg sets? + +sub log_data_sets{ + my $self = shift; + + my $dset_adaptor = $self->db->get_DataSetAdaptor; + my ($status); + my $txt = 'Checking '; + $status = 'DISPLAYABLE' if($self->{'check_displayable'}); + $txt.= $status.' ' if $status; + $txt .= 'DataSets'; + $self->log_header($txt); + + #Check for status first to avoid warning from BaseAdaptor. + eval { $dset_adaptor->_get_status_name_id($status) }; + + if($@){ + $self->report("FAIL: You have specified check_displayable, but the DISPLAYABLE status_name is not present in the DB"); + return; + } + + + my @dsets; + my $dsets = $dset_adaptor->fetch_all($status); + @dsets = @$dsets if defined $dsets; + + + $self->log('Found '.scalar(@dsets).' DataSets'); + + + foreach my $dset(@dsets){ + $self->log_set("DataSet:\t\t", $dset) ; + + my $fset = $dset->product_FeatureSet; + $self->log_set("Product FeatureSet:\t", $fset) if $fset; + + my @supporting_sets = @{$dset->get_supporting_sets}; + + $self->log('Found '.scalar(@supporting_sets).' supporting sets:'); + + if(my @supporting_sets = @{$dset->get_supporting_sets}){ + #type here could be result, experimental or feature + #and feature could be annotated or experimental + #Move this to log set? + + map { my $type = ref($_); + $type =~ s/.*://; + $type .= '('.$_->feature_class.')' if($type eq 'FeatureSet'); + #Need to sprintf $type here to fixed width + $self->log_set($type.":\t", $_)} @supporting_sets; + } + $self->log(); + } + + return; +} + +sub log_set{ + my ($self, $text, $set) = @_; + + $text .= $set->display_label.'('.$set->name.')'; + $text .= "\tDISPLAYABLE" if($set->is_displayable); + $self->log($text); + + return; +} + + +sub check_stable_ids{ + my ($self, @slices) = @_; + + my $species_id = $self->db()->species_id(); + + $self->log_header('Checking stable IDs'); + + my $fset_a = $self->db->get_FeatureSetAdaptor; + + my @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')}; + + if(!@regf_fsets){ + $self->report('WARNING: No regulatory FeatureSets found (fine if '.$self->db->species.' does not have a regulatory build)'); + } + else{ + + foreach my $fset(@regf_fsets){ + + if($fset->name =~ /_v[0-9]$/){ + $self->log("Skipping stable_id test on archived set:\t".$fset->name); + next; + } + + #Can't count NULL field, so have to count regulatory_feature_id!!! + + #getting SR product here!! + my $sql = "select count(rf.regulatory_feature_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.stable_id is NULL and rf.seq_region_id = sr.seq_region_id and sr.coord_system_id = cs.coord_system_id and cs.species_id = $species_id and rf.feature_set_id=".$fset->dbID; + + + + my ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)}; + + if($null_sids){ + $self->report("FAIL: Found a total of $null_sids NULL stable IDs for ".$fset->name); + + my $slice_a = $self->db->get_SliceAdaptor; + + if(! @slices){ + @slices = @{$slice_a->fetch_all('toplevel', 1)}; + } + + foreach my $slice(@slices){ + my $sr_name=$slice->seq_region_name; + $sql = 'select count(rf.stable_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.seq_region_id=sr.seq_region_id and sr.name="'.$sr_name.'" and sr.coord_system_id = cs.coord_system_id and cs.species_id = '.$species_id.' and rf.stable_id is NULL and rf.feature_set_id='.$fset->dbID; + ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)}; + + #This is not reporting properly. + + $self->log($fset->name.":\t$null_sids NULL stable IDs on ".$slice->name) if $null_sids; + } + } + else{ + $self->log($fset->name.":\tNo NULL stable IDs found"); + } + } + } + + return; + +} + + +#This is for mart to enable them to join to the seq_region table without +#creating a product from all the reundant seq_region entries for each schema_build + +sub set_current_coord_system{ + my ($self) = @_; + + + + my $schema_build = $self->db->_get_schema_build($self->db->dnadb); + $self->log_header("Setting current coord_system on $schema_build"); + + my $sql = "update coord_system set is_current=False where schema_build !='$schema_build'"; + $self->db->dbc->do($sql); + $sql = 'update coord_system set is_current=True where schema_build ="'.$schema_build.'" and attrib like "%default_version%"'; + $self->db->dbc->do($sql); + + return; +} + +sub validate_DataSets{ + my $self = shift; + + $self->log_header('Validating DataSets'); + + + #checks regualtory feature data and supporting sets + #links between DataSet and FeatureSet, i.e. correct naming, not linking to old set + #Displayable, DAS_DISPLAYABLE, IMPORTED_ASSM, MART_DISPLAYABLE + #Naming of result_set should match data/feature_set + #warn non-attr feature/result sets which are DISPLAYABL + #warn about DISPLAYABLE sets which do not have displayable set in analysis_description.web_data + + + my $fset_a = $self->db->get_FeatureSetAdaptor; + my $dset_a = $self->db->get_DataSetAdaptor; + my ($dset_states, $rset_states, $fset_states) = $self->get_regbuild_set_states($self->db); + + my %rf_fsets; + my %set_states; + my $sql; + + RF_FSET: foreach my $rf_fset(@{$fset_a->fetch_all_by_type('regulatory')}){ + my $rf_fset_name = $rf_fset->name; + + + $self->log("Validating $rf_fset_name"); + + + $rf_fsets{$rf_fset_name} = $rf_fset;#Do we only need the name for checking the dsets independantly? + + if($rf_fset_name =~ /_v[0-9]+$/){ + $self->report("FAIL:\tFound archived regulatory FeatureSet:\t$rf_fset_name"); + next RF_FSET; + } + + foreach my $state(@$fset_states){ + + if(! $rf_fset->has_status($state)){ + $self->report("WARNING:\tUpdating FeatureSet $rf_fset_name with status $state"); + + $sql = 'INSERT into status select '. $rf_fset->dbID. + ", 'feature_set', status_name_id from status_name where name='$state'"; + + $self->db->dbc->db_handle->do($sql); + } + } + + #Do we need to warn about other states? + + + + my $rf_dset = $dset_a->fetch_by_product_FeatureSet($rf_fset); + + if(! $rf_dset){ + $self->report("FAIL:\tNo DataSet for FeatureSet:\t$rf_fset_name"); + next RF_FSET; + } + + + + if($rf_fset_name ne $rf_dset->name){ + $self->report("FAIL:\tFound Feature/DataSet name mismatch:\t$rf_fset_name vs ".$rf_dset->name); + next RF_FSET; + } + + + foreach my $state(@$dset_states){ + + if(! $rf_dset->has_status($state)){ + #Can we just update and warn here? + #Or do this separately in case we want some control over this? + $self->report("WARNING:\tUpdating DataSet $rf_fset_name with status $state"); + + $sql = 'INSERT into status select '.$rf_dset->dbID. + ", 'data_set', status_name_id from status_name where name='$state'"; + $self->db->dbc->db_handle->do($sql); + } + } + + + + #We should check fset ctype matches all attr_set ctypes? + #May have problems if we want to merge two lines into the same ctype build + + + + foreach my $ra_fset(@{$rf_dset->get_supporting_sets}){ + + foreach my $state(@$fset_states){ + + if(! $ra_fset->has_status($state)){ + #Can we just update and warn here? + #Or do this separately in case we want some control over this? + $self->report("WARNING:\tUpdating FeatureSet ".$ra_fset->name." with status $state"); + + $sql = 'INSERT into status select '.$ra_fset->dbID. + ", 'feature_set', status_name_id from status_name where name='$state'"; + $self->db->dbc->db_handle->do($sql); + } + } + + + + my $ra_dset = $dset_a->fetch_by_product_FeatureSet($ra_fset); + my @ssets = @{$ra_dset->get_supporting_sets(undef, 'result')}; + my @displayable_sets; + + foreach my $sset(@ssets){ + + if($sset->has_status('DISPLAYABLE')){ + push @displayable_sets, $sset; + } + } + #Change this to get all then check status + #else print update sql + + if(scalar(@displayable_sets) > 1){#There should only be one + $self->report("FAIL:\tThere should only be one DISPLAYABLE supporting ResultSet for DataSet:\t".$ra_dset->name); + } + elsif(scalar(@displayable_sets) == 0){ + + my $msg; + + if(scalar(@ssets) == 1){ + + #fix here? + + $msg = "Found unique non-DISPLAYABLE ResultSet:\t".$ssets[0]->name. + "\n\tinsert into status select ".$ssets[0]->dbID. + ", 'result_set', status_name_id from status_name where name='DISPLAYABLE';"; + } + else{ + $msg = "Found ".scalar(@ssets)." ResultSets ".join("\t", map($_->name, @ssets)); + } + + $self->report("FAIL:\tThere are no DISPLAYABLE supporting ResultSets for DataSet:\t". + $ra_dset->name."\n$msg"); + + + + next; #$ra_fset + } + + my $ra_rset = $ssets[0]; + + foreach my $state(@$rset_states){ + + if(! $ra_rset->has_status($state)){ + #Can we just update and warn here? + #Or do this separately in case we want some control over this? + $self->report("WARNING:\tUpdating ResultSet ".$ra_rset->name." with status $state"); + + $sql = 'INSERT into status select '.$ra_rset->dbID. + ", 'result_set', status_name_id from status_name where name='$state'"; + $self->db->dbc->db_handle->do($sql); + } + } + } + } + + return; +} # End of validate_DataSets + + + + + + +sub analyse_and_optimise_tables{ + my $self = shift; + + #myisamchk --analyze. or analyze statement + + if($self->{'skip_analyse'}){ + $self->log_header('Skipping analyse/optimise tables'); + return; + } + + $self->log_header("Analysing and optimising tables"); + + + my $sql = 'show tables;'; + my @tables = @{$self->db->dbc->db_handle->selectall_arrayref($sql)}; + map $_ = "@{$_}", @tables; + my $analyse_sql = 'analyze table '; + my $optimise_sql = 'optimize table '; + + + + foreach my $table(@tables){ + $self->log("Analysing and optimising $table"); + + + + #Remove analyse as optimise does everything this does + my @anal_info = @{$self->db->dbc->db_handle->selectall_arrayref($analyse_sql.$table)}; + + foreach my $line_ref(@anal_info){ + my $status = $line_ref->[3]; + $self->report("FAIL: analyse $table status $status") if (!($status eq 'OK' || $status eq 'Table is already up to date')); + } + + my @opt_info = @{$self->db->dbc->db_handle->selectall_arrayref($optimise_sql.$table)}; + + foreach my $line_ref(@opt_info){ + + my $status = $line_ref->[3]; + $self->report("FAIL: optimise $table status $status") if (!( $status eq 'OK' || $status eq 'Table is already up to date')); + } + + } + + return; +}# end of analyse_and_optimise_tables + + +sub clean_xrefs{ + my ($self) = @_; + + if($self->{'skip_xrefs'}){ + $self->log_header('Skipping clean_xrefs'); + return; + } + + $self->log_header("Cleaning unlinked xref records"); + + my $sql = 'DELETE x FROM xref x LEFT JOIN object_xref ox ON ox.xref_id = x.xref_id WHERE ox.xref_id IS NULL'; + #Should this also take accoumt of unmapped_objects? + #No, as unmapped_object doesn't use xref, but probably should + + my $row_cnt = $self->db->dbc->do($sql); + + $self->reset_table_autoinc('xref', 'xref_id', $self->db); + $row_cnt = 0 if $row_cnt eq '0E0'; + $self->log("Deleted $row_cnt unlinked xref records"); + + + #Now remove old edbs + $self->log_header("Cleaning unlinked external_db records"); + + #Need to account for xref and unmapped_object here + $sql = 'DELETE edb FROM external_db edb '. + 'LEFT JOIN xref x ON x.external_db_id = edb.external_db_id '. + 'LEFT JOIN unmapped_object uo ON uo.external_db_id=edb.external_db_id '. + 'WHERE x.external_db_id IS NULL and uo.external_db_id is NULL'; + $row_cnt = $self->db->dbc->do($sql); + + $self->reset_table_autoinc('external_db', 'external_db_id', $self->db); + $row_cnt = 0 if $row_cnt eq '0E0'; + $self->log("Deleted $row_cnt unlinked external_db records"); + + + #Shouldn't clean orphaned oxs here as this means a rollback been done underneath the ox data + #or we have xref_id=0! + #Leave this to HC? + + + + + + return; +} + + +1;
