diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/HealthChecker.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Utils/HealthChecker.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,1086 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::Utils::HealthChecker
+  
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+B<This program> provides several methods to health check and update tables prior to 
+release. Using the updte_DB_for_release method runs the following:
+
+  validate_new_seq_regions    - _pre_stores seq_region & coord_system info from new core DB
+  check_regbuild_strings      - Validates or inserts regbuild_string entries
+  check_meta_species_version  - Validates meta species and version wrt dbname
+  set_current_coord_system    - Updates coord_system.is_current to 1 for current schema_build (required for mart)
+  update_meta_coord           - Regenerates meta_coord.max_length values (required for Slice range queries)
+  clean_xrefs                 - Removes old unused xref and external_db records
+  validate_DataSets           - Performs various checks on Data/Feature/ResultSets links and states
+  check_stable_ids            - Check for any NULL stable IDs
+  log_data_sets               - Logs all DISPLAYABLE DataSets
+  analyse_and_optimise_tables - Does what is says
+
+
+=cut
+
+################################################################################
+
+package Bio::EnsEMBL::Funcgen::Utils::HealthChecker;
+
+use strict;
+use Bio::EnsEMBL::Funcgen::Utils::Helper;
+use Bio::EnsEMBL::Funcgen::ProbeFeature;
+use Bio::EnsEMBL::Utils::Argument qw( rearrange );
+use Bio::EnsEMBL::Utils::Exception qw( throw );
+use Bio::EnsEMBL::Analysis;
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);
+
+
+
+#TO DO
+# 1 DONE Print all fails and warnings in summary at end of script.
+# 2 validate_RegulatoryFeature_Sets
+# 3 Some of these can be migrated or mirrored in java HCs for safety
+
+
+################################################################################
+
+sub new {
+  my $caller = shift;
+  my $class = ref($caller) || $caller;
+  my $self = $class->SUPER::new(@_);
+    
+  #validate and set type, analysis and feature_set here
+  my ($db, $builds, $skip_mc, $check_displayable, $skip_analyse, $meta_coord_tables, $skip_xrefs, $fix) = 
+	rearrange(['DB', 'BUILDS', 'SKIP_META_COORD', 'CHECK_DISPLAYABLE', 'SKIP_ANALYSE', 'META_COORD_TABLES', 'SKIP_XREF_CLEANUP', 'FIX'], @_);
+  
+  
+  if (! ($db && ref($db) &&
+		 $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){
+	throw('You must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor');
+  }
+  
+  #test connection
+  $db->dbc->db_handle;
+  
+  $self->{'db'} = $db;
+  $self->{'mysql_connect_string'} = 'mysql -h'.$db->dbc->host.' -u'.$db->dbc->username.' -p'
+	.$db->dbc->password.' '.$db->dbc->dbname.' -P'.$db->dbc->port;
+  $self->{'dbname'} = $db->dbc->dbname;
+  $self->{'builds'} = (scalar(@$builds)>0) ? $builds : [];
+  $self->{'skip_meta_coord'} = $skip_mc;
+  $self->{'skip_xrefs'} = $skip_xrefs;
+  $self->{'skip_analyse'} = $skip_analyse;
+  $self->{'check_displayable'} = $check_displayable;
+  $self->{fix} = $fix;
+  
+  if(defined $meta_coord_tables){
+
+	throw('-skip_meta_coord is set, Cannot build meta_coord entries for tables '.join(', ', @$meta_coord_tables));
+
+	if(! ref($meta_coord_tables) eq 'ARRAY'){
+	  throw('-meta_coord_tables parameter must be an array ref');
+	}
+
+	@{$self->{'meta_coord_tables'}} = @$meta_coord_tables;
+  }
+
+  return $self;
+}
+
+sub db{  return $_[0]->{db};  }
+
+sub fix{ return $_[0]->{fix}; }
+
+=head2 update_db_for_release
+
+  Arg[0]     : 
+  Example    : 
+  Description: Wrapper method to perform all common update functions
+  Returntype : 
+  Exceptions : None
+  Caller     : General
+  Status     : at risk
+
+=cut
+
+sub update_db_for_release{
+  my ($self, @args) = @_;
+  
+  if(@args){
+  }
+	
+  #do seq_region_update to validate dnadb first
+  #hence avoiding redoing longer methods
+  $self->validate_new_seq_regions;#($force_srs);
+  #$self->update_meta_schema_version;
+  $self->check_regbuild_strings;
+  $self->check_meta_species_version;
+  $self->set_current_coord_system;
+  $self->update_meta_coord;
+  $self->clean_xrefs;
+  $self->validate_DataSets;
+  $self->check_stable_ids;
+  $self->log_data_sets();
+  $self->analyse_and_optimise_tables;#ALWAYS LAST!!
+
+  $self->log_header('??? Have you dumped/copied GFF dumps ???');
+
+  #Log footer? Pass optional counts hash?
+  $self->log('Finished updating '.$self->{'dbname'}." for release\n\n");
+}
+
+sub validate_new_seq_regions{
+  my ($self, $force) = @_;
+  
+
+  #We need to add some functionality to handle non-standard schema_build progression here
+  #This should be used before any data is loaded
+  #It should also warn if there any duplicates if it is not run before coord_systems are duplicated
+  #Should this just be handled in the BaseFeatureAdaptor/CoordSystemAdaptor?
+  
+
+  
+  
+  #do we need to add the none default levels here?
+  #or are we only bothered about those which constitute the toplevel?
+
+  #To make sure we have all the correct levels in eFG we need to get all the names.
+  #then get all by name from the core db and set them as the dnadb.
+  # we also need to get all the toplevel seq_regions and store them in the seq_region table
+  #use BaseFeatureAdaptor::_pre_store with and array of pseudo feature on each top level slice
+
+  #Validate the efgdb and dnadb schema version are the same first
+  #This is because we expect the schem_build to be the same for a release
+  #However, this will autoset the dnadb if no defined, so will always match!
+
+  if(! $force){
+	my $efgdb_sm = join('_', @{$self->get_schema_and_build($self->{'dbname'})});
+	my $dnadb_sm = join('_', @{$self->get_schema_and_build($self->db->dnadb->dbc->dbname)});
+	
+	if($efgdb_sm ne $dnadb_sm){
+	  $self->report("WARNING Skipped validate_new_seq_regions as schema_versions are mismatched:\t".
+				 "efgdb $efgdb_sm\tdnadb $dnadb_sm");
+	  return 0;
+	}
+  }
+  
+  my $pf_adaptor    = $self->db->get_ProbeFeatureAdaptor();
+  my $slice_adaptor = $self->db->dnadb->get_SliceAdaptor();
+  my $dnadb_csa     = $self->db->dnadb->get_CoordSystemAdaptor;
+  
+  $self->log_header('Validating new coord_systems/seq_regions');
+  
+  my @slices;
+  my %versioned_levels;
+  my $default_version;
+  
+  #Grab unversioned top level slices and versioned levels
+  #May miss some old versioned level if the new assembly no longer has them
+  foreach my $slice(@{$slice_adaptor->fetch_all('toplevel', undef, 1)}){
+
+	if (! $slice->coord_system->version){
+	  push @slices, $slice;
+	}
+	else{
+
+	  if($default_version &&
+		  ($default_version ne $slice->coord_system->version)){
+		throw("Found more than one default CoordSystem version:\t${default_version}\t".$slice->coord_system->version);
+	  }
+	  else{
+		$default_version = $slice->coord_system->version;
+	  }
+	}
+  }
+  
+ 
+  #Get all versioned levels for all builds
+  foreach my $cs(@{$dnadb_csa->fetch_all}){
+	
+	if($cs->version){
+	  $versioned_levels{$cs->version} ||= [];	
+	  push @{$versioned_levels{$cs->version}}, $cs->name;
+	}
+  }
+  
+  push @{$self->{'builds'}}, $default_version if scalar(@{$self->{'builds'}}) == 0;
+  
+
+
+  #Grab slices for each versioned level
+  foreach my $build(@{$self->{'builds'}}){
+	
+	if(! exists $versioned_levels{$build}){
+	  throw("CoordSystem version $build does not exist in the dnadb ".$self->db->dnadb->dbc->dbname);
+	}
+	
+	foreach my $level(@{$versioned_levels{$build}}){
+	  $self->log("Getting slices for $level $build");
+	  push @slices, @{$slice_adaptor->fetch_all($level, $build)};
+	}
+  }
+  
+  $self->log("Importing seq_region/coord_system info for builds:\t".join(',', @{$self->{'builds'}}));
+  
+  foreach my $slice(@slices){
+
+	if($slice->start() != 1){
+	  $self->log("Reslicing slice:\t".$slice->name());
+	  #we must have some sort of PAR linked region i.e. Y
+	  $slice = $slice_adaptor->fetch_by_region($slice->coord_system_name(), $slice->seq_region_name());
+	}
+	
+	
+	#we need test if it needs doing first?
+	#we would need to test for the coord_systems outside of this loop
+	#and then for each seq_region inside the loop if the coord_system is present
+	
+	$self->log("_pre_storing seq_region info for slice:\t".$slice->name());
+	
+	my $pseudo_feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new
+	  (
+	   -slice => $slice,
+	   -start => 0,
+	   -end   => 0,
+	   -strand => 0,
+		);
+	
+	$pf_adaptor->_pre_store($pseudo_feature);
+	#This will create a meta_coord entry of max_length 1 for features which have an absent meta_coord entry
+	
+  }
+
+  
+  $self->log("Finished validating seq_regions\n");
+  
+  return;
+}
+
+
+
+
+sub update_meta_coord{
+  my ($self, @table_names) = @_;
+  
+  my $species_id = $self->db->species_id;
+  
+  if($self->{'skip_meta_coord'}){
+	$self->log("Skipping meta_coord update\n");
+	return;
+  }
+  
+
+  $self->log_header('Updating meta_coord table');
+
+
+  #set default table_name
+  if(! @table_names || scalar(@table_names) == 0){
+
+	#Can we do this via DBAdaptor and get all available adaptors which are BaseFeatureAdaptors then grab the first table name
+
+	if(defined $self->{'meta_coord_tables'}){
+	  @table_names = @{$self->{'meta_coord_tables'}};
+	}
+	else{#default
+
+	  @table_names = qw(
+						regulatory_feature
+						probe_feature
+						external_feature
+						annotated_feature
+						result_feature
+						segmentation_feature
+					 );
+	}
+  }
+
+  #backup meta coord
+  if(system($self->{'mysql_connect_string'}." -e 'SELECT * FROM meta_coord'"
+			. '> '.$self->{'dbname'}.'meta_coord.backup'
+		   ) != 0 ){
+	
+	throw("Can't dump the original meta_coord for back up");#will this get copied to log?
+  } 
+  else {
+	$self->log('Original meta_coord table backed up in '. $self->{'dbname'}.'.meta_coord.backup');
+  }
+  
+
+  #Update each max_length for table_name and coord_system
+
+  my $sql;
+
+  foreach my $table_name(@table_names){
+  
+	$sql = "select distinct(cs.name), mc.coord_system_id, cs.version, mc.max_length from coord_system cs, meta_coord mc where mc.table_name='$table_name' and mc.coord_system_id=cs.coord_system_id and cs.species_id = $species_id";
+	
+	$self->log('');
+	$self->log("Updating meta_coord max_length for $table_name:");
+	$self->log("name\tcoord_system_id\tversion\tmax_length");
+	
+	#can we test for emtpy array here? Then skip delete.
+	
+	my @info = @{$self->db->dbc->db_handle->selectall_arrayref($sql)};
+	
+	#log this
+	map {$self->log(join("\t", @{$_}))} @info;
+	
+	# Clean old entries
+	$self->log("Deleting old meta_coord entries");
+	$sql = "DELETE mc FROM meta_coord mc, coord_system cs WHERE mc.table_name ='$table_name' and mc.coord_system_id = cs.coord_system_id and cs.species_id = $species_id";
+	$self->db->dbc->db_handle->do($sql);
+	
+	# Generate new max_lengths
+	$self->log("Generating new max_lengths");
+	
+	#Is this query running for each redundant cs_id?
+	#would it be more efficient to retrieve the NR cs_ids first and loop the query for each cs_id?
+	
+	#Can we get the dbID of the largest feature for ease of checking?
+	#This won't work as we're grouping by coord_system
+	#would need to select distinct coord_system_id for table first
+	#This may well slow down quite a bit doing it this way
+	
+	$sql = "select distinct s.coord_system_id from coord_system cs, seq_region s, $table_name t WHERE t.seq_region_id = s.seq_region_id and s.coord_system_id = cs.coord_system_id and cs.species_id = $species_id";
+	my @cs_ids = @{$self->db->dbc->db_handle->selectall_arrayref($sql)};
+	#Convert single element arrayrefs to scalars
+	map $_ = ${$_}[0], @cs_ids;
+
+	$self->log("New max_lengths for $table_name are:");
+	
+	$self->log(join("\t", ('coord_system_id', 'max_length', 'longest record dbID')));
+
+	foreach my $cs_id(@cs_ids){
+	  #This will always give a length of 1 even if there are no features present
+
+	  my @cs_lengths;
+
+	  #The probe_feature table is now too big to do this in one go
+	  #We need to break this down into sr_ids
+
+	  $sql = "SELECT distinct t.seq_region_id from $table_name t, seq_region sr where t.seq_region_id=sr.seq_region_id and sr.coord_system_id=$cs_id";
+	  
+	  my @sr_ids = @{$self->db->dbc->db_handle->selectcol_arrayref($sql)};
+	  
+
+	  #Get longest feature for all seq_regions
+	  foreach my $sr_id(@sr_ids){
+		$sql = "SELECT (t.seq_region_end - t.seq_region_start + 1 ) as max, t.${table_name}_id "
+		  . "FROM $table_name t "
+			. "WHERE t.seq_region_id = $sr_id ";
+		$sql .= ' and t.window_size=0' if $table_name eq 'result_feature';
+		$sql .= " order by max desc limit 1";
+		
+
+		#Problem here is that DBs without 0 wsize result_feture entries will not get a meta_coord entry
+		#We need to implement this in the _pre_store method too?
+
+
+		my ($cs_length, $table_id);
+		($cs_length, $table_id) = $self->db->dbc->db_handle->selectrow_array($sql);
+		push @cs_lengths, [$cs_length, $table_id] if $cs_length;
+	  }
+
+
+	  if(@cs_lengths){
+		#This will now contain a list of arrays refs contain the max length and feature id for 
+		#each seq_region in this coord_system
+		#Now sort to get the longest
+		#Can't sort on 2 day array in the normal way
+		#One list list lists, the comparatee is no longer a list but a reference
+		@cs_lengths = sort { $b->[0] <=> $a->[0] } @cs_lengths;
+		$self->log(join("\t\t", ($cs_id, @{$cs_lengths[0]})));
+		$sql = "INSERT INTO meta_coord values('${table_name}', $cs_id, ".$cs_lengths[0][0].')';
+		$self->db->dbc->db_handle->do($sql);
+	  }
+	}
+  }  
+  
+  $self->log("Finished updating meta_coord max_lengths\n");
+  
+  return;
+}
+
+
+#change this to check_meta_species_version
+
+sub check_meta_species_version{
+  my ($self) = @_;
+
+  $self->log_header('Checking meta species.production_name and schema_version against dbname');
+
+  my $dbname = $self->db->dbc->dbname;
+  (my $dbname_species = $dbname) =~ s/_funcgen_.*//;
+  my $mc = $self->db->get_MetaContainer;
+  my $schema_version = $mc->list_value_by_key('schema_version')->[0];
+
+  if(! defined $schema_version){
+	$self->report("FAIL:\tNo meta schema_version defined");
+  }
+  elsif($dbname !~ /funcgen_${schema_version}_/){
+	$self->report("FAIL:\tMeta schema_version ($schema_version) does not match the dbname ($dbname).");
+  }
+
+
+  my @latin_names = @{$mc->list_value_by_key('species.production_name')};
+  
+  if(scalar(@latin_names) > 1){
+	$self->report("FAIL:\tFound more than one species.production_name in meta:\t".join(", ", @latin_names));
+  }
+  elsif(scalar(@latin_names) == 1 && ($latin_names[0] ne $dbname_species)){
+	$self->report("FAIL:\tFound mismatch between meta species.production_name and dbname:\t".$latin_names[0]." vs $dbname_species");
+  }
+  elsif(scalar(@latin_names) == 0){
+	$self->report("WARNING:\tFound no meta species.production_name setting as:\t$dbname_species");
+	$self->db->dbc->db_handle->do("INSERT into meta(species_id, meta_key, meta_value) values(1, 'species.production_name', '$dbname_species')");
+  }
+  #else is okay
+
+  return;
+}
+
+
+
+#Move to Java HC? Or update if update flag specified
+#Using same code used by build_reg_feats!
+
+sub check_regbuild_strings{
+  my ($self) = @_;
+  
+  #Removed $update arg as we would always want to do this manually
+
+  $self->log_header('Checking regbuild strings');
+  my $species_id = $self->db()->species_id();
+
+
+  my @regf_fsets;
+  my $passed = 1;
+  my $fset_a = $self->db->get_FeatureSetAdaptor;
+  #my $mc = $self->db->get_MetaContainer;
+  my $regf_a = $self->db->get_RegulatoryFeatureAdaptor;
+  #We now want to chek all build
+  @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')};
+ 
+  
+  if(scalar(@regf_fsets) == 0){
+    $self->report('WARNING:check_regbuild_strings found no regulatory FeatureSets (fine if '.$self->db->species.' your species does not have a regulatory build');
+  }
+  else{
+	
+	warn "Need to check/update regbuild.version and regbuild.initial_release_date regbuild.last_annotation_update";
+  
+
+	#How do we validate this?
+	#Check all feature_sets exist
+	#Pull back some features from a test slice and check the number of bits match.
+	#Check the feature_type string exists and matches else create.
+	
+	
+	foreach my $fset(@regf_fsets){
+	  $self->log_header("Validating regbuild_string entries for FeatureSets:\t".$fset->name);
+		
+	  #Fail for old versions as we want to remove these
+	  if( $fset->name =~ /_v[0-9]+$/){
+		$self->report("FAIL:\t".$fset->name." is an old RegulatoryFeature set, please remove!");
+		next;
+	  }
+
+	  my $cell_type = (defined $fset->cell_type) ? $fset->cell_type->name : 'core';
+	
+	  #This has been lifted from build_regulatory_features.pl store_regbuild_strings
+	  #Need to move this to a RegulatoryBuilder module
+	  my $dset = $self->db->get_DataSetAdaptor->fetch_by_product_FeatureSet($fset);
+	    
+	  my @ssets = @{$dset->get_supporting_sets};
+	  
+	  if(! @ssets){
+		throw('You must provide a DataSet with associated supporting sets');
+	  }
+
+
+
+	  my %reg_strings = 
+		(
+		 "regbuild.${cell_type}.feature_set_ids" => join(',', map {
+		   $_->dbID} sort {$a->name cmp $b->name
+						 } @ssets),
+
+		 "regbuild.${cell_type}.feature_type_ids" => join(',', map {
+		   $_->feature_type->dbID} sort {$a->name cmp $b->name
+									   } @ssets),
+		);
+
+	  my @ffset_ids;
+	  
+	  #Skip this now as we use the ftype classes for defining the focus sets
+	  #foreach my $fset(@ssets){
+
+
+	#	#This might fail if soem TFs haven't been included as focus i.e. are part of PolII/III
+	#	#e.g. TFIIIC-110
+##
+#		if( ($fset->feature_type->class eq 'Transcription Factor') ||
+#			($fset->feature_type->class eq 'Open Chromatin') ){
+#		  push @ffset_ids, $fset->dbID;
+#		}
+#	  }
+
+	 	  
+	  my ($sql, %db_reg_string);
+
+	  foreach my $string_key(keys %reg_strings){
+      my ($string)= $self->db->dbc->db_handle->selectrow_array("select string from regbuild_string where name='${string_key}' and species_id=$species_id");
+		      
+      if (! defined $string) {
+        $sql = "insert into regbuild_string (species_id, name, string) values ($species_id, '${string_key}', '$reg_strings{${string_key}}');";
+    
+        $self->report("WARNING:\tInserting absent $string_key into regbuild_string table");
+        eval { $self->db->dbc->do($sql) };
+        die("Couldn't store $string_key in regbuild_string table\n$sql\n$@") if $@;
+      }
+      elsif ($string ne $reg_strings{$string_key}){
+        $sql = "update regbuild_string set string='".$reg_strings{$string_key}."' where name='${string_key}';";
+        
+        if($self->fix){
+          $self->report("WARNING:\tUpdating mismatched $string_key found in regbuild_string table:\t${string}");#\tUpdate using:\t$sql");
+          eval { $self->db->dbc->do($sql) };
+          die("Couldn't update $string_key in regbuild_string table\n$sql\n$@") if $@;
+          
+        }
+        else{
+          $self->report("FAIL:\tMismatched $string_key found in regbuild_string table:\t${string}\n\tUpdate using:\t$sql");
+        }
+      }
+
+      $db_reg_string{$string_key} = $string;
+	  }
+
+
+	  #Now need to tidy this block wrt new code added above
+	  my $fset_string_key  = "regbuild.${cell_type}.feature_set_ids";
+	  my $ftype_string_key = "regbuild.${cell_type}.feature_type_ids";
+	  my $fset_string  = $db_reg_string{$fset_string_key};
+	  my $ftype_string = $db_reg_string{$ftype_string_key};
+
+	  if(! ($fset_string && $ftype_string)){
+      $self->report("FAIL:\tSkipping fset vs ftype string test for $cell_type")
+	  }
+	  else{
+
+		#This is now effectively handled by the loop above
+
+		$self->log("Validating :\t$fset_string_key vs $ftype_string_key");
+
+		my @fset_ids  = split/,/, $fset_string;
+		my @ftype_ids = split/,/, $ftype_string;
+		my @new_ftype_ids;
+		my $ftype_fail = 0;
+		
+		#Now need to work backwards through ftypes to remove pseudo ftypes before validating
+		#New string should be A,A,A;S,S,S,S,S,S;P,P,P
+		#Where A is and Anchor/Seed set
+		#S is a supporting set
+		#P is a pseudo feature type e.g. TSS proximal
+		
+
+		if(scalar(@fset_ids) != scalar(@ftype_ids)){
+		  $self->report("FAIL:\tLength mismatch between:\n\t$fset_string_key(".scalar(@fset_ids).")\t$fset_string\n\tAND\n\t$ftype_string_key(".scalar(@ftype_ids).")\t$ftype_string");
+		}
+		
+		foreach my $i(0..$#fset_ids){
+		  my $supporting_set_id = $fset_ids[$i];
+		  my $sset = $fset_a->fetch_by_dbID($supporting_set_id);
+		  
+		  if(! defined $sset){
+        $self->report("FAIL:\t$fset_string_key $supporting_set_id does not exist in the DB");
+		  }
+		  else{
+			#test/build ftype string
+		  
+			if(defined $ftype_string){
+			  
+			  if($sset->feature_type->dbID != $ftype_ids[$i]){
+				$ftype_fail = 1;
+				$self->report("FAIL:\t$fset_string_key $supporting_set_id(".$sset->name.") FeatureType(".$sset->feature_type->name.") does not match $ftype_string_key $ftype_ids[$i]");
+			  }
+			}
+	
+			push @new_ftype_ids, $sset->feature_type->dbID;
+	
+		  }
+		}
+
+
+		#Set ftype_string
+		#This will not account for pseudo ftypes?  Remove!!!?
+		my $new_ftype_string = join(',', @new_ftype_ids);
+
+		if(! defined $ftype_string){
+		  $self->log("Updating $ftype_string_key to:\t$new_ftype_string");
+		  $self->db->dbc->db_handle->do("INSERT into regbuild_string(species_id, name, string) values($species_id, '$ftype_string_key', '$new_ftype_string')");
+    }
+		elsif($ftype_fail){
+		  $self->report("FAIL:\t$ftype_string_key($ftype_string) does not match $fset_string_key types($new_ftype_string)");
+		}
+
+
+		#Finally validate versus a reg feat
+		#Need to change this to ftype string rather than fset string?
+		my $id_row_ref = $self->db->dbc->db_handle->selectrow_arrayref('select regulatory_feature_id from regulatory_feature where feature_set_id='.$fset->dbID.' limit 1');
+
+		if(! defined $id_row_ref){
+		  $self->report("FAIL:\tNo RegulatoryFeatures found for FeatureSet ".$fset->name);
+		}
+		else{
+		  my ($regf_dbID) = @$id_row_ref;
+		  my $rf_string = $regf_a->fetch_by_dbID($regf_dbID)->binary_string;
+		  
+		  if(length($rf_string) != scalar(@fset_ids)){
+			$self->report("FAIL:\tRegulatory string length mismatch between RegulatoryFeature($regf_dbID) and $fset_string_key:\n$rf_string(".length($rf_string).")\n$fset_string(".scalar(@fset_ids).")");
+		  }
+		}
+	  }
+	}
+  }
+
+  return;
+}
+
+
+#Change this to log sets and incorporate RegFeat FeatureSet as standard
+#Grab all reg fsets
+#grab all displayable data sets which aren't reg sets?
+
+sub log_data_sets{
+  my $self = shift;
+  
+  my $dset_adaptor = $self->db->get_DataSetAdaptor;
+  my ($status);
+  my $txt = 'Checking ';
+  $status = 'DISPLAYABLE' if($self->{'check_displayable'});
+  $txt.= $status.' ' if $status;
+  $txt .= 'DataSets';
+  $self->log_header($txt);
+  
+  #Check for status first to avoid warning from BaseAdaptor.
+  eval { $dset_adaptor->_get_status_name_id($status) };
+
+  if($@){
+	$self->report("FAIL: You have specified check_displayable, but the DISPLAYABLE status_name is not present in the DB");
+	return;
+  }
+
+	
+  my @dsets;
+  my $dsets = $dset_adaptor->fetch_all($status);
+  @dsets = @$dsets if defined $dsets;
+  
+  
+  $self->log('Found '.scalar(@dsets).' DataSets');
+  
+  
+  foreach my $dset(@dsets){
+	$self->log_set("DataSet:\t\t", $dset) ;
+
+	my $fset = $dset->product_FeatureSet;
+	$self->log_set("Product FeatureSet:\t", $fset) if $fset;
+	
+	my @supporting_sets = @{$dset->get_supporting_sets};
+	
+	$self->log('Found '.scalar(@supporting_sets).' supporting sets:');
+
+	if(my @supporting_sets = @{$dset->get_supporting_sets}){
+	  #type here could be result, experimental or feature
+	  #and feature could be annotated or experimental
+	  #Move this to log set?
+
+	  map { my $type = ref($_);
+			$type =~ s/.*://;			
+			$type .= '('.$_->feature_class.')' if($type eq 'FeatureSet');
+			#Need to sprintf $type here to fixed width
+			$self->log_set($type.":\t", $_)} @supporting_sets;
+	}
+	$self->log();
+  }
+
+  return;
+}
+
+sub log_set{
+  my ($self, $text, $set) = @_;
+  
+  $text .= $set->display_label.'('.$set->name.')';
+  $text .= "\tDISPLAYABLE" if($set->is_displayable);
+  $self->log($text);
+
+  return;
+}
+
+
+sub check_stable_ids{
+  my ($self, @slices) = @_;
+  
+  my $species_id = $self->db()->species_id();
+
+  $self->log_header('Checking stable IDs');
+
+  my $fset_a = $self->db->get_FeatureSetAdaptor;
+
+  my @regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')};
+
+  if(!@regf_fsets){
+	$self->report('WARNING: No regulatory FeatureSets found (fine if '.$self->db->species.' does not have a regulatory build)');
+  }
+  else{
+
+	foreach my $fset(@regf_fsets){
+
+	  if($fset->name =~ /_v[0-9]$/){
+		$self->log("Skipping stable_id test on archived set:\t".$fset->name);
+		next;
+	  }
+	  
+	  #Can't count NULL field, so have to count regulatory_feature_id!!!
+
+	  #getting SR product here!!
+	  my $sql = "select count(rf.regulatory_feature_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.stable_id is NULL and rf.seq_region_id = sr.seq_region_id and sr.coord_system_id = cs.coord_system_id and cs.species_id = $species_id and rf.feature_set_id=".$fset->dbID;
+	  
+	  
+	  
+	  my ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)};
+	  
+	  if($null_sids){
+		$self->report("FAIL: Found a total of $null_sids NULL stable IDs for ".$fset->name);
+		
+		my $slice_a = $self->db->get_SliceAdaptor;
+		
+		if(! @slices){
+		  @slices = @{$slice_a->fetch_all('toplevel', 1)};
+		}
+		
+		foreach my $slice(@slices){
+		  my $sr_name=$slice->seq_region_name;
+		  $sql = 'select count(rf.stable_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.seq_region_id=sr.seq_region_id and sr.name="'.$sr_name.'" and sr.coord_system_id = cs.coord_system_id and cs.species_id = '.$species_id.' and rf.stable_id is NULL and rf.feature_set_id='.$fset->dbID;
+		  ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)};
+		  
+		  #This is not reporting properly.
+
+		  $self->log($fset->name.":\t$null_sids NULL stable IDs on ".$slice->name) if $null_sids;
+		}
+	  }
+	  else{
+		$self->log($fset->name.":\tNo NULL stable IDs found");
+	  }
+	}
+  }
+
+  return;
+
+}
+
+
+#This is for mart to enable them to join to the seq_region table without
+#creating a product from all the reundant seq_region entries for each schema_build
+
+sub set_current_coord_system{
+  my ($self) = @_;
+
+
+
+  my $schema_build = $self->db->_get_schema_build($self->db->dnadb);
+  $self->log_header("Setting current coord_system on $schema_build");
+
+  my $sql = "update coord_system set is_current=False where schema_build !='$schema_build'";
+  $self->db->dbc->do($sql);
+  $sql = 'update coord_system set is_current=True where schema_build ="'.$schema_build.'" and attrib like "%default_version%"';
+  $self->db->dbc->do($sql);
+
+  return;
+}
+
+sub validate_DataSets{
+  my $self = shift;
+  
+  $self->log_header('Validating DataSets');
+
+
+  #checks regualtory feature data and supporting sets
+  #links between DataSet and FeatureSet, i.e. correct naming, not linking to old set
+  #Displayable, DAS_DISPLAYABLE, IMPORTED_ASSM, MART_DISPLAYABLE
+  #Naming of result_set should match data/feature_set
+  #warn non-attr feature/result sets which are DISPLAYABL
+  #warn about DISPLAYABLE sets which do not have displayable set in analysis_description.web_data
+
+
+  my $fset_a = $self->db->get_FeatureSetAdaptor;
+  my $dset_a = $self->db->get_DataSetAdaptor;
+  my ($dset_states, $rset_states, $fset_states) = $self->get_regbuild_set_states($self->db);
+
+  my %rf_fsets;
+  my %set_states;
+  my $sql;
+
+ RF_FSET: foreach my $rf_fset(@{$fset_a->fetch_all_by_type('regulatory')}){
+	my $rf_fset_name = $rf_fset->name;
+	
+
+	$self->log("Validating $rf_fset_name");
+
+
+	$rf_fsets{$rf_fset_name} = $rf_fset;#Do we only need the name for checking the dsets independantly?
+
+	if($rf_fset_name =~ /_v[0-9]+$/){
+	  $self->report("FAIL:\tFound archived regulatory FeatureSet:\t$rf_fset_name");
+	  next RF_FSET;
+	}
+	
+	foreach my $state(@$fset_states){
+	  
+	  if(! $rf_fset->has_status($state)){
+		$self->report("WARNING:\tUpdating FeatureSet $rf_fset_name with status $state");
+		
+		$sql = 'INSERT into status select '. $rf_fset->dbID.
+		  ", 'feature_set', status_name_id from status_name where name='$state'";
+
+		$self->db->dbc->db_handle->do($sql);
+	  }
+	}
+
+	#Do we need to warn about other states?
+		
+
+
+	my $rf_dset = $dset_a->fetch_by_product_FeatureSet($rf_fset);
+
+	if(! $rf_dset){
+	    $self->report("FAIL:\tNo DataSet for FeatureSet:\t$rf_fset_name");
+	  next RF_FSET;
+	}
+
+	
+
+	if($rf_fset_name ne $rf_dset->name){
+	  $self->report("FAIL:\tFound Feature/DataSet name mismatch:\t$rf_fset_name vs ".$rf_dset->name);
+	  next RF_FSET;
+	}
+
+	
+	foreach my $state(@$dset_states){
+	  
+	  if(! $rf_dset->has_status($state)){
+		#Can we just update and warn here?
+		#Or do this separately in case we want some control over this?
+		$self->report("WARNING:\tUpdating DataSet $rf_fset_name with status $state");
+
+		$sql = 'INSERT into status select '.$rf_dset->dbID.
+		  ", 'data_set', status_name_id from status_name where name='$state'";
+		$self->db->dbc->db_handle->do($sql);
+	  }
+	}
+
+
+
+	#We should check fset ctype matches all attr_set ctypes?
+	#May have problems if we want to merge two lines into the same ctype build
+
+
+
+	foreach my $ra_fset(@{$rf_dset->get_supporting_sets}){
+	
+	  foreach my $state(@$fset_states){
+	  
+		if(! $ra_fset->has_status($state)){
+		  #Can we just update and warn here?
+		  #Or do this separately in case we want some control over this?
+		  $self->report("WARNING:\tUpdating FeatureSet ".$ra_fset->name." with status $state");
+
+		  $sql = 'INSERT into status select '.$ra_fset->dbID.
+			", 'feature_set', status_name_id from status_name where name='$state'";
+		  $self->db->dbc->db_handle->do($sql);
+    }
+  }
+
+
+
+	  my $ra_dset = $dset_a->fetch_by_product_FeatureSet($ra_fset);
+	  my @ssets = @{$ra_dset->get_supporting_sets(undef, 'result')};
+	  my @displayable_sets;
+
+	  foreach my $sset(@ssets){
+	
+		if($sset->has_status('DISPLAYABLE')){
+		  push @displayable_sets, $sset;
+		}
+	  }
+	  #Change this to get all then check status
+	  #else print update sql
+	  
+	  if(scalar(@displayable_sets) > 1){#There should only be one
+		$self->report("FAIL:\tThere should only be one DISPLAYABLE supporting ResultSet for DataSet:\t".$ra_dset->name);
+	  }
+	  elsif(scalar(@displayable_sets) == 0){
+
+		my $msg;
+		
+		if(scalar(@ssets) == 1){
+
+      #fix here?
+
+		  $msg = "Found unique non-DISPLAYABLE ResultSet:\t".$ssets[0]->name.
+			"\n\tinsert into status select ".$ssets[0]->dbID.
+			  ", 'result_set', status_name_id from status_name where name='DISPLAYABLE';";
+		}
+		else{
+		  $msg = "Found ".scalar(@ssets)." ResultSets ".join("\t", map($_->name, @ssets));
+		}
+
+		$self->report("FAIL:\tThere are no DISPLAYABLE supporting ResultSets for DataSet:\t".
+					  $ra_dset->name."\n$msg");
+
+		
+
+		next; #$ra_fset
+	  }
+
+	  my $ra_rset = $ssets[0];
+	
+	  foreach my $state(@$rset_states){
+	  
+		if(! $ra_rset->has_status($state)){
+		  #Can we just update and warn here?
+		  #Or do this separately in case we want some control over this?
+		  $self->report("WARNING:\tUpdating ResultSet ".$ra_rset->name." with status $state");
+
+		  $sql = 'INSERT into status select '.$ra_rset->dbID.
+			", 'result_set', status_name_id from status_name where name='$state'";
+		  $self->db->dbc->db_handle->do($sql);
+		}
+	  }
+	}
+  }
+
+  return;
+} # End of validate_DataSets
+
+
+
+
+
+
+sub analyse_and_optimise_tables{
+  my $self = shift;
+
+  #myisamchk --analyze. or analyze statement
+
+  if($self->{'skip_analyse'}){
+	$self->log_header('Skipping analyse/optimise tables');
+	return;
+  }
+
+  $self->log_header("Analysing and optimising tables");
+  
+
+  my $sql = 'show tables;';
+  my @tables = @{$self->db->dbc->db_handle->selectall_arrayref($sql)};	  
+  map $_ = "@{$_}", @tables;
+  my  $analyse_sql = 'analyze table ';
+  my $optimise_sql = 'optimize table ';   
+
+
+
+  foreach my $table(@tables){
+	$self->log("Analysing and optimising $table");
+
+
+
+	#Remove analyse as optimise does everything this does
+	my @anal_info = @{$self->db->dbc->db_handle->selectall_arrayref($analyse_sql.$table)};
+
+	foreach my $line_ref(@anal_info){
+	  my $status = $line_ref->[3];
+	  $self->report("FAIL: analyse $table status $status") if (!($status eq 'OK' || $status eq 'Table is already up to date'));
+	}
+
+	my @opt_info = @{$self->db->dbc->db_handle->selectall_arrayref($optimise_sql.$table)};
+
+	foreach my $line_ref(@opt_info){
+	  
+	  my $status = $line_ref->[3];
+	  $self->report("FAIL: optimise $table status $status") if (!( $status eq 'OK' || $status eq 'Table is already up to date'));
+	}
+	
+  }
+
+  return;
+}# end of analyse_and_optimise_tables
+
+
+sub clean_xrefs{
+  my ($self) = @_;
+
+  if($self->{'skip_xrefs'}){
+	$self->log_header('Skipping clean_xrefs');
+	return;
+  }
+
+  $self->log_header("Cleaning unlinked xref records");
+ 
+  my $sql = 'DELETE x FROM xref x LEFT JOIN object_xref ox ON ox.xref_id = x.xref_id WHERE ox.xref_id IS NULL';
+  #Should this also take accoumt of unmapped_objects?
+  #No, as unmapped_object doesn't use xref, but probably should
+
+   my $row_cnt = $self->db->dbc->do($sql);
+
+  $self->reset_table_autoinc('xref', 'xref_id', $self->db);
+  $row_cnt = 0 if $row_cnt eq '0E0';
+  $self->log("Deleted $row_cnt unlinked xref records");
+
+
+  #Now remove old edbs
+  $self->log_header("Cleaning unlinked external_db records");
+
+  #Need to account for xref and unmapped_object here
+  $sql = 'DELETE edb FROM external_db edb '.
+	'LEFT JOIN xref x ON x.external_db_id = edb.external_db_id '.
+	  'LEFT JOIN  unmapped_object uo ON uo.external_db_id=edb.external_db_id '.
+		'WHERE x.external_db_id IS NULL and uo.external_db_id is NULL';
+  $row_cnt = $self->db->dbc->do($sql);
+
+  $self->reset_table_autoinc('external_db', 'external_db_id', $self->db);
+  $row_cnt = 0 if $row_cnt eq '0E0';
+  $self->log("Deleted $row_cnt unlinked external_db records");
+
+
+  #Shouldn't clean orphaned oxs here as this means a rollback been done underneath the ox data
+  #or we have xref_id=0!
+  #Leave this to HC?
+
+  
+
+
+
+  return;
+}
+
+
+1;