diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Importer.pm @ 0:1f6dce3d34e0
Uploaded
author    mahtabm
date      Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Importer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2708 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <ensembl-dev@ebi.ac.uk>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Importer + +=head1 SYNOPSIS + +my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params); +$imp->register_experiment(); + + +=head1 DESCRIPTION + +B<This program> is the main class coordinating import of tiling array design and experimental data. +It utilises several underlying parser classes specific to array vendor or import file type. + +=cut + +################################################################################ + +package Bio::EnsEMBL::Funcgen::Importer; + +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(get_date open_file run_system_cmd); +use Bio::EnsEMBL::Utils::Exception qw( throw deprecate ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Funcgen::Experiment; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::DBSQL::DBAdaptor; + +use File::Path; +use strict; +use vars qw(@ISA); + + +################################################################################ + +=head2 new + + Description : Constructor method + + Arg [1] : hash containing optional attributes: + -name Name of Experiment(dir) + -format of array e.g. Tiled(default) + -vendor name of array vendor + -description of the experiment + -pass DB password + -host DB host + -user DB user + -port DB port + -registry_host Host to load registry from + -registry_port Port for registry host + -registry_user User for registry host + -registry_pass Password for registry user + -ssh Flag to set connection over ssh via forwarded port to localhost (default = 0); remove? + -group name of experimental/research group +` -location of experimental/research group + -contact e/mail address of primary contact for experimental group + -species + -assembly Genome assembly version i.e. 36 for NCBI36 + -recover Recovery flag (default = 0) + -data_dir Root data directory (default = $ENV{'EFG_DATA'}) + -output_dir review these dirs ??????? + -input_dir ????????? + -import_dir ??????? + -norm_dir ?????? + -fasta dump FASTA flag (default =0) + -array_set Flag to treat all chip designs as part of same array (default = 0) + -array_name Name for array set + -array_file Path of array file to import for sanger ENCODE array + -result_set_name Name to give the raw and normalised result sets (default uses experiment and analysis name) + -norm_method Normalisation method (Nimblegen default = VSN_GLOG); + -dbname Override for autogeneration of funcgen dbaname + -reg_config path to local registry config file (default = ~/ensembl.init || undef) + -design_type MGED term (default = binding_site_identification) get from meta/MAGE? + + -farm Flag to submit jobs to farm e.g. 
normalisation jobs + -batch_job Flag to signify that this Importer is running as a prepared batch/farm job + -prepared Flag to signify result files have been previously imported in prepare mode + and file names will differ to those record in InputSubset + + + #-use_defaults This changes some mandatory parameters to optional, instead using either DEFAULT or the input file name for the following options -name, -input_set, -feature_type, -cell_type etc ??? + + -verbose + ReturnType : Bio::EnsEMBL::Funcgen::Importer + Example : my $Exp = Bio::EnsEMBL::Importer->new(%params); + Exceptions : throws if mandatory params are not set or DB connect fails + Caller : General + Status : Medium - potential for %params names to change, remove %attrdata? + +=cut + +################################################################################ + +sub new{ + my ($caller) = shift; + + #my $reg = "Bio::EnsEMBL::Registry"; + my $class = ref($caller) || $caller; + + #$user, $host, $port, $pass, $dbname, $db, $assm_version, $reg_config, $reg_db, $ucsc_coords, $verbose, $release, $reg_host, $reg_port, $reg_user, $reg_pass + #$ftype_name, $ctype_name, + + my ($name, $format, $vendor, $group, $location, $contact, + $array_name, $array_set, $array_file, $data_dir, $result_files, + $exp_date, $desc, $design_type, $output_dir, $input_dir, + $batch_job, $farm, $prepared, + $norm_method, $old_dvd_format, $parser_type) + = rearrange(['NAME', 'FORMAT', 'VENDOR', 'GROUP', 'LOCATION', 'CONTACT', + 'ARRAY_NAME', 'ARRAY_SET', 'ARRAY_FILE', 'DATA_DIR', 'RESULT_FILES', + 'EXPERIMENT_DATE', 'DESCRIPTION', + 'DESIGN_TYPE', 'OUTPUT_DIR', 'INPUT_DIR', #to allow override of defaults + 'BATCH_JOB', 'FARM', 'PREPARED', 'NORM_METHOD', + 'OLD_DVD_FORMAT', 'PARSER'], @_); + + + + #### Define parent parser class based on vendor + throw("Mandatory argument -vendor not defined") if ! defined $vendor; + + #This will override the default Vendor Parser type + #Evals simply protect from messy errors if parser type not found + my $parser_error; + my $vendor_parser = ucfirst(lc($vendor)); + + + #WARNING evaling these parsers to enable pluggability hides errors in parser + #use a perl -MBio::EnsEMBL::Funcgen::Parsers:ParserType to debug + #get rid of all this case guessing and force correct parser name usage? + + + #WARNING + #Dynamic setting of ISA in this way reports the resultant object as Importer, when + #some throws/methods are actually in other base/custom Parsers + #This can seem a little counterintuitive, but allows plugability + #With out the need for separate control scripts + + + #Change this to be set and required/detected in the parse_and_import.pl script + #Then we can have Importer.pm as the base class and get rid of this. + #as well as set_config methods? + + + eval {require "Bio/EnsEMBL/Funcgen/Parsers/${vendor_parser}.pm";}; + + if ($@) { + #Don't warn/throw yet as we might have a standard parser format + + $parser_error .= "There is no valid parser for the vendor your have specified:\t".$vendor. 
+ "\nMaybe this is a typo or maybe you want to specify a default import format using the -parser option\n".$@; + } + + + + if (defined $parser_type) { + + #try normal case first + eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";}; + + if ($@) { + $parser_type = ucfirst(lc($parser_type)); + + #Now eval the new parser + eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";}; + + if ($@) { + + #Might be no default + my $txt = "There is no valid parser for the -parser format your have specified:\t".$parser_type."\n"; + + if (! $parser_error) { + $txt .= "Maybe this is a typo or maybe you want run with the default $vendor_parser parser\n"; + } + + throw($txt.$@); + } + + #warn about over riding vendor parser here + if (! $parser_error) { + #Can't log this as we haven't blessed the Helper yet + warn("WARNING\t::\tYou are over-riding the default ".$vendor." parser with -parser ".$parser_type); + } + } + } else { + throw($parser_error) if $parser_error; + $parser_type = $vendor_parser; + } + + + #we should now really set parser_type as an attrtibute? + unshift @ISA, 'Bio::EnsEMBL::Funcgen::Parsers::'.$parser_type; + #change this to be called explicitly from the load script? + + #### Create object from parent class + + my $self = $class->SUPER::new(@_); + + #### Set vars and test minimum mandatory params for any import type + + $self->{'name'} = $name || throw('Mandatory param -name not met'); #This is not mandatory for array design import + ## $self->{'user'} = $user || $ENV{'EFG_WRITE_USER'}; + $self->vendor(uc($vendor)); #already tested + $self->{'format'} = uc($format) || 'TILED'; #remove default? + $self->group($group) if $group; + $self->location($location) if $location; + $self->contact($contact) if $contact; + ## $species || throw('Mandatory param -species not met'); + $self->array_name($array_name) if $array_name; + $self->array_set($array_set) if $array_set; + $self->array_file($array_file) if $array_file; + $self->{'data_dir'} = $data_dir || $ENV{'EFG_DATA'}; + $self->result_files($result_files)if $result_files; + $self->experiment_date($exp_date) if $exp_date; + $self->description($desc) if $desc; #experiment + ## $self->feature_set_description($fset_desc) if $fset_desc; + + #$assm_version || throw('Mandatory param -assembly not met'); + #Only required if setting DB by params e.g db not passed or generated from reg + #i.e. most of the time + #Why was this made mandatory? + #Default to dnadb||efgdb assm from the dbname + + $self->{'design_type'} = $design_type || 'binding_site_identification'; #remove default? + $self->{'output_dir'} = $output_dir if $output_dir; #config default override + $self->{'input_dir'} = $input_dir if $input_dir; #config default override + $self->farm($farm) if $farm; + $self->batch_job($batch_job); + $self->prepared($prepared); + ## $self->{'ssh'} = $ssh || 0; + ## $self->{'_dump_fasta'} = $fasta || 0; + #$self->{'recover'} = $recover || 0; Now in BaseImporter + #check for ~/.ensembl_init to mirror general EnsEMBL behaviour + ## $self->{'reg_config'} = $reg_config || ((-f "$ENV{'HOME'}/.ensembl_init") ? "$ENV{'HOME'}/.ensembl_init" : undef); + $self->{'old_dvd_format'} = $old_dvd_format || 0; + ## $self->{'ucsc_coords'} = $ucsc_coords || 0; + ## $self->{'verbose'} = $verbose || 0; + ## $self->{'release'} = $release; + + + + ## if($reg_host && $self->{'reg_config'}){ + ## warn "You have specified registry parameters and a config file:\t".$self->{'reg_config'}. 
+ ## "\nOver-riding config file with specified paramters:\t${reg_user}@${reg_host}:$reg_port"; + ## } + + + #Will a general norm method be applicable for all imports? + #Already casued problems with Bed imports... remove? + #Could set NORM_METHOD in Parser!! + #warn "Need to fully implement norm_method is validate_mage, remove ENV NORM_METHOD?"; + $self->{'norm_method'} = $norm_method; # || $ENV{'NORM_METHOD'}; + + #if ($self->vendor ne 'NIMBLEGEN'){ + # $self->{'no_mage'} = 1; + # warn "Hardcoding no_mage for non-NIMBLEGEN imports"; + # } + + + # if($self->{'no_mage'} && $self->{'write_mage'}){ + # throw('-no_mage and -write_mage options are mutually exclusive, please select just one'); + # } + + ## #### Set up DBs and load and reconfig registry + ## + ## ### Load Registry + ## #Can we load the registry using the assembly version, then just redefine the efg DB? + ## #We have problems here if we try and load on a dev version, where no dev DBs are available on ensembldb + ## #Get the latest API version for the assembly we want to use + ## #Then load the registry from that version + ## #Then we can remove some of the dnadb setting code below? + ## #This may cause problems with API schema mismatches + ## #Can we just test whether the current default dnadb contains the assembly? + ## #Problem with this is that this will not have any other data e.g. genes etc + ## #which may be required for some parsers + ## + ## #How does the registry pick up the schema version?? + ## + ## #We should really load the registry first given the dnadb assembly version + ## #Then reset the eFG DB as appropriate + ## + ## + ## if ($reg_host || ! defined $self->{'_reg_config'}) { + ## #defaults to current ensembl DBs + ## $reg_host ||= 'ensembldb.ensembl.org'; + ## $reg_user ||= 'anonymous'; + ## + ## #Default to the most recent port for ensdb + ## if( (! $reg_port) && + ## ($reg_host eq 'ensdb-archive') ){ + ## $reg_port = 5304; + ## } + ## + ## #This will try and load the dev DBs if we are using v49 schema or API? + ## #Need to be mindful about this when developing + ## #we need to tip all this on it's head and load the reg from the dnadb version!!!!!!! + ## + ## my $version_text= ($self->{'release'}) ? 'version '.$self->{'release'} : 'current version'; + ## $self->log("Loading $version_text registry from $reg_user".'@'.$reg_host); + ## + ## #Note this defaults API version, hence running with head code + ## #And not specifying a release version will find not head version + ## #DBs on ensembldb, resulting in an exception from reset_DBAdaptor below + ## $reg->load_registry_from_db( + ## -host => $reg_host, + ## -user => $reg_user, + ## -port => $reg_port, + ## -pass => $reg_pass, + ## #-host => "ens-staging", + ## #-user => 'ensro', + ## -db_version => $self->{'release'},#51 + ## -verbose => $self->verbose, + ## ); + ## + ## throw('Not sensible to set the import DB as the default eFG DB from '.$reg_host.', please define db params') if ((! $dbname) && (! $db)); + ## } + ## else{ + ## $self->log("Loading registry from:\t".$self->{'_reg_config'}); + ## $reg->load_all($self->{'_reg_config'}, 1); + ## } + ## + ## + ## #Need to test the DBs here, as we may not have loaded any! + ## #get_alias wil fail otherwise? + ## #This is a cyclical dependancy as we need alias to get species which we use to grab the DB + ## #alias is dependant on core DB being loaded with relevant meta entries. 
+ ## #revise this when we split the Importer + ## + ## + ## #Validate species + ## my $alias = $reg->get_alias($species) || throw("Could not find valid species alias for $species\nYou might want to clean up:\t".$self->get_dir('output')); + ## $self->species($alias); + ## $self->{'param_species'} = $species;#Only used for dir generation + ## + ## + ## #SET UP DBs + ## if($db){ + ## #db will have been defined before reg loaded, so will be present in reg + ## + ## if(! (ref($db) && $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + ## $self->throw('-db must be a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + ## } + ## } + ## else{ #define eFG DB from params or registry + ## + ## if($reg_db){#load eFG DB from reg + ## + ## if($dbname){ + ## throw("You cannot specify DB params($dbname) and load from the registry at the same time."); + ## } + ## + ## $self->log('WARNING: Loading eFG DB from Registry'); + ## $db = $reg->get_DBAdaptor($self->species(), 'funcgen'); + ## throw("Unable to retrieve ".$self->species." funcgen DB from the registry") if ! $db; + ## } + ## else{#resets the eFG DB in the custom or generic registry + ## + ## $dbname || throw('Must provide a -dbname if not using default custom registry config'); + ## #$user || throw('Must provide a -user parameter');#make this default to EFG_WRITE_USER? + ## $pass || throw('Must provide a -pass parameter'); + ## + ## #remove this and throw? + ## if(! defined $host){ + ## $self->log('WARNING: Defaulting to localhost'); + ## $host = 'localhost'; + ## } + ## + ## $port ||= 3306; + ## my $host_ip = '127.0.0.1';#is this valid for all localhosts? + ## + ## if ($self->{'ssh'}) { + ## $host = `host localhost`; #mac specific? nslookup localhost wont work on server/non-PC + ## #will this always be the same? + ## + ## if (! (exists $ENV{'EFG_HOST_IP'})) { + ## warn "Environment variable EFG_HOST_IP not set for ssh mode, defaulting to $host_ip for $host"; + ## } else { + ## $host_ip = $ENV{'EFG_HOST_IP'}; + ## } + ## + ## if ($self->host() ne 'localhost') { + ## warn "Overriding host ".$self->host()." for ssh connection via localhost($host_ip)"; + ## } + ## } + ## + ## + ## #data version is only used if we don't want to define the dbname + ## #This should never be guessed so don't need data_version here + ## #$dbname ||= $self->species()."_funcgen_".$self->data_version(); + ## + ## + ## #Remove block below when we can + ## my $dbhost = ($self->{'ssh'}) ? $host_ip : $host; + ## + ## #This isn't set yet!? + ## #When we try to load, say 49, when we only have 48 on ensembldb + ## #This fails because there is not DB set for v49, as it is not on ensembl DB + ## #In this case we need to load from the previous version + ## #Trap this and suggest using the -schema_version/release option + ## #Can we autodetect this and reload the registry? + ## #We want to reload the registry anyway with the right version corresponding to the dnadb + ## #We could either test for the db in the registry or just pass the class. + ## + ## $db = $reg->reset_DBAdaptor($self->species(), 'funcgen', $dbname, $dbhost, $port, $self->user, $pass, + ## { + ## -dnadb_host => $reg_host, + ## -dnadb_port => $reg_port, + ## -dnadb_assembly => $assm_version, + ## -dnadb_user => $reg_user, + ## -dnadb_pass => $reg_pass, + ## }); + ## + ## + ## #ConfigRegistry will try and set this + ## #This will fail if there is already one in the registry as it will try + ## #and defined a new unique species so as not to overwrite the original + ## #e.g. 
homo_sapiens1 + ## + ## #This is why it was orignally written backwards as we can't easily dynamically redefine + ## #an adaptor in the registry without ConfigRegistry trying to change the name + ## #the very act of creating a new db to redefine the registry with causes ConfigRegistry + ## #to try and register it with a unique species name + ## + ## #Delete the old funcgen DB from the registry first + ## #$reg->remove_DBAdaptor($self->species, 'funcgen'); + ## + ## #ConfigRegistry will automatically configure this new db + ## + ## #$db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new( + ## # -user => $user, + ## # -host => ($self->{'ssh'}) ? $host_ip : $host, + ## # -port => $port, + ## # -pass => $pass, + ## # #we need to pass dbname else we can use non-standard dbs + ## # -dbname => $dbname, + ## # -species => $self->species(), + ## # -group => 'funcgen', + ## # ); + ## + ## + ## #if we get a species like homo_sapiens1 here + ## #This is because ConfigRegistry is try to make the dbname different between the + ## #one already present and the one you're trying to add + ## } + ## } + ## + ## + ## ### VALIDATE DNADB + ## #This is now done in DBAdaptor + ## + ## #We can change this to just use the assembly version + ## #we could even have the wordy assmelby version from the meta table + ## #do the standard ensembl subs + ## #s/[A-Za-z]//g; + ## #s/\.//g; + ## #And then validate? + ## #Just stick to number version for now. + ## + ## + ## #Now we need to set the dnadb_host params to avoid ensembldb defaults + ## #This should check the registry first + ## #Then load from the registry db? + ## #If we have a multi host registry config file this isn't going to work! + ## + ## #Is this required anymore as the DBAdaptor handles this? + ## #Not if we pass a db with an incorrect dnadb attached. + ## + ## #if($db->_get_schema_build($db->dnadb()) !~ /_[0-9]+_${assm_version}[a-z]*$/){ + ### my $warning = "WARNING: dnadb does not match assembly_version $assm_version. Using ensembldb.enembl.org to define the dnadb"; + ### $warning .= ' rather than the reg_config' if (defined $self->{'_reg_config'}); + ## + ## #We need to account for reg_config DBs which may have custom info in + ## #So try reg_config host first, then try ensembldb with warning + ## #Could have a reg_config only flag for core dbs + ## #Need to implement more params in set_dnadb_by_assembly_version + ### $self->log($warning); + ## + ### $db->set_dnadb_by_assembly_version($assm_version); + ### } + ## + ## + ## + ## + ## #Test connections + ## $self->db($db); + ## $db->dbc->db_handle; + ## $db->dnadb->dbc->db_handle; + ## #Set re/disconnect options + ## + ## #These really need setting dependant on the import parser + ## $db->dbc->disconnect_when_inactive(1); + ## $db->dnadb->dbc->disconnect_when_inactive(1); + + + + ## ### Check analyses/feature_type/cell_type + ## if($feature_analysis){ + ## my $fanal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($feature_analysis); + ## throw("The Feature Analysis $feature_analysis does not exist in the database") if(!$fanal); + ## $self->feature_analysis($fanal); + ## + ## #This currently fails before the config gets loaded! + ## #Need to load config before this validation! 
+ ## } + ## + ## if($ctype_name){ + ## my $ctype = $self->db->get_CellTypeAdaptor->fetch_by_name($ctype_name); + ## throw("The CellType $ctype_name does not exist in the database") if(!$ctype); + ## $self->cell_type($ctype); + ## } + ## + ## if ($ftype_name) { + ## my $ftype = $self->db->get_FeatureTypeAdaptor->fetch_by_name($ftype_name); + ## throw("The FeatureType $ftype_name does not exist in the database") if(!$ftype); + ## $self->feature_type($ftype); + ## } + + + #Set config here instead? + #So we can check all mandatory params + #Set vendor specific attr dependent vars + + #Generic input dir + $self->{'input_dir'} ||= $self->get_dir("data").'/input/'.$self->{'param_species'}.'/'.$self->vendor().'/'.$self->name(); + + if (! -d $self->get_dir('input')) { + + if (@{$self->result_files}) { + #This is really InputSet specific + #Could go in init_experiment_import + $self->log("Processing files:\n\t\t".join("\n\t\t",@{$self->result_files})); + } else { + throw('input_dir is not defined or does not exist ('.$self->get_dir('input').')'); + } + } + + #Parser specific config + $self->set_config(); + + + $self->debug(2, "Importer class instance created."); + $self->debug_hash(3, \$self); + + return ($self); +} + +=head2 registry_host + + Example : my $reg_host = $imp->registry_host; + Description: Accessor for registry host attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_host{ + return $_[0]->{'reg_host'}; +} + +=head2 registry_user + + Example : my $reg_user = $imp->registry_user; + Description: Accessor for registry user attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_user{ + return $_[0]->{'reg_user'}; +} + +=head2 registry_port + + Example : my $reg_port = $imp->registry_port; + Description: Accessor for registry port attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_port{ + return $_[0]->{'reg_port'}; +} + +=head2 registry_pass + + Example : my $reg_pass = $imp->registry_pass; + Description: Accessor for registry pass attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_pass{ + return $_[0]->{'reg_pass'}; +} + + +#init method kept separate from new due to differing madatory check and set up + +=head2 init_array_import + + Example : $self->init_import(); + Description: Initialises import by creating working directories + and by storing the Experiemnt + Returntype : none + Exceptions : warns and throws depending on recover and Experiment status + Caller : general + Status : at risk - merge with register_array_design + +=cut + +sub init_array_import{ + + my ($self) = shift; + + # we need to define which paramters we'll be storing + #use the logic names of the analyses as the field headers + + #need to test for vendor here + + #Sanger, NIMBLEGEN(no design_id issue, could get from the ndf, but we want it in the DesignNotes.txt) + #Then we can change the Array/Chip generation to soley use the DesignNotes.txt rather than SampleKey + #which is experiment specific + #or eFG format. 
+ + $self->create_output_dirs('caches', 'fastas'); + + +} + + +=head2 init_experiment_import + + Example : $self->init_import(); + Description: Initialises import by creating working directories + and by storing the Experiemnt + Returntype : none + Exceptions : warns and throws depending on recover and Experiment status + Caller : general + Status : at risk - merge with register exeriment + +=cut + +sub init_experiment_import{ + my ($self) = shift; + + #Change this to take config mandatory params? + #No specific to exp import + #Name is used in set_config anyway + #Currently we only have array and experiment import, both of which should have names + #Make mandatory? + + foreach my $tmp ("group", "data_dir") { #name now generically mandatory + throw("Mandatory arg $tmp not been defined") if (! defined $self->{$tmp}); + } + #Should we separate path on group here too, so we can have a dev/test group? + + #Create output dirs + #This should be moved to the Parser to avoid generating directories which are needed for different imports + $self->create_output_dirs('raw', 'norm', 'caches', 'fastas'); + throw("No result_files defined.") if (! defined $self->result_files()); + + #Log input files + #if (@{$self->result_files()}) { + # $self->log("Found result files arguments:\n\t".join("\n\t", @{$self->result_files()})); + #} + #This is done in new + + #check for cell||feature and warn if no met file supplied? + + if ($self->norm_method) { + my $norm_anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($self->norm_method); + + #should we list the valid analyses? + throw($self->norm_method.' is not a valid analysis') if ! $norm_anal; + $self->norm_analysis($norm_anal); + } else { + $self->log('WARNING: No normalisation analysis specified'); + } + + #warn "Need to check env vars here or in Parser or just after set_config?"; + #Need generic method for checking ENV vars in Helper + #check for ENV vars? + #R_LIBS + #R_PATH if ! farm + #R_FARM_PATH + + $self->validate_group(); #import experimental_group + + #Get experiment + my $exp_adaptor = $self->db->get_ExperimentAdaptor(); + my $exp = $exp_adaptor->fetch_by_name($self->name()); #, $self->group()); + $self->process_experiment_config if $self->can('process_experiment_config'); #Parsers::MAGE::process_experiment_config + + #Moved MAGE support form here to MAGE.pm + + #Recovery now set so deal with experiment + if ($self->recovery() && ($exp)) { + $self->log("Using previously stored Experiment:\t".$exp->name); + } elsif ((! $self->recovery()) && $exp) { + throw("Your experiment name is already registered in the database, please choose a different \"name\", this will require renaming you input directory, or specify -recover if you are working with a failed/partial import."); + #can we skip this and store, and then check in register experiment if it is already stored then throw if not recovery + } else { # (recover && exp) || (recover && ! 
exp) + + + $exp = Bio::EnsEMBL::Funcgen::Experiment->new( + -EXPERIMENTAL_GROUP => $self->{egroup}, + -NAME => $self->name(), + -DATE => $self->experiment_date(), + -PRIMARY_DESIGN_TYPE => $self->design_type(), + -DESCRIPTION => $self->description(), + -ADAPTOR => $self->db->get_ExperimentAdaptor(), + ); + + ($exp) = @{$exp_adaptor->store($exp)}; + } + + + $self->experiment($exp); + + #remove and add specific report, this is catchig some Root stuff + #$self->log("Initiated efg import with following parameters:\n".Data::Dumper::Dumper(\$self)); + + return; +} + + +#Move this to new or init_experiment + +=head2 validate_group + + Example : $self->validate_group(); + Description: Validates groups details + Returntype : none + Exceptions : throws if insufficient info defined to store new Group and is not already present + Caller : general + Status : Medium - check location and contact i.e. group name clash? + +=cut + +sub validate_group{ + my ($self) = shift; + + my $egroup = $self->db->get_ExperimentalGroupAdaptor->fetch_by_name($self->group); + + if (! defined $egroup) { + + throw("validate_group does not yet fully support ExperimentalGroups, please add manually"); + + #if ($self->location() && $self->contact()) { + # $self->db->import_group($self->group(), $self->location, $self->contact()); + #} else { + # throw("Group ".$self->group()." does not exist, please specify a location and contact to register the group"); + #} + } + + $self->{egroup} = $egroup; + + return; +} + +=head2 create_output_dirs + + Example : $self->create_output_dirs(); + Description: Does what it says on the tin, creates dirs in + the root output dir foreach @dirnames, also set paths in self + Arg [1] : mandatory - list of dir names + Returntype : none + Exceptions : none + Caller : general + Status : Medium - add throw? + +=cut + +sub create_output_dirs{ + my ($self, @dirnames) = @_; + + #output dir created in control script + #avoids errors when logs generated first + + + foreach my $name (@dirnames) { + + if ($name eq 'caches') { + $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/".$self->db->dbc->dbname() if(! defined $self->{"${name}_dir"}); + } elsif ($name eq 'fastas') { + $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/" if(! defined $self->{"${name}_dir"}); + } else { + $self->{"${name}_dir"} = $self->get_dir('output')."/${name}" if(! defined $self->{"${name}_dir"}); + } + + if (! (-d $self->get_dir($name) || (-l $self->get_dir($name)))) { + $self->log("Creating directory:\t".$self->get_dir($name)); + #This did not throw with mkdir!! + mkpath $self->get_dir($name) || throw('Failed to create directory: '. 
$self->get_dir($name)); + chmod 0744, $self->get_dir($name); + } + } + + return; +} + +=head2 vendor + + Example : $imp->vendor("NimbleGen"); + Description: Getter/Setter for array vendor + Arg [1] : optional - vendor name + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub vendor{ + my ($self) = shift; + + if (@_) { + $self->{'vendor'} = shift; + $self->{'vendor'} = uc($self->{'vendor'}); + } + + return $self->{'vendor'}; +} + + +=head2 feature_type + + Example : $imp->feature_type($ftype); + Description: Getter/Setter for Experiment FeatureType + Arg [1] : optional - Bio::EnsEMBL::Funcgen::FeatureType + Returntype : Bio::EnsEMBL::Funcgen::FeatureType + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub feature_type{ + my ($self) = shift; + + if (@_) { + my $ftype = shift; + + #do we need this as we're checking in new? + if (! ($ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType') && $ftype->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::FeatureType"); + } + + $self->{'feature_type'} = $ftype; + } + + return $self->{'feature_type'}; +} + +=head2 feature_analysis + + Example : $imp->feature_analysis($fanal); + Description: Getter/Setter for Analysis used for creating the imported Features + Arg [1] : optional - Bio::EnsEMBL::Analysis + Returntype : Bio::EnsEMBL::Analysis + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub feature_analysis{ + my ($self) = shift; + + if (@_) { + my $fanal = shift; + + #do we need this as we're checking in new? + if (! (ref ($fanal) && $fanal->isa('Bio::EnsEMBL::Analysis') && $fanal->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Analysis"); + } + + $self->{'feature_analysis'} = $fanal; + } + + return $self->{'feature_analysis'}; +} + +=head2 norm_analysis + + Example : $imp->norm_analysis($anal); + Description: Getter/Setter for the normalisation analysis + Arg [1] : optional - Bio::EnsEMBL::Analysis + Returntype : Bio::EnsEMBL::Analysis + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub norm_analysis{ + my ($self) = shift; + + if (@_) { + my $anal = shift; + + #do we need this as we're checking in new? + if (! (ref($anal) && $anal->isa('Bio::EnsEMBL::Analysis') && $anal->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Analysis"); + } + + $self->{'norm_analysis'} = $anal; + } + + return $self->{'norm_analysis'}; +} + + + +=head2 cell_type + + Example : $imp->cell_type($ctype); + Description: Getter/Setter for Experiment CellType + Arg [1] : optional - Bio::EnsEMBL::Funcgen::CellType + Returntype : Bio::EnsEMBL::Funcgen::CellType + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub cell_type{ + my ($self) = shift; + + if (@_) { + my $ctype = shift; + + #do we need this as we're checking in new? + if (! 
($ctype->isa('Bio::EnsEMBL::Funcgen::CellType') && $ctype->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::CellType"); + } + + $self->{'cell_type'} = $ctype; + } + + return $self->{'cell_type'}; +} + + +##=head2 ucsc_coords +## +## Example : $start += 1 if $self->ucsc_coords; +## Description: Getter for UCSC coordinate usage flag +## Returntype : boolean +## Exceptions : none +## Caller : general +## Status : at risk +## +##=cut +## +##sub ucsc_coords{ +## my $self = shift; +## return $self->{'ucsc_coords'}; +##} + + + +=head2 array_file + + Example : my $array_file = $imp->array_file(); + Description: Getter/Setter for sanger/design array file + Arg [1] : optional - path to adf or gff array definition/mapping file + Returntype : string + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub array_file{ + my ($self) = shift; + $self->{'array_file'} = shift if(@_); + return $self->{'array_file'}; +} + +=head2 array_name + + Example : my $array_name = $imp->array_name(); + Description: Getter/Setter for array name + Arg [1] : optional string - name of array + Returntype : string + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub array_name{ + my ($self) = shift; + $self->{'array_name'} = shift if(@_); + return $self->{'array_name'}; +} + + +=head2 array_set + + Example : $imp->array_set(1); + Description: Getter/Setter for array set flag + Arg [1] : optional boolean - treat all array chips as the same array + Returntype : boolean + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub array_set{ + my ($self) = shift; + $self->{'array_set'} = shift if(@_); + return $self->{'array_set'}; +} + + +=head2 add_Array + + Arg [1] : Bio::EnsEMBL::Funcgen::Array + Example : $self->add_Array($array); + Description: Setter for array elements + Returntype : none + Exceptions : throws if passed non Array or if more than one Array set + Caller : Importer + Status : At risk - Implement multiple arrays? Move to Experiment? + +=cut + +sub add_Array{ + my $self = shift; + + #do we need to check if stored? + if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Array')) { + throw("Must supply a Bio::EnsEMBL::Funcgen::Array"); + } elsif (@_) { + push @{$self->{'arrays'}}, @_; + } + + throw("Does not yet support multiple array imports") if(scalar (@{$self->{'arrays'}}) > 1); + #need to alter read_probe data at the very least + + return; +} + + + +=head2 arrays + + Example : foreach my $array(@{$imp->arrays}){ ...do an array of things ...}; + Description: Getter for the arrays attribute + Returntype : ARRAYREF + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub arrays{ + my $self = shift; + + if (! 
defined $self->{'arrays'}) { + $self->{'arrays'} = $self->db->get_ArrayAdaptor->fetch_all_by_Experiment($self->experiment()); + } + + return $self->{'arrays'}; +} + + + +=head2 location + + Example : $imp->vendor("Hinxton"); + Description: Getter/Setter for group location + Arg [1] : optional - location + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub location{ + my ($self) = shift; + $self->{'location'} = shift if(@_); + return $self->{'location'}; +} + + +=head2 contact + + Example : my $contact = $imp->contact(); + Description: Getter/Setter for the group contact + Arg [1] : optional - contact name/email/address + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub contact{ + my ($self) = shift; + $self->{'contact'} = shift if(@_); + return $self->{'contact'}; +} + +=head2 name + + Example : $imp->name('Experiment1'); + Description: Getter/Setter for the experiment name + Arg [1] : optional - experiment name + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub name{ + my ($self) = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + +=head2 result_files + + Example : $imp->result_files(\@files); + Description: Getter/Setter for the result file paths + Arg [1] : Listref of file paths + Returntype : Listref + Exceptions : none + Caller : general + Status : At risk + +=cut + + +sub result_files{ + my ($self) = shift; + $self->{'result_files'} = shift if(@_); + return $self->{'result_files'}; +} + + + + + +=head2 experiment_date + + Example : $imp->experiment_date('2006-11-02'); + Description: Getter/Setter for the experiment date + Arg [1] : optional - date string in yyyy-mm-dd + Returntype : string + Exceptions : none + Caller : general + Status : At risk + +=cut + + + +sub experiment_date{ + my ($self) = shift; + + if (@_) { + my $date = shift; + + if ($date !~ /[0-9]{4}-[0-9]{2}[0-9]{2}/o) { + throw('Parameter -experiment_date needs to fe in the format: YYYY-MM-DD'); + } + + $self->{'experiment_date'} = $date; + } elsif ($self->vendor() eq "nimblegen" && ! defined $self->{'experiment_date'}) { + $self->{'experiment_date'} = &get_date("date", $self->get_config("chip_file")), + } + + return $self->{'experiment_date'}; +} + + + +=head2 group + + Example : my $exp_group = $imp->group(); + Description: Getter/Setter for the group name + Arg [1] : optional - group name + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub group{ + my ($self) = shift; + $self->{'group'} = shift if(@_); + return $self->{'group'}; +} + + +=head2 description + + Example : $imp->description("Human chrX H3 Lys 9 methlyation"); + Description: Getter/Setter for the experiment element + Arg [1] : optional - experiment description + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub description{ + my $self = shift; + + if (@_) { + $self->{'description'} = shift; + } + + return $self->{'description'}; +} + +##=head2 feature_set_description +## +## Example : $imp->description("ExperimentalSet description"); +## Description: Getter/Setter for the FeatureSet description for an +## InputSet import e.g. 
preprocessed GFF/Bed data +## Arg [1] : optional - string feature set description +## Returntype : string +## Exceptions : none +## Caller : general +## Status : At risk +## +##=cut +## +##sub feature_set_description{ +## my $self = shift; +## +## $self->{'feature_set_description'} = shift if @_; +## +## return $self->{'feature_set_description'}; +##} + +=head2 format + + Example : $imp->format("Tiled"); + Description: Getter/Setter for the array format + Arg [1] : optional - array format + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub format{ + my ($self) = shift; + $self->{'format'} = shift if(@_); + return $self->{'format'}; +} + +=head2 experiment + + Example : my $exp = $imp->experiment(); + Description: Getter/Setter for the Experiment element + Arg [1] : optional - Bio::EnsEMBL::Funcgen::Experiment + Returntype : Bio::EnsEMBL::Funcgen::Experiment + Exceptions : throws if arg is not an Experiment + Caller : general + Status : Stable + +=cut + +sub experiment{ + my ($self) = shift; + + if (@_) { + + if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Experiment')) { + throw("Must pass a Bio::EnsEMBL::Funcgen::Experiment object"); + } + + $self->{'experiment'} = shift; + } + + return $self->{'experiment'}; +} + +##=head2 db +## +## Example : $imp->db($funcgen_db); +## Description: Getter/Setter for the db element +## Arg [1] : optional - Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor +## Returntype : Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor +## Exceptions : throws if arg is not an DBAdaptor +## Caller : general +## Status : Stable +## +##=cut +## +##sub db{ +## my $self = shift; +## +## if (defined $_[0] && $_[0]->isa("Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor")) { +## $self->{'db'} = shift; +## } elsif (defined $_[0]) { +## throw("Need to pass a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor"); +## } +## +## return $self->{'db'}; +##} +## +##=head2 pass +## +## Example : $imp->pass("password"); +## Description: Getter/Setter for the db password +## Arg [1] : optional - db password +## Returntype : string +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +## +##sub pass{ +## my $self = shift; +## $self->{'pass'} = shift if(@_); +## return $self->{'pass'}; +##} +## +##=head2 pass +## +## Example : $imp->host("hoastname"); +## Description: Getter/Setter for the db hostname +## Arg [1] : optional - db hostname +## Returntype : string +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +##sub host{ +## my $self = shift; +## $self->{'host'} = shift if(@_); +## return $self->{'host'}; +##} +## +##=head2 port +## +## Example : $imp->port(3306); +## Description: Getter/Setter for the db port number +## Arg [1] : optional - db port number +## Returntype : int +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +##sub port{ +## my $self = shift; +## $self->{'port'} = shift if(@_); +## return $self->{'port'}; +##} +## +##=head2 user +## +## Example : $imp->user("user_name"); +## Description: Getter/Setter for the db user name +## Arg [1] : optional - db user name +## Returntype : string +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +##sub user{ +## my $self = shift; +## $self->{'user'} = shift if(@_); +## return $self->{'user'}; +##} + +##=head2 dump_fasta +## +## Example : if($self->dump_fasta()){...do fasta dump...} +## Description: Getter/Setter for the dump_fasta flag +## Arg [1] : optional - 0 or 1 +## Returntype : boolean +## Exceptions : none +## 
Caller : self +## Status : Stable +## +##=cut +## +## +##sub dump_fasta{ +## my $self = shift; +## $self->{'_dump_fasta'} = shift if @_; +## return $self->{'_dump_fasta'}; +##} +## + + +##=head2 species +## +## Example : $imp->species("homo_sapiens"); +## Description: Getter/Setter for species +## Arg [1] : optional - species name(alias?) +## Returntype : string +## Exceptions : none ? throw if no alias found? +## Caller : general +## Status : Medium - may move reg alias look up to this method +## +##=cut +## +##sub species{ +## my $self = shift; +## +## #should we do reg alias look up here? +## #Will we ever want to redefine species? +## #Change to just getter? +## +## $self->{'species'} = shift if(@_); +## +## return $self->{'species'}; +##} + +=head2 get_dir + + Example : $imp->get_dir("import"); + Description: Retrieves full path for given directory + Arg [1] : mandatory - dir name + Returntype : string + Exceptions : none + Caller : general + Status : at risk - move to Helper? + +=cut + +sub get_dir{ + my ($self, $dirname) = @_; + return $self->get_data("${dirname}_dir"); +} + +=head2 norm_method + + Example : my $norm_method = $imp->norm_method() + Description: Getter/Setter for normalisation method + Arg [1] : mandatory - method name + Returntype : string + Exceptions : none ? throw if no analysis with logic name + Caller : general + Status : At risk - restrict to logic_name and validate against DB, allow multiple + +=cut + +#Move to Nimblegen? +#Do we ever want to normalise other data? + +sub norm_method{ + my $self = shift; + + if (@_) { + $self->{'norm_method'} = shift; + } elsif (! defined $self->{'norm_method'}) { + $self->{'norm_method'}= $self->get_config('norm_method'); + } + + return $self->{'norm_method'}; +} + + +=head2 get_config + + Arg [1] : mandatory - name of the data element to retrieve from the config hash + Example : %dye_freqs = %{$imp->get_config('dye_freqs')}; + Description: returns data from the definitions hash + Returntype : various + Exceptions : none + Caller : Importer + Status : at risk - replace with direct calls in the inherited Defs class? + +=cut + + +sub get_config{ + my ($self, $data_name) = @_; + return $self->get_data('config', $data_name); #will this cause undefs? +} + + + + + +=head2 register_experiment + + Example : $imp->register_experiment() + Description: General control method, performs all data import and normalisations + Arg [1] : optional - dnadb DBAdaptor + Returntype : none + Exceptions : throws if arg is not Bio::EnsEMBL::DBSQL::DBAdaptor + Caller : general + Status : Medium + +=cut + + +#Need to split this method? +#Pre farm process +#Define and store all sets, +#pre process input file once if required +#How are we going to be able to tell wether this has been done successfully? +#runner will catch error, therefore safe to assume that file is complete if farm job +#unless we are not running with farm + +#farm specific processes +#actually parse and store data + +#Can we separate these for different import types? + +sub register_experiment{ + my ($self) = shift; + + #Need to check for dnadb passed with adaptor to contructor + if (@_) { + + if ( ! $_[0]->isa("Bio::EnsEMBL::DBSQL::DBAdaptor")) { + throw("You need to pass a valid dnadb adaptor to register the experiment"); + } + $self->db->dnadb($_[0]); + } elsif ( ! $self->db) { + throw("You need to pass/set a DBAdaptor with a DNADB attached of the relevant data version"); + } + + #This could still be the default core db for the current version + #warn here if not passed DB? 
+ #These should be vendor independent, only the read methods should need specific order? + + $self->init_experiment_import(); + #can we just have init here instead? + + + #This could do with a rewrite to move some things to the parsers + #$self::SUPER->register_experiment + + $self->write_validate_experiment_config if $self->can('write_validate_experiment_config'); + + #This is too array specific! + #Can we have an array of import_methods in the config? + #foreach my $method(@{$self->get_config{'import_methods'}}){ + #$self->method; + #} + #We're already doing this in read_data for each of these data_types + + #Need to be able to run this separately, so we can normalise previously imported sets with different methods + #should be able t do this without raw data files e.g. retrieve info from DB + #Is this implemented? + + $self->read_data("probe"); + $self->read_data("results"); + + + my $norm_method = $self->norm_method(); + + if (defined $norm_method) { + warn "norm method is $norm_method"; + + $self->R_norm($norm_method); + #change this to $self->$norm_method + #so we can have non-R_norm normalisation + } + + + return; +} + +#Move array specific ones to Nimblegen.pm? +#Also used by ArrayDesign and Sanger.pm +#So need to create Array.pm baseclass, which is a Helper. + +=head2 store_set_probes_features + + Arg [1] : mandatory - array chip id + Arg [2] : optional - Bio::EnsEMBL::Funcgen::ProbeSet + Arg [3] : mandatory - hashref of keys probe id, values are + hash of probe/features with values + Bio::EnsEMBL::Funcgen::Probe/Features for a given + probe set if defined. + Example : $self->store_set_probes_features($ac->dbID(), $ops, \%pfs); + Description: Stores probe set, probes and probe features + Returntype : none + Exceptions : none + Caller : self + Status : Medium + +=cut + +sub store_set_probes_features{ + my ($self, $ac_id, $pf_hash, $ops) = @_; + + ### Deal with ProbeSets + if ($ops) { + $ops->size(scalar(keys %$pf_hash)); + ($ops) = $self->db->get_ProbeSetAdaptor->store($ops); + } + + #If we're going to validate fully, we need to check for probes in this probeset on this array chip + #Update size if we have any new probes + #Overkill? Only do on recover? Do not read if array chip is IMPORTED + #This does not make any attempt to validate probes/set vs previously stored data + + for my $probe_id (keys %$pf_hash) { + + #set probeset in probe and store + #the process corresponding feature + my $probe = $pf_hash->{$probe_id}->{'probe'}; + $probe->probeset($ops) if $ops; + ($probe) = @{$self->db->get_ProbeAdaptor->store($probe)}; + + #Can't use get_all_Arrays here as we can't guarantee this will only ever be the array we've generated + #Might dynamically load array if non-present + #This is allowing multiple dbIDs per probe??? Is this wrong? + #$self->cache_probe_info($probe->get_probename(), $probe->dbID());###########Remove as we're now importing all then resolving + + + foreach my $feature (@{$pf_hash->{$probe_id}->{'features'}}) { + $feature->probe($probe); + ($feature) = @{$self->db->get_ProbeFeatureAdaptor->store($feature)}; + } + } + + undef $ops; #Will this persist in the caller? + undef %{$pf_hash}; + + return; +} + + +=head2 cache_slice + + Arg [0] : string - region_name e.g. X + Arg [1] : optional - coordinate system name e.g. 
supercontig, defaults to chromosome + Example : my $slice = $self->cache_slice(12); + Description: Caches or retrieves from cache a given slice + Returntype : Bio::EnsEMBL::Slice + Exceptions : throws f no region name specified + Caller : self + Status : At risk + +=cut + +sub cache_slice{ + my ($self, $region_name, $cs_name, $total_count) = @_; + + throw("Need to define a region_name to cache a slice from") if ! $region_name; + $self->{'slice_cache'} ||= {}; + $region_name =~ s/chr//; + $region_name = "MT" if $region_name eq "M"; + + if (! exists ${$self->{'seen_slice_cache'}}{$region_name}) { + my $slice = $self->slice_adaptor->fetch_by_region($cs_name, $region_name); + + #Set seen cache so we don't try this again + $self->{seen_slice_cache}{$region_name} = $slice; + + if (! $slice) { + warn("-- Could not generate a slice for ${cs_name}:$region_name\n"); + } else { + + my $sr_name = $slice->seq_region_name; #In case we passed a slice name + + if (@{$self->{seq_region_names}}) { + return if ! grep(/^${sr_name}$/, @{$self->{seq_region_names}}); #not on required slice + } + } + + $self->{'slice_cache'}->{$region_name} = $slice; + } + + if ($total_count && exists ${$self->{'seen_slice_cache'}}{$region_name}) { + #This is an InputSet specific method + $self->count('total_features') if $self->can('count'); + } + + #Only return if exists to avoid creating hash key + return (exists $self->{'slice_cache'}->{$region_name}) ? $self->{'slice_cache'}->{$region_name} : undef; +} + +=head2 slice_cache + + Example : my @seen_slices = values(%{$self->slice_cache});; + Description: Returns the slice cache i.e. all the Slices seen in the data filtered + by the defined slices. This method can be used to run only the appropriate + slice jobs after a prepare stage. + Returntype : Hashref of seq_region name Bio::EnsEMBL::Slice pairs + Exceptions : None + Caller : self + Status : At risk + +=cut + + +sub slice_cache{ + my $self = shift; + + return $self->{'slice_cache'}; +} + + + + +=head2 cache_probe_info + + Arg [0] : mandatory - probe name + Arg [1] : mandatory - probe dbID + Arg [2] : optioanl int - x coord of probe on array + Arg [3] : optional int - y coord of probe on array + Example : $self->cache_probe_info("Probe1", $probe->dbID()); + Or for result files which do not have X & Y, we need to cache + X & Y from the design files: $self->cache_probe_info('Probe2', $probe->dbID(), $x, $y); + Description: Setter for probe cache values + Returntype : none + Exceptions : throws is cache conflict encountered + Caller : self + Status : At risk - merge with following? + +=cut + +sub cache_probe_info{ + my ($self, $pname, $pid, $x, $y) = @_; + + throw('Deprecated, too memory expensive, now resolving DB duplicates and using Tied File cache'); + throw("Must provide a probe name and id") if (! defined $pname || ! defined $pid); + + + #do we need to loop through the file here? + #if we make sure we're testing for a previous dbID before storing probes then we don't need to do this + #we can catch the error when we get the probe id as we can check for >1 id for the same probe name + #if (defined $self->{'_probe_cache'}->{$pname} && ($self->{'_probe_cache'}->{$pname}->[0] != $pid)) { + # throw("Found two differing dbIDs for $pname, need to sort out redundant probe entries"); + #} + + $self->{'_probe_cache'}->{$pname} = (defined $x && defined $y) ? 
[$pid, $x, $y] : [$pid]; + + return; +} + + +=head2 get_probe_id_by_name_Array + + Arg [1] : mandatory - probe name + Example : $pid = $self->get_probe_id_by_name($pname); + Description: Getter for probe cache values + Returntype : int + Exceptions : none + Caller : self + Status : At risk - merge with previous, move to importer? + +=cut + +sub get_probe_id_by_name_Array{ + my ($self, $name, $array) = @_; + + #this is only ever called for fully imported ArrayChips, as will be deleted if recovering + $self->resolve_probe_data() if(! exists $self->{'_probe_cache'}{$array->name()}); + + #we want to cycle through the given cache starting from the last position or 0. + #we don't want to have to test for the size of the cache each time as this will be quite expensive + #so we should store sizes separately along with current position + + + my ($pid, $line); + + #check current line + if ($line = $self->{'_probe_cache'}{$array->name()}{'current_line'}) { + if ($line =~ /^\Q${name}\E\t/) { + $pid = (split/\t/o, $line)[1]; + } + } + + + if (! $pid) { + while ($line = $self->{'_probe_cache'}{$array->name()}{'handle'}->getline()) { + + if ($line =~ /^\Q${name}\E\t/) { + $pid = (split/\t/o, $line)[1]; + $self->{'_probe_cache'}{$array->name()}{'current_line'} = $line; + last; + } + } + } + + #do not remove this + if (! $pid) { + throw("Did not find probe name ($name) in cache, cache may need rebuilding, results may need sorting, or do you have an anomolaous probe?") + } else { + chomp $pid; + } + + return $pid; +} + +=head2 get_probe_cache_by_Array + + Arg[1] : Bio::EnsEMBL::Funcgen::Array + Arg[2] : boolean - from db flag, only to be used by Importer->resolve_probe_data ! + Example : $self->get_probe_cache_by_Array(); + Description: Gets the probe info cache which is an array tied to a file + Returntype : Boolean - True if cache has been generated and set successfully + Exceptions : none + Caller : general + Status : At risk + +=cut + +#from db flag should only be used by importer +#this is because there is no guarantee that it will be resolved unless +#called by resolve_probe_data +#which then renames the file and resets the handle +#can we clean this up and protect/hide this functionality? +#can we check the cache file name in the get methods and throw if it contains unresolved? +#make this private? + +sub get_probe_cache_by_Array{ + my ($self, $array, $from_db) = @_; + + my $msg = "Getting probe cache for ".$array->name(); + $msg .= " from DB" if $from_db; + $self->log($msg); #, 1); + + if (! ($array && $array->isa('Bio::EnsEMBL::Funcgen::Array') && $array->dbID())) { + throw('Must provide a valid stored Bio::EnsEMBL::Funcgen::Array object'); + } + + my $set = 0; + my $cache_file = $self->get_dir('caches').'/'.$array->name().'.probe_cache'; + + ### Generate and resolve fresh cache from DB + if ($from_db) { + + $cache_file .= '.unresolved'; #This will be renamed by the caller if it is resolved + + if (exists $self->{'_probe_cache'}{$array->name()}) { + $self->log('Rebuilding probe_cache from DB for '.$array->name(), 1); + + + #untie @{$self->{'_probe_cache'}{$array->name()}{'entries'}}; + #close($self->{'_probe_cache'}{$array->name()}{'handle'});#do we need to do this? + delete $self->{'_probe_cache'}{$array->name()}; #implicitly closes + $self->log('Deleted old cache', 1); + } else { + $self->log('Building probe_cache from DB for '.$array->name(), 1); + } + + #Move this to ProbeAdaptor? 
+ #This is where we'd set the unique key for a vendor and resolves duplicates based on the key + my $cmd = 'SELECT name, probe_id from probe WHERE array_chip_id IN ('.join(',', @{$array->get_array_chip_ids()}).') ORDER by name, probe_id'; + $cmd = 'mysql '.$self->db->connect_string()." -e \"$cmd\" >".$cache_file; + run_system_cmd($cmd); + + } + + ### Set cache + if (-f $cache_file) { + $self->log('MD5 check here?',1); + $self->{'_probe_cache'}{$array->name()}{'current_line'} = undef; + $self->{'_probe_cache'}{$array->name()}{'handle'} = open_file($cache_file); + + #can we do a select count instead? and do this instead of the MD5? + #$cmd = "wc -l $cache_file"; + #my $size = `$cmd`; + + $set = 1; + } else { + warn 'Failed to get probe cache for array:'.$array->name(); + } + + return $set; +} + + +#should reorganise these emthods to split reading the array data, and the actual data +#currently: +#meta reads array and chip data +#probe reads probe_set, probes, which should definitely be in array, probe_feature? and results +#native data format may not map to these methods directly, so may need to call previous method if required data not defined + + +=head2 read_data + + Example : $self->read_data("probe") + Description: Calls each method in data_type array from config hash + Arg [1] : mandatory - data type + Returntype : none + Exceptions : none + Caller : self + Status : At risk + +=cut + +sub read_data{ + my($self, $data_type) = @_; + + map {my $method = "read_${_}_data"; $self->$method()} @{$self->get_config("${data_type}_data")}; + return; +} + + +=head2 design_type + + Example : $self->design_type("binding_site_identification") + Description: Getter/Setter for experimental design type + Arg [1] : optional - design type + Returntype : string + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub design_type{ + my $self = shift; + return $self->{'design_type'}; +} + + +=head2 get_chr_seq_region_id + + Example : $seq_region_id = $self->get_seq_region_id('X'); + Description: Calls each method in data_type array from config hash + Arg [1] : mandatory - chromosome name + Arg [2] : optional - start value + Arg [3] : optional - end value + Returntype : int + Exceptions : none + Caller : self + Status : At risk + +=cut + +#convinience wrapper method +#could we use the seq region cache instead? +#this seems like a lot of overhead for getting the id +sub get_chr_seq_region_id{ + my ($self, $chr, $start, $end) = @_; + #what about strand info? + + #do we need the start and stop? + + #use start and stop to prevent problems with scaffodl assemblies, i.e. >1 seq_region_id + #my $slice = $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end); + #we could pass the slice back to the slice adaptor for this, to avoid dbid problems betwen DBs + + ###would need to implement other cs's here + + return $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end)->get_seq_region_id(); +} + + + +=head2 vsn_norm + + Example : $self->vsn_norm(); + Description: Convinience/Wrapper method for vsn R normalisation + Returntype : none + Exceptions : none + Caller : general + Status : At risk + +=cut + +#Have Norm class or contain methods in importer? +#Need to have analysis set up script for all standard analyses. 
+ +sub vsn_norm{ + my $self = shift; + return $self->R_norm("VSN_GLOG"); +} + + +=head2 farm + + Arg [1] : Boolean + Example : $importer->farm(1); + Description: Flag to turn farm submission on + Returntype : Boolean + Exceptions : Throws is argument not a boolean + Caller : general + Status : At risk + +=cut + + +sub farm{ + my ($self, $farm) = @_; + + $self->{'farm'} ||= undef; #define farm + + if (defined $farm) { + throw("Argument to farm must be a boolean 1 or 0") if(! ($farm == 1 || $farm == 0)); + $self->{'farm'} = $farm; + } + + return $self->{'farm'}; + +} + +=head2 batch_job + + Arg [1] : Boolean + Example : $importer->batch_job(1); + Description: Flag to turn on batch_job status + Returntype : Boolean + Exceptions : Throws is argument not a boolean + Caller : general + Status : At risk + +=cut + + +sub batch_job{ + my ($self, $batch_job) = @_; + + #$self->{'batch_job'} ||= undef; + + if (defined $batch_job) { + throw("Argument to batch_job must be a boolean 1 or 0") if(! ($batch_job == 1 || $batch_job == 0)); + $self->{'batch_job'} = $batch_job; + } + + return $self->{'batch_job'}; + +} + +=head2 prepared + + Arg [1] : Boolean + Example : $importer->prepared(1); + Description: Flag to turn on prepared file status + This signifies that the files have been previously imported + using prepare mode and may not match the InputSubset names + Returntype : Boolean + Exceptions : None + Caller : general + Status : At risk + +=cut + + +sub prepared{ + my ($self, $prepared) = @_; + $self->{'prepared'} = $prepared if (defined $prepared); + return $self->{'prepared'}; +} + + +=head2 R_norm + + Example : $self->R_norm(@logic_names); + Description: Performs R normalisations for given logic names + Returntype : none + Exceptions : Throws if R exits with error code or if data not not valid for analysis + Caller : general + Status : At risk + +=cut + +sub R_norm{ + my ($self, @logic_names) = @_; + #This currently normalises a single two colour chip at a time + #rather than normalising across a set of chip + #also does in sets of analyses + #good for keeping data separate, but not efficient in terms of querying + #convert to use one script which only queries for each echip once, then does each anal + + + my $aa = $self->db->get_AnalysisAdaptor(); + my $rset_adaptor = $self->db->get_ResultSetAdaptor(); + my $ra_id = $aa->fetch_by_logic_name("RawValue")->dbID(); + my %r_config = ( + "VSN_GLOG" => {( libs => ['vsn'], + #norm_method => 'vsn', + )}, + + "T.Biweight" => {( + libs => ['affy'], + #norm_method => 'tukey.biweight', + )}, + ); + + + foreach my $logic_name (@logic_names) { + #throw("Not yet implemented TukeyBiweight") if $logic_name eq "Tukey_Biweight"; + + #this has already been chcecked and set as the norm_analysis + #need to resolve this multi method approach + my $norm_anal = $aa->fetch_by_logic_name($logic_name); + + + + #This only creates an RSet for the IMPORT set + #So if we re-run with a different analysis + #tab2mage will have already been validated + #So RSet generation will be skipped + #We need to recreate the each non-import RSet for this norm analysis + + #This also means the RSets are being created before the data has been imported + #This avoids having to parse tab2mage each time but means we have an uncertain status of these Rsets + + my $rset = $self->get_import_ResultSet($norm_anal, 'experimental_chip'); + + my @chips = (); + + if (! 
$rset) { + $self->log("All ExperimentalChips already have status:\t${logic_name}"); + } else { #Got data to normalise and import + my @dbids; + my $R_file = $self->get_dir("norm")."/${logic_name}.R"; + my $job_name = $self->experiment->name()."_${logic_name}"; + my $resultfile = $self->get_dir("norm")."/result.${logic_name}.txt"; + my $outfile = $self->get_dir("norm")."/${logic_name}.out"; + + #How do we get farm job output i.e. run time memusage + #from interactive job? + #This assumes R_PATH + my $errfile = $self->get_dir("norm")."/${logic_name}.err"; + + #Let's build this better so we capture the farm output aswell as the job output. + my $cmdline = "$ENV{'R_PATH'} --no-save < $R_file"; # >$errfile 2>&1"; + #-K option waits for job to complete + my $bsub = "bsub -K -J $job_name ".$ENV{'R_BSUB_OPTIONS'}. + " -e $errfile -o $outfile $ENV{'R_FARM_PATH'} CMD BATCH $R_file"; + + #Can we separate the out and err for commandline? + my $r_cmd = (! $self->farm()) ? "$cmdline >$outfile 2>&1" : $bsub; + + $self->backup_file($resultfile); #Need to do this as we're appending in the loop + + #setup qurey + #warn "Need to add host and port here"; + #Set up DB, defaults and libs for each logic name + my $query = "options(scipen=20);library(RMySQL);library(Ringo);"; + #scipen is to prevent probe_ids being converted to exponents + #Ringo is for default QC + + #foreach my $ln(@logic_names){ + + foreach my $lib (@{$r_config{$logic_name}{'libs'}}) { + $query .= "library($lib);"; + } + #} + + $query .= "con<-dbConnect(dbDriver(\"MySQL\"), host=\"".$self->db->dbc->host()."\", port=".$self->db->dbc->port().", dbname=\"".$self->db->dbc->dbname()."\", user=\"".$self->db->dbc->username()."\""; + + #should use read only pass here as we are printing this to file + $query .= ", pass=\"".$self->db->dbc->password."\")\n"; + + + #Build queries for each chip + foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { + + + #should implement logic name here? + #can't as we need seperate ResultSet for each + + + if ($echip->has_status($logic_name)) { + $self->log("ExperimentalChip ".$echip->unique_id()." already has status:\t$logic_name"); + } else { + + #warn "Need to roll back here if recovery, as norm import may fail halfway through"; + + push @chips, $echip; + my $cc_id = $rset->get_chip_channel_id($echip->dbID()); + + #if ($self->recovery()){ + # $self->log('Rolling back results for ExperimentalChip('.$echip->dbID().") $logic_name"); + # $self->db->rollback_results($cc_id) if $self->recovery(); + # } + + $self->log("Building $logic_name R cmd for ".$echip->unique_id()); + @dbids = (); + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->type() eq "EXPERIMENTAL") { + push @dbids, $chan->dbID(); + } else { + unshift @dbids, $chan->dbID(); + } + } + + throw("vsn does not accomodate more than 2 channels") if (scalar(@dbids > 2) && $logic_name eq "VSN_GLOG"); + + #should do some of this with maps? 
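+	  #For orientation: the R fragment assembled below for each chip is roughly
+	  #(simplified, with <...> placeholders for the interpolated ids/paths):
+	  #
+	  #  c1<-dbGetQuery(con, 'select ... where c.table_id=<control_channel_id> ...')
+	  #  c2<-dbGetQuery(con, 'select ... where c.table_id=<experimental_channel_id> ...')
+	  #  testRG<-new('RGList', list(R=R, G=G, genes=genes))  #QC: densities, MvA, plate plots -> <chip>_QC.pdf
+	  #  norm_df<-vsn(raw_df)                                #or tukey.biweight adjusted log2 ratios for T.Biweight
+	  #  write.table(formatted_df, file="<resultfile>", ..., append=TRUE)
+	  #
+	  #so every chip appends its normalised scores to the one result file imported below.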
+ #HARDCODED metric ID for raw data as one + + #Need to get total and experimental here and set db_id accordingly + #can probably do this directly into one df + + $query .= "c1<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as CONTROL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[0]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n"; + + $query .= "c2<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as EXPERIMENTAL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[1]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n"; + + #Can we define some of the basic structures here and reuse in the QC and each norm method? + + + #Is this going to eat up memory? + #can we strip out and separate the data from c1 and c2 into RGList and + #individual vector for probe_ids, then rm c1 and c2 to free up memory + + #create RGList object + $query .= "R<-as.matrix(c1['CONTROL_score'])\nG<-as.matrix(c2['EXPERIMENTAL_score'])\n"; + $query .= "genes<-cbind(c1['PROBE_ID'], c1['X'], c1['Y'])\n"; + $query .= "testRG<-new('RGList', list(R=R, G=G, genes=genes))\n"; + + + #QC plots here before doing norm + + #open pdf device + $query .= "pdf('".$self->get_dir('norm').'/'.$echip->unique_id."_QC.pdf', paper='a4', height = 15, width = 9)\n"; + #set format + $query .= "par(mfrow = c(2,2), font.lab = 2)\n"; + + #Channel densisties + #These need limma or Ringo + $query .= "plotDensities(testRG)\n"; + + #MvA Plot + + $query .= 'meanLogA<-((log(testRG$R, base=exp(2)) + log(testRG$G, base=exp(2)))/2)'."\n"; + $query .= 'logIntRatioM<-(log(testRG$R, base=exp(2)) - log(testRG$G, base=exp(2)))'."\n"; + $query .= "yMin<-min(logIntRatioM)\n"; + $query .= "yMax<-max(logIntRatioM)\n"; + + + + #Need to validate yMax here + #If is is Inf then we need to sort the vector and track back until we find the high real number + #count number of Infs and note on MvA plot + $query .= "infCount<-0\n"; + $query .= "if( yMax == Inf){; sortedM<-sort(logIntRatioM); lengthM<-length(logIntRatioM); indexM<-lengthM\n" + ."while (yMax == Inf){; indexM<-(indexM-1); yMax<-sortedM[indexM];}; infCount<-(lengthM-indexM);}\n"; + + # + $query .= "if(infCount == 0){\n"; + $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'")'."\n"; + $query .= "} else {\n"; + $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'", sub=paste(infCount, " Inf values not plotted"));'."}\n"; + + + #$query .= 'plot(log(testRG$R*testRG$G, base=exp(2))/2, log(testRG$R/testRG$G, base=exp(2)),xlab="A",ylab="M",pch=".",ylim=c(-3,3), main="'.$echip->unique_id."\")\n"; + + #Plate plots + $query .= 'image(testRG, 1, channel = "green", mycols = c("black", "green4", "springgreen"))'."\n"; + $query .= 'image(testRG, 1, channel = "red", mycols = c("black", "green4", "springgreen"))'."\n"; + + $query .= "dev.off()\n"; + #Finished QC pdf printing + + + + #The simple preprocess step of Ringo is actually vsn, so we can nest these in line + + + ### Build Analyses cmds ### + + if ($logic_name eq 'T.Biweight') { + + #log2 ratios + $query .= 'lr_df<-cbind((log(c2["EXPERIMENTAL_score"], base=exp(2)) - log(c1["CONTROL_score"], base=exp(2))))'."\n"; + + #Adjust 
using tukey.biweight weighted average + #inherits first col name + $query .= 'norm_df<-(lr_df["EXPERIMENTAL_score"]-tukey.biweight(as.matrix(lr_df)))'."\n"; + $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", norm_df[,1]), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n"; + + } elsif ($logic_name eq 'VSN_GLOG') { + #could do this directly + $query .= "raw_df<-cbind(c1[\"CONTROL_score\"], c2[\"EXPERIMENTAL_score\"])\n"; + #variance stabilise + $query .= "norm_df<-vsn(raw_df)\n"; + + + #do some more calcs here and print report? + #fold change exponentiate? See VSN docs + #should do someplot's of raw and glog and save here? + #set log func and params + #$query .= "par(mfrow = c(1, 2)); log.na = function(x) log(ifelse(x > 0, x, NA));"; + #plot + #$query .= "plot(exprs(glog_df), main = \"vsn\", pch = \".\");". + # "plot(log.na(exprs(raw_df)), main = \"raw\", pch = \".\");"; + #FAILS ON RAW PLOT!! + #par(mfrow = c(1, 2)) + #> meanSdPlot(nkid, ranks = TRUE) + #> meanSdPlot(nkid, ranks = FALSE) + + + #Now create table structure with glog values(diffs) + #3 sig dec places on scores(doesn't work?!) + $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", (exprs(norm_df[,2]) - exprs(norm_df[,1]))), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n"; + + } + #load back into DB + #c3results<-cbind(rep("", length(c3["probe_id"])), c3["probe_id"], c3["c3_score"], rep(1, length(c3["probe_id"])), rep(1, length(c3["probe_id"]))) + #may want to use safe.write here + #dbWriteTable(con, "result", c3results, append=TRUE) + #dbWriteTable returns true but does not load any data into table!!! + + $query .= "write.table(formatted_df, file=\"${resultfile}\", sep=\"\\t\", col.names=FALSE, row.names=FALSE, quote=FALSE, append=TRUE)\n"; + + #tidy up here?? + } + } + + $query .= "q();"; + + open(RFILE, ">$R_file") || throw("Cannot open $R_file for writing"); + print RFILE $query; + close(RFILE); + + my $submit_text = "Submitting $logic_name job"; + $submit_text .= ' to farm' if $self->farm; + $self->log("${submit_text}:\t".localtime()); + run_system_cmd($r_cmd); + $self->log("Finished $logic_name job:\t".localtime()); + $self->log('See '.$self->get_dir('norm').' for ExperimentalChip QC files'); + + #Now load file and update status + #Import directly here to avoid having to reparse all results if we crash!!!! + $self->log("Importing:\t$resultfile"); + $self->db->load_table_data("result", $resultfile); + $self->log("Finishing importing:\t$resultfile"); + + + foreach my $echip (@chips) { + $echip->adaptor->store_status($logic_name, $echip); + } + + #Recreate all non-import RSets for analysis if not already present + # + + my $rset_a = $self->db->get_ResultSetAdaptor(); + my %seen_rsets; + + foreach my $anal_rset (@{$rset_a->fetch_all_by_Experiment($self->experiment)}) { + next if($anal_rset->name =~ /_IMPORT$/o); + next if(exists $seen_rsets{$anal_rset->name}); + next if $anal_rset->analysis->logic_name eq $norm_anal->logic_name; + $seen_rsets{$rset->name} = 1; + $anal_rset->analysis($norm_anal); + $anal_rset->{'dbID'} = undef; + $anal_rset->{'adaptor'} = undef; + + #add the chip_channel_ids from the new anal IMPORT set + foreach my $table_id (@{$anal_rset->table_ids}) { + $anal_rset->{'table_id_hash'}{$table_id} = $rset->get_chip_channel_id($table_id); + } + + $self->log('Adding new ResultSet '.$anal_rset->name.' 
with analysis '.$norm_anal->logic_name); + $rset_a->store($anal_rset); + } + + + + } + } + + return; +} + +#can we sub this? args: table_name, logic_name +#also use result_set_name +#would also clean all data for result set if recovery +#return would be result_set +#Can we extend this to incorporate InputSet parser define_sets? + +sub get_import_ResultSet{ + my ($self, $anal, $table_name) = @_; + + if (!($anal && $anal->isa("Bio::EnsEMBL::Analysis") && $anal->dbID())) { + throw("Must provide a valid stored Bio::EnsEMBL::Analysis"); + } + + $self->log("Getting import $table_name ResultSet for analysis:\t".$anal->logic_name()); + + my ($rset, @new_chip_channels); + my $result_adaptor = $self->db->get_ResultSetAdaptor(); + my $logic_name = $anal->logic_name; + my $status = ($logic_name eq "RawValue") ? "IMPORTED" : $logic_name; + + if (($logic_name) eq 'RawValue' && ($table_name eq 'experimental_chip')) { + throw("Cannot have an ExperimentalChip ResultSet with a RawValue analysis, either specify 'channel' or another analysis"); + } + + #Build IMPORT Set for $table_name + foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { + + #clean chip import and generate rset + + if ($table_name eq 'experimental_chip') { + + if ($echip->has_status($status)) { #this translates to each channel have the IMPORTED_RawValue status + $self->log("ExperimentalChip(".$echip->unique_id().") already has status:\t".$status); + } else { + $self->log("Found ExperimentalChip(".$echip->unique_id().") without status $status"); + + push @new_chip_channels, $echip; + } + + } else { #channel + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->has_status($status)) { #this translates to each channel have the IMPORTED_RawValue status + $self->log("Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") already has status:\t".$status); + } else { + $self->log("Found Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") without status $status"); + push @new_chip_channels, $chan; + } + } + } + + if (( ! $rset) && @new_chip_channels) { + my(@tmp) = @{$result_adaptor->fetch_all_by_name_Analysis($self->name()."_IMPORT", $anal)}; + + if (scalar(@tmp) > 1) { + throw('Found more than one IMPORT ResultSet for '.$self->name().'_IMPORT with analysis '.$logic_name); + } + + $rset = shift @tmp; + + + #do we need to throw here if not recovery? + #what if we want the import result set elsewhere during the first import? + + #if ($self->recovery()) { + + #fetch by anal and experiment_id + #Need to change this to result_set.name! + # warn("add chip set handling here"); + + #my @tmp = @{$result_adaptor->fetch_all_by_Experiment_Analysis($self->experiment(), $anal)}; + #throw("Found more than one ResultSet for Experiment:\t".$self->experiment->name()."\tAnalysis:\t".$anal->logic_name().')' if (scalar(@tmp) >1); + #$rset = $tmp[0]; + + #warn "fetching rset with ".$self->name()."_IMPORT ". $anal->logic_name; + + #$rset = $result_adaptor->fetch_by_name_Analysis($self->name()."_IMPORT", $anal); + warn("Warning: Could not find recovery ResultSet for analysis ".$logic_name) if ! $rset; + #} + + if (! 
$rset) { + $self->log("Generating new ResultSet for analysis ".$logic_name); + + $rset = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -analysis => $anal, + -table_name => $table_name, + -name => $self->name()."_IMPORT", + -feature_type => $self->feature_type(), + -cell_type => $self->cell_type(), + ); + + #These types should be set to NULL during the MAGE-XML validation if we have more than one type in an experiment + + ($rset) = @{$result_adaptor->store($rset)}; + } + } + } + + #do we need this here as we're rolling back in the read methods? + #we only want to roll back those chips/channels which have not been registered + + if ($self->recovery()) { + + my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor(); + + foreach my $cc (@new_chip_channels) { + + #only roll back if already part of import set + #Not previously registered if not + if ($rset->contains($cc) && $rset->get_chip_channel_id($cc->dbID())) { + + if ($table_name eq 'channel') { + my $chan_name = $ec_adaptor->fetch_by_dbID($cc->experimental_chip_id())->unique_id()."_". + $self->get_config('dye_freqs')->{$cc->dye()}; + $self->log("Rolling back results for $table_name:\t".$chan_name); + + } else { + $self->log("Rolling back results for $table_name:\t".$cc->unique_id); + } + + $self->rollback_results([$rset->get_chip_channel_id($cc->dbID())]); + } + } + } + + + #check whether it is present in the ResultSet and add if not + if ($rset) { + #ids will already be present if not rset i.e. already imported + + foreach my $cc (@new_chip_channels) { + $rset->add_table_id($cc->dbID()) if(! $rset->contains($cc)); + } + } + + + + if ($rset) { + $result_adaptor->store_chip_channels($rset); + } else { + $self->log("All ExperimentalChips have status:\t$status"); + } + + #this only returns a result set if there is new data to import + return $rset; +} + + + +=head2 resolve_probe_data + + Example : $self->resolve_probe_data(); + Description: Resolves DB probe duplicates and builds local probe cache + Returntype : none + Exceptions : ???? + Caller : general + Status : At risk + +=cut + +sub resolve_probe_data{ + my $self = shift; + + $self->log("Resolving probe data", 1); + + warn "Probe cache resolution needs to accomodate probesets too!"; + + foreach my $array (@{$self->arrays()}) { + my $resolve = 0; + + if ($self->get_probe_cache_by_Array($array)) { #cache already generated + + #check if we have any new unresolved array chips to add to the cache + foreach my $achip (@{$array->get_ArrayChips()}) { + + if ($achip->has_status('RESOLVED')) { + $self->log("ArrayChip has RESOLVED status:\t".$achip->design_id()); #, 1); + next; + } else { + $self->log("Found un-RESOLVED ArrayChip:\t".$achip->design_id()); + $resolve = 1; + last; + } + } + } else { #no cache file + $resolve = 1; + $self->log('No probe cache found for array '.$array->name()); + } + + if ($resolve) { + $self->log('Resolving array duplicates('.$array->name().') and rebuilding probe cache.', 1); + $self->get_probe_cache_by_Array($array, 1); #get from DB + + #we need ot make sure we mark cache as unresolved, so we don't use it by mistake. 
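+      #Walk-through of the resolution loop below, with invented names/ids.
+      #Given an unresolved cache (sorted by name, probe_id) containing:
+      #
+      #  probeA  100
+      #  probeA  101
+      #  probeB  102
+      #
+      #@pids accumulates (100, 101) while the name is unchanged; on reaching probeB,
+      #tidy_duplicates([100, 101]) reassigns probe_features from 101 to 100, or
+      #deletes them where 100 already has a feature at the same locus. The duplicate
+      #cache line itself is not removed, but name lookups always return the first
+      #(lowest) probe_id, which is the one the features are kept on.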
+ + + + my ($line, $name, $pid, @pids); + #my $index = 0; + my $tmp_name = ''; + my $tmp_id = ''; + + #miss the header + + while ($line = $self->{'_probe_cache'}{$array->name}{'handle'}->getline()) { + ($name, $pid) = split/\t/o, $line; + + if ($name eq $tmp_name) { + + if ($pid != $tmp_id) { + push @pids, $pid; + #should reset to pid here if we have x y data else undef + #ignore this and force result to have x y + } + + #can't do this naymore unless we figure out how to move the line pointer + #would still need to sed the file anyway, better to regen from DB? + #undef $self->{'_probe_cache'}{$array->name}{'entries'}->[$i];#delete true or to be resolved duplicate + } elsif ($name ne $tmp_name) { #new probe + $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1); + $tmp_name = $name; + $tmp_id = $pid; + @pids = ($pid); + #$index = $i + 1; + } + } + + $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1); + + #rename resovled cache and reset cache handle + my $cmd = 'mv '.$self->get_dir('caches').'/'.$array->name().'.probe_cache.unresolved '. + $self->get_dir('caches').'/'.$array->name().'.probe_cache'; + + run_system_cmd($cmd); + $self->get_probe_cache_by_Array($array); #This sets the caches + + + #warn "Only generate MD5 here, as this is guranteed to be correct"; + + foreach my $achip (@{$array->get_ArrayChips()}) { + + if (! $achip->has_status('RESOLVED')) { + $self->log("Updating ArrayChip to RESOLVED status:\t".$achip->design_id()); + $achip->adaptor->store_status('RESOLVED', $achip); + } + } + + $self->log('Finished building probe cache for '.$array->name(), 1); + } + } + + $self->log('Finished resolving probe data', 1); + + return; +} + + +sub tidy_duplicates{ + my ($self, $pids) = @_; + + my $pfa = $self->db->get_ProbeFeatureAdaptor(); + my ($feature, %features); + + foreach my $dup_id (@$pids) { + + foreach $feature(@{$pfa->fetch_all_by_Probe_id($dup_id)}) { + #can we safely assume end will be same too? + push @{$features{$feature->seq_region_name().':'.$feature->start()}}, $feature; + } + } + + my (@reassign_ids, @delete_ids); + + foreach my $seq_start_key (keys %features) { + my $reassign_features = 1; + + foreach $feature(@{$features{$seq_start_key}}) { + + if ($feature->probe_id() == $pids->[0]) { + $reassign_features = 0; + } else { + push @delete_ids, $feature->dbID(); + } + } + + #This assumes that we actually have at least one element to every seq_start_key array + if ($reassign_features) { + my $new_fid = pop @delete_ids; + push @reassign_ids, $new_fid; + } + } + + #resolve features first so we don't get any orphaned features if we crash. + $pfa->reassign_features_to_probe(\@reassign_ids, $pids->[0]) if @reassign_ids; + $pfa->delete_features(\@delete_ids) if @delete_ids; + + return; +} + +1;
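+
+#Minimal usage sketch of the normalisation/cache entry points above, assuming an
+#Importer object ($imp) has already been constructed and configured elsewhere:
+#
+#  $imp->farm(1);               #submit the R job via bsub instead of running it locally
+#  $imp->vsn_norm();            #convenience wrapper for $imp->R_norm('VSN_GLOG')
+#  $imp->R_norm('T.Biweight');  #Tukey biweight log2-ratio normalisation
+#  $imp->resolve_probe_data();  #build/refresh probe caches and resolve duplicate probes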