diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Importer.pm @ 0:1f6dce3d34e0
Uploaded
author    mahtabm
date      Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Importer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2708 @@ + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <ensembl-dev@ebi.ac.uk>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Funcgen::Importer + +=head1 SYNOPSIS + +my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params); +$imp->register_experiment(); + + +=head1 DESCRIPTION + +B<This program> is the main class coordinating import of tiling array design and experimental data. +It utilises several underlying parser classes specific to array vendor or import file type. + +=cut + +################################################################################ + +package Bio::EnsEMBL::Funcgen::Importer; + +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(get_date open_file run_system_cmd); +use Bio::EnsEMBL::Utils::Exception qw( throw deprecate ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Funcgen::Experiment; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::DBSQL::DBAdaptor; + +use File::Path; +use strict; +use vars qw(@ISA); + + +################################################################################ + +=head2 new + + Description : Constructor method + + Arg [1] : hash containing optional attributes: + -name Name of Experiment(dir) + -format of array e.g. Tiled(default) + -vendor name of array vendor + -description of the experiment + -pass DB password + -host DB host + -user DB user + -port DB port + -registry_host Host to load registry from + -registry_port Port for registry host + -registry_user User for registry host + -registry_pass Password for registry user + -ssh Flag to set connection over ssh via forwarded port to localhost (default = 0); remove? + -group name of experimental/research group +` -location of experimental/research group + -contact e/mail address of primary contact for experimental group + -species + -assembly Genome assembly version i.e. 36 for NCBI36 + -recover Recovery flag (default = 0) + -data_dir Root data directory (default = $ENV{'EFG_DATA'}) + -output_dir review these dirs ??????? + -input_dir ????????? + -import_dir ??????? + -norm_dir ?????? + -fasta dump FASTA flag (default =0) + -array_set Flag to treat all chip designs as part of same array (default = 0) + -array_name Name for array set + -array_file Path of array file to import for sanger ENCODE array + -result_set_name Name to give the raw and normalised result sets (default uses experiment and analysis name) + -norm_method Normalisation method (Nimblegen default = VSN_GLOG); + -dbname Override for autogeneration of funcgen dbaname + -reg_config path to local registry config file (default = ~/ensembl.init || undef) + -design_type MGED term (default = binding_site_identification) get from meta/MAGE? + + -farm Flag to submit jobs to farm e.g. 
normalisation jobs + -batch_job Flag to signify that this Importer is running as a prepared batch/farm job + -prepared Flag to signify result files have been previously imported in prepare mode + and file names will differ to those record in InputSubset + + + #-use_defaults This changes some mandatory parameters to optional, instead using either DEFAULT or the input file name for the following options -name, -input_set, -feature_type, -cell_type etc ??? + + -verbose + ReturnType : Bio::EnsEMBL::Funcgen::Importer + Example : my $Exp = Bio::EnsEMBL::Importer->new(%params); + Exceptions : throws if mandatory params are not set or DB connect fails + Caller : General + Status : Medium - potential for %params names to change, remove %attrdata? + +=cut + +################################################################################ + +sub new{ + my ($caller) = shift; + + #my $reg = "Bio::EnsEMBL::Registry"; + my $class = ref($caller) || $caller; + + #$user, $host, $port, $pass, $dbname, $db, $assm_version, $reg_config, $reg_db, $ucsc_coords, $verbose, $release, $reg_host, $reg_port, $reg_user, $reg_pass + #$ftype_name, $ctype_name, + + my ($name, $format, $vendor, $group, $location, $contact, + $array_name, $array_set, $array_file, $data_dir, $result_files, + $exp_date, $desc, $design_type, $output_dir, $input_dir, + $batch_job, $farm, $prepared, + $norm_method, $old_dvd_format, $parser_type) + = rearrange(['NAME', 'FORMAT', 'VENDOR', 'GROUP', 'LOCATION', 'CONTACT', + 'ARRAY_NAME', 'ARRAY_SET', 'ARRAY_FILE', 'DATA_DIR', 'RESULT_FILES', + 'EXPERIMENT_DATE', 'DESCRIPTION', + 'DESIGN_TYPE', 'OUTPUT_DIR', 'INPUT_DIR', #to allow override of defaults + 'BATCH_JOB', 'FARM', 'PREPARED', 'NORM_METHOD', + 'OLD_DVD_FORMAT', 'PARSER'], @_); + + + + #### Define parent parser class based on vendor + throw("Mandatory argument -vendor not defined") if ! defined $vendor; + + #This will override the default Vendor Parser type + #Evals simply protect from messy errors if parser type not found + my $parser_error; + my $vendor_parser = ucfirst(lc($vendor)); + + + #WARNING evaling these parsers to enable pluggability hides errors in parser + #use a perl -MBio::EnsEMBL::Funcgen::Parsers:ParserType to debug + #get rid of all this case guessing and force correct parser name usage? + + + #WARNING + #Dynamic setting of ISA in this way reports the resultant object as Importer, when + #some throws/methods are actually in other base/custom Parsers + #This can seem a little counterintuitive, but allows plugability + #With out the need for separate control scripts + + + #Change this to be set and required/detected in the parse_and_import.pl script + #Then we can have Importer.pm as the base class and get rid of this. + #as well as set_config methods? + + + eval {require "Bio/EnsEMBL/Funcgen/Parsers/${vendor_parser}.pm";}; + + if ($@) { + #Don't warn/throw yet as we might have a standard parser format + + $parser_error .= "There is no valid parser for the vendor your have specified:\t".$vendor. 
+ "\nMaybe this is a typo or maybe you want to specify a default import format using the -parser option\n".$@; + } + + + + if (defined $parser_type) { + + #try normal case first + eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";}; + + if ($@) { + $parser_type = ucfirst(lc($parser_type)); + + #Now eval the new parser + eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";}; + + if ($@) { + + #Might be no default + my $txt = "There is no valid parser for the -parser format your have specified:\t".$parser_type."\n"; + + if (! $parser_error) { + $txt .= "Maybe this is a typo or maybe you want run with the default $vendor_parser parser\n"; + } + + throw($txt.$@); + } + + #warn about over riding vendor parser here + if (! $parser_error) { + #Can't log this as we haven't blessed the Helper yet + warn("WARNING\t::\tYou are over-riding the default ".$vendor." parser with -parser ".$parser_type); + } + } + } else { + throw($parser_error) if $parser_error; + $parser_type = $vendor_parser; + } + + + #we should now really set parser_type as an attrtibute? + unshift @ISA, 'Bio::EnsEMBL::Funcgen::Parsers::'.$parser_type; + #change this to be called explicitly from the load script? + + #### Create object from parent class + + my $self = $class->SUPER::new(@_); + + #### Set vars and test minimum mandatory params for any import type + + $self->{'name'} = $name || throw('Mandatory param -name not met'); #This is not mandatory for array design import + ## $self->{'user'} = $user || $ENV{'EFG_WRITE_USER'}; + $self->vendor(uc($vendor)); #already tested + $self->{'format'} = uc($format) || 'TILED'; #remove default? + $self->group($group) if $group; + $self->location($location) if $location; + $self->contact($contact) if $contact; + ## $species || throw('Mandatory param -species not met'); + $self->array_name($array_name) if $array_name; + $self->array_set($array_set) if $array_set; + $self->array_file($array_file) if $array_file; + $self->{'data_dir'} = $data_dir || $ENV{'EFG_DATA'}; + $self->result_files($result_files)if $result_files; + $self->experiment_date($exp_date) if $exp_date; + $self->description($desc) if $desc; #experiment + ## $self->feature_set_description($fset_desc) if $fset_desc; + + #$assm_version || throw('Mandatory param -assembly not met'); + #Only required if setting DB by params e.g db not passed or generated from reg + #i.e. most of the time + #Why was this made mandatory? + #Default to dnadb||efgdb assm from the dbname + + $self->{'design_type'} = $design_type || 'binding_site_identification'; #remove default? + $self->{'output_dir'} = $output_dir if $output_dir; #config default override + $self->{'input_dir'} = $input_dir if $input_dir; #config default override + $self->farm($farm) if $farm; + $self->batch_job($batch_job); + $self->prepared($prepared); + ## $self->{'ssh'} = $ssh || 0; + ## $self->{'_dump_fasta'} = $fasta || 0; + #$self->{'recover'} = $recover || 0; Now in BaseImporter + #check for ~/.ensembl_init to mirror general EnsEMBL behaviour + ## $self->{'reg_config'} = $reg_config || ((-f "$ENV{'HOME'}/.ensembl_init") ? "$ENV{'HOME'}/.ensembl_init" : undef); + $self->{'old_dvd_format'} = $old_dvd_format || 0; + ## $self->{'ucsc_coords'} = $ucsc_coords || 0; + ## $self->{'verbose'} = $verbose || 0; + ## $self->{'release'} = $release; + + + + ## if($reg_host && $self->{'reg_config'}){ + ## warn "You have specified registry parameters and a config file:\t".$self->{'reg_config'}. 
+ ## "\nOver-riding config file with specified paramters:\t${reg_user}@${reg_host}:$reg_port"; + ## } + + + #Will a general norm method be applicable for all imports? + #Already casued problems with Bed imports... remove? + #Could set NORM_METHOD in Parser!! + #warn "Need to fully implement norm_method is validate_mage, remove ENV NORM_METHOD?"; + $self->{'norm_method'} = $norm_method; # || $ENV{'NORM_METHOD'}; + + #if ($self->vendor ne 'NIMBLEGEN'){ + # $self->{'no_mage'} = 1; + # warn "Hardcoding no_mage for non-NIMBLEGEN imports"; + # } + + + # if($self->{'no_mage'} && $self->{'write_mage'}){ + # throw('-no_mage and -write_mage options are mutually exclusive, please select just one'); + # } + + ## #### Set up DBs and load and reconfig registry + ## + ## ### Load Registry + ## #Can we load the registry using the assembly version, then just redefine the efg DB? + ## #We have problems here if we try and load on a dev version, where no dev DBs are available on ensembldb + ## #Get the latest API version for the assembly we want to use + ## #Then load the registry from that version + ## #Then we can remove some of the dnadb setting code below? + ## #This may cause problems with API schema mismatches + ## #Can we just test whether the current default dnadb contains the assembly? + ## #Problem with this is that this will not have any other data e.g. genes etc + ## #which may be required for some parsers + ## + ## #How does the registry pick up the schema version?? + ## + ## #We should really load the registry first given the dnadb assembly version + ## #Then reset the eFG DB as appropriate + ## + ## + ## if ($reg_host || ! defined $self->{'_reg_config'}) { + ## #defaults to current ensembl DBs + ## $reg_host ||= 'ensembldb.ensembl.org'; + ## $reg_user ||= 'anonymous'; + ## + ## #Default to the most recent port for ensdb + ## if( (! $reg_port) && + ## ($reg_host eq 'ensdb-archive') ){ + ## $reg_port = 5304; + ## } + ## + ## #This will try and load the dev DBs if we are using v49 schema or API? + ## #Need to be mindful about this when developing + ## #we need to tip all this on it's head and load the reg from the dnadb version!!!!!!! + ## + ## my $version_text= ($self->{'release'}) ? 'version '.$self->{'release'} : 'current version'; + ## $self->log("Loading $version_text registry from $reg_user".'@'.$reg_host); + ## + ## #Note this defaults API version, hence running with head code + ## #And not specifying a release version will find not head version + ## #DBs on ensembldb, resulting in an exception from reset_DBAdaptor below + ## $reg->load_registry_from_db( + ## -host => $reg_host, + ## -user => $reg_user, + ## -port => $reg_port, + ## -pass => $reg_pass, + ## #-host => "ens-staging", + ## #-user => 'ensro', + ## -db_version => $self->{'release'},#51 + ## -verbose => $self->verbose, + ## ); + ## + ## throw('Not sensible to set the import DB as the default eFG DB from '.$reg_host.', please define db params') if ((! $dbname) && (! $db)); + ## } + ## else{ + ## $self->log("Loading registry from:\t".$self->{'_reg_config'}); + ## $reg->load_all($self->{'_reg_config'}, 1); + ## } + ## + ## + ## #Need to test the DBs here, as we may not have loaded any! + ## #get_alias wil fail otherwise? + ## #This is a cyclical dependancy as we need alias to get species which we use to grab the DB + ## #alias is dependant on core DB being loaded with relevant meta entries. 
+ ## #revise this when we split the Importer + ## + ## + ## #Validate species + ## my $alias = $reg->get_alias($species) || throw("Could not find valid species alias for $species\nYou might want to clean up:\t".$self->get_dir('output')); + ## $self->species($alias); + ## $self->{'param_species'} = $species;#Only used for dir generation + ## + ## + ## #SET UP DBs + ## if($db){ + ## #db will have been defined before reg loaded, so will be present in reg + ## + ## if(! (ref($db) && $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ + ## $self->throw('-db must be a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); + ## } + ## } + ## else{ #define eFG DB from params or registry + ## + ## if($reg_db){#load eFG DB from reg + ## + ## if($dbname){ + ## throw("You cannot specify DB params($dbname) and load from the registry at the same time."); + ## } + ## + ## $self->log('WARNING: Loading eFG DB from Registry'); + ## $db = $reg->get_DBAdaptor($self->species(), 'funcgen'); + ## throw("Unable to retrieve ".$self->species." funcgen DB from the registry") if ! $db; + ## } + ## else{#resets the eFG DB in the custom or generic registry + ## + ## $dbname || throw('Must provide a -dbname if not using default custom registry config'); + ## #$user || throw('Must provide a -user parameter');#make this default to EFG_WRITE_USER? + ## $pass || throw('Must provide a -pass parameter'); + ## + ## #remove this and throw? + ## if(! defined $host){ + ## $self->log('WARNING: Defaulting to localhost'); + ## $host = 'localhost'; + ## } + ## + ## $port ||= 3306; + ## my $host_ip = '127.0.0.1';#is this valid for all localhosts? + ## + ## if ($self->{'ssh'}) { + ## $host = `host localhost`; #mac specific? nslookup localhost wont work on server/non-PC + ## #will this always be the same? + ## + ## if (! (exists $ENV{'EFG_HOST_IP'})) { + ## warn "Environment variable EFG_HOST_IP not set for ssh mode, defaulting to $host_ip for $host"; + ## } else { + ## $host_ip = $ENV{'EFG_HOST_IP'}; + ## } + ## + ## if ($self->host() ne 'localhost') { + ## warn "Overriding host ".$self->host()." for ssh connection via localhost($host_ip)"; + ## } + ## } + ## + ## + ## #data version is only used if we don't want to define the dbname + ## #This should never be guessed so don't need data_version here + ## #$dbname ||= $self->species()."_funcgen_".$self->data_version(); + ## + ## + ## #Remove block below when we can + ## my $dbhost = ($self->{'ssh'}) ? $host_ip : $host; + ## + ## #This isn't set yet!? + ## #When we try to load, say 49, when we only have 48 on ensembldb + ## #This fails because there is not DB set for v49, as it is not on ensembl DB + ## #In this case we need to load from the previous version + ## #Trap this and suggest using the -schema_version/release option + ## #Can we autodetect this and reload the registry? + ## #We want to reload the registry anyway with the right version corresponding to the dnadb + ## #We could either test for the db in the registry or just pass the class. + ## + ## $db = $reg->reset_DBAdaptor($self->species(), 'funcgen', $dbname, $dbhost, $port, $self->user, $pass, + ## { + ## -dnadb_host => $reg_host, + ## -dnadb_port => $reg_port, + ## -dnadb_assembly => $assm_version, + ## -dnadb_user => $reg_user, + ## -dnadb_pass => $reg_pass, + ## }); + ## + ## + ## #ConfigRegistry will try and set this + ## #This will fail if there is already one in the registry as it will try + ## #and defined a new unique species so as not to overwrite the original + ## #e.g. 
homo_sapiens1 + ## + ## #This is why it was orignally written backwards as we can't easily dynamically redefine + ## #an adaptor in the registry without ConfigRegistry trying to change the name + ## #the very act of creating a new db to redefine the registry with causes ConfigRegistry + ## #to try and register it with a unique species name + ## + ## #Delete the old funcgen DB from the registry first + ## #$reg->remove_DBAdaptor($self->species, 'funcgen'); + ## + ## #ConfigRegistry will automatically configure this new db + ## + ## #$db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new( + ## # -user => $user, + ## # -host => ($self->{'ssh'}) ? $host_ip : $host, + ## # -port => $port, + ## # -pass => $pass, + ## # #we need to pass dbname else we can use non-standard dbs + ## # -dbname => $dbname, + ## # -species => $self->species(), + ## # -group => 'funcgen', + ## # ); + ## + ## + ## #if we get a species like homo_sapiens1 here + ## #This is because ConfigRegistry is try to make the dbname different between the + ## #one already present and the one you're trying to add + ## } + ## } + ## + ## + ## ### VALIDATE DNADB + ## #This is now done in DBAdaptor + ## + ## #We can change this to just use the assembly version + ## #we could even have the wordy assmelby version from the meta table + ## #do the standard ensembl subs + ## #s/[A-Za-z]//g; + ## #s/\.//g; + ## #And then validate? + ## #Just stick to number version for now. + ## + ## + ## #Now we need to set the dnadb_host params to avoid ensembldb defaults + ## #This should check the registry first + ## #Then load from the registry db? + ## #If we have a multi host registry config file this isn't going to work! + ## + ## #Is this required anymore as the DBAdaptor handles this? + ## #Not if we pass a db with an incorrect dnadb attached. + ## + ## #if($db->_get_schema_build($db->dnadb()) !~ /_[0-9]+_${assm_version}[a-z]*$/){ + ### my $warning = "WARNING: dnadb does not match assembly_version $assm_version. Using ensembldb.enembl.org to define the dnadb"; + ### $warning .= ' rather than the reg_config' if (defined $self->{'_reg_config'}); + ## + ## #We need to account for reg_config DBs which may have custom info in + ## #So try reg_config host first, then try ensembldb with warning + ## #Could have a reg_config only flag for core dbs + ## #Need to implement more params in set_dnadb_by_assembly_version + ### $self->log($warning); + ## + ### $db->set_dnadb_by_assembly_version($assm_version); + ### } + ## + ## + ## + ## + ## #Test connections + ## $self->db($db); + ## $db->dbc->db_handle; + ## $db->dnadb->dbc->db_handle; + ## #Set re/disconnect options + ## + ## #These really need setting dependant on the import parser + ## $db->dbc->disconnect_when_inactive(1); + ## $db->dnadb->dbc->disconnect_when_inactive(1); + + + + ## ### Check analyses/feature_type/cell_type + ## if($feature_analysis){ + ## my $fanal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($feature_analysis); + ## throw("The Feature Analysis $feature_analysis does not exist in the database") if(!$fanal); + ## $self->feature_analysis($fanal); + ## + ## #This currently fails before the config gets loaded! + ## #Need to load config before this validation! 
+ ## } + ## + ## if($ctype_name){ + ## my $ctype = $self->db->get_CellTypeAdaptor->fetch_by_name($ctype_name); + ## throw("The CellType $ctype_name does not exist in the database") if(!$ctype); + ## $self->cell_type($ctype); + ## } + ## + ## if ($ftype_name) { + ## my $ftype = $self->db->get_FeatureTypeAdaptor->fetch_by_name($ftype_name); + ## throw("The FeatureType $ftype_name does not exist in the database") if(!$ftype); + ## $self->feature_type($ftype); + ## } + + + #Set config here instead? + #So we can check all mandatory params + #Set vendor specific attr dependent vars + + #Generic input dir + $self->{'input_dir'} ||= $self->get_dir("data").'/input/'.$self->{'param_species'}.'/'.$self->vendor().'/'.$self->name(); + + if (! -d $self->get_dir('input')) { + + if (@{$self->result_files}) { + #This is really InputSet specific + #Could go in init_experiment_import + $self->log("Processing files:\n\t\t".join("\n\t\t",@{$self->result_files})); + } else { + throw('input_dir is not defined or does not exist ('.$self->get_dir('input').')'); + } + } + + #Parser specific config + $self->set_config(); + + + $self->debug(2, "Importer class instance created."); + $self->debug_hash(3, \$self); + + return ($self); +} + +=head2 registry_host + + Example : my $reg_host = $imp->registry_host; + Description: Accessor for registry host attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_host{ + return $_[0]->{'reg_host'}; +} + +=head2 registry_user + + Example : my $reg_user = $imp->registry_user; + Description: Accessor for registry user attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_user{ + return $_[0]->{'reg_user'}; +} + +=head2 registry_port + + Example : my $reg_port = $imp->registry_port; + Description: Accessor for registry port attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_port{ + return $_[0]->{'reg_port'}; +} + +=head2 registry_pass + + Example : my $reg_pass = $imp->registry_pass; + Description: Accessor for registry pass attribute + Returntype : string e.g. ensembldb.ensembl.org + Exceptions : None + Caller : general + Status : at risk + +=cut + +sub registry_pass{ + return $_[0]->{'reg_pass'}; +} + + +#init method kept separate from new due to differing madatory check and set up + +=head2 init_array_import + + Example : $self->init_import(); + Description: Initialises import by creating working directories + and by storing the Experiemnt + Returntype : none + Exceptions : warns and throws depending on recover and Experiment status + Caller : general + Status : at risk - merge with register_array_design + +=cut + +sub init_array_import{ + + my ($self) = shift; + + # we need to define which paramters we'll be storing + #use the logic names of the analyses as the field headers + + #need to test for vendor here + + #Sanger, NIMBLEGEN(no design_id issue, could get from the ndf, but we want it in the DesignNotes.txt) + #Then we can change the Array/Chip generation to soley use the DesignNotes.txt rather than SampleKey + #which is experiment specific + #or eFG format. 
+ + $self->create_output_dirs('caches', 'fastas'); + + +} + + +=head2 init_experiment_import + + Example : $self->init_import(); + Description: Initialises import by creating working directories + and by storing the Experiemnt + Returntype : none + Exceptions : warns and throws depending on recover and Experiment status + Caller : general + Status : at risk - merge with register exeriment + +=cut + +sub init_experiment_import{ + my ($self) = shift; + + #Change this to take config mandatory params? + #No specific to exp import + #Name is used in set_config anyway + #Currently we only have array and experiment import, both of which should have names + #Make mandatory? + + foreach my $tmp ("group", "data_dir") { #name now generically mandatory + throw("Mandatory arg $tmp not been defined") if (! defined $self->{$tmp}); + } + #Should we separate path on group here too, so we can have a dev/test group? + + #Create output dirs + #This should be moved to the Parser to avoid generating directories which are needed for different imports + $self->create_output_dirs('raw', 'norm', 'caches', 'fastas'); + throw("No result_files defined.") if (! defined $self->result_files()); + + #Log input files + #if (@{$self->result_files()}) { + # $self->log("Found result files arguments:\n\t".join("\n\t", @{$self->result_files()})); + #} + #This is done in new + + #check for cell||feature and warn if no met file supplied? + + if ($self->norm_method) { + my $norm_anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($self->norm_method); + + #should we list the valid analyses? + throw($self->norm_method.' is not a valid analysis') if ! $norm_anal; + $self->norm_analysis($norm_anal); + } else { + $self->log('WARNING: No normalisation analysis specified'); + } + + #warn "Need to check env vars here or in Parser or just after set_config?"; + #Need generic method for checking ENV vars in Helper + #check for ENV vars? + #R_LIBS + #R_PATH if ! farm + #R_FARM_PATH + + $self->validate_group(); #import experimental_group + + #Get experiment + my $exp_adaptor = $self->db->get_ExperimentAdaptor(); + my $exp = $exp_adaptor->fetch_by_name($self->name()); #, $self->group()); + $self->process_experiment_config if $self->can('process_experiment_config'); #Parsers::MAGE::process_experiment_config + + #Moved MAGE support form here to MAGE.pm + + #Recovery now set so deal with experiment + if ($self->recovery() && ($exp)) { + $self->log("Using previously stored Experiment:\t".$exp->name); + } elsif ((! $self->recovery()) && $exp) { + throw("Your experiment name is already registered in the database, please choose a different \"name\", this will require renaming you input directory, or specify -recover if you are working with a failed/partial import."); + #can we skip this and store, and then check in register experiment if it is already stored then throw if not recovery + } else { # (recover && exp) || (recover && ! 
exp) + + + $exp = Bio::EnsEMBL::Funcgen::Experiment->new( + -EXPERIMENTAL_GROUP => $self->{egroup}, + -NAME => $self->name(), + -DATE => $self->experiment_date(), + -PRIMARY_DESIGN_TYPE => $self->design_type(), + -DESCRIPTION => $self->description(), + -ADAPTOR => $self->db->get_ExperimentAdaptor(), + ); + + ($exp) = @{$exp_adaptor->store($exp)}; + } + + + $self->experiment($exp); + + #remove and add specific report, this is catchig some Root stuff + #$self->log("Initiated efg import with following parameters:\n".Data::Dumper::Dumper(\$self)); + + return; +} + + +#Move this to new or init_experiment + +=head2 validate_group + + Example : $self->validate_group(); + Description: Validates groups details + Returntype : none + Exceptions : throws if insufficient info defined to store new Group and is not already present + Caller : general + Status : Medium - check location and contact i.e. group name clash? + +=cut + +sub validate_group{ + my ($self) = shift; + + my $egroup = $self->db->get_ExperimentalGroupAdaptor->fetch_by_name($self->group); + + if (! defined $egroup) { + + throw("validate_group does not yet fully support ExperimentalGroups, please add manually"); + + #if ($self->location() && $self->contact()) { + # $self->db->import_group($self->group(), $self->location, $self->contact()); + #} else { + # throw("Group ".$self->group()." does not exist, please specify a location and contact to register the group"); + #} + } + + $self->{egroup} = $egroup; + + return; +} + +=head2 create_output_dirs + + Example : $self->create_output_dirs(); + Description: Does what it says on the tin, creates dirs in + the root output dir foreach @dirnames, also set paths in self + Arg [1] : mandatory - list of dir names + Returntype : none + Exceptions : none + Caller : general + Status : Medium - add throw? + +=cut + +sub create_output_dirs{ + my ($self, @dirnames) = @_; + + #output dir created in control script + #avoids errors when logs generated first + + + foreach my $name (@dirnames) { + + if ($name eq 'caches') { + $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/".$self->db->dbc->dbname() if(! defined $self->{"${name}_dir"}); + } elsif ($name eq 'fastas') { + $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/" if(! defined $self->{"${name}_dir"}); + } else { + $self->{"${name}_dir"} = $self->get_dir('output')."/${name}" if(! defined $self->{"${name}_dir"}); + } + + if (! (-d $self->get_dir($name) || (-l $self->get_dir($name)))) { + $self->log("Creating directory:\t".$self->get_dir($name)); + #This did not throw with mkdir!! + mkpath $self->get_dir($name) || throw('Failed to create directory: '. 
$self->get_dir($name)); + chmod 0744, $self->get_dir($name); + } + } + + return; +} + +=head2 vendor + + Example : $imp->vendor("NimbleGen"); + Description: Getter/Setter for array vendor + Arg [1] : optional - vendor name + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub vendor{ + my ($self) = shift; + + if (@_) { + $self->{'vendor'} = shift; + $self->{'vendor'} = uc($self->{'vendor'}); + } + + return $self->{'vendor'}; +} + + +=head2 feature_type + + Example : $imp->feature_type($ftype); + Description: Getter/Setter for Experiment FeatureType + Arg [1] : optional - Bio::EnsEMBL::Funcgen::FeatureType + Returntype : Bio::EnsEMBL::Funcgen::FeatureType + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub feature_type{ + my ($self) = shift; + + if (@_) { + my $ftype = shift; + + #do we need this as we're checking in new? + if (! ($ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType') && $ftype->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::FeatureType"); + } + + $self->{'feature_type'} = $ftype; + } + + return $self->{'feature_type'}; +} + +=head2 feature_analysis + + Example : $imp->feature_analysis($fanal); + Description: Getter/Setter for Analysis used for creating the imported Features + Arg [1] : optional - Bio::EnsEMBL::Analysis + Returntype : Bio::EnsEMBL::Analysis + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub feature_analysis{ + my ($self) = shift; + + if (@_) { + my $fanal = shift; + + #do we need this as we're checking in new? + if (! (ref ($fanal) && $fanal->isa('Bio::EnsEMBL::Analysis') && $fanal->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Analysis"); + } + + $self->{'feature_analysis'} = $fanal; + } + + return $self->{'feature_analysis'}; +} + +=head2 norm_analysis + + Example : $imp->norm_analysis($anal); + Description: Getter/Setter for the normalisation analysis + Arg [1] : optional - Bio::EnsEMBL::Analysis + Returntype : Bio::EnsEMBL::Analysis + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub norm_analysis{ + my ($self) = shift; + + if (@_) { + my $anal = shift; + + #do we need this as we're checking in new? + if (! (ref($anal) && $anal->isa('Bio::EnsEMBL::Analysis') && $anal->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Analysis"); + } + + $self->{'norm_analysis'} = $anal; + } + + return $self->{'norm_analysis'}; +} + + + +=head2 cell_type + + Example : $imp->cell_type($ctype); + Description: Getter/Setter for Experiment CellType + Arg [1] : optional - Bio::EnsEMBL::Funcgen::CellType + Returntype : Bio::EnsEMBL::Funcgen::CellType + Exceptions : Throws if arg is not valid or stored + Caller : general + Status : at risk + +=cut + +sub cell_type{ + my ($self) = shift; + + if (@_) { + my $ctype = shift; + + #do we need this as we're checking in new? + if (! 
($ctype->isa('Bio::EnsEMBL::Funcgen::CellType') && $ctype->dbID())) { + throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::CellType"); + } + + $self->{'cell_type'} = $ctype; + } + + return $self->{'cell_type'}; +} + + +##=head2 ucsc_coords +## +## Example : $start += 1 if $self->ucsc_coords; +## Description: Getter for UCSC coordinate usage flag +## Returntype : boolean +## Exceptions : none +## Caller : general +## Status : at risk +## +##=cut +## +##sub ucsc_coords{ +## my $self = shift; +## return $self->{'ucsc_coords'}; +##} + + + +=head2 array_file + + Example : my $array_file = $imp->array_file(); + Description: Getter/Setter for sanger/design array file + Arg [1] : optional - path to adf or gff array definition/mapping file + Returntype : string + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub array_file{ + my ($self) = shift; + $self->{'array_file'} = shift if(@_); + return $self->{'array_file'}; +} + +=head2 array_name + + Example : my $array_name = $imp->array_name(); + Description: Getter/Setter for array name + Arg [1] : optional string - name of array + Returntype : string + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub array_name{ + my ($self) = shift; + $self->{'array_name'} = shift if(@_); + return $self->{'array_name'}; +} + + +=head2 array_set + + Example : $imp->array_set(1); + Description: Getter/Setter for array set flag + Arg [1] : optional boolean - treat all array chips as the same array + Returntype : boolean + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub array_set{ + my ($self) = shift; + $self->{'array_set'} = shift if(@_); + return $self->{'array_set'}; +} + + +=head2 add_Array + + Arg [1] : Bio::EnsEMBL::Funcgen::Array + Example : $self->add_Array($array); + Description: Setter for array elements + Returntype : none + Exceptions : throws if passed non Array or if more than one Array set + Caller : Importer + Status : At risk - Implement multiple arrays? Move to Experiment? + +=cut + +sub add_Array{ + my $self = shift; + + #do we need to check if stored? + if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Array')) { + throw("Must supply a Bio::EnsEMBL::Funcgen::Array"); + } elsif (@_) { + push @{$self->{'arrays'}}, @_; + } + + throw("Does not yet support multiple array imports") if(scalar (@{$self->{'arrays'}}) > 1); + #need to alter read_probe data at the very least + + return; +} + + + +=head2 arrays + + Example : foreach my $array(@{$imp->arrays}){ ...do an array of things ...}; + Description: Getter for the arrays attribute + Returntype : ARRAYREF + Exceptions : none + Caller : general + Status : at risk + +=cut + +sub arrays{ + my $self = shift; + + if (! 
defined $self->{'arrays'}) { + $self->{'arrays'} = $self->db->get_ArrayAdaptor->fetch_all_by_Experiment($self->experiment()); + } + + return $self->{'arrays'}; +} + + + +=head2 location + + Example : $imp->vendor("Hinxton"); + Description: Getter/Setter for group location + Arg [1] : optional - location + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub location{ + my ($self) = shift; + $self->{'location'} = shift if(@_); + return $self->{'location'}; +} + + +=head2 contact + + Example : my $contact = $imp->contact(); + Description: Getter/Setter for the group contact + Arg [1] : optional - contact name/email/address + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub contact{ + my ($self) = shift; + $self->{'contact'} = shift if(@_); + return $self->{'contact'}; +} + +=head2 name + + Example : $imp->name('Experiment1'); + Description: Getter/Setter for the experiment name + Arg [1] : optional - experiment name + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + + +sub name{ + my ($self) = shift; + $self->{'name'} = shift if(@_); + return $self->{'name'}; +} + +=head2 result_files + + Example : $imp->result_files(\@files); + Description: Getter/Setter for the result file paths + Arg [1] : Listref of file paths + Returntype : Listref + Exceptions : none + Caller : general + Status : At risk + +=cut + + +sub result_files{ + my ($self) = shift; + $self->{'result_files'} = shift if(@_); + return $self->{'result_files'}; +} + + + + + +=head2 experiment_date + + Example : $imp->experiment_date('2006-11-02'); + Description: Getter/Setter for the experiment date + Arg [1] : optional - date string in yyyy-mm-dd + Returntype : string + Exceptions : none + Caller : general + Status : At risk + +=cut + + + +sub experiment_date{ + my ($self) = shift; + + if (@_) { + my $date = shift; + + if ($date !~ /[0-9]{4}-[0-9]{2}[0-9]{2}/o) { + throw('Parameter -experiment_date needs to fe in the format: YYYY-MM-DD'); + } + + $self->{'experiment_date'} = $date; + } elsif ($self->vendor() eq "nimblegen" && ! defined $self->{'experiment_date'}) { + $self->{'experiment_date'} = &get_date("date", $self->get_config("chip_file")), + } + + return $self->{'experiment_date'}; +} + + + +=head2 group + + Example : my $exp_group = $imp->group(); + Description: Getter/Setter for the group name + Arg [1] : optional - group name + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub group{ + my ($self) = shift; + $self->{'group'} = shift if(@_); + return $self->{'group'}; +} + + +=head2 description + + Example : $imp->description("Human chrX H3 Lys 9 methlyation"); + Description: Getter/Setter for the experiment element + Arg [1] : optional - experiment description + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub description{ + my $self = shift; + + if (@_) { + $self->{'description'} = shift; + } + + return $self->{'description'}; +} + +##=head2 feature_set_description +## +## Example : $imp->description("ExperimentalSet description"); +## Description: Getter/Setter for the FeatureSet description for an +## InputSet import e.g. 
preprocessed GFF/Bed data +## Arg [1] : optional - string feature set description +## Returntype : string +## Exceptions : none +## Caller : general +## Status : At risk +## +##=cut +## +##sub feature_set_description{ +## my $self = shift; +## +## $self->{'feature_set_description'} = shift if @_; +## +## return $self->{'feature_set_description'}; +##} + +=head2 format + + Example : $imp->format("Tiled"); + Description: Getter/Setter for the array format + Arg [1] : optional - array format + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub format{ + my ($self) = shift; + $self->{'format'} = shift if(@_); + return $self->{'format'}; +} + +=head2 experiment + + Example : my $exp = $imp->experiment(); + Description: Getter/Setter for the Experiment element + Arg [1] : optional - Bio::EnsEMBL::Funcgen::Experiment + Returntype : Bio::EnsEMBL::Funcgen::Experiment + Exceptions : throws if arg is not an Experiment + Caller : general + Status : Stable + +=cut + +sub experiment{ + my ($self) = shift; + + if (@_) { + + if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Experiment')) { + throw("Must pass a Bio::EnsEMBL::Funcgen::Experiment object"); + } + + $self->{'experiment'} = shift; + } + + return $self->{'experiment'}; +} + +##=head2 db +## +## Example : $imp->db($funcgen_db); +## Description: Getter/Setter for the db element +## Arg [1] : optional - Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor +## Returntype : Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor +## Exceptions : throws if arg is not an DBAdaptor +## Caller : general +## Status : Stable +## +##=cut +## +##sub db{ +## my $self = shift; +## +## if (defined $_[0] && $_[0]->isa("Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor")) { +## $self->{'db'} = shift; +## } elsif (defined $_[0]) { +## throw("Need to pass a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor"); +## } +## +## return $self->{'db'}; +##} +## +##=head2 pass +## +## Example : $imp->pass("password"); +## Description: Getter/Setter for the db password +## Arg [1] : optional - db password +## Returntype : string +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +## +##sub pass{ +## my $self = shift; +## $self->{'pass'} = shift if(@_); +## return $self->{'pass'}; +##} +## +##=head2 pass +## +## Example : $imp->host("hoastname"); +## Description: Getter/Setter for the db hostname +## Arg [1] : optional - db hostname +## Returntype : string +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +##sub host{ +## my $self = shift; +## $self->{'host'} = shift if(@_); +## return $self->{'host'}; +##} +## +##=head2 port +## +## Example : $imp->port(3306); +## Description: Getter/Setter for the db port number +## Arg [1] : optional - db port number +## Returntype : int +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +##sub port{ +## my $self = shift; +## $self->{'port'} = shift if(@_); +## return $self->{'port'}; +##} +## +##=head2 user +## +## Example : $imp->user("user_name"); +## Description: Getter/Setter for the db user name +## Arg [1] : optional - db user name +## Returntype : string +## Exceptions : none +## Caller : general +## Status : Stable +## +##=cut +## +##sub user{ +## my $self = shift; +## $self->{'user'} = shift if(@_); +## return $self->{'user'}; +##} + +##=head2 dump_fasta +## +## Example : if($self->dump_fasta()){...do fasta dump...} +## Description: Getter/Setter for the dump_fasta flag +## Arg [1] : optional - 0 or 1 +## Returntype : boolean +## Exceptions : none +## 
Caller : self +## Status : Stable +## +##=cut +## +## +##sub dump_fasta{ +## my $self = shift; +## $self->{'_dump_fasta'} = shift if @_; +## return $self->{'_dump_fasta'}; +##} +## + + +##=head2 species +## +## Example : $imp->species("homo_sapiens"); +## Description: Getter/Setter for species +## Arg [1] : optional - species name(alias?) +## Returntype : string +## Exceptions : none ? throw if no alias found? +## Caller : general +## Status : Medium - may move reg alias look up to this method +## +##=cut +## +##sub species{ +## my $self = shift; +## +## #should we do reg alias look up here? +## #Will we ever want to redefine species? +## #Change to just getter? +## +## $self->{'species'} = shift if(@_); +## +## return $self->{'species'}; +##} + +=head2 get_dir + + Example : $imp->get_dir("import"); + Description: Retrieves full path for given directory + Arg [1] : mandatory - dir name + Returntype : string + Exceptions : none + Caller : general + Status : at risk - move to Helper? + +=cut + +sub get_dir{ + my ($self, $dirname) = @_; + return $self->get_data("${dirname}_dir"); +} + +=head2 norm_method + + Example : my $norm_method = $imp->norm_method() + Description: Getter/Setter for normalisation method + Arg [1] : mandatory - method name + Returntype : string + Exceptions : none ? throw if no analysis with logic name + Caller : general + Status : At risk - restrict to logic_name and validate against DB, allow multiple + +=cut + +#Move to Nimblegen? +#Do we ever want to normalise other data? + +sub norm_method{ + my $self = shift; + + if (@_) { + $self->{'norm_method'} = shift; + } elsif (! defined $self->{'norm_method'}) { + $self->{'norm_method'}= $self->get_config('norm_method'); + } + + return $self->{'norm_method'}; +} + + +=head2 get_config + + Arg [1] : mandatory - name of the data element to retrieve from the config hash + Example : %dye_freqs = %{$imp->get_config('dye_freqs')}; + Description: returns data from the definitions hash + Returntype : various + Exceptions : none + Caller : Importer + Status : at risk - replace with direct calls in the inherited Defs class? + +=cut + + +sub get_config{ + my ($self, $data_name) = @_; + return $self->get_data('config', $data_name); #will this cause undefs? +} + + + + + +=head2 register_experiment + + Example : $imp->register_experiment() + Description: General control method, performs all data import and normalisations + Arg [1] : optional - dnadb DBAdaptor + Returntype : none + Exceptions : throws if arg is not Bio::EnsEMBL::DBSQL::DBAdaptor + Caller : general + Status : Medium + +=cut + + +#Need to split this method? +#Pre farm process +#Define and store all sets, +#pre process input file once if required +#How are we going to be able to tell wether this has been done successfully? +#runner will catch error, therefore safe to assume that file is complete if farm job +#unless we are not running with farm + +#farm specific processes +#actually parse and store data + +#Can we separate these for different import types? + +sub register_experiment{ + my ($self) = shift; + + #Need to check for dnadb passed with adaptor to contructor + if (@_) { + + if ( ! $_[0]->isa("Bio::EnsEMBL::DBSQL::DBAdaptor")) { + throw("You need to pass a valid dnadb adaptor to register the experiment"); + } + $self->db->dnadb($_[0]); + } elsif ( ! $self->db) { + throw("You need to pass/set a DBAdaptor with a DNADB attached of the relevant data version"); + } + + #This could still be the default core db for the current version + #warn here if not passed DB? 
+ #These should be vendor independent, only the read methods should need specific order? + + $self->init_experiment_import(); + #can we just have init here instead? + + + #This could do with a rewrite to move some things to the parsers + #$self::SUPER->register_experiment + + $self->write_validate_experiment_config if $self->can('write_validate_experiment_config'); + + #This is too array specific! + #Can we have an array of import_methods in the config? + #foreach my $method(@{$self->get_config{'import_methods'}}){ + #$self->method; + #} + #We're already doing this in read_data for each of these data_types + + #Need to be able to run this separately, so we can normalise previously imported sets with different methods + #should be able t do this without raw data files e.g. retrieve info from DB + #Is this implemented? + + $self->read_data("probe"); + $self->read_data("results"); + + + my $norm_method = $self->norm_method(); + + if (defined $norm_method) { + warn "norm method is $norm_method"; + + $self->R_norm($norm_method); + #change this to $self->$norm_method + #so we can have non-R_norm normalisation + } + + + return; +} + +#Move array specific ones to Nimblegen.pm? +#Also used by ArrayDesign and Sanger.pm +#So need to create Array.pm baseclass, which is a Helper. + +=head2 store_set_probes_features + + Arg [1] : mandatory - array chip id + Arg [2] : optional - Bio::EnsEMBL::Funcgen::ProbeSet + Arg [3] : mandatory - hashref of keys probe id, values are + hash of probe/features with values + Bio::EnsEMBL::Funcgen::Probe/Features for a given + probe set if defined. + Example : $self->store_set_probes_features($ac->dbID(), $ops, \%pfs); + Description: Stores probe set, probes and probe features + Returntype : none + Exceptions : none + Caller : self + Status : Medium + +=cut + +sub store_set_probes_features{ + my ($self, $ac_id, $pf_hash, $ops) = @_; + + ### Deal with ProbeSets + if ($ops) { + $ops->size(scalar(keys %$pf_hash)); + ($ops) = $self->db->get_ProbeSetAdaptor->store($ops); + } + + #If we're going to validate fully, we need to check for probes in this probeset on this array chip + #Update size if we have any new probes + #Overkill? Only do on recover? Do not read if array chip is IMPORTED + #This does not make any attempt to validate probes/set vs previously stored data + + for my $probe_id (keys %$pf_hash) { + + #set probeset in probe and store + #the process corresponding feature + my $probe = $pf_hash->{$probe_id}->{'probe'}; + $probe->probeset($ops) if $ops; + ($probe) = @{$self->db->get_ProbeAdaptor->store($probe)}; + + #Can't use get_all_Arrays here as we can't guarantee this will only ever be the array we've generated + #Might dynamically load array if non-present + #This is allowing multiple dbIDs per probe??? Is this wrong? + #$self->cache_probe_info($probe->get_probename(), $probe->dbID());###########Remove as we're now importing all then resolving + + + foreach my $feature (@{$pf_hash->{$probe_id}->{'features'}}) { + $feature->probe($probe); + ($feature) = @{$self->db->get_ProbeFeatureAdaptor->store($feature)}; + } + } + + undef $ops; #Will this persist in the caller? + undef %{$pf_hash}; + + return; +} + + +=head2 cache_slice + + Arg [0] : string - region_name e.g. X + Arg [1] : optional - coordinate system name e.g. 
supercontig, defaults to chromosome + Example : my $slice = $self->cache_slice(12); + Description: Caches or retrieves from cache a given slice + Returntype : Bio::EnsEMBL::Slice + Exceptions : throws f no region name specified + Caller : self + Status : At risk + +=cut + +sub cache_slice{ + my ($self, $region_name, $cs_name, $total_count) = @_; + + throw("Need to define a region_name to cache a slice from") if ! $region_name; + $self->{'slice_cache'} ||= {}; + $region_name =~ s/chr//; + $region_name = "MT" if $region_name eq "M"; + + if (! exists ${$self->{'seen_slice_cache'}}{$region_name}) { + my $slice = $self->slice_adaptor->fetch_by_region($cs_name, $region_name); + + #Set seen cache so we don't try this again + $self->{seen_slice_cache}{$region_name} = $slice; + + if (! $slice) { + warn("-- Could not generate a slice for ${cs_name}:$region_name\n"); + } else { + + my $sr_name = $slice->seq_region_name; #In case we passed a slice name + + if (@{$self->{seq_region_names}}) { + return if ! grep(/^${sr_name}$/, @{$self->{seq_region_names}}); #not on required slice + } + } + + $self->{'slice_cache'}->{$region_name} = $slice; + } + + if ($total_count && exists ${$self->{'seen_slice_cache'}}{$region_name}) { + #This is an InputSet specific method + $self->count('total_features') if $self->can('count'); + } + + #Only return if exists to avoid creating hash key + return (exists $self->{'slice_cache'}->{$region_name}) ? $self->{'slice_cache'}->{$region_name} : undef; +} + +=head2 slice_cache + + Example : my @seen_slices = values(%{$self->slice_cache});; + Description: Returns the slice cache i.e. all the Slices seen in the data filtered + by the defined slices. This method can be used to run only the appropriate + slice jobs after a prepare stage. + Returntype : Hashref of seq_region name Bio::EnsEMBL::Slice pairs + Exceptions : None + Caller : self + Status : At risk + +=cut + + +sub slice_cache{ + my $self = shift; + + return $self->{'slice_cache'}; +} + + + + +=head2 cache_probe_info + + Arg [0] : mandatory - probe name + Arg [1] : mandatory - probe dbID + Arg [2] : optioanl int - x coord of probe on array + Arg [3] : optional int - y coord of probe on array + Example : $self->cache_probe_info("Probe1", $probe->dbID()); + Or for result files which do not have X & Y, we need to cache + X & Y from the design files: $self->cache_probe_info('Probe2', $probe->dbID(), $x, $y); + Description: Setter for probe cache values + Returntype : none + Exceptions : throws is cache conflict encountered + Caller : self + Status : At risk - merge with following? + +=cut + +sub cache_probe_info{ + my ($self, $pname, $pid, $x, $y) = @_; + + throw('Deprecated, too memory expensive, now resolving DB duplicates and using Tied File cache'); + throw("Must provide a probe name and id") if (! defined $pname || ! defined $pid); + + + #do we need to loop through the file here? + #if we make sure we're testing for a previous dbID before storing probes then we don't need to do this + #we can catch the error when we get the probe id as we can check for >1 id for the same probe name + #if (defined $self->{'_probe_cache'}->{$pname} && ($self->{'_probe_cache'}->{$pname}->[0] != $pid)) { + # throw("Found two differing dbIDs for $pname, need to sort out redundant probe entries"); + #} + + $self->{'_probe_cache'}->{$pname} = (defined $x && defined $y) ? 
[$pid, $x, $y] : [$pid]; + + return; +} + + +=head2 get_probe_id_by_name_Array + + Arg [1] : mandatory - probe name + Example : $pid = $self->get_probe_id_by_name($pname); + Description: Getter for probe cache values + Returntype : int + Exceptions : none + Caller : self + Status : At risk - merge with previous, move to importer? + +=cut + +sub get_probe_id_by_name_Array{ + my ($self, $name, $array) = @_; + + #this is only ever called for fully imported ArrayChips, as will be deleted if recovering + $self->resolve_probe_data() if(! exists $self->{'_probe_cache'}{$array->name()}); + + #we want to cycle through the given cache starting from the last position or 0. + #we don't want to have to test for the size of the cache each time as this will be quite expensive + #so we should store sizes separately along with current position + + + my ($pid, $line); + + #check current line + if ($line = $self->{'_probe_cache'}{$array->name()}{'current_line'}) { + if ($line =~ /^\Q${name}\E\t/) { + $pid = (split/\t/o, $line)[1]; + } + } + + + if (! $pid) { + while ($line = $self->{'_probe_cache'}{$array->name()}{'handle'}->getline()) { + + if ($line =~ /^\Q${name}\E\t/) { + $pid = (split/\t/o, $line)[1]; + $self->{'_probe_cache'}{$array->name()}{'current_line'} = $line; + last; + } + } + } + + #do not remove this + if (! $pid) { + throw("Did not find probe name ($name) in cache, cache may need rebuilding, results may need sorting, or do you have an anomolaous probe?") + } else { + chomp $pid; + } + + return $pid; +} + +=head2 get_probe_cache_by_Array + + Arg[1] : Bio::EnsEMBL::Funcgen::Array + Arg[2] : boolean - from db flag, only to be used by Importer->resolve_probe_data ! + Example : $self->get_probe_cache_by_Array(); + Description: Gets the probe info cache which is an array tied to a file + Returntype : Boolean - True if cache has been generated and set successfully + Exceptions : none + Caller : general + Status : At risk + +=cut + +#from db flag should only be used by importer +#this is because there is no guarantee that it will be resolved unless +#called by resolve_probe_data +#which then renames the file and resets the handle +#can we clean this up and protect/hide this functionality? +#can we check the cache file name in the get methods and throw if it contains unresolved? +#make this private? + +sub get_probe_cache_by_Array{ + my ($self, $array, $from_db) = @_; + + my $msg = "Getting probe cache for ".$array->name(); + $msg .= " from DB" if $from_db; + $self->log($msg); #, 1); + + if (! ($array && $array->isa('Bio::EnsEMBL::Funcgen::Array') && $array->dbID())) { + throw('Must provide a valid stored Bio::EnsEMBL::Funcgen::Array object'); + } + + my $set = 0; + my $cache_file = $self->get_dir('caches').'/'.$array->name().'.probe_cache'; + + ### Generate and resolve fresh cache from DB + if ($from_db) { + + $cache_file .= '.unresolved'; #This will be renamed by the caller if it is resolved + + if (exists $self->{'_probe_cache'}{$array->name()}) { + $self->log('Rebuilding probe_cache from DB for '.$array->name(), 1); + + + #untie @{$self->{'_probe_cache'}{$array->name()}{'entries'}}; + #close($self->{'_probe_cache'}{$array->name()}{'handle'});#do we need to do this? + delete $self->{'_probe_cache'}{$array->name()}; #implicitly closes + $self->log('Deleted old cache', 1); + } else { + $self->log('Building probe_cache from DB for '.$array->name(), 1); + } + + #Move this to ProbeAdaptor? 
+ #This is where we'd set the unique key for a vendor and resolves duplicates based on the key + my $cmd = 'SELECT name, probe_id from probe WHERE array_chip_id IN ('.join(',', @{$array->get_array_chip_ids()}).') ORDER by name, probe_id'; + $cmd = 'mysql '.$self->db->connect_string()." -e \"$cmd\" >".$cache_file; + run_system_cmd($cmd); + + } + + ### Set cache + if (-f $cache_file) { + $self->log('MD5 check here?',1); + $self->{'_probe_cache'}{$array->name()}{'current_line'} = undef; + $self->{'_probe_cache'}{$array->name()}{'handle'} = open_file($cache_file); + + #can we do a select count instead? and do this instead of the MD5? + #$cmd = "wc -l $cache_file"; + #my $size = `$cmd`; + + $set = 1; + } else { + warn 'Failed to get probe cache for array:'.$array->name(); + } + + return $set; +} + + +#should reorganise these emthods to split reading the array data, and the actual data +#currently: +#meta reads array and chip data +#probe reads probe_set, probes, which should definitely be in array, probe_feature? and results +#native data format may not map to these methods directly, so may need to call previous method if required data not defined + + +=head2 read_data + + Example : $self->read_data("probe") + Description: Calls each method in data_type array from config hash + Arg [1] : mandatory - data type + Returntype : none + Exceptions : none + Caller : self + Status : At risk + +=cut + +sub read_data{ + my($self, $data_type) = @_; + + map {my $method = "read_${_}_data"; $self->$method()} @{$self->get_config("${data_type}_data")}; + return; +} + + +=head2 design_type + + Example : $self->design_type("binding_site_identification") + Description: Getter/Setter for experimental design type + Arg [1] : optional - design type + Returntype : string + Exceptions : none + Caller : general + Status : At risk + +=cut + +sub design_type{ + my $self = shift; + return $self->{'design_type'}; +} + + +=head2 get_chr_seq_region_id + + Example : $seq_region_id = $self->get_seq_region_id('X'); + Description: Calls each method in data_type array from config hash + Arg [1] : mandatory - chromosome name + Arg [2] : optional - start value + Arg [3] : optional - end value + Returntype : int + Exceptions : none + Caller : self + Status : At risk + +=cut + +#convinience wrapper method +#could we use the seq region cache instead? +#this seems like a lot of overhead for getting the id +sub get_chr_seq_region_id{ + my ($self, $chr, $start, $end) = @_; + #what about strand info? + + #do we need the start and stop? + + #use start and stop to prevent problems with scaffodl assemblies, i.e. >1 seq_region_id + #my $slice = $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end); + #we could pass the slice back to the slice adaptor for this, to avoid dbid problems betwen DBs + + ###would need to implement other cs's here + + return $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end)->get_seq_region_id(); +} + + + +=head2 vsn_norm + + Example : $self->vsn_norm(); + Description: Convinience/Wrapper method for vsn R normalisation + Returntype : none + Exceptions : none + Caller : general + Status : At risk + +=cut + +#Have Norm class or contain methods in importer? +#Need to have analysis set up script for all standard analyses. 
+ +sub vsn_norm{ + my $self = shift; + return $self->R_norm("VSN_GLOG"); +} + + +=head2 farm + + Arg [1] : Boolean + Example : $importer->farm(1); + Description: Flag to turn farm submission on + Returntype : Boolean + Exceptions : Throws is argument not a boolean + Caller : general + Status : At risk + +=cut + + +sub farm{ + my ($self, $farm) = @_; + + $self->{'farm'} ||= undef; #define farm + + if (defined $farm) { + throw("Argument to farm must be a boolean 1 or 0") if(! ($farm == 1 || $farm == 0)); + $self->{'farm'} = $farm; + } + + return $self->{'farm'}; + +} + +=head2 batch_job + + Arg [1] : Boolean + Example : $importer->batch_job(1); + Description: Flag to turn on batch_job status + Returntype : Boolean + Exceptions : Throws is argument not a boolean + Caller : general + Status : At risk + +=cut + + +sub batch_job{ + my ($self, $batch_job) = @_; + + #$self->{'batch_job'} ||= undef; + + if (defined $batch_job) { + throw("Argument to batch_job must be a boolean 1 or 0") if(! ($batch_job == 1 || $batch_job == 0)); + $self->{'batch_job'} = $batch_job; + } + + return $self->{'batch_job'}; + +} + +=head2 prepared + + Arg [1] : Boolean + Example : $importer->prepared(1); + Description: Flag to turn on prepared file status + This signifies that the files have been previously imported + using prepare mode and may not match the InputSubset names + Returntype : Boolean + Exceptions : None + Caller : general + Status : At risk + +=cut + + +sub prepared{ + my ($self, $prepared) = @_; + $self->{'prepared'} = $prepared if (defined $prepared); + return $self->{'prepared'}; +} + + +=head2 R_norm + + Example : $self->R_norm(@logic_names); + Description: Performs R normalisations for given logic names + Returntype : none + Exceptions : Throws if R exits with error code or if data not not valid for analysis + Caller : general + Status : At risk + +=cut + +sub R_norm{ + my ($self, @logic_names) = @_; + #This currently normalises a single two colour chip at a time + #rather than normalising across a set of chip + #also does in sets of analyses + #good for keeping data separate, but not efficient in terms of querying + #convert to use one script which only queries for each echip once, then does each anal + + + my $aa = $self->db->get_AnalysisAdaptor(); + my $rset_adaptor = $self->db->get_ResultSetAdaptor(); + my $ra_id = $aa->fetch_by_logic_name("RawValue")->dbID(); + my %r_config = ( + "VSN_GLOG" => {( libs => ['vsn'], + #norm_method => 'vsn', + )}, + + "T.Biweight" => {( + libs => ['affy'], + #norm_method => 'tukey.biweight', + )}, + ); + + + foreach my $logic_name (@logic_names) { + #throw("Not yet implemented TukeyBiweight") if $logic_name eq "Tukey_Biweight"; + + #this has already been chcecked and set as the norm_analysis + #need to resolve this multi method approach + my $norm_anal = $aa->fetch_by_logic_name($logic_name); + + + + #This only creates an RSet for the IMPORT set + #So if we re-run with a different analysis + #tab2mage will have already been validated + #So RSet generation will be skipped + #We need to recreate the each non-import RSet for this norm analysis + + #This also means the RSets are being created before the data has been imported + #This avoids having to parse tab2mage each time but means we have an uncertain status of these Rsets + + my $rset = $self->get_import_ResultSet($norm_anal, 'experimental_chip'); + + my @chips = (); + + if (! 
$rset) { + $self->log("All ExperimentalChips already have status:\t${logic_name}"); + } else { #Got data to normalise and import + my @dbids; + my $R_file = $self->get_dir("norm")."/${logic_name}.R"; + my $job_name = $self->experiment->name()."_${logic_name}"; + my $resultfile = $self->get_dir("norm")."/result.${logic_name}.txt"; + my $outfile = $self->get_dir("norm")."/${logic_name}.out"; + + #How do we get farm job output i.e. run time memusage + #from interactive job? + #This assumes R_PATH + my $errfile = $self->get_dir("norm")."/${logic_name}.err"; + + #Let's build this better so we capture the farm output aswell as the job output. + my $cmdline = "$ENV{'R_PATH'} --no-save < $R_file"; # >$errfile 2>&1"; + #-K option waits for job to complete + my $bsub = "bsub -K -J $job_name ".$ENV{'R_BSUB_OPTIONS'}. + " -e $errfile -o $outfile $ENV{'R_FARM_PATH'} CMD BATCH $R_file"; + + #Can we separate the out and err for commandline? + my $r_cmd = (! $self->farm()) ? "$cmdline >$outfile 2>&1" : $bsub; + + $self->backup_file($resultfile); #Need to do this as we're appending in the loop + + #setup qurey + #warn "Need to add host and port here"; + #Set up DB, defaults and libs for each logic name + my $query = "options(scipen=20);library(RMySQL);library(Ringo);"; + #scipen is to prevent probe_ids being converted to exponents + #Ringo is for default QC + + #foreach my $ln(@logic_names){ + + foreach my $lib (@{$r_config{$logic_name}{'libs'}}) { + $query .= "library($lib);"; + } + #} + + $query .= "con<-dbConnect(dbDriver(\"MySQL\"), host=\"".$self->db->dbc->host()."\", port=".$self->db->dbc->port().", dbname=\"".$self->db->dbc->dbname()."\", user=\"".$self->db->dbc->username()."\""; + + #should use read only pass here as we are printing this to file + $query .= ", pass=\"".$self->db->dbc->password."\")\n"; + + + #Build queries for each chip + foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { + + + #should implement logic name here? + #can't as we need seperate ResultSet for each + + + if ($echip->has_status($logic_name)) { + $self->log("ExperimentalChip ".$echip->unique_id()." already has status:\t$logic_name"); + } else { + + #warn "Need to roll back here if recovery, as norm import may fail halfway through"; + + push @chips, $echip; + my $cc_id = $rset->get_chip_channel_id($echip->dbID()); + + #if ($self->recovery()){ + # $self->log('Rolling back results for ExperimentalChip('.$echip->dbID().") $logic_name"); + # $self->db->rollback_results($cc_id) if $self->recovery(); + # } + + $self->log("Building $logic_name R cmd for ".$echip->unique_id()); + @dbids = (); + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->type() eq "EXPERIMENTAL") { + push @dbids, $chan->dbID(); + } else { + unshift @dbids, $chan->dbID(); + } + } + + throw("vsn does not accomodate more than 2 channels") if (scalar(@dbids > 2) && $logic_name eq "VSN_GLOG"); + + #should do some of this with maps? 
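+	  #For orientation: the R fragment assembled below for each chip is roughly
+	  #(simplified, with <...> placeholders for the interpolated ids/paths):
+	  #
+	  #  c1<-dbGetQuery(con, 'select ... where c.table_id=<control_channel_id> ...')
+	  #  c2<-dbGetQuery(con, 'select ... where c.table_id=<experimental_channel_id> ...')
+	  #  testRG<-new('RGList', list(R=R, G=G, genes=genes))  #QC: densities, MvA, plate plots -> <chip>_QC.pdf
+	  #  norm_df<-vsn(raw_df)                                #or tukey.biweight adjusted log2 ratios for T.Biweight
+	  #  write.table(formatted_df, file="<resultfile>", ..., append=TRUE)
+	  #
+	  #so every chip appends its normalised scores to the one result file imported below.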
+ #HARDCODED metric ID for raw data as one + + #Need to get total and experimental here and set db_id accordingly + #can probably do this directly into one df + + $query .= "c1<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as CONTROL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[0]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n"; + + $query .= "c2<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as EXPERIMENTAL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[1]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n"; + + #Can we define some of the basic structures here and reuse in the QC and each norm method? + + + #Is this going to eat up memory? + #can we strip out and separate the data from c1 and c2 into RGList and + #individual vector for probe_ids, then rm c1 and c2 to free up memory + + #create RGList object + $query .= "R<-as.matrix(c1['CONTROL_score'])\nG<-as.matrix(c2['EXPERIMENTAL_score'])\n"; + $query .= "genes<-cbind(c1['PROBE_ID'], c1['X'], c1['Y'])\n"; + $query .= "testRG<-new('RGList', list(R=R, G=G, genes=genes))\n"; + + + #QC plots here before doing norm + + #open pdf device + $query .= "pdf('".$self->get_dir('norm').'/'.$echip->unique_id."_QC.pdf', paper='a4', height = 15, width = 9)\n"; + #set format + $query .= "par(mfrow = c(2,2), font.lab = 2)\n"; + + #Channel densisties + #These need limma or Ringo + $query .= "plotDensities(testRG)\n"; + + #MvA Plot + + $query .= 'meanLogA<-((log(testRG$R, base=exp(2)) + log(testRG$G, base=exp(2)))/2)'."\n"; + $query .= 'logIntRatioM<-(log(testRG$R, base=exp(2)) - log(testRG$G, base=exp(2)))'."\n"; + $query .= "yMin<-min(logIntRatioM)\n"; + $query .= "yMax<-max(logIntRatioM)\n"; + + + + #Need to validate yMax here + #If is is Inf then we need to sort the vector and track back until we find the high real number + #count number of Infs and note on MvA plot + $query .= "infCount<-0\n"; + $query .= "if( yMax == Inf){; sortedM<-sort(logIntRatioM); lengthM<-length(logIntRatioM); indexM<-lengthM\n" + ."while (yMax == Inf){; indexM<-(indexM-1); yMax<-sortedM[indexM];}; infCount<-(lengthM-indexM);}\n"; + + # + $query .= "if(infCount == 0){\n"; + $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'")'."\n"; + $query .= "} else {\n"; + $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'", sub=paste(infCount, " Inf values not plotted"));'."}\n"; + + + #$query .= 'plot(log(testRG$R*testRG$G, base=exp(2))/2, log(testRG$R/testRG$G, base=exp(2)),xlab="A",ylab="M",pch=".",ylim=c(-3,3), main="'.$echip->unique_id."\")\n"; + + #Plate plots + $query .= 'image(testRG, 1, channel = "green", mycols = c("black", "green4", "springgreen"))'."\n"; + $query .= 'image(testRG, 1, channel = "red", mycols = c("black", "green4", "springgreen"))'."\n"; + + $query .= "dev.off()\n"; + #Finished QC pdf printing + + + + #The simple preprocess step of Ringo is actually vsn, so we can nest these in line + + + ### Build Analyses cmds ### + + if ($logic_name eq 'T.Biweight') { + + #log2 ratios + $query .= 'lr_df<-cbind((log(c2["EXPERIMENTAL_score"], base=exp(2)) - log(c1["CONTROL_score"], base=exp(2))))'."\n"; + + #Adjust 
using tukey.biweight weighted average + #inherits first col name + $query .= 'norm_df<-(lr_df["EXPERIMENTAL_score"]-tukey.biweight(as.matrix(lr_df)))'."\n"; + $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", norm_df[,1]), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n"; + + } elsif ($logic_name eq 'VSN_GLOG') { + #could do this directly + $query .= "raw_df<-cbind(c1[\"CONTROL_score\"], c2[\"EXPERIMENTAL_score\"])\n"; + #variance stabilise + $query .= "norm_df<-vsn(raw_df)\n"; + + + #do some more calcs here and print report? + #fold change exponentiate? See VSN docs + #should do someplot's of raw and glog and save here? + #set log func and params + #$query .= "par(mfrow = c(1, 2)); log.na = function(x) log(ifelse(x > 0, x, NA));"; + #plot + #$query .= "plot(exprs(glog_df), main = \"vsn\", pch = \".\");". + # "plot(log.na(exprs(raw_df)), main = \"raw\", pch = \".\");"; + #FAILS ON RAW PLOT!! + #par(mfrow = c(1, 2)) + #> meanSdPlot(nkid, ranks = TRUE) + #> meanSdPlot(nkid, ranks = FALSE) + + + #Now create table structure with glog values(diffs) + #3 sig dec places on scores(doesn't work?!) + $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", (exprs(norm_df[,2]) - exprs(norm_df[,1]))), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n"; + + } + #load back into DB + #c3results<-cbind(rep("", length(c3["probe_id"])), c3["probe_id"], c3["c3_score"], rep(1, length(c3["probe_id"])), rep(1, length(c3["probe_id"]))) + #may want to use safe.write here + #dbWriteTable(con, "result", c3results, append=TRUE) + #dbWriteTable returns true but does not load any data into table!!! + + $query .= "write.table(formatted_df, file=\"${resultfile}\", sep=\"\\t\", col.names=FALSE, row.names=FALSE, quote=FALSE, append=TRUE)\n"; + + #tidy up here?? + } + } + + $query .= "q();"; + + open(RFILE, ">$R_file") || throw("Cannot open $R_file for writing"); + print RFILE $query; + close(RFILE); + + my $submit_text = "Submitting $logic_name job"; + $submit_text .= ' to farm' if $self->farm; + $self->log("${submit_text}:\t".localtime()); + run_system_cmd($r_cmd); + $self->log("Finished $logic_name job:\t".localtime()); + $self->log('See '.$self->get_dir('norm').' for ExperimentalChip QC files'); + + #Now load file and update status + #Import directly here to avoid having to reparse all results if we crash!!!! + $self->log("Importing:\t$resultfile"); + $self->db->load_table_data("result", $resultfile); + $self->log("Finishing importing:\t$resultfile"); + + + foreach my $echip (@chips) { + $echip->adaptor->store_status($logic_name, $echip); + } + + #Recreate all non-import RSets for analysis if not already present + # + + my $rset_a = $self->db->get_ResultSetAdaptor(); + my %seen_rsets; + + foreach my $anal_rset (@{$rset_a->fetch_all_by_Experiment($self->experiment)}) { + next if($anal_rset->name =~ /_IMPORT$/o); + next if(exists $seen_rsets{$anal_rset->name}); + next if $anal_rset->analysis->logic_name eq $norm_anal->logic_name; + $seen_rsets{$rset->name} = 1; + $anal_rset->analysis($norm_anal); + $anal_rset->{'dbID'} = undef; + $anal_rset->{'adaptor'} = undef; + + #add the chip_channel_ids from the new anal IMPORT set + foreach my $table_id (@{$anal_rset->table_ids}) { + $anal_rset->{'table_id_hash'}{$table_id} = $rset->get_chip_channel_id($table_id); + } + + $self->log('Adding new ResultSet '.$anal_rset->name.' 
with analysis '.$norm_anal->logic_name); + $rset_a->store($anal_rset); + } + + + + } + } + + return; +} + +#can we sub this? args: table_name, logic_name +#also use result_set_name +#would also clean all data for result set if recovery +#return would be result_set +#Can we extend this to incorporate InputSet parser define_sets? + +sub get_import_ResultSet{ + my ($self, $anal, $table_name) = @_; + + if (!($anal && $anal->isa("Bio::EnsEMBL::Analysis") && $anal->dbID())) { + throw("Must provide a valid stored Bio::EnsEMBL::Analysis"); + } + + $self->log("Getting import $table_name ResultSet for analysis:\t".$anal->logic_name()); + + my ($rset, @new_chip_channels); + my $result_adaptor = $self->db->get_ResultSetAdaptor(); + my $logic_name = $anal->logic_name; + my $status = ($logic_name eq "RawValue") ? "IMPORTED" : $logic_name; + + if (($logic_name) eq 'RawValue' && ($table_name eq 'experimental_chip')) { + throw("Cannot have an ExperimentalChip ResultSet with a RawValue analysis, either specify 'channel' or another analysis"); + } + + #Build IMPORT Set for $table_name + foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { + + #clean chip import and generate rset + + if ($table_name eq 'experimental_chip') { + + if ($echip->has_status($status)) { #this translates to each channel have the IMPORTED_RawValue status + $self->log("ExperimentalChip(".$echip->unique_id().") already has status:\t".$status); + } else { + $self->log("Found ExperimentalChip(".$echip->unique_id().") without status $status"); + + push @new_chip_channels, $echip; + } + + } else { #channel + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->has_status($status)) { #this translates to each channel have the IMPORTED_RawValue status + $self->log("Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") already has status:\t".$status); + } else { + $self->log("Found Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") without status $status"); + push @new_chip_channels, $chan; + } + } + } + + if (( ! $rset) && @new_chip_channels) { + my(@tmp) = @{$result_adaptor->fetch_all_by_name_Analysis($self->name()."_IMPORT", $anal)}; + + if (scalar(@tmp) > 1) { + throw('Found more than one IMPORT ResultSet for '.$self->name().'_IMPORT with analysis '.$logic_name); + } + + $rset = shift @tmp; + + + #do we need to throw here if not recovery? + #what if we want the import result set elsewhere during the first import? + + #if ($self->recovery()) { + + #fetch by anal and experiment_id + #Need to change this to result_set.name! + # warn("add chip set handling here"); + + #my @tmp = @{$result_adaptor->fetch_all_by_Experiment_Analysis($self->experiment(), $anal)}; + #throw("Found more than one ResultSet for Experiment:\t".$self->experiment->name()."\tAnalysis:\t".$anal->logic_name().')' if (scalar(@tmp) >1); + #$rset = $tmp[0]; + + #warn "fetching rset with ".$self->name()."_IMPORT ". $anal->logic_name; + + #$rset = $result_adaptor->fetch_by_name_Analysis($self->name()."_IMPORT", $anal); + warn("Warning: Could not find recovery ResultSet for analysis ".$logic_name) if ! $rset; + #} + + if (! 
$rset) { + $self->log("Generating new ResultSet for analysis ".$logic_name); + + $rset = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -analysis => $anal, + -table_name => $table_name, + -name => $self->name()."_IMPORT", + -feature_type => $self->feature_type(), + -cell_type => $self->cell_type(), + ); + + #These types should be set to NULL during the MAGE-XML validation if we have more than one type in an experiment + + ($rset) = @{$result_adaptor->store($rset)}; + } + } + } + + #do we need this here as we're rolling back in the read methods? + #we only want to roll back those chips/channels which have not been registered + + if ($self->recovery()) { + + my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor(); + + foreach my $cc (@new_chip_channels) { + + #only roll back if already part of import set + #Not previously registered if not + if ($rset->contains($cc) && $rset->get_chip_channel_id($cc->dbID())) { + + if ($table_name eq 'channel') { + my $chan_name = $ec_adaptor->fetch_by_dbID($cc->experimental_chip_id())->unique_id()."_". + $self->get_config('dye_freqs')->{$cc->dye()}; + $self->log("Rolling back results for $table_name:\t".$chan_name); + + } else { + $self->log("Rolling back results for $table_name:\t".$cc->unique_id); + } + + $self->rollback_results([$rset->get_chip_channel_id($cc->dbID())]); + } + } + } + + + #check whether it is present in the ResultSet and add if not + if ($rset) { + #ids will already be present if not rset i.e. already imported + + foreach my $cc (@new_chip_channels) { + $rset->add_table_id($cc->dbID()) if(! $rset->contains($cc)); + } + } + + + + if ($rset) { + $result_adaptor->store_chip_channels($rset); + } else { + $self->log("All ExperimentalChips have status:\t$status"); + } + + #this only returns a result set if there is new data to import + return $rset; +} + + + +=head2 resolve_probe_data + + Example : $self->resolve_probe_data(); + Description: Resolves DB probe duplicates and builds local probe cache + Returntype : none + Exceptions : ???? + Caller : general + Status : At risk + +=cut + +sub resolve_probe_data{ + my $self = shift; + + $self->log("Resolving probe data", 1); + + warn "Probe cache resolution needs to accomodate probesets too!"; + + foreach my $array (@{$self->arrays()}) { + my $resolve = 0; + + if ($self->get_probe_cache_by_Array($array)) { #cache already generated + + #check if we have any new unresolved array chips to add to the cache + foreach my $achip (@{$array->get_ArrayChips()}) { + + if ($achip->has_status('RESOLVED')) { + $self->log("ArrayChip has RESOLVED status:\t".$achip->design_id()); #, 1); + next; + } else { + $self->log("Found un-RESOLVED ArrayChip:\t".$achip->design_id()); + $resolve = 1; + last; + } + } + } else { #no cache file + $resolve = 1; + $self->log('No probe cache found for array '.$array->name()); + } + + if ($resolve) { + $self->log('Resolving array duplicates('.$array->name().') and rebuilding probe cache.', 1); + $self->get_probe_cache_by_Array($array, 1); #get from DB + + #we need ot make sure we mark cache as unresolved, so we don't use it by mistake. 
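+      #Walk-through of the resolution loop below, with invented names/ids.
+      #Given an unresolved cache (sorted by name, probe_id) containing:
+      #
+      #  probeA  100
+      #  probeA  101
+      #  probeB  102
+      #
+      #@pids accumulates (100, 101) while the name is unchanged; on reaching probeB,
+      #tidy_duplicates([100, 101]) reassigns probe_features from 101 to 100, or
+      #deletes them where 100 already has a feature at the same locus. The duplicate
+      #cache line itself is not removed, but name lookups always return the first
+      #(lowest) probe_id, which is the one the features are kept on.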
+ + + + my ($line, $name, $pid, @pids); + #my $index = 0; + my $tmp_name = ''; + my $tmp_id = ''; + + #miss the header + + while ($line = $self->{'_probe_cache'}{$array->name}{'handle'}->getline()) { + ($name, $pid) = split/\t/o, $line; + + if ($name eq $tmp_name) { + + if ($pid != $tmp_id) { + push @pids, $pid; + #should reset to pid here if we have x y data else undef + #ignore this and force result to have x y + } + + #can't do this naymore unless we figure out how to move the line pointer + #would still need to sed the file anyway, better to regen from DB? + #undef $self->{'_probe_cache'}{$array->name}{'entries'}->[$i];#delete true or to be resolved duplicate + } elsif ($name ne $tmp_name) { #new probe + $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1); + $tmp_name = $name; + $tmp_id = $pid; + @pids = ($pid); + #$index = $i + 1; + } + } + + $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1); + + #rename resovled cache and reset cache handle + my $cmd = 'mv '.$self->get_dir('caches').'/'.$array->name().'.probe_cache.unresolved '. + $self->get_dir('caches').'/'.$array->name().'.probe_cache'; + + run_system_cmd($cmd); + $self->get_probe_cache_by_Array($array); #This sets the caches + + + #warn "Only generate MD5 here, as this is guranteed to be correct"; + + foreach my $achip (@{$array->get_ArrayChips()}) { + + if (! $achip->has_status('RESOLVED')) { + $self->log("Updating ArrayChip to RESOLVED status:\t".$achip->design_id()); + $achip->adaptor->store_status('RESOLVED', $achip); + } + } + + $self->log('Finished building probe cache for '.$array->name(), 1); + } + } + + $self->log('Finished resolving probe data', 1); + + return; +} + + +sub tidy_duplicates{ + my ($self, $pids) = @_; + + my $pfa = $self->db->get_ProbeFeatureAdaptor(); + my ($feature, %features); + + foreach my $dup_id (@$pids) { + + foreach $feature(@{$pfa->fetch_all_by_Probe_id($dup_id)}) { + #can we safely assume end will be same too? + push @{$features{$feature->seq_region_name().':'.$feature->start()}}, $feature; + } + } + + my (@reassign_ids, @delete_ids); + + foreach my $seq_start_key (keys %features) { + my $reassign_features = 1; + + foreach $feature(@{$features{$seq_start_key}}) { + + if ($feature->probe_id() == $pids->[0]) { + $reassign_features = 0; + } else { + push @delete_ids, $feature->dbID(); + } + } + + #This assumes that we actually have at least one element to every seq_start_key array + if ($reassign_features) { + my $new_fid = pop @delete_ids; + push @reassign_ids, $new_fid; + } + } + + #resolve features first so we don't get any orphaned features if we crash. + $pfa->reassign_features_to_probe(\@reassign_ids, $pids->[0]) if @reassign_ids; + $pfa->delete_features(\@delete_ids) if @delete_ids; + + return; +} + +1;
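+
+#Minimal usage sketch of the normalisation/cache entry points above, assuming an
+#Importer object ($imp) has already been constructed and configured elsewhere:
+#
+#  $imp->farm(1);               #submit the R job via bsub instead of running it locally
+#  $imp->vsn_norm();            #convenience wrapper for $imp->R_norm('VSN_GLOG')
+#  $imp->R_norm('T.Biweight');  #Tukey biweight log2-ratio normalisation
+#  $imp->resolve_probe_data();  #build/refresh probe caches and resolve duplicate probes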