comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Importer.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1
2 =head1 LICENSE
3
4 Copyright (c) 1999-2012 The European Bioinformatics Institute and
5 Genome Research Limited. All rights reserved.
6
7 This software is distributed under a modified Apache license.
8 For license details, please see
9
10 http://www.ensembl.org/info/about/code_licence.html
11
12 =head1 CONTACT
13
14 Please email comments or questions to the public Ensembl
15 developers list at <ensembl-dev@ebi.ac.uk>.
16
17 Questions may also be sent to the Ensembl help desk at
18 <helpdesk@ensembl.org>.
19
20 =head1 NAME
21
22 Bio::EnsEMBL::Funcgen::Importer
23
24 =head1 SYNOPSIS
25
26 my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params);
27 $imp->register_experiment();
28
29
30 =head1 DESCRIPTION
31
32 B<This program> is the main class coordinating import of tiling array design and experimental data.
33 It utilises several underlying parser classes specific to array vendor or import file type.
34
35 =cut
36
37 ################################################################################
38
39 package Bio::EnsEMBL::Funcgen::Importer;
40
41 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(get_date open_file run_system_cmd);
42 use Bio::EnsEMBL::Utils::Exception qw( throw deprecate );
43 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
44 use Bio::EnsEMBL::Funcgen::Experiment;
45 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
46 use Bio::EnsEMBL::DBSQL::DBAdaptor;
47
48 use File::Path;
49 use strict;
50 use vars qw(@ISA);
51
52
53 ################################################################################
54
55 =head2 new
56
57 Description : Constructor method
58
59 Arg [1] : hash containing optional attributes:
60 -name Name of Experiment(dir)
61 -format of array e.g. Tiled(default)
62 -vendor name of array vendor
63 -description of the experiment
64 -pass DB password
65 -host DB host
66 -user DB user
67 -port DB port
68 -registry_host Host to load registry from
69 -registry_port Port for registry host
70 -registry_user User for registry host
71 -registry_pass Password for registry user
72 -ssh Flag to set connection over ssh via forwarded port to localhost (default = 0); remove?
73 -group name of experimental/research group
74 ` -location of experimental/research group
75 -contact e/mail address of primary contact for experimental group
76 -species
77 -assembly Genome assembly version i.e. 36 for NCBI36
78 -recover Recovery flag (default = 0)
79 -data_dir Root data directory (default = $ENV{'EFG_DATA'})
80 -output_dir review these dirs ???????
81 -input_dir ?????????
82 -import_dir ???????
83 -norm_dir ??????
84 -fasta dump FASTA flag (default =0)
85 -array_set Flag to treat all chip designs as part of same array (default = 0)
86 -array_name Name for array set
87 -array_file Path of array file to import for sanger ENCODE array
88 -result_set_name Name to give the raw and normalised result sets (default uses experiment and analysis name)
89 -norm_method Normalisation method (Nimblegen default = VSN_GLOG);
90 -dbname Override for autogeneration of funcgen dbaname
91 -reg_config path to local registry config file (default = ~/ensembl.init || undef)
92 -design_type MGED term (default = binding_site_identification) get from meta/MAGE?
93
94 -farm Flag to submit jobs to farm e.g. normalisation jobs
95 -batch_job Flag to signify that this Importer is running as a prepared batch/farm job
96 -prepared Flag to signify result files have been previously imported in prepare mode
97 and file names will differ to those record in InputSubset
98
99
100 #-use_defaults This changes some mandatory parameters to optional, instead using either DEFAULT or the input file name for the following options -name, -input_set, -feature_type, -cell_type etc ???
101
102 -verbose
103 ReturnType : Bio::EnsEMBL::Funcgen::Importer
104 Example : my $Exp = Bio::EnsEMBL::Importer->new(%params);
105 Exceptions : throws if mandatory params are not set or DB connect fails
106 Caller : General
107 Status : Medium - potential for %params names to change, remove %attrdata?
108
109 =cut
110
111 ################################################################################
112
113 sub new{
114 my ($caller) = shift;
115
116 #my $reg = "Bio::EnsEMBL::Registry";
117 my $class = ref($caller) || $caller;
118
119 #$user, $host, $port, $pass, $dbname, $db, $assm_version, $reg_config, $reg_db, $ucsc_coords, $verbose, $release, $reg_host, $reg_port, $reg_user, $reg_pass
120 #$ftype_name, $ctype_name,
121
122 my ($name, $format, $vendor, $group, $location, $contact,
123 $array_name, $array_set, $array_file, $data_dir, $result_files,
124 $exp_date, $desc, $design_type, $output_dir, $input_dir,
125 $batch_job, $farm, $prepared,
126 $norm_method, $old_dvd_format, $parser_type)
127 = rearrange(['NAME', 'FORMAT', 'VENDOR', 'GROUP', 'LOCATION', 'CONTACT',
128 'ARRAY_NAME', 'ARRAY_SET', 'ARRAY_FILE', 'DATA_DIR', 'RESULT_FILES',
129 'EXPERIMENT_DATE', 'DESCRIPTION',
130 'DESIGN_TYPE', 'OUTPUT_DIR', 'INPUT_DIR', #to allow override of defaults
131 'BATCH_JOB', 'FARM', 'PREPARED', 'NORM_METHOD',
132 'OLD_DVD_FORMAT', 'PARSER'], @_);
133
134
135
136 #### Define parent parser class based on vendor
137 throw("Mandatory argument -vendor not defined") if ! defined $vendor;
138
139 #This will override the default Vendor Parser type
140 #Evals simply protect from messy errors if parser type not found
141 my $parser_error;
142 my $vendor_parser = ucfirst(lc($vendor));
143
144
145 #WARNING evaling these parsers to enable pluggability hides errors in parser
146 #use a perl -MBio::EnsEMBL::Funcgen::Parsers:ParserType to debug
147 #get rid of all this case guessing and force correct parser name usage?
148
149
150 #WARNING
151 #Dynamic setting of ISA in this way reports the resultant object as Importer, when
152 #some throws/methods are actually in other base/custom Parsers
153 #This can seem a little counterintuitive, but allows plugability
154 #With out the need for separate control scripts
155
156
157 #Change this to be set and required/detected in the parse_and_import.pl script
158 #Then we can have Importer.pm as the base class and get rid of this.
159 #as well as set_config methods?
160
161
162 eval {require "Bio/EnsEMBL/Funcgen/Parsers/${vendor_parser}.pm";};
163
164 if ($@) {
165 #Don't warn/throw yet as we might have a standard parser format
166
167 $parser_error .= "There is no valid parser for the vendor your have specified:\t".$vendor.
168 "\nMaybe this is a typo or maybe you want to specify a default import format using the -parser option\n".$@;
169 }
170
171
172
173 if (defined $parser_type) {
174
175 #try normal case first
176 eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";};
177
178 if ($@) {
179 $parser_type = ucfirst(lc($parser_type));
180
181 #Now eval the new parser
182 eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";};
183
184 if ($@) {
185
186 #Might be no default
187 my $txt = "There is no valid parser for the -parser format your have specified:\t".$parser_type."\n";
188
189 if (! $parser_error) {
190 $txt .= "Maybe this is a typo or maybe you want run with the default $vendor_parser parser\n";
191 }
192
193 throw($txt.$@);
194 }
195
196 #warn about over riding vendor parser here
197 if (! $parser_error) {
198 #Can't log this as we haven't blessed the Helper yet
199 warn("WARNING\t::\tYou are over-riding the default ".$vendor." parser with -parser ".$parser_type);
200 }
201 }
202 } else {
203 throw($parser_error) if $parser_error;
204 $parser_type = $vendor_parser;
205 }
206
207
208 #we should now really set parser_type as an attrtibute?
209 unshift @ISA, 'Bio::EnsEMBL::Funcgen::Parsers::'.$parser_type;
210 #change this to be called explicitly from the load script?
211
212 #### Create object from parent class
213
214 my $self = $class->SUPER::new(@_);
215
216 #### Set vars and test minimum mandatory params for any import type
217
218 $self->{'name'} = $name || throw('Mandatory param -name not met'); #This is not mandatory for array design import
219 ## $self->{'user'} = $user || $ENV{'EFG_WRITE_USER'};
220 $self->vendor(uc($vendor)); #already tested
221 $self->{'format'} = uc($format) || 'TILED'; #remove default?
222 $self->group($group) if $group;
223 $self->location($location) if $location;
224 $self->contact($contact) if $contact;
225 ## $species || throw('Mandatory param -species not met');
226 $self->array_name($array_name) if $array_name;
227 $self->array_set($array_set) if $array_set;
228 $self->array_file($array_file) if $array_file;
229 $self->{'data_dir'} = $data_dir || $ENV{'EFG_DATA'};
230 $self->result_files($result_files)if $result_files;
231 $self->experiment_date($exp_date) if $exp_date;
232 $self->description($desc) if $desc; #experiment
233 ## $self->feature_set_description($fset_desc) if $fset_desc;
234
235 #$assm_version || throw('Mandatory param -assembly not met');
236 #Only required if setting DB by params e.g db not passed or generated from reg
237 #i.e. most of the time
238 #Why was this made mandatory?
239 #Default to dnadb||efgdb assm from the dbname
240
241 $self->{'design_type'} = $design_type || 'binding_site_identification'; #remove default?
242 $self->{'output_dir'} = $output_dir if $output_dir; #config default override
243 $self->{'input_dir'} = $input_dir if $input_dir; #config default override
244 $self->farm($farm) if $farm;
245 $self->batch_job($batch_job);
246 $self->prepared($prepared);
247 ## $self->{'ssh'} = $ssh || 0;
248 ## $self->{'_dump_fasta'} = $fasta || 0;
249 #$self->{'recover'} = $recover || 0; Now in BaseImporter
250 #check for ~/.ensembl_init to mirror general EnsEMBL behaviour
251 ## $self->{'reg_config'} = $reg_config || ((-f "$ENV{'HOME'}/.ensembl_init") ? "$ENV{'HOME'}/.ensembl_init" : undef);
252 $self->{'old_dvd_format'} = $old_dvd_format || 0;
253 ## $self->{'ucsc_coords'} = $ucsc_coords || 0;
254 ## $self->{'verbose'} = $verbose || 0;
255 ## $self->{'release'} = $release;
256
257
258
259 ## if($reg_host && $self->{'reg_config'}){
260 ## warn "You have specified registry parameters and a config file:\t".$self->{'reg_config'}.
261 ## "\nOver-riding config file with specified paramters:\t${reg_user}@${reg_host}:$reg_port";
262 ## }
263
264
265 #Will a general norm method be applicable for all imports?
266 #Already casued problems with Bed imports... remove?
267 #Could set NORM_METHOD in Parser!!
268 #warn "Need to fully implement norm_method is validate_mage, remove ENV NORM_METHOD?";
269 $self->{'norm_method'} = $norm_method; # || $ENV{'NORM_METHOD'};
270
271 #if ($self->vendor ne 'NIMBLEGEN'){
272 # $self->{'no_mage'} = 1;
273 # warn "Hardcoding no_mage for non-NIMBLEGEN imports";
274 # }
275
276
277 # if($self->{'no_mage'} && $self->{'write_mage'}){
278 # throw('-no_mage and -write_mage options are mutually exclusive, please select just one');
279 # }
280
281 ## #### Set up DBs and load and reconfig registry
282 ##
283 ## ### Load Registry
284 ## #Can we load the registry using the assembly version, then just redefine the efg DB?
285 ## #We have problems here if we try and load on a dev version, where no dev DBs are available on ensembldb
286 ## #Get the latest API version for the assembly we want to use
287 ## #Then load the registry from that version
288 ## #Then we can remove some of the dnadb setting code below?
289 ## #This may cause problems with API schema mismatches
290 ## #Can we just test whether the current default dnadb contains the assembly?
291 ## #Problem with this is that this will not have any other data e.g. genes etc
292 ## #which may be required for some parsers
293 ##
294 ## #How does the registry pick up the schema version??
295 ##
296 ## #We should really load the registry first given the dnadb assembly version
297 ## #Then reset the eFG DB as appropriate
298 ##
299 ##
300 ## if ($reg_host || ! defined $self->{'_reg_config'}) {
301 ## #defaults to current ensembl DBs
302 ## $reg_host ||= 'ensembldb.ensembl.org';
303 ## $reg_user ||= 'anonymous';
304 ##
305 ## #Default to the most recent port for ensdb
306 ## if( (! $reg_port) &&
307 ## ($reg_host eq 'ensdb-archive') ){
308 ## $reg_port = 5304;
309 ## }
310 ##
311 ## #This will try and load the dev DBs if we are using v49 schema or API?
312 ## #Need to be mindful about this when developing
313 ## #we need to tip all this on it's head and load the reg from the dnadb version!!!!!!!
314 ##
315 ## my $version_text= ($self->{'release'}) ? 'version '.$self->{'release'} : 'current version';
316 ## $self->log("Loading $version_text registry from $reg_user".'@'.$reg_host);
317 ##
318 ## #Note this defaults API version, hence running with head code
319 ## #And not specifying a release version will find not head version
320 ## #DBs on ensembldb, resulting in an exception from reset_DBAdaptor below
321 ## $reg->load_registry_from_db(
322 ## -host => $reg_host,
323 ## -user => $reg_user,
324 ## -port => $reg_port,
325 ## -pass => $reg_pass,
326 ## #-host => "ens-staging",
327 ## #-user => 'ensro',
328 ## -db_version => $self->{'release'},#51
329 ## -verbose => $self->verbose,
330 ## );
331 ##
332 ## throw('Not sensible to set the import DB as the default eFG DB from '.$reg_host.', please define db params') if ((! $dbname) && (! $db));
333 ## }
334 ## else{
335 ## $self->log("Loading registry from:\t".$self->{'_reg_config'});
336 ## $reg->load_all($self->{'_reg_config'}, 1);
337 ## }
338 ##
339 ##
340 ## #Need to test the DBs here, as we may not have loaded any!
341 ## #get_alias wil fail otherwise?
342 ## #This is a cyclical dependancy as we need alias to get species which we use to grab the DB
343 ## #alias is dependant on core DB being loaded with relevant meta entries.
344 ## #revise this when we split the Importer
345 ##
346 ##
347 ## #Validate species
348 ## my $alias = $reg->get_alias($species) || throw("Could not find valid species alias for $species\nYou might want to clean up:\t".$self->get_dir('output'));
349 ## $self->species($alias);
350 ## $self->{'param_species'} = $species;#Only used for dir generation
351 ##
352 ##
353 ## #SET UP DBs
354 ## if($db){
355 ## #db will have been defined before reg loaded, so will be present in reg
356 ##
357 ## if(! (ref($db) && $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){
358 ## $self->throw('-db must be a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor');
359 ## }
360 ## }
361 ## else{ #define eFG DB from params or registry
362 ##
363 ## if($reg_db){#load eFG DB from reg
364 ##
365 ## if($dbname){
366 ## throw("You cannot specify DB params($dbname) and load from the registry at the same time.");
367 ## }
368 ##
369 ## $self->log('WARNING: Loading eFG DB from Registry');
370 ## $db = $reg->get_DBAdaptor($self->species(), 'funcgen');
371 ## throw("Unable to retrieve ".$self->species." funcgen DB from the registry") if ! $db;
372 ## }
373 ## else{#resets the eFG DB in the custom or generic registry
374 ##
375 ## $dbname || throw('Must provide a -dbname if not using default custom registry config');
376 ## #$user || throw('Must provide a -user parameter');#make this default to EFG_WRITE_USER?
377 ## $pass || throw('Must provide a -pass parameter');
378 ##
379 ## #remove this and throw?
380 ## if(! defined $host){
381 ## $self->log('WARNING: Defaulting to localhost');
382 ## $host = 'localhost';
383 ## }
384 ##
385 ## $port ||= 3306;
386 ## my $host_ip = '127.0.0.1';#is this valid for all localhosts?
387 ##
388 ## if ($self->{'ssh'}) {
389 ## $host = `host localhost`; #mac specific? nslookup localhost wont work on server/non-PC
390 ## #will this always be the same?
391 ##
392 ## if (! (exists $ENV{'EFG_HOST_IP'})) {
393 ## warn "Environment variable EFG_HOST_IP not set for ssh mode, defaulting to $host_ip for $host";
394 ## } else {
395 ## $host_ip = $ENV{'EFG_HOST_IP'};
396 ## }
397 ##
398 ## if ($self->host() ne 'localhost') {
399 ## warn "Overriding host ".$self->host()." for ssh connection via localhost($host_ip)";
400 ## }
401 ## }
402 ##
403 ##
404 ## #data version is only used if we don't want to define the dbname
405 ## #This should never be guessed so don't need data_version here
406 ## #$dbname ||= $self->species()."_funcgen_".$self->data_version();
407 ##
408 ##
409 ## #Remove block below when we can
410 ## my $dbhost = ($self->{'ssh'}) ? $host_ip : $host;
411 ##
412 ## #This isn't set yet!?
413 ## #When we try to load, say 49, when we only have 48 on ensembldb
414 ## #This fails because there is not DB set for v49, as it is not on ensembl DB
415 ## #In this case we need to load from the previous version
416 ## #Trap this and suggest using the -schema_version/release option
417 ## #Can we autodetect this and reload the registry?
418 ## #We want to reload the registry anyway with the right version corresponding to the dnadb
419 ## #We could either test for the db in the registry or just pass the class.
420 ##
421 ## $db = $reg->reset_DBAdaptor($self->species(), 'funcgen', $dbname, $dbhost, $port, $self->user, $pass,
422 ## {
423 ## -dnadb_host => $reg_host,
424 ## -dnadb_port => $reg_port,
425 ## -dnadb_assembly => $assm_version,
426 ## -dnadb_user => $reg_user,
427 ## -dnadb_pass => $reg_pass,
428 ## });
429 ##
430 ##
431 ## #ConfigRegistry will try and set this
432 ## #This will fail if there is already one in the registry as it will try
433 ## #and defined a new unique species so as not to overwrite the original
434 ## #e.g. homo_sapiens1
435 ##
436 ## #This is why it was orignally written backwards as we can't easily dynamically redefine
437 ## #an adaptor in the registry without ConfigRegistry trying to change the name
438 ## #the very act of creating a new db to redefine the registry with causes ConfigRegistry
439 ## #to try and register it with a unique species name
440 ##
441 ## #Delete the old funcgen DB from the registry first
442 ## #$reg->remove_DBAdaptor($self->species, 'funcgen');
443 ##
444 ## #ConfigRegistry will automatically configure this new db
445 ##
446 ## #$db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(
447 ## # -user => $user,
448 ## # -host => ($self->{'ssh'}) ? $host_ip : $host,
449 ## # -port => $port,
450 ## # -pass => $pass,
451 ## # #we need to pass dbname else we can use non-standard dbs
452 ## # -dbname => $dbname,
453 ## # -species => $self->species(),
454 ## # -group => 'funcgen',
455 ## # );
456 ##
457 ##
458 ## #if we get a species like homo_sapiens1 here
459 ## #This is because ConfigRegistry is try to make the dbname different between the
460 ## #one already present and the one you're trying to add
461 ## }
462 ## }
463 ##
464 ##
465 ## ### VALIDATE DNADB
466 ## #This is now done in DBAdaptor
467 ##
468 ## #We can change this to just use the assembly version
469 ## #we could even have the wordy assmelby version from the meta table
470 ## #do the standard ensembl subs
471 ## #s/[A-Za-z]//g;
472 ## #s/\.//g;
473 ## #And then validate?
474 ## #Just stick to number version for now.
475 ##
476 ##
477 ## #Now we need to set the dnadb_host params to avoid ensembldb defaults
478 ## #This should check the registry first
479 ## #Then load from the registry db?
480 ## #If we have a multi host registry config file this isn't going to work!
481 ##
482 ## #Is this required anymore as the DBAdaptor handles this?
483 ## #Not if we pass a db with an incorrect dnadb attached.
484 ##
485 ## #if($db->_get_schema_build($db->dnadb()) !~ /_[0-9]+_${assm_version}[a-z]*$/){
486 ### my $warning = "WARNING: dnadb does not match assembly_version $assm_version. Using ensembldb.enembl.org to define the dnadb";
487 ### $warning .= ' rather than the reg_config' if (defined $self->{'_reg_config'});
488 ##
489 ## #We need to account for reg_config DBs which may have custom info in
490 ## #So try reg_config host first, then try ensembldb with warning
491 ## #Could have a reg_config only flag for core dbs
492 ## #Need to implement more params in set_dnadb_by_assembly_version
493 ### $self->log($warning);
494 ##
495 ### $db->set_dnadb_by_assembly_version($assm_version);
496 ### }
497 ##
498 ##
499 ##
500 ##
501 ## #Test connections
502 ## $self->db($db);
503 ## $db->dbc->db_handle;
504 ## $db->dnadb->dbc->db_handle;
505 ## #Set re/disconnect options
506 ##
507 ## #These really need setting dependant on the import parser
508 ## $db->dbc->disconnect_when_inactive(1);
509 ## $db->dnadb->dbc->disconnect_when_inactive(1);
510
511
512
513 ## ### Check analyses/feature_type/cell_type
514 ## if($feature_analysis){
515 ## my $fanal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($feature_analysis);
516 ## throw("The Feature Analysis $feature_analysis does not exist in the database") if(!$fanal);
517 ## $self->feature_analysis($fanal);
518 ##
519 ## #This currently fails before the config gets loaded!
520 ## #Need to load config before this validation!
521 ## }
522 ##
523 ## if($ctype_name){
524 ## my $ctype = $self->db->get_CellTypeAdaptor->fetch_by_name($ctype_name);
525 ## throw("The CellType $ctype_name does not exist in the database") if(!$ctype);
526 ## $self->cell_type($ctype);
527 ## }
528 ##
529 ## if ($ftype_name) {
530 ## my $ftype = $self->db->get_FeatureTypeAdaptor->fetch_by_name($ftype_name);
531 ## throw("The FeatureType $ftype_name does not exist in the database") if(!$ftype);
532 ## $self->feature_type($ftype);
533 ## }
534
535
536 #Set config here instead?
537 #So we can check all mandatory params
538 #Set vendor specific attr dependent vars
539
540 #Generic input dir
541 $self->{'input_dir'} ||= $self->get_dir("data").'/input/'.$self->{'param_species'}.'/'.$self->vendor().'/'.$self->name();
542
543 if (! -d $self->get_dir('input')) {
544
545 if (@{$self->result_files}) {
546 #This is really InputSet specific
547 #Could go in init_experiment_import
548 $self->log("Processing files:\n\t\t".join("\n\t\t",@{$self->result_files}));
549 } else {
550 throw('input_dir is not defined or does not exist ('.$self->get_dir('input').')');
551 }
552 }
553
554 #Parser specific config
555 $self->set_config();
556
557
558 $self->debug(2, "Importer class instance created.");
559 $self->debug_hash(3, \$self);
560
561 return ($self);
562 }
563
564 =head2 registry_host
565
566 Example : my $reg_host = $imp->registry_host;
567 Description: Accessor for registry host attribute
568 Returntype : string e.g. ensembldb.ensembl.org
569 Exceptions : None
570 Caller : general
571 Status : at risk
572
573 =cut
574
575 sub registry_host{
576 return $_[0]->{'reg_host'};
577 }
578
579 =head2 registry_user
580
581 Example : my $reg_user = $imp->registry_user;
582 Description: Accessor for registry user attribute
583 Returntype : string e.g. ensembldb.ensembl.org
584 Exceptions : None
585 Caller : general
586 Status : at risk
587
588 =cut
589
590 sub registry_user{
591 return $_[0]->{'reg_user'};
592 }
593
594 =head2 registry_port
595
596 Example : my $reg_port = $imp->registry_port;
597 Description: Accessor for registry port attribute
598 Returntype : string e.g. ensembldb.ensembl.org
599 Exceptions : None
600 Caller : general
601 Status : at risk
602
603 =cut
604
605 sub registry_port{
606 return $_[0]->{'reg_port'};
607 }
608
609 =head2 registry_pass
610
611 Example : my $reg_pass = $imp->registry_pass;
612 Description: Accessor for registry pass attribute
613 Returntype : string e.g. ensembldb.ensembl.org
614 Exceptions : None
615 Caller : general
616 Status : at risk
617
618 =cut
619
620 sub registry_pass{
621 return $_[0]->{'reg_pass'};
622 }
623
624
625 #init method kept separate from new due to differing madatory check and set up
626
627 =head2 init_array_import
628
629 Example : $self->init_import();
630 Description: Initialises import by creating working directories
631 and by storing the Experiemnt
632 Returntype : none
633 Exceptions : warns and throws depending on recover and Experiment status
634 Caller : general
635 Status : at risk - merge with register_array_design
636
637 =cut
638
639 sub init_array_import{
640
641 my ($self) = shift;
642
643 # we need to define which paramters we'll be storing
644 #use the logic names of the analyses as the field headers
645
646 #need to test for vendor here
647
648 #Sanger, NIMBLEGEN(no design_id issue, could get from the ndf, but we want it in the DesignNotes.txt)
649 #Then we can change the Array/Chip generation to soley use the DesignNotes.txt rather than SampleKey
650 #which is experiment specific
651 #or eFG format.
652
653 $self->create_output_dirs('caches', 'fastas');
654
655
656 }
657
658
659 =head2 init_experiment_import
660
661 Example : $self->init_import();
662 Description: Initialises import by creating working directories
663 and by storing the Experiemnt
664 Returntype : none
665 Exceptions : warns and throws depending on recover and Experiment status
666 Caller : general
667 Status : at risk - merge with register exeriment
668
669 =cut
670
671 sub init_experiment_import{
672 my ($self) = shift;
673
674 #Change this to take config mandatory params?
675 #No specific to exp import
676 #Name is used in set_config anyway
677 #Currently we only have array and experiment import, both of which should have names
678 #Make mandatory?
679
680 foreach my $tmp ("group", "data_dir") { #name now generically mandatory
681 throw("Mandatory arg $tmp not been defined") if (! defined $self->{$tmp});
682 }
683 #Should we separate path on group here too, so we can have a dev/test group?
684
685 #Create output dirs
686 #This should be moved to the Parser to avoid generating directories which are needed for different imports
687 $self->create_output_dirs('raw', 'norm', 'caches', 'fastas');
688 throw("No result_files defined.") if (! defined $self->result_files());
689
690 #Log input files
691 #if (@{$self->result_files()}) {
692 # $self->log("Found result files arguments:\n\t".join("\n\t", @{$self->result_files()}));
693 #}
694 #This is done in new
695
696 #check for cell||feature and warn if no met file supplied?
697
698 if ($self->norm_method) {
699 my $norm_anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($self->norm_method);
700
701 #should we list the valid analyses?
702 throw($self->norm_method.' is not a valid analysis') if ! $norm_anal;
703 $self->norm_analysis($norm_anal);
704 } else {
705 $self->log('WARNING: No normalisation analysis specified');
706 }
707
708 #warn "Need to check env vars here or in Parser or just after set_config?";
709 #Need generic method for checking ENV vars in Helper
710 #check for ENV vars?
711 #R_LIBS
712 #R_PATH if ! farm
713 #R_FARM_PATH
714
715 $self->validate_group(); #import experimental_group
716
717 #Get experiment
718 my $exp_adaptor = $self->db->get_ExperimentAdaptor();
719 my $exp = $exp_adaptor->fetch_by_name($self->name()); #, $self->group());
720 $self->process_experiment_config if $self->can('process_experiment_config'); #Parsers::MAGE::process_experiment_config
721
722 #Moved MAGE support form here to MAGE.pm
723
724 #Recovery now set so deal with experiment
725 if ($self->recovery() && ($exp)) {
726 $self->log("Using previously stored Experiment:\t".$exp->name);
727 } elsif ((! $self->recovery()) && $exp) {
728 throw("Your experiment name is already registered in the database, please choose a different \"name\", this will require renaming you input directory, or specify -recover if you are working with a failed/partial import.");
729 #can we skip this and store, and then check in register experiment if it is already stored then throw if not recovery
730 } else { # (recover && exp) || (recover && ! exp)
731
732
733 $exp = Bio::EnsEMBL::Funcgen::Experiment->new(
734 -EXPERIMENTAL_GROUP => $self->{egroup},
735 -NAME => $self->name(),
736 -DATE => $self->experiment_date(),
737 -PRIMARY_DESIGN_TYPE => $self->design_type(),
738 -DESCRIPTION => $self->description(),
739 -ADAPTOR => $self->db->get_ExperimentAdaptor(),
740 );
741
742 ($exp) = @{$exp_adaptor->store($exp)};
743 }
744
745
746 $self->experiment($exp);
747
748 #remove and add specific report, this is catchig some Root stuff
749 #$self->log("Initiated efg import with following parameters:\n".Data::Dumper::Dumper(\$self));
750
751 return;
752 }
753
754
755 #Move this to new or init_experiment
756
757 =head2 validate_group
758
759 Example : $self->validate_group();
760 Description: Validates groups details
761 Returntype : none
762 Exceptions : throws if insufficient info defined to store new Group and is not already present
763 Caller : general
764 Status : Medium - check location and contact i.e. group name clash?
765
766 =cut
767
768 sub validate_group{
769 my ($self) = shift;
770
771 my $egroup = $self->db->get_ExperimentalGroupAdaptor->fetch_by_name($self->group);
772
773 if (! defined $egroup) {
774
775 throw("validate_group does not yet fully support ExperimentalGroups, please add manually");
776
777 #if ($self->location() && $self->contact()) {
778 # $self->db->import_group($self->group(), $self->location, $self->contact());
779 #} else {
780 # throw("Group ".$self->group()." does not exist, please specify a location and contact to register the group");
781 #}
782 }
783
784 $self->{egroup} = $egroup;
785
786 return;
787 }
788
789 =head2 create_output_dirs
790
791 Example : $self->create_output_dirs();
792 Description: Does what it says on the tin, creates dirs in
793 the root output dir foreach @dirnames, also set paths in self
794 Arg [1] : mandatory - list of dir names
795 Returntype : none
796 Exceptions : none
797 Caller : general
798 Status : Medium - add throw?
799
800 =cut
801
802 sub create_output_dirs{
803 my ($self, @dirnames) = @_;
804
805 #output dir created in control script
806 #avoids errors when logs generated first
807
808
809 foreach my $name (@dirnames) {
810
811 if ($name eq 'caches') {
812 $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/".$self->db->dbc->dbname() if(! defined $self->{"${name}_dir"});
813 } elsif ($name eq 'fastas') {
814 $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/" if(! defined $self->{"${name}_dir"});
815 } else {
816 $self->{"${name}_dir"} = $self->get_dir('output')."/${name}" if(! defined $self->{"${name}_dir"});
817 }
818
819 if (! (-d $self->get_dir($name) || (-l $self->get_dir($name)))) {
820 $self->log("Creating directory:\t".$self->get_dir($name));
821 #This did not throw with mkdir!!
822 mkpath $self->get_dir($name) || throw('Failed to create directory: '. $self->get_dir($name));
823 chmod 0744, $self->get_dir($name);
824 }
825 }
826
827 return;
828 }
829
830 =head2 vendor
831
832 Example : $imp->vendor("NimbleGen");
833 Description: Getter/Setter for array vendor
834 Arg [1] : optional - vendor name
835 Returntype : string
836 Exceptions : none
837 Caller : general
838 Status : Stable
839
840 =cut
841
842 sub vendor{
843 my ($self) = shift;
844
845 if (@_) {
846 $self->{'vendor'} = shift;
847 $self->{'vendor'} = uc($self->{'vendor'});
848 }
849
850 return $self->{'vendor'};
851 }
852
853
854 =head2 feature_type
855
856 Example : $imp->feature_type($ftype);
857 Description: Getter/Setter for Experiment FeatureType
858 Arg [1] : optional - Bio::EnsEMBL::Funcgen::FeatureType
859 Returntype : Bio::EnsEMBL::Funcgen::FeatureType
860 Exceptions : Throws if arg is not valid or stored
861 Caller : general
862 Status : at risk
863
864 =cut
865
866 sub feature_type{
867 my ($self) = shift;
868
869 if (@_) {
870 my $ftype = shift;
871
872 #do we need this as we're checking in new?
873 if (! ($ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType') && $ftype->dbID())) {
874 throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::FeatureType");
875 }
876
877 $self->{'feature_type'} = $ftype;
878 }
879
880 return $self->{'feature_type'};
881 }
882
883 =head2 feature_analysis
884
885 Example : $imp->feature_analysis($fanal);
886 Description: Getter/Setter for Analysis used for creating the imported Features
887 Arg [1] : optional - Bio::EnsEMBL::Analysis
888 Returntype : Bio::EnsEMBL::Analysis
889 Exceptions : Throws if arg is not valid or stored
890 Caller : general
891 Status : at risk
892
893 =cut
894
895 sub feature_analysis{
896 my ($self) = shift;
897
898 if (@_) {
899 my $fanal = shift;
900
901 #do we need this as we're checking in new?
902 if (! (ref ($fanal) && $fanal->isa('Bio::EnsEMBL::Analysis') && $fanal->dbID())) {
903 throw("Must pass a valid stored Bio::EnsEMBL::Analysis");
904 }
905
906 $self->{'feature_analysis'} = $fanal;
907 }
908
909 return $self->{'feature_analysis'};
910 }
911
912 =head2 norm_analysis
913
914 Example : $imp->norm_analysis($anal);
915 Description: Getter/Setter for the normalisation analysis
916 Arg [1] : optional - Bio::EnsEMBL::Analysis
917 Returntype : Bio::EnsEMBL::Analysis
918 Exceptions : Throws if arg is not valid or stored
919 Caller : general
920 Status : at risk
921
922 =cut
923
924 sub norm_analysis{
925 my ($self) = shift;
926
927 if (@_) {
928 my $anal = shift;
929
930 #do we need this as we're checking in new?
931 if (! (ref($anal) && $anal->isa('Bio::EnsEMBL::Analysis') && $anal->dbID())) {
932 throw("Must pass a valid stored Bio::EnsEMBL::Analysis");
933 }
934
935 $self->{'norm_analysis'} = $anal;
936 }
937
938 return $self->{'norm_analysis'};
939 }
940
941
942
943 =head2 cell_type
944
945 Example : $imp->cell_type($ctype);
946 Description: Getter/Setter for Experiment CellType
947 Arg [1] : optional - Bio::EnsEMBL::Funcgen::CellType
948 Returntype : Bio::EnsEMBL::Funcgen::CellType
949 Exceptions : Throws if arg is not valid or stored
950 Caller : general
951 Status : at risk
952
953 =cut
954
955 sub cell_type{
956 my ($self) = shift;
957
958 if (@_) {
959 my $ctype = shift;
960
961 #do we need this as we're checking in new?
962 if (! ($ctype->isa('Bio::EnsEMBL::Funcgen::CellType') && $ctype->dbID())) {
963 throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::CellType");
964 }
965
966 $self->{'cell_type'} = $ctype;
967 }
968
969 return $self->{'cell_type'};
970 }
971
972
973 ##=head2 ucsc_coords
974 ##
975 ## Example : $start += 1 if $self->ucsc_coords;
976 ## Description: Getter for UCSC coordinate usage flag
977 ## Returntype : boolean
978 ## Exceptions : none
979 ## Caller : general
980 ## Status : at risk
981 ##
982 ##=cut
983 ##
984 ##sub ucsc_coords{
985 ## my $self = shift;
986 ## return $self->{'ucsc_coords'};
987 ##}
988
989
990
991 =head2 array_file
992
993 Example : my $array_file = $imp->array_file();
994 Description: Getter/Setter for sanger/design array file
995 Arg [1] : optional - path to adf or gff array definition/mapping file
996 Returntype : string
997 Exceptions : none
998 Caller : general
999 Status : at risk
1000
1001 =cut
1002
1003 sub array_file{
1004 my ($self) = shift;
1005 $self->{'array_file'} = shift if(@_);
1006 return $self->{'array_file'};
1007 }
1008
1009 =head2 array_name
1010
1011 Example : my $array_name = $imp->array_name();
1012 Description: Getter/Setter for array name
1013 Arg [1] : optional string - name of array
1014 Returntype : string
1015 Exceptions : none
1016 Caller : general
1017 Status : at risk
1018
1019 =cut
1020
1021 sub array_name{
1022 my ($self) = shift;
1023 $self->{'array_name'} = shift if(@_);
1024 return $self->{'array_name'};
1025 }
1026
1027
1028 =head2 array_set
1029
1030 Example : $imp->array_set(1);
1031 Description: Getter/Setter for array set flag
1032 Arg [1] : optional boolean - treat all array chips as the same array
1033 Returntype : boolean
1034 Exceptions : none
1035 Caller : general
1036 Status : at risk
1037
1038 =cut
1039
1040 sub array_set{
1041 my ($self) = shift;
1042 $self->{'array_set'} = shift if(@_);
1043 return $self->{'array_set'};
1044 }
1045
1046
1047 =head2 add_Array
1048
1049 Arg [1] : Bio::EnsEMBL::Funcgen::Array
1050 Example : $self->add_Array($array);
1051 Description: Setter for array elements
1052 Returntype : none
1053 Exceptions : throws if passed non Array or if more than one Array set
1054 Caller : Importer
1055 Status : At risk - Implement multiple arrays? Move to Experiment?
1056
1057 =cut
1058
1059 sub add_Array{
1060 my $self = shift;
1061
1062 #do we need to check if stored?
1063 if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Array')) {
1064 throw("Must supply a Bio::EnsEMBL::Funcgen::Array");
1065 } elsif (@_) {
1066 push @{$self->{'arrays'}}, @_;
1067 }
1068
1069 throw("Does not yet support multiple array imports") if(scalar (@{$self->{'arrays'}}) > 1);
1070 #need to alter read_probe data at the very least
1071
1072 return;
1073 }
1074
1075
1076
1077 =head2 arrays
1078
1079 Example : foreach my $array(@{$imp->arrays}){ ...do an array of things ...};
1080 Description: Getter for the arrays attribute
1081 Returntype : ARRAYREF
1082 Exceptions : none
1083 Caller : general
1084 Status : at risk
1085
1086 =cut
1087
1088 sub arrays{
1089 my $self = shift;
1090
1091 if (! defined $self->{'arrays'}) {
1092 $self->{'arrays'} = $self->db->get_ArrayAdaptor->fetch_all_by_Experiment($self->experiment());
1093 }
1094
1095 return $self->{'arrays'};
1096 }
1097
1098
1099
1100 =head2 location
1101
1102 Example : $imp->vendor("Hinxton");
1103 Description: Getter/Setter for group location
1104 Arg [1] : optional - location
1105 Returntype : string
1106 Exceptions : none
1107 Caller : general
1108 Status : Stable
1109
1110 =cut
1111
1112
1113 sub location{
1114 my ($self) = shift;
1115 $self->{'location'} = shift if(@_);
1116 return $self->{'location'};
1117 }
1118
1119
1120 =head2 contact
1121
1122 Example : my $contact = $imp->contact();
1123 Description: Getter/Setter for the group contact
1124 Arg [1] : optional - contact name/email/address
1125 Returntype : string
1126 Exceptions : none
1127 Caller : general
1128 Status : Stable
1129
1130 =cut
1131
1132 sub contact{
1133 my ($self) = shift;
1134 $self->{'contact'} = shift if(@_);
1135 return $self->{'contact'};
1136 }
1137
1138 =head2 name
1139
1140 Example : $imp->name('Experiment1');
1141 Description: Getter/Setter for the experiment name
1142 Arg [1] : optional - experiment name
1143 Returntype : string
1144 Exceptions : none
1145 Caller : general
1146 Status : Stable
1147
1148 =cut
1149
1150
1151 sub name{
1152 my ($self) = shift;
1153 $self->{'name'} = shift if(@_);
1154 return $self->{'name'};
1155 }
1156
1157 =head2 result_files
1158
1159 Example : $imp->result_files(\@files);
1160 Description: Getter/Setter for the result file paths
1161 Arg [1] : Listref of file paths
1162 Returntype : Listref
1163 Exceptions : none
1164 Caller : general
1165 Status : At risk
1166
1167 =cut
1168
1169
1170 sub result_files{
1171 my ($self) = shift;
1172 $self->{'result_files'} = shift if(@_);
1173 return $self->{'result_files'};
1174 }
1175
1176
1177
1178
1179
1180 =head2 experiment_date
1181
1182 Example : $imp->experiment_date('2006-11-02');
1183 Description: Getter/Setter for the experiment date
1184 Arg [1] : optional - date string in yyyy-mm-dd
1185 Returntype : string
1186 Exceptions : none
1187 Caller : general
1188 Status : At risk
1189
1190 =cut
1191
1192
1193
1194 sub experiment_date{
1195 my ($self) = shift;
1196
1197 if (@_) {
1198 my $date = shift;
1199
1200 if ($date !~ /[0-9]{4}-[0-9]{2}[0-9]{2}/o) {
1201 throw('Parameter -experiment_date needs to fe in the format: YYYY-MM-DD');
1202 }
1203
1204 $self->{'experiment_date'} = $date;
1205 } elsif ($self->vendor() eq "nimblegen" && ! defined $self->{'experiment_date'}) {
1206 $self->{'experiment_date'} = &get_date("date", $self->get_config("chip_file")),
1207 }
1208
1209 return $self->{'experiment_date'};
1210 }
1211
1212
1213
1214 =head2 group
1215
1216 Example : my $exp_group = $imp->group();
1217 Description: Getter/Setter for the group name
1218 Arg [1] : optional - group name
1219 Returntype : string
1220 Exceptions : none
1221 Caller : general
1222 Status : Stable
1223
1224 =cut
1225
1226 sub group{
1227 my ($self) = shift;
1228 $self->{'group'} = shift if(@_);
1229 return $self->{'group'};
1230 }
1231
1232
1233 =head2 description
1234
1235 Example : $imp->description("Human chrX H3 Lys 9 methlyation");
1236 Description: Getter/Setter for the experiment element
1237 Arg [1] : optional - experiment description
1238 Returntype : string
1239 Exceptions : none
1240 Caller : general
1241 Status : Stable
1242
1243 =cut
1244
1245 sub description{
1246 my $self = shift;
1247
1248 if (@_) {
1249 $self->{'description'} = shift;
1250 }
1251
1252 return $self->{'description'};
1253 }
1254
1255 ##=head2 feature_set_description
1256 ##
1257 ## Example : $imp->description("ExperimentalSet description");
1258 ## Description: Getter/Setter for the FeatureSet description for an
1259 ## InputSet import e.g. preprocessed GFF/Bed data
1260 ## Arg [1] : optional - string feature set description
1261 ## Returntype : string
1262 ## Exceptions : none
1263 ## Caller : general
1264 ## Status : At risk
1265 ##
1266 ##=cut
1267 ##
1268 ##sub feature_set_description{
1269 ## my $self = shift;
1270 ##
1271 ## $self->{'feature_set_description'} = shift if @_;
1272 ##
1273 ## return $self->{'feature_set_description'};
1274 ##}
1275
1276 =head2 format
1277
1278 Example : $imp->format("Tiled");
1279 Description: Getter/Setter for the array format
1280 Arg [1] : optional - array format
1281 Returntype : string
1282 Exceptions : none
1283 Caller : general
1284 Status : Stable
1285
1286 =cut
1287
1288 sub format{
1289 my ($self) = shift;
1290 $self->{'format'} = shift if(@_);
1291 return $self->{'format'};
1292 }
1293
1294 =head2 experiment
1295
1296 Example : my $exp = $imp->experiment();
1297 Description: Getter/Setter for the Experiment element
1298 Arg [1] : optional - Bio::EnsEMBL::Funcgen::Experiment
1299 Returntype : Bio::EnsEMBL::Funcgen::Experiment
1300 Exceptions : throws if arg is not an Experiment
1301 Caller : general
1302 Status : Stable
1303
1304 =cut
1305
1306 sub experiment{
1307 my ($self) = shift;
1308
1309 if (@_) {
1310
1311 if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Experiment')) {
1312 throw("Must pass a Bio::EnsEMBL::Funcgen::Experiment object");
1313 }
1314
1315 $self->{'experiment'} = shift;
1316 }
1317
1318 return $self->{'experiment'};
1319 }
1320
1321 ##=head2 db
1322 ##
1323 ## Example : $imp->db($funcgen_db);
1324 ## Description: Getter/Setter for the db element
1325 ## Arg [1] : optional - Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor
1326 ## Returntype : Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor
1327 ## Exceptions : throws if arg is not an DBAdaptor
1328 ## Caller : general
1329 ## Status : Stable
1330 ##
1331 ##=cut
1332 ##
1333 ##sub db{
1334 ## my $self = shift;
1335 ##
1336 ## if (defined $_[0] && $_[0]->isa("Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor")) {
1337 ## $self->{'db'} = shift;
1338 ## } elsif (defined $_[0]) {
1339 ## throw("Need to pass a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor");
1340 ## }
1341 ##
1342 ## return $self->{'db'};
1343 ##}
1344 ##
1345 ##=head2 pass
1346 ##
1347 ## Example : $imp->pass("password");
1348 ## Description: Getter/Setter for the db password
1349 ## Arg [1] : optional - db password
1350 ## Returntype : string
1351 ## Exceptions : none
1352 ## Caller : general
1353 ## Status : Stable
1354 ##
1355 ##=cut
1356 ##
1357 ##
1358 ##sub pass{
1359 ## my $self = shift;
1360 ## $self->{'pass'} = shift if(@_);
1361 ## return $self->{'pass'};
1362 ##}
1363 ##
1364 ##=head2 pass
1365 ##
1366 ## Example : $imp->host("hoastname");
1367 ## Description: Getter/Setter for the db hostname
1368 ## Arg [1] : optional - db hostname
1369 ## Returntype : string
1370 ## Exceptions : none
1371 ## Caller : general
1372 ## Status : Stable
1373 ##
1374 ##=cut
1375 ##
1376 ##sub host{
1377 ## my $self = shift;
1378 ## $self->{'host'} = shift if(@_);
1379 ## return $self->{'host'};
1380 ##}
1381 ##
1382 ##=head2 port
1383 ##
1384 ## Example : $imp->port(3306);
1385 ## Description: Getter/Setter for the db port number
1386 ## Arg [1] : optional - db port number
1387 ## Returntype : int
1388 ## Exceptions : none
1389 ## Caller : general
1390 ## Status : Stable
1391 ##
1392 ##=cut
1393 ##
1394 ##sub port{
1395 ## my $self = shift;
1396 ## $self->{'port'} = shift if(@_);
1397 ## return $self->{'port'};
1398 ##}
1399 ##
1400 ##=head2 user
1401 ##
1402 ## Example : $imp->user("user_name");
1403 ## Description: Getter/Setter for the db user name
1404 ## Arg [1] : optional - db user name
1405 ## Returntype : string
1406 ## Exceptions : none
1407 ## Caller : general
1408 ## Status : Stable
1409 ##
1410 ##=cut
1411 ##
1412 ##sub user{
1413 ## my $self = shift;
1414 ## $self->{'user'} = shift if(@_);
1415 ## return $self->{'user'};
1416 ##}
1417
1418 ##=head2 dump_fasta
1419 ##
1420 ## Example : if($self->dump_fasta()){...do fasta dump...}
1421 ## Description: Getter/Setter for the dump_fasta flag
1422 ## Arg [1] : optional - 0 or 1
1423 ## Returntype : boolean
1424 ## Exceptions : none
1425 ## Caller : self
1426 ## Status : Stable
1427 ##
1428 ##=cut
1429 ##
1430 ##
1431 ##sub dump_fasta{
1432 ## my $self = shift;
1433 ## $self->{'_dump_fasta'} = shift if @_;
1434 ## return $self->{'_dump_fasta'};
1435 ##}
1436 ##
1437
1438
1439 ##=head2 species
1440 ##
1441 ## Example : $imp->species("homo_sapiens");
1442 ## Description: Getter/Setter for species
1443 ## Arg [1] : optional - species name(alias?)
1444 ## Returntype : string
1445 ## Exceptions : none ? throw if no alias found?
1446 ## Caller : general
1447 ## Status : Medium - may move reg alias look up to this method
1448 ##
1449 ##=cut
1450 ##
1451 ##sub species{
1452 ## my $self = shift;
1453 ##
1454 ## #should we do reg alias look up here?
1455 ## #Will we ever want to redefine species?
1456 ## #Change to just getter?
1457 ##
1458 ## $self->{'species'} = shift if(@_);
1459 ##
1460 ## return $self->{'species'};
1461 ##}
1462
1463 =head2 get_dir
1464
1465 Example : $imp->get_dir("import");
1466 Description: Retrieves full path for given directory
1467 Arg [1] : mandatory - dir name
1468 Returntype : string
1469 Exceptions : none
1470 Caller : general
1471 Status : at risk - move to Helper?
1472
1473 =cut
1474
1475 sub get_dir{
1476 my ($self, $dirname) = @_;
1477 return $self->get_data("${dirname}_dir");
1478 }
1479
1480 =head2 norm_method
1481
1482 Example : my $norm_method = $imp->norm_method()
1483 Description: Getter/Setter for normalisation method
1484 Arg [1] : mandatory - method name
1485 Returntype : string
1486 Exceptions : none ? throw if no analysis with logic name
1487 Caller : general
1488 Status : At risk - restrict to logic_name and validate against DB, allow multiple
1489
1490 =cut
1491
1492 #Move to Nimblegen?
1493 #Do we ever want to normalise other data?
1494
1495 sub norm_method{
1496 my $self = shift;
1497
1498 if (@_) {
1499 $self->{'norm_method'} = shift;
1500 } elsif (! defined $self->{'norm_method'}) {
1501 $self->{'norm_method'}= $self->get_config('norm_method');
1502 }
1503
1504 return $self->{'norm_method'};
1505 }
1506
1507
1508 =head2 get_config
1509
1510 Arg [1] : mandatory - name of the data element to retrieve from the config hash
1511 Example : %dye_freqs = %{$imp->get_config('dye_freqs')};
1512 Description: returns data from the definitions hash
1513 Returntype : various
1514 Exceptions : none
1515 Caller : Importer
1516 Status : at risk - replace with direct calls in the inherited Defs class?
1517
1518 =cut
1519
1520
1521 sub get_config{
1522 my ($self, $data_name) = @_;
1523 return $self->get_data('config', $data_name); #will this cause undefs?
1524 }
1525
1526
1527
1528
1529
1530 =head2 register_experiment
1531
1532 Example : $imp->register_experiment()
1533 Description: General control method, performs all data import and normalisations
1534 Arg [1] : optional - dnadb DBAdaptor
1535 Returntype : none
1536 Exceptions : throws if arg is not Bio::EnsEMBL::DBSQL::DBAdaptor
1537 Caller : general
1538 Status : Medium
1539
1540 =cut
1541
1542
1543 #Need to split this method?
1544 #Pre farm process
1545 #Define and store all sets,
1546 #pre process input file once if required
1547 #How are we going to be able to tell wether this has been done successfully?
1548 #runner will catch error, therefore safe to assume that file is complete if farm job
1549 #unless we are not running with farm
1550
1551 #farm specific processes
1552 #actually parse and store data
1553
1554 #Can we separate these for different import types?
1555
1556 sub register_experiment{
1557 my ($self) = shift;
1558
1559 #Need to check for dnadb passed with adaptor to contructor
1560 if (@_) {
1561
1562 if ( ! $_[0]->isa("Bio::EnsEMBL::DBSQL::DBAdaptor")) {
1563 throw("You need to pass a valid dnadb adaptor to register the experiment");
1564 }
1565 $self->db->dnadb($_[0]);
1566 } elsif ( ! $self->db) {
1567 throw("You need to pass/set a DBAdaptor with a DNADB attached of the relevant data version");
1568 }
1569
1570 #This could still be the default core db for the current version
1571 #warn here if not passed DB?
1572 #These should be vendor independent, only the read methods should need specific order?
1573
1574 $self->init_experiment_import();
1575 #can we just have init here instead?
1576
1577
1578 #This could do with a rewrite to move some things to the parsers
1579 #$self::SUPER->register_experiment
1580
1581 $self->write_validate_experiment_config if $self->can('write_validate_experiment_config');
1582
1583 #This is too array specific!
1584 #Can we have an array of import_methods in the config?
1585 #foreach my $method(@{$self->get_config{'import_methods'}}){
1586 #$self->method;
1587 #}
1588 #We're already doing this in read_data for each of these data_types
1589
1590 #Need to be able to run this separately, so we can normalise previously imported sets with different methods
1591 #should be able t do this without raw data files e.g. retrieve info from DB
1592 #Is this implemented?
1593
1594 $self->read_data("probe");
1595 $self->read_data("results");
1596
1597
1598 my $norm_method = $self->norm_method();
1599
1600 if (defined $norm_method) {
1601 warn "norm method is $norm_method";
1602
1603 $self->R_norm($norm_method);
1604 #change this to $self->$norm_method
1605 #so we can have non-R_norm normalisation
1606 }
1607
1608
1609 return;
1610 }
1611
1612 #Move array specific ones to Nimblegen.pm?
1613 #Also used by ArrayDesign and Sanger.pm
1614 #So need to create Array.pm baseclass, which is a Helper.
1615
1616 =head2 store_set_probes_features
1617
1618 Arg [1] : mandatory - array chip id
1619 Arg [2] : optional - Bio::EnsEMBL::Funcgen::ProbeSet
1620 Arg [3] : mandatory - hashref of keys probe id, values are
1621 hash of probe/features with values
1622 Bio::EnsEMBL::Funcgen::Probe/Features for a given
1623 probe set if defined.
1624 Example : $self->store_set_probes_features($ac->dbID(), $ops, \%pfs);
1625 Description: Stores probe set, probes and probe features
1626 Returntype : none
1627 Exceptions : none
1628 Caller : self
1629 Status : Medium
1630
1631 =cut
1632
1633 sub store_set_probes_features{
1634 my ($self, $ac_id, $pf_hash, $ops) = @_;
1635
1636 ### Deal with ProbeSets
1637 if ($ops) {
1638 $ops->size(scalar(keys %$pf_hash));
1639 ($ops) = $self->db->get_ProbeSetAdaptor->store($ops);
1640 }
1641
1642 #If we're going to validate fully, we need to check for probes in this probeset on this array chip
1643 #Update size if we have any new probes
1644 #Overkill? Only do on recover? Do not read if array chip is IMPORTED
1645 #This does not make any attempt to validate probes/set vs previously stored data
1646
1647 for my $probe_id (keys %$pf_hash) {
1648
1649 #set probeset in probe and store
1650 #the process corresponding feature
1651 my $probe = $pf_hash->{$probe_id}->{'probe'};
1652 $probe->probeset($ops) if $ops;
1653 ($probe) = @{$self->db->get_ProbeAdaptor->store($probe)};
1654
1655 #Can't use get_all_Arrays here as we can't guarantee this will only ever be the array we've generated
1656 #Might dynamically load array if non-present
1657 #This is allowing multiple dbIDs per probe??? Is this wrong?
1658 #$self->cache_probe_info($probe->get_probename(), $probe->dbID());###########Remove as we're now importing all then resolving
1659
1660
1661 foreach my $feature (@{$pf_hash->{$probe_id}->{'features'}}) {
1662 $feature->probe($probe);
1663 ($feature) = @{$self->db->get_ProbeFeatureAdaptor->store($feature)};
1664 }
1665 }
1666
1667 undef $ops; #Will this persist in the caller?
1668 undef %{$pf_hash};
1669
1670 return;
1671 }
1672
1673
1674 =head2 cache_slice
1675
1676 Arg [0] : string - region_name e.g. X
1677 Arg [1] : optional - coordinate system name e.g. supercontig, defaults to chromosome
1678 Example : my $slice = $self->cache_slice(12);
1679 Description: Caches or retrieves from cache a given slice
1680 Returntype : Bio::EnsEMBL::Slice
1681 Exceptions : throws f no region name specified
1682 Caller : self
1683 Status : At risk
1684
1685 =cut
1686
1687 sub cache_slice{
1688 my ($self, $region_name, $cs_name, $total_count) = @_;
1689
1690 throw("Need to define a region_name to cache a slice from") if ! $region_name;
1691 $self->{'slice_cache'} ||= {};
1692 $region_name =~ s/chr//;
1693 $region_name = "MT" if $region_name eq "M";
1694
1695 if (! exists ${$self->{'seen_slice_cache'}}{$region_name}) {
1696 my $slice = $self->slice_adaptor->fetch_by_region($cs_name, $region_name);
1697
1698 #Set seen cache so we don't try this again
1699 $self->{seen_slice_cache}{$region_name} = $slice;
1700
1701 if (! $slice) {
1702 warn("-- Could not generate a slice for ${cs_name}:$region_name\n");
1703 } else {
1704
1705 my $sr_name = $slice->seq_region_name; #In case we passed a slice name
1706
1707 if (@{$self->{seq_region_names}}) {
1708 return if ! grep(/^${sr_name}$/, @{$self->{seq_region_names}}); #not on required slice
1709 }
1710 }
1711
1712 $self->{'slice_cache'}->{$region_name} = $slice;
1713 }
1714
1715 if ($total_count && exists ${$self->{'seen_slice_cache'}}{$region_name}) {
1716 #This is an InputSet specific method
1717 $self->count('total_features') if $self->can('count');
1718 }
1719
1720 #Only return if exists to avoid creating hash key
1721 return (exists $self->{'slice_cache'}->{$region_name}) ? $self->{'slice_cache'}->{$region_name} : undef;
1722 }
1723
1724 =head2 slice_cache
1725
1726 Example : my @seen_slices = values(%{$self->slice_cache});;
1727 Description: Returns the slice cache i.e. all the Slices seen in the data filtered
1728 by the defined slices. This method can be used to run only the appropriate
1729 slice jobs after a prepare stage.
1730 Returntype : Hashref of seq_region name Bio::EnsEMBL::Slice pairs
1731 Exceptions : None
1732 Caller : self
1733 Status : At risk
1734
1735 =cut
1736
1737
1738 sub slice_cache{
1739 my $self = shift;
1740
1741 return $self->{'slice_cache'};
1742 }
1743
1744
1745
1746
1747 =head2 cache_probe_info
1748
1749 Arg [0] : mandatory - probe name
1750 Arg [1] : mandatory - probe dbID
1751 Arg [2] : optioanl int - x coord of probe on array
1752 Arg [3] : optional int - y coord of probe on array
1753 Example : $self->cache_probe_info("Probe1", $probe->dbID());
1754 Or for result files which do not have X & Y, we need to cache
1755 X & Y from the design files: $self->cache_probe_info('Probe2', $probe->dbID(), $x, $y);
1756 Description: Setter for probe cache values
1757 Returntype : none
1758 Exceptions : throws is cache conflict encountered
1759 Caller : self
1760 Status : At risk - merge with following?
1761
1762 =cut
1763
1764 sub cache_probe_info{
1765 my ($self, $pname, $pid, $x, $y) = @_;
1766
1767 throw('Deprecated, too memory expensive, now resolving DB duplicates and using Tied File cache');
1768 throw("Must provide a probe name and id") if (! defined $pname || ! defined $pid);
1769
1770
1771 #do we need to loop through the file here?
1772 #if we make sure we're testing for a previous dbID before storing probes then we don't need to do this
1773 #we can catch the error when we get the probe id as we can check for >1 id for the same probe name
1774 #if (defined $self->{'_probe_cache'}->{$pname} && ($self->{'_probe_cache'}->{$pname}->[0] != $pid)) {
1775 # throw("Found two differing dbIDs for $pname, need to sort out redundant probe entries");
1776 #}
1777
1778 $self->{'_probe_cache'}->{$pname} = (defined $x && defined $y) ? [$pid, $x, $y] : [$pid];
1779
1780 return;
1781 }
1782
1783
1784 =head2 get_probe_id_by_name_Array
1785
1786 Arg [1] : mandatory - probe name
1787 Example : $pid = $self->get_probe_id_by_name($pname);
1788 Description: Getter for probe cache values
1789 Returntype : int
1790 Exceptions : none
1791 Caller : self
1792 Status : At risk - merge with previous, move to importer?
1793
1794 =cut
1795
1796 sub get_probe_id_by_name_Array{
1797 my ($self, $name, $array) = @_;
1798
1799 #this is only ever called for fully imported ArrayChips, as will be deleted if recovering
1800 $self->resolve_probe_data() if(! exists $self->{'_probe_cache'}{$array->name()});
1801
1802 #we want to cycle through the given cache starting from the last position or 0.
1803 #we don't want to have to test for the size of the cache each time as this will be quite expensive
1804 #so we should store sizes separately along with current position
1805
1806
1807 my ($pid, $line);
1808
1809 #check current line
1810 if ($line = $self->{'_probe_cache'}{$array->name()}{'current_line'}) {
1811 if ($line =~ /^\Q${name}\E\t/) {
1812 $pid = (split/\t/o, $line)[1];
1813 }
1814 }
1815
1816
1817 if (! $pid) {
1818 while ($line = $self->{'_probe_cache'}{$array->name()}{'handle'}->getline()) {
1819
1820 if ($line =~ /^\Q${name}\E\t/) {
1821 $pid = (split/\t/o, $line)[1];
1822 $self->{'_probe_cache'}{$array->name()}{'current_line'} = $line;
1823 last;
1824 }
1825 }
1826 }
1827
1828 #do not remove this
1829 if (! $pid) {
1830 throw("Did not find probe name ($name) in cache, cache may need rebuilding, results may need sorting, or do you have an anomolaous probe?")
1831 } else {
1832 chomp $pid;
1833 }
1834
1835 return $pid;
1836 }
1837
1838 =head2 get_probe_cache_by_Array
1839
1840 Arg[1] : Bio::EnsEMBL::Funcgen::Array
1841 Arg[2] : boolean - from db flag, only to be used by Importer->resolve_probe_data !
1842 Example : $self->get_probe_cache_by_Array();
1843 Description: Gets the probe info cache which is an array tied to a file
1844 Returntype : Boolean - True if cache has been generated and set successfully
1845 Exceptions : none
1846 Caller : general
1847 Status : At risk
1848
1849 =cut
1850
1851 #from db flag should only be used by importer
1852 #this is because there is no guarantee that it will be resolved unless
1853 #called by resolve_probe_data
1854 #which then renames the file and resets the handle
1855 #can we clean this up and protect/hide this functionality?
1856 #can we check the cache file name in the get methods and throw if it contains unresolved?
1857 #make this private?
1858
1859 sub get_probe_cache_by_Array{
1860 my ($self, $array, $from_db) = @_;
1861
1862 my $msg = "Getting probe cache for ".$array->name();
1863 $msg .= " from DB" if $from_db;
1864 $self->log($msg); #, 1);
1865
1866 if (! ($array && $array->isa('Bio::EnsEMBL::Funcgen::Array') && $array->dbID())) {
1867 throw('Must provide a valid stored Bio::EnsEMBL::Funcgen::Array object');
1868 }
1869
1870 my $set = 0;
1871 my $cache_file = $self->get_dir('caches').'/'.$array->name().'.probe_cache';
1872
1873 ### Generate and resolve fresh cache from DB
1874 if ($from_db) {
1875
1876 $cache_file .= '.unresolved'; #This will be renamed by the caller if it is resolved
1877
1878 if (exists $self->{'_probe_cache'}{$array->name()}) {
1879 $self->log('Rebuilding probe_cache from DB for '.$array->name(), 1);
1880
1881
1882 #untie @{$self->{'_probe_cache'}{$array->name()}{'entries'}};
1883 #close($self->{'_probe_cache'}{$array->name()}{'handle'});#do we need to do this?
1884 delete $self->{'_probe_cache'}{$array->name()}; #implicitly closes
1885 $self->log('Deleted old cache', 1);
1886 } else {
1887 $self->log('Building probe_cache from DB for '.$array->name(), 1);
1888 }
1889
1890 #Move this to ProbeAdaptor?
1891 #This is where we'd set the unique key for a vendor and resolves duplicates based on the key
1892 my $cmd = 'SELECT name, probe_id from probe WHERE array_chip_id IN ('.join(',', @{$array->get_array_chip_ids()}).') ORDER by name, probe_id';
1893 $cmd = 'mysql '.$self->db->connect_string()." -e \"$cmd\" >".$cache_file;
1894 run_system_cmd($cmd);
1895
1896 }
1897
1898 ### Set cache
1899 if (-f $cache_file) {
1900 $self->log('MD5 check here?',1);
1901 $self->{'_probe_cache'}{$array->name()}{'current_line'} = undef;
1902 $self->{'_probe_cache'}{$array->name()}{'handle'} = open_file($cache_file);
1903
1904 #can we do a select count instead? and do this instead of the MD5?
1905 #$cmd = "wc -l $cache_file";
1906 #my $size = `$cmd`;
1907
1908 $set = 1;
1909 } else {
1910 warn 'Failed to get probe cache for array:'.$array->name();
1911 }
1912
1913 return $set;
1914 }
1915
1916
1917 #should reorganise these emthods to split reading the array data, and the actual data
1918 #currently:
1919 #meta reads array and chip data
1920 #probe reads probe_set, probes, which should definitely be in array, probe_feature? and results
1921 #native data format may not map to these methods directly, so may need to call previous method if required data not defined
1922
1923
1924 =head2 read_data
1925
1926 Example : $self->read_data("probe")
1927 Description: Calls each method in data_type array from config hash
1928 Arg [1] : mandatory - data type
1929 Returntype : none
1930 Exceptions : none
1931 Caller : self
1932 Status : At risk
1933
1934 =cut
1935
1936 sub read_data{
1937 my($self, $data_type) = @_;
1938
1939 map {my $method = "read_${_}_data"; $self->$method()} @{$self->get_config("${data_type}_data")};
1940 return;
1941 }
1942
1943
1944 =head2 design_type
1945
1946 Example : $self->design_type("binding_site_identification")
1947 Description: Getter/Setter for experimental design type
1948 Arg [1] : optional - design type
1949 Returntype : string
1950 Exceptions : none
1951 Caller : general
1952 Status : At risk
1953
1954 =cut
1955
1956 sub design_type{
1957 my $self = shift;
1958 return $self->{'design_type'};
1959 }
1960
1961
1962 =head2 get_chr_seq_region_id
1963
1964 Example : $seq_region_id = $self->get_seq_region_id('X');
1965 Description: Calls each method in data_type array from config hash
1966 Arg [1] : mandatory - chromosome name
1967 Arg [2] : optional - start value
1968 Arg [3] : optional - end value
1969 Returntype : int
1970 Exceptions : none
1971 Caller : self
1972 Status : At risk
1973
1974 =cut
1975
1976 #convinience wrapper method
1977 #could we use the seq region cache instead?
1978 #this seems like a lot of overhead for getting the id
1979 sub get_chr_seq_region_id{
1980 my ($self, $chr, $start, $end) = @_;
1981 #what about strand info?
1982
1983 #do we need the start and stop?
1984
1985 #use start and stop to prevent problems with scaffodl assemblies, i.e. >1 seq_region_id
1986 #my $slice = $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end);
1987 #we could pass the slice back to the slice adaptor for this, to avoid dbid problems betwen DBs
1988
1989 ###would need to implement other cs's here
1990
1991 return $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end)->get_seq_region_id();
1992 }
1993
1994
1995
1996 =head2 vsn_norm
1997
1998 Example : $self->vsn_norm();
1999 Description: Convinience/Wrapper method for vsn R normalisation
2000 Returntype : none
2001 Exceptions : none
2002 Caller : general
2003 Status : At risk
2004
2005 =cut
2006
2007 #Have Norm class or contain methods in importer?
2008 #Need to have analysis set up script for all standard analyses.
2009
2010 sub vsn_norm{
2011 my $self = shift;
2012 return $self->R_norm("VSN_GLOG");
2013 }
2014
2015
2016 =head2 farm
2017
2018 Arg [1] : Boolean
2019 Example : $importer->farm(1);
2020 Description: Flag to turn farm submission on
2021 Returntype : Boolean
2022 Exceptions : Throws is argument not a boolean
2023 Caller : general
2024 Status : At risk
2025
2026 =cut
2027
2028
2029 sub farm{
2030 my ($self, $farm) = @_;
2031
2032 $self->{'farm'} ||= undef; #define farm
2033
2034 if (defined $farm) {
2035 throw("Argument to farm must be a boolean 1 or 0") if(! ($farm == 1 || $farm == 0));
2036 $self->{'farm'} = $farm;
2037 }
2038
2039 return $self->{'farm'};
2040
2041 }
2042
2043 =head2 batch_job
2044
2045 Arg [1] : Boolean
2046 Example : $importer->batch_job(1);
2047 Description: Flag to turn on batch_job status
2048 Returntype : Boolean
2049 Exceptions : Throws is argument not a boolean
2050 Caller : general
2051 Status : At risk
2052
2053 =cut
2054
2055
2056 sub batch_job{
2057 my ($self, $batch_job) = @_;
2058
2059 #$self->{'batch_job'} ||= undef;
2060
2061 if (defined $batch_job) {
2062 throw("Argument to batch_job must be a boolean 1 or 0") if(! ($batch_job == 1 || $batch_job == 0));
2063 $self->{'batch_job'} = $batch_job;
2064 }
2065
2066 return $self->{'batch_job'};
2067
2068 }
2069
2070 =head2 prepared
2071
2072 Arg [1] : Boolean
2073 Example : $importer->prepared(1);
2074 Description: Flag to turn on prepared file status
2075 This signifies that the files have been previously imported
2076 using prepare mode and may not match the InputSubset names
2077 Returntype : Boolean
2078 Exceptions : None
2079 Caller : general
2080 Status : At risk
2081
2082 =cut
2083
2084
2085 sub prepared{
2086 my ($self, $prepared) = @_;
2087 $self->{'prepared'} = $prepared if (defined $prepared);
2088 return $self->{'prepared'};
2089 }
2090
2091
2092 =head2 R_norm
2093
2094 Example : $self->R_norm(@logic_names);
2095 Description: Performs R normalisations for given logic names
2096 Returntype : none
2097 Exceptions : Throws if R exits with error code or if data not not valid for analysis
2098 Caller : general
2099 Status : At risk
2100
2101 =cut
2102
2103 sub R_norm{
2104 my ($self, @logic_names) = @_;
2105 #This currently normalises a single two colour chip at a time
2106 #rather than normalising across a set of chip
2107 #also does in sets of analyses
2108 #good for keeping data separate, but not efficient in terms of querying
2109 #convert to use one script which only queries for each echip once, then does each anal
2110
2111
2112 my $aa = $self->db->get_AnalysisAdaptor();
2113 my $rset_adaptor = $self->db->get_ResultSetAdaptor();
2114 my $ra_id = $aa->fetch_by_logic_name("RawValue")->dbID();
2115 my %r_config = (
2116 "VSN_GLOG" => {( libs => ['vsn'],
2117 #norm_method => 'vsn',
2118 )},
2119
2120 "T.Biweight" => {(
2121 libs => ['affy'],
2122 #norm_method => 'tukey.biweight',
2123 )},
2124 );
2125
2126
2127 foreach my $logic_name (@logic_names) {
2128 #throw("Not yet implemented TukeyBiweight") if $logic_name eq "Tukey_Biweight";
2129
2130 #this has already been chcecked and set as the norm_analysis
2131 #need to resolve this multi method approach
2132 my $norm_anal = $aa->fetch_by_logic_name($logic_name);
2133
2134
2135
2136 #This only creates an RSet for the IMPORT set
2137 #So if we re-run with a different analysis
2138 #tab2mage will have already been validated
2139 #So RSet generation will be skipped
2140 #We need to recreate the each non-import RSet for this norm analysis
2141
2142 #This also means the RSets are being created before the data has been imported
2143 #This avoids having to parse tab2mage each time but means we have an uncertain status of these Rsets
2144
2145 my $rset = $self->get_import_ResultSet($norm_anal, 'experimental_chip');
2146
2147 my @chips = ();
2148
2149 if (! $rset) {
2150 $self->log("All ExperimentalChips already have status:\t${logic_name}");
2151 } else { #Got data to normalise and import
2152 my @dbids;
2153 my $R_file = $self->get_dir("norm")."/${logic_name}.R";
2154 my $job_name = $self->experiment->name()."_${logic_name}";
2155 my $resultfile = $self->get_dir("norm")."/result.${logic_name}.txt";
2156 my $outfile = $self->get_dir("norm")."/${logic_name}.out";
2157
2158 #How do we get farm job output i.e. run time memusage
2159 #from interactive job?
2160 #This assumes R_PATH
2161 my $errfile = $self->get_dir("norm")."/${logic_name}.err";
2162
2163 #Let's build this better so we capture the farm output aswell as the job output.
2164 my $cmdline = "$ENV{'R_PATH'} --no-save < $R_file"; # >$errfile 2>&1";
2165 #-K option waits for job to complete
2166 my $bsub = "bsub -K -J $job_name ".$ENV{'R_BSUB_OPTIONS'}.
2167 " -e $errfile -o $outfile $ENV{'R_FARM_PATH'} CMD BATCH $R_file";
2168
2169 #Can we separate the out and err for commandline?
2170 my $r_cmd = (! $self->farm()) ? "$cmdline >$outfile 2>&1" : $bsub;
2171
2172 $self->backup_file($resultfile); #Need to do this as we're appending in the loop
2173
2174 #setup qurey
2175 #warn "Need to add host and port here";
2176 #Set up DB, defaults and libs for each logic name
2177 my $query = "options(scipen=20);library(RMySQL);library(Ringo);";
2178 #scipen is to prevent probe_ids being converted to exponents
2179 #Ringo is for default QC
2180
2181 #foreach my $ln(@logic_names){
2182
2183 foreach my $lib (@{$r_config{$logic_name}{'libs'}}) {
2184 $query .= "library($lib);";
2185 }
2186 #}
2187
2188 $query .= "con<-dbConnect(dbDriver(\"MySQL\"), host=\"".$self->db->dbc->host()."\", port=".$self->db->dbc->port().", dbname=\"".$self->db->dbc->dbname()."\", user=\"".$self->db->dbc->username()."\"";
2189
2190 #should use read only pass here as we are printing this to file
2191 $query .= ", pass=\"".$self->db->dbc->password."\")\n";
2192
2193
2194 #Build queries for each chip
2195 foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) {
2196
2197
2198 #should implement logic name here?
2199 #can't as we need seperate ResultSet for each
2200
2201
2202 if ($echip->has_status($logic_name)) {
2203 $self->log("ExperimentalChip ".$echip->unique_id()." already has status:\t$logic_name");
2204 } else {
2205
2206 #warn "Need to roll back here if recovery, as norm import may fail halfway through";
2207
2208 push @chips, $echip;
2209 my $cc_id = $rset->get_chip_channel_id($echip->dbID());
2210
2211 #if ($self->recovery()){
2212 # $self->log('Rolling back results for ExperimentalChip('.$echip->dbID().") $logic_name");
2213 # $self->db->rollback_results($cc_id) if $self->recovery();
2214 # }
2215
2216 $self->log("Building $logic_name R cmd for ".$echip->unique_id());
2217 @dbids = ();
2218
2219 foreach my $chan (@{$echip->get_Channels()}) {
2220
2221 if ($chan->type() eq "EXPERIMENTAL") {
2222 push @dbids, $chan->dbID();
2223 } else {
2224 unshift @dbids, $chan->dbID();
2225 }
2226 }
2227
2228 throw("vsn does not accomodate more than 2 channels") if (scalar(@dbids > 2) && $logic_name eq "VSN_GLOG");
2229
2230 #should do some of this with maps?
2231 #HARDCODED metric ID for raw data as one
2232
2233 #Need to get total and experimental here and set db_id accordingly
2234 #can probably do this directly into one df
2235
2236 $query .= "c1<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as CONTROL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[0]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n";
2237
2238 $query .= "c2<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as EXPERIMENTAL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[1]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n";
2239
2240 #Can we define some of the basic structures here and reuse in the QC and each norm method?
2241
2242
2243 #Is this going to eat up memory?
2244 #can we strip out and separate the data from c1 and c2 into RGList and
2245 #individual vector for probe_ids, then rm c1 and c2 to free up memory
2246
2247 #create RGList object
2248 $query .= "R<-as.matrix(c1['CONTROL_score'])\nG<-as.matrix(c2['EXPERIMENTAL_score'])\n";
2249 $query .= "genes<-cbind(c1['PROBE_ID'], c1['X'], c1['Y'])\n";
2250 $query .= "testRG<-new('RGList', list(R=R, G=G, genes=genes))\n";
2251
2252
2253 #QC plots here before doing norm
2254
2255 #open pdf device
2256 $query .= "pdf('".$self->get_dir('norm').'/'.$echip->unique_id."_QC.pdf', paper='a4', height = 15, width = 9)\n";
2257 #set format
2258 $query .= "par(mfrow = c(2,2), font.lab = 2)\n";
2259
2260 #Channel densisties
2261 #These need limma or Ringo
2262 $query .= "plotDensities(testRG)\n";
2263
2264 #MvA Plot
2265
2266 $query .= 'meanLogA<-((log(testRG$R, base=exp(2)) + log(testRG$G, base=exp(2)))/2)'."\n";
2267 $query .= 'logIntRatioM<-(log(testRG$R, base=exp(2)) - log(testRG$G, base=exp(2)))'."\n";
2268 $query .= "yMin<-min(logIntRatioM)\n";
2269 $query .= "yMax<-max(logIntRatioM)\n";
2270
2271
2272
2273 #Need to validate yMax here
2274 #If is is Inf then we need to sort the vector and track back until we find the high real number
2275 #count number of Infs and note on MvA plot
2276 $query .= "infCount<-0\n";
2277 $query .= "if( yMax == Inf){; sortedM<-sort(logIntRatioM); lengthM<-length(logIntRatioM); indexM<-lengthM\n"
2278 ."while (yMax == Inf){; indexM<-(indexM-1); yMax<-sortedM[indexM];}; infCount<-(lengthM-indexM);}\n";
2279
2280 #
2281 $query .= "if(infCount == 0){\n";
2282 $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'")'."\n";
2283 $query .= "} else {\n";
2284 $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'", sub=paste(infCount, " Inf values not plotted"));'."}\n";
2285
2286
2287 #$query .= 'plot(log(testRG$R*testRG$G, base=exp(2))/2, log(testRG$R/testRG$G, base=exp(2)),xlab="A",ylab="M",pch=".",ylim=c(-3,3), main="'.$echip->unique_id."\")\n";
2288
2289 #Plate plots
2290 $query .= 'image(testRG, 1, channel = "green", mycols = c("black", "green4", "springgreen"))'."\n";
2291 $query .= 'image(testRG, 1, channel = "red", mycols = c("black", "green4", "springgreen"))'."\n";
2292
2293 $query .= "dev.off()\n";
2294 #Finished QC pdf printing
2295
2296
2297
2298 #The simple preprocess step of Ringo is actually vsn, so we can nest these in line
2299
2300
2301 ### Build Analyses cmds ###
2302
2303 if ($logic_name eq 'T.Biweight') {
2304
2305 #log2 ratios
2306 $query .= 'lr_df<-cbind((log(c2["EXPERIMENTAL_score"], base=exp(2)) - log(c1["CONTROL_score"], base=exp(2))))'."\n";
2307
2308 #Adjust using tukey.biweight weighted average
2309 #inherits first col name
2310 $query .= 'norm_df<-(lr_df["EXPERIMENTAL_score"]-tukey.biweight(as.matrix(lr_df)))'."\n";
2311 $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", norm_df[,1]), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n";
2312
2313 } elsif ($logic_name eq 'VSN_GLOG') {
2314 #could do this directly
2315 $query .= "raw_df<-cbind(c1[\"CONTROL_score\"], c2[\"EXPERIMENTAL_score\"])\n";
2316 #variance stabilise
2317 $query .= "norm_df<-vsn(raw_df)\n";
2318
2319
2320 #do some more calcs here and print report?
2321 #fold change exponentiate? See VSN docs
2322 #should do someplot's of raw and glog and save here?
2323 #set log func and params
2324 #$query .= "par(mfrow = c(1, 2)); log.na = function(x) log(ifelse(x > 0, x, NA));";
2325 #plot
2326 #$query .= "plot(exprs(glog_df), main = \"vsn\", pch = \".\");".
2327 # "plot(log.na(exprs(raw_df)), main = \"raw\", pch = \".\");";
2328 #FAILS ON RAW PLOT!!
2329 #par(mfrow = c(1, 2))
2330 #> meanSdPlot(nkid, ranks = TRUE)
2331 #> meanSdPlot(nkid, ranks = FALSE)
2332
2333
2334 #Now create table structure with glog values(diffs)
2335 #3 sig dec places on scores(doesn't work?!)
2336 $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", (exprs(norm_df[,2]) - exprs(norm_df[,1]))), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n";
2337
2338 }
2339 #load back into DB
2340 #c3results<-cbind(rep("", length(c3["probe_id"])), c3["probe_id"], c3["c3_score"], rep(1, length(c3["probe_id"])), rep(1, length(c3["probe_id"])))
2341 #may want to use safe.write here
2342 #dbWriteTable(con, "result", c3results, append=TRUE)
2343 #dbWriteTable returns true but does not load any data into table!!!
2344
2345 $query .= "write.table(formatted_df, file=\"${resultfile}\", sep=\"\\t\", col.names=FALSE, row.names=FALSE, quote=FALSE, append=TRUE)\n";
2346
2347 #tidy up here??
2348 }
2349 }
2350
2351 $query .= "q();";
2352
2353 open(RFILE, ">$R_file") || throw("Cannot open $R_file for writing");
2354 print RFILE $query;
2355 close(RFILE);
2356
2357 my $submit_text = "Submitting $logic_name job";
2358 $submit_text .= ' to farm' if $self->farm;
2359 $self->log("${submit_text}:\t".localtime());
2360 run_system_cmd($r_cmd);
2361 $self->log("Finished $logic_name job:\t".localtime());
2362 $self->log('See '.$self->get_dir('norm').' for ExperimentalChip QC files');
2363
2364 #Now load file and update status
2365 #Import directly here to avoid having to reparse all results if we crash!!!!
2366 $self->log("Importing:\t$resultfile");
2367 $self->db->load_table_data("result", $resultfile);
2368 $self->log("Finishing importing:\t$resultfile");
2369
2370
2371 foreach my $echip (@chips) {
2372 $echip->adaptor->store_status($logic_name, $echip);
2373 }
2374
2375 #Recreate all non-import RSets for analysis if not already present
2376 #
2377
2378 my $rset_a = $self->db->get_ResultSetAdaptor();
2379 my %seen_rsets;
2380
2381 foreach my $anal_rset (@{$rset_a->fetch_all_by_Experiment($self->experiment)}) {
2382 next if($anal_rset->name =~ /_IMPORT$/o);
2383 next if(exists $seen_rsets{$anal_rset->name});
2384 next if $anal_rset->analysis->logic_name eq $norm_anal->logic_name;
2385 $seen_rsets{$rset->name} = 1;
2386 $anal_rset->analysis($norm_anal);
2387 $anal_rset->{'dbID'} = undef;
2388 $anal_rset->{'adaptor'} = undef;
2389
2390 #add the chip_channel_ids from the new anal IMPORT set
2391 foreach my $table_id (@{$anal_rset->table_ids}) {
2392 $anal_rset->{'table_id_hash'}{$table_id} = $rset->get_chip_channel_id($table_id);
2393 }
2394
2395 $self->log('Adding new ResultSet '.$anal_rset->name.' with analysis '.$norm_anal->logic_name);
2396 $rset_a->store($anal_rset);
2397 }
2398
2399
2400
2401 }
2402 }
2403
2404 return;
2405 }
2406
2407 #can we sub this? args: table_name, logic_name
2408 #also use result_set_name
2409 #would also clean all data for result set if recovery
2410 #return would be result_set
2411 #Can we extend this to incorporate InputSet parser define_sets?
2412
2413 sub get_import_ResultSet{
2414 my ($self, $anal, $table_name) = @_;
2415
2416 if (!($anal && $anal->isa("Bio::EnsEMBL::Analysis") && $anal->dbID())) {
2417 throw("Must provide a valid stored Bio::EnsEMBL::Analysis");
2418 }
2419
2420 $self->log("Getting import $table_name ResultSet for analysis:\t".$anal->logic_name());
2421
2422 my ($rset, @new_chip_channels);
2423 my $result_adaptor = $self->db->get_ResultSetAdaptor();
2424 my $logic_name = $anal->logic_name;
2425 my $status = ($logic_name eq "RawValue") ? "IMPORTED" : $logic_name;
2426
2427 if (($logic_name) eq 'RawValue' && ($table_name eq 'experimental_chip')) {
2428 throw("Cannot have an ExperimentalChip ResultSet with a RawValue analysis, either specify 'channel' or another analysis");
2429 }
2430
2431 #Build IMPORT Set for $table_name
2432 foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) {
2433
2434 #clean chip import and generate rset
2435
2436 if ($table_name eq 'experimental_chip') {
2437
2438 if ($echip->has_status($status)) { #this translates to each channel have the IMPORTED_RawValue status
2439 $self->log("ExperimentalChip(".$echip->unique_id().") already has status:\t".$status);
2440 } else {
2441 $self->log("Found ExperimentalChip(".$echip->unique_id().") without status $status");
2442
2443 push @new_chip_channels, $echip;
2444 }
2445
2446 } else { #channel
2447
2448 foreach my $chan (@{$echip->get_Channels()}) {
2449
2450 if ($chan->has_status($status)) { #this translates to each channel have the IMPORTED_RawValue status
2451 $self->log("Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") already has status:\t".$status);
2452 } else {
2453 $self->log("Found Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") without status $status");
2454 push @new_chip_channels, $chan;
2455 }
2456 }
2457 }
2458
2459 if (( ! $rset) && @new_chip_channels) {
2460 my(@tmp) = @{$result_adaptor->fetch_all_by_name_Analysis($self->name()."_IMPORT", $anal)};
2461
2462 if (scalar(@tmp) > 1) {
2463 throw('Found more than one IMPORT ResultSet for '.$self->name().'_IMPORT with analysis '.$logic_name);
2464 }
2465
2466 $rset = shift @tmp;
2467
2468
2469 #do we need to throw here if not recovery?
2470 #what if we want the import result set elsewhere during the first import?
2471
2472 #if ($self->recovery()) {
2473
2474 #fetch by anal and experiment_id
2475 #Need to change this to result_set.name!
2476 # warn("add chip set handling here");
2477
2478 #my @tmp = @{$result_adaptor->fetch_all_by_Experiment_Analysis($self->experiment(), $anal)};
2479 #throw("Found more than one ResultSet for Experiment:\t".$self->experiment->name()."\tAnalysis:\t".$anal->logic_name().')' if (scalar(@tmp) >1);
2480 #$rset = $tmp[0];
2481
2482 #warn "fetching rset with ".$self->name()."_IMPORT ". $anal->logic_name;
2483
2484 #$rset = $result_adaptor->fetch_by_name_Analysis($self->name()."_IMPORT", $anal);
2485 warn("Warning: Could not find recovery ResultSet for analysis ".$logic_name) if ! $rset;
2486 #}
2487
2488 if (! $rset) {
2489 $self->log("Generating new ResultSet for analysis ".$logic_name);
2490
2491 $rset = Bio::EnsEMBL::Funcgen::ResultSet->new
2492 (
2493 -analysis => $anal,
2494 -table_name => $table_name,
2495 -name => $self->name()."_IMPORT",
2496 -feature_type => $self->feature_type(),
2497 -cell_type => $self->cell_type(),
2498 );
2499
2500 #These types should be set to NULL during the MAGE-XML validation if we have more than one type in an experiment
2501
2502 ($rset) = @{$result_adaptor->store($rset)};
2503 }
2504 }
2505 }
2506
2507 #do we need this here as we're rolling back in the read methods?
2508 #we only want to roll back those chips/channels which have not been registered
2509
2510 if ($self->recovery()) {
2511
2512 my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor();
2513
2514 foreach my $cc (@new_chip_channels) {
2515
2516 #only roll back if already part of import set
2517 #Not previously registered if not
2518 if ($rset->contains($cc) && $rset->get_chip_channel_id($cc->dbID())) {
2519
2520 if ($table_name eq 'channel') {
2521 my $chan_name = $ec_adaptor->fetch_by_dbID($cc->experimental_chip_id())->unique_id()."_".
2522 $self->get_config('dye_freqs')->{$cc->dye()};
2523 $self->log("Rolling back results for $table_name:\t".$chan_name);
2524
2525 } else {
2526 $self->log("Rolling back results for $table_name:\t".$cc->unique_id);
2527 }
2528
2529 $self->rollback_results([$rset->get_chip_channel_id($cc->dbID())]);
2530 }
2531 }
2532 }
2533
2534
2535 #check whether it is present in the ResultSet and add if not
2536 if ($rset) {
2537 #ids will already be present if not rset i.e. already imported
2538
2539 foreach my $cc (@new_chip_channels) {
2540 $rset->add_table_id($cc->dbID()) if(! $rset->contains($cc));
2541 }
2542 }
2543
2544
2545
2546 if ($rset) {
2547 $result_adaptor->store_chip_channels($rset);
2548 } else {
2549 $self->log("All ExperimentalChips have status:\t$status");
2550 }
2551
2552 #this only returns a result set if there is new data to import
2553 return $rset;
2554 }
2555
2556
2557
2558 =head2 resolve_probe_data
2559
2560 Example : $self->resolve_probe_data();
2561 Description: Resolves DB probe duplicates and builds local probe cache
2562 Returntype : none
2563 Exceptions : ????
2564 Caller : general
2565 Status : At risk
2566
2567 =cut
2568
2569 sub resolve_probe_data{
2570 my $self = shift;
2571
2572 $self->log("Resolving probe data", 1);
2573
2574 warn "Probe cache resolution needs to accomodate probesets too!";
2575
2576 foreach my $array (@{$self->arrays()}) {
2577 my $resolve = 0;
2578
2579 if ($self->get_probe_cache_by_Array($array)) { #cache already generated
2580
2581 #check if we have any new unresolved array chips to add to the cache
2582 foreach my $achip (@{$array->get_ArrayChips()}) {
2583
2584 if ($achip->has_status('RESOLVED')) {
2585 $self->log("ArrayChip has RESOLVED status:\t".$achip->design_id()); #, 1);
2586 next;
2587 } else {
2588 $self->log("Found un-RESOLVED ArrayChip:\t".$achip->design_id());
2589 $resolve = 1;
2590 last;
2591 }
2592 }
2593 } else { #no cache file
2594 $resolve = 1;
2595 $self->log('No probe cache found for array '.$array->name());
2596 }
2597
2598 if ($resolve) {
2599 $self->log('Resolving array duplicates('.$array->name().') and rebuilding probe cache.', 1);
2600 $self->get_probe_cache_by_Array($array, 1); #get from DB
2601
2602 #we need ot make sure we mark cache as unresolved, so we don't use it by mistake.
2603
2604
2605
2606 my ($line, $name, $pid, @pids);
2607 #my $index = 0;
2608 my $tmp_name = '';
2609 my $tmp_id = '';
2610
2611 #miss the header
2612
2613 while ($line = $self->{'_probe_cache'}{$array->name}{'handle'}->getline()) {
2614 ($name, $pid) = split/\t/o, $line;
2615
2616 if ($name eq $tmp_name) {
2617
2618 if ($pid != $tmp_id) {
2619 push @pids, $pid;
2620 #should reset to pid here if we have x y data else undef
2621 #ignore this and force result to have x y
2622 }
2623
2624 #can't do this naymore unless we figure out how to move the line pointer
2625 #would still need to sed the file anyway, better to regen from DB?
2626 #undef $self->{'_probe_cache'}{$array->name}{'entries'}->[$i];#delete true or to be resolved duplicate
2627 } elsif ($name ne $tmp_name) { #new probe
2628 $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1);
2629 $tmp_name = $name;
2630 $tmp_id = $pid;
2631 @pids = ($pid);
2632 #$index = $i + 1;
2633 }
2634 }
2635
2636 $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1);
2637
2638 #rename resovled cache and reset cache handle
2639 my $cmd = 'mv '.$self->get_dir('caches').'/'.$array->name().'.probe_cache.unresolved '.
2640 $self->get_dir('caches').'/'.$array->name().'.probe_cache';
2641
2642 run_system_cmd($cmd);
2643 $self->get_probe_cache_by_Array($array); #This sets the caches
2644
2645
2646 #warn "Only generate MD5 here, as this is guranteed to be correct";
2647
2648 foreach my $achip (@{$array->get_ArrayChips()}) {
2649
2650 if (! $achip->has_status('RESOLVED')) {
2651 $self->log("Updating ArrayChip to RESOLVED status:\t".$achip->design_id());
2652 $achip->adaptor->store_status('RESOLVED', $achip);
2653 }
2654 }
2655
2656 $self->log('Finished building probe cache for '.$array->name(), 1);
2657 }
2658 }
2659
2660 $self->log('Finished resolving probe data', 1);
2661
2662 return;
2663 }
2664
2665
2666 sub tidy_duplicates{
2667 my ($self, $pids) = @_;
2668
2669 my $pfa = $self->db->get_ProbeFeatureAdaptor();
2670 my ($feature, %features);
2671
2672 foreach my $dup_id (@$pids) {
2673
2674 foreach $feature(@{$pfa->fetch_all_by_Probe_id($dup_id)}) {
2675 #can we safely assume end will be same too?
2676 push @{$features{$feature->seq_region_name().':'.$feature->start()}}, $feature;
2677 }
2678 }
2679
2680 my (@reassign_ids, @delete_ids);
2681
2682 foreach my $seq_start_key (keys %features) {
2683 my $reassign_features = 1;
2684
2685 foreach $feature(@{$features{$seq_start_key}}) {
2686
2687 if ($feature->probe_id() == $pids->[0]) {
2688 $reassign_features = 0;
2689 } else {
2690 push @delete_ids, $feature->dbID();
2691 }
2692 }
2693
2694 #This assumes that we actually have at least one element to every seq_start_key array
2695 if ($reassign_features) {
2696 my $new_fid = pop @delete_ids;
2697 push @reassign_ids, $new_fid;
2698 }
2699 }
2700
2701 #resolve features first so we don't get any orphaned features if we crash.
2702 $pfa->reassign_features_to_probe(\@reassign_ids, $pids->[0]) if @reassign_ids;
2703 $pfa->delete_features(\@delete_ids) if @delete_ids;
2704
2705 return;
2706 }
2707
2708 1;