1 | |
2 =head1 LICENSE | |
3 | |
4 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
5 Genome Research Limited. All rights reserved. | |
6 | |
7 This software is distributed under a modified Apache license. | |
8 For license details, please see | |
9 | |
10 http://www.ensembl.org/info/about/code_licence.html | |
11 | |
12 =head1 CONTACT | |
13 | |
14 Please email comments or questions to the public Ensembl | |
15 developers list at <ensembl-dev@ebi.ac.uk>. | |
16 | |
17 Questions may also be sent to the Ensembl help desk at | |
18 <helpdesk@ensembl.org>. | |
19 | |
20 =head1 NAME | |
21 | |
22 Bio::EnsEMBL::Funcgen::Importer | |
23 | |
24 =head1 SYNOPSIS | |
25 | |
26 my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params); | |
27 $imp->register_experiment(); | |
28 | |
29 | |
30 =head1 DESCRIPTION | |
31 | |
32 B<This module> is the main class coordinating the import of tiling array designs and experimental data. | |
33 It utilises several underlying parser classes specific to the array vendor or import file type. | |
34 | |
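Parser selection is driven by the -vendor (and optionally -parser) constructor
arguments: the matching Bio::EnsEMBL::Funcgen::Parsers class is required at run
time and unshifted onto @ISA, so the Importer inherits that parser's read and
config methods. A sketch of the mapping (assuming the relevant parser modules
are installed; SomeFormat is a placeholder):

  -vendor NIMBLEGEN   # loads Bio::EnsEMBL::Funcgen::Parsers::Nimblegen
  -parser SomeFormat  # would override the vendor default with Parsers::SomeFormat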
35 =cut | |
36 | |
37 ################################################################################ | |
38 | |
39 package Bio::EnsEMBL::Funcgen::Importer; | |
40 | |
41 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(get_date open_file run_system_cmd); | |
42 use Bio::EnsEMBL::Utils::Exception qw( throw deprecate ); | |
43 use Bio::EnsEMBL::Utils::Argument qw( rearrange ); | |
44 use Bio::EnsEMBL::Funcgen::Experiment; | |
45 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; | |
46 use Bio::EnsEMBL::DBSQL::DBAdaptor; | |
47 | |
48 use File::Path; | |
49 use strict; | |
50 use vars qw(@ISA); | |
51 | |
52 | |
53 ################################################################################ | |
54 | |
55 =head2 new | |
56 | |
57 Description : Constructor method | |
58 | |
59 Arg [1] : hash containing optional attributes: | |
60 -name Name of Experiment(dir) | |
61 -format of array e.g. Tiled(default) | |
62 -vendor name of array vendor | |
63 -description of the experiment | |
64 -pass DB password | |
65 -host DB host | |
66 -user DB user | |
67 -port DB port | |
68 -registry_host Host to load registry from | |
69 -registry_port Port for registry host | |
70 -registry_user User for registry host | |
71 -registry_pass Password for registry user | |
72 -ssh Flag to set connection over ssh via forwarded port to localhost (default = 0); remove? | |
73 -group name of experimental/research group | |
74 -location location of experimental/research group | |
75 -contact e-mail address of primary contact for experimental group | |
76 -species | |
77 -assembly Genome assembly version i.e. 36 for NCBI36 | |
78 -recover Recovery flag (default = 0) | |
79 -data_dir Root data directory (default = $ENV{'EFG_DATA'}) | |
80 -output_dir review these dirs ??????? | |
81 -input_dir ????????? | |
82 -import_dir ??????? | |
83 -norm_dir ?????? | |
84 -fasta dump FASTA flag (default =0) | |
85 -array_set Flag to treat all chip designs as part of same array (default = 0) | |
86 -array_name Name for array set | |
87 -array_file Path of array file to import for sanger ENCODE array | |
88 -result_set_name Name to give the raw and normalised result sets (default uses experiment and analysis name) | |
89 -norm_method Normalisation method (Nimblegen default = VSN_GLOG); | |
90 -dbname Override for autogeneration of the funcgen dbname | |
91 -reg_config path to local registry config file (default = ~/ensembl.init || undef) | |
92 -design_type MGED term (default = binding_site_identification) get from meta/MAGE? | |
93 | |
94 -farm Flag to submit jobs to farm e.g. normalisation jobs | |
95 -batch_job Flag to signify that this Importer is running as a prepared batch/farm job | |
96 -prepared Flag to signify result files have been previously imported in prepare mode | |
97 and file names will differ from those recorded in InputSubset | |
98 | |
99 | |
100 #-use_defaults This changes some mandatory parameters to optional, instead using either DEFAULT or the input file name for the following options -name, -input_set, -feature_type, -cell_type etc ??? | |
101 | |
102 -verbose | |
103 Returntype : Bio::EnsEMBL::Funcgen::Importer | |
104 Example : my $Exp = Bio::EnsEMBL::Funcgen::Importer->new(%params); | |
105 Exceptions : throws if mandatory params are not set or DB connect fails | |
106 Caller : General | |
107 Status : Medium - potential for %params names to change, remove %attrdata? | |
108 | |
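A fuller construction sketch; the parameter values below are illustrative
assumptions rather than a real import configuration:

  my $imp = Bio::EnsEMBL::Funcgen::Importer->new
    (
     -name         => 'Experiment1',
     -vendor       => 'NIMBLEGEN',
     -format       => 'Tiled',
     -group        => 'efg_group',
     -host         => $host,
     -port         => 3306,
     -user         => $user,
     -pass         => $pass,
     -result_files => \@result_files,
    );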
109 =cut | |
110 | |
111 ################################################################################ | |
112 | |
113 sub new{ | |
114 my ($caller) = shift; | |
115 | |
116 #my $reg = "Bio::EnsEMBL::Registry"; | |
117 my $class = ref($caller) || $caller; | |
118 | |
119 #$user, $host, $port, $pass, $dbname, $db, $assm_version, $reg_config, $reg_db, $ucsc_coords, $verbose, $release, $reg_host, $reg_port, $reg_user, $reg_pass | |
120 #$ftype_name, $ctype_name, | |
121 | |
122 my ($name, $format, $vendor, $group, $location, $contact, | |
123 $array_name, $array_set, $array_file, $data_dir, $result_files, | |
124 $exp_date, $desc, $design_type, $output_dir, $input_dir, | |
125 $batch_job, $farm, $prepared, | |
126 $norm_method, $old_dvd_format, $parser_type) | |
127 = rearrange(['NAME', 'FORMAT', 'VENDOR', 'GROUP', 'LOCATION', 'CONTACT', | |
128 'ARRAY_NAME', 'ARRAY_SET', 'ARRAY_FILE', 'DATA_DIR', 'RESULT_FILES', | |
129 'EXPERIMENT_DATE', 'DESCRIPTION', | |
130 'DESIGN_TYPE', 'OUTPUT_DIR', 'INPUT_DIR', #to allow override of defaults | |
131 'BATCH_JOB', 'FARM', 'PREPARED', 'NORM_METHOD', | |
132 'OLD_DVD_FORMAT', 'PARSER'], @_); | |
133 | |
134 | |
135 | |
136 #### Define parent parser class based on vendor | |
137 throw("Mandatory argument -vendor not defined") if ! defined $vendor; | |
138 | |
139 #This will override the default Vendor Parser type | |
140 #Evals simply protect from messy errors if parser type not found | |
141 my $parser_error; | |
142 my $vendor_parser = ucfirst(lc($vendor)); | |
143 | |
144 | |
145 #WARNING evaling these parsers to enable pluggability hides errors in parser | |
146 #use perl -MBio::EnsEMBL::Funcgen::Parsers::ParserType to debug | |
147 #get rid of all this case guessing and force correct parser name usage? | |
148 | |
149 | |
150 #WARNING | |
151 #Dynamic setting of ISA in this way reports the resultant object as Importer, when | |
152 #some throws/methods are actually in other base/custom Parsers | |
153 #This can seem a little counterintuitive, but allows pluggability | |
154 #without the need for separate control scripts | |
155 | |
156 | |
157 #Change this to be set and required/detected in the parse_and_import.pl script | |
158 #Then we can have Importer.pm as the base class and get rid of this. | |
159 #as well as set_config methods? | |
160 | |
161 | |
162 eval {require "Bio/EnsEMBL/Funcgen/Parsers/${vendor_parser}.pm";}; | |
163 | |
164 if ($@) { | |
165 #Don't warn/throw yet as we might have a standard parser format | |
166 | |
167 $parser_error .= "There is no valid parser for the vendor you have specified:\t".$vendor. | |
168 "\nMaybe this is a typo or maybe you want to specify a default import format using the -parser option\n".$@; | |
169 } | |
170 | |
171 | |
172 | |
173 if (defined $parser_type) { | |
174 | |
175 #try normal case first | |
176 eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";}; | |
177 | |
178 if ($@) { | |
179 $parser_type = ucfirst(lc($parser_type)); | |
180 | |
181 #Now eval the new parser | |
182 eval {require "Bio/EnsEMBL/Funcgen/Parsers/${parser_type}.pm";}; | |
183 | |
184 if ($@) { | |
185 | |
186 #Might be no default | |
187 my $txt = "There is no valid parser for the -parser format you have specified:\t".$parser_type."\n"; | |
188 | |
189 if (! $parser_error) { | |
190 $txt .= "Maybe this is a typo, or maybe you want to run with the default $vendor_parser parser\n"; | |
191 } | |
192 | |
193 throw($txt.$@); | |
194 } | |
195 | |
196 #warn about over-riding the vendor parser here | |
197 if (! $parser_error) { | |
198 #Can't log this as we haven't blessed the Helper yet | |
199 warn("WARNING\t::\tYou are over-riding the default ".$vendor." parser with -parser ".$parser_type); | |
200 } | |
201 } | |
202 } else { | |
203 throw($parser_error) if $parser_error; | |
204 $parser_type = $vendor_parser; | |
205 } | |
206 | |
207 | |
208 #we should now really set parser_type as an attribute? | |
209 unshift @ISA, 'Bio::EnsEMBL::Funcgen::Parsers::'.$parser_type; | |
210 #change this to be called explicitly from the load script? | |
211 | |
212 #### Create object from parent class | |
213 | |
214 my $self = $class->SUPER::new(@_); | |
215 | |
216 #### Set vars and test minimum mandatory params for any import type | |
217 | |
218 $self->{'name'} = $name || throw('Mandatory param -name not met'); #This is not mandatory for array design import | |
219 ## $self->{'user'} = $user || $ENV{'EFG_WRITE_USER'}; | |
220 $self->vendor(uc($vendor)); #already tested | |
221 $self->{'format'} = uc($format) || 'TILED'; #remove default? | |
222 $self->group($group) if $group; | |
223 $self->location($location) if $location; | |
224 $self->contact($contact) if $contact; | |
225 ## $species || throw('Mandatory param -species not met'); | |
226 $self->array_name($array_name) if $array_name; | |
227 $self->array_set($array_set) if $array_set; | |
228 $self->array_file($array_file) if $array_file; | |
229 $self->{'data_dir'} = $data_dir || $ENV{'EFG_DATA'}; | |
230 $self->result_files($result_files)if $result_files; | |
231 $self->experiment_date($exp_date) if $exp_date; | |
232 $self->description($desc) if $desc; #experiment | |
233 ## $self->feature_set_description($fset_desc) if $fset_desc; | |
234 | |
235 #$assm_version || throw('Mandatory param -assembly not met'); | |
236 #Only required if setting DB by params e.g db not passed or generated from reg | |
237 #i.e. most of the time | |
238 #Why was this made mandatory? | |
239 #Default to dnadb||efgdb assm from the dbname | |
240 | |
241 $self->{'design_type'} = $design_type || 'binding_site_identification'; #remove default? | |
242 $self->{'output_dir'} = $output_dir if $output_dir; #config default override | |
243 $self->{'input_dir'} = $input_dir if $input_dir; #config default override | |
244 $self->farm($farm) if $farm; | |
245 $self->batch_job($batch_job); | |
246 $self->prepared($prepared); | |
247 ## $self->{'ssh'} = $ssh || 0; | |
248 ## $self->{'_dump_fasta'} = $fasta || 0; | |
249 #$self->{'recover'} = $recover || 0; Now in BaseImporter | |
250 #check for ~/.ensembl_init to mirror general EnsEMBL behaviour | |
251 ## $self->{'reg_config'} = $reg_config || ((-f "$ENV{'HOME'}/.ensembl_init") ? "$ENV{'HOME'}/.ensembl_init" : undef); | |
252 $self->{'old_dvd_format'} = $old_dvd_format || 0; | |
253 ## $self->{'ucsc_coords'} = $ucsc_coords || 0; | |
254 ## $self->{'verbose'} = $verbose || 0; | |
255 ## $self->{'release'} = $release; | |
256 | |
257 | |
258 | |
259 ## if($reg_host && $self->{'reg_config'}){ | |
260 ## warn "You have specified registry parameters and a config file:\t".$self->{'reg_config'}. | |
261 ## "\nOver-riding config file with specified paramters:\t${reg_user}@${reg_host}:$reg_port"; | |
262 ## } | |
263 | |
264 | |
265 #Will a general norm method be applicable for all imports? | |
266 #Already caused problems with Bed imports... remove? | |
267 #Could set NORM_METHOD in Parser!! | |
268 #warn "Need to fully implement norm_method is validate_mage, remove ENV NORM_METHOD?"; | |
269 $self->{'norm_method'} = $norm_method; # || $ENV{'NORM_METHOD'}; | |
270 | |
271 #if ($self->vendor ne 'NIMBLEGEN'){ | |
272 # $self->{'no_mage'} = 1; | |
273 # warn "Hardcoding no_mage for non-NIMBLEGEN imports"; | |
274 # } | |
275 | |
276 | |
277 # if($self->{'no_mage'} && $self->{'write_mage'}){ | |
278 # throw('-no_mage and -write_mage options are mutually exclusive, please select just one'); | |
279 # } | |
280 | |
281 ## #### Set up DBs and load and reconfig registry | |
282 ## | |
283 ## ### Load Registry | |
284 ## #Can we load the registry using the assembly version, then just redefine the efg DB? | |
285 ## #We have problems here if we try and load on a dev version, where no dev DBs are available on ensembldb | |
286 ## #Get the latest API version for the assembly we want to use | |
287 ## #Then load the registry from that version | |
288 ## #Then we can remove some of the dnadb setting code below? | |
289 ## #This may cause problems with API schema mismatches | |
290 ## #Can we just test whether the current default dnadb contains the assembly? | |
291 ## #Problem with this is that this will not have any other data e.g. genes etc | |
292 ## #which may be required for some parsers | |
293 ## | |
294 ## #How does the registry pick up the schema version?? | |
295 ## | |
296 ## #We should really load the registry first given the dnadb assembly version | |
297 ## #Then reset the eFG DB as appropriate | |
298 ## | |
299 ## | |
300 ## if ($reg_host || ! defined $self->{'_reg_config'}) { | |
301 ## #defaults to current ensembl DBs | |
302 ## $reg_host ||= 'ensembldb.ensembl.org'; | |
303 ## $reg_user ||= 'anonymous'; | |
304 ## | |
305 ## #Default to the most recent port for ensdb | |
306 ## if( (! $reg_port) && | |
307 ## ($reg_host eq 'ensdb-archive') ){ | |
308 ## $reg_port = 5304; | |
309 ## } | |
310 ## | |
311 ## #This will try and load the dev DBs if we are using v49 schema or API? | |
312 ## #Need to be mindful about this when developing | |
313 ## #we need to tip all this on it's head and load the reg from the dnadb version!!!!!!! | |
314 ## | |
315 ## my $version_text= ($self->{'release'}) ? 'version '.$self->{'release'} : 'current version'; | |
316 ## $self->log("Loading $version_text registry from $reg_user".'@'.$reg_host); | |
317 ## | |
318 ## #Note this defaults API version, hence running with head code | |
319 ## #And not specifying a release version will find not head version | |
320 ## #DBs on ensembldb, resulting in an exception from reset_DBAdaptor below | |
321 ## $reg->load_registry_from_db( | |
322 ## -host => $reg_host, | |
323 ## -user => $reg_user, | |
324 ## -port => $reg_port, | |
325 ## -pass => $reg_pass, | |
326 ## #-host => "ens-staging", | |
327 ## #-user => 'ensro', | |
328 ## -db_version => $self->{'release'},#51 | |
329 ## -verbose => $self->verbose, | |
330 ## ); | |
331 ## | |
332 ## throw('Not sensible to set the import DB as the default eFG DB from '.$reg_host.', please define db params') if ((! $dbname) && (! $db)); | |
333 ## } | |
334 ## else{ | |
335 ## $self->log("Loading registry from:\t".$self->{'_reg_config'}); | |
336 ## $reg->load_all($self->{'_reg_config'}, 1); | |
337 ## } | |
338 ## | |
339 ## | |
340 ## #Need to test the DBs here, as we may not have loaded any! | |
341 ## #get_alias wil fail otherwise? | |
342 ## #This is a cyclical dependancy as we need alias to get species which we use to grab the DB | |
343 ## #alias is dependant on core DB being loaded with relevant meta entries. | |
344 ## #revise this when we split the Importer | |
345 ## | |
346 ## | |
347 ## #Validate species | |
348 ## my $alias = $reg->get_alias($species) || throw("Could not find valid species alias for $species\nYou might want to clean up:\t".$self->get_dir('output')); | |
349 ## $self->species($alias); | |
350 ## $self->{'param_species'} = $species;#Only used for dir generation | |
351 ## | |
352 ## | |
353 ## #SET UP DBs | |
354 ## if($db){ | |
355 ## #db will have been defined before reg loaded, so will be present in reg | |
356 ## | |
357 ## if(! (ref($db) && $db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){ | |
358 ## $self->throw('-db must be a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'); | |
359 ## } | |
360 ## } | |
361 ## else{ #define eFG DB from params or registry | |
362 ## | |
363 ## if($reg_db){#load eFG DB from reg | |
364 ## | |
365 ## if($dbname){ | |
366 ## throw("You cannot specify DB params($dbname) and load from the registry at the same time."); | |
367 ## } | |
368 ## | |
369 ## $self->log('WARNING: Loading eFG DB from Registry'); | |
370 ## $db = $reg->get_DBAdaptor($self->species(), 'funcgen'); | |
371 ## throw("Unable to retrieve ".$self->species." funcgen DB from the registry") if ! $db; | |
372 ## } | |
373 ## else{#resets the eFG DB in the custom or generic registry | |
374 ## | |
375 ## $dbname || throw('Must provide a -dbname if not using default custom registry config'); | |
376 ## #$user || throw('Must provide a -user parameter');#make this default to EFG_WRITE_USER? | |
377 ## $pass || throw('Must provide a -pass parameter'); | |
378 ## | |
379 ## #remove this and throw? | |
380 ## if(! defined $host){ | |
381 ## $self->log('WARNING: Defaulting to localhost'); | |
382 ## $host = 'localhost'; | |
383 ## } | |
384 ## | |
385 ## $port ||= 3306; | |
386 ## my $host_ip = '127.0.0.1';#is this valid for all localhosts? | |
387 ## | |
388 ## if ($self->{'ssh'}) { | |
389 ## $host = `host localhost`; #mac specific? nslookup localhost wont work on server/non-PC | |
390 ## #will this always be the same? | |
391 ## | |
392 ## if (! (exists $ENV{'EFG_HOST_IP'})) { | |
393 ## warn "Environment variable EFG_HOST_IP not set for ssh mode, defaulting to $host_ip for $host"; | |
394 ## } else { | |
395 ## $host_ip = $ENV{'EFG_HOST_IP'}; | |
396 ## } | |
397 ## | |
398 ## if ($self->host() ne 'localhost') { | |
399 ## warn "Overriding host ".$self->host()." for ssh connection via localhost($host_ip)"; | |
400 ## } | |
401 ## } | |
402 ## | |
403 ## | |
404 ## #data version is only used if we don't want to define the dbname | |
405 ## #This should never be guessed so don't need data_version here | |
406 ## #$dbname ||= $self->species()."_funcgen_".$self->data_version(); | |
407 ## | |
408 ## | |
409 ## #Remove block below when we can | |
410 ## my $dbhost = ($self->{'ssh'}) ? $host_ip : $host; | |
411 ## | |
412 ## #This isn't set yet!? | |
413 ## #When we try to load, say 49, when we only have 48 on ensembldb | |
414 ## #This fails because there is not DB set for v49, as it is not on ensembl DB | |
415 ## #In this case we need to load from the previous version | |
416 ## #Trap this and suggest using the -schema_version/release option | |
417 ## #Can we autodetect this and reload the registry? | |
418 ## #We want to reload the registry anyway with the right version corresponding to the dnadb | |
419 ## #We could either test for the db in the registry or just pass the class. | |
420 ## | |
421 ## $db = $reg->reset_DBAdaptor($self->species(), 'funcgen', $dbname, $dbhost, $port, $self->user, $pass, | |
422 ## { | |
423 ## -dnadb_host => $reg_host, | |
424 ## -dnadb_port => $reg_port, | |
425 ## -dnadb_assembly => $assm_version, | |
426 ## -dnadb_user => $reg_user, | |
427 ## -dnadb_pass => $reg_pass, | |
428 ## }); | |
429 ## | |
430 ## | |
431 ## #ConfigRegistry will try and set this | |
432 ## #This will fail if there is already one in the registry as it will try | |
433 ## #and defined a new unique species so as not to overwrite the original | |
434 ## #e.g. homo_sapiens1 | |
435 ## | |
436 ## #This is why it was orignally written backwards as we can't easily dynamically redefine | |
437 ## #an adaptor in the registry without ConfigRegistry trying to change the name | |
438 ## #the very act of creating a new db to redefine the registry with causes ConfigRegistry | |
439 ## #to try and register it with a unique species name | |
440 ## | |
441 ## #Delete the old funcgen DB from the registry first | |
442 ## #$reg->remove_DBAdaptor($self->species, 'funcgen'); | |
443 ## | |
444 ## #ConfigRegistry will automatically configure this new db | |
445 ## | |
446 ## #$db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new( | |
447 ## # -user => $user, | |
448 ## # -host => ($self->{'ssh'}) ? $host_ip : $host, | |
449 ## # -port => $port, | |
450 ## # -pass => $pass, | |
451 ## # #we need to pass dbname else we can use non-standard dbs | |
452 ## # -dbname => $dbname, | |
453 ## # -species => $self->species(), | |
454 ## # -group => 'funcgen', | |
455 ## # ); | |
456 ## | |
457 ## | |
458 ## #if we get a species like homo_sapiens1 here | |
459 ## #This is because ConfigRegistry is try to make the dbname different between the | |
460 ## #one already present and the one you're trying to add | |
461 ## } | |
462 ## } | |
463 ## | |
464 ## | |
465 ## ### VALIDATE DNADB | |
466 ## #This is now done in DBAdaptor | |
467 ## | |
468 ## #We can change this to just use the assembly version | |
469 ## #we could even have the wordy assmelby version from the meta table | |
470 ## #do the standard ensembl subs | |
471 ## #s/[A-Za-z]//g; | |
472 ## #s/\.//g; | |
473 ## #And then validate? | |
474 ## #Just stick to number version for now. | |
475 ## | |
476 ## | |
477 ## #Now we need to set the dnadb_host params to avoid ensembldb defaults | |
478 ## #This should check the registry first | |
479 ## #Then load from the registry db? | |
480 ## #If we have a multi host registry config file this isn't going to work! | |
481 ## | |
482 ## #Is this required anymore as the DBAdaptor handles this? | |
483 ## #Not if we pass a db with an incorrect dnadb attached. | |
484 ## | |
485 ## #if($db->_get_schema_build($db->dnadb()) !~ /_[0-9]+_${assm_version}[a-z]*$/){ | |
486 ### my $warning = "WARNING: dnadb does not match assembly_version $assm_version. Using ensembldb.enembl.org to define the dnadb"; | |
487 ### $warning .= ' rather than the reg_config' if (defined $self->{'_reg_config'}); | |
488 ## | |
489 ## #We need to account for reg_config DBs which may have custom info in | |
490 ## #So try reg_config host first, then try ensembldb with warning | |
491 ## #Could have a reg_config only flag for core dbs | |
492 ## #Need to implement more params in set_dnadb_by_assembly_version | |
493 ### $self->log($warning); | |
494 ## | |
495 ### $db->set_dnadb_by_assembly_version($assm_version); | |
496 ### } | |
497 ## | |
498 ## | |
499 ## | |
500 ## | |
501 ## #Test connections | |
502 ## $self->db($db); | |
503 ## $db->dbc->db_handle; | |
504 ## $db->dnadb->dbc->db_handle; | |
505 ## #Set re/disconnect options | |
506 ## | |
507 ## #These really need setting dependant on the import parser | |
508 ## $db->dbc->disconnect_when_inactive(1); | |
509 ## $db->dnadb->dbc->disconnect_when_inactive(1); | |
510 | |
511 | |
512 | |
513 ## ### Check analyses/feature_type/cell_type | |
514 ## if($feature_analysis){ | |
515 ## my $fanal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($feature_analysis); | |
516 ## throw("The Feature Analysis $feature_analysis does not exist in the database") if(!$fanal); | |
517 ## $self->feature_analysis($fanal); | |
518 ## | |
519 ## #This currently fails before the config gets loaded! | |
520 ## #Need to load config before this validation! | |
521 ## } | |
522 ## | |
523 ## if($ctype_name){ | |
524 ## my $ctype = $self->db->get_CellTypeAdaptor->fetch_by_name($ctype_name); | |
525 ## throw("The CellType $ctype_name does not exist in the database") if(!$ctype); | |
526 ## $self->cell_type($ctype); | |
527 ## } | |
528 ## | |
529 ## if ($ftype_name) { | |
530 ## my $ftype = $self->db->get_FeatureTypeAdaptor->fetch_by_name($ftype_name); | |
531 ## throw("The FeatureType $ftype_name does not exist in the database") if(!$ftype); | |
532 ## $self->feature_type($ftype); | |
533 ## } | |
534 | |
535 | |
536 #Set config here instead? | |
537 #So we can check all mandatory params | |
538 #Set vendor specific attr dependent vars | |
539 | |
540 #Generic input dir | |
541 $self->{'input_dir'} ||= $self->get_dir("data").'/input/'.$self->{'param_species'}.'/'.$self->vendor().'/'.$self->name(); | |
542 | |
543 if (! -d $self->get_dir('input')) { | |
544 | |
545 if (@{$self->result_files}) { | |
546 #This is really InputSet specific | |
547 #Could go in init_experiment_import | |
548 $self->log("Processing files:\n\t\t".join("\n\t\t",@{$self->result_files})); | |
549 } else { | |
550 throw('input_dir is not defined or does not exist ('.$self->get_dir('input').')'); | |
551 } | |
552 } | |
553 | |
554 #Parser specific config | |
555 $self->set_config(); | |
556 | |
557 | |
558 $self->debug(2, "Importer class instance created."); | |
559 $self->debug_hash(3, \$self); | |
560 | |
561 return ($self); | |
562 } | |
563 | |
564 =head2 registry_host | |
565 | |
566 Example : my $reg_host = $imp->registry_host; | |
567 Description: Accessor for registry host attribute | |
568 Returntype : string e.g. ensembldb.ensembl.org | |
569 Exceptions : None | |
570 Caller : general | |
571 Status : at risk | |
572 | |
573 =cut | |
574 | |
575 sub registry_host{ | |
576 return $_[0]->{'reg_host'}; | |
577 } | |
578 | |
579 =head2 registry_user | |
580 | |
581 Example : my $reg_user = $imp->registry_user; | |
582 Description: Accessor for registry user attribute | |
583 Returntype : string e.g. anonymous | |
584 Exceptions : None | |
585 Caller : general | |
586 Status : at risk | |
587 | |
588 =cut | |
589 | |
590 sub registry_user{ | |
591 return $_[0]->{'reg_user'}; | |
592 } | |
593 | |
594 =head2 registry_port | |
595 | |
596 Example : my $reg_port = $imp->registry_port; | |
597 Description: Accessor for registry port attribute | |
598 Returntype : string e.g. 3306 | |
599 Exceptions : None | |
600 Caller : general | |
601 Status : at risk | |
602 | |
603 =cut | |
604 | |
605 sub registry_port{ | |
606 return $_[0]->{'reg_port'}; | |
607 } | |
608 | |
609 =head2 registry_pass | |
610 | |
611 Example : my $reg_pass = $imp->registry_pass; | |
612 Description: Accessor for registry pass attribute | |
613 Returntype : string | |
614 Exceptions : None | |
615 Caller : general | |
616 Status : at risk | |
617 | |
618 =cut | |
619 | |
620 sub registry_pass{ | |
621 return $_[0]->{'reg_pass'}; | |
622 } | |
623 | |
624 | |
625 #init method kept separate from new due to differing mandatory checks and set up | |
626 | |
627 =head2 init_array_import | |
628 | |
629 Example : $self->init_array_import(); | |
630 Description: Initialises an array import by creating the required | |
631 working/output directories | |
632 Returntype : none | |
633 Exceptions : warns and throws depending on recover and Experiment status | |
634 Caller : general | |
635 Status : at risk - merge with register_array_design | |
636 | |
637 =cut | |
638 | |
639 sub init_array_import{ | |
640 | |
641 my ($self) = shift; | |
642 | |
643 # we need to define which parameters we'll be storing | |
644 #use the logic names of the analyses as the field headers | |
645 | |
646 #need to test for vendor here | |
647 | |
648 #Sanger, NIMBLEGEN(no design_id issue, could get from the ndf, but we want it in the DesignNotes.txt) | |
649 #Then we can change the Array/Chip generation to solely use the DesignNotes.txt rather than SampleKey | |
650 #which is experiment specific | |
651 #or eFG format. | |
652 | |
653 $self->create_output_dirs('caches', 'fastas'); | |
654 | |
655 | |
656 } | |
657 | |
658 | |
659 =head2 init_experiment_import | |
660 | |
661 Example : $self->init_experiment_import(); | |
662 Description: Initialises import by creating working directories | |
663 and by storing the Experiment | |
664 Returntype : none | |
665 Exceptions : warns and throws depending on recover and Experiment status | |
666 Caller : general | |
667 Status : at risk - merge with register_experiment | |
668 | |
669 =cut | |
670 | |
671 sub init_experiment_import{ | |
672 my ($self) = shift; | |
673 | |
674 #Change this to take config mandatory params? | |
675 #No specific to exp import | |
676 #Name is used in set_config anyway | |
677 #Currently we only have array and experiment import, both of which should have names | |
678 #Make mandatory? | |
679 | |
680 foreach my $tmp ("group", "data_dir") { #name now generically mandatory | |
681 throw("Mandatory arg $tmp has not been defined") if (! defined $self->{$tmp}); | |
682 } | |
683 #Should we separate path on group here too, so we can have a dev/test group? | |
684 | |
685 #Create output dirs | |
686 #This should be moved to the Parser to avoid generating directories which are needed for different imports | |
687 $self->create_output_dirs('raw', 'norm', 'caches', 'fastas'); | |
688 throw("No result_files defined.") if (! defined $self->result_files()); | |
689 | |
690 #Log input files | |
691 #if (@{$self->result_files()}) { | |
692 # $self->log("Found result files arguments:\n\t".join("\n\t", @{$self->result_files()})); | |
693 #} | |
694 #This is done in new | |
695 | |
696 #check for cell||feature and warn if no met file supplied? | |
697 | |
698 if ($self->norm_method) { | |
699 my $norm_anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($self->norm_method); | |
700 | |
701 #should we list the valid analyses? | |
702 throw($self->norm_method.' is not a valid analysis') if ! $norm_anal; | |
703 $self->norm_analysis($norm_anal); | |
704 } else { | |
705 $self->log('WARNING: No normalisation analysis specified'); | |
706 } | |
707 | |
708 #warn "Need to check env vars here or in Parser or just after set_config?"; | |
709 #Need generic method for checking ENV vars in Helper | |
710 #check for ENV vars? | |
711 #R_LIBS | |
712 #R_PATH if ! farm | |
713 #R_FARM_PATH | |
714 | |
715 $self->validate_group(); #import experimental_group | |
716 | |
717 #Get experiment | |
718 my $exp_adaptor = $self->db->get_ExperimentAdaptor(); | |
719 my $exp = $exp_adaptor->fetch_by_name($self->name()); #, $self->group()); | |
720 $self->process_experiment_config if $self->can('process_experiment_config'); #Parsers::MAGE::process_experiment_config | |
721 | |
722 #Moved MAGE support form here to MAGE.pm | |
723 | |
724 #Recovery now set so deal with experiment | |
725 if ($self->recovery() && ($exp)) { | |
726 $self->log("Using previously stored Experiment:\t".$exp->name); | |
727 } elsif ((! $self->recovery()) && $exp) { | |
728 throw("Your experiment name is already registered in the database, please choose a different \"name\", this will require renaming your input directory, or specify -recover if you are working with a failed/partial import."); | |
729 #can we skip this and store, and then check in register experiment if it is already stored then throw if not recovery | |
730 } else { # (recover && exp) || (recover && ! exp) | |
731 | |
732 | |
733 $exp = Bio::EnsEMBL::Funcgen::Experiment->new( | |
734 -EXPERIMENTAL_GROUP => $self->{egroup}, | |
735 -NAME => $self->name(), | |
736 -DATE => $self->experiment_date(), | |
737 -PRIMARY_DESIGN_TYPE => $self->design_type(), | |
738 -DESCRIPTION => $self->description(), | |
739 -ADAPTOR => $self->db->get_ExperimentAdaptor(), | |
740 ); | |
741 | |
742 ($exp) = @{$exp_adaptor->store($exp)}; | |
743 } | |
744 | |
745 | |
746 $self->experiment($exp); | |
747 | |
748 #remove and add specific report, this is catching some Root stuff | |
749 #$self->log("Initiated efg import with following parameters:\n".Data::Dumper::Dumper(\$self)); | |
750 | |
751 return; | |
752 } | |
753 | |
754 | |
755 #Move this to new or init_experiment | |
756 | |
757 =head2 validate_group | |
758 | |
759 Example : $self->validate_group(); | |
760 Description: Validates group details | |
761 Returntype : none | |
762 Exceptions : throws if insufficient info defined to store new Group and is not already present | |
763 Caller : general | |
764 Status : Medium - check location and contact i.e. group name clash? | |
765 | |
766 =cut | |
767 | |
768 sub validate_group{ | |
769 my ($self) = shift; | |
770 | |
771 my $egroup = $self->db->get_ExperimentalGroupAdaptor->fetch_by_name($self->group); | |
772 | |
773 if (! defined $egroup) { | |
774 | |
775 throw("validate_group does not yet fully support ExperimentalGroups, please add manually"); | |
776 | |
777 #if ($self->location() && $self->contact()) { | |
778 # $self->db->import_group($self->group(), $self->location, $self->contact()); | |
779 #} else { | |
780 # throw("Group ".$self->group()." does not exist, please specify a location and contact to register the group"); | |
781 #} | |
782 } | |
783 | |
784 $self->{egroup} = $egroup; | |
785 | |
786 return; | |
787 } | |
788 | |
789 =head2 create_output_dirs | |
790 | |
791 Example : $self->create_output_dirs('raw', 'norm'); | |
792 Description: Creates a directory under the root output dir for each | |
793 name in @dirnames and sets the corresponding paths in $self | |
794 Arg [1] : mandatory - list of dir names | |
795 Returntype : none | |
796 Exceptions : none | |
797 Caller : general | |
798 Status : Medium - add throw? | |
799 | |
800 =cut | |
801 | |
802 sub create_output_dirs{ | |
803 my ($self, @dirnames) = @_; | |
804 | |
805 #output dir created in control script | |
806 #avoids errors when logs generated first | |
807 | |
808 | |
809 foreach my $name (@dirnames) { | |
810 | |
811 if ($name eq 'caches') { | |
812 $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/".$self->db->dbc->dbname() if(! defined $self->{"${name}_dir"}); | |
813 } elsif ($name eq 'fastas') { | |
814 $self->{"${name}_dir"} = $ENV{'EFG_DATA'}."/${name}/" if(! defined $self->{"${name}_dir"}); | |
815 } else { | |
816 $self->{"${name}_dir"} = $self->get_dir('output')."/${name}" if(! defined $self->{"${name}_dir"}); | |
817 } | |
818 | |
819 if (! (-d $self->get_dir($name) || (-l $self->get_dir($name)))) { | |
820 $self->log("Creating directory:\t".$self->get_dir($name)); | |
821 #This did not throw with mkdir!! | |
822 mkpath($self->get_dir($name)) or throw('Failed to create directory: '. $self->get_dir($name)); | |
823 chmod 0744, $self->get_dir($name); | |
824 } | |
825 } | |
826 | |
827 return; | |
828 } | |
829 | |
830 =head2 vendor | |
831 | |
832 Example : $imp->vendor("NimbleGen"); | |
833 Description: Getter/Setter for array vendor | |
834 Arg [1] : optional - vendor name | |
835 Returntype : string | |
836 Exceptions : none | |
837 Caller : general | |
838 Status : Stable | |
839 | |
840 =cut | |
841 | |
842 sub vendor{ | |
843 my ($self) = shift; | |
844 | |
845 if (@_) { | |
846 $self->{'vendor'} = shift; | |
847 $self->{'vendor'} = uc($self->{'vendor'}); | |
848 } | |
849 | |
850 return $self->{'vendor'}; | |
851 } | |
852 | |
853 | |
854 =head2 feature_type | |
855 | |
856 Example : $imp->feature_type($ftype); | |
857 Description: Getter/Setter for Experiment FeatureType | |
858 Arg [1] : optional - Bio::EnsEMBL::Funcgen::FeatureType | |
859 Returntype : Bio::EnsEMBL::Funcgen::FeatureType | |
860 Exceptions : Throws if arg is not valid or stored | |
861 Caller : general | |
862 Status : at risk | |
863 | |
864 =cut | |
865 | |
866 sub feature_type{ | |
867 my ($self) = shift; | |
868 | |
869 if (@_) { | |
870 my $ftype = shift; | |
871 | |
872 #do we need this as we're checking in new? | |
873 if (! ($ftype->isa('Bio::EnsEMBL::Funcgen::FeatureType') && $ftype->dbID())) { | |
874 throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::FeatureType"); | |
875 } | |
876 | |
877 $self->{'feature_type'} = $ftype; | |
878 } | |
879 | |
880 return $self->{'feature_type'}; | |
881 } | |
882 | |
883 =head2 feature_analysis | |
884 | |
885 Example : $imp->feature_analysis($fanal); | |
886 Description: Getter/Setter for Analysis used for creating the imported Features | |
887 Arg [1] : optional - Bio::EnsEMBL::Analysis | |
888 Returntype : Bio::EnsEMBL::Analysis | |
889 Exceptions : Throws if arg is not valid or stored | |
890 Caller : general | |
891 Status : at risk | |
892 | |
893 =cut | |
894 | |
895 sub feature_analysis{ | |
896 my ($self) = shift; | |
897 | |
898 if (@_) { | |
899 my $fanal = shift; | |
900 | |
901 #do we need this as we're checking in new? | |
902 if (! (ref ($fanal) && $fanal->isa('Bio::EnsEMBL::Analysis') && $fanal->dbID())) { | |
903 throw("Must pass a valid stored Bio::EnsEMBL::Analysis"); | |
904 } | |
905 | |
906 $self->{'feature_analysis'} = $fanal; | |
907 } | |
908 | |
909 return $self->{'feature_analysis'}; | |
910 } | |
911 | |
912 =head2 norm_analysis | |
913 | |
914 Example : $imp->norm_analysis($anal); | |
915 Description: Getter/Setter for the normalisation analysis | |
916 Arg [1] : optional - Bio::EnsEMBL::Analysis | |
917 Returntype : Bio::EnsEMBL::Analysis | |
918 Exceptions : Throws if arg is not valid or stored | |
919 Caller : general | |
920 Status : at risk | |
921 | |
922 =cut | |
923 | |
924 sub norm_analysis{ | |
925 my ($self) = shift; | |
926 | |
927 if (@_) { | |
928 my $anal = shift; | |
929 | |
930 #do we need this as we're checking in new? | |
931 if (! (ref($anal) && $anal->isa('Bio::EnsEMBL::Analysis') && $anal->dbID())) { | |
932 throw("Must pass a valid stored Bio::EnsEMBL::Analysis"); | |
933 } | |
934 | |
935 $self->{'norm_analysis'} = $anal; | |
936 } | |
937 | |
938 return $self->{'norm_analysis'}; | |
939 } | |
940 | |
941 | |
942 | |
943 =head2 cell_type | |
944 | |
945 Example : $imp->cell_type($ctype); | |
946 Description: Getter/Setter for Experiment CellType | |
947 Arg [1] : optional - Bio::EnsEMBL::Funcgen::CellType | |
948 Returntype : Bio::EnsEMBL::Funcgen::CellType | |
949 Exceptions : Throws if arg is not valid or stored | |
950 Caller : general | |
951 Status : at risk | |
952 | |
953 =cut | |
954 | |
955 sub cell_type{ | |
956 my ($self) = shift; | |
957 | |
958 if (@_) { | |
959 my $ctype = shift; | |
960 | |
961 #do we need this as we're checking in new? | |
962 if (! ($ctype->isa('Bio::EnsEMBL::Funcgen::CellType') && $ctype->dbID())) { | |
963 throw("Must pass a valid stored Bio::EnsEMBL::Funcgen::CellType"); | |
964 } | |
965 | |
966 $self->{'cell_type'} = $ctype; | |
967 } | |
968 | |
969 return $self->{'cell_type'}; | |
970 } | |
971 | |
972 | |
973 ##=head2 ucsc_coords | |
974 ## | |
975 ## Example : $start += 1 if $self->ucsc_coords; | |
976 ## Description: Getter for UCSC coordinate usage flag | |
977 ## Returntype : boolean | |
978 ## Exceptions : none | |
979 ## Caller : general | |
980 ## Status : at risk | |
981 ## | |
982 ##=cut | |
983 ## | |
984 ##sub ucsc_coords{ | |
985 ## my $self = shift; | |
986 ## return $self->{'ucsc_coords'}; | |
987 ##} | |
988 | |
989 | |
990 | |
991 =head2 array_file | |
992 | |
993 Example : my $array_file = $imp->array_file(); | |
994 Description: Getter/Setter for sanger/design array file | |
995 Arg [1] : optional - path to adf or gff array definition/mapping file | |
996 Returntype : string | |
997 Exceptions : none | |
998 Caller : general | |
999 Status : at risk | |
1000 | |
1001 =cut | |
1002 | |
1003 sub array_file{ | |
1004 my ($self) = shift; | |
1005 $self->{'array_file'} = shift if(@_); | |
1006 return $self->{'array_file'}; | |
1007 } | |
1008 | |
1009 =head2 array_name | |
1010 | |
1011 Example : my $array_name = $imp->array_name(); | |
1012 Description: Getter/Setter for array name | |
1013 Arg [1] : optional string - name of array | |
1014 Returntype : string | |
1015 Exceptions : none | |
1016 Caller : general | |
1017 Status : at risk | |
1018 | |
1019 =cut | |
1020 | |
1021 sub array_name{ | |
1022 my ($self) = shift; | |
1023 $self->{'array_name'} = shift if(@_); | |
1024 return $self->{'array_name'}; | |
1025 } | |
1026 | |
1027 | |
1028 =head2 array_set | |
1029 | |
1030 Example : $imp->array_set(1); | |
1031 Description: Getter/Setter for array set flag | |
1032 Arg [1] : optional boolean - treat all array chips as the same array | |
1033 Returntype : boolean | |
1034 Exceptions : none | |
1035 Caller : general | |
1036 Status : at risk | |
1037 | |
1038 =cut | |
1039 | |
1040 sub array_set{ | |
1041 my ($self) = shift; | |
1042 $self->{'array_set'} = shift if(@_); | |
1043 return $self->{'array_set'}; | |
1044 } | |
1045 | |
1046 | |
1047 =head2 add_Array | |
1048 | |
1049 Arg [1] : Bio::EnsEMBL::Funcgen::Array | |
1050 Example : $self->add_Array($array); | |
1051 Description: Setter for array elements | |
1052 Returntype : none | |
1053 Exceptions : throws if passed non Array or if more than one Array set | |
1054 Caller : Importer | |
1055 Status : At risk - Implement multiple arrays? Move to Experiment? | |
1056 | |
1057 =cut | |
1058 | |
1059 sub add_Array{ | |
1060 my $self = shift; | |
1061 | |
1062 #do we need to check if stored? | |
1063 if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Array')) { | |
1064 throw("Must supply a Bio::EnsEMBL::Funcgen::Array"); | |
1065 } elsif (@_) { | |
1066 push @{$self->{'arrays'}}, @_; | |
1067 } | |
1068 | |
1069 throw("Does not yet support multiple array imports") if(scalar (@{$self->{'arrays'}}) > 1); | |
1070 #need to alter read_probe data at the very least | |
1071 | |
1072 return; | |
1073 } | |
1074 | |
1075 | |
1076 | |
1077 =head2 arrays | |
1078 | |
1079 Example : foreach my $array(@{$imp->arrays}){ ...do an array of things ...}; | |
1080 Description: Getter for the arrays attribute | |
1081 Returntype : ARRAYREF | |
1082 Exceptions : none | |
1083 Caller : general | |
1084 Status : at risk | |
1085 | |
1086 =cut | |
1087 | |
1088 sub arrays{ | |
1089 my $self = shift; | |
1090 | |
1091 if (! defined $self->{'arrays'}) { | |
1092 $self->{'arrays'} = $self->db->get_ArrayAdaptor->fetch_all_by_Experiment($self->experiment()); | |
1093 } | |
1094 | |
1095 return $self->{'arrays'}; | |
1096 } | |
1097 | |
1098 | |
1099 | |
1100 =head2 location | |
1101 | |
1102 Example : $imp->location("Hinxton"); | |
1103 Description: Getter/Setter for group location | |
1104 Arg [1] : optional - location | |
1105 Returntype : string | |
1106 Exceptions : none | |
1107 Caller : general | |
1108 Status : Stable | |
1109 | |
1110 =cut | |
1111 | |
1112 | |
1113 sub location{ | |
1114 my ($self) = shift; | |
1115 $self->{'location'} = shift if(@_); | |
1116 return $self->{'location'}; | |
1117 } | |
1118 | |
1119 | |
1120 =head2 contact | |
1121 | |
1122 Example : my $contact = $imp->contact(); | |
1123 Description: Getter/Setter for the group contact | |
1124 Arg [1] : optional - contact name/email/address | |
1125 Returntype : string | |
1126 Exceptions : none | |
1127 Caller : general | |
1128 Status : Stable | |
1129 | |
1130 =cut | |
1131 | |
1132 sub contact{ | |
1133 my ($self) = shift; | |
1134 $self->{'contact'} = shift if(@_); | |
1135 return $self->{'contact'}; | |
1136 } | |
1137 | |
1138 =head2 name | |
1139 | |
1140 Example : $imp->name('Experiment1'); | |
1141 Description: Getter/Setter for the experiment name | |
1142 Arg [1] : optional - experiment name | |
1143 Returntype : string | |
1144 Exceptions : none | |
1145 Caller : general | |
1146 Status : Stable | |
1147 | |
1148 =cut | |
1149 | |
1150 | |
1151 sub name{ | |
1152 my ($self) = shift; | |
1153 $self->{'name'} = shift if(@_); | |
1154 return $self->{'name'}; | |
1155 } | |
1156 | |
1157 =head2 result_files | |
1158 | |
1159 Example : $imp->result_files(\@files); | |
1160 Description: Getter/Setter for the result file paths | |
1161 Arg [1] : Listref of file paths | |
1162 Returntype : Listref | |
1163 Exceptions : none | |
1164 Caller : general | |
1165 Status : At risk | |
1166 | |
1167 =cut | |
1168 | |
1169 | |
1170 sub result_files{ | |
1171 my ($self) = shift; | |
1172 $self->{'result_files'} = shift if(@_); | |
1173 return $self->{'result_files'}; | |
1174 } | |
1175 | |
1176 | |
1177 | |
1178 | |
1179 | |
1180 =head2 experiment_date | |
1181 | |
1182 Example : $imp->experiment_date('2006-11-02'); | |
1183 Description: Getter/Setter for the experiment date | |
1184 Arg [1] : optional - date string in yyyy-mm-dd | |
1185 Returntype : string | |
1186 Exceptions : none | |
1187 Caller : general | |
1188 Status : At risk | |
1189 | |
1190 =cut | |
1191 | |
1192 | |
1193 | |
1194 sub experiment_date{ | |
1195 my ($self) = shift; | |
1196 | |
1197 if (@_) { | |
1198 my $date = shift; | |
1199 | |
1200 if ($date !~ /[0-9]{4}-[0-9]{2}-[0-9]{2}/o) { | |
1201 throw('Parameter -experiment_date needs to be in the format: YYYY-MM-DD'); | |
1202 } | |
1203 | |
1204 $self->{'experiment_date'} = $date; | |
1205 } elsif ($self->vendor() eq "NIMBLEGEN" && ! defined $self->{'experiment_date'}) { | |
1206 $self->{'experiment_date'} = &get_date("date", $self->get_config("chip_file")); | |
1207 } | |
1208 | |
1209 return $self->{'experiment_date'}; | |
1210 } | |
1211 | |
1212 | |
1213 | |
1214 =head2 group | |
1215 | |
1216 Example : my $exp_group = $imp->group(); | |
1217 Description: Getter/Setter for the group name | |
1218 Arg [1] : optional - group name | |
1219 Returntype : string | |
1220 Exceptions : none | |
1221 Caller : general | |
1222 Status : Stable | |
1223 | |
1224 =cut | |
1225 | |
1226 sub group{ | |
1227 my ($self) = shift; | |
1228 $self->{'group'} = shift if(@_); | |
1229 return $self->{'group'}; | |
1230 } | |
1231 | |
1232 | |
1233 =head2 description | |
1234 | |
1235 Example : $imp->description("Human chrX H3 Lys 9 methylation"); | |
1236 Description: Getter/Setter for the experiment element | |
1237 Arg [1] : optional - experiment description | |
1238 Returntype : string | |
1239 Exceptions : none | |
1240 Caller : general | |
1241 Status : Stable | |
1242 | |
1243 =cut | |
1244 | |
1245 sub description{ | |
1246 my $self = shift; | |
1247 | |
1248 if (@_) { | |
1249 $self->{'description'} = shift; | |
1250 } | |
1251 | |
1252 return $self->{'description'}; | |
1253 } | |
1254 | |
1255 ##=head2 feature_set_description | |
1256 ## | |
1257 ## Example : $imp->description("ExperimentalSet description"); | |
1258 ## Description: Getter/Setter for the FeatureSet description for an | |
1259 ## InputSet import e.g. preprocessed GFF/Bed data | |
1260 ## Arg [1] : optional - string feature set description | |
1261 ## Returntype : string | |
1262 ## Exceptions : none | |
1263 ## Caller : general | |
1264 ## Status : At risk | |
1265 ## | |
1266 ##=cut | |
1267 ## | |
1268 ##sub feature_set_description{ | |
1269 ## my $self = shift; | |
1270 ## | |
1271 ## $self->{'feature_set_description'} = shift if @_; | |
1272 ## | |
1273 ## return $self->{'feature_set_description'}; | |
1274 ##} | |
1275 | |
1276 =head2 format | |
1277 | |
1278 Example : $imp->format("Tiled"); | |
1279 Description: Getter/Setter for the array format | |
1280 Arg [1] : optional - array format | |
1281 Returntype : string | |
1282 Exceptions : none | |
1283 Caller : general | |
1284 Status : Stable | |
1285 | |
1286 =cut | |
1287 | |
1288 sub format{ | |
1289 my ($self) = shift; | |
1290 $self->{'format'} = shift if(@_); | |
1291 return $self->{'format'}; | |
1292 } | |
1293 | |
1294 =head2 experiment | |
1295 | |
1296 Example : my $exp = $imp->experiment(); | |
1297 Description: Getter/Setter for the Experiment element | |
1298 Arg [1] : optional - Bio::EnsEMBL::Funcgen::Experiment | |
1299 Returntype : Bio::EnsEMBL::Funcgen::Experiment | |
1300 Exceptions : throws if arg is not an Experiment | |
1301 Caller : general | |
1302 Status : Stable | |
1303 | |
1304 =cut | |
1305 | |
1306 sub experiment{ | |
1307 my ($self) = shift; | |
1308 | |
1309 if (@_) { | |
1310 | |
1311 if (! $_[0]->isa('Bio::EnsEMBL::Funcgen::Experiment')) { | |
1312 throw("Must pass a Bio::EnsEMBL::Funcgen::Experiment object"); | |
1313 } | |
1314 | |
1315 $self->{'experiment'} = shift; | |
1316 } | |
1317 | |
1318 return $self->{'experiment'}; | |
1319 } | |
1320 | |
1321 ##=head2 db | |
1322 ## | |
1323 ## Example : $imp->db($funcgen_db); | |
1324 ## Description: Getter/Setter for the db element | |
1325 ## Arg [1] : optional - Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor | |
1326 ## Returntype : Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor | |
1327 ## Exceptions : throws if arg is not an DBAdaptor | |
1328 ## Caller : general | |
1329 ## Status : Stable | |
1330 ## | |
1331 ##=cut | |
1332 ## | |
1333 ##sub db{ | |
1334 ## my $self = shift; | |
1335 ## | |
1336 ## if (defined $_[0] && $_[0]->isa("Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor")) { | |
1337 ## $self->{'db'} = shift; | |
1338 ## } elsif (defined $_[0]) { | |
1339 ## throw("Need to pass a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor"); | |
1340 ## } | |
1341 ## | |
1342 ## return $self->{'db'}; | |
1343 ##} | |
1344 ## | |
1345 ##=head2 pass | |
1346 ## | |
1347 ## Example : $imp->pass("password"); | |
1348 ## Description: Getter/Setter for the db password | |
1349 ## Arg [1] : optional - db password | |
1350 ## Returntype : string | |
1351 ## Exceptions : none | |
1352 ## Caller : general | |
1353 ## Status : Stable | |
1354 ## | |
1355 ##=cut | |
1356 ## | |
1357 ## | |
1358 ##sub pass{ | |
1359 ## my $self = shift; | |
1360 ## $self->{'pass'} = shift if(@_); | |
1361 ## return $self->{'pass'}; | |
1362 ##} | |
1363 ## | |
1364 ##=head2 pass | |
1365 ## | |
1366 ## Example : $imp->host("hoastname"); | |
1367 ## Description: Getter/Setter for the db hostname | |
1368 ## Arg [1] : optional - db hostname | |
1369 ## Returntype : string | |
1370 ## Exceptions : none | |
1371 ## Caller : general | |
1372 ## Status : Stable | |
1373 ## | |
1374 ##=cut | |
1375 ## | |
1376 ##sub host{ | |
1377 ## my $self = shift; | |
1378 ## $self->{'host'} = shift if(@_); | |
1379 ## return $self->{'host'}; | |
1380 ##} | |
1381 ## | |
1382 ##=head2 port | |
1383 ## | |
1384 ## Example : $imp->port(3306); | |
1385 ## Description: Getter/Setter for the db port number | |
1386 ## Arg [1] : optional - db port number | |
1387 ## Returntype : int | |
1388 ## Exceptions : none | |
1389 ## Caller : general | |
1390 ## Status : Stable | |
1391 ## | |
1392 ##=cut | |
1393 ## | |
1394 ##sub port{ | |
1395 ## my $self = shift; | |
1396 ## $self->{'port'} = shift if(@_); | |
1397 ## return $self->{'port'}; | |
1398 ##} | |
1399 ## | |
1400 ##=head2 user | |
1401 ## | |
1402 ## Example : $imp->user("user_name"); | |
1403 ## Description: Getter/Setter for the db user name | |
1404 ## Arg [1] : optional - db user name | |
1405 ## Returntype : string | |
1406 ## Exceptions : none | |
1407 ## Caller : general | |
1408 ## Status : Stable | |
1409 ## | |
1410 ##=cut | |
1411 ## | |
1412 ##sub user{ | |
1413 ## my $self = shift; | |
1414 ## $self->{'user'} = shift if(@_); | |
1415 ## return $self->{'user'}; | |
1416 ##} | |
1417 | |
1418 ##=head2 dump_fasta | |
1419 ## | |
1420 ## Example : if($self->dump_fasta()){...do fasta dump...} | |
1421 ## Description: Getter/Setter for the dump_fasta flag | |
1422 ## Arg [1] : optional - 0 or 1 | |
1423 ## Returntype : boolean | |
1424 ## Exceptions : none | |
1425 ## Caller : self | |
1426 ## Status : Stable | |
1427 ## | |
1428 ##=cut | |
1429 ## | |
1430 ## | |
1431 ##sub dump_fasta{ | |
1432 ## my $self = shift; | |
1433 ## $self->{'_dump_fasta'} = shift if @_; | |
1434 ## return $self->{'_dump_fasta'}; | |
1435 ##} | |
1436 ## | |
1437 | |
1438 | |
1439 ##=head2 species | |
1440 ## | |
1441 ## Example : $imp->species("homo_sapiens"); | |
1442 ## Description: Getter/Setter for species | |
1443 ## Arg [1] : optional - species name(alias?) | |
1444 ## Returntype : string | |
1445 ## Exceptions : none ? throw if no alias found? | |
1446 ## Caller : general | |
1447 ## Status : Medium - may move reg alias look up to this method | |
1448 ## | |
1449 ##=cut | |
1450 ## | |
1451 ##sub species{ | |
1452 ## my $self = shift; | |
1453 ## | |
1454 ## #should we do reg alias look up here? | |
1455 ## #Will we ever want to redefine species? | |
1456 ## #Change to just getter? | |
1457 ## | |
1458 ## $self->{'species'} = shift if(@_); | |
1459 ## | |
1460 ## return $self->{'species'}; | |
1461 ##} | |
1462 | |
1463 =head2 get_dir | |
1464 | |
1465 Example : $imp->get_dir("import"); | |
1466 Description: Retrieves full path for given directory | |
1467 Arg [1] : mandatory - dir name | |
1468 Returntype : string | |
1469 Exceptions : none | |
1470 Caller : general | |
1471 Status : at risk - move to Helper? | |
1472 | |
1473 =cut | |
1474 | |
1475 sub get_dir{ | |
1476 my ($self, $dirname) = @_; | |
1477 return $self->get_data("${dirname}_dir"); | |
1478 } | |
1479 | |
1480 =head2 norm_method | |
1481 | |
1482 Example : my $norm_method = $imp->norm_method() | |
1483 Description: Getter/Setter for normalisation method | |
1484 Arg [1] : optional - method name | |
1485 Returntype : string | |
1486 Exceptions : none ? throw if no analysis with logic name | |
1487 Caller : general | |
1488 Status : At risk - restrict to logic_name and validate against DB, allow multiple | |
1489 | |
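A usage sketch (VSN_GLOG is the documented Nimblegen default; the logic name
must correspond to an analysis already stored in the DB, as checked in
init_experiment_import):

  $imp->norm_method('VSN_GLOG');
  my $norm_anal = $imp->db->get_AnalysisAdaptor->fetch_by_logic_name($imp->norm_method);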
1490 =cut | |
1491 | |
1492 #Move to Nimblegen? | |
1493 #Do we ever want to normalise other data? | |
1494 | |
1495 sub norm_method{ | |
1496 my $self = shift; | |
1497 | |
1498 if (@_) { | |
1499 $self->{'norm_method'} = shift; | |
1500 } elsif (! defined $self->{'norm_method'}) { | |
1501 $self->{'norm_method'}= $self->get_config('norm_method'); | |
1502 } | |
1503 | |
1504 return $self->{'norm_method'}; | |
1505 } | |
1506 | |
1507 | |
1508 =head2 get_config | |
1509 | |
1510 Arg [1] : mandatory - name of the data element to retrieve from the config hash | |
1511 Example : %dye_freqs = %{$imp->get_config('dye_freqs')}; | |
1512 Description: returns data from the definitions hash | |
1513 Returntype : various | |
1514 Exceptions : none | |
1515 Caller : Importer | |
1516 Status : at risk - replace with direct calls in the inherited Defs class? | |
1517 | |
1518 =cut | |
1519 | |
1520 | |
1521 sub get_config{ | |
1522 my ($self, $data_name) = @_; | |
1523 return $self->get_data('config', $data_name); #will this cause undefs? | |
1524 } | |
1525 | |
1526 | |
1527 | |
1528 | |
1529 | |
1530 =head2 register_experiment | |
1531 | |
1532 Example : $imp->register_experiment() | |
1533 Description: General control method, performs all data import and normalisations | |
1534 Arg [1] : optional - dnadb DBAdaptor | |
1535 Returntype : none | |
1536 Exceptions : throws if arg is not Bio::EnsEMBL::DBSQL::DBAdaptor | |
1537 Caller : general | |
1538 Status : Medium | |
1539 | |
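A usage sketch; the core DBAdaptor construction below is an assumption for
illustration only ($core_dbname is a placeholder, any Bio::EnsEMBL::DBSQL::DBAdaptor
of the relevant data version will do):

  # use the dnadb already attached to the funcgen DBAdaptor
  $imp->register_experiment();

  # or explicitly pass a core (dna) DBAdaptor
  my $dnadb = Bio::EnsEMBL::DBSQL::DBAdaptor->new
    (
     -host   => 'ensembldb.ensembl.org',
     -user   => 'anonymous',
     -dbname => $core_dbname,
     -group  => 'core',
    );
  $imp->register_experiment($dnadb);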
1540 =cut | |
1541 | |
1542 | |
1543 #Need to split this method? | |
1544 #Pre farm process | |
1545 #Define and store all sets, | |
1546 #pre process input file once if required | |
1547 #How are we going to be able to tell whether this has been done successfully? | |
1548 #runner will catch error, therefore safe to assume that file is complete if farm job | |
1549 #unless we are not running with farm | |
1550 | |
1551 #farm specific processes | |
1552 #actually parse and store data | |
1553 | |
1554 #Can we separate these for different import types? | |
1555 | |
1556 sub register_experiment{ | |
1557 my ($self) = shift; | |
1558 | |
1559 #Need to check for dnadb passed with adaptor to constructor | |
1560 if (@_) { | |
1561 | |
1562 if ( ! $_[0]->isa("Bio::EnsEMBL::DBSQL::DBAdaptor")) { | |
1563 throw("You need to pass a valid dnadb adaptor to register the experiment"); | |
1564 } | |
1565 $self->db->dnadb($_[0]); | |
1566 } elsif ( ! $self->db) { | |
1567 throw("You need to pass/set a DBAdaptor with a DNADB attached of the relevant data version"); | |
1568 } | |
1569 | |
1570 #This could still be the default core db for the current version | |
1571 #warn here if not passed DB? | |
1572 #These should be vendor independent, only the read methods should need specific order? | |
1573 | |
1574 $self->init_experiment_import(); | |
1575 #can we just have init here instead? | |
1576 | |
1577 | |
1578 #This could do with a rewrite to move some things to the parsers | |
1579 #$self::SUPER->register_experiment | |
1580 | |
1581 $self->write_validate_experiment_config if $self->can('write_validate_experiment_config'); | |
1582 | |
1583 #This is too array specific! | |
1584 #Can we have an array of import_methods in the config? | |
1585 #foreach my $method(@{$self->get_config{'import_methods'}}){ | |
1586 #$self->method; | |
1587 #} | |
1588 #We're already doing this in read_data for each of these data_types | |
1589 | |
1590 #Need to be able to run this separately, so we can normalise previously imported sets with different methods | |
1591 #should be able to do this without raw data files e.g. retrieve info from DB | |
1592 #Is this implemented? | |
1593 | |
1594 $self->read_data("probe"); | |
1595 $self->read_data("results"); | |
1596 | |
1597 | |
1598 my $norm_method = $self->norm_method(); | |
1599 | |
1600 if (defined $norm_method) { | |
1601 warn "norm method is $norm_method"; | |
1602 | |
1603 $self->R_norm($norm_method); | |
1604 #change this to $self->$norm_method | |
1605 #so we can have non-R_norm normalisation | |
1606 } | |
1607 | |
1608 | |
1609 return; | |
1610 } | |
1611 | |
1612 #Move array specific ones to Nimblegen.pm? | |
1613 #Also used by ArrayDesign and Sanger.pm | |
1614 #So need to create Array.pm baseclass, which is a Helper. | |
1615 | |
1616 =head2 store_set_probes_features | |
1617 | |
1618 Arg [1] : mandatory - array chip id | |
1619 Arg [2] : mandatory - hashref of keys probe id, values are | |
1620 hashes of probe/features with values | |
1621 Bio::EnsEMBL::Funcgen::Probe/Features for a given | |
1622 probe set if defined. | |
1623 Arg [3] : optional - Bio::EnsEMBL::Funcgen::ProbeSet | |
1624 Example : $self->store_set_probes_features($ac->dbID(), \%pfs, $ops); | |
1625 Description: Stores probe set, probes and probe features | |
1626 Returntype : none | |
1627 Exceptions : none | |
1628 Caller : self | |
1629 Status : Medium | |
1630 | |
1631 =cut | |
1632 | |
1633 sub store_set_probes_features{ | |
1634 my ($self, $ac_id, $pf_hash, $ops) = @_; | |
1635 | |
1636 ### Deal with ProbeSets | |
1637 if ($ops) { | |
1638 $ops->size(scalar(keys %$pf_hash)); | |
1639 ($ops) = $self->db->get_ProbeSetAdaptor->store($ops); | |
1640 } | |
1641 | |
1642 #If we're going to validate fully, we need to check for probes in this probeset on this array chip | |
1643 #Update size if we have any new probes | |
1644 #Overkill? Only do on recover? Do not read if array chip is IMPORTED | |
1645 #This does not make any attempt to validate probes/set vs previously stored data | |
1646 | |
1647 for my $probe_id (keys %$pf_hash) { | |
1648 | |
1649 #set probeset in probe and store | |
1650 #the process corresponding feature | |
1651 my $probe = $pf_hash->{$probe_id}->{'probe'}; | |
1652 $probe->probeset($ops) if $ops; | |
1653 ($probe) = @{$self->db->get_ProbeAdaptor->store($probe)}; | |
1654 | |
1655 #Can't use get_all_Arrays here as we can't guarantee this will only ever be the array we've generated | |
1656 #Might dynamically load array if non-present | |
1657 #This is allowing multiple dbIDs per probe??? Is this wrong? | |
1658 #$self->cache_probe_info($probe->get_probename(), $probe->dbID());###########Remove as we're now importing all then resolving | |
1659 | |
1660 | |
1661 foreach my $feature (@{$pf_hash->{$probe_id}->{'features'}}) { | |
1662 $feature->probe($probe); | |
1663 ($feature) = @{$self->db->get_ProbeFeatureAdaptor->store($feature)}; | |
1664 } | |
1665 } | |
1666 | |
1667 undef $ops; #Will this persist in the caller? | |
1668 undef %{$pf_hash}; | |
1669 | |
1670 return; | |
1671 } | |
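#Sketch of the expected probe/feature hash (variable names below are
#illustrative), matching the 'probe' and 'features' keys used above:
#
#  my %pfs = (
#    $probe_id => {
#      probe    => $probe,           #unstored Bio::EnsEMBL::Funcgen::Probe
#      features => [ @probe_feats ], #unstored Bio::EnsEMBL::Funcgen::ProbeFeatures
#    },
#  );
#
#  $self->store_set_probes_features($array_chip->dbID(), \%pfs, $probe_set);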
1672 | |
1673 | |
1674 =head2 cache_slice | |
1675 | |
1676 Arg [0] : string - region_name e.g. X | |
1677 Arg [1] : optional - coordinate system name e.g. supercontig, defaults to chromosome | |
1678 Example : my $slice = $self->cache_slice(12); | |
1679 Description: Caches or retrieves from cache a given slice | |
1680 Returntype : Bio::EnsEMBL::Slice | |
1681 Exceptions : throws if no region name specified | |
1682 Caller : self | |
1683 Status : At risk | |
1684 | |
1685 =cut | |
1686 | |
1687 sub cache_slice{ | |
1688 my ($self, $region_name, $cs_name, $total_count) = @_; | |
1689 | |
1690 throw("Need to define a region_name to cache a slice from") if ! $region_name; | |
1691 $self->{'slice_cache'} ||= {}; | |
1692 $region_name =~ s/chr//; | |
1693 $region_name = "MT" if $region_name eq "M"; | |
1694 | |
1695 if (! exists ${$self->{'seen_slice_cache'}}{$region_name}) { | |
1696 my $slice = $self->slice_adaptor->fetch_by_region($cs_name, $region_name); | |
1697 | |
1698 #Set seen cache so we don't try this again | |
1699 $self->{seen_slice_cache}{$region_name} = $slice; | |
1700 | |
1701 if (! $slice) { | |
1702 warn("-- Could not generate a slice for ${cs_name}:$region_name\n"); | |
1703 } else { | |
1704 | |
1705 my $sr_name = $slice->seq_region_name; #In case we passed a slice name | |
1706 | |
1707 if (@{$self->{seq_region_names}}) { | |
1708 return if ! grep(/^${sr_name}$/, @{$self->{seq_region_names}}); #not on required slice | |
1709 } | |
1710 } | |
1711 | |
1712 $self->{'slice_cache'}->{$region_name} = $slice; | |
1713 } | |
1714 | |
1715 if ($total_count && exists ${$self->{'seen_slice_cache'}}{$region_name}) { | |
1716 #This is an InputSet specific method | |
1717 $self->count('total_features') if $self->can('count'); | |
1718 } | |
1719 | |
1720 #Only return if exists to avoid creating hash key | |
1721 return (exists $self->{'slice_cache'}->{$region_name}) ? $self->{'slice_cache'}->{$region_name} : undef; | |
1722 } | |
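#Usage sketch: when parsing line-based input, cache_slice avoids refetching the
#same Slice and lets us skip regions which cannot be fetched or are filtered
#out ($chr here is an illustrative parsed value):
#
#  my $slice = $self->cache_slice($chr);                 #chromosome by default
#  #my $slice = $self->cache_slice($chr, 'supercontig'); #other coord system
#  next if ! defined $slice;                             #unknown/filtered seq_region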
1723 | |
1724 =head2 slice_cache | |
1725 | |
1726 Example : my @seen_slices = values(%{$self->slice_cache}); | |
1727 Description: Returns the slice cache i.e. all the Slices seen in the data filtered | |
1728 by the defined slices. This method can be used to run only the appropriate | |
1729 slice jobs after a prepare stage. | |
1730 Returntype : Hashref of seq_region name Bio::EnsEMBL::Slice pairs | |
1731 Exceptions : None | |
1732 Caller : self | |
1733 Status : At risk | |
1734 | |
1735 =cut | |
1736 | |
1737 | |
1738 sub slice_cache{ | |
1739 my $self = shift; | |
1740 | |
1741 return $self->{'slice_cache'}; | |
1742 } | |
1743 | |
1744 | |
1745 | |
1746 | |
1747 =head2 cache_probe_info | |
1748 | |
1749 Arg [0] : mandatory - probe name | |
1750 Arg [1] : mandatory - probe dbID | |
1751 Arg [2] : optional int - x coord of probe on array | |
1752 Arg [3] : optional int - y coord of probe on array | |
1753 Example : $self->cache_probe_info("Probe1", $probe->dbID()); | |
1754 Or for result files which do not have X & Y, we need to cache | |
1755 X & Y from the design files: $self->cache_probe_info('Probe2', $probe->dbID(), $x, $y); | |
1756 Description: Setter for probe cache values (deprecated - now throws on use) | |
1757 Returntype : none | |
1758 Exceptions : throws if a cache conflict is encountered | |
1759 Caller : self | |
1760 Status : At risk - merge with following? | |
1761 | |
1762 =cut | |
1763 | |
1764 sub cache_probe_info{ | |
1765 my ($self, $pname, $pid, $x, $y) = @_; | |
1766 | |
1767 throw('Deprecated, too memory expensive, now resolving DB duplicates and using Tied File cache'); | |
1768 throw("Must provide a probe name and id") if (! defined $pname || ! defined $pid); | |
1769 | |
1770 | |
1771 #do we need to loop through the file here? | |
1772 #if we make sure we're testing for a previous dbID before storing probes then we don't need to do this | |
1773 #we can catch the error when we get the probe id as we can check for >1 id for the same probe name | |
1774 #if (defined $self->{'_probe_cache'}->{$pname} && ($self->{'_probe_cache'}->{$pname}->[0] != $pid)) { | |
1775 # throw("Found two differing dbIDs for $pname, need to sort out redundant probe entries"); | |
1776 #} | |
1777 | |
1778 $self->{'_probe_cache'}->{$pname} = (defined $x && defined $y) ? [$pid, $x, $y] : [$pid]; | |
1779 | |
1780 return; | |
1781 } | |
1782 | |
1783 | |
1784 =head2 get_probe_id_by_name_Array | |
1785 | |
1786 Arg [1] : mandatory - probe name | |
1787 Example : $pid = $self->get_probe_id_by_name_Array($pname, $array); | |
1788 Description: Getter for probe cache values | |
1789 Returntype : int | |
1790 Exceptions : none | |
1791 Caller : self | |
1792 Status : At risk - merge with previous, move to importer? | |
1793 | |
1794 =cut | |
1795 | |
1796 sub get_probe_id_by_name_Array{ | |
1797 my ($self, $name, $array) = @_; | |
1798 | |
1799 #this is only ever called for fully imported ArrayChips, as will be deleted if recovering | |
1800 $self->resolve_probe_data() if(! exists $self->{'_probe_cache'}{$array->name()}); | |
1801 | |
1802 #we want to cycle through the given cache starting from the last position or 0. | |
1803 #we don't want to have to test for the size of the cache each time as this will be quite expensive | |
1804 #so we should store sizes separately along with current position | |
1805 | |
1806 | |
1807 my ($pid, $line); | |
1808 | |
1809 #check current line | |
1810 if ($line = $self->{'_probe_cache'}{$array->name()}{'current_line'}) { | |
1811 if ($line =~ /^\Q${name}\E\t/) { | |
1812 $pid = (split/\t/o, $line)[1]; | |
1813 } | |
1814 } | |
1815 | |
1816 | |
1817 if (! $pid) { | |
1818 while ($line = $self->{'_probe_cache'}{$array->name()}{'handle'}->getline()) { | |
1819 | |
1820 if ($line =~ /^\Q${name}\E\t/) { | |
1821 $pid = (split/\t/o, $line)[1]; | |
1822 $self->{'_probe_cache'}{$array->name()}{'current_line'} = $line; | |
1823 last; | |
1824 } | |
1825 } | |
1826 } | |
1827 | |
1828 #do not remove this | |
1829 if (! $pid) { | |
1830 throw("Did not find probe name ($name) in cache, cache may need rebuilding, results may need sorting, or do you have an anomalous probe?"); | |
1831 } else { | |
1832 chomp $pid; | |
1833 } | |
1834 | |
1835 return $pid; | |
1836 } | |
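#Usage sketch: the cache handle is only ever read forwards, so lookups should
#be made in the same name-sorted order as the cache file (i.e. the result data
#should be sorted accordingly); $probe_name and $array are illustrative:
#
#  my $probe_id = $self->get_probe_id_by_name_Array($probe_name, $array);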
1837 | |
1838 =head2 get_probe_cache_by_Array | |
1839 | |
1840 Arg[1] : Bio::EnsEMBL::Funcgen::Array | |
1841 Arg[2] : boolean - from db flag, only to be used by Importer->resolve_probe_data ! | |
1842 Example : $self->get_probe_cache_by_Array(); | |
1843 Description: Gets the probe info cache, a name-sorted probe name/dbID file accessed via a stored file handle | |
1844 Returntype : Boolean - True if cache has been generated and set successfully | |
1845 Exceptions : none | |
1846 Caller : general | |
1847 Status : At risk | |
1848 | |
1849 =cut | |
1850 | |
1851 #from db flag should only be used by importer | |
1852 #this is because there is no guarantee that it will be resolved unless | |
1853 #called by resolve_probe_data | |
1854 #which then renames the file and resets the handle | |
1855 #can we clean this up and protect/hide this functionality? | |
1856 #can we check the cache file name in the get methods and throw if it contains unresolved? | |
1857 #make this private? | |
1858 | |
1859 sub get_probe_cache_by_Array{ | |
1860 my ($self, $array, $from_db) = @_; | |
1861 | |
1862 my $msg = "Getting probe cache for ".$array->name(); | |
1863 $msg .= " from DB" if $from_db; | |
1864 $self->log($msg); #, 1); | |
1865 | |
1866 if (! ($array && $array->isa('Bio::EnsEMBL::Funcgen::Array') && $array->dbID())) { | |
1867 throw('Must provide a valid stored Bio::EnsEMBL::Funcgen::Array object'); | |
1868 } | |
1869 | |
1870 my $set = 0; | |
1871 my $cache_file = $self->get_dir('caches').'/'.$array->name().'.probe_cache'; | |
1872 | |
1873 ### Generate and resolve fresh cache from DB | |
1874 if ($from_db) { | |
1875 | |
1876 $cache_file .= '.unresolved'; #This will be renamed by the caller if it is resolved | |
1877 | |
1878 if (exists $self->{'_probe_cache'}{$array->name()}) { | |
1879 $self->log('Rebuilding probe_cache from DB for '.$array->name(), 1); | |
1880 | |
1881 | |
1882 #untie @{$self->{'_probe_cache'}{$array->name()}{'entries'}}; | |
1883 #close($self->{'_probe_cache'}{$array->name()}{'handle'});#do we need to do this? | |
1884 delete $self->{'_probe_cache'}{$array->name()}; #implicitly closes | |
1885 $self->log('Deleted old cache', 1); | |
1886 } else { | |
1887 $self->log('Building probe_cache from DB for '.$array->name(), 1); | |
1888 } | |
1889 | |
1890 #Move this to ProbeAdaptor? | |
1891 #This is where we'd set the unique key for a vendor and resolve duplicates based on the key | |
1892 my $cmd = 'SELECT name, probe_id from probe WHERE array_chip_id IN ('.join(',', @{$array->get_array_chip_ids()}).') ORDER by name, probe_id'; | |
1893 $cmd = 'mysql '.$self->db->connect_string()." -e \"$cmd\" >".$cache_file; | |
1894 run_system_cmd($cmd); | |
1895 | |
1896 } | |
1897 | |
1898 ### Set cache | |
1899 if (-f $cache_file) { | |
1900 $self->log('MD5 check here?',1); | |
1901 $self->{'_probe_cache'}{$array->name()}{'current_line'} = undef; | |
1902 $self->{'_probe_cache'}{$array->name()}{'handle'} = open_file($cache_file); | |
1903 | |
1904 #can we do a select count instead? and do this instead of the MD5? | |
1905 #$cmd = "wc -l $cache_file"; | |
1906 #my $size = `$cmd`; | |
1907 | |
1908 $set = 1; | |
1909 } else { | |
1910 warn 'Failed to get probe cache for array:'.$array->name(); | |
1911 } | |
1912 | |
1913 return $set; | |
1914 } | |
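#Sketch of what this sets up: a name-sorted, tab-separated dump of
#probe name/probe dbID pairs plus a stored read handle, e.g. (illustrative):
#
#  my $cache = $self->{'_probe_cache'}{$array->name()};
#
#  while (my $line = $cache->{'handle'}->getline()) {
#    my ($pname, $pid) = split /\t/, $line;
#    #...
#  }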
1915 | |
1916 | |
1917 #should reorganise these methods to split reading the array data, and the actual data | |
1918 #currently: | |
1919 #meta reads array and chip data | |
1920 #probe reads probe_set, probes, which should definitely be in array, probe_feature? and results | |
1921 #native data format may not map to these methods directly, so may need to call previous method if required data not defined | |
1922 | |
1923 | |
1924 =head2 read_data | |
1925 | |
1926 Example : $self->read_data("probe") | |
1927 Description: Calls each method in data_type array from config hash | |
1928 Arg [1] : mandatory - data type | |
1929 Returntype : none | |
1930 Exceptions : none | |
1931 Caller : self | |
1932 Status : At risk | |
1933 | |
1934 =cut | |
1935 | |
1936 sub read_data{ | |
1937 my($self, $data_type) = @_; | |
1938 | |
1939 map {my $method = "read_${_}_data"; $self->$method()} @{$self->get_config("${data_type}_data")}; | |
1940 return; | |
1941 } | |
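#Dispatch sketch: given a (purely illustrative) parser config such as
#
#  my %config = (
#    probe_data   => ['probe'],
#    results_data => ['results'],
#  );
#
#$self->read_data('probe') expands to $self->read_probe_data(), and
#$self->read_data('results') to $self->read_results_data(), with those
#read_*_data methods supplied by the vendor-specific parser subclass.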
1942 | |
1943 | |
1944 =head2 design_type | |
1945 | |
1946 Example : $self->design_type("binding_site_identification") | |
1947 Description: Getter/Setter for experimental design type | |
1948 Arg [1] : optional - design type | |
1949 Returntype : string | |
1950 Exceptions : none | |
1951 Caller : general | |
1952 Status : At risk | |
1953 | |
1954 =cut | |
1955 | |
1956 sub design_type{ | |
1957 my ($self, $design_type) = @_; | |
1958 return defined($design_type) ? ($self->{'design_type'} = $design_type) : $self->{'design_type'}; | |
1959 } | |
1960 | |
1961 | |
1962 =head2 get_chr_seq_region_id | |
1963 | |
1964 Example : $seq_region_id = $self->get_chr_seq_region_id('X'); | |
1965 Description: Convenience wrapper to fetch the seq_region_id for a given chromosome name | |
1966 Arg [1] : mandatory - chromosome name | |
1967 Arg [2] : optional - start value | |
1968 Arg [3] : optional - end value | |
1969 Returntype : int | |
1970 Exceptions : none | |
1971 Caller : self | |
1972 Status : At risk | |
1973 | |
1974 =cut | |
1975 | |
1976 #convenience wrapper method | |
1977 #could we use the seq region cache instead? | |
1978 #this seems like a lot of overhead for getting the id | |
1979 sub get_chr_seq_region_id{ | |
1980 my ($self, $chr, $start, $end) = @_; | |
1981 #what about strand info? | |
1982 | |
1983 #do we need the start and stop? | |
1984 | |
1985 #use start and stop to prevent problems with scaffold assemblies, i.e. >1 seq_region_id | |
1986 #my $slice = $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end); | |
1987 #we could pass the slice back to the slice adaptor for this, to avoid dbID problems between DBs | |
1988 | |
1989 ###would need to implement other cs's here | |
1990 | |
1991 return $self->slice_adaptor->fetch_by_region("chromosome", $chr, $start, $end)->get_seq_region_id(); | |
1992 } | |
1993 | |
1994 | |
1995 | |
1996 =head2 vsn_norm | |
1997 | |
1998 Example : $self->vsn_norm(); | |
1999 Description: Convenience/Wrapper method for vsn R normalisation | |
2000 Returntype : none | |
2001 Exceptions : none | |
2002 Caller : general | |
2003 Status : At risk | |
2004 | |
2005 =cut | |
2006 | |
2007 #Have Norm class or contain methods in importer? | |
2008 #Need to have analysis set up script for all standard analyses. | |
2009 | |
2010 sub vsn_norm{ | |
2011 my $self = shift; | |
2012 return $self->R_norm("VSN_GLOG"); | |
2013 } | |
2014 | |
2015 | |
2016 =head2 farm | |
2017 | |
2018 Arg [1] : Boolean | |
2019 Example : $importer->farm(1); | |
2020 Description: Flag to turn farm submission on | |
2021 Returntype : Boolean | |
2022 Exceptions : Throws if argument is not a boolean | |
2023 Caller : general | |
2024 Status : At risk | |
2025 | |
2026 =cut | |
2027 | |
2028 | |
2029 sub farm{ | |
2030 my ($self, $farm) = @_; | |
2031 | |
2032 $self->{'farm'} ||= undef; #define farm | |
2033 | |
2034 if (defined $farm) { | |
2035 throw("Argument to farm must be a boolean 1 or 0") if(! ($farm == 1 || $farm == 0)); | |
2036 $self->{'farm'} = $farm; | |
2037 } | |
2038 | |
2039 return $self->{'farm'}; | |
2040 | |
2041 } | |
2042 | |
2043 =head2 batch_job | |
2044 | |
2045 Arg [1] : Boolean | |
2046 Example : $importer->batch_job(1); | |
2047 Description: Flag to turn on batch_job status | |
2048 Returntype : Boolean | |
2049 Exceptions : Throws if argument is not a boolean | |
2050 Caller : general | |
2051 Status : At risk | |
2052 | |
2053 =cut | |
2054 | |
2055 | |
2056 sub batch_job{ | |
2057 my ($self, $batch_job) = @_; | |
2058 | |
2059 #$self->{'batch_job'} ||= undef; | |
2060 | |
2061 if (defined $batch_job) { | |
2062 throw("Argument to batch_job must be a boolean 1 or 0") if(! ($batch_job == 1 || $batch_job == 0)); | |
2063 $self->{'batch_job'} = $batch_job; | |
2064 } | |
2065 | |
2066 return $self->{'batch_job'}; | |
2067 | |
2068 } | |
2069 | |
2070 =head2 prepared | |
2071 | |
2072 Arg [1] : Boolean | |
2073 Example : $importer->prepared(1); | |
2074 Description: Flag to turn on prepared file status | |
2075 This signifies that the files have been previously imported | |
2076 using prepare mode and may not match the InputSubset names | |
2077 Returntype : Boolean | |
2078 Exceptions : None | |
2079 Caller : general | |
2080 Status : At risk | |
2081 | |
2082 =cut | |
2083 | |
2084 | |
2085 sub prepared{ | |
2086 my ($self, $prepared) = @_; | |
2087 $self->{'prepared'} = $prepared if (defined $prepared); | |
2088 return $self->{'prepared'}; | |
2089 } | |
2090 | |
2091 | |
2092 =head2 R_norm | |
2093 | |
2094 Example : $self->R_norm(@logic_names); | |
2095 Description: Performs R normalisations for given logic names | |
2096 Returntype : none | |
2097 Exceptions : Throws if R exits with error code or if data is not valid for analysis | |
2098 Caller : general | |
2099 Status : At risk | |
2100 | |
2101 =cut | |
2102 | |
2103 sub R_norm{ | |
2104 my ($self, @logic_names) = @_; | |
2105 #This currently normalises a single two colour chip at a time | |
2106 #rather than normalising across a set of chips | |
2107 #also does in sets of analyses | |
2108 #good for keeping data separate, but not efficient in terms of querying | |
2109 #convert to use one script which only queries for each echip once, then does each anal | |
2110 | |
2111 | |
2112 my $aa = $self->db->get_AnalysisAdaptor(); | |
2113 my $rset_adaptor = $self->db->get_ResultSetAdaptor(); | |
2114 my $ra_id = $aa->fetch_by_logic_name("RawValue")->dbID(); | |
2115 my %r_config = ( | |
2116 "VSN_GLOG" => {( libs => ['vsn'], | |
2117 #norm_method => 'vsn', | |
2118 )}, | |
2119 | |
2120 "T.Biweight" => {( | |
2121 libs => ['affy'], | |
2122 #norm_method => 'tukey.biweight', | |
2123 )}, | |
2124 ); | |
2125 | |
2126 | |
2127 foreach my $logic_name (@logic_names) { | |
2128 #throw("Not yet implemented TukeyBiweight") if $logic_name eq "Tukey_Biweight"; | |
2129 | |
2130 #this has already been checked and set as the norm_analysis | |
2131 #need to resolve this multi method approach | |
2132 my $norm_anal = $aa->fetch_by_logic_name($logic_name); | |
2133 | |
2134 | |
2135 | |
2136 #This only creates an RSet for the IMPORT set | |
2137 #So if we re-run with a different analysis | |
2138 #tab2mage will have already been validated | |
2139 #So RSet generation will be skipped | |
2140 #We need to recreate each non-import RSet for this norm analysis | |
2141 | |
2142 #This also means the RSets are being created before the data has been imported | |
2143 #This avoids having to parse tab2mage each time but means we have an uncertain status of these Rsets | |
2144 | |
2145 my $rset = $self->get_import_ResultSet($norm_anal, 'experimental_chip'); | |
2146 | |
2147 my @chips = (); | |
2148 | |
2149 if (! $rset) { | |
2150 $self->log("All ExperimentalChips already have status:\t${logic_name}"); | |
2151 } else { #Got data to normalise and import | |
2152 my @dbids; | |
2153 my $R_file = $self->get_dir("norm")."/${logic_name}.R"; | |
2154 my $job_name = $self->experiment->name()."_${logic_name}"; | |
2155 my $resultfile = $self->get_dir("norm")."/result.${logic_name}.txt"; | |
2156 my $outfile = $self->get_dir("norm")."/${logic_name}.out"; | |
2157 | |
2158 #How do we get farm job output i.e. run time memusage | |
2159 #from interactive job? | |
2160 #This assumes R_PATH | |
2161 my $errfile = $self->get_dir("norm")."/${logic_name}.err"; | |
2162 | |
2163 #Let's build this better so we capture the farm output as well as the job output. | |
2164 my $cmdline = "$ENV{'R_PATH'} --no-save < $R_file"; # >$errfile 2>&1"; | |
2165 #-K option waits for job to complete | |
2166 my $bsub = "bsub -K -J $job_name ".$ENV{'R_BSUB_OPTIONS'}. | |
2167 " -e $errfile -o $outfile $ENV{'R_FARM_PATH'} CMD BATCH $R_file"; | |
2168 | |
2169 #Can we separate the out and err for commandline? | |
2170 my $r_cmd = (! $self->farm()) ? "$cmdline >$outfile 2>&1" : $bsub; | |
2171 | |
2172 $self->backup_file($resultfile); #Need to do this as we're appending in the loop | |
2173 | |
2174 #set up query | |
2175 #warn "Need to add host and port here"; | |
2176 #Set up DB, defaults and libs for each logic name | |
2177 my $query = "options(scipen=20);library(RMySQL);library(Ringo);"; | |
2178 #scipen is to prevent probe_ids being converted to exponents | |
2179 #Ringo is for default QC | |
2180 | |
2181 #foreach my $ln(@logic_names){ | |
2182 | |
2183 foreach my $lib (@{$r_config{$logic_name}{'libs'}}) { | |
2184 $query .= "library($lib);"; | |
2185 } | |
2186 #} | |
2187 | |
2188 $query .= "con<-dbConnect(dbDriver(\"MySQL\"), host=\"".$self->db->dbc->host()."\", port=".$self->db->dbc->port().", dbname=\"".$self->db->dbc->dbname()."\", user=\"".$self->db->dbc->username()."\""; | |
2189 | |
2190 #should use read only pass here as we are printing this to file | |
2191 $query .= ", pass=\"".$self->db->dbc->password."\")\n"; | |
2192 | |
2193 | |
2194 #Build queries for each chip | |
2195 foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { | |
2196 | |
2197 | |
2198 #should implement logic name here? | |
2199 #can't as we need a separate ResultSet for each | |
2200 | |
2201 | |
2202 if ($echip->has_status($logic_name)) { | |
2203 $self->log("ExperimentalChip ".$echip->unique_id()." already has status:\t$logic_name"); | |
2204 } else { | |
2205 | |
2206 #warn "Need to roll back here if recovery, as norm import may fail halfway through"; | |
2207 | |
2208 push @chips, $echip; | |
2209 my $cc_id = $rset->get_chip_channel_id($echip->dbID()); | |
2210 | |
2211 #if ($self->recovery()){ | |
2212 # $self->log('Rolling back results for ExperimentalChip('.$echip->dbID().") $logic_name"); | |
2213 # $self->db->rollback_results($cc_id) if $self->recovery(); | |
2214 # } | |
2215 | |
2216 $self->log("Building $logic_name R cmd for ".$echip->unique_id()); | |
2217 @dbids = (); | |
2218 | |
2219 foreach my $chan (@{$echip->get_Channels()}) { | |
2220 | |
2221 if ($chan->type() eq "EXPERIMENTAL") { | |
2222 push @dbids, $chan->dbID(); | |
2223 } else { | |
2224 unshift @dbids, $chan->dbID(); | |
2225 } | |
2226 } | |
2227 | |
2228 throw("vsn does not accommodate more than 2 channels") if ((scalar(@dbids) > 2) && $logic_name eq "VSN_GLOG"); | |
2229 | |
2230 #should do some of this with maps? | |
2231 #HARDCODED metric ID for raw data as one | |
2232 | |
2233 #Need to get total and experimental here and set db_id accordingly | |
2234 #can probably do this directly into one df | |
2235 | |
2236 $query .= "c1<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as CONTROL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[0]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n"; | |
2237 | |
2238 $query .= "c2<-dbGetQuery(con, 'select r.probe_id as PROBE_ID, r.score as EXPERIMENTAL_score, r.X, r.Y from result r, chip_channel c, result_set rs where c.table_name=\"channel\" and c.table_id=${dbids[1]} and c.result_set_id=rs.result_set_id and rs.analysis_id=${ra_id} and c.chip_channel_id=r.chip_channel_id')\n"; | |
2239 | |
2240 #Can we define some of the basic structures here and reuse in the QC and each norm method? | |
2241 | |
2242 | |
2243 #Is this going to eat up memory? | |
2244 #can we strip out and separate the data from c1 and c2 into RGList and | |
2245 #individual vector for probe_ids, then rm c1 and c2 to free up memory | |
2246 | |
2247 #create RGList object | |
2248 $query .= "R<-as.matrix(c1['CONTROL_score'])\nG<-as.matrix(c2['EXPERIMENTAL_score'])\n"; | |
2249 $query .= "genes<-cbind(c1['PROBE_ID'], c1['X'], c1['Y'])\n"; | |
2250 $query .= "testRG<-new('RGList', list(R=R, G=G, genes=genes))\n"; | |
2251 | |
2252 | |
2253 #QC plots here before doing norm | |
2254 | |
2255 #open pdf device | |
2256 $query .= "pdf('".$self->get_dir('norm').'/'.$echip->unique_id."_QC.pdf', paper='a4', height = 15, width = 9)\n"; | |
2257 #set format | |
2258 $query .= "par(mfrow = c(2,2), font.lab = 2)\n"; | |
2259 | |
2260 #Channel densities | |
2261 #These need limma or Ringo | |
2262 $query .= "plotDensities(testRG)\n"; | |
2263 | |
2264 #MvA Plot | |
2265 | |
2266 $query .= 'meanLogA<-((log(testRG$R, base=exp(2)) + log(testRG$G, base=exp(2)))/2)'."\n"; | |
2267 $query .= 'logIntRatioM<-(log(testRG$R, base=exp(2)) - log(testRG$G, base=exp(2)))'."\n"; | |
2268 $query .= "yMin<-min(logIntRatioM)\n"; | |
2269 $query .= "yMax<-max(logIntRatioM)\n"; | |
2270 | |
2271 | |
2272 | |
2273 #Need to validate yMax here | |
2274 #If it is Inf then we need to sort the vector and track back until we find the highest real number | |
2275 #count number of Infs and note on MvA plot | |
2276 $query .= "infCount<-0\n"; | |
2277 $query .= "if( yMax == Inf){; sortedM<-sort(logIntRatioM); lengthM<-length(logIntRatioM); indexM<-lengthM\n" | |
2278 ."while (yMax == Inf){; indexM<-(indexM-1); yMax<-sortedM[indexM];}; infCount<-(lengthM-indexM);}\n"; | |
2279 | |
2280 # | |
2281 $query .= "if(infCount == 0){\n"; | |
2282 $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'")'."\n"; | |
2283 $query .= "} else {\n"; | |
2284 $query .= 'plot(meanLogA, logIntRatioM, xlab="A - Average Log Ratio",ylab="M - Log Ratio",pch=".",ylim=c(yMin,yMax), main="'.$echip->unique_id.'", sub=paste(infCount, " Inf values not plotted"));'."}\n"; | |
2285 | |
2286 | |
2287 #$query .= 'plot(log(testRG$R*testRG$G, base=exp(2))/2, log(testRG$R/testRG$G, base=exp(2)),xlab="A",ylab="M",pch=".",ylim=c(-3,3), main="'.$echip->unique_id."\")\n"; | |
2288 | |
2289 #Plate plots | |
2290 $query .= 'image(testRG, 1, channel = "green", mycols = c("black", "green4", "springgreen"))'."\n"; | |
2291 $query .= 'image(testRG, 1, channel = "red", mycols = c("black", "green4", "springgreen"))'."\n"; | |
2292 | |
2293 $query .= "dev.off()\n"; | |
2294 #Finished QC pdf printing | |
2295 | |
2296 | |
2297 | |
2298 #The simple preprocess step of Ringo is actually vsn, so we can nest these in line | |
2299 | |
2300 | |
2301 ### Build Analyses cmds ### | |
2302 | |
2303 if ($logic_name eq 'T.Biweight') { | |
2304 | |
2305 #log2 ratios | |
2306 $query .= 'lr_df<-cbind((log(c2["EXPERIMENTAL_score"], base=exp(2)) - log(c1["CONTROL_score"], base=exp(2))))'."\n"; | |
2307 | |
2308 #Adjust using tukey.biweight weighted average | |
2309 #inherits first col name | |
2310 $query .= 'norm_df<-(lr_df["EXPERIMENTAL_score"]-tukey.biweight(as.matrix(lr_df)))'."\n"; | |
2311 $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", norm_df[,1]), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n"; | |
2312 | |
2313 } elsif ($logic_name eq 'VSN_GLOG') { | |
2314 #could do this directly | |
2315 $query .= "raw_df<-cbind(c1[\"CONTROL_score\"], c2[\"EXPERIMENTAL_score\"])\n"; | |
2316 #variance stabilise | |
2317 $query .= "norm_df<-vsn(raw_df)\n"; | |
2318 | |
2319 | |
2320 #do some more calcs here and print report? | |
2321 #fold change exponentiate? See VSN docs | |
2322 #should do some plots of raw and glog and save here? | |
2323 #set log func and params | |
2324 #$query .= "par(mfrow = c(1, 2)); log.na = function(x) log(ifelse(x > 0, x, NA));"; | |
2325 #plot | |
2326 #$query .= "plot(exprs(glog_df), main = \"vsn\", pch = \".\");". | |
2327 # "plot(log.na(exprs(raw_df)), main = \"raw\", pch = \".\");"; | |
2328 #FAILS ON RAW PLOT!! | |
2329 #par(mfrow = c(1, 2)) | |
2330 #> meanSdPlot(nkid, ranks = TRUE) | |
2331 #> meanSdPlot(nkid, ranks = FALSE) | |
2332 | |
2333 | |
2334 #Now create table structure with glog values(diffs) | |
2335 #3 sig dec places on scores(doesn't work?!) | |
2336 $query .= 'formatted_df<-cbind(rep.int(0, length(c1["PROBE_ID"])), c1["PROBE_ID"], sprintf("%.3f", (exprs(norm_df[,2]) - exprs(norm_df[,1]))), rep.int('.$cc_id.', length(c1["PROBE_ID"])), c1["X"], c1["Y"])'."\n"; | |
2337 | |
2338 } | |
2339 #load back into DB | |
2340 #c3results<-cbind(rep("", length(c3["probe_id"])), c3["probe_id"], c3["c3_score"], rep(1, length(c3["probe_id"])), rep(1, length(c3["probe_id"]))) | |
2341 #may want to use safe.write here | |
2342 #dbWriteTable(con, "result", c3results, append=TRUE) | |
2343 #dbWriteTable returns true but does not load any data into table!!! | |
2344 | |
2345 $query .= "write.table(formatted_df, file=\"${resultfile}\", sep=\"\\t\", col.names=FALSE, row.names=FALSE, quote=FALSE, append=TRUE)\n"; | |
2346 | |
2347 #tidy up here?? | |
2348 } | |
2349 } | |
2350 | |
2351 $query .= "q();"; | |
2352 | |
2353 open(RFILE, ">$R_file") || throw("Cannot open $R_file for writing"); | |
2354 print RFILE $query; | |
2355 close(RFILE); | |
2356 | |
2357 my $submit_text = "Submitting $logic_name job"; | |
2358 $submit_text .= ' to farm' if $self->farm; | |
2359 $self->log("${submit_text}:\t".localtime()); | |
2360 run_system_cmd($r_cmd); | |
2361 $self->log("Finished $logic_name job:\t".localtime()); | |
2362 $self->log('See '.$self->get_dir('norm').' for ExperimentalChip QC files'); | |
2363 | |
2364 #Now load file and update status | |
2365 #Import directly here to avoid having to reparse all results if we crash!!!! | |
2366 $self->log("Importing:\t$resultfile"); | |
2367 $self->db->load_table_data("result", $resultfile); | |
2368 $self->log("Finishing importing:\t$resultfile"); | |
2369 | |
2370 | |
2371 foreach my $echip (@chips) { | |
2372 $echip->adaptor->store_status($logic_name, $echip); | |
2373 } | |
2374 | |
2375 #Recreate all non-import RSets for analysis if not already present | |
2376 # | |
2377 | |
2378 my $rset_a = $self->db->get_ResultSetAdaptor(); | |
2379 my %seen_rsets; | |
2380 | |
2381 foreach my $anal_rset (@{$rset_a->fetch_all_by_Experiment($self->experiment)}) { | |
2382 next if($anal_rset->name =~ /_IMPORT$/o); | |
2383 next if(exists $seen_rsets{$anal_rset->name}); | |
2384 next if $anal_rset->analysis->logic_name eq $norm_anal->logic_name; | |
2385 $seen_rsets{$anal_rset->name} = 1; | |
2386 $anal_rset->analysis($norm_anal); | |
2387 $anal_rset->{'dbID'} = undef; | |
2388 $anal_rset->{'adaptor'} = undef; | |
2389 | |
2390 #add the chip_channel_ids from the new anal IMPORT set | |
2391 foreach my $table_id (@{$anal_rset->table_ids}) { | |
2392 $anal_rset->{'table_id_hash'}{$table_id} = $rset->get_chip_channel_id($table_id); | |
2393 } | |
2394 | |
2395 $self->log('Adding new ResultSet '.$anal_rset->name.' with analysis '.$norm_anal->logic_name); | |
2396 $rset_a->store($anal_rset); | |
2397 } | |
2398 | |
2399 | |
2400 | |
2401 } | |
2402 } | |
2403 | |
2404 return; | |
2405 } | |
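#Extension sketch (not implemented here): adding another normalisation would
#need an Analysis of the same logic name in the DB, an entry in %r_config and
#a matching branch in the 'Build Analyses cmds' section above, e.g.
#
#  "Loess" => {( libs => ['limma'] )}, #illustrative only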
2406 | |
2407 #can we sub this? args: table_name, logic_name | |
2408 #also use result_set_name | |
2409 #would also clean all data for result set if recovery | |
2410 #return would be result_set | |
2411 #Can we extend this to incorporate InputSet parser define_sets? | |
2412 | |
2413 sub get_import_ResultSet{ | |
2414 my ($self, $anal, $table_name) = @_; | |
2415 | |
2416 if (!($anal && $anal->isa("Bio::EnsEMBL::Analysis") && $anal->dbID())) { | |
2417 throw("Must provide a valid stored Bio::EnsEMBL::Analysis"); | |
2418 } | |
2419 | |
2420 $self->log("Getting import $table_name ResultSet for analysis:\t".$anal->logic_name()); | |
2421 | |
2422 my ($rset, @new_chip_channels); | |
2423 my $result_adaptor = $self->db->get_ResultSetAdaptor(); | |
2424 my $logic_name = $anal->logic_name; | |
2425 my $status = ($logic_name eq "RawValue") ? "IMPORTED" : $logic_name; | |
2426 | |
2427 if (($logic_name) eq 'RawValue' && ($table_name eq 'experimental_chip')) { | |
2428 throw("Cannot have an ExperimentalChip ResultSet with a RawValue analysis, either specify 'channel' or another analysis"); | |
2429 } | |
2430 | |
2431 #Build IMPORT Set for $table_name | |
2432 foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { | |
2433 | |
2434 #clean chip import and generate rset | |
2435 | |
2436 if ($table_name eq 'experimental_chip') { | |
2437 | |
2438 if ($echip->has_status($status)) { #this translates to each channel having the IMPORTED_RawValue status | |
2439 $self->log("ExperimentalChip(".$echip->unique_id().") already has status:\t".$status); | |
2440 } else { | |
2441 $self->log("Found ExperimentalChip(".$echip->unique_id().") without status $status"); | |
2442 | |
2443 push @new_chip_channels, $echip; | |
2444 } | |
2445 | |
2446 } else { #channel | |
2447 | |
2448 foreach my $chan (@{$echip->get_Channels()}) { | |
2449 | |
2450 if ($chan->has_status($status)) { #this translates to each channel having the IMPORTED_RawValue status | |
2451 $self->log("Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") already has status:\t".$status); | |
2452 } else { | |
2453 $self->log("Found Channel(".$echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}.") without status $status"); | |
2454 push @new_chip_channels, $chan; | |
2455 } | |
2456 } | |
2457 } | |
2458 | |
2459 if (( ! $rset) && @new_chip_channels) { | |
2460 my(@tmp) = @{$result_adaptor->fetch_all_by_name_Analysis($self->name()."_IMPORT", $anal)}; | |
2461 | |
2462 if (scalar(@tmp) > 1) { | |
2463 throw('Found more than one IMPORT ResultSet for '.$self->name().'_IMPORT with analysis '.$logic_name); | |
2464 } | |
2465 | |
2466 $rset = shift @tmp; | |
2467 | |
2468 | |
2469 #do we need to throw here if not recovery? | |
2470 #what if we want the import result set elsewhere during the first import? | |
2471 | |
2472 #if ($self->recovery()) { | |
2473 | |
2474 #fetch by anal and experiment_id | |
2475 #Need to change this to result_set.name! | |
2476 # warn("add chip set handling here"); | |
2477 | |
2478 #my @tmp = @{$result_adaptor->fetch_all_by_Experiment_Analysis($self->experiment(), $anal)}; | |
2479 #throw("Found more than one ResultSet for Experiment:\t".$self->experiment->name()."\tAnalysis:\t".$anal->logic_name().')' if (scalar(@tmp) >1); | |
2480 #$rset = $tmp[0]; | |
2481 | |
2482 #warn "fetching rset with ".$self->name()."_IMPORT ". $anal->logic_name; | |
2483 | |
2484 #$rset = $result_adaptor->fetch_by_name_Analysis($self->name()."_IMPORT", $anal); | |
2485 warn("Warning: Could not find recovery ResultSet for analysis ".$logic_name) if ! $rset; | |
2486 #} | |
2487 | |
2488 if (! $rset) { | |
2489 $self->log("Generating new ResultSet for analysis ".$logic_name); | |
2490 | |
2491 $rset = Bio::EnsEMBL::Funcgen::ResultSet->new | |
2492 ( | |
2493 -analysis => $anal, | |
2494 -table_name => $table_name, | |
2495 -name => $self->name()."_IMPORT", | |
2496 -feature_type => $self->feature_type(), | |
2497 -cell_type => $self->cell_type(), | |
2498 ); | |
2499 | |
2500 #These types should be set to NULL during the MAGE-XML validation if we have more than one type in an experiment | |
2501 | |
2502 ($rset) = @{$result_adaptor->store($rset)}; | |
2503 } | |
2504 } | |
2505 } | |
2506 | |
2507 #do we need this here as we're rolling back in the read methods? | |
2508 #we only want to roll back those chips/channels which have not been registered | |
2509 | |
2510 if ($self->recovery()) { | |
2511 | |
2512 my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor(); | |
2513 | |
2514 foreach my $cc (@new_chip_channels) { | |
2515 | |
2516 #only roll back if already part of import set | |
2517 #Not previously registered if not | |
2518 if ($rset->contains($cc) && $rset->get_chip_channel_id($cc->dbID())) { | |
2519 | |
2520 if ($table_name eq 'channel') { | |
2521 my $chan_name = $ec_adaptor->fetch_by_dbID($cc->experimental_chip_id())->unique_id()."_". | |
2522 $self->get_config('dye_freqs')->{$cc->dye()}; | |
2523 $self->log("Rolling back results for $table_name:\t".$chan_name); | |
2524 | |
2525 } else { | |
2526 $self->log("Rolling back results for $table_name:\t".$cc->unique_id); | |
2527 } | |
2528 | |
2529 $self->rollback_results([$rset->get_chip_channel_id($cc->dbID())]); | |
2530 } | |
2531 } | |
2532 } | |
2533 | |
2534 | |
2535 #check whether it is present in the ResultSet and add if not | |
2536 if ($rset) { | |
2537 #ids will already be present if not rset i.e. already imported | |
2538 | |
2539 foreach my $cc (@new_chip_channels) { | |
2540 $rset->add_table_id($cc->dbID()) if(! $rset->contains($cc)); | |
2541 } | |
2542 } | |
2543 | |
2544 | |
2545 | |
2546 if ($rset) { | |
2547 $result_adaptor->store_chip_channels($rset); | |
2548 } else { | |
2549 $self->log("All ExperimentalChips have status:\t$status"); | |
2550 } | |
2551 | |
2552 #this only returns a result set if there is new data to import | |
2553 return $rset; | |
2554 } | |
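#Usage sketch (as in R_norm above): a ResultSet is only returned when there
#are chips/channels still lacking the relevant status, so callers can use it
#to decide whether there is anything left to import:
#
#  my $rset = $self->get_import_ResultSet($analysis, 'experimental_chip');
#
#  if ($rset) {
#    #normalise/import and update statuses
#  }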
2555 | |
2556 | |
2557 | |
2558 =head2 resolve_probe_data | |
2559 | |
2560 Example : $self->resolve_probe_data(); | |
2561 Description: Resolves DB probe duplicates and builds local probe cache | |
2562 Returntype : none | |
2563 Exceptions : none directly; underlying cache and system calls may throw | |
2564 Caller : general | |
2565 Status : At risk | |
2566 | |
2567 =cut | |
2568 | |
2569 sub resolve_probe_data{ | |
2570 my $self = shift; | |
2571 | |
2572 $self->log("Resolving probe data", 1); | |
2573 | |
2574 warn "Probe cache resolution needs to accommodate probesets too!"; | |
2575 | |
2576 foreach my $array (@{$self->arrays()}) { | |
2577 my $resolve = 0; | |
2578 | |
2579 if ($self->get_probe_cache_by_Array($array)) { #cache already generated | |
2580 | |
2581 #check if we have any new unresolved array chips to add to the cache | |
2582 foreach my $achip (@{$array->get_ArrayChips()}) { | |
2583 | |
2584 if ($achip->has_status('RESOLVED')) { | |
2585 $self->log("ArrayChip has RESOLVED status:\t".$achip->design_id()); #, 1); | |
2586 next; | |
2587 } else { | |
2588 $self->log("Found un-RESOLVED ArrayChip:\t".$achip->design_id()); | |
2589 $resolve = 1; | |
2590 last; | |
2591 } | |
2592 } | |
2593 } else { #no cache file | |
2594 $resolve = 1; | |
2595 $self->log('No probe cache found for array '.$array->name()); | |
2596 } | |
2597 | |
2598 if ($resolve) { | |
2599 $self->log('Resolving array duplicates('.$array->name().') and rebuilding probe cache.', 1); | |
2600 $self->get_probe_cache_by_Array($array, 1); #get from DB | |
2601 | |
2602 #we need to make sure we mark the cache as unresolved, so we don't use it by mistake. | |
2603 | |
2604 | |
2605 | |
2606 my ($line, $name, $pid, @pids); | |
2607 #my $index = 0; | |
2608 my $tmp_name = ''; | |
2609 my $tmp_id = ''; | |
2610 | |
2611 #miss the header | |
2612 | |
2613 while ($line = $self->{'_probe_cache'}{$array->name}{'handle'}->getline()) { | |
2614 ($name, $pid) = split/\t/o, $line; | |
2615 | |
2616 if ($name eq $tmp_name) { | |
2617 | |
2618 if ($pid != $tmp_id) { | |
2619 push @pids, $pid; | |
2620 #should reset to pid here if we have x y data else undef | |
2621 #ignore this and force result to have x y | |
2622 } | |
2623 | |
2624 #can't do this anymore unless we figure out how to move the line pointer | |
2625 #would still need to sed the file anyway, better to regen from DB? | |
2626 #undef $self->{'_probe_cache'}{$array->name}{'entries'}->[$i];#delete true or to be resolved duplicate | |
2627 } elsif ($name ne $tmp_name) { #new probe | |
2628 $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1); | |
2629 $tmp_name = $name; | |
2630 $tmp_id = $pid; | |
2631 @pids = ($pid); | |
2632 #$index = $i + 1; | |
2633 } | |
2634 } | |
2635 | |
2636 $self->tidy_duplicates(\@pids) if(scalar(@pids) > 1); | |
2637 | |
2638 #rename resolved cache and reset cache handle | |
2639 my $cmd = 'mv '.$self->get_dir('caches').'/'.$array->name().'.probe_cache.unresolved '. | |
2640 $self->get_dir('caches').'/'.$array->name().'.probe_cache'; | |
2641 | |
2642 run_system_cmd($cmd); | |
2643 $self->get_probe_cache_by_Array($array); #This sets the caches | |
2644 | |
2645 | |
2646 #warn "Only generate MD5 here, as this is guaranteed to be correct"; | |
2647 | |
2648 foreach my $achip (@{$array->get_ArrayChips()}) { | |
2649 | |
2650 if (! $achip->has_status('RESOLVED')) { | |
2651 $self->log("Updating ArrayChip to RESOLVED status:\t".$achip->design_id()); | |
2652 $achip->adaptor->store_status('RESOLVED', $achip); | |
2653 } | |
2654 } | |
2655 | |
2656 $self->log('Finished building probe cache for '.$array->name(), 1); | |
2657 } | |
2658 } | |
2659 | |
2660 $self->log('Finished resolving probe data', 1); | |
2661 | |
2662 return; | |
2663 } | |
2664 | |
2665 | |
2666 sub tidy_duplicates{ | |
2667 my ($self, $pids) = @_; | |
2668 | |
2669 my $pfa = $self->db->get_ProbeFeatureAdaptor(); | |
2670 my ($feature, %features); | |
2671 | |
2672 foreach my $dup_id (@$pids) { | |
2673 | |
2674 foreach $feature(@{$pfa->fetch_all_by_Probe_id($dup_id)}) { | |
2675 #can we safely assume end will be same too? | |
2676 push @{$features{$feature->seq_region_name().':'.$feature->start()}}, $feature; | |
2677 } | |
2678 } | |
2679 | |
2680 my (@reassign_ids, @delete_ids); | |
2681 | |
2682 foreach my $seq_start_key (keys %features) { | |
2683 my $reassign_features = 1; | |
2684 | |
2685 foreach $feature(@{$features{$seq_start_key}}) { | |
2686 | |
2687 if ($feature->probe_id() == $pids->[0]) { | |
2688 $reassign_features = 0; | |
2689 } else { | |
2690 push @delete_ids, $feature->dbID(); | |
2691 } | |
2692 } | |
2693 | |
2694 #This assumes that we actually have at least one element to every seq_start_key array | |
2695 if ($reassign_features) { | |
2696 my $new_fid = pop @delete_ids; | |
2697 push @reassign_ids, $new_fid; | |
2698 } | |
2699 } | |
2700 | |
2701 #resolve features first so we don't get any orphaned features if we crash. | |
2702 $pfa->reassign_features_to_probe(\@reassign_ids, $pids->[0]) if @reassign_ids; | |
2703 $pfa->delete_features(\@delete_ids) if @delete_ids; | |
2704 | |
2705 return; | |
2706 } | |
2707 | |
2708 1; |