#
# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::InputSet
#

=head1 LICENSE

Copyright (c) 1999-2011 The European Bioinformatics Institute and
Genome Research Limited.  All rights reserved.

This software is distributed under a modified Apache license.
For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

Please email comments or questions to the public Ensembl
developers list at <ensembl-dev@ebi.ac.uk>.

Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Funcgen::Parsers::InputSet

=head1 SYNOPSIS

  use vars qw(@ISA);
  @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);


=head1 DESCRIPTION

This is a base class to support simple file format parsers. For simple imports the vendor is
set to the parser type i.e. the file format. The generic read_and_import_simple_data assumes
a one line per feature format; other formats need their own read_and_import_format_data method,
which will need defining in the result_data config element. Features are stored either as
ResultFeature collections or AnnotatedFeatures dependent on the input feature class.

=cut
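
# A minimal subclass sketch (illustrative only; the Bed parser in this
# directory is the real reference implementation, and parse_line is the
# hook assumed here):
#
#   package Bio::EnsEMBL::Funcgen::Parsers::MyFormat;
#   use base qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);
#
#   sub parse_line{
#     my ($self, $line, $prepare) = @_;
#     #...split $line, populate $self->feature_params...
#     #...then call $self->load_feature_and_xrefs...
#     return 1; #line successfully parsed
#   }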

# To do
# Add Parsers for BAM/SAM
# Rename to InputSet
# Handle mysqlimport for large data sets e.g. reads
# Incorporate collection code
# Implement matrix storage

package Bio::EnsEMBL::Funcgen::Parsers::InputSet;

use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
use Bio::EnsEMBL::Funcgen::SegmentationFeature;
use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file is_gzipped);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
#use Bio::EnsEMBL::Funcgen::Utils::Helper;
use strict;

#config stuff, move to BaseImporter?
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::Funcgen::FeatureType;


use base qw(Bio::EnsEMBL::Funcgen::Parsers::BaseImporter); #@ISA change to parent with perl 5.10

#use vars qw(@ISA);
#@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);

my %valid_types = (
                   result       => undef,
                   annotated    => undef,
                   segmentation => undef,
                  );


=head2 new

  Example    : my $self = $class->SUPER::new(@_);
  Description: Constructor method for the InputSet parser class
  Returntype : Bio::EnsEMBL::Funcgen::Parsers::InputSet
  Exceptions : Throws if the caller is not a Bio::EnsEMBL::Funcgen::Importer
  Caller     : Bio::EnsEMBL::Funcgen::Parsers::Simple
  Status     : At risk

=cut

sub new{
  my $caller = shift;

  my $class = ref($caller) || $caller;
  my $self  = $class->SUPER::new(@_);

  #my $config_file;

  ($self->{'input_set_name'},
   $self->{'input_feature_class'},
   #$self->{'slices'},
   $self->{total_features},
   $self->{force},            #Is this generic enough to go in Importer? Used by store_window_bins_by_Slice_Parser
   $self->{dbfile_data_root}, #Only appropriate for the result input_feature_class
   #$config_file,             #User defined config hash file
  ) = rearrange(['input_set_name', 'input_feature_class',
                 'total_features', 'force', 'dbfile_data_root'], @_);


  #Could potentially take fields params directly to define a custom format.
  #Take direct field mappings, plus special fields which need parsing differently,
  #i.e. default is tab delimited, and GFF would define the Attrs field as a compound field and provide special parsing and field mapping.


  throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly")
    if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));

  $self->{'config'} =
    {(
      #Can we omit these?
      array_data     => [], #['experiment'],
      probe_data     => [], #["probe"],
      norm_method    => undef,
      #protocols     => {()},
      'results_data' => ["and_import"],
     )};

  #Set up feature params
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  #$self->{'counts'}           = {};
  #$self->{'slices'}           = [];
  #$self->{'seq_region_names'} = []; #Used for slice based import


  # USER CONFIG #
  #Here we need to read config based on an external file.
  #Should do something similar to set_feature_sets
  #and validate_and_store_feature_types in BaseExternalParser,
  #but we are using define and validate sets instead.

  #BaseExternalParser and BaseImporter really need to be merged
  #after we have stripped out all the array/experiment specific stuff.

  #Do dev here so we are not developing more stuff in the Importer which will need migrating
  #to the BaseImporter.

  #if($config_file){
  #  my $config;
  #
  #  $self->log("Reading config file:\t".$config_file);
  #
  #  if(! ($config = do "$config_file")){
  #    throw("Couldn't parse config file:\t$config_file:\n$@") if $@;
  #    throw("Couldn't do config:\t$config_file\n$!")          if ! defined $config;
  #    throw("Couldn't compile config_file:\t$config_file")    if ! $config;
  #  }
  #
  #  #At least check it is a hash
  #  if(ref($config) ne 'HASH'){
  #    throw("Config file does not define a valid HASH:\t$config_file");
  #  }
  #
  #  $self->{user_config} = $config;
  #}


  return $self;
}
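
# For illustration only: a hypothetical Importer invocation exercising the
# parameters rearranged above (all values invented for the example):
#
#   my $importer = Bio::EnsEMBL::Funcgen::Importer->new
#     (
#      -input_set_name      => 'CTCF_GM12878',
#      -input_feature_class => 'annotated',
#      -total_features      => 1e6,
#      -force               => 0,
#      -dbfile_data_root    => $output_dir,
#      #...plus the usual Importer DB/analysis parameters
#     );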


sub output_file{
  my ($self, $output_file) = @_;

  $self->{'output_file'} = $output_file if $output_file;
  return $self->{'output_file'};
}

sub input_file{
  my ($self, $input_file) = @_;

  $self->{'input_file'} = $input_file if $input_file;
  return $self->{'input_file'};
}



=head2 set_config

  Example    : $self->set_config;
  Description: Sets attribute dependent config
  Returntype : None
  Exceptions : None
  Caller     : Bio::EnsEMBL::Funcgen::Importer
  Status     : At risk

=cut

sub set_config{
  my $self = shift;

  #Move all this to new when we fix the inheritance in Importer

  #We could set input_set_name to the experiment name,
  #but we would have to make the warning in define_and_validate_sets mention -input_set_name.

  throw('Must provide an -input_set_name for a '.uc($self->vendor).' import') if ! defined $self->input_set_name();

  #Mandatory checks
  if(! defined $self->feature_analysis){
    throw('Must define a -feature_analysis parameter for '.uc($self->vendor).' imports');
  }


  if(! exists $valid_types{$self->input_feature_class}){
    throw("You must define a valid input_feature_class:\t".
          join(', ', keys %valid_types));
  }

  $self->{'feature_class'} = 'Bio::EnsEMBL::Funcgen::'.ucfirst($self->input_feature_class).'Feature';


  #We need to undef norm method as it has been set to the env var
  $self->{'config'}{'norm_method'} = undef;

  #Dirs are not set in config to enable generic get_dir method access
  $self->{dbfile_data_root} ||= $self->get_dir('output'); #Required for Collector

  #Some convenience methods
  my $adaptor_method = 'get_'.ucfirst($self->input_feature_class).'FeatureAdaptor';
  $self->{'feature_adaptor'}   = $self->db->$adaptor_method;
  $self->{'dbentry_adaptor'}   = $self->db->get_DBEntryAdaptor;
  $self->{'input_set_adaptor'} = $self->db->get_InputSetAdaptor;
  ##$self->{'slice_adaptor'}   = $self->db->dnadb->get_SliceAdaptor;

  #Validate slices
  $self->slices($self->{'slices'}) if defined $self->{'slices'};

  #Move to new when we sort out inheritance
  $self->validate_and_store_config([$self->name]);
  #Could use input_set_name here?
  #This was to support >1 input set per experiment (name).

  #This currently breaks for no-config imports,
  #i.e. standard Bed import e.g. result_feature collections;
  #segmentation imports use Bed and config.
  #Allow no-config imports in BaseImporter?
  #Or ultimately set the params as part of the user_config?

  return;
}

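
# A minimal sketch of the convenience adaptor naming convention used in
# set_config, assuming an input_feature_class of 'annotated':
#
#   my $adaptor_method = 'get_'.ucfirst('annotated').'FeatureAdaptor';
#   #i.e. 'get_AnnotatedFeatureAdaptor'
#   my $af_adaptor = $self->db->$adaptor_method;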
sub define_sets{
  my ($self) = @_;

  my $eset = $self->db->get_InputSetAdaptor->fetch_by_name($self->input_set_name);

  if(! defined $eset){
    $eset = Bio::EnsEMBL::Funcgen::InputSet->new
      (
       -name          => $self->input_set_name(),
       -experiment    => $self->experiment(),
       -feature_type  => $self->feature_type(),
       -cell_type     => $self->cell_type(),
       -vendor        => $self->vendor(),
       -format        => $self->format(),
       -analysis      => $self->feature_analysis,
       -feature_class => $self->input_feature_class,
      );
    ($eset) = @{$self->db->get_InputSetAdaptor->store($eset)};
  }

  #Use define_and_validate with fetch/append as we may have a pre-existing set.
  #This now needs to handle ResultSets based on InputSets.


  my $dset = $self->define_and_validate_sets
    (
     -dbadaptor       => $self->db,
     -name            => $self->input_set_name, #or name?
     -feature_type    => $self->feature_type,
     -cell_type       => $self->cell_type,
     -analysis        => $self->feature_analysis,
     -feature_class   => $self->input_feature_class,
     -description     => $self->feature_set_description,
     #-append         => 1, #Omit append to ensure we only have this eset
     -recovery        => $self->recovery,
     -supporting_sets => [$eset],
     -slices          => $self->slices,
     #Can't set rollback here, as we don't know until we've validated the files.
     #Can't validate the files until we have the sets.
     #So we're doing this manually in validate_files.
    );

  #We are now using IMPORTED to define whether a FeatureSet has been imported successfully.
  #However we already have IMPORTED on the InputSubset;
  #we should add it to FeatureSet to remain consistent.
  #See Helper::define_and_validate_sets for more notes on
  #potential problems with FeatureSet IMPORTED status.


  #define_and_validate_sets should also replicate ResultSets?
  #Questionable; mapped reads are never normalised across replicates.
  #They are generally used as input for peak calling individually,
  #so files in this instance are expected to be separate parts of the same replicate,
  #e.g. different chromosomes.
  #Force one input file?
  #What if we want to link several assays (feature/cell_types) to the same experiment?

  $self->{'_data_set'} = $dset;

  return $dset;
}



#We have rollback functionality incorporated here

sub validate_files{
  my ($self, $prepare) = @_;

  #Get file
  if (! @{$self->result_files()}) {
    my $list = "ls ".$self->get_dir('input').'/'.$self->input_set_name().'*.'; #.lc($self->vendor);#Could use vendor here? Actually need suffix attr
    my @rfiles = `$list`;
    $self->result_files(\@rfiles);
  }

  #We don't yet support multiple files
  if(scalar(@{$self->result_files()}) > 1){
    warn('Found more than one '.$self->vendor." file:\n".
         join("\n", @{$self->result_files()})."\nThe InputSet parser does not yet handle multiple input files (e.g. replicates).".
         " We need to resolve how we are going to handle replicates with random cluster IDs");
    #Do we even need to?
  }

  #Here we are tracking the import of individual files by adding them as InputSubsets.
  #Recovery would never know what to delete,
  #so we would need to delete all; hence no point in setting status?
  #We do not rollback IMPORTED data here. This is done via separate scripts,
  #to reduce the risk of accidentally deleting/overwriting data by leaving a stray -rollback
  #flag in the run script.

  ### VALIDATE FILES ###
  #We need to validate all the files first, so the import doesn't fall over half way through,
  #or if we come across a rollback halfway through.
  my (%new_data, $eset);
  my $dset = $self->data_set;

  if((scalar(@{$self->slices}) > 1) &&
     ! $prepare){
    throw('validate_files does not yet support multiple Slice rollback');
  }

  #This all assumes that there is only ever 1 InputSet

  if($self->input_feature_class eq 'result'){
    $eset = $dset->get_supporting_sets->[0]->get_InputSets->[0];
  }
  else{ #annotated/segmentation
    $eset = $dset->get_supporting_sets->[0];
  }


  #IMPORTED status here may prevent
  #further slice based imports,
  #so we have to wait to set this until we know all the slices
  #are loaded, unless we store slice based IMPORTED states.
  #We currently get around this by never setting IMPORTED for slice based jobs
  #and always rolling back by slice before import.

  #This loop supports multiple files
  my (@rollback_sets, %file_paths);
  my $auto_rollback = ($self->rollback) ? 0 : 1;

  foreach my $filepath( @{$self->result_files} ) {
    my ($filename, $sub_set);
    chomp $filepath;
    ($filename = $filepath) =~ s/.*\///;
    $file_paths{$filename} = $filepath;
    $filename =~ s/^prepared\.// if $self->prepared; #Reset filename to that originally used to create the InputSubsets

    $self->log('Validating '.$self->vendor." file:\t$filename");
    throw("Cannot find ".$self->vendor." file:\t$filepath") if(! -e $filepath); #Can deal with links

    if( $sub_set = $eset->get_subset_by_name($filename) ){
      #IMPORTED status here is just for the file.
      #Any changes to analysis or coord_system should result in a different InputSubset(file),
      #which will only ever be imported into one Feature|ResultSet.

      #Currently conflating recover_unimported and rollback,
      #as they serve the same purpose until we implement InputSubset level recovery.

      if( $sub_set->has_status('IMPORTED') ){
        $new_data{$filepath} = 0;
        $self->log("Found previously IMPORTED InputSubset:\t$filename");
      }
      else{
        $self->log("Found existing InputSubset without IMPORTED status:\t$filename");
        push @rollback_sets, $sub_set;
      }
    }
    else{
      $self->log("Found new InputSubset:\t${filename}");
      throw("Should not have found new 'prepared' file:\t$filename") if $self->prepared;
      $new_data{$filepath} = 1;
      $sub_set = $eset->add_new_subset($filename);
      $self->input_set_adaptor->store_InputSubsets([$sub_set]);
    }
  }


  #Does -recover allow a single extra new file to be added to an existing InputSet?

  if(@rollback_sets){ #Recoverable sets i.e. they exist but are not IMPORTED

    if( (! $self->recovery) && (! $self->rollback) ){
      throw("Found partially imported InputSubsets:\n\t".join("\n\t", (map $_->name, @rollback_sets))."\n".
            "You must specify -recover or -rollback to perform a full rollback");
    }

    if($self->recovery){
      #Change these to logger->warn
      $self->log("WARNING::\tCannot yet rollback for just an InputSubset, rolling back entire set? Unless slices defined");
      $self->log("WARNING::\tThis may be deleting previously imported data which you are not re-importing..list?!!!\n");
    }
  }


  if($self->rollback){
    #Check we have all existing InputSubset files before we do a full rollback.
    #Can probably remove this if we support InputSubset (file/slice) level rollback.
    $self->log('Rolling back all InputSubsets');
    @rollback_sets = @{$eset->get_InputSubsets};

    foreach my $isset(@rollback_sets){

      if(! exists $file_paths{$isset->name}){
        throw("You are attempting a multiple InputSubset rollback without specifying an existing InputSubset:\t".$isset->name.
              "\nAborting rollback as data will be lost. Please specify all existing InputSubset file names");
      }
    }
  }


  foreach my $esset(@rollback_sets){
    #This needs to be mapped to the specified filepaths
    my $fp_key = $esset->name;
    $fp_key    = 'prepared.'.$fp_key if $self->prepared;

    $new_data{$file_paths{$fp_key}} = 1;
    $self->log("Revoking states for InputSubset:\t\t\t".$esset->name);
    $eset->adaptor->revoke_states($esset);

    if(! $prepare){
      #This was to avoid redundant rollback in the prepare step
      $self->log("Rolling back InputSubset:\t\t\t\t".$esset->name);

      if($self->input_feature_class eq 'result'){
        #Can we do this by slice for parallelisation?
        #This will only ever be a single ResultSet due to Helper::define_and_validate_sets.
        #Flags are rollback_results and force (as this won't be a direct input to the product feature set).
        $self->rollback_ResultSet($self->data_set->get_supporting_sets->[0], 1, $self->slices->[0], 1);
        #Do not rollback_InputSet here as we may have parallel Slice based imports running
      }
      else{ #annotated/segmentation
        $self->rollback_FeatureSet($self->data_set->product_FeatureSet, undef, $self->slices->[0]);
        $self->rollback_InputSet($eset);
        last;
      }
    }
  }

  return \%new_data;
}



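
# Sketch of the expected caller pattern (hypothetical code): validate_files
# returns a hashref keyed on file path, with a true value flagging files
# which still require import.
#
#   my $new_data = $self->validate_files($prepare);
#
#   foreach my $filepath (keys %$new_data){
#     next if ! $new_data->{$filepath}; #previously IMPORTED
#     #...parse and import $filepath...
#   }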
sub set_feature_separator{
  my ($self, $separator) = @_;

  #How do we test if something undefined was passed,
  #rather than nothing passed at all?
  #Can't do this as this is the accessor;
  #need to split the method.

  throw('Must provide a valid feature separator') if ( (! defined $separator) || ($separator eq '') );

  $self->{'_feature_separator'} = $separator;

}

# SIMPLE ACCESSORS
# Some of these can be called for each record,
# so trim the access time as much as possible.

sub input_feature_class{ return $_[0]->{'input_feature_class'}; }

sub input_set_name{ return $_[0]->{'input_set_name'}; } #Set in new

sub feature_adaptor{ return $_[0]->{'feature_adaptor'}; }

sub dbentry_adaptor{ return $_[0]->{'dbentry_adaptor'}; }

sub input_set_adaptor{ return $_[0]->{'input_set_adaptor'}; }

sub set{ return $_[0]->{'set'}; } #Feature or Result set, set in define_sets

##sub slice_adaptor{ return $_[0]->{'slice_adaptor'}; }

sub data_set{ return $_[0]->{'_data_set'}; }

sub feature_separator{ return $_[0]->{'_feature_separator'}; }

sub feature_params{ return $_[0]->{'_feature_params'}; }

sub dbentry_params{ return $_[0]->{'_dbentry_params'}; }

sub input_gzipped{ return $_[0]->{'input_gzipped'}; }


sub input_file_operator{
  my ($self, $op) = @_;
  #Should be set in the format parser
  $self->{'input_file_operator'} = $op if defined $op;

  return $self->{'input_file_operator'};
}
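
# Illustrative only: a format parser would typically set a sprintf-style
# operator in its constructor, which open_file then interpolates with the
# file path; the exact command below is a made-up example.
#
#   $self->input_file_operator("gzip -dc %s |"); #Stream through a gzip pipe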

# The prepare boolean simply stores the sets and preprocesses the file,
# so we don't get each batch job trying to sort etc.


#Still need to implement prepare in the other Parsers!!

sub read_and_import_data{
  my ($self, $prepare) = @_;

  my $action = ($prepare) ? 'preparing' : 'importing';

  $self->log("Reading and $action ".$self->vendor()." data");
  my ($eset, $filename, $output_set, $fh, $f_out, %feature_params, @lines);

  if($prepare && ! $self->isa('Bio::EnsEMBL::Funcgen::Parsers::Bed')){
    throw('prepare mode is only currently implemented for the Bed parser');
  }


  #Test for conflicting run modes
  if($prepare &&
     ($self->batch_job || $self->prepared)){
    #prepare should be called once by the runner, not in each batch_job;
    #don't prepare if already prepared
    throw('You cannot run read_and_import_data in prepare mode with a -batch_job or -prepared job');
  }

  my $dset = $self->define_sets;

  #We also need to account for bsub'd slice based import
  #of seq alignments loaded into a ResultSet.
  #Cannot have 0 window for ChIP-Seq alignments,
  #as this would mean storing all the individual reads.
  #Hence we need to remap to a new assembly before we import!

  if($self->input_feature_class eq 'result'){
    $output_set = $dset->get_supporting_sets->[0];
    $eset       = $output_set->get_InputSets->[0];
    $self->result_set($output_set); #Required for ResultFeature Collector and Bed Parser
  }
  else{ #annotated/segmentation
    $output_set = $dset->product_FeatureSet;
    $eset       = $dset->get_supporting_sets->[0];
  }


  #If we can do these the other way around we can get define_sets to rollback the FeatureSet.
  #Cyclical dependency for the sets :|
  my $new_data      = $self->validate_files($prepare);
  my $seen_new_data = 0;


  ### READ AND IMPORT FILES ###
  foreach my $filepath(@{$self->result_files()}) {
    chomp $filepath;

    ($filename = $filepath) =~ s/.*\///;
    $self->input_file($filepath); #This is only used by the Collector::ResultFeature::reinitialise_input method

    if($new_data->{$filepath} ){ #This will currently autovivify!
      $seen_new_data = 1;
      $self->{'input_gzipped'} = &is_gzipped($filepath);

      $filepath = $self->pre_process_file($filepath, $prepare) if $self->can('pre_process_file');

      $self->log_header(ucfirst($action).' '.$self->vendor." file:\t".$filepath);

      #We need to be able to optionally open a pipe to gzip | sort here,
      #i.e. define the open command
      $fh = open_file($filepath, $self->input_file_operator);

      #This may become way too large for some reads files.
      #Currently no problems.
      #This is not working as we are sorting the file!
      #$self->parse_header($fh) if $self->can('parse_header');

      #For result features, sometimes we want to run
      #locally and just sort without dumping,
      #i.e. if we are not a batch job,
      #as there is no need to dump if it is a single process.


      #Should this be prepared?

      if((($self->input_feature_class eq 'result') && ! $prepare)){
        #(($self->input_feature_class eq 'result') && (! $self->batch_job))){ #Local run on just 1 chr

        #Use the ResultFeature Collector here,
        #omitting the 0 wsize.
        #How are we going to omit 0 wsize when doing the fetch?
        #Simply check table name in ResultSet?

        #Should we do this for multiple chrs,
        #or fail here?
        #We need to pass self
        #for access to get_Features_by_Slice,
        #which should be in the specific parser e.g. Bed.

        #Will this not clash with standard ResultFeature::get_ResultFeatures_by_Slice?
        #Could really do with separating the pure file parsers from the importer code, so these can be reused
        #by other code. Then simply use the Bed import parser for specific import functions and as a wrapper to
        #the Bed file parser.
        #So should really have
        #Parsers::File::Bed
        #and
        #Parsers::Import::Bed
        #This can probably wait until we update BioPerl and just grab the Bed parser from there?

        my $slices = $self->slices;

        #Should this be caught in new?
        if(! @$slices){
          throw("You must define a slice to generate ResultFeature Collections from InputSet:\t".$eset->name);
        }


        if(scalar(@$slices) > 1){
          throw("InputSet parser does not yet support multi-Slice import for ResultFeature collections\n"
                ."Please submit these to the farm as single slice jobs");
        }

        #Restrict to just 1 slice as we don't yet support disk seeking
        #if the slices are not in the same order as they appear in the file.
        #Also we want to parallelise this.

        #Set as attr for parse_Features_by_Slice in format specific Parsers


        $self->file_handle(open_file($filepath, $self->input_file_operator));


        foreach my $slice(@$slices){
          $self->feature_adaptor->store_window_bins_by_Slice_Parser($slice, $self,
                                                                    (
                                                                     #Force needs reimplementing here?
                                                                     -force            => $self->{force},
                                                                     -dbfile_data_root => $self->{dbfile_data_root},
                                                                    ));
        }

        warn "Need to update InputSubset status to IMPORTED after all slices have been loaded";
        #Do we even need to set RESULT_FEATURE_SET for input_set ResultFeatures?



        warn "Closing $filename\nDisregard the following 'Broken pipe' warning";

        #Closing the read end of a pipe before the process writing to it at the other end
        #is done writing results in the writer receiving a SIGPIPE. If the other end can't
        #handle that, be sure to read all the data before closing the pipe.
        #This suggests the gzip pipe has not finished reading, but it *should* be at the end of the file?
        #$SIG{PIPE} = 'IGNORE'; #Catch with subref and warn instead?
        #Or maybe an eval will catch the STDERR better?
        #sub catch_SIGPIPE{
        #  my $sig = shift @_;
        #  print " Caught SIGPIPE: $sig $1 \n";
        #  return;
        #}
        #$SIG{PIPE} = \&catch_SIGPIPE;
        #my $retval = eval { no warnings 'all'; $fh->close };
        #if($@){
        #  warn "after eval with error $@\nretval is $retval";
        #}
        #Neither of these catch gzip: stdout: Broken pipe

        #IO::UnCompress::Gunzip?


        $fh->close;
      }
      else{

        #Revoke FeatureSet IMPORTED state here in case we fail halfway through
        $output_set->adaptor->revoke_status('IMPORTED', $output_set) if ($output_set->has_status('IMPORTED') && (! $prepare));

        #What about IMPORTED_"CSVERSION"?
        #This may leave us with an incomplete import which still has
        #an IMPORTED_CSVERSION state.
        #We need to depend on IMPORTED for completeness of the set.
        #DAS currently only uses IMPORTED_CSVERSION.
        #This is okay-ish, but we also need to write HCs for any sets
        #which do not have IMPORTED state!
        my ($line, @outlines, $out_fh);


        if($prepare && ! $self->batch_job){
          #Assume we want gzipped output.
          #filename is actually based on input, so may not have gz in the file name.
          $out_fh = open_file($self->output_file, "| gzip -c > %s");
        }


        while(defined ($line=<$fh>)){
          #Generic line processing.
          #Move these to parse_line?
          $line =~ s/\r*\n//o;
          next if $line =~ /^\#/;
          next if $line =~ /^$/;

          #This has now been simplified to the process_line method
          #my @fields = split/\t/o, $line;
          #Start building the parameters hash
          #foreach my $field_index(@{$self->get_field_indices}){
          #  my $field = $self->get_field_by_index($field_index);
          #  $feature_params = ($field =~ /^-/) ? $fields[$field_index] : $self->$field($fields[$field_index]);
          #}


          #We also need to enable different parse_line methods if we have different files,
          #e.g. cisRED.
          #Code refs?


          if($self->parse_line($line, $prepare)){
            $self->count('total parsed lines');

            #Cache or print to sorted file
            if($prepare && ! $self->batch_job){
              push @outlines, $line;

              #Flush the cache to the prepared file in batches
              if(scalar(@outlines) > 1000){
                print $out_fh join("\n", @outlines)."\n";
                @outlines = ();
              }
            }
          }
        }

        close($fh);

        #Print the last of the sorted file
        if($prepare && ! $self->batch_job){
          print $out_fh join("\n", @outlines)."\n";
          close($out_fh);
          @outlines = ();
        }

        if(! $prepare){
          #Now we need to deal with anything left in the read cache
          $self->process_params if $self->can('process_params');

          #To speed things up we may need to also do file based import here with a WRITE lock?
          #mysqlimport will write lock the table by default?

          #Reset filename to that originally used to create the InputSubsets
          $filename =~ s/^prepared\.// if $self->prepared;

          my $sub_set = $eset->get_subset_by_name($filename);
          $sub_set->adaptor->store_status('IMPORTED', $sub_set) if ! $self->batch_job;
        }
      }


      if($prepare){
        $self->log("Finished preparing import from:\t$filepath");
      }
      else{
        #Need to tweak this for slice based import
        $self->log('Finished importing '.$self->counts('features').' '.
                   $output_set->name." features from:\t$filepath");
      }


      #This currently fails here if the uncaught file sort was not successful

      foreach my $key (keys %{$self->counts}){
        $self->log("Count $key:\t".$self->counts($key));
      }
    }
  }

  #Here we should set IMPORTED on the FeatureSet.
  #We could also log the first dbID of each feature in a subset to facilitate subset rollback
  #in the feature table:
  #delete from annotated_feature where annotated_feature_id >= $first_subset_feature_id and feature_set_id=$feature_set_id
  #This would be sketchy at best.
  #This may already have IMPORTED status, as we don't revoke the status whilst
  #updating, to protect the feature set due to lack of supporting set tracking;
  #see Helper::define_and_validate_sets for more notes.
  #Is there any point in setting it if we don't revoke it?
  #To allow consistent status handling across sets. Just need to be aware of the fset status caveat.
  #This also currently happens with ResultFeatures loaded by slice jobs, as the status may already be set by a parallel job.

  if(! $prepare){
    $output_set->adaptor->set_imported_states_by_Set($output_set) if $seen_new_data && ! $self->batch_job;
    $self->log("No new data, skipping result parse") if ! grep /^1$/o, values %{$new_data};
    $self->log("Finished parsing and importing results");
  }

  return;
}

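
# Hypothetical driver flow for the two-phase (prepare then import) usage
# sketched above; the real runner scripts live elsewhere in ensembl-funcgen:
#
#   $self->read_and_import_data(1); #prepare: define sets, preprocess/sort the file once
#   #...submit batch jobs, which run with -prepared (and -batch_job) set...
#   $self->read_and_import_data();  #import from the prepared file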
#Should be called from a format parser e.g. BED, GFF, eQTL etc
#Why don't we pass feature_params and dbentry_params directly?

sub load_feature_and_xrefs{
  my $self = shift;

  #warn "Loading ".($self->{_counts}{features}+1).' '.$self->feature_params->{-FEATURE_TYPE}->name."\n";
  #This now only fails once on the first run.
  #Need to count based on feature_type?

  #new rather than new_fast here as we want to validate the import
  my $feature = $self->{feature_class}->new(%{$self->feature_params});
  ($feature)  = @{$self->feature_adaptor->store($feature)};
  $self->count('features');

  #Add count based on FeatureType? Should be ftype name and analysis to reflect the unique ftype key?



  ##This needs to be handled in the caller as we are validating loci?
  #if($self->ucsc_coords){
  #  $start += 1;
  #  $end   += 1;
  #}

  #This needs to be put in a separate sub and called by the caller
  #if(! $self->cache_slice($chr)){
  #  warn "Skipping AnnotatedFeature import, found non-standard chromosome: $chr";
  #}
  #else{
  #  #Grab seq if dump_fasta and available
  #  my $seq;
  #  if(exists $self->feature_params->{'sequence'}){
  #    $seq = $self->feature_params->{'sequence'};
  #    delete $self->feature_params->{'sequence'};
  #  }
  #  else{
  #    $self->log('No fasta sequence available for '.$self->feature_params->display_label);
  #  }
  #}

  #Dump fasta here
  #if ($self->dump_fasta){
  #  $self->{'_fasta'} .= $self->generate_fasta_header($feature)."\n$seq\n";
  #}

  #Store the xrefs

  foreach my $dbentry_hash(@{$self->{'_dbentry_params'}}){
    my $ftype = $dbentry_hash->{feature_type};
    delete $dbentry_hash->{feature_type};

    my $dbentry = Bio::EnsEMBL::DBEntry->new(%{$dbentry_hash});
    $self->dbentry_adaptor->store($dbentry, $feature->dbID, $ftype, 1); #1 is the ignore release flag
    #Count here? No count in caller.
  }


  #Clean data cache
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  return $feature;
}

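
# A hedged sketch of the expected calling pattern from a format parser's
# parse_line method; the field names and helpers shown are illustrative:
#
#   $self->{'_feature_params'} =
#     {
#      -START         => $start,
#      -END           => $end,
#      -STRAND        => $self->set_strand($strand),
#      -SLICE         => $self->cache_slice($chr),
#      -FEATURE_SET   => $self->set,
#      -DISPLAY_LABEL => $label,
#     };
#
#   my $feature = $self->load_feature_and_xrefs;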
#This should really be handled in Bio::EnsEMBL::Feature?
#Move to Helper?

sub set_strand{
  my ($self, $strand) = @_;

  my $ens_strand = 0;

  my %strand_vals = (
                     '1'  =>  1,
                     '0'  =>  0,
                     '-1' => -1,
                     '+'  =>  1,
                     '-'  => -1,
                     '.'  =>  0,
                    );

  if($strand){

    if(exists $strand_vals{$strand}){
      $ens_strand = $strand_vals{$strand};
    }
    else{
      throw("Could not identify strand value for $strand");
    }
  }

  return $ens_strand;
}

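
# For example, with BED-style strand input:
#
#   $self->set_strand('+');   #returns  1
#   $self->set_strand('-');   #returns -1
#   $self->set_strand('.');   #returns  0
#   $self->set_strand(undef); #returns  0 (default)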
sub total_features{
  my ($self, $total) = @_;

  $self->{'total_features'} = $total if defined $total;
  return $self->{'total_features'};
}

#Currently only required for Bed::parse_Features_by_Slice

#filehandle

sub file_handle{
  my ($self, $fh) = @_;

  $self->{'file_handle'} = $fh if defined $fh;
  return $self->{'file_handle'};
}

sub result_set{
  my ($self, $rset) = @_;

  #already tested/created by self

  $self->{'result_set'} = $rset if $rset;
  return $self->{'result_set'};
}

1;