#
# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::InputSet
#

=head1 LICENSE

Copyright (c) 1999-2011 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.

This software is distributed under a modified Apache license.
For license details, please see

http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

Please email comments or questions to the public Ensembl
developers list at <ensembl-dev@ebi.ac.uk>.

Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Funcgen::Parsers::InputSet

=head1 SYNOPSIS

  use vars qw(@ISA);
  @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);


=head1 DESCRIPTION

This is a base class to support simple file format parsers. For simple imports the vendor is
set to the parser type i.e. the file format. The generic read_and_import_simple_data method assumes
a one line per feature format; other formats need their own read_and_import_format_data method,
which will need defining in the result_data config element. Features are stored either as
ResultFeature collections or AnnotatedFeatures, dependent on the input feature class.

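For example, a minimal format parser built on this class might look like the
following sketch. This is illustrative only; the package name, the field layout
and the feature_params keys are assumptions for the example, not part of a
fixed API contract:

  package Bio::EnsEMBL::Funcgen::Parsers::MyFormat;

  use strict;
  use base qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);

  # Called for each data line by read_and_import_data.
  # Returns true if the line produced a feature.
  sub parse_line{
    my ($self, $line, $prepare) = @_;
    my ($chr, $start, $end, $strand) = split /\t/, $line;

    return 0 if ! defined $end; # skip malformed lines

    $self->{'_feature_params'} = {
      -START  => $start,
      -END    => $end,
      -STRAND => $self->set_strand($strand),
      # plus -SLICE, -FEATURE_SET, -ANALYSIS etc. as required by the feature class
    };

    $self->load_feature_and_xrefs if ! $prepare;
    return 1;
  }

  1;
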
=cut

# To do
# Add Parsers for BAM/SAM
# Rename to InputSet
# Handle mysqlimport for large data sets e.g. reads
# Incorporate collection code
# Implement matrix storage

package Bio::EnsEMBL::Funcgen::Parsers::InputSet;

use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
use Bio::EnsEMBL::Funcgen::SegmentationFeature;
use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file is_gzipped);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
#use Bio::EnsEMBL::Funcgen::Utils::Helper;
use strict;

#config stuff, move to BaseImporter?
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::Funcgen::FeatureType;


use base qw(Bio::EnsEMBL::Funcgen::Parsers::BaseImporter); #@ISA change to parent with perl 5.10

#use vars qw(@ISA);
#@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);

my %valid_types = (
    result       => undef,
    annotated    => undef,
    segmentation => undef,
);

=head2 new

  Example    : my $self = $class->SUPER::new(@_);
  Description: Constructor method for the InputSet parser class
  Returntype : Bio::EnsEMBL::Funcgen::Parsers::InputSet
  Exceptions : throws if caller is not an Importer
  Caller     : Bio::EnsEMBL::Funcgen::Parsers::Simple
  Status     : at risk

=cut


sub new{
  my $caller = shift;

  my $class = ref($caller) || $caller;
  my $self = $class->SUPER::new(@_);

  # my $config_file;


  ($self->{'input_set_name'},
   $self->{'input_feature_class'},
   #$self->{'slices'},
   $self->{total_features},
   $self->{force}, #Is this generic enough to go in Importer? used by store_window_bins_by_Slice_Parser
   $self->{dbfile_data_root}, #only appropriate for result input_feature_class
   # $config_file, #User defined config hash file
  ) = rearrange(['input_set_name', 'input_feature_class',
                 'total_features', 'force', 'dbfile_data_root'], @_);


  #Could potentially take fields params directly to define a custom format
  #Take direct field mappings, plus special fields which need parsing differently
  #i.e. default is tab delimited, and GFF would define the Attrs field as a compound field and provide special parsing and field mapping


  throw("This is a skeleton class for Bio::EnsEMBL::Funcgen::Importer, should not be used directly")
    if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));

  $self->{'config'} =
    {(
      #can we omit these?
      array_data => [], #['experiment'],
      probe_data => [], #["probe"],
      norm_method => undef,
      #protocols => {()},
      'results_data' => ["and_import"],
     )};

  #set up feature params
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  #$self->{'counts'} = {};
  #$self->{'slices'} = [];
  #$self->{'seq_region_names'} = [];#Used for slice based import


  # USER CONFIG #
  #Here we need to read config based on an external file
  #Should do something similar to set_feature_sets
  #and validate_and_store_feature_types in BaseExternalParser
  #but we are using define and validate sets instead

  #BaseExternalParser and BaseImporter really need to be merged
  #After we have stripped out all the array/experiment specific stuff

  #Do dev here so we are not developing more stuff in the Importer which will need migrating
  #to the BaseImporter

  #if($config_file){
  #  my $config;

  #  $self->log("Reading config file:\t".$config_file);

  #  if(! ($config = do "$config_file")){
  #    throw("Couldn't parse config file:\t$config_file:\n$@") if $@;
  #    throw("Couldn't do config:\t$config_file\n$!") if ! defined $config;
  #    throw("Couldn't compile config_file:\t$config_file") if ! $config;
  #  }

  #  #At least check it is a hash
  #  if(ref($config) ne 'HASH'){
  #    throw("Config file does not define a valid HASH:\t$config_file");
  #  }
  #
  #  $self->{user_config} = $config;
  #}
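
  # For illustration only: a user config file loaded via "do", as sketched above,
  # would simply be a Perl file whose final expression evaluates to a HASH ref.
  # The keys below are hypothetical, not a defined schema:
  #
  #   # my_import.conf
  #   {
  #     feature_types => { CTCF => { -name => 'CTCF', -class => 'Transcription Factor' } },
  #     analyses      => { my_analysis => { -logic_name => 'my_analysis' } },
  #   };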


  return $self;
}


sub output_file{
  my ($self, $output_file) = @_;

  $self->{'output_file'} = $output_file if $output_file;
  return $self->{'output_file'};
}

sub input_file{
  my ($self, $input_file) = @_;

  $self->{'input_file'} = $input_file if $input_file;
  return $self->{'input_file'};
}


=head2 set_config

  Example    : $self->set_config;
  Description: Sets attribute dependent config
  Returntype : None
  Exceptions : None
  Caller     : Bio::EnsEMBL::Funcgen::Importer
  Status     : at risk

=cut


sub set_config{
  my $self = shift;

  #Move all this to new when we fix the inheritance in Importer

  #We could set input_set_name to experiment name
  #But we would have to make the warning in define_and_validate_sets mention -input_set_name

  throw('Must provide an -input_set name for a '.uc($self->vendor).' import') if ! defined $self->input_set_name();

  #Mandatory checks
  if(! defined $self->feature_analysis){
    throw('Must define a -feature_analysis parameter for '.uc($self->vendor).' imports');
  }


  if(! exists $valid_types{$self->input_feature_class}){
    throw("You must define a valid input_feature_class:\t".
          join(', ', keys %valid_types));
  }

  $self->{'feature_class'} = 'Bio::EnsEMBL::Funcgen::'.ucfirst($self->input_feature_class).'Feature';


  #We need to undef norm method as it has been set to the env var
  $self->{'config'}{'norm_method'} = undef;

  #dirs are not set in config to enable generic get_dir method access
  $self->{dbfile_data_root} ||= $self->get_dir('output');#Required for Collector

  #some convenience methods
  my $adaptor_method = 'get_'.ucfirst($self->input_feature_class).'FeatureAdaptor';
  $self->{'feature_adaptor'}   = $self->db->$adaptor_method;
  $self->{'dbentry_adaptor'}   = $self->db->get_DBEntryAdaptor;
  $self->{'input_set_adaptor'} = $self->db->get_InputSetAdaptor;
  ##$self->{'slice_adaptor'}   = $self->db->dnadb->get_SliceAdaptor;

  #Validate slices
  $self->slices($self->{'slices'}) if defined $self->{'slices'};

  #Move to new when we sort out inheritance
  $self->validate_and_store_config([$self->name]);
  #Could use input_set_name here?
  #This was to support >1 input set per experiment (name)

  #This currently breaks for no config imports
  #i.e. standard Bed import e.g. result_feature collections
  #segmentation imports use Bed and config
  #allow no config imports in BaseImporter?
  #or ultimately set the params as part of the user_config?

  return;
}


sub define_sets{
  my ($self) = @_;

  my $eset = $self->db->get_InputSetAdaptor->fetch_by_name($self->input_set_name);

  if(! defined $eset){
    $eset = Bio::EnsEMBL::Funcgen::InputSet->new
      (
       -name          => $self->input_set_name(),
       -experiment    => $self->experiment(),
       -feature_type  => $self->feature_type(),
       -cell_type     => $self->cell_type(),
       -vendor        => $self->vendor(),
       -format        => $self->format(),
       -analysis      => $self->feature_analysis,
       -feature_class => $self->input_feature_class,
      );
    ($eset) = @{$self->db->get_InputSetAdaptor->store($eset)};
  }

  #Use define_and_validate with fetch/append as we may have a pre-existing set
  #This now needs to handle ResultSets based on InputSets


  my $dset = $self->define_and_validate_sets
    (
     -dbadaptor     => $self->db,
     -name          => $self->input_set_name, #or name?
     -feature_type  => $self->feature_type,
     -cell_type     => $self->cell_type,
     -analysis      => $self->feature_analysis,
     -feature_class => $self->input_feature_class,
     -description   => $self->feature_set_description,
     #-append => 1, #Omit append to ensure we only have this eset
     -recovery      => $self->recovery,
     -supporting_sets => [$eset],
     -slices        => $self->slices,
     #Can't set rollback here, as we don't know until we've validated the files
     #Can't validate the files until we have the sets.
     #So we're doing this manually in validate_files
    );

  #We are now using IMPORTED to define whether a FeatureSet has been imported successfully
  #However we already have IMPORTED on the InputSubSet
  #We should add it to FeatureSet to remain consistent.
  #See Helper::define_and_validate_sets for more notes on
  #potential problems with FeatureSet IMPORTED status


  #define_and_validate_sets should also replicate ResultSets?
  #Questionable, mapped reads are never normalised across replicates
  #They are generally used as input for peak calling individually.
  #So files in this instance are expected to be separate parts of the same replicate
  #e.g. different chromosomes
  #Force one input file?
  #What if we want to link several assays(feature/cell_types) to the same experiment?

  $self->{'_data_set'} = $dset;

  return $dset;
}



#we have rollback functionality incorporated here

sub validate_files{
  my ($self, $prepare) = @_;

  #Get file
  if (! @{$self->result_files()}) {
    my $list = "ls ".$self->get_dir('input').'/'.$self->input_set_name().'*.';#.lc($self->vendor);#could use vendor here? Actually need suffix attr
    my @rfiles = `$list`;
    $self->result_files(\@rfiles);
  }

  #We don't yet support multiple files
  if(scalar(@{$self->result_files()}) > 1){
    warn('Found more than one '.$self->vendor." file:\n".
         join("\n", @{$self->result_files()})."\nThe InputSet parser does not yet handle multiple input files(e.g. replicates).".
         " We need to resolve how we are going to handle replicates with random cluster IDs");
    #do we even need to?
  }

  #Here we are tracking the import of individual files by adding them as InputSubSets
  #Recovery would never know what to delete
  #So would need to delete all, hence no point in setting status?
  #We do not rollback IMPORTED data here. This is done via separate scripts
  #To reduce the risk of accidentally deleting/overwriting data by leaving a stray -rollback
  #flag in the run script

  ### VALIDATE FILES ###
  #We need to validate all the files first, so the import doesn't fall over half way through
  #Or if we come across a rollback halfway through
  my (%new_data, $eset);
  my $dset = $self->data_set;

  if((scalar(@{$self->slices}) > 1) &&
     ! $prepare){
    throw('Validate files does not yet support multiple Slice rollback');
  }

  #This all assumes that there is only ever 1 InputSet

  if($self->input_feature_class eq 'result'){
    $eset = $dset->get_supporting_sets->[0]->get_InputSets->[0];
  }
  else{#annotated/segmentation
    $eset = $dset->get_supporting_sets->[0];
  }


  #IMPORTED status here may prevent
  #further slice based imports
  #so we have to wait to set this until we know all the slices
  #are loaded, unless we store slice based IMPORTED states
  #We currently get around this by never setting IMPORTED for slice based jobs
  #and always rolling back by slice before import

  #This loop supports multiple files
  my (@rollback_sets, %file_paths);
  my $auto_rollback = ($self->rollback) ? 0 : 1;

  foreach my $filepath( @{$self->result_files} ) {
    my ($filename, $sub_set);
    chomp $filepath;
    ($filename = $filepath) =~ s/.*\///;
    $file_paths{$filename} = $filepath;
    $filename =~ s/^prepared\.// if $self->prepared; #reset filename to that originally used to create the InputSubsets

    $self->log('Validating '.$self->vendor." file:\t$filename");
    throw("Cannot find ".$self->vendor." file:\t$filepath") if(! -e $filepath);#Can deal with links

    if( $sub_set = $eset->get_subset_by_name($filename) ){
      #IMPORTED status here is just for the file
      #Any changes to analysis or coord_system should result in a different InputSubset(file)
      #Will only ever be imported into one Feature|ResultSet

      #Currently conflating recover_unimported and rollback
      #as they serve the same purpose until we implement InputSubset level recovery


      if( $sub_set->has_status('IMPORTED') ){
        $new_data{$filepath} = 0;
        $self->log("Found previously IMPORTED InputSubset:\t$filename");
      }
      else{
        $self->log("Found existing InputSubset without IMPORTED status:\t$filename");
        push @rollback_sets, $sub_set;
      }
    }
    else{
      $self->log("Found new InputSubset:\t${filename}");
      throw("Should not have found new 'prepared' file:\t$filename") if $self->prepared;
      $new_data{$filepath} = 1;
      $sub_set = $eset->add_new_subset($filename);
      $self->input_set_adaptor->store_InputSubsets([$sub_set]);
    }
  }


  #Does -recover allow a single extra new file to be added to an existing InputSet?

  if(@rollback_sets && #recoverable sets i.e. exist but not IMPORTED
     ( (! $self->recovery) && (! $self->rollback) ) ){
    throw("Found partially imported InputSubsets:\n\t".join("\n\t", (map $_->name, @rollback_sets))."\n".
          "You must specify -recover or -rollback to perform a full rollback");
  }
  elsif(@rollback_sets && $self->recovery){
    #The warnings below were previously unreachable after the throw above
    #Change these to logger->warn
    $self->log("WARNING::\tCannot yet rollback for just an InputSubset, rolling back entire set? Unless slices defined");
    $self->log("WARNING::\tThis may be deleting previously imported data which you are not re-importing..list?!!!\n");
  }


  if($self->rollback){
    #Check we have all existing InputSubset files before we do a full rollback
    #Can probably remove this if we support InputSubset(file/slice) level rollback
    $self->log('Rolling back all InputSubsets');
    @rollback_sets = @{$eset->get_InputSubsets};

    foreach my $isset(@rollback_sets){

      if(! exists $file_paths{$isset->name}){
        throw("You are attempting a multiple InputSubset rollback without specifying an existing InputSubset:\t".$isset->name.
              "\nAborting rollback as data will be lost. Please specify all existing InputSubset file names");
      }
    }
  }


  foreach my $esset(@rollback_sets){
    #This needs to be mapped to the specified filepaths
    my $fp_key = $esset->name;
    $fp_key = 'prepared.'.$fp_key if $self->prepared;

    $new_data{$file_paths{$fp_key}} = 1;
    $self->log("Revoking states for InputSubset:\t\t\t".$esset->name);
    $eset->adaptor->revoke_states($esset);

    if(! $prepare){
      #This was to avoid redundant rollback in the prepare step
      $self->log("Rolling back InputSubset:\t\t\t\t".$esset->name);

      if($self->input_feature_class eq 'result'){
        #Can we do this by slice for parallelisation?
        #This will only ever be a single ResultSet due to Helper::define_and_validate_sets
        #flags are rollback_results and force(as this won't be a direct input to the product feature set)
        $self->rollback_ResultSet($self->data_set->get_supporting_sets->[0], 1, $self->slices->[0], 1);
        #Do not have rollback_InputSet here as we may have parallel Slice based imports running
      }
      else{#annotated/segmentation
        $self->rollback_FeatureSet($self->data_set->product_FeatureSet, undef, $self->slices->[0]);
        $self->rollback_InputSet($eset);
        last;
      }
    }
  }

  return \%new_data;
}



sub set_feature_separator{
  my ($self, $separator) = @_;

  #How do we test if something undefined was passed?
  #Rather than nothing passed at all?
  #Can't do this as this is the accessor
  #Need to split method

  throw('Must provide a valid feature separator') if ( (! defined $separator) || ($separator eq '') );

  $self->{'_feature_separator'} = $separator;

}

# SIMPLE ACCESSORS
# Some of these can be called for each record
# Trim the access time as much as possible

sub input_feature_class{ return $_[0]->{'input_feature_class'}; }

sub input_set_name{      return $_[0]->{'input_set_name'}; } #Set in new

sub feature_adaptor{     return $_[0]->{'feature_adaptor'}; }

sub dbentry_adaptor{     return $_[0]->{'dbentry_adaptor'}; }

sub input_set_adaptor{   return $_[0]->{'input_set_adaptor'}; }

sub set{                 return $_[0]->{'set'}; } #Feature or Result, set in define_sets

##sub slice_adaptor{     return $_[0]->{'slice_adaptor'}; }

sub data_set{            return $_[0]->{'_data_set'}; }

sub feature_separator{   return $_[0]->{'_feature_separator'}; }

sub feature_params{      return $_[0]->{'_feature_params'}; }

sub dbentry_params{      return $_[0]->{'_dbentry_params'}; }

sub input_gzipped{       return $_[0]->{'input_gzipped'}; }


sub input_file_operator{
  my ($self, $op) = @_;
  #Should be set in format parser
  $self->{'input_file_operator'} = $op if defined $op;

  return $self->{'input_file_operator'};
}
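
# For example, a format parser handling gzipped input might set something like
# the following (hypothetical value; the operator is a sprintf-style open
# template consumed by EFGUtils::open_file, cf. the "| gzip -c > %s" operator
# used in read_and_import_data below):
#
#   $self->input_file_operator("gzip -dc %s |");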

# prepare boolean, simply stores the sets and preprocesses the file
# so we don't get each batch job trying to sort etc
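#
# A rough (hypothetical) driver sequence would therefore be:
#
#   $parser->read_and_import_data(1);  # prepare: define sets and sort/dump the file once
#   $parser->read_and_import_data();   # import: run in each batch job on the prepared file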


#Still need to implement prepare in other Parsers!!

sub read_and_import_data{
  my ($self, $prepare) = @_;

  my $action = ($prepare) ? 'preparing' : 'importing';

  $self->log("Reading and $action ".$self->vendor()." data");
  my ($eset, $filename, $output_set, $fh, $f_out, %feature_params, @lines);

  if($prepare && ! $self->isa('Bio::EnsEMBL::Funcgen::Parsers::Bed')){
    throw('prepare mode is only currently implemented for the Bed parser');
  }


  #Test for conflicting run modes
  if($prepare &&
     ($self->batch_job || $self->prepared)){
    #prepare should be called once by the runner, not in each batch_job
    #don't prepare if already prepared
    throw('You cannot run read_and_import_data in prepare mode with a -batch_job or -prepared job');
  }

  my $dset = $self->define_sets;

  #We also need to account for bsub'd slice based import
  #seq alignments loaded into a ResultSet
  #Cannot have 0 window for ChIP Seq alignments
  #As this would mean storing all the individual reads
  #Hence we need to remap to a new assm before we import!

  if($self->input_feature_class eq 'result'){
    $output_set = $dset->get_supporting_sets->[0];
    $eset       = $output_set->get_InputSets->[0];
    $self->result_set($output_set);#required for ResultFeature Collector and Bed Parser
  }
  else{#annotated/segmentation
    $output_set = $dset->product_FeatureSet;
    $eset       = $dset->get_supporting_sets->[0];
  }


  #If we can do these the other way around we can get define_sets to rollback the FeatureSet
  #Cyclical dependency for the sets :|
  my $new_data = $self->validate_files($prepare);
  my $seen_new_data = 0;

  ### READ AND IMPORT FILES ###
  foreach my $filepath(@{$self->result_files()}) {
    chomp $filepath;

    ($filename = $filepath) =~ s/.*\///;
    $self->input_file($filepath); #This is only used by the Collector::ResultFeature::reinitialise_input method

    if($new_data->{$filepath} ){ #This will currently autovivify!
      $seen_new_data = 1;
      $self->{'input_gzipped'} = &is_gzipped($filepath);

      $filepath = $self->pre_process_file($filepath, $prepare) if $self->can('pre_process_file');

      $self->log_header(ucfirst($action).' '.$self->vendor." file:\t".$filepath);

      #We need to be able to optionally open a pipe to gzip | sort here
      #i.e. define open command
      $fh = open_file($filepath, $self->input_file_operator);

      #This may become way too large for some reads files
      #Currently no problems
      #This is not working as we are sorting the file!
      #$self->parse_header($fh) if $self->can('parse_header');

      #For result features sometimes we want to run
      #locally and just sort without dumping
      #i.e. if we are not a batch job
      #as there is no need to dump if it is a single process


      #Should this be prepared?

      if((($self->input_feature_class eq 'result') && ! $prepare)){
        #(($self->input_feature_class eq 'result') && (! $self->batch_job))){ #Local run on just 1 chr
        #

        #Use the ResultFeature Collector here
        #Omitting the 0 wsize
        #How are we going to omit 0 wsize when doing the fetch?
        #simply check table name in ResultSet?

        #Should we do this for multiple chrs?
        #or fail here
        # we need to pass self
        #for access to get_Features_by_Slice
        #which should be in the specific parser e.g Bed

        #Will this not clash with standard ResultFeature::get_ResultFeatures_by_Slice?
        #Could really do with separating the pure file parsers from the importer code, so these can be reused
        #by other code. Then simply use the Bed import parser for specific import functions and as a wrapper to
        #the Bed file parser
        #So should really have
        #Parsers::File::Bed
        #and
        #Parsers::Import::Bed
        #This can probably wait until we update BioPerl and just grab the Bed parser from there?

        my $slices = $self->slices;

        #Should this be caught in new?
        if(! @$slices){
          throw("You must define a slice to generate ResultFeature Collections from InputSet:\t".$eset->name);
        }


        if(scalar(@$slices) > 1){
          throw("InputSet parser does not yet support multi-Slice import for ResultFeature collections\n"
                ."Please submit these to the farm as single slice jobs");
        }

        #restrict to just 1 slice as we don't yet support disk seeking
        #if the slices are not in the same order as they appear in the file
        #also we want to parallelise this

        #Set as attr for parse_Features_by_Slice in format specific Parsers


        $self->file_handle(open_file($filepath, $self->input_file_operator));


        foreach my $slice(@$slices){
          $self->feature_adaptor->store_window_bins_by_Slice_Parser($slice, $self,
                                                                    (
                                                                     #Force needs reimplementing here?
                                                                     -force            => $self->{force},
                                                                     -dbfile_data_root => $self->{dbfile_data_root},
                                                                    ));
        }

        warn "Need to update InputSubset status to IMPORTED after all slices have been loaded";
        #Do we even need to set RESULT_FEATURE_SET for input_set ResultFeatures?



        warn "Closing $filename\nDisregard the following 'Broken pipe' warning";

        #Closing the read end of a pipe before the process writing to it at the other end
        #is done writing results in the writer receiving a SIGPIPE. If the other end can't
        #handle that, be sure to read all the data before closing the pipe.
        #This suggests the gzip pipe has not finished reading, but it *should* be at the end of the file?
        #$SIG{PIPE} = 'IGNORE'; #Catch with subref and warn instead?
        #Or maybe an eval will catch the STDERR better?
        #sub catch_SIGPIPE{
        #  my $sig = shift @_;
        #  print " Caught SIGPIPE: $sig $1 \n";
        #  return;
        #
        #}
        #$SIG{PIPE} = \&catch_SIGPIPE;
        #my $retval = eval { no warnings 'all'; $fh->close };
        #if($@){
        #  warn "after eval with error $@\nretval is $retval";
        #}
        #Neither of these catch gzip: stdout: Broken pipe

        #IO::UnCompress::Gunzip?


        $fh->close;
      }
      else{


        #Revoke FeatureSet IMPORTED state here in case we fail halfway through
        $output_set->adaptor->revoke_status('IMPORTED', $output_set) if ($output_set->has_status('IMPORTED') && (! $prepare));

        #What about IMPORTED_"CSVERSION"
        #This may leave us with an incomplete import which still has
        #an IMPORTED_CSVERSION state
        #We need to depend on IMPORTED for completeness of set
        #DAS currently only uses IMPORTED_CSVERSION
        #This is okayish but we also need to write HCs for any sets
        #which do not have IMPORTED state!
        my ($line, @outlines, $out_fh);


        if($prepare && ! $self->batch_job){
          #Assume we want gzipped output
          #filename is actually based on input, so may not have gz in the file name
          $out_fh = open_file($self->output_file, "| gzip -c > %s");
        }


        while(defined ($line=<$fh>)){
          #Generic line processing
          #Move these to parse_line?
          $line =~ s/\r*\n//o;
          next if $line =~ /^\#/;
          next if $line =~ /^$/;

          #This has now been simplified to the process_line method
          #my @fields = split/\t/o, $line;
          #start building parameters hash
          #foreach my $field_index(@{$self->get_field_indices}){
          #  my $field = $self->get_field_by_index($field_index);
          #  $feature_params = ($field =~ /^-/) ? $fields[$field_index] : $self->$field($fields[$field_index]);
          #}


          #We also need to enable different parse_line methods if we have different file formats
          #e.g. cisRED
          #Code refs?

          if($self->parse_line($line, $prepare)){
            $self->count('total parsed lines');

            #Cache or print to sorted file
            if($prepare && ! $self->batch_job){
              push @outlines, $line;

              #Flush the cache to the prepared file in batches,
              #without dropping the line which triggered the flush
              if(scalar(@outlines) > 1000){
                print $out_fh join("\n", @outlines)."\n";
                @outlines = ();
              }
            }
          }
        }

        close($fh);

        #Print the last of the sorted file
        if($prepare && ! $self->batch_job){
          print $out_fh join("\n", @outlines)."\n";
          close($out_fh);
          @outlines = ();
        }

        if(! $prepare){
          #Now we need to deal with anything left in the read cache
          $self->process_params if $self->can('process_params');

          #To speed things up we may need to also do file based import here with WRITE lock?
          #mysqlimport will write lock the table by default?

          #reset filename to that originally used to create the InputSubsets
          $filename =~ s/^prepared\.// if $self->prepared;

          my $sub_set = $eset->get_subset_by_name($filename);
          $sub_set->adaptor->store_status('IMPORTED', $sub_set) if ! $self->batch_job;
        }
      }


      if($prepare){
        $self->log("Finished preparing import from:\t$filepath");
      }
      else{
        #Need to tweak this for slice based import
        $self->log('Finished importing '.$self->counts('features').' '.
                   $output_set->name." features from:\t$filepath");

      }


      #This currently fails here if the uncaught file sort was not successful

      foreach my $key (keys %{$self->counts}){
        $self->log("Count $key:\t".$self->counts($key));
      }
    }
  }

  #Here we should set IMPORTED on the FeatureSet
  #We could also log the first dbID of each feature in a subset to facilitate subset rollback
  #in the feature table
  #this would be sketchy at best
  #delete from annotated_feature where annotated_feature_id >= $first_subset_feature_id and feature_set_id=$feature_set_id
  #This may already have IMPORTED status as we don't revoke the status whilst
  #updating to protect the feature set due to lack of supporting set tracking
  #see Helper::define_and_validate_sets for more notes.
  #Is there any point in setting it if we don't revoke it?
  #To allow consistent status handling across sets. Just need to be aware of the fset status caveat.
  #Also currently happens with ResultFeatures loaded by slice jobs, as this may already be set by a parallel job

  if(! $prepare){
    $output_set->adaptor->set_imported_states_by_Set($output_set) if $seen_new_data && ! $self->batch_job;
    $self->log("No new data, skipping result parse") if ! grep /^1$/o, values %{$new_data};
    $self->log("Finished parsing and importing results");
  }

  return;
}


#Should be called from format parser e.g. BED, GFF, eQTL etc
#Why don't we pass feature_params and dbentry_params directly?

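# Hypothetical caller sketch (the parameter keys shown are illustrative
# assumptions, except feature_type, which is deleted below before the
# DBEntry is constructed):
#
#   $self->{'_feature_params'} = { -START => $start, -END => $end, ... };
#   push @{$self->{'_dbentry_params'}}, { -PRIMARY_ID => $xref_id, feature_type => 'AnnotatedFeature' };
#   my $feature = $self->load_feature_and_xrefs;
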
sub load_feature_and_xrefs{
  my $self = shift;

  #warn "Loading ".($self->{_counts}{features}+1).' '.$self->feature_params->{-FEATURE_TYPE}->name."\n";
  #This now only fails once on the first run and then
  #Need to count based on feature_type?

  #new rather than new_fast here as we want to validate the import
  my $feature = $self->{feature_class}->new(%{$self->feature_params});
  ($feature) = @{$self->feature_adaptor->store($feature)};
  $self->count('features');

  #Add count based on FeatureType, should be ftype name and analysis to reflect unique ftype key?



  ##This needs to be handled in the caller as we are validating loci?
  #if($self->ucsc_coords){
  #  $start += 1;
  #  $end   += 1;
  #}

  #This needs to be put in a separate sub and called by the caller
  #if(! $self->cache_slice($chr)){
  #  warn "Skipping AnnotatedFeature import, found non-standard chromosome: $chr";
  #}
  #else{
  #grab seq if dump fasta and available
  #my $seq;
  #if(exists $self->feature_params->{'sequence'}){
  #  $seq = $self->feature_params->{'sequence'};
  #  delete $self->feature_params->{'sequence'};
  #}
  #else{
  #  $self->log('No fasta sequence available for '.$self->feature_params->display_label);
  #}
  #}

  #dump fasta here
  #if ($self->dump_fasta){
  #  $self->{'_fasta'} .= $self->generate_fasta_header($feature)."\n$seq\n";
  #}

  #Store the xrefs

  foreach my $dbentry_hash(@{$self->{'_dbentry_params'}}){
    my $ftype = $dbentry_hash->{feature_type};
    delete $dbentry_hash->{feature_type};

    my $dbentry = Bio::EnsEMBL::DBEntry->new(%{$dbentry_hash});
    $self->dbentry_adaptor->store($dbentry, $feature->dbID, $ftype, 1);#1 is the ignore release flag
    #count here? no count in caller
  }


  #Clean data cache
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  return $feature;
}

#This should really be handled in Bio::EnsEMBL::Feature?
#Move to Helper?

sub set_strand{
  my ($self, $strand) = @_;

  my $ens_strand = 0;

  my %strand_vals = (
      '1'  =>  1,
      '0'  =>  0,
      '-1' => -1,
      '+'  =>  1,
      '-'  => -1,
      '.'  =>  0,
  );

  if($strand){

    if(exists $strand_vals{$strand}){
      $ens_strand = $strand_vals{$strand};
    }
    else{
      throw("Could not identify strand value for $strand");
    }
  }

  return $ens_strand;
}
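
# For example (hypothetical calls):
#   $self->set_strand('+');   # returns  1
#   $self->set_strand('.');   # returns  0 (via the lookup)
#   $self->set_strand(undef); # returns  0 (default)
#   $self->set_strand('x');   # throws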

sub total_features{
  my ($self, $total) = @_;

  $self->{'total_features'} = $total if defined $total;
  return $self->{'total_features'};
}

#Currently only required for Bed::parse_Features_by_Slice

#filehandle


sub file_handle{
  my ($self, $fh) = @_;

  $self->{'file_handle'} = $fh if defined $fh;
  return $self->{'file_handle'};
}

sub result_set{
  my ($self, $rset) = @_;

  #already tested/created by self

  $self->{'result_set'} = $rset if $rset;
  return $self->{'result_set'};
}

1;