0
|
1 #
|
|
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Bed
|
|
3 #
|
|
4
|
|
5 =head1 LICENSE
|
|
6
|
|
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
8 Genome Research Limited. All rights reserved.
|
|
9
|
|
10 This software is distributed under a modified Apache license.
|
|
11 For license details, please see
|
|
12
|
|
13 http://www.ensembl.org/info/about/code_licence.html
|
|
14
|
|
15 =head1 CONTACT
|
|
16
|
|
17 Please email comments or questions to the public Ensembl
|
|
18 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
19
|
|
20 Questions may also be sent to the Ensembl help desk at
|
|
21 <helpdesk@ensembl.org>.
|
|
22
|
|
23 =head1 NAME
|
|
24
|
|
25 Bio::EnsEMBL::Funcgen::Parsers::Bed
|
|
26
|
|
27 =head1 SYNOPSIS
|
|
28
|
|
29 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Bed";
|
|
30 push @INC, $parser_type;
|
|
31 my $imp = $class->SUPER::new(@_);
|
|
32
|
|
33
|
|
34 =head1 DESCRIPTION
|
|
35
|
|
36 This is a definitions class which should not be instatiated directly, it
|
|
37 normally set by the Importer as the parent class. Bed contains meta
|
|
38 data and methods specific to data in bed format, to aid
|
|
39 parsing and importing of experimental data.
|
|
40
|
|
41 =cut
|
|
42
|
|
43 #Import/Parser rework
|
|
44 #We now have potential to use indexed DBFile and Parsers
|
|
45 #Importer should become BaseImporter, inherited from InputSet/Nimblegen
|
|
46 #Have Bed(format) importer which sets the generic Bed(format) Parser
|
|
47
|
|
48 package Bio::EnsEMBL::Funcgen::Parsers::Bed;
|
|
49
|
|
50 use Bio::EnsEMBL::Funcgen::Parsers::InputSet;
|
|
51 use Bio::EnsEMBL::Funcgen::FeatureSet;
|
|
52 use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
|
|
53 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
|
|
54 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(open_file is_bed);
|
|
55 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
|
|
56 use Bio::EnsEMBL::Funcgen::Utils::Helper;
|
|
57 use File::Basename;
|
|
58 use strict;
|
|
59
|
|
60 use Data::Dumper;
|
|
61
|
|
62 use vars qw(@ISA);
|
|
63 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);
|
|
64
|
|
65 #To do
|
|
66 # Extend this so we can import features to annotated_feature
|
|
67 # and profiles/reads to result_feature
|
|
68 # This will replace individual tables for current bed das sources
|
|
69 # Altho' idea of separate tables for each set is not necessarily a bad one
|
|
70 # We would have to have extra table or fields details registration
|
|
71 # Easier admin i.e. drop tables
|
|
72 # What about partitioning? Irrelevant if we are moving to matrix
|
|
73 # Not easy to patch dynamically named tables? Can't assign values to user varaible from query!
|
|
74 # Would need to be a stored procedure
|
|
75 # Could have separate matrix files for result_features as we probably wouldn't want
|
|
76 # to do any cross set querying at this level.
|
|
77 # Could we also use this in the RunnableDBs?
|
|
78 # Simply separate those methods required by both normal bed annotated_feature import
|
|
79 # and RunnableDB based annotated_feature import
|
|
80 # Importing into result_feature requires a result_set which assumes a chip experiment
|
|
81 # We need to alter result_set such that it can be optionally associated with ExperimentalSets
|
|
82 # Or should we just go for a different table? read_feature?
|
|
83 # NOTE: PARITION by key may run into problems if different sets have different windows
|
|
84 # Would also need to modify ResultFeatureAdaptor?
|
|
85
|
|
86 =head2 new
|
|
87
|
|
88 Arg[0] : hash containing optional attributes:
|
|
89 -bed_reads => 0|1, #Set input as read alignments rather than peak calls (default is 0)
|
|
90 Example : my $self = $class->SUPER::new(@_);
|
|
91 Description: Constructor method for Bed class
|
|
92 Returntype : Bio::EnsEMBL::Funcgen::Parsers::Bed
|
|
93 Exceptions : throws if Experiment name not defined or if caller is not Importer
|
|
94 Caller : Bio::EnsEMBL::Funcgen::Importer
|
|
95 Status : at risk
|
|
96
|
|
97 =cut
|
|
98
|
|
99 sub new{
|
|
100 my $caller = shift;
|
|
101 my $class = ref($caller) || $caller;
|
|
102 my $self = $class->SUPER::new(@_, no_disconnect => 1);
|
|
103
|
|
104 throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly")
|
|
105 if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
|
|
106
|
|
107
|
|
108 #This was over-riding InputSet new config
|
|
109 #$self->{'config'} =
|
|
110 # {(
|
|
111 # #order of these method arrays is important!
|
|
112 # #remove method arrays and execute serially?
|
|
113 # #Just move these to individual Parser register_experiment methods
|
|
114 # array_data => [],#['experiment'],
|
|
115 # probe_data => [],#["probe"],
|
|
116 # results_data => ["and_import"],
|
|
117 # norm_method => undef,
|
|
118 #
|
|
119 # #Need to make these definable?
|
|
120 # #have protocolfile arg and just parse tab2mage protocol section format
|
|
121 # #SEE NIMBLEGEN FOR EXAMPLE
|
|
122 # protocols => {()},
|
|
123 # )};
|
|
124
|
|
125 $self->{'overhang_features'} = []; #Move to InputSet?
|
|
126 #Maybe used by other formats
|
|
127
|
|
128 return $self;
|
|
129 }
|
|
130
|
|
131
|
|
132 =head2 set_config
|
|
133
|
|
134 Example : my $self->set_config;
|
|
135 Description: Sets attribute dependent config
|
|
136 Returntype : None
|
|
137 Exceptions : None
|
|
138 Caller : Bio::EnsEMBL::Funcgen::Importer
|
|
139 Status : at risk -remove
|
|
140
|
|
141 =cut
|
|
142
|
|
143
|
|
144 sub set_config{
|
|
145 my $self = shift;
|
|
146
|
|
147 $self->SUPER::set_config;
|
|
148 #dir are not set in config to enable generic get_dir method access
|
|
149 #Need to define output dir if we are processing reads
|
|
150
|
|
151 return;
|
|
152 }
|
|
153
|
|
154
|
|
155
|
|
156 sub pre_process_file{
|
|
157 my ($self, $filepath, $prepare) = @_;
|
|
158
|
|
159 #Test file format
|
|
160 throw("Input file is not bed format:\t$filepath") if ! &is_bed($filepath);
|
|
161
|
|
162
|
|
163 #separate sort keys stop lexical sorting of start/end
|
|
164 #when faced with a non numerical seq_region_name
|
|
165 my $sort = ($prepare || ! $self->prepared) ? 'sort -k1,1 -k2,2n -k3,3n ' : '';
|
|
166
|
|
167
|
|
168 if($self->input_gzipped){
|
|
169 $sort .= '|' if $sort;
|
|
170 $self->input_file_operator("gzip -dc %s | $sort ");
|
|
171 }
|
|
172 else{
|
|
173 #This is really only required for read alignments
|
|
174 $self->input_file_operator("$sort %s |");
|
|
175 }
|
|
176
|
|
177
|
|
178 if(! defined $self->output_file && $self->input_feature_class eq 'result'){
|
|
179 my ($name) = fileparse($filepath);
|
|
180 $name =~ s/\.gz// if $self->input_gzipped;
|
|
181
|
|
182 if($prepare){
|
|
183 #This will be filtered for seq_region_name
|
|
184 $self->output_file($self->get_dir('output')."/prepared.${name}.gz");
|
|
185 }
|
|
186 else{
|
|
187 #Not currently used as we use direct import
|
|
188 #via AnnotatedFeatures or ResultFeature Collections
|
|
189
|
|
190 #output_file would only be used DAS read mysqlimport loading
|
|
191 #This could also be used by SAM/BAM so put in InputSet
|
|
192
|
|
193 #Or do we need a file for each discrete set?
|
|
194 #Each import consititutes one discrete data set
|
|
195 #So this is okay for replicates in a result set
|
|
196 #But not for different cell/feature types
|
|
197 #Use one file
|
|
198 #This is imposed case in INputSet::validate_files
|
|
199 #We can't pipe this to gzip as we need to mysqlimport it, which does not support gzip?
|
|
200 #gzip -dc file.sql | mysql would work but would be much slower due ti individual inserts
|
|
201 #$self->output_file($self->get_dir('output')."/result_feature.${name}.txt");
|
|
202 }
|
|
203 }
|
|
204
|
|
205 return $filepath;
|
|
206 }
|
|
207
|
|
208
|
|
209 #sub parse_header{
|
|
210 # my ($self, $fh) = @_;
|
|
211 #
|
|
212 #
|
|
213 # #This will not work on a sorted file, so would have to
|
|
214 # #store header and test match every line!
|
|
215 # #just test for track name= for now
|
|
216 #
|
|
217 #
|
|
218 # warn "PARSING HEADER";
|
|
219 # my $nr = 0;
|
|
220 #
|
|
221 # for my $line(<$fh>){
|
|
222 # $nr++;
|
|
223 #
|
|
224 # #my $nr = $fh->input_line_number();#This always return 3451? Length of file?
|
|
225 # #This is not yet reliable here!!!!
|
|
226 # #Is this because of the gzip sort?
|
|
227 # #So let's depend on count?
|
|
228 # #If we don't know when the header is supposed to finish (i.e. multi line header)
|
|
229 # #We will need to decrement the seek position somehow
|
|
230 #
|
|
231 # warn "INPUT LINE = $nr $line";
|
|
232 #
|
|
233 # #exit;
|
|
234 # if ($nr == 1){#$INPUT_LINE_NUMBER;
|
|
235 # #sanity check here
|
|
236 # return if($line =~ /track name=/o);
|
|
237 # $self->log(":: WARNING ::\tBED file does not appear to have valid header. First line($nr) will be treated as data:\t$line");
|
|
238 # }
|
|
239 #
|
|
240 # exit;
|
|
241 #
|
|
242 # }
|
|
243 #
|
|
244 #
|
|
245 # exit;
|
|
246 #
|
|
247 # return;
|
|
248 #}
|
|
249
|
|
250
|
|
251
|
|
252 sub parse_line{
|
|
253 my ($self, $line, $prepare) = @_;
|
|
254
|
|
255 #Need to handle header here for bed is always $.?
|
|
256 #Also files which do not have chr prefix? i.e. Ensembl BED rather than UCSC Bed with is also half open coords
|
|
257
|
|
258 #if ($. == 0){#$INPUT_LINE_NUMBER;
|
|
259 # #sanity check here
|
|
260 return 0 if($line =~ /track name=/o);
|
|
261 $line =~ s/\r*\n//o;#chump accounts for windows files
|
|
262
|
|
263
|
|
264 my ($chr, $start, $end, $name, $score, $strand, @other_fields) = split/\s+/o, $line;#Shoudl this not be \t?
|
|
265 #Should we define minimum fields or microbed format with no naqme and just score?
|
|
266 #my ($chr, $start, $end, $score) = split/\t/o, $line;#Mikkelson hack
|
|
267 #Validate variables types here beofre we get a nasty error from bind_param?
|
|
268
|
|
269 #Any more valid BED fields here?
|
|
270 # thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
|
|
271 # thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays).
|
|
272 # itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser.
|
|
273 # blockCount - The number of blocks (exons) in the BED line.
|
|
274 # blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
|
|
275 # blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
|
|
276
|
|
277 my $slice = $self->cache_slice($chr, undef, $prepare);
|
|
278 #prepare counts total features for RPKM
|
|
279 #This also filter slices for those defined
|
|
280
|
|
281 if(! $slice){
|
|
282 return 0;
|
|
283 }
|
|
284 else{
|
|
285 my $sr_name = $slice->seq_region_name;
|
|
286 my $slice_name = $slice->name;
|
|
287
|
|
288
|
|
289 if(! $prepare){
|
|
290
|
|
291 $strand = $self->set_strand($strand);
|
|
292 $start += 1 if $self->ucsc_coords;
|
|
293
|
|
294 # Set name dependantly on input class
|
|
295 my %name_param;
|
|
296
|
|
297 if($self->input_feature_class eq 'segmentation'){
|
|
298 #Expand this into pluggable config
|
|
299 #defined by field position against the input param name
|
|
300
|
|
301 #Need to define ftype and analysis config outside
|
|
302 #of this parser anyway
|
|
303
|
|
304 #Can we use similar set up to external parser config
|
|
305 #but extract actual config to separate file
|
|
306
|
|
307
|
|
308
|
|
309 if(! exists $self->{user_config}{feature_sets}{$self->name}{feature_types}{$name}){
|
|
310 #No need to test is valid as we have already done this
|
|
311 #just need to make sure we don't initialise the hash key
|
|
312 throw("Found segmentation BED name which is not defined in the feature_types".
|
|
313 " config for your feature_set:\t$name");
|
|
314 }
|
|
315
|
|
316 #warn "$name ftype is ".$self->{user_config}{feature_sets}{$self->name}{feature_types}{$name};
|
|
317 $name_param{'-FEATURE_TYPE'} = $self->{user_config}{feature_sets}{$self->name}{feature_types}{$name};
|
|
318
|
|
319 #DISPLAY_LABEL Let this get autogenerated?
|
|
320
|
|
321 }
|
|
322 else{#annotated
|
|
323 $name_param{'-DISPLAY_LABEL'} = $name;
|
|
324 }
|
|
325
|
|
326
|
|
327 $self->{_feature_params} = {
|
|
328 -START => $start,
|
|
329 -END => $end,
|
|
330 -STRAND => $strand,
|
|
331 -SLICE => $self->cache_slice($chr),
|
|
332 -SCORE => $score,
|
|
333 -FEATURE_SET => $self->data_set->product_FeatureSet,
|
|
334 %name_param,
|
|
335 };
|
|
336
|
|
337 $self->load_feature_and_xrefs;
|
|
338 }
|
|
339 }
|
|
340
|
|
341 return 1;
|
|
342 }
|
|
343
|
|
344
|
|
345 #For the purposes of creating ResultFeature Collections
|
|
346 #Dependancy on creating features is overkill
|
|
347 #altho not critical as this is never used for display
|
|
348
|
|
349 #Should really move this to InputSet parser
|
|
350 #Altho this would require an extra method call per line to parse the record
|
|
351
|
|
352 sub parse_Features_by_Slice{
|
|
353 my ($self, $slice) = @_;
|
|
354
|
|
355 #Slice should have been checked by now in caller
|
|
356 if($slice->strand != 1){
|
|
357 throw("Bed Parser does not support parsing features by non +ve/forward strand Slices\n".
|
|
358 'This is to speed up generation of ResultFeature Collections for large sequencing data sets');
|
|
359 }
|
|
360
|
|
361 my $slice_chr = $slice->seq_region_name;
|
|
362
|
|
363 #This method assumes that method calls will walk through a seq_region
|
|
364 #using adjacent slices
|
|
365
|
|
366 #We need to maintain a feature cache, which contains all the features which over hang
|
|
367 #the current slice, such that we can include them in the next batch of features returned
|
|
368
|
|
369 my @features;
|
|
370 my $slice_end = $slice->end;
|
|
371 my $slice_start = $slice->start;
|
|
372 my $last_slice = $self->last_slice;
|
|
373 my $last_slice_end = ($last_slice) ? $last_slice->end : ($slice_start - 1);
|
|
374 my $last_slice_name = ($last_slice) ? $last_slice->seq_region_name : $slice->seq_region_name;
|
|
375 my $rset_id = $self->result_set->dbID;
|
|
376
|
|
377 if(! ($slice_start == ($last_slice_end + 1) &&
|
|
378 ($slice->seq_region_name eq $last_slice_name))){
|
|
379 #Need to reopen the file as we are doing a second pass over the same data
|
|
380 #This is not guaranteed to work for re-reading sets of slices
|
|
381 #This would also not be caught by this test
|
|
382
|
|
383 #To be safe we need to reset the file handle from the caller context
|
|
384
|
|
385 throw("Bed parser does not yet support parsing features from successive non-adjacent Slices\n".
|
|
386 "Last slice end - Next slice start:\t$last_slice_name:${last_slice_end} - ".
|
|
387 $slice->seq_region_name.':'.$slice_start);
|
|
388 }
|
|
389
|
|
390
|
|
391 #Deal with 5' overhang first
|
|
392 foreach my $feature(@{$self->overhang_features}){
|
|
393 $feature = $feature->transfer($slice);
|
|
394 push @features, $feature if $feature;#This should always be true
|
|
395 }
|
|
396
|
|
397 $self->{'overhang_features'} = []; #reset overhang features
|
|
398 my $fh = $self->file_handle;
|
|
399 my ($line, $feature);
|
|
400 my $parse = 1;
|
|
401 #Add counts here, or leave to Collector?
|
|
402 my $seen_chr = 0;
|
|
403
|
|
404 #This currently parses the rest of the file once we have seen the data we want
|
|
405
|
|
406 while((defined ($line = <$fh>)) && $parse){
|
|
407 #This does not catch the end of the file!
|
|
408
|
|
409
|
|
410 if($self->last_line){#Deal with previous line first
|
|
411 $line = $self->last_line;
|
|
412 $self->last_line('');
|
|
413 }
|
|
414 else{
|
|
415 $line = <$fh>;
|
|
416 }
|
|
417
|
|
418 #Still need to chump here in case no other fields
|
|
419 $line =~ s/\r*\n//o if $line;#chump accounts for windows files
|
|
420
|
|
421 if(! $line){
|
|
422 warn("Skipping empty line");
|
|
423 next;
|
|
424 }
|
|
425
|
|
426 #We could use a generic method to parse here
|
|
427 #But it is small enough and simple enough to have twice
|
|
428 my ($chr, $start, $end, $name, $score, $strand, @other_fields) = split/\s+/o, $line;#Shoudl this not be \t?
|
|
429
|
|
430 if($slice_chr eq $chr){#Skim through the file until we find the slice
|
|
431 $seen_chr = 1;
|
|
432 if($end >= $slice_start){
|
|
433
|
|
434 if($start <= $slice_end){#feature is on slice
|
|
435
|
|
436 $feature = Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast
|
|
437 ({
|
|
438 start => ($start - $slice_start + 1),
|
|
439 end => ($end - $slice_start + 1),
|
|
440 strand => $strand,
|
|
441 scores => [$score],
|
|
442 result_set_id => $rset_id,
|
|
443 window_size => 0,#wsize
|
|
444 slice => $slice,
|
|
445 });
|
|
446 push @features, $feature;
|
|
447
|
|
448 if($end > $slice_end){
|
|
449 #This will also capture last feature which may not be part of current slice
|
|
450 $self->overhang_features($feature);
|
|
451 }
|
|
452 }
|
|
453 else{#feature is past end of current slice
|
|
454 $parse = 0;
|
|
455 $self->last_line($line);#But maybe part of next slice chunk
|
|
456 }
|
|
457 }
|
|
458 }
|
|
459 elsif($seen_chr){
|
|
460 #We have reached the end of the chromsome!
|
|
461 $self->last_line($line);#in case we are parsing slice serially
|
|
462 $parse = 0;
|
|
463 }
|
|
464 }
|
|
465
|
|
466 $self->last_slice($slice);
|
|
467 #$self->log("Added logging of parsing (seen = $seen_chr) for memory footprinting through file", 'logmemflag');
|
|
468
|
|
469 return \@features;
|
|
470 }
|
|
471
|
|
472 #Move these potentially generic methods to InputSet Parser for use by other Parsers
|
|
473
|
|
474
|
|
475 sub last_line{
|
|
476 my ($self, $lline) = @_;
|
|
477
|
|
478 $self->{'last_line'} = $lline if defined $lline;
|
|
479 return $self->{'last_line'};
|
|
480
|
|
481 }
|
|
482
|
|
483 sub last_slice{
|
|
484 my ($self, $lslice) = @_;
|
|
485
|
|
486 $self->{'last_slice'} = $lslice if $lslice;
|
|
487 return $self->{'last_slice'};
|
|
488 }
|
|
489
|
|
490
|
|
491 sub overhang_features{
|
|
492 my ($self, $feature) = @_;
|
|
493
|
|
494 push @{$self->{'overhang_features'}}, $feature if $feature;
|
|
495
|
|
496 return $self->{'overhang_features'};
|
|
497
|
|
498 }
|
|
499
|
|
500 1;
|