comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Bed.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 #
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Bed
3 #
4
5 =head1 LICENSE
6
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
8 Genome Research Limited. All rights reserved.
9
10 This software is distributed under a modified Apache license.
11 For license details, please see
12
13 http://www.ensembl.org/info/about/code_licence.html
14
15 =head1 CONTACT
16
17 Please email comments or questions to the public Ensembl
18 developers list at <ensembl-dev@ebi.ac.uk>.
19
20 Questions may also be sent to the Ensembl help desk at
21 <helpdesk@ensembl.org>.
22
23 =head1 NAME
24
25 Bio::EnsEMBL::Funcgen::Parsers::Bed
26
27 =head1 SYNOPSIS
28
29 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Bed";
30 push @INC, $parser_type;
31 my $imp = $class->SUPER::new(@_);
32
33
34 =head1 DESCRIPTION
35
36 This is a definitions class which should not be instatiated directly, it
37 normally set by the Importer as the parent class. Bed contains meta
38 data and methods specific to data in bed format, to aid
39 parsing and importing of experimental data.
40
41 =cut
42
43 #Import/Parser rework
44 #We now have potential to use indexed DBFile and Parsers
45 #Importer should become BaseImporter, inherited from InputSet/Nimblegen
46 #Have Bed(format) importer which sets the generic Bed(format) Parser
47
48 package Bio::EnsEMBL::Funcgen::Parsers::Bed;
49
50 use Bio::EnsEMBL::Funcgen::Parsers::InputSet;
51 use Bio::EnsEMBL::Funcgen::FeatureSet;
52 use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
53 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
54 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(open_file is_bed);
55 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
56 use Bio::EnsEMBL::Funcgen::Utils::Helper;
57 use File::Basename;
58 use strict;
59
60 use Data::Dumper;
61
62 use vars qw(@ISA);
63 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);
64
65 #To do
66 # Extend this so we can import features to annotated_feature
67 # and profiles/reads to result_feature
68 # This will replace individual tables for current bed das sources
69 # Altho' idea of separate tables for each set is not necessarily a bad one
70 # We would have to have extra table or fields details registration
71 # Easier admin i.e. drop tables
72 # What about partitioning? Irrelevant if we are moving to matrix
73 # Not easy to patch dynamically named tables? Can't assign values to user varaible from query!
74 # Would need to be a stored procedure
75 # Could have separate matrix files for result_features as we probably wouldn't want
76 # to do any cross set querying at this level.
77 # Could we also use this in the RunnableDBs?
78 # Simply separate those methods required by both normal bed annotated_feature import
79 # and RunnableDB based annotated_feature import
80 # Importing into result_feature requires a result_set which assumes a chip experiment
81 # We need to alter result_set such that it can be optionally associated with ExperimentalSets
82 # Or should we just go for a different table? read_feature?
83 # NOTE: PARITION by key may run into problems if different sets have different windows
84 # Would also need to modify ResultFeatureAdaptor?
85
86 =head2 new
87
88 Arg[0] : hash containing optional attributes:
89 -bed_reads => 0|1, #Set input as read alignments rather than peak calls (default is 0)
90 Example : my $self = $class->SUPER::new(@_);
91 Description: Constructor method for Bed class
92 Returntype : Bio::EnsEMBL::Funcgen::Parsers::Bed
93 Exceptions : throws if Experiment name not defined or if caller is not Importer
94 Caller : Bio::EnsEMBL::Funcgen::Importer
95 Status : at risk
96
97 =cut
98
99 sub new{
100 my $caller = shift;
101 my $class = ref($caller) || $caller;
102 my $self = $class->SUPER::new(@_, no_disconnect => 1);
103
104 throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly")
105 if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
106
107
108 #This was over-riding InputSet new config
109 #$self->{'config'} =
110 # {(
111 # #order of these method arrays is important!
112 # #remove method arrays and execute serially?
113 # #Just move these to individual Parser register_experiment methods
114 # array_data => [],#['experiment'],
115 # probe_data => [],#["probe"],
116 # results_data => ["and_import"],
117 # norm_method => undef,
118 #
119 # #Need to make these definable?
120 # #have protocolfile arg and just parse tab2mage protocol section format
121 # #SEE NIMBLEGEN FOR EXAMPLE
122 # protocols => {()},
123 # )};
124
125 $self->{'overhang_features'} = []; #Move to InputSet?
126 #Maybe used by other formats
127
128 return $self;
129 }
130
131
132 =head2 set_config
133
134 Example : my $self->set_config;
135 Description: Sets attribute dependent config
136 Returntype : None
137 Exceptions : None
138 Caller : Bio::EnsEMBL::Funcgen::Importer
139 Status : at risk -remove
140
141 =cut
142
143
144 sub set_config{
145 my $self = shift;
146
147 $self->SUPER::set_config;
148 #dir are not set in config to enable generic get_dir method access
149 #Need to define output dir if we are processing reads
150
151 return;
152 }
153
154
155
156 sub pre_process_file{
157 my ($self, $filepath, $prepare) = @_;
158
159 #Test file format
160 throw("Input file is not bed format:\t$filepath") if ! &is_bed($filepath);
161
162
163 #separate sort keys stop lexical sorting of start/end
164 #when faced with a non numerical seq_region_name
165 my $sort = ($prepare || ! $self->prepared) ? 'sort -k1,1 -k2,2n -k3,3n ' : '';
166
167
168 if($self->input_gzipped){
169 $sort .= '|' if $sort;
170 $self->input_file_operator("gzip -dc %s | $sort ");
171 }
172 else{
173 #This is really only required for read alignments
174 $self->input_file_operator("$sort %s |");
175 }
176
177
178 if(! defined $self->output_file && $self->input_feature_class eq 'result'){
179 my ($name) = fileparse($filepath);
180 $name =~ s/\.gz// if $self->input_gzipped;
181
182 if($prepare){
183 #This will be filtered for seq_region_name
184 $self->output_file($self->get_dir('output')."/prepared.${name}.gz");
185 }
186 else{
187 #Not currently used as we use direct import
188 #via AnnotatedFeatures or ResultFeature Collections
189
190 #output_file would only be used DAS read mysqlimport loading
191 #This could also be used by SAM/BAM so put in InputSet
192
193 #Or do we need a file for each discrete set?
194 #Each import consititutes one discrete data set
195 #So this is okay for replicates in a result set
196 #But not for different cell/feature types
197 #Use one file
198 #This is imposed case in INputSet::validate_files
199 #We can't pipe this to gzip as we need to mysqlimport it, which does not support gzip?
200 #gzip -dc file.sql | mysql would work but would be much slower due ti individual inserts
201 #$self->output_file($self->get_dir('output')."/result_feature.${name}.txt");
202 }
203 }
204
205 return $filepath;
206 }
207
208
209 #sub parse_header{
210 # my ($self, $fh) = @_;
211 #
212 #
213 # #This will not work on a sorted file, so would have to
214 # #store header and test match every line!
215 # #just test for track name= for now
216 #
217 #
218 # warn "PARSING HEADER";
219 # my $nr = 0;
220 #
221 # for my $line(<$fh>){
222 # $nr++;
223 #
224 # #my $nr = $fh->input_line_number();#This always return 3451? Length of file?
225 # #This is not yet reliable here!!!!
226 # #Is this because of the gzip sort?
227 # #So let's depend on count?
228 # #If we don't know when the header is supposed to finish (i.e. multi line header)
229 # #We will need to decrement the seek position somehow
230 #
231 # warn "INPUT LINE = $nr $line";
232 #
233 # #exit;
234 # if ($nr == 1){#$INPUT_LINE_NUMBER;
235 # #sanity check here
236 # return if($line =~ /track name=/o);
237 # $self->log(":: WARNING ::\tBED file does not appear to have valid header. First line($nr) will be treated as data:\t$line");
238 # }
239 #
240 # exit;
241 #
242 # }
243 #
244 #
245 # exit;
246 #
247 # return;
248 #}
249
250
251
252 sub parse_line{
253 my ($self, $line, $prepare) = @_;
254
255 #Need to handle header here for bed is always $.?
256 #Also files which do not have chr prefix? i.e. Ensembl BED rather than UCSC Bed with is also half open coords
257
258 #if ($. == 0){#$INPUT_LINE_NUMBER;
259 # #sanity check here
260 return 0 if($line =~ /track name=/o);
261 $line =~ s/\r*\n//o;#chump accounts for windows files
262
263
264 my ($chr, $start, $end, $name, $score, $strand, @other_fields) = split/\s+/o, $line;#Shoudl this not be \t?
265 #Should we define minimum fields or microbed format with no naqme and just score?
266 #my ($chr, $start, $end, $score) = split/\t/o, $line;#Mikkelson hack
267 #Validate variables types here beofre we get a nasty error from bind_param?
268
269 #Any more valid BED fields here?
270 # thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
271 # thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays).
272 # itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser.
273 # blockCount - The number of blocks (exons) in the BED line.
274 # blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
275 # blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
276
277 my $slice = $self->cache_slice($chr, undef, $prepare);
278 #prepare counts total features for RPKM
279 #This also filter slices for those defined
280
281 if(! $slice){
282 return 0;
283 }
284 else{
285 my $sr_name = $slice->seq_region_name;
286 my $slice_name = $slice->name;
287
288
289 if(! $prepare){
290
291 $strand = $self->set_strand($strand);
292 $start += 1 if $self->ucsc_coords;
293
294 # Set name dependantly on input class
295 my %name_param;
296
297 if($self->input_feature_class eq 'segmentation'){
298 #Expand this into pluggable config
299 #defined by field position against the input param name
300
301 #Need to define ftype and analysis config outside
302 #of this parser anyway
303
304 #Can we use similar set up to external parser config
305 #but extract actual config to separate file
306
307
308
309 if(! exists $self->{user_config}{feature_sets}{$self->name}{feature_types}{$name}){
310 #No need to test is valid as we have already done this
311 #just need to make sure we don't initialise the hash key
312 throw("Found segmentation BED name which is not defined in the feature_types".
313 " config for your feature_set:\t$name");
314 }
315
316 #warn "$name ftype is ".$self->{user_config}{feature_sets}{$self->name}{feature_types}{$name};
317 $name_param{'-FEATURE_TYPE'} = $self->{user_config}{feature_sets}{$self->name}{feature_types}{$name};
318
319 #DISPLAY_LABEL Let this get autogenerated?
320
321 }
322 else{#annotated
323 $name_param{'-DISPLAY_LABEL'} = $name;
324 }
325
326
327 $self->{_feature_params} = {
328 -START => $start,
329 -END => $end,
330 -STRAND => $strand,
331 -SLICE => $self->cache_slice($chr),
332 -SCORE => $score,
333 -FEATURE_SET => $self->data_set->product_FeatureSet,
334 %name_param,
335 };
336
337 $self->load_feature_and_xrefs;
338 }
339 }
340
341 return 1;
342 }
343
344
345 #For the purposes of creating ResultFeature Collections
346 #Dependancy on creating features is overkill
347 #altho not critical as this is never used for display
348
349 #Should really move this to InputSet parser
350 #Altho this would require an extra method call per line to parse the record
351
352 sub parse_Features_by_Slice{
353 my ($self, $slice) = @_;
354
355 #Slice should have been checked by now in caller
356 if($slice->strand != 1){
357 throw("Bed Parser does not support parsing features by non +ve/forward strand Slices\n".
358 'This is to speed up generation of ResultFeature Collections for large sequencing data sets');
359 }
360
361 my $slice_chr = $slice->seq_region_name;
362
363 #This method assumes that method calls will walk through a seq_region
364 #using adjacent slices
365
366 #We need to maintain a feature cache, which contains all the features which over hang
367 #the current slice, such that we can include them in the next batch of features returned
368
369 my @features;
370 my $slice_end = $slice->end;
371 my $slice_start = $slice->start;
372 my $last_slice = $self->last_slice;
373 my $last_slice_end = ($last_slice) ? $last_slice->end : ($slice_start - 1);
374 my $last_slice_name = ($last_slice) ? $last_slice->seq_region_name : $slice->seq_region_name;
375 my $rset_id = $self->result_set->dbID;
376
377 if(! ($slice_start == ($last_slice_end + 1) &&
378 ($slice->seq_region_name eq $last_slice_name))){
379 #Need to reopen the file as we are doing a second pass over the same data
380 #This is not guaranteed to work for re-reading sets of slices
381 #This would also not be caught by this test
382
383 #To be safe we need to reset the file handle from the caller context
384
385 throw("Bed parser does not yet support parsing features from successive non-adjacent Slices\n".
386 "Last slice end - Next slice start:\t$last_slice_name:${last_slice_end} - ".
387 $slice->seq_region_name.':'.$slice_start);
388 }
389
390
391 #Deal with 5' overhang first
392 foreach my $feature(@{$self->overhang_features}){
393 $feature = $feature->transfer($slice);
394 push @features, $feature if $feature;#This should always be true
395 }
396
397 $self->{'overhang_features'} = []; #reset overhang features
398 my $fh = $self->file_handle;
399 my ($line, $feature);
400 my $parse = 1;
401 #Add counts here, or leave to Collector?
402 my $seen_chr = 0;
403
404 #This currently parses the rest of the file once we have seen the data we want
405
406 while((defined ($line = <$fh>)) && $parse){
407 #This does not catch the end of the file!
408
409
410 if($self->last_line){#Deal with previous line first
411 $line = $self->last_line;
412 $self->last_line('');
413 }
414 else{
415 $line = <$fh>;
416 }
417
418 #Still need to chump here in case no other fields
419 $line =~ s/\r*\n//o if $line;#chump accounts for windows files
420
421 if(! $line){
422 warn("Skipping empty line");
423 next;
424 }
425
426 #We could use a generic method to parse here
427 #But it is small enough and simple enough to have twice
428 my ($chr, $start, $end, $name, $score, $strand, @other_fields) = split/\s+/o, $line;#Shoudl this not be \t?
429
430 if($slice_chr eq $chr){#Skim through the file until we find the slice
431 $seen_chr = 1;
432 if($end >= $slice_start){
433
434 if($start <= $slice_end){#feature is on slice
435
436 $feature = Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast
437 ({
438 start => ($start - $slice_start + 1),
439 end => ($end - $slice_start + 1),
440 strand => $strand,
441 scores => [$score],
442 result_set_id => $rset_id,
443 window_size => 0,#wsize
444 slice => $slice,
445 });
446 push @features, $feature;
447
448 if($end > $slice_end){
449 #This will also capture last feature which may not be part of current slice
450 $self->overhang_features($feature);
451 }
452 }
453 else{#feature is past end of current slice
454 $parse = 0;
455 $self->last_line($line);#But maybe part of next slice chunk
456 }
457 }
458 }
459 elsif($seen_chr){
460 #We have reached the end of the chromsome!
461 $self->last_line($line);#in case we are parsing slice serially
462 $parse = 0;
463 }
464 }
465
466 $self->last_slice($slice);
467 #$self->log("Added logging of parsing (seen = $seen_chr) for memory footprinting through file", 'logmemflag');
468
469 return \@features;
470 }
471
472 #Move these potentially generic methods to InputSet Parser for use by other Parsers
473
474
475 sub last_line{
476 my ($self, $lline) = @_;
477
478 $self->{'last_line'} = $lline if defined $lline;
479 return $self->{'last_line'};
480
481 }
482
483 sub last_slice{
484 my ($self, $lslice) = @_;
485
486 $self->{'last_slice'} = $lslice if $lslice;
487 return $self->{'last_slice'};
488 }
489
490
491 sub overhang_features{
492 my ($self, $feature) = @_;
493
494 push @{$self->{'overhang_features'}}, $feature if $feature;
495
496 return $self->{'overhang_features'};
497
498 }
499
500 1;