comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: Collector.pm,v 1.7 2011/01/10 11:27:34 nj1 Exp $
2
3
4 =head1 LICENSE
5
6 Copyright (c) 1999-2011 The European Bioinformatics Institute and
7 Genome Research Limited. All rights reserved.
8
9 This software is distributed under a modified Apache license.
10 For license details, please see
11
12 http://www.ensembl.org/info/about/code_licence.html
13
14 =head1 CONTACT
15
16 Please email comments or questions to the public Ensembl
17 developers list at <ensembl-dev@ebi.ac.uk>.
18
19 Questions may also be sent to the Ensembl help desk at
20 <helpdesk@ensembl.org>.
21
22 =cut
23
24 #Your Bio::Ensembl::Collection::Feature defs module should inherit from here
25 #This could be a local defs file which you have created and require'd into your script
26
27 #If your collections defs module refers to a Bio::EnsEMBL::Feature,
28 #then it's adaptor should inherit from the collections defs module
29
30
31
32 package Bio::EnsEMBL::Funcgen::Collector;
33 #Move this to Bio::EnsEMBL::Utils::Collector for 59?
34
35
36 use strict;
37 use warnings;
38
39 use Bio::EnsEMBL::Utils::Argument ('rearrange');
40 use Bio::EnsEMBL::Utils::Exception ('throw');
41 use Bio::EnsEMBL::Funcgen::ResultFeature;
42
43 #use base('Bio::EnsEMBL::Collection');#ISA
44
45 our ($pack_template, $packed_size, @window_sizes); #These get set in the FeatureAdaptor
46 #Make these constants and remove setter functionality in methods?
47 #Only really important for pack template and windows, maybe these if we are going to start
48
49
50 our $max_data_type_size = 16777216; #Default is 16MB for long blob
51 #we need to deduct the size of the rest of the record here!
52 #For a 2byte packet the smallest window size possible is:
53 #(slice->length/(16777216/2)
54 #so int(bin_size)+1
55 #Obviously have to use the largest slice here, for human chr1:
56 #249,250,621/(16777216/2) = 29.7???
57 #We may need to up this slightly to account for larger chrs?
58 #Implications on memory usage? Is it 4 times for blob manipulation?
59 #Does substr require this manipulation?
60 #This max_allowed_packet_size does not seem to translate directly to the size of the
61 #data being stored e.g. quite a bit more is needed. ISG haven't got to the bottom of this yet.
62 #But have simply upped the config to 67108864 to handle the largest human chr.
63
64 our $max_view_width = 500000;#Max width in Region In Detail;
65
66
67 #our %VALID_BINNING_METHODS
68 #Remove this in favour of can->('calculate_.$method) and coderefs?
69
70
71
72
73 #To do
74 # 1 DONE Merge in Collection code, (no need to do this, removed inheritance)
75 # 2 Write simple BED input to flat file output.
76 # 3 Separate store method so we can simply get, then wrap store around this
77 # 4 Test get method with slice adjusts
78 # 5 separate set_config?
79 # 6 optimise generate_bin_chunks to handle just one window size for display?
80 # 7 Handle packed_size pack_template as methods constants
81 # 8 Provide override method in basefeature adaptor which will use package constant in feature adaptor
82 # This is because these are really adaptor config, the collector only needs to know the
83 # packed_size, and in the absence of an feature adaptor also provides the default methods for both.
84 # If we substr in the API then we need to set sensible limits on blob size, otherwise we will have to unpack a lot of data
85 # to get at the slice we want.
86 # OR
87 # Change adaptor to substr in DB based on known blob ranges/window size
88 # and stitch together any which cross boundaries. This depends on speed of substr at end of large blob TEST!
89 # Load with current code first and test this before making either change!
90 # Delete empty (non-0) collections? i.e. For seq_regions which do not have any source features.
91 #
92 # 9 Handle PAR/HAP regions using fetch_normalised_slice_projections This has to be done in the feature adaptor! Then restrict to non_dup regions in calling script
93
94
95
96 =head2 new
97
98 Args : None
99 Example : my $collector = Bio::EnsEMBL::(Funcgen|Compara|Variation::)Collector::FEATURE->new;
100 $collector->store_windows_by_Slice($slice);
101 Description: Simple new method to enable use of collector when not inherited by
102 a descendant of Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor
103 Returntype : Bio::EnsEMBL::Funcgen::Collector
104 Exceptions : None
105 Caller : Collector script
106 Status : At Risk
107
108 =cut
109
110 sub new{
111 return bless {}, $_[0];#Simple blesses this class as an empty hash
112
113 #Do not set anything here
114 #As will not be first in ISA for feature adaptors
115 #Hence not guaranteed to be called
116
117 }
118
119 #Setter/Getter methods if we don't have a dedicated Collector module
120 #to set the package variables in? Also to allow overriding of defaults.
121 #This can be used by the write_collection method
122 #to determine when to build and store a compressed collection
123 #Effectively the max size of the data type you are using to store
124 #a compressed score. defaults to max for long blob 16MB
125
126
127
128 #Generic method, but only ever called by write_collection in descendant
129
130 sub new_assembly{
131 my ($self, $new_ass) = @_;
132
133 if($new_ass){
134 #Validate new assm to project to
135
136
137 $self->{'new_assembly'} = $new_ass;
138 }
139
140 return $self->{'new_assembly'};
141 }
142
143
144 sub max_data_type_size{
145 my ($self, $size) = @_;
146
147 #Validate is sensible integer?
148
149 if($size && ! int($size)){
150 throw("max_data_type_size must be a integer of bytes, not $size");
151 }
152 elsif($size){
153 $self->{'max_data_type_size'} = $size;
154 }
155 elsif(! defined $self->{'max_data_type_size'}){
156 #default set at head of this module or in descendant Collector
157 $self->{'max_data_type_size'} = $Bio::EnsEMBL::Funcgen::Collector::max_data_type_size;
158 }
159
160 return $self->{'max_data_type_size'};
161 }
162
163 sub max_view_width{
164 my ($self, $size) = @_;
165
166 #Validate is sensible integer?
167
168 if($size && ! int($size)){
169 throw("max_view_width must be a integer, not $size");
170 }
171 elsif($size){
172 $self->{'max_view_width'} = $size;
173 }
174 elsif(! defined $self->{'max_view_width'}){
175 #default set at head of this module or in descendant Collector
176 $self->{'max_view_width'} = $Bio::EnsEMBL::Funcgen::Collector::max_view_width;
177 }
178
179 return $self->{'max_view_width'};
180 }
181
182
183 sub bins_per_record(){
184 #$collector_class::bins_per_record = ($collector_class::max_data_type_size/$collector_class::packed_size);#This should be done dynamically as we may redefine either of these variables?
185
186 my ($self) = shift;
187
188 return int($self->max_data_type_size/$self->packed_size);
189 }
190
191
192 #The defaults for these should be defined in the feature/format specific Collector descendant
193 #either by specifying the package variables or using config attrs to set methods?
194 #general config should be parsed here.
195 #rename bin_method?
196
197 sub bin_method{
198 my ($self, $method) = @_;
199
200 if($method || ! $self->{'bin_method'}){
201
202 if($method){
203 $self->{'bin_method'} = $method;
204 #should test can here? or validate versus hash?
205 }
206 elsif(! $self->{'bin_method'}){
207
208 if (! defined $Bio::EnsEMBL::Funcgen::Collector::bin_method){
209 throw('Must pass a bin_method in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_method in your Collector');
210 }
211
212 $self->{'bin_method'} = $Bio::EnsEMBL::Funcgen::Collector::bin_method;
213 }
214
215 #or current validate method if we are keeping the method in the if/else block
216
217
218 #if(! $self->can("calculate_${method}"))){
219 #throw("$method is no a valid a valid binning method");
220 #}
221
222 }
223
224 return $self->{'bin_method'};
225 }
226
227 #We could replace this with a hash of bin_methods and models?
228 #This could then be used to validate
229 #Altho if we are going to commodotise the bin methods, then we need to be able to
230 #define this in the child Collector. Could still do this by modifying the method/model
231 #hash from the child Collector
232
233 sub bin_model{
234 my ($self, $bin_model) = @_;
235
236 if($bin_model || ! $self->{'bin_model'}){
237
238 if($bin_model){
239 $self->{'bin_model'} = $bin_model;
240 }
241 elsif(! $self->{'bin_model'}){
242
243 #Set as global constant defined in descendant Collector
244 if (! defined $Bio::EnsEMBL::Funcgen::Collector::bin_model){
245 throw('Must pass -bin_model in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_model in your Collector');
246 }
247
248 $self->{'bin_model'} = $Bio::EnsEMBL::Funcgen::Collector::bin_model;
249 }
250
251 #Need to validate bin models here
252 throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models') if $self->{'bin_model'} ne 'SIMPLE';
253 }
254
255 return $self->{'bin_model'};
256 }
257
258 #This can be overridden by adaptor method
259 #At present this could cause problems as we can pass window sizes in the config, but they will never be set
260 #as adaptor method is not a setter. Adaptor method should throw if we try and set them as this could cause problems when fetching and not knowing the custom sizes?
261
262 sub window_sizes{
263 my ($self, $sizes) = @_;
264
265 if($sizes || ! $self->{'window_sizes'}){
266
267 if($sizes){
268 $self->{'window_sizes'} = $sizes;
269 }
270 else{#! $self->{'windows_sizes'
271
272 if (! @window_sizes){
273 throw('Must pass -windows_sizes in the config or define @Bio::EnsEMBL::Funcgen::Collector::window_sizes in your Collector');
274 }
275
276 @{$self->{'window_sizes'}} = @window_sizes;
277 }
278
279 if(ref($self->{'window_sizes'}) ne 'ARRAY' ||
280 scalar(@{$self->{'window_sizes'}}) == 0){
281 throw('window_sizes must be an arrayref of at least one window size');
282 }
283 }
284
285 return $self->{'window_sizes'};
286 }
287
288
289
290
291 #Optional attrs dependant on whether Collection is packed
292 #Can be redefined in the adaptor but becareful never to redefine the actual values
293 #As these should really be constants for a given Collector
294 #What is best here? We only want pack methods for storing/fetching compressed collections
295 #Move this to base feature adaptor and define attrs as constants using
296 #package variable? Or directly in new?
297 #Then direct modification will be caught.
298 #Just leave here for now.
299
300 #Caller _obj_from_sth/store
301
302 sub pack_template{
303 my ($self, $template) = @_;
304
305 if($template){
306 $self->{'pack_template'} = $template;
307 }
308 elsif(! $self->{'pack_template'}){
309
310 #Set as global constant defined in descendant Collector
311
312 if (! defined $Bio::EnsEMBL::Funcgen::Collector::pack_template){
313 throw('Must pass a per score pack_template in the config or define $Bio::EnsEMBL::Funcgen::Collector::pack_template in your Collector');
314 }
315
316 $self->{'pack_template'} = $Bio::EnsEMBL::Funcgen::Collector::pack_template;
317 }
318
319 return $self->{'pack_template'};
320
321 }
322
323 #Caller _obj_from_sth/store & current_packed_size
324
325 sub packed_size{
326 my ($self, $size) = @_;
327
328 if($size){
329
330 if(! int($size)){
331 throw("$size is not an integer, must pass a size integer for packed_size which specifies size of pack_template:\t".$self->pack_template);
332 }
333
334 $self->{'packed_size'} = $size;
335 }
336 elsif(! $self->{'packed_size'}){
337
338 #Set as global constant defined in descendant Collector
339
340 if (! defined $Bio::EnsEMBL::Funcgen::Collector::packed_size){
341 throw('Must pass a packed_size(wrt to pack_template) in the config or define $Bio::EnsEMBL::Funcgen::Collector::packed_size in your Collector');
342 }
343
344 $self->{'packed_size'} = $Bio::EnsEMBL::Funcgen::Collector::packed_size;
345 }
346
347 return $self->{'packed_size'};
348
349 }
350
351 #These methods are used by the descendant Collector
352 #For caching infor whilst building collections
353 #This is used to log how big a collection has grown before storing
354
355 sub current_packed_size{
356 my ($self, $wsize) = @_;
357
358 #$self->{'current_packed_size'}{$wsize} ||= 0;
359
360 #if(defined $cps){
361 # $self->{'current_packed_size'}{$wsize} = $cps;
362 # }
363 # else{
364 # return $self->{'current_packed_size'}{$wsize};
365 # }
366
367 return (scalar(@{$self->score_cache($wsize)})*$self->packed_size);
368
369 }
370
371
372 sub score_cache{
373 my ($self, $wsize, $scores) = @_;
374
375 $self->{'score_cache'}{$wsize} ||= [];
376
377 if(defined $scores){
378 push @{$self->{'score_cache'}{$wsize}}, @{$scores};
379 }
380 else{
381 #Do this here to stop passing the ref everytime
382 #Will this be faster?
383 #Would certainly be faster if we were not returning a ref
384 return $self->{'score_cache'}{$wsize};
385 }
386 }
387
388 #These last methods are only used for the 0 wsize
389 #natural resolution and ar wrt the orig_slice passed
390 #to store_windows_by_Slice
391
392 sub collection_start{
393 my ($self, $wsize, $sr_start) = @_;
394
395 if(defined $sr_start){
396 $self->{'collection_start'}{$wsize} = $sr_start;
397 }
398 else{
399 return $self->{'collection_start'}{$wsize};
400 }
401 }
402
403 sub collection_end{
404 my ($self, $wsize, $sr_end) = @_;
405
406 if(defined $sr_end){
407 $self->{'collection_end'}{$wsize} = $sr_end;
408 }
409 else{
410 return $self->{'collection_end'}{$wsize};
411 }
412 }
413
414 sub collection_strand{
415 my ($self, $wsize, $strand) = @_;
416
417 if(defined $strand){
418 $self->{'collection_strand'}{$wsize} = $strand;
419 }
420 else{
421 return $self->{'collection_strand'}{$wsize};
422 }
423 }
424
425
426
427 =pod
428
429 sub _create_feature {
430 my ( $this, $feature_type, $args ) = @_;
431
432 my $feature = $this->SUPER::_create_feature( $feature_type, $args );
433
434 if ( !$this->_lightweight() ) {
435 my ( $phase, $end_phase, $stable_id, $version, $created_date,
436 $modified_date, $is_current )
437 = rearrange( [ 'PHASE', 'END_PHASE',
438 'STABLE_ID', 'VERSION',
439 'CREATED_DATE', 'MODIFIED_DATE',
440 'IS_CURRENT'
441 ],
442 %{$args} );
443
444 push( @{$feature},
445 $phase, $end_phase, $stable_id, $version, $created_date,
446 $modified_date, $is_current );
447 }
448
449 return $feature;
450 }
451
452 sub _create_feature_fast {
453 my ( $this, $feature_type, $args ) = @_;
454
455 my $feature =
456 $this->SUPER::_create_feature_fast( $feature_type, $args );
457
458 return $feature;
459 }
460
461
462 #This might not be sensible for Features which are split across tables
463
464 sub _tables {
465 my ($this) = @_;
466
467 my @tables = $this->SUPER::_tables();
468
469 if ( $this->_lightweight() ) {
470 return ( $tables[0] );
471 }
472
473 return @tables;
474 }
475
476
477 sub _columns {
478 my ($this) = @_;
479
480 my @columns = $this->SUPER::_columns();
481
482 if ( $this->_lightweight() ) {
483
484 #What is this doing?
485 #Probably not sensible for ResultFeature
486 @columns[ 5 .. $#columns ] = map( 1, 5 .. $#columns );
487 }
488
489 return @columns;
490 }
491
492 #Also not sensible for objects spread across several tables
493
494 sub _default_where_clause {
495 my ($this) = @_;
496
497 if ( $this->_lightweight() ) {
498 return '';
499 }
500
501 return $this->SUPER::_default_where_clause();
502 }
503
504 =cut
505
506
507
508 #This need to be generic
509 #Again we need to pass an accessor method/reference?
510 #Will be some sort of generic fetch for feature adaptors
511 #or while loop for in flat file accessor
512 #rollback to be handled in caller?
513
514 # To do
515 #
516 # 1 Allow variable chunks lengths (so we only have one resolution of windows?)
517 # This will allow SNP collections which currently define classification i.e colour
518 # Density of SNPs within window will define shading. Count will be displayed in zmenu
519 # This maybe something we have to do in the descendant
520 #
521 # 2 Implement collection param definition in/from descendant
522
523
524 # return collection config from adaptor fetch
525 # window size
526 # fixed width?
527 # render/collection style?
528 # This chould be implemented in BaseFeatureAdaptor::generic_fetch?
529 # Or could be done in the calling fetchmethod?
530 #
531
532 #need to change this to get_window_bin_by_Slice
533 #to enable generating bins on uncompressed data
534 #Need to remove all counts and store based code to store caller
535 #this would mean removing any pack based code too
536 #separate set_config method
537
538
539 #Probelm here is size of slice?
540 #We need to generate bins all in one go, but also need to store at interval
541 #so as not to explode memory
542 #Do we need to separate the window generation from the bin generation code?
543
544
545 #Define the optimal way to generate windowed data by
546 #finding the most common denominator
547
548 sub _define_window_chunks{
549 my ($self, $window_sizes, $max_view_size) = @_;
550
551 ### DEFINE CHUNKS WRT WINDOWS
552
553 #Shortcut for on the fly uncompressed collection retrieval
554 #if(scalar(@wsizes) = 1){
555 #
556 #}
557 #else{
558
559 #Calulate sensible slice length based on window sizes
560 my @wsizes = sort {$a <=> $b} @$window_sizes;
561
562 #We need a default when only calculating 0 resolution
563 #Will binning code work with only 0 resolution?
564 if((scalar(@wsizes) == 1) &&
565 $wsizes[0] == 0){
566 return { $self->max_view_width => [0] };
567 }
568
569
570 my $multiplier = int($max_view_size/$wsizes[$#wsizes]);
571 my $chunk_length = $multiplier * $wsizes[$#wsizes];
572 my $not_divisible = 1;
573 my %chunk_windows;#Registry of chunk lengths to run with windows
574 my %workable_chunks = map {$_ => {}} @wsizes;
575 delete $workable_chunks{'0'};#get rid of natural resolution as this will always work
576
577
578 while($not_divisible && $chunk_length != 0){
579 $not_divisible = 0;
580
581 foreach my $wsize(@wsizes){
582 next if $wsize == 0;#Special wsize for normal data
583
584 #Set not divisible if modulus is true
585 if($chunk_length % $wsize){
586 $not_divisible = 1;
587 }
588 else{
589 #No need to listref here?
590 $workable_chunks{$wsize}{$chunk_length} = [];
591 }
592
593 #warn "chunk length is $chunk_length and not_divisible is $not_divisible";
594 }
595
596 #Gradually shrink the length until we find a workable slice length for all windows
597 $chunk_length -= $wsizes[$#wsizes] if $not_divisible;
598 }
599
600 my %chunk_sets;
601
602
603 if($chunk_length == 0){
604 print "Could not find chunk length for all window sizes, attempting to subset windows using alternate slice length\n";
605
606 foreach my $wsize(keys %workable_chunks){
607 #Loop through windows, seeing if they are workable in the other windows
608
609 foreach my $chunk(keys %{$workable_chunks{$wsize}}){
610
611 foreach my $other_wsize(keys %workable_chunks){
612 next if $wsize == $other_wsize;
613
614 if(exists $workable_chunks{$other_wsize}{$chunk}){
615 #only push it onto the other wsize, as we will do the reverse later
616 $chunk_sets{$chunk}{$wsize} = undef;
617 }
618 }
619 }
620 }
621
622 #Now we have a register of co-occurence of wsizes with repect to chunks
623 #Loop through finding the least amount of sets with the longest chunk length?
624 #There is no way to decide which is best?
625 #we could calculate the number of loops? Factored by the chunk length?
626 #Let's just print out and see what we get
627
628 #warn "chunk sets are :\n".Data::Dumper::Dumper(\%chunk_sets);
629
630
631 #For now let's just take the one which has the most windows and the longest chunk
632 #Then we just get the largest which handles the rest.
633
634 #define possible set lengths
635 my $i = 0;
636 my %set_lengths;
637 map {$set_lengths{$i} = []; $i++} @wsizes;
638 delete $set_lengths{'0'};#get rid of natural resolution as this will always work
639
640
641 #store chunks lengths for each set size
642 foreach my $chunk(keys %chunk_sets){
643 my $set_size = scalar(values %{$chunk_sets{$chunk}});
644 push @{$set_lengths{$set_size}}, $chunk;
645 }
646
647 #Now we get the biggest set with the longest length;
648 my $largest_size = scalar(@wsizes);#scalar here as we are disregarding natural resolution of 0 in loop
649 my $found_largest_set = 0;
650
651 while(! $found_largest_set){
652 $largest_size--;
653
654 if(scalar(@{$set_lengths{$largest_size}}>0)){
655 $found_largest_set = 1;
656 }
657 }
658
659
660 #We should be able to loop this bit, to find all the biggest sets.
661 my ($largest_chunk) = sort {$b<=>$a} @{$set_lengths{$largest_size}};
662 #we could even be selective here, but let's just take the first one for now
663
664 my @largest_windows = keys %{$chunk_sets{$largest_chunk}};
665 @{$chunk_windows{$largest_chunk}} = @largest_windows;
666 print "Largest chunk $largest_chunk($largest_size) contains windows: @largest_windows\n";
667
668 my %remaining_windows = map {$_ => {}} @wsizes;
669 delete $remaining_windows{'0'};#get rid of natural resolution as this will always work
670 map { delete $remaining_windows{$_} } @largest_windows;
671 my $remaining_set_size = scalar(keys %remaining_windows);
672
673 #swapping to array here for practicality, would need to maintain hash if we need to iterate
674 my @rwindows = keys %remaining_windows;
675
676 #This could just be one window, but this will not be inthe co-occurence hash %chunk_sets
677 #Hence the normal approach will not work. and we just want to find a suitably large chunk for this one window.
678 my $next_chunk;
679
680 if(scalar(@rwindows) == 1){
681 #we just want to find a suitably large chunk for this one window.
682 my ($last_window) = @rwindows;
683
684 $multiplier = int(500000/$last_window);
685 $next_chunk = $multiplier * $last_window;
686 }
687 else{
688 #Now were are doing something very similar to above
689 #populating a set_size chunk length registry
690 #my %seen_hash;
691
692
693 foreach my $chunk(sort {$b<=>$a} @{$set_lengths{$remaining_set_size}}){
694 my $seen_count = 0;
695
696 foreach my $rwindow(@rwindows){
697
698 $seen_count++ if grep/$rwindow/, (values %{$chunk_sets{$chunk}});
699 }
700
701 if ($seen_count == $remaining_set_size){
702
703 $next_chunk = $chunk;
704 last;
705 }
706 }
707 }
708
709 @{$chunk_windows{$next_chunk}} = @rwindows;
710
711
712 if($next_chunk){
713
714 print "Found next chunk length $next_chunk contains remaining windows:\t@rwindows\n";
715
716 #Now we want to cycle through all the set lengths which could contain the ones not in the first
717 #so we need to
718 }
719 else{
720 warn "Need to write iterative sub for set definition";
721 throw('Could not find workable slice length for remaining windows: '.
722 join(', ', @rwindows));
723
724 }
725 }
726 else{
727 @{$chunk_windows{$chunk_length}} = keys(%workable_chunks);
728 print "Found workable chunk length($chunk_length) for all window sizes:\t".
729 join(' ', @{$chunk_windows{$chunk_length}})."\n";
730 }
731
732 return \%chunk_windows;
733 }
734
735
736 #Let's concentrate on store function first before we split out into store and fetch methods
737 #How will this work with the Bed parser?
738 #The descendant collector will sort the input and detect the current slice before calling
739 #store_window_bins_by_Slice. This may require some caching of line or seeking as we will see the next slice before we have a chance to set it.
740 #This will store as ResultFeature collections, so maybe we need to separate the input from output code?
741 #i.e. Bed parser/wrapper
742 # ResultFeatureAdaptor wrapper
743 #These
744
745 #Problem with passing window_sizes here
746 #We need to check that they aren't already defined a class variables as this could potentially
747 #screw up retrieval, expect for only 0 or all but 0
748 #Should we remove this config and force the class variable to be set in the 'adaptor'
749 #Method is then only used internally, make private or only getter? Set by changing class vars?
750
751 sub store_window_bins_by_Slice{
752 my ($self, $slice, %config) = @_;
753
754 my ($window_sizes, $logic_name, $bin_method, $fetch_method_ref, $max_view_width,
755 $max_data_type_size, $pack_template, $packed_size, $bin_model, $new_assm, $skip_zero_window) =
756 rearrange( [ 'WINDOW_SIZES', 'LOGIC_NAME', 'BIN_METHOD', 'FETCH_METHOD_REF', 'MAX_VIEW_WIDTH', 'MAX_DATA_TYPE_SIZE', 'PACK_TEMPLATE', 'PACKED_SIZE', 'BIN_MODEL', 'NEW_ASSEMBLY', 'SKIP_ZERO_WINDOW'], %config );
757
758 warn "Need to be careful here about cleaning start end strand caches between serially run slices";
759
760
761
762 ### VAILDATE VARS/CONFIG
763
764 #This could be done once in set_config, could then remove setter bahviour from attr methods?
765 #All default defs params/methods can be overridden by config params
766 #Attrs used in this method
767 $bin_method = $self->bin_method($bin_method);
768 $bin_model = $self->bin_model($bin_model);
769 #$window_sizes = $self->window_sizes($window_sizes);#Now done below
770 #Set to undef if we ave empty array
771 $window_sizes = undef if (ref($window_sizes) eq 'ARRAY' && scalar(@$window_sizes) == 0);
772 #Attrs used in other (store) methods
773 $self->pack_template($pack_template);
774 $self->packed_size($packed_size);
775 $self->max_data_type_size($max_data_type_size);
776 $self->max_view_width($max_view_width);
777
778 #Other vars
779 $self->new_assembly($new_assm);
780
781 #Need to validate slice here
782
783 warn "temp hack for bin_method validation";
784 $bin_method = $self->validate_bin_method($bin_method);
785
786 ### Set window_sizes
787
788 if($self->new_assembly){
789 print "Assembly projection may cause problems for large Collections, defaulting to window_sizes = (0)\n";
790 #Then build the bins on the projected 0 level single ResultFeatures
791
792 #Test we haven't explicity set window_sizes to be soemthing else
793
794 if($window_sizes &&
795 ! ( scalar(@$window_sizes) == 1 && $window_sizes[0] == 0)){
796
797 throw("You have set window_sizes config which are not safe when projecting to a new assembly($new_assm), please omit window_sizes config or set to 0");
798
799 }
800 $window_sizes = $self->window_sizes([0]);
801 }
802 else{
803
804 if($window_sizes && $skip_zero_window && grep/^0$/,@$window_sizes){
805 throw("You have specied skip_zero_window and window_size 0 in your config, please remove one of these");
806 }
807 elsif($window_sizes && ! grep/^0$/,@$window_sizes){
808 $skip_zero_window = 1;
809 unshift @$window_sizes, 0;#re-add 0 window as we need this to build the collections
810 }
811
812 $window_sizes = $self->window_sizes($window_sizes);
813 }
814
815
816 #This is already done in the script
817 if($skip_zero_window && $new_assm){
818 throw("You cannot -skip_zero_window or omit 0 from -window_sizes when projecting to a new assembly($new_assm) which should only be generated using window_size=0");
819 }
820
821
822
823 ### Rollback previously stored features
824
825 if($self->can('rollback_Features_by_Slice')){
826 $self->rollback_Features_by_Slice($slice);
827 }
828 else{
829 #This is currently the only warn output we can't get rid off
830 warn ref($self)." cannot rollback_Features_by_Slice. This may result in duplicate Collections being stored if there is pre-existing data";
831 }
832
833
834
835 ### PROCESS CHUNKS
836
837 #Not lightweight as we will be storing them
838 # Temporarily set the collection to be lightweight???
839 #my $old_value = $this->_lightweight();
840 #if ( defined($lightweight) ) { $this->_lightweight($lightweight) }
841 #else { $this->_lightweight(1) }
842
843 my %chunk_windows = %{$self->_define_window_chunks($self->window_sizes, $self->max_view_width)};
844 my (%counts, $store_natural);
845 $store_natural = grep/^0/, @$window_sizes;
846 $counts{0}=0;#Set natural res count to 0
847 my $slice_end = $slice->end;
848 my $orig_slice = $slice;
849 my $orig_start = $slice->start;
850 #my $slice_adj = $slice->start - 1;#Removed this as we are now generating features local to orig_slice
851 #start/end conversion will be done in write/store_collection
852 my $region = $slice->coord_system_name;
853 my $version = $slice->coord_system->version;
854 my $seq_region_name = $slice->seq_region_name;
855 my $strand = $slice->strand;
856 my $only_natural = 0;
857 #my $slice_adj = 0;
858
859
860 #We need to account for only 0 here when doing projection
861 #The chunk window is set to max_view_widht in _define_chunk_windows
862
863 $only_natural = 1 if $store_natural && scalar(@$window_sizes) == 1;
864 $store_natural = 0 if $skip_zero_window;
865 #SHould really test these two, but should already be caught by now
866
867 #Set the initial collection_start to orig_start
868 #Could default to 1, but we may not be starting from 1
869 #This is not the case for 0 wsize where it must always be
870 #The first feature start
871
872
873 for my $wsize(@{$self->window_sizes}){
874
875 next if $wsize == 0;# && $skip_zero_window;#We never want to assume start of 0 window collection
876 $self->collection_start($wsize, $orig_start);
877 }
878
879
880
881 foreach my $chunk_length(sort keys %chunk_windows){
882
883 print "Processing windows ".join(', ', @{$chunk_windows{$chunk_length}}).
884 " with chunk length $chunk_length\n";
885 map $counts{$_} = 0, @{$chunk_windows{$chunk_length}}; #Set window counts to 0
886
887 #Now walk through slice using slice length chunks and build all windows in each chunk
888 my $in_slice = 1;
889 my $start_adj = 0;
890 my ($sub_end, $features, $bins);
891 my $sub_start = 1;
892 my $slice_length = $slice->length;
893
894
895 #Can we subslice and then exclusivly use bin_start(local to orig_slice)
896 #Then we never have to deal with sr coord until we store
897 #This should me we never have to do the sr conversion unless we
898 #use a slice which doesn't start at 1(PAR or test)
899 #Always create in local coords for fetch
900 #Then change to seq_region coords for store if required
901
902
903 while($in_slice){
904 #$sr_start = $slice_start + $start_adj;
905 $sub_start += $start_adj;
906
907 #$slice_start = $sr_start;#Keep for next slice
908 #$sr_end = $sr_start + $chunk_length - 1;
909 $sub_end = $sub_start + $chunk_length - 1;
910
911 #Last chunk might not be the correct window length
912 #Hence why we should do this on whole chromosomes
913 if($sub_end >= $slice_length){
914 #$sub_end = $slice_end;
915 #No longer set to slice end, as we don't want to corrupt the bin definition?
916 #Surplus bins are removed in store/write_collection in caller
917 #We could simply add the largest window the the end of the slice?
918 #Then we will only build the minimum of excess bins?
919 #This should be okay for bin calcs
920 #But may screw up bin trimming in caller as we currently expect $ub_end to be a valid bin end
921 #for all wsizes
922 #bin trimming should handle this, but this will corrupt the bin definition???
923 #bin definition is depedant on method
924 #So this method need to be agnostic
925 #And deal with the rest in descendant
926 $in_slice = 0;
927 }
928
929
930 $slice = $slice->adaptor->fetch_by_region($region, $seq_region_name, ($sub_start + $orig_start -1), ($sub_end + $orig_start - 1), $strand, $version);
931 #Can't subslice as this will not clip if we go over the length of the slice, unlike normal slice fetching
932 #hence we cannot rely on this
933 #$slice = $orig_slice->sub_Slice($sub_start, $sub_end, $orig_slice->strand);
934 #warn "got sub slice $slice as $sub_start - $sub_end from ".$orig_slice->name;
935
936
937 ### Grab features and shift chunk coords
938 #features may already be a 0 wsize collection if we have projected from an old assembly
939 #Could move this check to get_Features_by_Slice?
940
941 #e.g. [ $features, \%config ]
942 $features = $self->get_Features_by_Slice($slice);
943 #next if scalar(@$features) == 0;#We want to store values for all windows
944
945 if( (@$features) &&
946 (ref($features->[0]) =~ /Bio::EnsEMBL::Funcgen::Collection/) ){#Change to isa 'Bio::EnsEMBL::Collection
947
948 #Check that the returned feature/collections support window_size
949
950 if($features->[0]->can('window_size')){
951
952 if($features->[0]->window_size != 0){
953 throw("You are trying to generated Collections from a non-zero window sized Collection:\t".$features->[1]->{'window_size'});
954 }
955
956 #This should never happen
957 if(! $skip_zero_window){
958 throw('You have retrieved data from a Collection which without using -skip_zero_window i.e. you are trying to generate overwrite the data you are generating the Collections from');
959 }
960 }
961 else{
962 throw('Something si wrong, the Collection you have retrieved does not support the method window_size');
963 }
964 }
965
966
967
968 #Set collection start here for 0 window_size
969 if(@$features && $store_natural && ! defined $self->collection_start(0)){
970 $self->collection_start(0, ($features->[0]->start + $sub_start));
971 }
972
973
974
975 $start_adj = $chunk_length if($in_slice);
976
977
978
979 #This should return a hash of window size => bin array pairs
980 if(! $only_natural){
981
982 $bins = $self->_bin_features_by_window_sizes(
983 -slice => $slice,
984 -window_sizes => $chunk_windows{$chunk_length},
985 -bin_method => $bin_method,
986 -features => $features,
987 );
988
989
990
991
992 }
993
994 #my $bin_start = $sr_start + $slice_adj;#This was only required for storing individual bins
995 #Could calc bin_start + slice_adjust ahere for all features
996 #Doing this will break old code for single window collections
997
998 #This is sr start and should be local to orig_slice!
999
1000
1001
1002
1003 #We need to handle strandedness of slice!?
1004
1005 #Store all normal features in result_feature
1006
1007
1008
1009 if($store_natural){
1010
1011 foreach my $feature(@$features){
1012 $counts{0}++;
1013 #warn "storing ".join(', ', ($feature->start, $feature->end, $feature->strand, $feature->scores->[0]));
1014
1015
1016 #Should we handle bin trimming here for overhanging slices
1017 #Then counts wil be correct and wont have to do in caller
1018
1019 #We could stop here if the feature seq_region start > orig_slice end
1020 #Current done in write/store_collection
1021 #This may mean working in seq_region values rather than slice values
1022
1023
1024 #write_collection is implemented in descendant e.g. Bio::EnsEMBL::Funcgen::Collector::ResultFeature
1025 #as wrapper to adaptor store method or print to file
1026
1027 #These params need to be generated in a way defined by the descendant
1028 #
1029
1030 if($bin_model eq 'SIMPLE'){
1031 #We need to pass the slice with this so we can sub slice when storing
1032 #the collection and set the start/end to 1 and length of slice
1033 #we still need to store the first start to be able to sub slice correctly
1034
1035 $self->collection_start(0, ($feature->start + $sub_start));
1036
1037 #Need to pass strand for 0 resolution
1038 $self->write_collection(0,
1039 $orig_slice,
1040 #These are now wrt orig_slice
1041 #($feature->start + $sub_start),
1042 ($feature->end + $sub_start),
1043 $feature->strand,
1044 $feature->scores,
1045 );
1046
1047 #We can have problems here if the original score type
1048 #does not match the collected score type
1049 #For max magnitude this is not an issue
1050 #as we take the larget value from the bin
1051 #But for other methods this may not be true
1052 #e.g. count
1053 #Hence, if we want to preserve the 0 window
1054 #We must account for this in the feature collector
1055 #e.g. set_collection_defs_by_ResultSet_window_size?
1056 #Just omit 0 window for reads
1057
1058 }
1059 }
1060
1061 print "Window size 0 (natural resolution) has ".scalar(@{$features})." feature bins for:\t".$slice->name."\n";
1062 }
1063
1064 #Now store bins
1065 # my ($bin_end, $bin_scores);
1066 my $num_bins;
1067
1068 foreach my $wsize(sort keys %{$bins}){
1069 $num_bins = scalar(@{$bins->{$wsize}});
1070 #warn "$num_bins bin scores for $wsize:\t".join(',', @{$bins->{$wsize}});
1071
1072 #Should we handle bin trimming here for overhanging slices
1073 #Then counts wil be correct and wont have to do in caller
1074
1075
1076 $counts{$wsize}+= $num_bins;
1077
1078
1079
1080 #We don't need this loop for collections as we can simply push all the scores at once
1081 #Just use the slice start and end
1082 if($bin_model eq 'SIMPLE'){
1083
1084 $self->write_collection($wsize,
1085 $orig_slice,
1086 #$sub_start,
1087 $sub_end,
1088 $orig_slice->strand,#This is most likely 1!
1089 #Override this woth 0 in descendant Collector if required.
1090 $bins->{$wsize},
1091 );
1092
1093 }
1094 else{
1095 throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models');
1096 #i.e. More than one score
1097 }
1098
1099
1100
1101 # #Reset start and end for new wsize
1102 # $bin_start = $slice->start;
1103 # $bin_end = $slice->start;
1104 #
1105 #
1106 #
1107 # #We don't need this loop for collections as we can simply push all the scores at once
1108 #
1109 #
1110 # foreach my $bin_index(0..$#{$bins->{$wsize}}){
1111 #
1112 #
1113 #
1114 # #default method to handle simple fixed width bin?
1115 # #bin_end need to be defined dependant on the bin type
1116 # #($bin_start) = $self->process_default_bin($bins->{$wsize}->[$bin_index], $wsize);#?
1117 #
1118 #
1119 #
1120 # #either define default bin method in descendant
1121 # #Or can we set a process_bin_method var?
1122 # #No just pass all this info to write collection and handle it there?
1123 #
1124 # #Can we have just predefined rotueines handling different bin types?
1125 # #Simple
1126 # #Simple compressed
1127 # #Clipped
1128 # #This will prevent hanving to make attrs/method for storing persistent start/end/score info
1129 #
1130 #
1131 #
1132 # #Need validate bin_type method
1133 # #Could convert these to numbers for speed as with binning methods
1134 #
1135 # if($bin_model eq 'SIMPLE'){
1136 #
1137 # $bin_scores = $bins->{$wsize}->[$bin_index];
1138 #
1139 # warn "bin scores is $bin_scores";
1140 #
1141 #
1142 # #next if ! $bin_score;#No we're no inc'ing the start ends for bins with no scores
1143 #
1144 # $bin_end += $wsize;
1145 #
1146 # #if($bin_score){#Removed this as we always want to write the score even if it is 0
1147 #
1148 # #This is a little backwards as we are generating the object to store it
1149 # #If we are aiming for speed the maybe we could also commodotise the store method
1150 # #store by args arrays? store_fast?
1151 # #Speed not essential for storing!
1152 #
1153 # #Note: list ref passed
1154 #
1155 # #Don't need to pass all this info for fixed width blob collections
1156 # #Need to write some default handlers depedant on the collection type
1157 # #Simple(original)
1158 # #Simple compressed
1159 # #Multi compressed
1160 # #Clipped uncompressed?
1161 #
1162 #
1163 # $self->write_collection($wsize,
1164 # $orig_slice,
1165 # ($bin_start + $slice_adj),
1166 # ($bin_end + $slice_adj),
1167 # $orig_slice->strand,#This is most likely 0
1168 # $bin_scores,
1169 # );
1170 #
1171 # #Only count if we have a stored(projected?) feature
1172 # $count++;#Change this to attr/method?
1173 # #}
1174 #
1175 # $bin_start += $wsize;
1176 # }
1177 # else{
1178 # throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models');
1179 # }
1180 # }
1181
1182 #warn "Window size $wsize has ".scalar(@{$bins->{$wsize}})." bins";
1183 #$counts{$wsize}+= $count;
1184 }
1185 }
1186
1187 $store_natural = 0; #Turn off storing of natural resolution for next chunk length sets
1188 }
1189
1190 #Now need to write last collections for each wsize
1191
1192 foreach my $wsize(@{$self->window_sizes}){
1193
1194 next if $wsize == 0 && ! $store_natural;
1195 next if $wsize != 0 && $only_natural;
1196
1197 print "Writing final $wsize window_size collection, this may result in slightly different bin numbers from counts due to removing overhanging bins past end of slice\n";
1198
1199 $self->write_collection($wsize, $orig_slice);#store last collection
1200 }
1201
1202
1203 #Print some counts here
1204 foreach my $wsize(sort (keys %counts)){
1205 print "Generated ".$counts{$wsize}." bins for window size $wsize for ".$orig_slice->name."\n";
1206 #Some may have failed to store if we are projecting to a new assembly
1207 #Need collection count here too, but would need methods for this?
1208 }
1209
1210 #Return this counts hash so we can print/log from the caller, hence we don't print in here?
1211
1212 return;
1213 }
1214
1215
1216
1217 =head2 _bin_features_by_window_sizes
1218
1219 Args[0] : Bio::EnsEMBL::Slice
1220 Args[1] : ARRAYREF of window sizes
1221 Args[2] : int - bin method, currently defined by validate_bin_methods
1222 Args[3] : ARRAYREF of Bio::EnsEMBL::Features
1223 Example : $bins = $self->_bin_features_by_window_sizes(
1224 -slice => $slice,
1225 -window_sizes => $chunk_windows{$chunk_length},
1226 -bin_method => $bin_method,
1227 -features => $features,
1228 );
1229 Description: Bins feature scores for a given list of window sizes and predefined method number
1230 Returntype : HASHREF of scores per bin per window size
1231 Exceptions : Throws if bin method not supported
1232 Caller : store_window_bins_by_Slice
1233 Status : At Risk
1234
1235 =cut
1236
1237
1238 #To do
1239 # 1 Remove Bio::EnsEMBL::Feature dependancy? Or just create Features for non adaptor Collectors.
1240 # Is there a way we can skip the object generation in the adaptor completely and just
1241 # pass the values we need?
1242 # 2 Separate methods, so we can define custom methods in descendants?
1243 # 3 Expand %bins model to optionally be one of
1244 # the following dependant on binning method
1245 # Simple: fixed width containing arrays of scores for each window
1246 # Multi: fixed width containing multiple arrays of scores for each window
1247 # Non-simple?: Separate aggregated features, either fixed width or not, not BLOB!
1248 # Clipped: default fixed width with option to clip start and end. Needs start/end attrs
1249 # Can't store this in a blob due to non-standard start ends?
1250 # Most likely want more than one score here? Count/Density SNPs?
1251 # Removes data skew from standard window bins, would need to store each bin and post
1252 # process. Or do in line to avoid 2nd post-processing loop,requires awareness of when
1253 # we have moved to a new bin between features. This holds for overlapping and
1254 # non-overlapping features. Once we have observed a gap we need to clip the end of the
1255 # last bin and clip the start of the new bin. This requires knowing the greatest end
1256 # values from the last bin's feature. what if two overlapping features had the same
1257 # start and different end, would we see the longest last? Check default slice_fetch sort
1258
1259 sub _bin_features_by_window_sizes{
1260 my $this = shift;
1261 my ( $slice, $window_sizes, $method, $features ) =
1262 rearrange( [ 'SLICE', 'WINDOW_SIZES', 'BIN_METHOD', 'FEATURES' ], @_ );
1263
1264
1265 #Do this conditional on the Collection type
1266 #i.e. is collection seq_region blob then no else yes
1267 #if ( !defined($features) || !@{$features} ) { return {} }
1268
1269 #warn 'Processing '.scalar(@$features).' features for window sizes '.join(', ',@$window_sizes).' for slice '.$slice->name."\n";
1270
1271 #Set up some hashes to store data by window_size
1272 my (%bins, %nbins, %bin_counts);
1273 my $slice_start = $slice->start();
1274
1275 #Default handlers for
1276 #my($first_bin);
1277 #if ( $method == 0 || # 'count' or 'density'
1278 # $method == 3 || # 'fractional_count' or 'weight'
1279 # $method == 4 # 'coverage'
1280 # ){
1281 # # For binning methods where each bin contain numerical values.
1282 # $first_bin = 0;
1283 # }
1284 # else {
1285 # # For binning methods where each bin does not contain numerical
1286 # # values.
1287 #
1288 # #Remove this
1289 # $first_bin = undef;
1290 # }
1291
1292
1293 #Set up some bin data for the windows
1294 my $slice_length = $slice->length;
1295
1296 foreach my $wsize (@$window_sizes) {
1297 #TO DO: Need to modify this block if default 0's are undesirable for collection type
1298 #i.e. should it be undef instead? May have prolbems representing undef in blob
1299
1300 $nbins{$wsize} = int($slice_length / $wsize); #int rounds down
1301 #nbins is actually the index of the bin not the 'number'
1302 #Unless slice_Length is a multiple!
1303 $nbins{$wsize}-- if(! ($slice_length % $wsize));
1304
1305 #Create default bins with 0
1306 @{$bins{$wsize}} = ();
1307 map {$bins{$wsize}->[$_] = 0} (0 .. $nbins{$wsize});
1308
1309 #Set bin counts to 0 for each bin
1310 @{$bin_counts{$wsize}} = ();
1311
1312 #This is adding an undef to the start of the array!?
1313 map { $bin_counts{$wsize}->[($_)] = 0 } @{$bins{$wsize}};
1314
1315 foreach my $bin(@{$bins{$wsize}}){
1316 $bin_counts{$wsize}->[$bin] = 0;
1317 }
1318 }
1319
1320 #warn "bin_counts are :\n".Data::Dumper::Dumper(\%bin_counts);
1321 #This fails for slices which are smaller than the chunk length;
1322 my $feature_index = 0;
1323 my ($bin_index, @bin_masks);
1324
1325 foreach my $feature ( @{$features} ) {
1326 #Set up the bins for each window size
1327
1328 #Omit test for Bio::EnsEMBL::Feature here for speed
1329 #Only needs start/end methods
1330
1331 foreach my $wsize (@$window_sizes) {
1332
1333 #We have already highjacked the object creation by here
1334 #This is done in core BaseFeatureAdaptor
1335 #We probably don't want to do this for ResultFeatures as we don't use the
1336 #standard feature implementation
1337 #we already use an array and we don't store the slice
1338 #as this is already known by the caller
1339 #and we always build on top level so we don't need to remap
1340
1341 #We do however need the slice to store, as we only store local starts when generating
1342 #We need a store by Slice method?
1343 #This will remove the need to inherit from Feature.
1344 #These will need to be regenerated everytime we import a new build
1345 #As we do with the probe_features themselves
1346 #This also mean the result_feature status has to be associated with a coord_system_id
1347
1348 #Which bins do the start and end lie in for this feature?
1349 #Already dealing with local starts, so no slice subtraction
1350 #Could wrap these start/end methods via the descendant Collector
1351 #to remove the Feature dependancy? Or just create Features when parsing in the caller
1352 my $start_bin = int(($feature->start ) / $wsize);
1353 my $end_bin = int(($feature->end) / $wsize );
1354 $end_bin = $nbins{$wsize} if $end_bin > $nbins{$wsize};
1355
1356
1357
1358 #Slightly obfuscated code to match method number(faster)
1359 #by avoiding string comparisons.
1360 #Could call methods directly using coderef set in validate_bin_method
1361 #Accessor may slow things down, but should be uniform for all methods
1362 #rather than being dependant on position in if/else block below
1363
1364 #reserve 0 for descendant defined method?
1365 #There fore always fastest in this block, or use coderefs?
1366 if ( $method == 0 ) {
1367 # ----------------------------------------------------------------
1368 # For 'count' and 'density'.
1369
1370 for ( $bin_index = $start_bin ;
1371 $bin_index <= $end_bin ;
1372 ++$bin_index ) {
1373
1374 $bins{$wsize}->[$bin_index]++;
1375
1376 #warn "setting $wsize bin $bin_index to ". $bins{$wsize}->[$bin_index];
1377
1378 }
1379 }
1380
1381 =pod
1382
1383 } elsif ( $method == 1 ) {
1384 # ----------------------------------------------------------------
1385 # For 'indices' and 'index'
1386
1387
1388 #How is this useful?
1389 #Is this not just count per bin?
1390 #No this is a list of the feature indices
1391 #So forms a distribution?
1392
1393 throw('Not implemented for method for index');
1394
1395 for ( my $bin_index = $start_bin ;
1396 $bin_index <= $end_bin ;
1397 ++$bin_index ) {
1398 push( @{ $bins[$bin_index] }, $feature_index );
1399 }
1400
1401 ++$feature_index;
1402
1403 } elsif ( $method == 2 ) {
1404 # ----------------------------------------------------------------
1405 # For 'features' and 'feature'.
1406
1407 throw('Not implemented for method for feature/features');
1408
1409 for ( my $bin_index = $start_bin ;
1410 $bin_index <= $end_bin ;
1411 ++$bin_index ) {
1412 push( @{ $bins[$bin_index] }, $feature );
1413 }
1414
1415 } elsif ( $method == 3 ) {
1416 # ----------------------------------------------------------------
1417 # For 'fractional_count' and 'weight'.
1418
1419
1420 throw('Not implemented for method for fractional_count/weight');
1421
1422 if ( $start_bin == $end_bin ) {
1423 ++$bins[$start_bin];
1424 } else {
1425
1426 my $feature_length =
1427 $feature->[FEATURE_END] - $feature->[FEATURE_START] + 1;
1428
1429 # The first bin...
1430 $bins[$start_bin] +=
1431 ( ( $start_bin + 1 )*$bin_length -
1432 ( $feature->[FEATURE_START] - $slice_start ) )/
1433 $feature_length;
1434
1435 # The intermediate bins (if there are any)...
1436 for ( my $bin_index = $start_bin + 1 ;
1437 $bin_index <= $end_bin - 1 ;
1438 ++$bin_index ) {
1439 $bins[$bin_index] += $bin_length/$feature_length;
1440 }
1441
1442 # The last bin...
1443 $bins[$end_bin] +=
1444 ( ( $feature->[FEATURE_END] - $slice_start ) -
1445 $end_bin*$bin_length +
1446 1 )/$feature_length;
1447
1448 } ## end else [ if ( $start_bin == $end_bin)
1449
1450 }
1451 elsif ( $method == 4 ) {
1452 # ----------------------------------------------------------------
1453 # For 'coverage'.
1454
1455 #What exactly is this doing?
1456 #This is coverage of bin
1457 #Rather than coverage of feature as in fractional_count
1458
1459
1460 # my $feature_start = $feature->[FEATURE_START] - $slice_start;
1461 # my $feature_end = $feature->[FEATURE_END] - $slice_start;
1462 #
1463 # if ( !defined( $bin_masks[$start_bin] )
1464 # || ( defined( $bin_masks[$start_bin] )
1465 # && $bin_masks[$start_bin] != 1 ) ) {
1466 # # Mask the $start_bin from the start of the feature to the end
1467 # # of the bin, or to the end of the feature (whichever occurs
1468 # # first).
1469 # my $bin_start = int( $start_bin*$bin_length );
1470 # my $bin_end = int( ( $start_bin + 1 )*$bin_length - 1 );
1471 # for ( my $pos = $feature_start;
1472 # $pos <= $bin_end && $pos <= $feature_end ;
1473 # ++$pos ) {
1474 # $bin_masks[$start_bin][ $pos - $bin_start ] = 1;
1475 # }
1476 # }
1477 #
1478 # for ( my $bin_index = $start_bin + 1 ;
1479 # $bin_index <= $end_bin - 1 ;
1480 # ++$bin_index ) {
1481 # # Mark the middle bins between $start_bin and $end_bin as fully
1482 # # masked out.
1483 # $bin_masks[$bin_index] = 1;
1484 # }
1485 #
1486 # if ( $end_bin != $start_bin ) {
1487 #
1488 # if ( !defined( $bin_masks[$end_bin] )
1489 # || ( defined( $bin_masks[$end_bin] )
1490 # && $bin_masks[$end_bin] != 1 ) ) {
1491 # # Mask the $end_bin from the start of the bin to the end of
1492 # # the feature, or to the end of the bin (whichever occurs
1493 # # first).
1494 # my $bin_start = int( $end_bin*$bin_length );
1495 # my $bin_end = int( ( $end_bin + 1 )*$bin_length - 1 );
1496 # for ( my $pos = $bin_start ;
1497 # $pos <= $feature_end && $pos <= $bin_end ;
1498 # ++$pos ) {
1499 # $bin_masks[$end_bin][ $pos - $bin_start ] = 1;
1500 # }
1501 # }
1502 # }
1503 # } ## end elsif ( $method == 4 )
1504
1505 =cut
1506
1507
1508 elsif ( $method == 5 ) {
1509 #$self->$method($bin_index, $start_bin, $end_bin, $wsize, \%bins, \%bin_counts);
1510
1511
1512 #average score
1513 #This is simple an average of all the scores for features which overlap this bin
1514 #No weighting with respect to the bin or the feature
1515
1516 for ( $bin_index = $start_bin ;
1517 $bin_index <= $end_bin ;
1518 ++$bin_index ) {
1519
1520 #we should really push onto array here so we can have median or mean.
1521 $bins{$wsize}->[$bin_index] += $this->get_score_by_Feature($feature);
1522 $bin_counts{$wsize}->[$bin_index]++;
1523 }
1524 }
1525 elsif( $method == 6){
1526 #Max magnitude
1527 #Take the highest value +ve or -ve score
1528 for ( $bin_index = $start_bin ;
1529 $bin_index <= $end_bin ;
1530 ++$bin_index ) {
1531
1532 #we really need to capture the lowest -ve and higest +ve scores here and post process
1533 #To pick between them
1534
1535 my $score = $this->get_score_by_Feature($feature);
1536 #Write score method as wrapper to scores?
1537
1538 $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve
1539
1540
1541 #warn "Comparing wsize $wsize bin $bin_index score $score to ". $bins{$wsize}->[$bin_index]->[0].' '.$bins{$wsize}->[$bin_index]->[1]."\n";
1542
1543 if($score < $bins{$wsize}->[$bin_index]->[0]){
1544 #warn "setting -ve bin to $score\n";
1545 $bins{$wsize}->[$bin_index]->[0] = $score;
1546 }
1547 elsif($score > $bins{$wsize}->[$bin_index][1]){
1548 #warn "setting +ve bin to $score\n";
1549 $bins{$wsize}->[$bin_index]->[1] = $score;
1550 }
1551 }
1552 }
1553 else {
1554 throw("Only accomodates average score method");
1555 }
1556
1557
1558 }
1559
1560 } ## end foreach my $feature ( @{$features...
1561
1562
1563 #Now do post processing of bins
1564
1565 =pod
1566
1567 if ( $method == 4 ) {
1568
1569 # ------------------------------------------------------------------
1570 # For the 'coverage' method: Finish up by going through @bin_masks
1571 # and sum up the arrays.
1572
1573 for ( my $bin_index = 0 ; $bin_index < $nbins ; ++$bin_index ) {
1574 if ( defined( $bin_masks[$bin_index] ) ) {
1575 if ( !ref( $bin_masks[$bin_index] ) ) {
1576 $bins[$bin_index] = 1;
1577 } else {
1578 $bins[$bin_index] =
1579 scalar( grep ( defined($_), @{ $bin_masks[$bin_index] } ) )/
1580 $bin_length;
1581 }
1582 }
1583 }
1584 }
1585
1586 =cut
1587
1588 if( $method == 5){
1589 #For average score, need to divide bins by bin_counts
1590
1591 foreach my $wsize(keys %bins){
1592
1593 foreach my $bin_index(0..$#{$bins{$wsize}}){
1594
1595 if($bin_counts{$wsize}->[$bin_index]){
1596 $bins{$wsize}->[$bin_index] /= $bin_counts{$wsize}->[$bin_index];
1597 }
1598 #warn "bin_index $wsize:$bin_index has score ".$bins{$wsize}->[$bin_index];
1599 }
1600 }
1601 }
1602 elsif( $method == 6){
1603 #Max magnitude
1604 #Take the highest value +ve or -ve score
1605
1606 foreach my $wsize(keys %bins){
1607
1608 foreach my $bin_index(0..$#{$bins{$wsize}}){
1609
1610 #So we have the potential that we have no listref in a given bin
1611
1612 #default value if we haven't seen anything is 0
1613 #we actually want an array of -ve +ve values
1614
1615 #warn "Are we storing 0 values for absent data?";
1616 #Not for max_magnitude, but maybe for others?
1617
1618 if($bins{$wsize}->[$bin_index]){
1619 #warn $wsize.':'.$bin_index.':'.$bins{$wsize}->[$bin_index]->[0].'-'.$bins{$wsize}->[$bin_index]->[1];
1620 my $tmp_minus = $bins{$wsize}->[$bin_index]->[0] * -1;
1621
1622 if($tmp_minus > $bins{$wsize}->[$bin_index]->[1]){
1623 $bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[0];
1624 }
1625 else{
1626 $bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[1];
1627 }
1628
1629 #warn "bin $bin_index now ". $bins{$wsize}->[$bin_index];
1630 }
1631 }
1632 }
1633 }
1634 elsif($method != 0){#Do no post processing for count(0)
1635 throw('Collector currently only accomodates average_score, count and max magnitude methods');
1636 }
1637
1638
1639 #Could return bin_counts too summary reporting in zmenu
1640 #Could also do counting of specific type
1641
1642 #warn "returning bins ".Data::Dumper::Dumper(\%bins);
1643
1644 return \%bins;
1645 } ## end sub _bin_features
1646
1647
1648 =pod
1649
1650 #These could potentially be used as code refs to avoid having the if else block
1651 #This way we can also define new methods in the descendant Collector?
1652 #Would have to have pass args and refs to bin hashes
1653 #This would slow things down over direct access here
1654 #But speed is no longer that critical as we do not use the Collector for display
1655 #purposes, only to build the Collections which are then used for display directly.
1656
1657 sub calculate_average_score{
1658 my $self = shift;
1659
1660 if ( $method == 5 ) {
1661 #average score
1662 #This is simple an average of all the scores for features which overlap this bin
1663 #No weighting with respect to the bin or the feature
1664
1665 for ( my $bin_index = $start_bin ;
1666 $bin_index <= $end_bin ;
1667 ++$bin_index ) {
1668
1669 #we should really push onto array here so we can have median or mean.
1670 $bins{$wsize}->[$bin_index] += $feature->score;
1671 $bin_counts{$wsize}->[$bin_index]++;
1672 }
1673 }
1674
1675
1676
1677 }
1678
1679
1680 sub post_process_average_score{
1681
1682 }
1683
1684
1685 sub calculate_max_magnitude{
1686 my $self = shift;
1687
1688 #Max magnitude
1689 #Take the highest value +ve or -ve score
1690 for ( my $bin_index = $start_bin ;
1691 $bin_index <= $end_bin ;
1692 ++$bin_index ) {
1693
1694 #we really need to capture the lowest -ve and higest +ve scores here and post process
1695 #To pick between them
1696
1697 my $score = $feature->score;
1698 $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve
1699
1700 if($score < $bins{$wsize}->[$bin_index]->[0]){
1701 $bins{$wsize}->[$bin_index]->[0] = $score;
1702 }
1703 elsif($score > $bins{$wsize}->[$bin_index][1]){
1704 $bins{$wsize}->[$bin_index]->[1] = $score;
1705 }
1706 }
1707 }
1708
1709
1710 sub post_process_max_magnitude{
1711
1712 }
1713
1714 =cut
1715
1716 #separated to allow addition of non-standard methods
1717 #Could potentially add these in new
1718 #and put this back in _bin_features
1719
1720
1721 sub validate_bin_method{
1722 my ($self, $method) = @_;
1723
1724
1725 #change this to set the coderefs
1726 #Just set anonymous sub to immediately return for non post processed methods
1727 #No need for coderef, just set the method name?
1728
1729 #if(! $self->can('calculate_'.$method)){
1730 #throw("$method method does not have a valid calculate_${method} method");
1731 #}
1732
1733 #if($self->can('post_process_'.$method)){
1734 ##set post process flag?
1735 #or simply do this can in line in the _bin_features sub?
1736 #}
1737
1738
1739
1740
1741 #Add average_score to avoid changing Collection.pm
1742 my $class = ref($self);
1743 ${$class::VALID_BINNING_METHODS}{'average_score'} = 5;
1744 ${$class::VALID_BINNING_METHODS}{'max_magnitude'} = 6;
1745 ${$class::VALID_BINNING_METHODS}{'count'} = 0;
1746
1747
1748
1749 #foreach my $method_name(keys %{$class::VALID_BINNING_METHODS}){
1750 # warn "valid method is $method name";
1751 # }
1752
1753
1754 if ( ! exists( ${$class::VALID_BINNING_METHODS}{$method} ) ) {
1755 throw(
1756 sprintf(
1757 "Invalid binning method '%s', valid methods are:\n\t%s\n",
1758 $method,
1759 join( "\n\t", sort( keys(%{$class::VALID_BINNING_METHODS}) ) ) ) );
1760 }
1761 else{
1762 #warn "found valid method $method with index ".${$class::VALID_BINNING_METHODS}{$method};
1763 }
1764
1765 return ${$class::VALID_BINNING_METHODS}{$method};
1766 }
1767
1768
1769
1770 1;