0
|
1 # $Id: Collector.pm,v 1.7 2011/01/10 11:27:34 nj1 Exp $
|
|
2
|
|
3
|
|
4 =head1 LICENSE
|
|
5
|
|
6 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
7 Genome Research Limited. All rights reserved.
|
|
8
|
|
9 This software is distributed under a modified Apache license.
|
|
10 For license details, please see
|
|
11
|
|
12 http://www.ensembl.org/info/about/code_licence.html
|
|
13
|
|
14 =head1 CONTACT
|
|
15
|
|
16 Please email comments or questions to the public Ensembl
|
|
17 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
18
|
|
19 Questions may also be sent to the Ensembl help desk at
|
|
20 <helpdesk@ensembl.org>.
|
|
21
|
|
22 =cut
|
|
23
|
|
24 #Your Bio::Ensembl::Collection::Feature defs module should inherit from here
|
|
25 #This could be a local defs file which you have created and require'd into your script
|
|
26
|
|
27 #If your collections defs module refers to a Bio::EnsEMBL::Feature,
|
|
28 #then it's adaptor should inherit from the collections defs module
|
|
29
|
|
30
|
|
31
|
|
32 package Bio::EnsEMBL::Funcgen::Collector;
|
|
33 #Move this to Bio::EnsEMBL::Utils::Collector for 59?
|
|
34
|
|
35
|
|
36 use strict;
|
|
37 use warnings;
|
|
38
|
|
39 use Bio::EnsEMBL::Utils::Argument ('rearrange');
|
|
40 use Bio::EnsEMBL::Utils::Exception ('throw');
|
|
41 use Bio::EnsEMBL::Funcgen::ResultFeature;
|
|
42
|
|
43 #use base('Bio::EnsEMBL::Collection');#ISA
|
|
44
|
|
45 our ($pack_template, $packed_size, @window_sizes); #These get set in the FeatureAdaptor
|
|
46 #Make these constants and remove setter functionality in methods?
|
|
47 #Only really important for pack template and windows, maybe these if we are going to start
|
|
48
|
|
49
|
|
50 our $max_data_type_size = 16777216; #Default is 16MB for long blob
|
|
51 #we need to deduct the size of the rest of the record here!
|
|
52 #For a 2byte packet the smallest window size possible is:
|
|
53 #(slice->length/(16777216/2)
|
|
54 #so int(bin_size)+1
|
|
55 #Obviously have to use the largest slice here, for human chr1:
|
|
56 #249,250,621/(16777216/2) = 29.7???
|
|
57 #We may need to up this slightly to account for larger chrs?
|
|
58 #Implications on memory usage? Is it 4 times for blob manipulation?
|
|
59 #Does substr require this manipulation?
|
|
60 #This max_allowed_packet_size does not seem to translate directly to the size of the
|
|
61 #data being stored e.g. quite a bit more is needed. ISG haven't got to the bottom of this yet.
|
|
62 #But have simply upped the config to 67108864 to handle the largest human chr.
|
|
63
|
|
64 our $max_view_width = 500000;#Max width in Region In Detail;
|
|
65
|
|
66
|
|
67 #our %VALID_BINNING_METHODS
|
|
68 #Remove this in favour of can->('calculate_.$method) and coderefs?
|
|
69
|
|
70
|
|
71
|
|
72
|
|
73 #To do
|
|
74 # 1 DONE Merge in Collection code, (no need to do this, removed inheritance)
|
|
75 # 2 Write simple BED input to flat file output.
|
|
76 # 3 Separate store method so we can simply get, then wrap store around this
|
|
77 # 4 Test get method with slice adjusts
|
|
78 # 5 separate set_config?
|
|
79 # 6 optimise generate_bin_chunks to handle just one window size for display?
|
|
80 # 7 Handle packed_size pack_template as methods constants
|
|
81 # 8 Provide override method in basefeature adaptor which will use package constant in feature adaptor
|
|
82 # This is because these are really adaptor config, the collector only needs to know the
|
|
83 # packed_size, and in the absence of an feature adaptor also provides the default methods for both.
|
|
84 # If we substr in the API then we need to set sensible limits on blob size, otherwise we will have to unpack a lot of data
|
|
85 # to get at the slice we want.
|
|
86 # OR
|
|
87 # Change adaptor to substr in DB based on known blob ranges/window size
|
|
88 # and stitch together any which cross boundaries. This depends on speed of substr at end of large blob TEST!
|
|
89 # Load with current code first and test this before making either change!
|
|
90 # Delete empty (non-0) collections? i.e. For seq_regions which do not have any source features.
|
|
91 #
|
|
92 # 9 Handle PAR/HAP regions using fetch_normalised_slice_projections This has to be done in the feature adaptor! Then restrict to non_dup regions in calling script
|
|
93
|
|
94
|
|
95
|
|
96 =head2 new
|
|
97
|
|
98 Args : None
|
|
99 Example : my $collector = Bio::EnsEMBL::(Funcgen|Compara|Variation::)Collector::FEATURE->new;
|
|
100 $collector->store_windows_by_Slice($slice);
|
|
101 Description: Simple new method to enable use of collector when not inherited by
|
|
102 a descendant of Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor
|
|
103 Returntype : Bio::EnsEMBL::Funcgen::Collector
|
|
104 Exceptions : None
|
|
105 Caller : Collector script
|
|
106 Status : At Risk
|
|
107
|
|
108 =cut
|
|
109
|
|
110 sub new{
|
|
111 return bless {}, $_[0];#Simple blesses this class as an empty hash
|
|
112
|
|
113 #Do not set anything here
|
|
114 #As will not be first in ISA for feature adaptors
|
|
115 #Hence not guaranteed to be called
|
|
116
|
|
117 }
|
|
118
|
|
119 #Setter/Getter methods if we don't have a dedicated Collector module
|
|
120 #to set the package variables in? Also to allow overriding of defaults.
|
|
121 #This can be used by the write_collection method
|
|
122 #to determine when to build and store a compressed collection
|
|
123 #Effectively the max size of the data type you are using to store
|
|
124 #a compressed score. defaults to max for long blob 16MB
|
|
125
|
|
126
|
|
127
|
|
128 #Generic method, but only ever called by write_collection in descendant
|
|
129
|
|
130 sub new_assembly{
|
|
131 my ($self, $new_ass) = @_;
|
|
132
|
|
133 if($new_ass){
|
|
134 #Validate new assm to project to
|
|
135
|
|
136
|
|
137 $self->{'new_assembly'} = $new_ass;
|
|
138 }
|
|
139
|
|
140 return $self->{'new_assembly'};
|
|
141 }
|
|
142
|
|
143
|
|
144 sub max_data_type_size{
|
|
145 my ($self, $size) = @_;
|
|
146
|
|
147 #Validate is sensible integer?
|
|
148
|
|
149 if($size && ! int($size)){
|
|
150 throw("max_data_type_size must be a integer of bytes, not $size");
|
|
151 }
|
|
152 elsif($size){
|
|
153 $self->{'max_data_type_size'} = $size;
|
|
154 }
|
|
155 elsif(! defined $self->{'max_data_type_size'}){
|
|
156 #default set at head of this module or in descendant Collector
|
|
157 $self->{'max_data_type_size'} = $Bio::EnsEMBL::Funcgen::Collector::max_data_type_size;
|
|
158 }
|
|
159
|
|
160 return $self->{'max_data_type_size'};
|
|
161 }
|
|
162
|
|
163 sub max_view_width{
|
|
164 my ($self, $size) = @_;
|
|
165
|
|
166 #Validate is sensible integer?
|
|
167
|
|
168 if($size && ! int($size)){
|
|
169 throw("max_view_width must be a integer, not $size");
|
|
170 }
|
|
171 elsif($size){
|
|
172 $self->{'max_view_width'} = $size;
|
|
173 }
|
|
174 elsif(! defined $self->{'max_view_width'}){
|
|
175 #default set at head of this module or in descendant Collector
|
|
176 $self->{'max_view_width'} = $Bio::EnsEMBL::Funcgen::Collector::max_view_width;
|
|
177 }
|
|
178
|
|
179 return $self->{'max_view_width'};
|
|
180 }
|
|
181
|
|
182
|
|
183 sub bins_per_record(){
|
|
184 #$collector_class::bins_per_record = ($collector_class::max_data_type_size/$collector_class::packed_size);#This should be done dynamically as we may redefine either of these variables?
|
|
185
|
|
186 my ($self) = shift;
|
|
187
|
|
188 return int($self->max_data_type_size/$self->packed_size);
|
|
189 }
|
|
190
|
|
191
|
|
192 #The defaults for these should be defined in the feature/format specific Collector descendant
|
|
193 #either by specifying the package variables or using config attrs to set methods?
|
|
194 #general config should be parsed here.
|
|
195 #rename bin_method?
|
|
196
|
|
197 sub bin_method{
|
|
198 my ($self, $method) = @_;
|
|
199
|
|
200 if($method || ! $self->{'bin_method'}){
|
|
201
|
|
202 if($method){
|
|
203 $self->{'bin_method'} = $method;
|
|
204 #should test can here? or validate versus hash?
|
|
205 }
|
|
206 elsif(! $self->{'bin_method'}){
|
|
207
|
|
208 if (! defined $Bio::EnsEMBL::Funcgen::Collector::bin_method){
|
|
209 throw('Must pass a bin_method in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_method in your Collector');
|
|
210 }
|
|
211
|
|
212 $self->{'bin_method'} = $Bio::EnsEMBL::Funcgen::Collector::bin_method;
|
|
213 }
|
|
214
|
|
215 #or current validate method if we are keeping the method in the if/else block
|
|
216
|
|
217
|
|
218 #if(! $self->can("calculate_${method}"))){
|
|
219 #throw("$method is no a valid a valid binning method");
|
|
220 #}
|
|
221
|
|
222 }
|
|
223
|
|
224 return $self->{'bin_method'};
|
|
225 }
|
|
226
|
|
227 #We could replace this with a hash of bin_methods and models?
|
|
228 #This could then be used to validate
|
|
229 #Altho if we are going to commodotise the bin methods, then we need to be able to
|
|
230 #define this in the child Collector. Could still do this by modifying the method/model
|
|
231 #hash from the child Collector
|
|
232
|
|
233 sub bin_model{
|
|
234 my ($self, $bin_model) = @_;
|
|
235
|
|
236 if($bin_model || ! $self->{'bin_model'}){
|
|
237
|
|
238 if($bin_model){
|
|
239 $self->{'bin_model'} = $bin_model;
|
|
240 }
|
|
241 elsif(! $self->{'bin_model'}){
|
|
242
|
|
243 #Set as global constant defined in descendant Collector
|
|
244 if (! defined $Bio::EnsEMBL::Funcgen::Collector::bin_model){
|
|
245 throw('Must pass -bin_model in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_model in your Collector');
|
|
246 }
|
|
247
|
|
248 $self->{'bin_model'} = $Bio::EnsEMBL::Funcgen::Collector::bin_model;
|
|
249 }
|
|
250
|
|
251 #Need to validate bin models here
|
|
252 throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models') if $self->{'bin_model'} ne 'SIMPLE';
|
|
253 }
|
|
254
|
|
255 return $self->{'bin_model'};
|
|
256 }
|
|
257
|
|
258 #This can be overridden by adaptor method
|
|
259 #At present this could cause problems as we can pass window sizes in the config, but they will never be set
|
|
260 #as adaptor method is not a setter. Adaptor method should throw if we try and set them as this could cause problems when fetching and not knowing the custom sizes?
|
|
261
|
|
262 sub window_sizes{
|
|
263 my ($self, $sizes) = @_;
|
|
264
|
|
265 if($sizes || ! $self->{'window_sizes'}){
|
|
266
|
|
267 if($sizes){
|
|
268 $self->{'window_sizes'} = $sizes;
|
|
269 }
|
|
270 else{#! $self->{'windows_sizes'
|
|
271
|
|
272 if (! @window_sizes){
|
|
273 throw('Must pass -windows_sizes in the config or define @Bio::EnsEMBL::Funcgen::Collector::window_sizes in your Collector');
|
|
274 }
|
|
275
|
|
276 @{$self->{'window_sizes'}} = @window_sizes;
|
|
277 }
|
|
278
|
|
279 if(ref($self->{'window_sizes'}) ne 'ARRAY' ||
|
|
280 scalar(@{$self->{'window_sizes'}}) == 0){
|
|
281 throw('window_sizes must be an arrayref of at least one window size');
|
|
282 }
|
|
283 }
|
|
284
|
|
285 return $self->{'window_sizes'};
|
|
286 }
|
|
287
|
|
288
|
|
289
|
|
290
|
|
291 #Optional attrs dependant on whether Collection is packed
|
|
292 #Can be redefined in the adaptor but becareful never to redefine the actual values
|
|
293 #As these should really be constants for a given Collector
|
|
294 #What is best here? We only want pack methods for storing/fetching compressed collections
|
|
295 #Move this to base feature adaptor and define attrs as constants using
|
|
296 #package variable? Or directly in new?
|
|
297 #Then direct modification will be caught.
|
|
298 #Just leave here for now.
|
|
299
|
|
300 #Caller _obj_from_sth/store
|
|
301
|
|
302 sub pack_template{
|
|
303 my ($self, $template) = @_;
|
|
304
|
|
305 if($template){
|
|
306 $self->{'pack_template'} = $template;
|
|
307 }
|
|
308 elsif(! $self->{'pack_template'}){
|
|
309
|
|
310 #Set as global constant defined in descendant Collector
|
|
311
|
|
312 if (! defined $Bio::EnsEMBL::Funcgen::Collector::pack_template){
|
|
313 throw('Must pass a per score pack_template in the config or define $Bio::EnsEMBL::Funcgen::Collector::pack_template in your Collector');
|
|
314 }
|
|
315
|
|
316 $self->{'pack_template'} = $Bio::EnsEMBL::Funcgen::Collector::pack_template;
|
|
317 }
|
|
318
|
|
319 return $self->{'pack_template'};
|
|
320
|
|
321 }
|
|
322
|
|
323 #Caller _obj_from_sth/store & current_packed_size
|
|
324
|
|
325 sub packed_size{
|
|
326 my ($self, $size) = @_;
|
|
327
|
|
328 if($size){
|
|
329
|
|
330 if(! int($size)){
|
|
331 throw("$size is not an integer, must pass a size integer for packed_size which specifies size of pack_template:\t".$self->pack_template);
|
|
332 }
|
|
333
|
|
334 $self->{'packed_size'} = $size;
|
|
335 }
|
|
336 elsif(! $self->{'packed_size'}){
|
|
337
|
|
338 #Set as global constant defined in descendant Collector
|
|
339
|
|
340 if (! defined $Bio::EnsEMBL::Funcgen::Collector::packed_size){
|
|
341 throw('Must pass a packed_size(wrt to pack_template) in the config or define $Bio::EnsEMBL::Funcgen::Collector::packed_size in your Collector');
|
|
342 }
|
|
343
|
|
344 $self->{'packed_size'} = $Bio::EnsEMBL::Funcgen::Collector::packed_size;
|
|
345 }
|
|
346
|
|
347 return $self->{'packed_size'};
|
|
348
|
|
349 }
|
|
350
|
|
351 #These methods are used by the descendant Collector
|
|
352 #For caching infor whilst building collections
|
|
353 #This is used to log how big a collection has grown before storing
|
|
354
|
|
355 sub current_packed_size{
|
|
356 my ($self, $wsize) = @_;
|
|
357
|
|
358 #$self->{'current_packed_size'}{$wsize} ||= 0;
|
|
359
|
|
360 #if(defined $cps){
|
|
361 # $self->{'current_packed_size'}{$wsize} = $cps;
|
|
362 # }
|
|
363 # else{
|
|
364 # return $self->{'current_packed_size'}{$wsize};
|
|
365 # }
|
|
366
|
|
367 return (scalar(@{$self->score_cache($wsize)})*$self->packed_size);
|
|
368
|
|
369 }
|
|
370
|
|
371
|
|
372 sub score_cache{
|
|
373 my ($self, $wsize, $scores) = @_;
|
|
374
|
|
375 $self->{'score_cache'}{$wsize} ||= [];
|
|
376
|
|
377 if(defined $scores){
|
|
378 push @{$self->{'score_cache'}{$wsize}}, @{$scores};
|
|
379 }
|
|
380 else{
|
|
381 #Do this here to stop passing the ref everytime
|
|
382 #Will this be faster?
|
|
383 #Would certainly be faster if we were not returning a ref
|
|
384 return $self->{'score_cache'}{$wsize};
|
|
385 }
|
|
386 }
|
|
387
|
|
388 #These last methods are only used for the 0 wsize
|
|
389 #natural resolution and ar wrt the orig_slice passed
|
|
390 #to store_windows_by_Slice
|
|
391
|
|
392 sub collection_start{
|
|
393 my ($self, $wsize, $sr_start) = @_;
|
|
394
|
|
395 if(defined $sr_start){
|
|
396 $self->{'collection_start'}{$wsize} = $sr_start;
|
|
397 }
|
|
398 else{
|
|
399 return $self->{'collection_start'}{$wsize};
|
|
400 }
|
|
401 }
|
|
402
|
|
403 sub collection_end{
|
|
404 my ($self, $wsize, $sr_end) = @_;
|
|
405
|
|
406 if(defined $sr_end){
|
|
407 $self->{'collection_end'}{$wsize} = $sr_end;
|
|
408 }
|
|
409 else{
|
|
410 return $self->{'collection_end'}{$wsize};
|
|
411 }
|
|
412 }
|
|
413
|
|
414 sub collection_strand{
|
|
415 my ($self, $wsize, $strand) = @_;
|
|
416
|
|
417 if(defined $strand){
|
|
418 $self->{'collection_strand'}{$wsize} = $strand;
|
|
419 }
|
|
420 else{
|
|
421 return $self->{'collection_strand'}{$wsize};
|
|
422 }
|
|
423 }
|
|
424
|
|
425
|
|
426
|
|
427 =pod
|
|
428
|
|
429 sub _create_feature {
|
|
430 my ( $this, $feature_type, $args ) = @_;
|
|
431
|
|
432 my $feature = $this->SUPER::_create_feature( $feature_type, $args );
|
|
433
|
|
434 if ( !$this->_lightweight() ) {
|
|
435 my ( $phase, $end_phase, $stable_id, $version, $created_date,
|
|
436 $modified_date, $is_current )
|
|
437 = rearrange( [ 'PHASE', 'END_PHASE',
|
|
438 'STABLE_ID', 'VERSION',
|
|
439 'CREATED_DATE', 'MODIFIED_DATE',
|
|
440 'IS_CURRENT'
|
|
441 ],
|
|
442 %{$args} );
|
|
443
|
|
444 push( @{$feature},
|
|
445 $phase, $end_phase, $stable_id, $version, $created_date,
|
|
446 $modified_date, $is_current );
|
|
447 }
|
|
448
|
|
449 return $feature;
|
|
450 }
|
|
451
|
|
452 sub _create_feature_fast {
|
|
453 my ( $this, $feature_type, $args ) = @_;
|
|
454
|
|
455 my $feature =
|
|
456 $this->SUPER::_create_feature_fast( $feature_type, $args );
|
|
457
|
|
458 return $feature;
|
|
459 }
|
|
460
|
|
461
|
|
462 #This might not be sensible for Features which are split across tables
|
|
463
|
|
464 sub _tables {
|
|
465 my ($this) = @_;
|
|
466
|
|
467 my @tables = $this->SUPER::_tables();
|
|
468
|
|
469 if ( $this->_lightweight() ) {
|
|
470 return ( $tables[0] );
|
|
471 }
|
|
472
|
|
473 return @tables;
|
|
474 }
|
|
475
|
|
476
|
|
477 sub _columns {
|
|
478 my ($this) = @_;
|
|
479
|
|
480 my @columns = $this->SUPER::_columns();
|
|
481
|
|
482 if ( $this->_lightweight() ) {
|
|
483
|
|
484 #What is this doing?
|
|
485 #Probably not sensible for ResultFeature
|
|
486 @columns[ 5 .. $#columns ] = map( 1, 5 .. $#columns );
|
|
487 }
|
|
488
|
|
489 return @columns;
|
|
490 }
|
|
491
|
|
492 #Also not sensible for objects spread across several tables
|
|
493
|
|
494 sub _default_where_clause {
|
|
495 my ($this) = @_;
|
|
496
|
|
497 if ( $this->_lightweight() ) {
|
|
498 return '';
|
|
499 }
|
|
500
|
|
501 return $this->SUPER::_default_where_clause();
|
|
502 }
|
|
503
|
|
504 =cut
|
|
505
|
|
506
|
|
507
|
|
508 #This need to be generic
|
|
509 #Again we need to pass an accessor method/reference?
|
|
510 #Will be some sort of generic fetch for feature adaptors
|
|
511 #or while loop for in flat file accessor
|
|
512 #rollback to be handled in caller?
|
|
513
|
|
514 # To do
|
|
515 #
|
|
516 # 1 Allow variable chunks lengths (so we only have one resolution of windows?)
|
|
517 # This will allow SNP collections which currently define classification i.e colour
|
|
518 # Density of SNPs within window will define shading. Count will be displayed in zmenu
|
|
519 # This maybe something we have to do in the descendant
|
|
520 #
|
|
521 # 2 Implement collection param definition in/from descendant
|
|
522
|
|
523
|
|
524 # return collection config from adaptor fetch
|
|
525 # window size
|
|
526 # fixed width?
|
|
527 # render/collection style?
|
|
528 # This chould be implemented in BaseFeatureAdaptor::generic_fetch?
|
|
529 # Or could be done in the calling fetchmethod?
|
|
530 #
|
|
531
|
|
532 #need to change this to get_window_bin_by_Slice
|
|
533 #to enable generating bins on uncompressed data
|
|
534 #Need to remove all counts and store based code to store caller
|
|
535 #this would mean removing any pack based code too
|
|
536 #separate set_config method
|
|
537
|
|
538
|
|
539 #Probelm here is size of slice?
|
|
540 #We need to generate bins all in one go, but also need to store at interval
|
|
541 #so as not to explode memory
|
|
542 #Do we need to separate the window generation from the bin generation code?
|
|
543
|
|
544
|
|
545 #Define the optimal way to generate windowed data by
|
|
546 #finding the most common denominator
|
|
547
|
|
548 sub _define_window_chunks{
|
|
549 my ($self, $window_sizes, $max_view_size) = @_;
|
|
550
|
|
551 ### DEFINE CHUNKS WRT WINDOWS
|
|
552
|
|
553 #Shortcut for on the fly uncompressed collection retrieval
|
|
554 #if(scalar(@wsizes) = 1){
|
|
555 #
|
|
556 #}
|
|
557 #else{
|
|
558
|
|
559 #Calulate sensible slice length based on window sizes
|
|
560 my @wsizes = sort {$a <=> $b} @$window_sizes;
|
|
561
|
|
562 #We need a default when only calculating 0 resolution
|
|
563 #Will binning code work with only 0 resolution?
|
|
564 if((scalar(@wsizes) == 1) &&
|
|
565 $wsizes[0] == 0){
|
|
566 return { $self->max_view_width => [0] };
|
|
567 }
|
|
568
|
|
569
|
|
570 my $multiplier = int($max_view_size/$wsizes[$#wsizes]);
|
|
571 my $chunk_length = $multiplier * $wsizes[$#wsizes];
|
|
572 my $not_divisible = 1;
|
|
573 my %chunk_windows;#Registry of chunk lengths to run with windows
|
|
574 my %workable_chunks = map {$_ => {}} @wsizes;
|
|
575 delete $workable_chunks{'0'};#get rid of natural resolution as this will always work
|
|
576
|
|
577
|
|
578 while($not_divisible && $chunk_length != 0){
|
|
579 $not_divisible = 0;
|
|
580
|
|
581 foreach my $wsize(@wsizes){
|
|
582 next if $wsize == 0;#Special wsize for normal data
|
|
583
|
|
584 #Set not divisible if modulus is true
|
|
585 if($chunk_length % $wsize){
|
|
586 $not_divisible = 1;
|
|
587 }
|
|
588 else{
|
|
589 #No need to listref here?
|
|
590 $workable_chunks{$wsize}{$chunk_length} = [];
|
|
591 }
|
|
592
|
|
593 #warn "chunk length is $chunk_length and not_divisible is $not_divisible";
|
|
594 }
|
|
595
|
|
596 #Gradually shrink the length until we find a workable slice length for all windows
|
|
597 $chunk_length -= $wsizes[$#wsizes] if $not_divisible;
|
|
598 }
|
|
599
|
|
600 my %chunk_sets;
|
|
601
|
|
602
|
|
603 if($chunk_length == 0){
|
|
604 print "Could not find chunk length for all window sizes, attempting to subset windows using alternate slice length\n";
|
|
605
|
|
606 foreach my $wsize(keys %workable_chunks){
|
|
607 #Loop through windows, seeing if they are workable in the other windows
|
|
608
|
|
609 foreach my $chunk(keys %{$workable_chunks{$wsize}}){
|
|
610
|
|
611 foreach my $other_wsize(keys %workable_chunks){
|
|
612 next if $wsize == $other_wsize;
|
|
613
|
|
614 if(exists $workable_chunks{$other_wsize}{$chunk}){
|
|
615 #only push it onto the other wsize, as we will do the reverse later
|
|
616 $chunk_sets{$chunk}{$wsize} = undef;
|
|
617 }
|
|
618 }
|
|
619 }
|
|
620 }
|
|
621
|
|
622 #Now we have a register of co-occurence of wsizes with repect to chunks
|
|
623 #Loop through finding the least amount of sets with the longest chunk length?
|
|
624 #There is no way to decide which is best?
|
|
625 #we could calculate the number of loops? Factored by the chunk length?
|
|
626 #Let's just print out and see what we get
|
|
627
|
|
628 #warn "chunk sets are :\n".Data::Dumper::Dumper(\%chunk_sets);
|
|
629
|
|
630
|
|
631 #For now let's just take the one which has the most windows and the longest chunk
|
|
632 #Then we just get the largest which handles the rest.
|
|
633
|
|
634 #define possible set lengths
|
|
635 my $i = 0;
|
|
636 my %set_lengths;
|
|
637 map {$set_lengths{$i} = []; $i++} @wsizes;
|
|
638 delete $set_lengths{'0'};#get rid of natural resolution as this will always work
|
|
639
|
|
640
|
|
641 #store chunks lengths for each set size
|
|
642 foreach my $chunk(keys %chunk_sets){
|
|
643 my $set_size = scalar(values %{$chunk_sets{$chunk}});
|
|
644 push @{$set_lengths{$set_size}}, $chunk;
|
|
645 }
|
|
646
|
|
647 #Now we get the biggest set with the longest length;
|
|
648 my $largest_size = scalar(@wsizes);#scalar here as we are disregarding natural resolution of 0 in loop
|
|
649 my $found_largest_set = 0;
|
|
650
|
|
651 while(! $found_largest_set){
|
|
652 $largest_size--;
|
|
653
|
|
654 if(scalar(@{$set_lengths{$largest_size}}>0)){
|
|
655 $found_largest_set = 1;
|
|
656 }
|
|
657 }
|
|
658
|
|
659
|
|
660 #We should be able to loop this bit, to find all the biggest sets.
|
|
661 my ($largest_chunk) = sort {$b<=>$a} @{$set_lengths{$largest_size}};
|
|
662 #we could even be selective here, but let's just take the first one for now
|
|
663
|
|
664 my @largest_windows = keys %{$chunk_sets{$largest_chunk}};
|
|
665 @{$chunk_windows{$largest_chunk}} = @largest_windows;
|
|
666 print "Largest chunk $largest_chunk($largest_size) contains windows: @largest_windows\n";
|
|
667
|
|
668 my %remaining_windows = map {$_ => {}} @wsizes;
|
|
669 delete $remaining_windows{'0'};#get rid of natural resolution as this will always work
|
|
670 map { delete $remaining_windows{$_} } @largest_windows;
|
|
671 my $remaining_set_size = scalar(keys %remaining_windows);
|
|
672
|
|
673 #swapping to array here for practicality, would need to maintain hash if we need to iterate
|
|
674 my @rwindows = keys %remaining_windows;
|
|
675
|
|
676 #This could just be one window, but this will not be inthe co-occurence hash %chunk_sets
|
|
677 #Hence the normal approach will not work. and we just want to find a suitably large chunk for this one window.
|
|
678 my $next_chunk;
|
|
679
|
|
680 if(scalar(@rwindows) == 1){
|
|
681 #we just want to find a suitably large chunk for this one window.
|
|
682 my ($last_window) = @rwindows;
|
|
683
|
|
684 $multiplier = int(500000/$last_window);
|
|
685 $next_chunk = $multiplier * $last_window;
|
|
686 }
|
|
687 else{
|
|
688 #Now were are doing something very similar to above
|
|
689 #populating a set_size chunk length registry
|
|
690 #my %seen_hash;
|
|
691
|
|
692
|
|
693 foreach my $chunk(sort {$b<=>$a} @{$set_lengths{$remaining_set_size}}){
|
|
694 my $seen_count = 0;
|
|
695
|
|
696 foreach my $rwindow(@rwindows){
|
|
697
|
|
698 $seen_count++ if grep/$rwindow/, (values %{$chunk_sets{$chunk}});
|
|
699 }
|
|
700
|
|
701 if ($seen_count == $remaining_set_size){
|
|
702
|
|
703 $next_chunk = $chunk;
|
|
704 last;
|
|
705 }
|
|
706 }
|
|
707 }
|
|
708
|
|
709 @{$chunk_windows{$next_chunk}} = @rwindows;
|
|
710
|
|
711
|
|
712 if($next_chunk){
|
|
713
|
|
714 print "Found next chunk length $next_chunk contains remaining windows:\t@rwindows\n";
|
|
715
|
|
716 #Now we want to cycle through all the set lengths which could contain the ones not in the first
|
|
717 #so we need to
|
|
718 }
|
|
719 else{
|
|
720 warn "Need to write iterative sub for set definition";
|
|
721 throw('Could not find workable slice length for remaining windows: '.
|
|
722 join(', ', @rwindows));
|
|
723
|
|
724 }
|
|
725 }
|
|
726 else{
|
|
727 @{$chunk_windows{$chunk_length}} = keys(%workable_chunks);
|
|
728 print "Found workable chunk length($chunk_length) for all window sizes:\t".
|
|
729 join(' ', @{$chunk_windows{$chunk_length}})."\n";
|
|
730 }
|
|
731
|
|
732 return \%chunk_windows;
|
|
733 }
|
|
734
|
|
735
|
|
736 #Let's concentrate on store function first before we split out into store and fetch methods
|
|
737 #How will this work with the Bed parser?
|
|
738 #The descendant collector will sort the input and detect the current slice before calling
|
|
739 #store_window_bins_by_Slice. This may require some caching of line or seeking as we will see the next slice before we have a chance to set it.
|
|
740 #This will store as ResultFeature collections, so maybe we need to separate the input from output code?
|
|
741 #i.e. Bed parser/wrapper
|
|
742 # ResultFeatureAdaptor wrapper
|
|
743 #These
|
|
744
|
|
745 #Problem with passing window_sizes here
|
|
746 #We need to check that they aren't already defined a class variables as this could potentially
|
|
747 #screw up retrieval, expect for only 0 or all but 0
|
|
748 #Should we remove this config and force the class variable to be set in the 'adaptor'
|
|
749 #Method is then only used internally, make private or only getter? Set by changing class vars?
|
|
750
|
|
751 sub store_window_bins_by_Slice{
|
|
752 my ($self, $slice, %config) = @_;
|
|
753
|
|
754 my ($window_sizes, $logic_name, $bin_method, $fetch_method_ref, $max_view_width,
|
|
755 $max_data_type_size, $pack_template, $packed_size, $bin_model, $new_assm, $skip_zero_window) =
|
|
756 rearrange( [ 'WINDOW_SIZES', 'LOGIC_NAME', 'BIN_METHOD', 'FETCH_METHOD_REF', 'MAX_VIEW_WIDTH', 'MAX_DATA_TYPE_SIZE', 'PACK_TEMPLATE', 'PACKED_SIZE', 'BIN_MODEL', 'NEW_ASSEMBLY', 'SKIP_ZERO_WINDOW'], %config );
|
|
757
|
|
758 warn "Need to be careful here about cleaning start end strand caches between serially run slices";
|
|
759
|
|
760
|
|
761
|
|
762 ### VAILDATE VARS/CONFIG
|
|
763
|
|
764 #This could be done once in set_config, could then remove setter bahviour from attr methods?
|
|
765 #All default defs params/methods can be overridden by config params
|
|
766 #Attrs used in this method
|
|
767 $bin_method = $self->bin_method($bin_method);
|
|
768 $bin_model = $self->bin_model($bin_model);
|
|
769 #$window_sizes = $self->window_sizes($window_sizes);#Now done below
|
|
770 #Set to undef if we ave empty array
|
|
771 $window_sizes = undef if (ref($window_sizes) eq 'ARRAY' && scalar(@$window_sizes) == 0);
|
|
772 #Attrs used in other (store) methods
|
|
773 $self->pack_template($pack_template);
|
|
774 $self->packed_size($packed_size);
|
|
775 $self->max_data_type_size($max_data_type_size);
|
|
776 $self->max_view_width($max_view_width);
|
|
777
|
|
778 #Other vars
|
|
779 $self->new_assembly($new_assm);
|
|
780
|
|
781 #Need to validate slice here
|
|
782
|
|
783 warn "temp hack for bin_method validation";
|
|
784 $bin_method = $self->validate_bin_method($bin_method);
|
|
785
|
|
786 ### Set window_sizes
|
|
787
|
|
788 if($self->new_assembly){
|
|
789 print "Assembly projection may cause problems for large Collections, defaulting to window_sizes = (0)\n";
|
|
790 #Then build the bins on the projected 0 level single ResultFeatures
|
|
791
|
|
792 #Test we haven't explicity set window_sizes to be soemthing else
|
|
793
|
|
794 if($window_sizes &&
|
|
795 ! ( scalar(@$window_sizes) == 1 && $window_sizes[0] == 0)){
|
|
796
|
|
797 throw("You have set window_sizes config which are not safe when projecting to a new assembly($new_assm), please omit window_sizes config or set to 0");
|
|
798
|
|
799 }
|
|
800 $window_sizes = $self->window_sizes([0]);
|
|
801 }
|
|
802 else{
|
|
803
|
|
804 if($window_sizes && $skip_zero_window && grep/^0$/,@$window_sizes){
|
|
805 throw("You have specied skip_zero_window and window_size 0 in your config, please remove one of these");
|
|
806 }
|
|
807 elsif($window_sizes && ! grep/^0$/,@$window_sizes){
|
|
808 $skip_zero_window = 1;
|
|
809 unshift @$window_sizes, 0;#re-add 0 window as we need this to build the collections
|
|
810 }
|
|
811
|
|
812 $window_sizes = $self->window_sizes($window_sizes);
|
|
813 }
|
|
814
|
|
815
|
|
816 #This is already done in the script
|
|
817 if($skip_zero_window && $new_assm){
|
|
818 throw("You cannot -skip_zero_window or omit 0 from -window_sizes when projecting to a new assembly($new_assm) which should only be generated using window_size=0");
|
|
819 }
|
|
820
|
|
821
|
|
822
|
|
823 ### Rollback previously stored features
|
|
824
|
|
825 if($self->can('rollback_Features_by_Slice')){
|
|
826 $self->rollback_Features_by_Slice($slice);
|
|
827 }
|
|
828 else{
|
|
829 #This is currently the only warn output we can't get rid off
|
|
830 warn ref($self)." cannot rollback_Features_by_Slice. This may result in duplicate Collections being stored if there is pre-existing data";
|
|
831 }
|
|
832
|
|
833
|
|
834
|
|
835 ### PROCESS CHUNKS
|
|
836
|
|
837 #Not lightweight as we will be storing them
|
|
838 # Temporarily set the collection to be lightweight???
|
|
839 #my $old_value = $this->_lightweight();
|
|
840 #if ( defined($lightweight) ) { $this->_lightweight($lightweight) }
|
|
841 #else { $this->_lightweight(1) }
|
|
842
|
|
843 my %chunk_windows = %{$self->_define_window_chunks($self->window_sizes, $self->max_view_width)};
|
|
844 my (%counts, $store_natural);
|
|
845 $store_natural = grep/^0/, @$window_sizes;
|
|
846 $counts{0}=0;#Set natural res count to 0
|
|
847 my $slice_end = $slice->end;
|
|
848 my $orig_slice = $slice;
|
|
849 my $orig_start = $slice->start;
|
|
850 #my $slice_adj = $slice->start - 1;#Removed this as we are now generating features local to orig_slice
|
|
851 #start/end conversion will be done in write/store_collection
|
|
852 my $region = $slice->coord_system_name;
|
|
853 my $version = $slice->coord_system->version;
|
|
854 my $seq_region_name = $slice->seq_region_name;
|
|
855 my $strand = $slice->strand;
|
|
856 my $only_natural = 0;
|
|
857 #my $slice_adj = 0;
|
|
858
|
|
859
|
|
860 #We need to account for only 0 here when doing projection
|
|
861 #The chunk window is set to max_view_widht in _define_chunk_windows
|
|
862
|
|
863 $only_natural = 1 if $store_natural && scalar(@$window_sizes) == 1;
|
|
864 $store_natural = 0 if $skip_zero_window;
|
|
865 #SHould really test these two, but should already be caught by now
|
|
866
|
|
867 #Set the initial collection_start to orig_start
|
|
868 #Could default to 1, but we may not be starting from 1
|
|
869 #This is not the case for 0 wsize where it must always be
|
|
870 #The first feature start
|
|
871
|
|
872
|
|
873 for my $wsize(@{$self->window_sizes}){
|
|
874
|
|
875 next if $wsize == 0;# && $skip_zero_window;#We never want to assume start of 0 window collection
|
|
876 $self->collection_start($wsize, $orig_start);
|
|
877 }
|
|
878
|
|
879
|
|
880
|
|
881 foreach my $chunk_length(sort keys %chunk_windows){
|
|
882
|
|
883 print "Processing windows ".join(', ', @{$chunk_windows{$chunk_length}}).
|
|
884 " with chunk length $chunk_length\n";
|
|
885 map $counts{$_} = 0, @{$chunk_windows{$chunk_length}}; #Set window counts to 0
|
|
886
|
|
887 #Now walk through slice using slice length chunks and build all windows in each chunk
|
|
888 my $in_slice = 1;
|
|
889 my $start_adj = 0;
|
|
890 my ($sub_end, $features, $bins);
|
|
891 my $sub_start = 1;
|
|
892 my $slice_length = $slice->length;
|
|
893
|
|
894
|
|
895 #Can we subslice and then exclusivly use bin_start(local to orig_slice)
|
|
896 #Then we never have to deal with sr coord until we store
|
|
897 #This should me we never have to do the sr conversion unless we
|
|
898 #use a slice which doesn't start at 1(PAR or test)
|
|
899 #Always create in local coords for fetch
|
|
900 #Then change to seq_region coords for store if required
|
|
901
|
|
902
|
|
903 while($in_slice){
|
|
904 #$sr_start = $slice_start + $start_adj;
|
|
905 $sub_start += $start_adj;
|
|
906
|
|
907 #$slice_start = $sr_start;#Keep for next slice
|
|
908 #$sr_end = $sr_start + $chunk_length - 1;
|
|
909 $sub_end = $sub_start + $chunk_length - 1;
|
|
910
|
|
911 #Last chunk might not be the correct window length
|
|
912 #Hence why we should do this on whole chromosomes
|
|
913 if($sub_end >= $slice_length){
|
|
914 #$sub_end = $slice_end;
|
|
915 #No longer set to slice end, as we don't want to corrupt the bin definition?
|
|
916 #Surplus bins are removed in store/write_collection in caller
|
|
917 #We could simply add the largest window the the end of the slice?
|
|
918 #Then we will only build the minimum of excess bins?
|
|
919 #This should be okay for bin calcs
|
|
920 #But may screw up bin trimming in caller as we currently expect $ub_end to be a valid bin end
|
|
921 #for all wsizes
|
|
922 #bin trimming should handle this, but this will corrupt the bin definition???
|
|
923 #bin definition is depedant on method
|
|
924 #So this method need to be agnostic
|
|
925 #And deal with the rest in descendant
|
|
926 $in_slice = 0;
|
|
927 }
|
|
928
|
|
929
|
|
930 $slice = $slice->adaptor->fetch_by_region($region, $seq_region_name, ($sub_start + $orig_start -1), ($sub_end + $orig_start - 1), $strand, $version);
|
|
931 #Can't subslice as this will not clip if we go over the length of the slice, unlike normal slice fetching
|
|
932 #hence we cannot rely on this
|
|
933 #$slice = $orig_slice->sub_Slice($sub_start, $sub_end, $orig_slice->strand);
|
|
934 #warn "got sub slice $slice as $sub_start - $sub_end from ".$orig_slice->name;
|
|
935
|
|
936
|
|
937 ### Grab features and shift chunk coords
|
|
938 #features may already be a 0 wsize collection if we have projected from an old assembly
|
|
939 #Could move this check to get_Features_by_Slice?
|
|
940
|
|
941 #e.g. [ $features, \%config ]
|
|
942 $features = $self->get_Features_by_Slice($slice);
|
|
943 #next if scalar(@$features) == 0;#We want to store values for all windows
|
|
944
|
|
945 if( (@$features) &&
|
|
946 (ref($features->[0]) =~ /Bio::EnsEMBL::Funcgen::Collection/) ){#Change to isa 'Bio::EnsEMBL::Collection
|
|
947
|
|
948 #Check that the returned feature/collections support window_size
|
|
949
|
|
950 if($features->[0]->can('window_size')){
|
|
951
|
|
952 if($features->[0]->window_size != 0){
|
|
953 throw("You are trying to generated Collections from a non-zero window sized Collection:\t".$features->[1]->{'window_size'});
|
|
954 }
|
|
955
|
|
956 #This should never happen
|
|
957 if(! $skip_zero_window){
|
|
958 throw('You have retrieved data from a Collection which without using -skip_zero_window i.e. you are trying to generate overwrite the data you are generating the Collections from');
|
|
959 }
|
|
960 }
|
|
961 else{
|
|
962 throw('Something si wrong, the Collection you have retrieved does not support the method window_size');
|
|
963 }
|
|
964 }
|
|
965
|
|
966
|
|
967
|
|
968 #Set collection start here for 0 window_size
|
|
969 if(@$features && $store_natural && ! defined $self->collection_start(0)){
|
|
970 $self->collection_start(0, ($features->[0]->start + $sub_start));
|
|
971 }
|
|
972
|
|
973
|
|
974
|
|
975 $start_adj = $chunk_length if($in_slice);
|
|
976
|
|
977
|
|
978
|
|
979 #This should return a hash of window size => bin array pairs
|
|
980 if(! $only_natural){
|
|
981
|
|
982 $bins = $self->_bin_features_by_window_sizes(
|
|
983 -slice => $slice,
|
|
984 -window_sizes => $chunk_windows{$chunk_length},
|
|
985 -bin_method => $bin_method,
|
|
986 -features => $features,
|
|
987 );
|
|
988
|
|
989
|
|
990
|
|
991
|
|
992 }
|
|
993
|
|
994 #my $bin_start = $sr_start + $slice_adj;#This was only required for storing individual bins
|
|
995 #Could calc bin_start + slice_adjust ahere for all features
|
|
996 #Doing this will break old code for single window collections
|
|
997
|
|
998 #This is sr start and should be local to orig_slice!
|
|
999
|
|
1000
|
|
1001
|
|
1002
|
|
1003 #We need to handle strandedness of slice!?
|
|
1004
|
|
1005 #Store all normal features in result_feature
|
|
1006
|
|
1007
|
|
1008
|
|
1009 if($store_natural){
|
|
1010
|
|
1011 foreach my $feature(@$features){
|
|
1012 $counts{0}++;
|
|
1013 #warn "storing ".join(', ', ($feature->start, $feature->end, $feature->strand, $feature->scores->[0]));
|
|
1014
|
|
1015
|
|
1016 #Should we handle bin trimming here for overhanging slices
|
|
1017 #Then counts wil be correct and wont have to do in caller
|
|
1018
|
|
1019 #We could stop here if the feature seq_region start > orig_slice end
|
|
1020 #Current done in write/store_collection
|
|
1021 #This may mean working in seq_region values rather than slice values
|
|
1022
|
|
1023
|
|
1024 #write_collection is implemented in descendant e.g. Bio::EnsEMBL::Funcgen::Collector::ResultFeature
|
|
1025 #as wrapper to adaptor store method or print to file
|
|
1026
|
|
1027 #These params need to be generated in a way defined by the descendant
|
|
1028 #
|
|
1029
|
|
1030 if($bin_model eq 'SIMPLE'){
|
|
1031 #We need to pass the slice with this so we can sub slice when storing
|
|
1032 #the collection and set the start/end to 1 and length of slice
|
|
1033 #we still need to store the first start to be able to sub slice correctly
|
|
1034
|
|
1035 $self->collection_start(0, ($feature->start + $sub_start));
|
|
1036
|
|
1037 #Need to pass strand for 0 resolution
|
|
1038 $self->write_collection(0,
|
|
1039 $orig_slice,
|
|
1040 #These are now wrt orig_slice
|
|
1041 #($feature->start + $sub_start),
|
|
1042 ($feature->end + $sub_start),
|
|
1043 $feature->strand,
|
|
1044 $feature->scores,
|
|
1045 );
|
|
1046
|
|
1047 #We can have problems here if the original score type
|
|
1048 #does not match the collected score type
|
|
1049 #For max magnitude this is not an issue
|
|
1050 #as we take the larget value from the bin
|
|
1051 #But for other methods this may not be true
|
|
1052 #e.g. count
|
|
1053 #Hence, if we want to preserve the 0 window
|
|
1054 #We must account for this in the feature collector
|
|
1055 #e.g. set_collection_defs_by_ResultSet_window_size?
|
|
1056 #Just omit 0 window for reads
|
|
1057
|
|
1058 }
|
|
1059 }
|
|
1060
|
|
1061 print "Window size 0 (natural resolution) has ".scalar(@{$features})." feature bins for:\t".$slice->name."\n";
|
|
1062 }
|
|
1063
|
|
1064 #Now store bins
|
|
1065 # my ($bin_end, $bin_scores);
|
|
1066 my $num_bins;
|
|
1067
|
|
1068 foreach my $wsize(sort keys %{$bins}){
|
|
1069 $num_bins = scalar(@{$bins->{$wsize}});
|
|
1070 #warn "$num_bins bin scores for $wsize:\t".join(',', @{$bins->{$wsize}});
|
|
1071
|
|
1072 #Should we handle bin trimming here for overhanging slices
|
|
1073 #Then counts wil be correct and wont have to do in caller
|
|
1074
|
|
1075
|
|
1076 $counts{$wsize}+= $num_bins;
|
|
1077
|
|
1078
|
|
1079
|
|
1080 #We don't need this loop for collections as we can simply push all the scores at once
|
|
1081 #Just use the slice start and end
|
|
1082 if($bin_model eq 'SIMPLE'){
|
|
1083
|
|
1084 $self->write_collection($wsize,
|
|
1085 $orig_slice,
|
|
1086 #$sub_start,
|
|
1087 $sub_end,
|
|
1088 $orig_slice->strand,#This is most likely 1!
|
|
1089 #Override this woth 0 in descendant Collector if required.
|
|
1090 $bins->{$wsize},
|
|
1091 );
|
|
1092
|
|
1093 }
|
|
1094 else{
|
|
1095 throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models');
|
|
1096 #i.e. More than one score
|
|
1097 }
|
|
1098
|
|
1099
|
|
1100
|
|
1101 # #Reset start and end for new wsize
|
|
1102 # $bin_start = $slice->start;
|
|
1103 # $bin_end = $slice->start;
|
|
1104 #
|
|
1105 #
|
|
1106 #
|
|
1107 # #We don't need this loop for collections as we can simply push all the scores at once
|
|
1108 #
|
|
1109 #
|
|
1110 # foreach my $bin_index(0..$#{$bins->{$wsize}}){
|
|
1111 #
|
|
1112 #
|
|
1113 #
|
|
1114 # #default method to handle simple fixed width bin?
|
|
1115 # #bin_end need to be defined dependant on the bin type
|
|
1116 # #($bin_start) = $self->process_default_bin($bins->{$wsize}->[$bin_index], $wsize);#?
|
|
1117 #
|
|
1118 #
|
|
1119 #
|
|
1120 # #either define default bin method in descendant
|
|
1121 # #Or can we set a process_bin_method var?
|
|
1122 # #No just pass all this info to write collection and handle it there?
|
|
1123 #
|
|
1124 # #Can we have just predefined rotueines handling different bin types?
|
|
1125 # #Simple
|
|
1126 # #Simple compressed
|
|
1127 # #Clipped
|
|
1128 # #This will prevent hanving to make attrs/method for storing persistent start/end/score info
|
|
1129 #
|
|
1130 #
|
|
1131 #
|
|
1132 # #Need validate bin_type method
|
|
1133 # #Could convert these to numbers for speed as with binning methods
|
|
1134 #
|
|
1135 # if($bin_model eq 'SIMPLE'){
|
|
1136 #
|
|
1137 # $bin_scores = $bins->{$wsize}->[$bin_index];
|
|
1138 #
|
|
1139 # warn "bin scores is $bin_scores";
|
|
1140 #
|
|
1141 #
|
|
1142 # #next if ! $bin_score;#No we're no inc'ing the start ends for bins with no scores
|
|
1143 #
|
|
1144 # $bin_end += $wsize;
|
|
1145 #
|
|
1146 # #if($bin_score){#Removed this as we always want to write the score even if it is 0
|
|
1147 #
|
|
1148 # #This is a little backwards as we are generating the object to store it
|
|
1149 # #If we are aiming for speed the maybe we could also commodotise the store method
|
|
1150 # #store by args arrays? store_fast?
|
|
1151 # #Speed not essential for storing!
|
|
1152 #
|
|
1153 # #Note: list ref passed
|
|
1154 #
|
|
1155 # #Don't need to pass all this info for fixed width blob collections
|
|
1156 # #Need to write some default handlers depedant on the collection type
|
|
1157 # #Simple(original)
|
|
1158 # #Simple compressed
|
|
1159 # #Multi compressed
|
|
1160 # #Clipped uncompressed?
|
|
1161 #
|
|
1162 #
|
|
1163 # $self->write_collection($wsize,
|
|
1164 # $orig_slice,
|
|
1165 # ($bin_start + $slice_adj),
|
|
1166 # ($bin_end + $slice_adj),
|
|
1167 # $orig_slice->strand,#This is most likely 0
|
|
1168 # $bin_scores,
|
|
1169 # );
|
|
1170 #
|
|
1171 # #Only count if we have a stored(projected?) feature
|
|
1172 # $count++;#Change this to attr/method?
|
|
1173 # #}
|
|
1174 #
|
|
1175 # $bin_start += $wsize;
|
|
1176 # }
|
|
1177 # else{
|
|
1178 # throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models');
|
|
1179 # }
|
|
1180 # }
|
|
1181
|
|
1182 #warn "Window size $wsize has ".scalar(@{$bins->{$wsize}})." bins";
|
|
1183 #$counts{$wsize}+= $count;
|
|
1184 }
|
|
1185 }
|
|
1186
|
|
1187 $store_natural = 0; #Turn off storing of natural resolution for next chunk length sets
|
|
1188 }
|
|
1189
|
|
1190 #Now need to write last collections for each wsize
|
|
1191
|
|
1192 foreach my $wsize(@{$self->window_sizes}){
|
|
1193
|
|
1194 next if $wsize == 0 && ! $store_natural;
|
|
1195 next if $wsize != 0 && $only_natural;
|
|
1196
|
|
1197 print "Writing final $wsize window_size collection, this may result in slightly different bin numbers from counts due to removing overhanging bins past end of slice\n";
|
|
1198
|
|
1199 $self->write_collection($wsize, $orig_slice);#store last collection
|
|
1200 }
|
|
1201
|
|
1202
|
|
1203 #Print some counts here
|
|
1204 foreach my $wsize(sort (keys %counts)){
|
|
1205 print "Generated ".$counts{$wsize}." bins for window size $wsize for ".$orig_slice->name."\n";
|
|
1206 #Some may have failed to store if we are projecting to a new assembly
|
|
1207 #Need collection count here too, but would need methods for this?
|
|
1208 }
|
|
1209
|
|
1210 #Return this counts hash so we can print/log from the caller, hence we don't print in here?
|
|
1211
|
|
1212 return;
|
|
1213 }
|
|
1214
|
|
1215
|
|
1216
|
|
1217 =head2 _bin_features_by_window_sizes
|
|
1218
|
|
1219 Args[0] : Bio::EnsEMBL::Slice
|
|
1220 Args[1] : ARRAYREF of window sizes
|
|
1221 Args[2] : int - bin method, currently defined by validate_bin_methods
|
|
1222 Args[3] : ARRAYREF of Bio::EnsEMBL::Features
|
|
1223 Example : $bins = $self->_bin_features_by_window_sizes(
|
|
1224 -slice => $slice,
|
|
1225 -window_sizes => $chunk_windows{$chunk_length},
|
|
1226 -bin_method => $bin_method,
|
|
1227 -features => $features,
|
|
1228 );
|
|
1229 Description: Bins feature scores for a given list of window sizes and predefined method number
|
|
1230 Returntype : HASHREF of scores per bin per window size
|
|
1231 Exceptions : Throws if bin method not supported
|
|
1232 Caller : store_window_bins_by_Slice
|
|
1233 Status : At Risk
|
|
1234
|
|
1235 =cut
|
|
1236
|
|
1237
|
|
1238 #To do
|
|
1239 # 1 Remove Bio::EnsEMBL::Feature dependancy? Or just create Features for non adaptor Collectors.
|
|
1240 # Is there a way we can skip the object generation in the adaptor completely and just
|
|
1241 # pass the values we need?
|
|
1242 # 2 Separate methods, so we can define custom methods in descendants?
|
|
1243 # 3 Expand %bins model to optionally be one of
|
|
1244 # the following dependant on binning method
|
|
1245 # Simple: fixed width containing arrays of scores for each window
|
|
1246 # Multi: fixed width containing multiple arrays of scores for each window
|
|
1247 # Non-simple?: Separate aggregated features, either fixed width or not, not BLOB!
|
|
1248 # Clipped: default fixed width with option to clip start and end. Needs start/end attrs
|
|
1249 # Can't store this in a blob due to non-standard start ends?
|
|
1250 # Most likely want more than one score here? Count/Density SNPs?
|
|
1251 # Removes data skew from standard window bins, would need to store each bin and post
|
|
1252 # process. Or do in line to avoid 2nd post-processing loop,requires awareness of when
|
|
1253 # we have moved to a new bin between features. This holds for overlapping and
|
|
1254 # non-overlapping features. Once we have observed a gap we need to clip the end of the
|
|
1255 # last bin and clip the start of the new bin. This requires knowing the greatest end
|
|
1256 # values from the last bin's feature. what if two overlapping features had the same
|
|
1257 # start and different end, would we see the longest last? Check default slice_fetch sort
|
|
1258
|
|
1259 sub _bin_features_by_window_sizes{
|
|
1260 my $this = shift;
|
|
1261 my ( $slice, $window_sizes, $method, $features ) =
|
|
1262 rearrange( [ 'SLICE', 'WINDOW_SIZES', 'BIN_METHOD', 'FEATURES' ], @_ );
|
|
1263
|
|
1264
|
|
1265 #Do this conditional on the Collection type
|
|
1266 #i.e. is collection seq_region blob then no else yes
|
|
1267 #if ( !defined($features) || !@{$features} ) { return {} }
|
|
1268
|
|
1269 #warn 'Processing '.scalar(@$features).' features for window sizes '.join(', ',@$window_sizes).' for slice '.$slice->name."\n";
|
|
1270
|
|
1271 #Set up some hashes to store data by window_size
|
|
1272 my (%bins, %nbins, %bin_counts);
|
|
1273 my $slice_start = $slice->start();
|
|
1274
|
|
1275 #Default handlers for
|
|
1276 #my($first_bin);
|
|
1277 #if ( $method == 0 || # 'count' or 'density'
|
|
1278 # $method == 3 || # 'fractional_count' or 'weight'
|
|
1279 # $method == 4 # 'coverage'
|
|
1280 # ){
|
|
1281 # # For binning methods where each bin contain numerical values.
|
|
1282 # $first_bin = 0;
|
|
1283 # }
|
|
1284 # else {
|
|
1285 # # For binning methods where each bin does not contain numerical
|
|
1286 # # values.
|
|
1287 #
|
|
1288 # #Remove this
|
|
1289 # $first_bin = undef;
|
|
1290 # }
|
|
1291
|
|
1292
|
|
1293 #Set up some bin data for the windows
|
|
1294 my $slice_length = $slice->length;
|
|
1295
|
|
1296 foreach my $wsize (@$window_sizes) {
|
|
1297 #TO DO: Need to modify this block if default 0's are undesirable for collection type
|
|
1298 #i.e. should it be undef instead? May have prolbems representing undef in blob
|
|
1299
|
|
1300 $nbins{$wsize} = int($slice_length / $wsize); #int rounds down
|
|
1301 #nbins is actually the index of the bin not the 'number'
|
|
1302 #Unless slice_Length is a multiple!
|
|
1303 $nbins{$wsize}-- if(! ($slice_length % $wsize));
|
|
1304
|
|
1305 #Create default bins with 0
|
|
1306 @{$bins{$wsize}} = ();
|
|
1307 map {$bins{$wsize}->[$_] = 0} (0 .. $nbins{$wsize});
|
|
1308
|
|
1309 #Set bin counts to 0 for each bin
|
|
1310 @{$bin_counts{$wsize}} = ();
|
|
1311
|
|
1312 #This is adding an undef to the start of the array!?
|
|
1313 map { $bin_counts{$wsize}->[($_)] = 0 } @{$bins{$wsize}};
|
|
1314
|
|
1315 foreach my $bin(@{$bins{$wsize}}){
|
|
1316 $bin_counts{$wsize}->[$bin] = 0;
|
|
1317 }
|
|
1318 }
|
|
1319
|
|
1320 #warn "bin_counts are :\n".Data::Dumper::Dumper(\%bin_counts);
|
|
1321 #This fails for slices which are smaller than the chunk length;
|
|
1322 my $feature_index = 0;
|
|
1323 my ($bin_index, @bin_masks);
|
|
1324
|
|
1325 foreach my $feature ( @{$features} ) {
|
|
1326 #Set up the bins for each window size
|
|
1327
|
|
1328 #Omit test for Bio::EnsEMBL::Feature here for speed
|
|
1329 #Only needs start/end methods
|
|
1330
|
|
1331 foreach my $wsize (@$window_sizes) {
|
|
1332
|
|
1333 #We have already highjacked the object creation by here
|
|
1334 #This is done in core BaseFeatureAdaptor
|
|
1335 #We probably don't want to do this for ResultFeatures as we don't use the
|
|
1336 #standard feature implementation
|
|
1337 #we already use an array and we don't store the slice
|
|
1338 #as this is already known by the caller
|
|
1339 #and we always build on top level so we don't need to remap
|
|
1340
|
|
1341 #We do however need the slice to store, as we only store local starts when generating
|
|
1342 #We need a store by Slice method?
|
|
1343 #This will remove the need to inherit from Feature.
|
|
1344 #These will need to be regenerated everytime we import a new build
|
|
1345 #As we do with the probe_features themselves
|
|
1346 #This also mean the result_feature status has to be associated with a coord_system_id
|
|
1347
|
|
1348 #Which bins do the start and end lie in for this feature?
|
|
1349 #Already dealing with local starts, so no slice subtraction
|
|
1350 #Could wrap these start/end methods via the descendant Collector
|
|
1351 #to remove the Feature dependancy? Or just create Features when parsing in the caller
|
|
1352 my $start_bin = int(($feature->start ) / $wsize);
|
|
1353 my $end_bin = int(($feature->end) / $wsize );
|
|
1354 $end_bin = $nbins{$wsize} if $end_bin > $nbins{$wsize};
|
|
1355
|
|
1356
|
|
1357
|
|
1358 #Slightly obfuscated code to match method number(faster)
|
|
1359 #by avoiding string comparisons.
|
|
1360 #Could call methods directly using coderef set in validate_bin_method
|
|
1361 #Accessor may slow things down, but should be uniform for all methods
|
|
1362 #rather than being dependant on position in if/else block below
|
|
1363
|
|
1364 #reserve 0 for descendant defined method?
|
|
1365 #There fore always fastest in this block, or use coderefs?
|
|
1366 if ( $method == 0 ) {
|
|
1367 # ----------------------------------------------------------------
|
|
1368 # For 'count' and 'density'.
|
|
1369
|
|
1370 for ( $bin_index = $start_bin ;
|
|
1371 $bin_index <= $end_bin ;
|
|
1372 ++$bin_index ) {
|
|
1373
|
|
1374 $bins{$wsize}->[$bin_index]++;
|
|
1375
|
|
1376 #warn "setting $wsize bin $bin_index to ". $bins{$wsize}->[$bin_index];
|
|
1377
|
|
1378 }
|
|
1379 }
|
|
1380
|
|
1381 =pod
|
|
1382
|
|
1383 } elsif ( $method == 1 ) {
|
|
1384 # ----------------------------------------------------------------
|
|
1385 # For 'indices' and 'index'
|
|
1386
|
|
1387
|
|
1388 #How is this useful?
|
|
1389 #Is this not just count per bin?
|
|
1390 #No this is a list of the feature indices
|
|
1391 #So forms a distribution?
|
|
1392
|
|
1393 throw('Not implemented for method for index');
|
|
1394
|
|
1395 for ( my $bin_index = $start_bin ;
|
|
1396 $bin_index <= $end_bin ;
|
|
1397 ++$bin_index ) {
|
|
1398 push( @{ $bins[$bin_index] }, $feature_index );
|
|
1399 }
|
|
1400
|
|
1401 ++$feature_index;
|
|
1402
|
|
1403 } elsif ( $method == 2 ) {
|
|
1404 # ----------------------------------------------------------------
|
|
1405 # For 'features' and 'feature'.
|
|
1406
|
|
1407 throw('Not implemented for method for feature/features');
|
|
1408
|
|
1409 for ( my $bin_index = $start_bin ;
|
|
1410 $bin_index <= $end_bin ;
|
|
1411 ++$bin_index ) {
|
|
1412 push( @{ $bins[$bin_index] }, $feature );
|
|
1413 }
|
|
1414
|
|
1415 } elsif ( $method == 3 ) {
|
|
1416 # ----------------------------------------------------------------
|
|
1417 # For 'fractional_count' and 'weight'.
|
|
1418
|
|
1419
|
|
1420 throw('Not implemented for method for fractional_count/weight');
|
|
1421
|
|
1422 if ( $start_bin == $end_bin ) {
|
|
1423 ++$bins[$start_bin];
|
|
1424 } else {
|
|
1425
|
|
1426 my $feature_length =
|
|
1427 $feature->[FEATURE_END] - $feature->[FEATURE_START] + 1;
|
|
1428
|
|
1429 # The first bin...
|
|
1430 $bins[$start_bin] +=
|
|
1431 ( ( $start_bin + 1 )*$bin_length -
|
|
1432 ( $feature->[FEATURE_START] - $slice_start ) )/
|
|
1433 $feature_length;
|
|
1434
|
|
1435 # The intermediate bins (if there are any)...
|
|
1436 for ( my $bin_index = $start_bin + 1 ;
|
|
1437 $bin_index <= $end_bin - 1 ;
|
|
1438 ++$bin_index ) {
|
|
1439 $bins[$bin_index] += $bin_length/$feature_length;
|
|
1440 }
|
|
1441
|
|
1442 # The last bin...
|
|
1443 $bins[$end_bin] +=
|
|
1444 ( ( $feature->[FEATURE_END] - $slice_start ) -
|
|
1445 $end_bin*$bin_length +
|
|
1446 1 )/$feature_length;
|
|
1447
|
|
1448 } ## end else [ if ( $start_bin == $end_bin)
|
|
1449
|
|
1450 }
|
|
1451 elsif ( $method == 4 ) {
|
|
1452 # ----------------------------------------------------------------
|
|
1453 # For 'coverage'.
|
|
1454
|
|
1455 #What exactly is this doing?
|
|
1456 #This is coverage of bin
|
|
1457 #Rather than coverage of feature as in fractional_count
|
|
1458
|
|
1459
|
|
1460 # my $feature_start = $feature->[FEATURE_START] - $slice_start;
|
|
1461 # my $feature_end = $feature->[FEATURE_END] - $slice_start;
|
|
1462 #
|
|
1463 # if ( !defined( $bin_masks[$start_bin] )
|
|
1464 # || ( defined( $bin_masks[$start_bin] )
|
|
1465 # && $bin_masks[$start_bin] != 1 ) ) {
|
|
1466 # # Mask the $start_bin from the start of the feature to the end
|
|
1467 # # of the bin, or to the end of the feature (whichever occurs
|
|
1468 # # first).
|
|
1469 # my $bin_start = int( $start_bin*$bin_length );
|
|
1470 # my $bin_end = int( ( $start_bin + 1 )*$bin_length - 1 );
|
|
1471 # for ( my $pos = $feature_start;
|
|
1472 # $pos <= $bin_end && $pos <= $feature_end ;
|
|
1473 # ++$pos ) {
|
|
1474 # $bin_masks[$start_bin][ $pos - $bin_start ] = 1;
|
|
1475 # }
|
|
1476 # }
|
|
1477 #
|
|
1478 # for ( my $bin_index = $start_bin + 1 ;
|
|
1479 # $bin_index <= $end_bin - 1 ;
|
|
1480 # ++$bin_index ) {
|
|
1481 # # Mark the middle bins between $start_bin and $end_bin as fully
|
|
1482 # # masked out.
|
|
1483 # $bin_masks[$bin_index] = 1;
|
|
1484 # }
|
|
1485 #
|
|
1486 # if ( $end_bin != $start_bin ) {
|
|
1487 #
|
|
1488 # if ( !defined( $bin_masks[$end_bin] )
|
|
1489 # || ( defined( $bin_masks[$end_bin] )
|
|
1490 # && $bin_masks[$end_bin] != 1 ) ) {
|
|
1491 # # Mask the $end_bin from the start of the bin to the end of
|
|
1492 # # the feature, or to the end of the bin (whichever occurs
|
|
1493 # # first).
|
|
1494 # my $bin_start = int( $end_bin*$bin_length );
|
|
1495 # my $bin_end = int( ( $end_bin + 1 )*$bin_length - 1 );
|
|
1496 # for ( my $pos = $bin_start ;
|
|
1497 # $pos <= $feature_end && $pos <= $bin_end ;
|
|
1498 # ++$pos ) {
|
|
1499 # $bin_masks[$end_bin][ $pos - $bin_start ] = 1;
|
|
1500 # }
|
|
1501 # }
|
|
1502 # }
|
|
1503 # } ## end elsif ( $method == 4 )
|
|
1504
|
|
1505 =cut
|
|
1506
|
|
1507
|
|
1508 elsif ( $method == 5 ) {
|
|
1509 #$self->$method($bin_index, $start_bin, $end_bin, $wsize, \%bins, \%bin_counts);
|
|
1510
|
|
1511
|
|
1512 #average score
|
|
1513 #This is simple an average of all the scores for features which overlap this bin
|
|
1514 #No weighting with respect to the bin or the feature
|
|
1515
|
|
1516 for ( $bin_index = $start_bin ;
|
|
1517 $bin_index <= $end_bin ;
|
|
1518 ++$bin_index ) {
|
|
1519
|
|
1520 #we should really push onto array here so we can have median or mean.
|
|
1521 $bins{$wsize}->[$bin_index] += $this->get_score_by_Feature($feature);
|
|
1522 $bin_counts{$wsize}->[$bin_index]++;
|
|
1523 }
|
|
1524 }
|
|
1525 elsif( $method == 6){
|
|
1526 #Max magnitude
|
|
1527 #Take the highest value +ve or -ve score
|
|
1528 for ( $bin_index = $start_bin ;
|
|
1529 $bin_index <= $end_bin ;
|
|
1530 ++$bin_index ) {
|
|
1531
|
|
1532 #we really need to capture the lowest -ve and higest +ve scores here and post process
|
|
1533 #To pick between them
|
|
1534
|
|
1535 my $score = $this->get_score_by_Feature($feature);
|
|
1536 #Write score method as wrapper to scores?
|
|
1537
|
|
1538 $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve
|
|
1539
|
|
1540
|
|
1541 #warn "Comparing wsize $wsize bin $bin_index score $score to ". $bins{$wsize}->[$bin_index]->[0].' '.$bins{$wsize}->[$bin_index]->[1]."\n";
|
|
1542
|
|
1543 if($score < $bins{$wsize}->[$bin_index]->[0]){
|
|
1544 #warn "setting -ve bin to $score\n";
|
|
1545 $bins{$wsize}->[$bin_index]->[0] = $score;
|
|
1546 }
|
|
1547 elsif($score > $bins{$wsize}->[$bin_index][1]){
|
|
1548 #warn "setting +ve bin to $score\n";
|
|
1549 $bins{$wsize}->[$bin_index]->[1] = $score;
|
|
1550 }
|
|
1551 }
|
|
1552 }
|
|
1553 else {
|
|
1554 throw("Only accomodates average score method");
|
|
1555 }
|
|
1556
|
|
1557
|
|
1558 }
|
|
1559
|
|
1560 } ## end foreach my $feature ( @{$features...
|
|
1561
|
|
1562
|
|
1563 #Now do post processing of bins
|
|
1564
|
|
1565 =pod
|
|
1566
|
|
1567 if ( $method == 4 ) {
|
|
1568
|
|
1569 # ------------------------------------------------------------------
|
|
1570 # For the 'coverage' method: Finish up by going through @bin_masks
|
|
1571 # and sum up the arrays.
|
|
1572
|
|
1573 for ( my $bin_index = 0 ; $bin_index < $nbins ; ++$bin_index ) {
|
|
1574 if ( defined( $bin_masks[$bin_index] ) ) {
|
|
1575 if ( !ref( $bin_masks[$bin_index] ) ) {
|
|
1576 $bins[$bin_index] = 1;
|
|
1577 } else {
|
|
1578 $bins[$bin_index] =
|
|
1579 scalar( grep ( defined($_), @{ $bin_masks[$bin_index] } ) )/
|
|
1580 $bin_length;
|
|
1581 }
|
|
1582 }
|
|
1583 }
|
|
1584 }
|
|
1585
|
|
1586 =cut
|
|
1587
|
|
1588 if( $method == 5){
|
|
1589 #For average score, need to divide bins by bin_counts
|
|
1590
|
|
1591 foreach my $wsize(keys %bins){
|
|
1592
|
|
1593 foreach my $bin_index(0..$#{$bins{$wsize}}){
|
|
1594
|
|
1595 if($bin_counts{$wsize}->[$bin_index]){
|
|
1596 $bins{$wsize}->[$bin_index] /= $bin_counts{$wsize}->[$bin_index];
|
|
1597 }
|
|
1598 #warn "bin_index $wsize:$bin_index has score ".$bins{$wsize}->[$bin_index];
|
|
1599 }
|
|
1600 }
|
|
1601 }
|
|
1602 elsif( $method == 6){
|
|
1603 #Max magnitude
|
|
1604 #Take the highest value +ve or -ve score
|
|
1605
|
|
1606 foreach my $wsize(keys %bins){
|
|
1607
|
|
1608 foreach my $bin_index(0..$#{$bins{$wsize}}){
|
|
1609
|
|
1610 #So we have the potential that we have no listref in a given bin
|
|
1611
|
|
1612 #default value if we haven't seen anything is 0
|
|
1613 #we actually want an array of -ve +ve values
|
|
1614
|
|
1615 #warn "Are we storing 0 values for absent data?";
|
|
1616 #Not for max_magnitude, but maybe for others?
|
|
1617
|
|
1618 if($bins{$wsize}->[$bin_index]){
|
|
1619 #warn $wsize.':'.$bin_index.':'.$bins{$wsize}->[$bin_index]->[0].'-'.$bins{$wsize}->[$bin_index]->[1];
|
|
1620 my $tmp_minus = $bins{$wsize}->[$bin_index]->[0] * -1;
|
|
1621
|
|
1622 if($tmp_minus > $bins{$wsize}->[$bin_index]->[1]){
|
|
1623 $bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[0];
|
|
1624 }
|
|
1625 else{
|
|
1626 $bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[1];
|
|
1627 }
|
|
1628
|
|
1629 #warn "bin $bin_index now ". $bins{$wsize}->[$bin_index];
|
|
1630 }
|
|
1631 }
|
|
1632 }
|
|
1633 }
|
|
1634 elsif($method != 0){#Do no post processing for count(0)
|
|
1635 throw('Collector currently only accomodates average_score, count and max magnitude methods');
|
|
1636 }
|
|
1637
|
|
1638
|
|
1639 #Could return bin_counts too summary reporting in zmenu
|
|
1640 #Could also do counting of specific type
|
|
1641
|
|
1642 #warn "returning bins ".Data::Dumper::Dumper(\%bins);
|
|
1643
|
|
1644 return \%bins;
|
|
1645 } ## end sub _bin_features
|
|
1646
|
|
1647
|
|
1648 =pod
|
|
1649
|
|
1650 #These could potentially be used as code refs to avoid having the if else block
|
|
1651 #This way we can also define new methods in the descendant Collector?
|
|
1652 #Would have to have pass args and refs to bin hashes
|
|
1653 #This would slow things down over direct access here
|
|
1654 #But speed is no longer that critical as we do not use the Collector for display
|
|
1655 #purposes, only to build the Collections which are then used for display directly.
|
|
1656
|
|
1657 sub calculate_average_score{
|
|
1658 my $self = shift;
|
|
1659
|
|
1660 if ( $method == 5 ) {
|
|
1661 #average score
|
|
1662 #This is simple an average of all the scores for features which overlap this bin
|
|
1663 #No weighting with respect to the bin or the feature
|
|
1664
|
|
1665 for ( my $bin_index = $start_bin ;
|
|
1666 $bin_index <= $end_bin ;
|
|
1667 ++$bin_index ) {
|
|
1668
|
|
1669 #we should really push onto array here so we can have median or mean.
|
|
1670 $bins{$wsize}->[$bin_index] += $feature->score;
|
|
1671 $bin_counts{$wsize}->[$bin_index]++;
|
|
1672 }
|
|
1673 }
|
|
1674
|
|
1675
|
|
1676
|
|
1677 }
|
|
1678
|
|
1679
|
|
1680 sub post_process_average_score{
|
|
1681
|
|
1682 }
|
|
1683
|
|
1684
|
|
1685 sub calculate_max_magnitude{
|
|
1686 my $self = shift;
|
|
1687
|
|
1688 #Max magnitude
|
|
1689 #Take the highest value +ve or -ve score
|
|
1690 for ( my $bin_index = $start_bin ;
|
|
1691 $bin_index <= $end_bin ;
|
|
1692 ++$bin_index ) {
|
|
1693
|
|
1694 #we really need to capture the lowest -ve and higest +ve scores here and post process
|
|
1695 #To pick between them
|
|
1696
|
|
1697 my $score = $feature->score;
|
|
1698 $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve
|
|
1699
|
|
1700 if($score < $bins{$wsize}->[$bin_index]->[0]){
|
|
1701 $bins{$wsize}->[$bin_index]->[0] = $score;
|
|
1702 }
|
|
1703 elsif($score > $bins{$wsize}->[$bin_index][1]){
|
|
1704 $bins{$wsize}->[$bin_index]->[1] = $score;
|
|
1705 }
|
|
1706 }
|
|
1707 }
|
|
1708
|
|
1709
|
|
1710 sub post_process_max_magnitude{
|
|
1711
|
|
1712 }
|
|
1713
|
|
1714 =cut
|
|
1715
|
|
1716 #separated to allow addition of non-standard methods
|
|
1717 #Could potentially add these in new
|
|
1718 #and put this back in _bin_features
|
|
1719
|
|
1720
|
|
1721 sub validate_bin_method{
|
|
1722 my ($self, $method) = @_;
|
|
1723
|
|
1724
|
|
1725 #change this to set the coderefs
|
|
1726 #Just set anonymous sub to immediately return for non post processed methods
|
|
1727 #No need for coderef, just set the method name?
|
|
1728
|
|
1729 #if(! $self->can('calculate_'.$method)){
|
|
1730 #throw("$method method does not have a valid calculate_${method} method");
|
|
1731 #}
|
|
1732
|
|
1733 #if($self->can('post_process_'.$method)){
|
|
1734 ##set post process flag?
|
|
1735 #or simply do this can in line in the _bin_features sub?
|
|
1736 #}
|
|
1737
|
|
1738
|
|
1739
|
|
1740
|
|
1741 #Add average_score to avoid changing Collection.pm
|
|
1742 my $class = ref($self);
|
|
1743 ${$class::VALID_BINNING_METHODS}{'average_score'} = 5;
|
|
1744 ${$class::VALID_BINNING_METHODS}{'max_magnitude'} = 6;
|
|
1745 ${$class::VALID_BINNING_METHODS}{'count'} = 0;
|
|
1746
|
|
1747
|
|
1748
|
|
1749 #foreach my $method_name(keys %{$class::VALID_BINNING_METHODS}){
|
|
1750 # warn "valid method is $method name";
|
|
1751 # }
|
|
1752
|
|
1753
|
|
1754 if ( ! exists( ${$class::VALID_BINNING_METHODS}{$method} ) ) {
|
|
1755 throw(
|
|
1756 sprintf(
|
|
1757 "Invalid binning method '%s', valid methods are:\n\t%s\n",
|
|
1758 $method,
|
|
1759 join( "\n\t", sort( keys(%{$class::VALID_BINNING_METHODS}) ) ) ) );
|
|
1760 }
|
|
1761 else{
|
|
1762 #warn "found valid method $method with index ".${$class::VALID_BINNING_METHODS}{$method};
|
|
1763 }
|
|
1764
|
|
1765 return ${$class::VALID_BINNING_METHODS}{$method};
|
|
1766 }
|
|
1767
|
|
1768
|
|
1769
|
|
1770 1;
|