Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: Collector.pm,v 1.7 2011/01/10 11:27:34 nj1 Exp $ | |
2 | |
3 | |
4 =head1 LICENSE | |
5 | |
6 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
7 Genome Research Limited. All rights reserved. | |
8 | |
9 This software is distributed under a modified Apache license. | |
10 For license details, please see | |
11 | |
12 http://www.ensembl.org/info/about/code_licence.html | |
13 | |
14 =head1 CONTACT | |
15 | |
16 Please email comments or questions to the public Ensembl | |
17 developers list at <ensembl-dev@ebi.ac.uk>. | |
18 | |
19 Questions may also be sent to the Ensembl help desk at | |
20 <helpdesk@ensembl.org>. | |
21 | |
22 =cut | |
23 | |
24 #Your Bio::Ensembl::Collection::Feature defs module should inherit from here | |
25 #This could be a local defs file which you have created and require'd into your script | |
26 | |
27 #If your collections defs module refers to a Bio::EnsEMBL::Feature, | |
28 #then it's adaptor should inherit from the collections defs module | |
29 | |
30 | |
31 | |
32 package Bio::EnsEMBL::Funcgen::Collector; | |
33 #Move this to Bio::EnsEMBL::Utils::Collector for 59? | |
34 | |
35 | |
36 use strict; | |
37 use warnings; | |
38 | |
39 use Bio::EnsEMBL::Utils::Argument ('rearrange'); | |
40 use Bio::EnsEMBL::Utils::Exception ('throw'); | |
41 use Bio::EnsEMBL::Funcgen::ResultFeature; | |
42 | |
43 #use base('Bio::EnsEMBL::Collection');#ISA | |
44 | |
45 our ($pack_template, $packed_size, @window_sizes); #These get set in the FeatureAdaptor | |
46 #Make these constants and remove setter functionality in methods? | |
47 #Only really important for pack template and windows, maybe these if we are going to start | |
48 | |
49 | |
50 our $max_data_type_size = 16777216; #Default is 16MB for long blob | |
51 #we need to deduct the size of the rest of the record here! | |
52 #For a 2byte packet the smallest window size possible is: | |
53 #(slice->length/(16777216/2) | |
54 #so int(bin_size)+1 | |
55 #Obviously have to use the largest slice here, for human chr1: | |
56 #249,250,621/(16777216/2) = 29.7??? | |
57 #We may need to up this slightly to account for larger chrs? | |
58 #Implications on memory usage? Is it 4 times for blob manipulation? | |
59 #Does substr require this manipulation? | |
60 #This max_allowed_packet_size does not seem to translate directly to the size of the | |
61 #data being stored e.g. quite a bit more is needed. ISG haven't got to the bottom of this yet. | |
62 #But have simply upped the config to 67108864 to handle the largest human chr. | |
63 | |
64 our $max_view_width = 500000;#Max width in Region In Detail; | |
65 | |
66 | |
67 #our %VALID_BINNING_METHODS | |
68 #Remove this in favour of can->('calculate_.$method) and coderefs? | |
69 | |
70 | |
71 | |
72 | |
73 #To do | |
74 # 1 DONE Merge in Collection code, (no need to do this, removed inheritance) | |
75 # 2 Write simple BED input to flat file output. | |
76 # 3 Separate store method so we can simply get, then wrap store around this | |
77 # 4 Test get method with slice adjusts | |
78 # 5 separate set_config? | |
79 # 6 optimise generate_bin_chunks to handle just one window size for display? | |
80 # 7 Handle packed_size pack_template as methods constants | |
81 # 8 Provide override method in basefeature adaptor which will use package constant in feature adaptor | |
82 # This is because these are really adaptor config, the collector only needs to know the | |
83 # packed_size, and in the absence of an feature adaptor also provides the default methods for both. | |
84 # If we substr in the API then we need to set sensible limits on blob size, otherwise we will have to unpack a lot of data | |
85 # to get at the slice we want. | |
86 # OR | |
87 # Change adaptor to substr in DB based on known blob ranges/window size | |
88 # and stitch together any which cross boundaries. This depends on speed of substr at end of large blob TEST! | |
89 # Load with current code first and test this before making either change! | |
90 # Delete empty (non-0) collections? i.e. For seq_regions which do not have any source features. | |
91 # | |
92 # 9 Handle PAR/HAP regions using fetch_normalised_slice_projections This has to be done in the feature adaptor! Then restrict to non_dup regions in calling script | |
93 | |
94 | |
95 | |
96 =head2 new | |
97 | |
98 Args : None | |
99 Example : my $collector = Bio::EnsEMBL::(Funcgen|Compara|Variation::)Collector::FEATURE->new; | |
100 $collector->store_windows_by_Slice($slice); | |
101 Description: Simple new method to enable use of collector when not inherited by | |
102 a descendant of Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor | |
103 Returntype : Bio::EnsEMBL::Funcgen::Collector | |
104 Exceptions : None | |
105 Caller : Collector script | |
106 Status : At Risk | |
107 | |
108 =cut | |
109 | |
110 sub new{ | |
111 return bless {}, $_[0];#Simple blesses this class as an empty hash | |
112 | |
113 #Do not set anything here | |
114 #As will not be first in ISA for feature adaptors | |
115 #Hence not guaranteed to be called | |
116 | |
117 } | |
118 | |
119 #Setter/Getter methods if we don't have a dedicated Collector module | |
120 #to set the package variables in? Also to allow overriding of defaults. | |
121 #This can be used by the write_collection method | |
122 #to determine when to build and store a compressed collection | |
123 #Effectively the max size of the data type you are using to store | |
124 #a compressed score. defaults to max for long blob 16MB | |
125 | |
126 | |
127 | |
128 #Generic method, but only ever called by write_collection in descendant | |
129 | |
130 sub new_assembly{ | |
131 my ($self, $new_ass) = @_; | |
132 | |
133 if($new_ass){ | |
134 #Validate new assm to project to | |
135 | |
136 | |
137 $self->{'new_assembly'} = $new_ass; | |
138 } | |
139 | |
140 return $self->{'new_assembly'}; | |
141 } | |
142 | |
143 | |
144 sub max_data_type_size{ | |
145 my ($self, $size) = @_; | |
146 | |
147 #Validate is sensible integer? | |
148 | |
149 if($size && ! int($size)){ | |
150 throw("max_data_type_size must be a integer of bytes, not $size"); | |
151 } | |
152 elsif($size){ | |
153 $self->{'max_data_type_size'} = $size; | |
154 } | |
155 elsif(! defined $self->{'max_data_type_size'}){ | |
156 #default set at head of this module or in descendant Collector | |
157 $self->{'max_data_type_size'} = $Bio::EnsEMBL::Funcgen::Collector::max_data_type_size; | |
158 } | |
159 | |
160 return $self->{'max_data_type_size'}; | |
161 } | |
162 | |
163 sub max_view_width{ | |
164 my ($self, $size) = @_; | |
165 | |
166 #Validate is sensible integer? | |
167 | |
168 if($size && ! int($size)){ | |
169 throw("max_view_width must be a integer, not $size"); | |
170 } | |
171 elsif($size){ | |
172 $self->{'max_view_width'} = $size; | |
173 } | |
174 elsif(! defined $self->{'max_view_width'}){ | |
175 #default set at head of this module or in descendant Collector | |
176 $self->{'max_view_width'} = $Bio::EnsEMBL::Funcgen::Collector::max_view_width; | |
177 } | |
178 | |
179 return $self->{'max_view_width'}; | |
180 } | |
181 | |
182 | |
183 sub bins_per_record(){ | |
184 #$collector_class::bins_per_record = ($collector_class::max_data_type_size/$collector_class::packed_size);#This should be done dynamically as we may redefine either of these variables? | |
185 | |
186 my ($self) = shift; | |
187 | |
188 return int($self->max_data_type_size/$self->packed_size); | |
189 } | |
190 | |
191 | |
192 #The defaults for these should be defined in the feature/format specific Collector descendant | |
193 #either by specifying the package variables or using config attrs to set methods? | |
194 #general config should be parsed here. | |
195 #rename bin_method? | |
196 | |
197 sub bin_method{ | |
198 my ($self, $method) = @_; | |
199 | |
200 if($method || ! $self->{'bin_method'}){ | |
201 | |
202 if($method){ | |
203 $self->{'bin_method'} = $method; | |
204 #should test can here? or validate versus hash? | |
205 } | |
206 elsif(! $self->{'bin_method'}){ | |
207 | |
208 if (! defined $Bio::EnsEMBL::Funcgen::Collector::bin_method){ | |
209 throw('Must pass a bin_method in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_method in your Collector'); | |
210 } | |
211 | |
212 $self->{'bin_method'} = $Bio::EnsEMBL::Funcgen::Collector::bin_method; | |
213 } | |
214 | |
215 #or current validate method if we are keeping the method in the if/else block | |
216 | |
217 | |
218 #if(! $self->can("calculate_${method}"))){ | |
219 #throw("$method is no a valid a valid binning method"); | |
220 #} | |
221 | |
222 } | |
223 | |
224 return $self->{'bin_method'}; | |
225 } | |
226 | |
227 #We could replace this with a hash of bin_methods and models? | |
228 #This could then be used to validate | |
229 #Altho if we are going to commodotise the bin methods, then we need to be able to | |
230 #define this in the child Collector. Could still do this by modifying the method/model | |
231 #hash from the child Collector | |
232 | |
233 sub bin_model{ | |
234 my ($self, $bin_model) = @_; | |
235 | |
236 if($bin_model || ! $self->{'bin_model'}){ | |
237 | |
238 if($bin_model){ | |
239 $self->{'bin_model'} = $bin_model; | |
240 } | |
241 elsif(! $self->{'bin_model'}){ | |
242 | |
243 #Set as global constant defined in descendant Collector | |
244 if (! defined $Bio::EnsEMBL::Funcgen::Collector::bin_model){ | |
245 throw('Must pass -bin_model in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_model in your Collector'); | |
246 } | |
247 | |
248 $self->{'bin_model'} = $Bio::EnsEMBL::Funcgen::Collector::bin_model; | |
249 } | |
250 | |
251 #Need to validate bin models here | |
252 throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models') if $self->{'bin_model'} ne 'SIMPLE'; | |
253 } | |
254 | |
255 return $self->{'bin_model'}; | |
256 } | |
257 | |
258 #This can be overridden by adaptor method | |
259 #At present this could cause problems as we can pass window sizes in the config, but they will never be set | |
260 #as adaptor method is not a setter. Adaptor method should throw if we try and set them as this could cause problems when fetching and not knowing the custom sizes? | |
261 | |
262 sub window_sizes{ | |
263 my ($self, $sizes) = @_; | |
264 | |
265 if($sizes || ! $self->{'window_sizes'}){ | |
266 | |
267 if($sizes){ | |
268 $self->{'window_sizes'} = $sizes; | |
269 } | |
270 else{#! $self->{'windows_sizes' | |
271 | |
272 if (! @window_sizes){ | |
273 throw('Must pass -windows_sizes in the config or define @Bio::EnsEMBL::Funcgen::Collector::window_sizes in your Collector'); | |
274 } | |
275 | |
276 @{$self->{'window_sizes'}} = @window_sizes; | |
277 } | |
278 | |
279 if(ref($self->{'window_sizes'}) ne 'ARRAY' || | |
280 scalar(@{$self->{'window_sizes'}}) == 0){ | |
281 throw('window_sizes must be an arrayref of at least one window size'); | |
282 } | |
283 } | |
284 | |
285 return $self->{'window_sizes'}; | |
286 } | |
287 | |
288 | |
289 | |
290 | |
291 #Optional attrs dependant on whether Collection is packed | |
292 #Can be redefined in the adaptor but becareful never to redefine the actual values | |
293 #As these should really be constants for a given Collector | |
294 #What is best here? We only want pack methods for storing/fetching compressed collections | |
295 #Move this to base feature adaptor and define attrs as constants using | |
296 #package variable? Or directly in new? | |
297 #Then direct modification will be caught. | |
298 #Just leave here for now. | |
299 | |
300 #Caller _obj_from_sth/store | |
301 | |
302 sub pack_template{ | |
303 my ($self, $template) = @_; | |
304 | |
305 if($template){ | |
306 $self->{'pack_template'} = $template; | |
307 } | |
308 elsif(! $self->{'pack_template'}){ | |
309 | |
310 #Set as global constant defined in descendant Collector | |
311 | |
312 if (! defined $Bio::EnsEMBL::Funcgen::Collector::pack_template){ | |
313 throw('Must pass a per score pack_template in the config or define $Bio::EnsEMBL::Funcgen::Collector::pack_template in your Collector'); | |
314 } | |
315 | |
316 $self->{'pack_template'} = $Bio::EnsEMBL::Funcgen::Collector::pack_template; | |
317 } | |
318 | |
319 return $self->{'pack_template'}; | |
320 | |
321 } | |
322 | |
323 #Caller _obj_from_sth/store & current_packed_size | |
324 | |
325 sub packed_size{ | |
326 my ($self, $size) = @_; | |
327 | |
328 if($size){ | |
329 | |
330 if(! int($size)){ | |
331 throw("$size is not an integer, must pass a size integer for packed_size which specifies size of pack_template:\t".$self->pack_template); | |
332 } | |
333 | |
334 $self->{'packed_size'} = $size; | |
335 } | |
336 elsif(! $self->{'packed_size'}){ | |
337 | |
338 #Set as global constant defined in descendant Collector | |
339 | |
340 if (! defined $Bio::EnsEMBL::Funcgen::Collector::packed_size){ | |
341 throw('Must pass a packed_size(wrt to pack_template) in the config or define $Bio::EnsEMBL::Funcgen::Collector::packed_size in your Collector'); | |
342 } | |
343 | |
344 $self->{'packed_size'} = $Bio::EnsEMBL::Funcgen::Collector::packed_size; | |
345 } | |
346 | |
347 return $self->{'packed_size'}; | |
348 | |
349 } | |
350 | |
351 #These methods are used by the descendant Collector | |
352 #For caching infor whilst building collections | |
353 #This is used to log how big a collection has grown before storing | |
354 | |
355 sub current_packed_size{ | |
356 my ($self, $wsize) = @_; | |
357 | |
358 #$self->{'current_packed_size'}{$wsize} ||= 0; | |
359 | |
360 #if(defined $cps){ | |
361 # $self->{'current_packed_size'}{$wsize} = $cps; | |
362 # } | |
363 # else{ | |
364 # return $self->{'current_packed_size'}{$wsize}; | |
365 # } | |
366 | |
367 return (scalar(@{$self->score_cache($wsize)})*$self->packed_size); | |
368 | |
369 } | |
370 | |
371 | |
372 sub score_cache{ | |
373 my ($self, $wsize, $scores) = @_; | |
374 | |
375 $self->{'score_cache'}{$wsize} ||= []; | |
376 | |
377 if(defined $scores){ | |
378 push @{$self->{'score_cache'}{$wsize}}, @{$scores}; | |
379 } | |
380 else{ | |
381 #Do this here to stop passing the ref everytime | |
382 #Will this be faster? | |
383 #Would certainly be faster if we were not returning a ref | |
384 return $self->{'score_cache'}{$wsize}; | |
385 } | |
386 } | |
387 | |
388 #These last methods are only used for the 0 wsize | |
389 #natural resolution and ar wrt the orig_slice passed | |
390 #to store_windows_by_Slice | |
391 | |
392 sub collection_start{ | |
393 my ($self, $wsize, $sr_start) = @_; | |
394 | |
395 if(defined $sr_start){ | |
396 $self->{'collection_start'}{$wsize} = $sr_start; | |
397 } | |
398 else{ | |
399 return $self->{'collection_start'}{$wsize}; | |
400 } | |
401 } | |
402 | |
403 sub collection_end{ | |
404 my ($self, $wsize, $sr_end) = @_; | |
405 | |
406 if(defined $sr_end){ | |
407 $self->{'collection_end'}{$wsize} = $sr_end; | |
408 } | |
409 else{ | |
410 return $self->{'collection_end'}{$wsize}; | |
411 } | |
412 } | |
413 | |
414 sub collection_strand{ | |
415 my ($self, $wsize, $strand) = @_; | |
416 | |
417 if(defined $strand){ | |
418 $self->{'collection_strand'}{$wsize} = $strand; | |
419 } | |
420 else{ | |
421 return $self->{'collection_strand'}{$wsize}; | |
422 } | |
423 } | |
424 | |
425 | |
426 | |
427 =pod | |
428 | |
429 sub _create_feature { | |
430 my ( $this, $feature_type, $args ) = @_; | |
431 | |
432 my $feature = $this->SUPER::_create_feature( $feature_type, $args ); | |
433 | |
434 if ( !$this->_lightweight() ) { | |
435 my ( $phase, $end_phase, $stable_id, $version, $created_date, | |
436 $modified_date, $is_current ) | |
437 = rearrange( [ 'PHASE', 'END_PHASE', | |
438 'STABLE_ID', 'VERSION', | |
439 'CREATED_DATE', 'MODIFIED_DATE', | |
440 'IS_CURRENT' | |
441 ], | |
442 %{$args} ); | |
443 | |
444 push( @{$feature}, | |
445 $phase, $end_phase, $stable_id, $version, $created_date, | |
446 $modified_date, $is_current ); | |
447 } | |
448 | |
449 return $feature; | |
450 } | |
451 | |
452 sub _create_feature_fast { | |
453 my ( $this, $feature_type, $args ) = @_; | |
454 | |
455 my $feature = | |
456 $this->SUPER::_create_feature_fast( $feature_type, $args ); | |
457 | |
458 return $feature; | |
459 } | |
460 | |
461 | |
462 #This might not be sensible for Features which are split across tables | |
463 | |
464 sub _tables { | |
465 my ($this) = @_; | |
466 | |
467 my @tables = $this->SUPER::_tables(); | |
468 | |
469 if ( $this->_lightweight() ) { | |
470 return ( $tables[0] ); | |
471 } | |
472 | |
473 return @tables; | |
474 } | |
475 | |
476 | |
477 sub _columns { | |
478 my ($this) = @_; | |
479 | |
480 my @columns = $this->SUPER::_columns(); | |
481 | |
482 if ( $this->_lightweight() ) { | |
483 | |
484 #What is this doing? | |
485 #Probably not sensible for ResultFeature | |
486 @columns[ 5 .. $#columns ] = map( 1, 5 .. $#columns ); | |
487 } | |
488 | |
489 return @columns; | |
490 } | |
491 | |
492 #Also not sensible for objects spread across several tables | |
493 | |
494 sub _default_where_clause { | |
495 my ($this) = @_; | |
496 | |
497 if ( $this->_lightweight() ) { | |
498 return ''; | |
499 } | |
500 | |
501 return $this->SUPER::_default_where_clause(); | |
502 } | |
503 | |
504 =cut | |
505 | |
506 | |
507 | |
508 #This need to be generic | |
509 #Again we need to pass an accessor method/reference? | |
510 #Will be some sort of generic fetch for feature adaptors | |
511 #or while loop for in flat file accessor | |
512 #rollback to be handled in caller? | |
513 | |
514 # To do | |
515 # | |
516 # 1 Allow variable chunks lengths (so we only have one resolution of windows?) | |
517 # This will allow SNP collections which currently define classification i.e colour | |
518 # Density of SNPs within window will define shading. Count will be displayed in zmenu | |
519 # This maybe something we have to do in the descendant | |
520 # | |
521 # 2 Implement collection param definition in/from descendant | |
522 | |
523 | |
524 # return collection config from adaptor fetch | |
525 # window size | |
526 # fixed width? | |
527 # render/collection style? | |
528 # This chould be implemented in BaseFeatureAdaptor::generic_fetch? | |
529 # Or could be done in the calling fetchmethod? | |
530 # | |
531 | |
532 #need to change this to get_window_bin_by_Slice | |
533 #to enable generating bins on uncompressed data | |
534 #Need to remove all counts and store based code to store caller | |
535 #this would mean removing any pack based code too | |
536 #separate set_config method | |
537 | |
538 | |
539 #Probelm here is size of slice? | |
540 #We need to generate bins all in one go, but also need to store at interval | |
541 #so as not to explode memory | |
542 #Do we need to separate the window generation from the bin generation code? | |
543 | |
544 | |
545 #Define the optimal way to generate windowed data by | |
546 #finding the most common denominator | |
547 | |
548 sub _define_window_chunks{ | |
549 my ($self, $window_sizes, $max_view_size) = @_; | |
550 | |
551 ### DEFINE CHUNKS WRT WINDOWS | |
552 | |
553 #Shortcut for on the fly uncompressed collection retrieval | |
554 #if(scalar(@wsizes) = 1){ | |
555 # | |
556 #} | |
557 #else{ | |
558 | |
559 #Calulate sensible slice length based on window sizes | |
560 my @wsizes = sort {$a <=> $b} @$window_sizes; | |
561 | |
562 #We need a default when only calculating 0 resolution | |
563 #Will binning code work with only 0 resolution? | |
564 if((scalar(@wsizes) == 1) && | |
565 $wsizes[0] == 0){ | |
566 return { $self->max_view_width => [0] }; | |
567 } | |
568 | |
569 | |
570 my $multiplier = int($max_view_size/$wsizes[$#wsizes]); | |
571 my $chunk_length = $multiplier * $wsizes[$#wsizes]; | |
572 my $not_divisible = 1; | |
573 my %chunk_windows;#Registry of chunk lengths to run with windows | |
574 my %workable_chunks = map {$_ => {}} @wsizes; | |
575 delete $workable_chunks{'0'};#get rid of natural resolution as this will always work | |
576 | |
577 | |
578 while($not_divisible && $chunk_length != 0){ | |
579 $not_divisible = 0; | |
580 | |
581 foreach my $wsize(@wsizes){ | |
582 next if $wsize == 0;#Special wsize for normal data | |
583 | |
584 #Set not divisible if modulus is true | |
585 if($chunk_length % $wsize){ | |
586 $not_divisible = 1; | |
587 } | |
588 else{ | |
589 #No need to listref here? | |
590 $workable_chunks{$wsize}{$chunk_length} = []; | |
591 } | |
592 | |
593 #warn "chunk length is $chunk_length and not_divisible is $not_divisible"; | |
594 } | |
595 | |
596 #Gradually shrink the length until we find a workable slice length for all windows | |
597 $chunk_length -= $wsizes[$#wsizes] if $not_divisible; | |
598 } | |
599 | |
600 my %chunk_sets; | |
601 | |
602 | |
603 if($chunk_length == 0){ | |
604 print "Could not find chunk length for all window sizes, attempting to subset windows using alternate slice length\n"; | |
605 | |
606 foreach my $wsize(keys %workable_chunks){ | |
607 #Loop through windows, seeing if they are workable in the other windows | |
608 | |
609 foreach my $chunk(keys %{$workable_chunks{$wsize}}){ | |
610 | |
611 foreach my $other_wsize(keys %workable_chunks){ | |
612 next if $wsize == $other_wsize; | |
613 | |
614 if(exists $workable_chunks{$other_wsize}{$chunk}){ | |
615 #only push it onto the other wsize, as we will do the reverse later | |
616 $chunk_sets{$chunk}{$wsize} = undef; | |
617 } | |
618 } | |
619 } | |
620 } | |
621 | |
622 #Now we have a register of co-occurence of wsizes with repect to chunks | |
623 #Loop through finding the least amount of sets with the longest chunk length? | |
624 #There is no way to decide which is best? | |
625 #we could calculate the number of loops? Factored by the chunk length? | |
626 #Let's just print out and see what we get | |
627 | |
628 #warn "chunk sets are :\n".Data::Dumper::Dumper(\%chunk_sets); | |
629 | |
630 | |
631 #For now let's just take the one which has the most windows and the longest chunk | |
632 #Then we just get the largest which handles the rest. | |
633 | |
634 #define possible set lengths | |
635 my $i = 0; | |
636 my %set_lengths; | |
637 map {$set_lengths{$i} = []; $i++} @wsizes; | |
638 delete $set_lengths{'0'};#get rid of natural resolution as this will always work | |
639 | |
640 | |
641 #store chunks lengths for each set size | |
642 foreach my $chunk(keys %chunk_sets){ | |
643 my $set_size = scalar(values %{$chunk_sets{$chunk}}); | |
644 push @{$set_lengths{$set_size}}, $chunk; | |
645 } | |
646 | |
647 #Now we get the biggest set with the longest length; | |
648 my $largest_size = scalar(@wsizes);#scalar here as we are disregarding natural resolution of 0 in loop | |
649 my $found_largest_set = 0; | |
650 | |
651 while(! $found_largest_set){ | |
652 $largest_size--; | |
653 | |
654 if(scalar(@{$set_lengths{$largest_size}}>0)){ | |
655 $found_largest_set = 1; | |
656 } | |
657 } | |
658 | |
659 | |
660 #We should be able to loop this bit, to find all the biggest sets. | |
661 my ($largest_chunk) = sort {$b<=>$a} @{$set_lengths{$largest_size}}; | |
662 #we could even be selective here, but let's just take the first one for now | |
663 | |
664 my @largest_windows = keys %{$chunk_sets{$largest_chunk}}; | |
665 @{$chunk_windows{$largest_chunk}} = @largest_windows; | |
666 print "Largest chunk $largest_chunk($largest_size) contains windows: @largest_windows\n"; | |
667 | |
668 my %remaining_windows = map {$_ => {}} @wsizes; | |
669 delete $remaining_windows{'0'};#get rid of natural resolution as this will always work | |
670 map { delete $remaining_windows{$_} } @largest_windows; | |
671 my $remaining_set_size = scalar(keys %remaining_windows); | |
672 | |
673 #swapping to array here for practicality, would need to maintain hash if we need to iterate | |
674 my @rwindows = keys %remaining_windows; | |
675 | |
676 #This could just be one window, but this will not be inthe co-occurence hash %chunk_sets | |
677 #Hence the normal approach will not work. and we just want to find a suitably large chunk for this one window. | |
678 my $next_chunk; | |
679 | |
680 if(scalar(@rwindows) == 1){ | |
681 #we just want to find a suitably large chunk for this one window. | |
682 my ($last_window) = @rwindows; | |
683 | |
684 $multiplier = int(500000/$last_window); | |
685 $next_chunk = $multiplier * $last_window; | |
686 } | |
687 else{ | |
688 #Now were are doing something very similar to above | |
689 #populating a set_size chunk length registry | |
690 #my %seen_hash; | |
691 | |
692 | |
693 foreach my $chunk(sort {$b<=>$a} @{$set_lengths{$remaining_set_size}}){ | |
694 my $seen_count = 0; | |
695 | |
696 foreach my $rwindow(@rwindows){ | |
697 | |
698 $seen_count++ if grep/$rwindow/, (values %{$chunk_sets{$chunk}}); | |
699 } | |
700 | |
701 if ($seen_count == $remaining_set_size){ | |
702 | |
703 $next_chunk = $chunk; | |
704 last; | |
705 } | |
706 } | |
707 } | |
708 | |
709 @{$chunk_windows{$next_chunk}} = @rwindows; | |
710 | |
711 | |
712 if($next_chunk){ | |
713 | |
714 print "Found next chunk length $next_chunk contains remaining windows:\t@rwindows\n"; | |
715 | |
716 #Now we want to cycle through all the set lengths which could contain the ones not in the first | |
717 #so we need to | |
718 } | |
719 else{ | |
720 warn "Need to write iterative sub for set definition"; | |
721 throw('Could not find workable slice length for remaining windows: '. | |
722 join(', ', @rwindows)); | |
723 | |
724 } | |
725 } | |
726 else{ | |
727 @{$chunk_windows{$chunk_length}} = keys(%workable_chunks); | |
728 print "Found workable chunk length($chunk_length) for all window sizes:\t". | |
729 join(' ', @{$chunk_windows{$chunk_length}})."\n"; | |
730 } | |
731 | |
732 return \%chunk_windows; | |
733 } | |
734 | |
735 | |
736 #Let's concentrate on store function first before we split out into store and fetch methods | |
737 #How will this work with the Bed parser? | |
738 #The descendant collector will sort the input and detect the current slice before calling | |
739 #store_window_bins_by_Slice. This may require some caching of line or seeking as we will see the next slice before we have a chance to set it. | |
740 #This will store as ResultFeature collections, so maybe we need to separate the input from output code? | |
741 #i.e. Bed parser/wrapper | |
742 # ResultFeatureAdaptor wrapper | |
743 #These | |
744 | |
745 #Problem with passing window_sizes here | |
746 #We need to check that they aren't already defined a class variables as this could potentially | |
747 #screw up retrieval, expect for only 0 or all but 0 | |
748 #Should we remove this config and force the class variable to be set in the 'adaptor' | |
749 #Method is then only used internally, make private or only getter? Set by changing class vars? | |
750 | |
751 sub store_window_bins_by_Slice{ | |
752 my ($self, $slice, %config) = @_; | |
753 | |
754 my ($window_sizes, $logic_name, $bin_method, $fetch_method_ref, $max_view_width, | |
755 $max_data_type_size, $pack_template, $packed_size, $bin_model, $new_assm, $skip_zero_window) = | |
756 rearrange( [ 'WINDOW_SIZES', 'LOGIC_NAME', 'BIN_METHOD', 'FETCH_METHOD_REF', 'MAX_VIEW_WIDTH', 'MAX_DATA_TYPE_SIZE', 'PACK_TEMPLATE', 'PACKED_SIZE', 'BIN_MODEL', 'NEW_ASSEMBLY', 'SKIP_ZERO_WINDOW'], %config ); | |
757 | |
758 warn "Need to be careful here about cleaning start end strand caches between serially run slices"; | |
759 | |
760 | |
761 | |
762 ### VAILDATE VARS/CONFIG | |
763 | |
764 #This could be done once in set_config, could then remove setter bahviour from attr methods? | |
765 #All default defs params/methods can be overridden by config params | |
766 #Attrs used in this method | |
767 $bin_method = $self->bin_method($bin_method); | |
768 $bin_model = $self->bin_model($bin_model); | |
769 #$window_sizes = $self->window_sizes($window_sizes);#Now done below | |
770 #Set to undef if we ave empty array | |
771 $window_sizes = undef if (ref($window_sizes) eq 'ARRAY' && scalar(@$window_sizes) == 0); | |
772 #Attrs used in other (store) methods | |
773 $self->pack_template($pack_template); | |
774 $self->packed_size($packed_size); | |
775 $self->max_data_type_size($max_data_type_size); | |
776 $self->max_view_width($max_view_width); | |
777 | |
778 #Other vars | |
779 $self->new_assembly($new_assm); | |
780 | |
781 #Need to validate slice here | |
782 | |
783 warn "temp hack for bin_method validation"; | |
784 $bin_method = $self->validate_bin_method($bin_method); | |
785 | |
786 ### Set window_sizes | |
787 | |
788 if($self->new_assembly){ | |
789 print "Assembly projection may cause problems for large Collections, defaulting to window_sizes = (0)\n"; | |
790 #Then build the bins on the projected 0 level single ResultFeatures | |
791 | |
792 #Test we haven't explicity set window_sizes to be soemthing else | |
793 | |
794 if($window_sizes && | |
795 ! ( scalar(@$window_sizes) == 1 && $window_sizes[0] == 0)){ | |
796 | |
797 throw("You have set window_sizes config which are not safe when projecting to a new assembly($new_assm), please omit window_sizes config or set to 0"); | |
798 | |
799 } | |
800 $window_sizes = $self->window_sizes([0]); | |
801 } | |
802 else{ | |
803 | |
804 if($window_sizes && $skip_zero_window && grep/^0$/,@$window_sizes){ | |
805 throw("You have specied skip_zero_window and window_size 0 in your config, please remove one of these"); | |
806 } | |
807 elsif($window_sizes && ! grep/^0$/,@$window_sizes){ | |
808 $skip_zero_window = 1; | |
809 unshift @$window_sizes, 0;#re-add 0 window as we need this to build the collections | |
810 } | |
811 | |
812 $window_sizes = $self->window_sizes($window_sizes); | |
813 } | |
814 | |
815 | |
816 #This is already done in the script | |
817 if($skip_zero_window && $new_assm){ | |
818 throw("You cannot -skip_zero_window or omit 0 from -window_sizes when projecting to a new assembly($new_assm) which should only be generated using window_size=0"); | |
819 } | |
820 | |
821 | |
822 | |
823 ### Rollback previously stored features | |
824 | |
825 if($self->can('rollback_Features_by_Slice')){ | |
826 $self->rollback_Features_by_Slice($slice); | |
827 } | |
828 else{ | |
829 #This is currently the only warn output we can't get rid off | |
830 warn ref($self)." cannot rollback_Features_by_Slice. This may result in duplicate Collections being stored if there is pre-existing data"; | |
831 } | |
832 | |
833 | |
834 | |
835 ### PROCESS CHUNKS | |
836 | |
837 #Not lightweight as we will be storing them | |
838 # Temporarily set the collection to be lightweight??? | |
839 #my $old_value = $this->_lightweight(); | |
840 #if ( defined($lightweight) ) { $this->_lightweight($lightweight) } | |
841 #else { $this->_lightweight(1) } | |
842 | |
843 my %chunk_windows = %{$self->_define_window_chunks($self->window_sizes, $self->max_view_width)}; | |
844 my (%counts, $store_natural); | |
845 $store_natural = grep/^0/, @$window_sizes; | |
846 $counts{0}=0;#Set natural res count to 0 | |
847 my $slice_end = $slice->end; | |
848 my $orig_slice = $slice; | |
849 my $orig_start = $slice->start; | |
850 #my $slice_adj = $slice->start - 1;#Removed this as we are now generating features local to orig_slice | |
851 #start/end conversion will be done in write/store_collection | |
852 my $region = $slice->coord_system_name; | |
853 my $version = $slice->coord_system->version; | |
854 my $seq_region_name = $slice->seq_region_name; | |
855 my $strand = $slice->strand; | |
856 my $only_natural = 0; | |
857 #my $slice_adj = 0; | |
858 | |
859 | |
860 #We need to account for only 0 here when doing projection | |
861 #The chunk window is set to max_view_widht in _define_chunk_windows | |
862 | |
863 $only_natural = 1 if $store_natural && scalar(@$window_sizes) == 1; | |
864 $store_natural = 0 if $skip_zero_window; | |
865 #SHould really test these two, but should already be caught by now | |
866 | |
867 #Set the initial collection_start to orig_start | |
868 #Could default to 1, but we may not be starting from 1 | |
869 #This is not the case for 0 wsize where it must always be | |
870 #The first feature start | |
871 | |
872 | |
873 for my $wsize(@{$self->window_sizes}){ | |
874 | |
875 next if $wsize == 0;# && $skip_zero_window;#We never want to assume start of 0 window collection | |
876 $self->collection_start($wsize, $orig_start); | |
877 } | |
878 | |
879 | |
880 | |
881 foreach my $chunk_length(sort keys %chunk_windows){ | |
882 | |
883 print "Processing windows ".join(', ', @{$chunk_windows{$chunk_length}}). | |
884 " with chunk length $chunk_length\n"; | |
885 map $counts{$_} = 0, @{$chunk_windows{$chunk_length}}; #Set window counts to 0 | |
886 | |
887 #Now walk through slice using slice length chunks and build all windows in each chunk | |
888 my $in_slice = 1; | |
889 my $start_adj = 0; | |
890 my ($sub_end, $features, $bins); | |
891 my $sub_start = 1; | |
892 my $slice_length = $slice->length; | |
893 | |
894 | |
895 #Can we subslice and then exclusivly use bin_start(local to orig_slice) | |
896 #Then we never have to deal with sr coord until we store | |
897 #This should me we never have to do the sr conversion unless we | |
898 #use a slice which doesn't start at 1(PAR or test) | |
899 #Always create in local coords for fetch | |
900 #Then change to seq_region coords for store if required | |
901 | |
902 | |
903 while($in_slice){ | |
904 #$sr_start = $slice_start + $start_adj; | |
905 $sub_start += $start_adj; | |
906 | |
907 #$slice_start = $sr_start;#Keep for next slice | |
908 #$sr_end = $sr_start + $chunk_length - 1; | |
909 $sub_end = $sub_start + $chunk_length - 1; | |
910 | |
911 #Last chunk might not be the correct window length | |
912 #Hence why we should do this on whole chromosomes | |
913 if($sub_end >= $slice_length){ | |
914 #$sub_end = $slice_end; | |
915 #No longer set to slice end, as we don't want to corrupt the bin definition? | |
916 #Surplus bins are removed in store/write_collection in caller | |
917 #We could simply add the largest window the the end of the slice? | |
918 #Then we will only build the minimum of excess bins? | |
919 #This should be okay for bin calcs | |
920 #But may screw up bin trimming in caller as we currently expect $ub_end to be a valid bin end | |
921 #for all wsizes | |
922 #bin trimming should handle this, but this will corrupt the bin definition??? | |
923 #bin definition is depedant on method | |
924 #So this method need to be agnostic | |
925 #And deal with the rest in descendant | |
926 $in_slice = 0; | |
927 } | |
928 | |
929 | |
930 $slice = $slice->adaptor->fetch_by_region($region, $seq_region_name, ($sub_start + $orig_start -1), ($sub_end + $orig_start - 1), $strand, $version); | |
931 #Can't subslice as this will not clip if we go over the length of the slice, unlike normal slice fetching | |
932 #hence we cannot rely on this | |
933 #$slice = $orig_slice->sub_Slice($sub_start, $sub_end, $orig_slice->strand); | |
934 #warn "got sub slice $slice as $sub_start - $sub_end from ".$orig_slice->name; | |
935 | |
936 | |
937 ### Grab features and shift chunk coords | |
938 #features may already be a 0 wsize collection if we have projected from an old assembly | |
939 #Could move this check to get_Features_by_Slice? | |
940 | |
941 #e.g. [ $features, \%config ] | |
942 $features = $self->get_Features_by_Slice($slice); | |
943 #next if scalar(@$features) == 0;#We want to store values for all windows | |
944 | |
945 if( (@$features) && | |
946 (ref($features->[0]) =~ /Bio::EnsEMBL::Funcgen::Collection/) ){#Change to isa 'Bio::EnsEMBL::Collection | |
947 | |
948 #Check that the returned feature/collections support window_size | |
949 | |
950 if($features->[0]->can('window_size')){ | |
951 | |
952 if($features->[0]->window_size != 0){ | |
953 throw("You are trying to generated Collections from a non-zero window sized Collection:\t".$features->[1]->{'window_size'}); | |
954 } | |
955 | |
956 #This should never happen | |
957 if(! $skip_zero_window){ | |
958 throw('You have retrieved data from a Collection which without using -skip_zero_window i.e. you are trying to generate overwrite the data you are generating the Collections from'); | |
959 } | |
960 } | |
961 else{ | |
962 throw('Something si wrong, the Collection you have retrieved does not support the method window_size'); | |
963 } | |
964 } | |
965 | |
966 | |
967 | |
968 #Set collection start here for 0 window_size | |
969 if(@$features && $store_natural && ! defined $self->collection_start(0)){ | |
970 $self->collection_start(0, ($features->[0]->start + $sub_start)); | |
971 } | |
972 | |
973 | |
974 | |
975 $start_adj = $chunk_length if($in_slice); | |
976 | |
977 | |
978 | |
979 #This should return a hash of window size => bin array pairs | |
980 if(! $only_natural){ | |
981 | |
982 $bins = $self->_bin_features_by_window_sizes( | |
983 -slice => $slice, | |
984 -window_sizes => $chunk_windows{$chunk_length}, | |
985 -bin_method => $bin_method, | |
986 -features => $features, | |
987 ); | |
988 | |
989 | |
990 | |
991 | |
992 } | |
993 | |
994 #my $bin_start = $sr_start + $slice_adj;#This was only required for storing individual bins | |
995 #Could calc bin_start + slice_adjust ahere for all features | |
996 #Doing this will break old code for single window collections | |
997 | |
998 #This is sr start and should be local to orig_slice! | |
999 | |
1000 | |
1001 | |
1002 | |
1003 #We need to handle strandedness of slice!? | |
1004 | |
1005 #Store all normal features in result_feature | |
1006 | |
1007 | |
1008 | |
1009 if($store_natural){ | |
1010 | |
1011 foreach my $feature(@$features){ | |
1012 $counts{0}++; | |
1013 #warn "storing ".join(', ', ($feature->start, $feature->end, $feature->strand, $feature->scores->[0])); | |
1014 | |
1015 | |
1016 #Should we handle bin trimming here for overhanging slices | |
1017 #Then counts wil be correct and wont have to do in caller | |
1018 | |
1019 #We could stop here if the feature seq_region start > orig_slice end | |
1020 #Current done in write/store_collection | |
1021 #This may mean working in seq_region values rather than slice values | |
1022 | |
1023 | |
1024 #write_collection is implemented in descendant e.g. Bio::EnsEMBL::Funcgen::Collector::ResultFeature | |
1025 #as wrapper to adaptor store method or print to file | |
1026 | |
1027 #These params need to be generated in a way defined by the descendant | |
1028 # | |
1029 | |
1030 if($bin_model eq 'SIMPLE'){ | |
1031 #We need to pass the slice with this so we can sub slice when storing | |
1032 #the collection and set the start/end to 1 and length of slice | |
1033 #we still need to store the first start to be able to sub slice correctly | |
1034 | |
1035 $self->collection_start(0, ($feature->start + $sub_start)); | |
1036 | |
1037 #Need to pass strand for 0 resolution | |
1038 $self->write_collection(0, | |
1039 $orig_slice, | |
1040 #These are now wrt orig_slice | |
1041 #($feature->start + $sub_start), | |
1042 ($feature->end + $sub_start), | |
1043 $feature->strand, | |
1044 $feature->scores, | |
1045 ); | |
1046 | |
1047 #We can have problems here if the original score type | |
1048 #does not match the collected score type | |
1049 #For max magnitude this is not an issue | |
1050 #as we take the larget value from the bin | |
1051 #But for other methods this may not be true | |
1052 #e.g. count | |
1053 #Hence, if we want to preserve the 0 window | |
1054 #We must account for this in the feature collector | |
1055 #e.g. set_collection_defs_by_ResultSet_window_size? | |
1056 #Just omit 0 window for reads | |
1057 | |
1058 } | |
1059 } | |
1060 | |
1061 print "Window size 0 (natural resolution) has ".scalar(@{$features})." feature bins for:\t".$slice->name."\n"; | |
1062 } | |
1063 | |
1064 #Now store bins | |
1065 # my ($bin_end, $bin_scores); | |
1066 my $num_bins; | |
1067 | |
1068 foreach my $wsize(sort keys %{$bins}){ | |
1069 $num_bins = scalar(@{$bins->{$wsize}}); | |
1070 #warn "$num_bins bin scores for $wsize:\t".join(',', @{$bins->{$wsize}}); | |
1071 | |
1072 #Should we handle bin trimming here for overhanging slices | |
1073 #Then counts wil be correct and wont have to do in caller | |
1074 | |
1075 | |
1076 $counts{$wsize}+= $num_bins; | |
1077 | |
1078 | |
1079 | |
1080 #We don't need this loop for collections as we can simply push all the scores at once | |
1081 #Just use the slice start and end | |
1082 if($bin_model eq 'SIMPLE'){ | |
1083 | |
1084 $self->write_collection($wsize, | |
1085 $orig_slice, | |
1086 #$sub_start, | |
1087 $sub_end, | |
1088 $orig_slice->strand,#This is most likely 1! | |
1089 #Override this woth 0 in descendant Collector if required. | |
1090 $bins->{$wsize}, | |
1091 ); | |
1092 | |
1093 } | |
1094 else{ | |
1095 throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models'); | |
1096 #i.e. More than one score | |
1097 } | |
1098 | |
1099 | |
1100 | |
1101 # #Reset start and end for new wsize | |
1102 # $bin_start = $slice->start; | |
1103 # $bin_end = $slice->start; | |
1104 # | |
1105 # | |
1106 # | |
1107 # #We don't need this loop for collections as we can simply push all the scores at once | |
1108 # | |
1109 # | |
1110 # foreach my $bin_index(0..$#{$bins->{$wsize}}){ | |
1111 # | |
1112 # | |
1113 # | |
1114 # #default method to handle simple fixed width bin? | |
1115 # #bin_end need to be defined dependant on the bin type | |
1116 # #($bin_start) = $self->process_default_bin($bins->{$wsize}->[$bin_index], $wsize);#? | |
1117 # | |
1118 # | |
1119 # | |
1120 # #either define default bin method in descendant | |
1121 # #Or can we set a process_bin_method var? | |
1122 # #No just pass all this info to write collection and handle it there? | |
1123 # | |
1124 # #Can we have just predefined rotueines handling different bin types? | |
1125 # #Simple | |
1126 # #Simple compressed | |
1127 # #Clipped | |
1128 # #This will prevent hanving to make attrs/method for storing persistent start/end/score info | |
1129 # | |
1130 # | |
1131 # | |
1132 # #Need validate bin_type method | |
1133 # #Could convert these to numbers for speed as with binning methods | |
1134 # | |
1135 # if($bin_model eq 'SIMPLE'){ | |
1136 # | |
1137 # $bin_scores = $bins->{$wsize}->[$bin_index]; | |
1138 # | |
1139 # warn "bin scores is $bin_scores"; | |
1140 # | |
1141 # | |
1142 # #next if ! $bin_score;#No we're no inc'ing the start ends for bins with no scores | |
1143 # | |
1144 # $bin_end += $wsize; | |
1145 # | |
1146 # #if($bin_score){#Removed this as we always want to write the score even if it is 0 | |
1147 # | |
1148 # #This is a little backwards as we are generating the object to store it | |
1149 # #If we are aiming for speed the maybe we could also commodotise the store method | |
1150 # #store by args arrays? store_fast? | |
1151 # #Speed not essential for storing! | |
1152 # | |
1153 # #Note: list ref passed | |
1154 # | |
1155 # #Don't need to pass all this info for fixed width blob collections | |
1156 # #Need to write some default handlers depedant on the collection type | |
1157 # #Simple(original) | |
1158 # #Simple compressed | |
1159 # #Multi compressed | |
1160 # #Clipped uncompressed? | |
1161 # | |
1162 # | |
1163 # $self->write_collection($wsize, | |
1164 # $orig_slice, | |
1165 # ($bin_start + $slice_adj), | |
1166 # ($bin_end + $slice_adj), | |
1167 # $orig_slice->strand,#This is most likely 0 | |
1168 # $bin_scores, | |
1169 # ); | |
1170 # | |
1171 # #Only count if we have a stored(projected?) feature | |
1172 # $count++;#Change this to attr/method? | |
1173 # #} | |
1174 # | |
1175 # $bin_start += $wsize; | |
1176 # } | |
1177 # else{ | |
1178 # throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models'); | |
1179 # } | |
1180 # } | |
1181 | |
1182 #warn "Window size $wsize has ".scalar(@{$bins->{$wsize}})." bins"; | |
1183 #$counts{$wsize}+= $count; | |
1184 } | |
1185 } | |
1186 | |
1187 $store_natural = 0; #Turn off storing of natural resolution for next chunk length sets | |
1188 } | |
1189 | |
1190 #Now need to write last collections for each wsize | |
1191 | |
1192 foreach my $wsize(@{$self->window_sizes}){ | |
1193 | |
1194 next if $wsize == 0 && ! $store_natural; | |
1195 next if $wsize != 0 && $only_natural; | |
1196 | |
1197 print "Writing final $wsize window_size collection, this may result in slightly different bin numbers from counts due to removing overhanging bins past end of slice\n"; | |
1198 | |
1199 $self->write_collection($wsize, $orig_slice);#store last collection | |
1200 } | |
1201 | |
1202 | |
1203 #Print some counts here | |
1204 foreach my $wsize(sort (keys %counts)){ | |
1205 print "Generated ".$counts{$wsize}." bins for window size $wsize for ".$orig_slice->name."\n"; | |
1206 #Some may have failed to store if we are projecting to a new assembly | |
1207 #Need collection count here too, but would need methods for this? | |
1208 } | |
1209 | |
1210 #Return this counts hash so we can print/log from the caller, hence we don't print in here? | |
1211 | |
1212 return; | |
1213 } | |
1214 | |
1215 | |
1216 | |
1217 =head2 _bin_features_by_window_sizes | |
1218 | |
1219 Args[0] : Bio::EnsEMBL::Slice | |
1220 Args[1] : ARRAYREF of window sizes | |
1221 Args[2] : int - bin method, currently defined by validate_bin_methods | |
1222 Args[3] : ARRAYREF of Bio::EnsEMBL::Features | |
1223 Example : $bins = $self->_bin_features_by_window_sizes( | |
1224 -slice => $slice, | |
1225 -window_sizes => $chunk_windows{$chunk_length}, | |
1226 -bin_method => $bin_method, | |
1227 -features => $features, | |
1228 ); | |
1229 Description: Bins feature scores for a given list of window sizes and predefined method number | |
1230 Returntype : HASHREF of scores per bin per window size | |
1231 Exceptions : Throws if bin method not supported | |
1232 Caller : store_window_bins_by_Slice | |
1233 Status : At Risk | |
1234 | |
1235 =cut | |
1236 | |
1237 | |
1238 #To do | |
1239 # 1 Remove Bio::EnsEMBL::Feature dependancy? Or just create Features for non adaptor Collectors. | |
1240 # Is there a way we can skip the object generation in the adaptor completely and just | |
1241 # pass the values we need? | |
1242 # 2 Separate methods, so we can define custom methods in descendants? | |
1243 # 3 Expand %bins model to optionally be one of | |
1244 # the following dependant on binning method | |
1245 # Simple: fixed width containing arrays of scores for each window | |
1246 # Multi: fixed width containing multiple arrays of scores for each window | |
1247 # Non-simple?: Separate aggregated features, either fixed width or not, not BLOB! | |
1248 # Clipped: default fixed width with option to clip start and end. Needs start/end attrs | |
1249 # Can't store this in a blob due to non-standard start ends? | |
1250 # Most likely want more than one score here? Count/Density SNPs? | |
1251 # Removes data skew from standard window bins, would need to store each bin and post | |
1252 # process. Or do in line to avoid 2nd post-processing loop,requires awareness of when | |
1253 # we have moved to a new bin between features. This holds for overlapping and | |
1254 # non-overlapping features. Once we have observed a gap we need to clip the end of the | |
1255 # last bin and clip the start of the new bin. This requires knowing the greatest end | |
1256 # values from the last bin's feature. what if two overlapping features had the same | |
1257 # start and different end, would we see the longest last? Check default slice_fetch sort | |
1258 | |
1259 sub _bin_features_by_window_sizes{ | |
1260 my $this = shift; | |
1261 my ( $slice, $window_sizes, $method, $features ) = | |
1262 rearrange( [ 'SLICE', 'WINDOW_SIZES', 'BIN_METHOD', 'FEATURES' ], @_ ); | |
1263 | |
1264 | |
1265 #Do this conditional on the Collection type | |
1266 #i.e. is collection seq_region blob then no else yes | |
1267 #if ( !defined($features) || !@{$features} ) { return {} } | |
1268 | |
1269 #warn 'Processing '.scalar(@$features).' features for window sizes '.join(', ',@$window_sizes).' for slice '.$slice->name."\n"; | |
1270 | |
1271 #Set up some hashes to store data by window_size | |
1272 my (%bins, %nbins, %bin_counts); | |
1273 my $slice_start = $slice->start(); | |
1274 | |
1275 #Default handlers for | |
1276 #my($first_bin); | |
1277 #if ( $method == 0 || # 'count' or 'density' | |
1278 # $method == 3 || # 'fractional_count' or 'weight' | |
1279 # $method == 4 # 'coverage' | |
1280 # ){ | |
1281 # # For binning methods where each bin contain numerical values. | |
1282 # $first_bin = 0; | |
1283 # } | |
1284 # else { | |
1285 # # For binning methods where each bin does not contain numerical | |
1286 # # values. | |
1287 # | |
1288 # #Remove this | |
1289 # $first_bin = undef; | |
1290 # } | |
1291 | |
1292 | |
1293 #Set up some bin data for the windows | |
1294 my $slice_length = $slice->length; | |
1295 | |
1296 foreach my $wsize (@$window_sizes) { | |
1297 #TO DO: Need to modify this block if default 0's are undesirable for collection type | |
1298 #i.e. should it be undef instead? May have prolbems representing undef in blob | |
1299 | |
1300 $nbins{$wsize} = int($slice_length / $wsize); #int rounds down | |
1301 #nbins is actually the index of the bin not the 'number' | |
1302 #Unless slice_Length is a multiple! | |
1303 $nbins{$wsize}-- if(! ($slice_length % $wsize)); | |
1304 | |
1305 #Create default bins with 0 | |
1306 @{$bins{$wsize}} = (); | |
1307 map {$bins{$wsize}->[$_] = 0} (0 .. $nbins{$wsize}); | |
1308 | |
1309 #Set bin counts to 0 for each bin | |
1310 @{$bin_counts{$wsize}} = (); | |
1311 | |
1312 #This is adding an undef to the start of the array!? | |
1313 map { $bin_counts{$wsize}->[($_)] = 0 } @{$bins{$wsize}}; | |
1314 | |
1315 foreach my $bin(@{$bins{$wsize}}){ | |
1316 $bin_counts{$wsize}->[$bin] = 0; | |
1317 } | |
1318 } | |
1319 | |
1320 #warn "bin_counts are :\n".Data::Dumper::Dumper(\%bin_counts); | |
1321 #This fails for slices which are smaller than the chunk length; | |
1322 my $feature_index = 0; | |
1323 my ($bin_index, @bin_masks); | |
1324 | |
1325 foreach my $feature ( @{$features} ) { | |
1326 #Set up the bins for each window size | |
1327 | |
1328 #Omit test for Bio::EnsEMBL::Feature here for speed | |
1329 #Only needs start/end methods | |
1330 | |
1331 foreach my $wsize (@$window_sizes) { | |
1332 | |
1333 #We have already highjacked the object creation by here | |
1334 #This is done in core BaseFeatureAdaptor | |
1335 #We probably don't want to do this for ResultFeatures as we don't use the | |
1336 #standard feature implementation | |
1337 #we already use an array and we don't store the slice | |
1338 #as this is already known by the caller | |
1339 #and we always build on top level so we don't need to remap | |
1340 | |
1341 #We do however need the slice to store, as we only store local starts when generating | |
1342 #We need a store by Slice method? | |
1343 #This will remove the need to inherit from Feature. | |
1344 #These will need to be regenerated everytime we import a new build | |
1345 #As we do with the probe_features themselves | |
1346 #This also mean the result_feature status has to be associated with a coord_system_id | |
1347 | |
1348 #Which bins do the start and end lie in for this feature? | |
1349 #Already dealing with local starts, so no slice subtraction | |
1350 #Could wrap these start/end methods via the descendant Collector | |
1351 #to remove the Feature dependancy? Or just create Features when parsing in the caller | |
1352 my $start_bin = int(($feature->start ) / $wsize); | |
1353 my $end_bin = int(($feature->end) / $wsize ); | |
1354 $end_bin = $nbins{$wsize} if $end_bin > $nbins{$wsize}; | |
1355 | |
1356 | |
1357 | |
1358 #Slightly obfuscated code to match method number(faster) | |
1359 #by avoiding string comparisons. | |
1360 #Could call methods directly using coderef set in validate_bin_method | |
1361 #Accessor may slow things down, but should be uniform for all methods | |
1362 #rather than being dependant on position in if/else block below | |
1363 | |
1364 #reserve 0 for descendant defined method? | |
1365 #There fore always fastest in this block, or use coderefs? | |
1366 if ( $method == 0 ) { | |
1367 # ---------------------------------------------------------------- | |
1368 # For 'count' and 'density'. | |
1369 | |
1370 for ( $bin_index = $start_bin ; | |
1371 $bin_index <= $end_bin ; | |
1372 ++$bin_index ) { | |
1373 | |
1374 $bins{$wsize}->[$bin_index]++; | |
1375 | |
1376 #warn "setting $wsize bin $bin_index to ". $bins{$wsize}->[$bin_index]; | |
1377 | |
1378 } | |
1379 } | |
1380 | |
1381 =pod | |
1382 | |
1383 } elsif ( $method == 1 ) { | |
1384 # ---------------------------------------------------------------- | |
1385 # For 'indices' and 'index' | |
1386 | |
1387 | |
1388 #How is this useful? | |
1389 #Is this not just count per bin? | |
1390 #No this is a list of the feature indices | |
1391 #So forms a distribution? | |
1392 | |
1393 throw('Not implemented for method for index'); | |
1394 | |
1395 for ( my $bin_index = $start_bin ; | |
1396 $bin_index <= $end_bin ; | |
1397 ++$bin_index ) { | |
1398 push( @{ $bins[$bin_index] }, $feature_index ); | |
1399 } | |
1400 | |
1401 ++$feature_index; | |
1402 | |
1403 } elsif ( $method == 2 ) { | |
1404 # ---------------------------------------------------------------- | |
1405 # For 'features' and 'feature'. | |
1406 | |
1407 throw('Not implemented for method for feature/features'); | |
1408 | |
1409 for ( my $bin_index = $start_bin ; | |
1410 $bin_index <= $end_bin ; | |
1411 ++$bin_index ) { | |
1412 push( @{ $bins[$bin_index] }, $feature ); | |
1413 } | |
1414 | |
1415 } elsif ( $method == 3 ) { | |
1416 # ---------------------------------------------------------------- | |
1417 # For 'fractional_count' and 'weight'. | |
1418 | |
1419 | |
1420 throw('Not implemented for method for fractional_count/weight'); | |
1421 | |
1422 if ( $start_bin == $end_bin ) { | |
1423 ++$bins[$start_bin]; | |
1424 } else { | |
1425 | |
1426 my $feature_length = | |
1427 $feature->[FEATURE_END] - $feature->[FEATURE_START] + 1; | |
1428 | |
1429 # The first bin... | |
1430 $bins[$start_bin] += | |
1431 ( ( $start_bin + 1 )*$bin_length - | |
1432 ( $feature->[FEATURE_START] - $slice_start ) )/ | |
1433 $feature_length; | |
1434 | |
1435 # The intermediate bins (if there are any)... | |
1436 for ( my $bin_index = $start_bin + 1 ; | |
1437 $bin_index <= $end_bin - 1 ; | |
1438 ++$bin_index ) { | |
1439 $bins[$bin_index] += $bin_length/$feature_length; | |
1440 } | |
1441 | |
1442 # The last bin... | |
1443 $bins[$end_bin] += | |
1444 ( ( $feature->[FEATURE_END] - $slice_start ) - | |
1445 $end_bin*$bin_length + | |
1446 1 )/$feature_length; | |
1447 | |
1448 } ## end else [ if ( $start_bin == $end_bin) | |
1449 | |
1450 } | |
1451 elsif ( $method == 4 ) { | |
1452 # ---------------------------------------------------------------- | |
1453 # For 'coverage'. | |
1454 | |
1455 #What exactly is this doing? | |
1456 #This is coverage of bin | |
1457 #Rather than coverage of feature as in fractional_count | |
1458 | |
1459 | |
1460 # my $feature_start = $feature->[FEATURE_START] - $slice_start; | |
1461 # my $feature_end = $feature->[FEATURE_END] - $slice_start; | |
1462 # | |
1463 # if ( !defined( $bin_masks[$start_bin] ) | |
1464 # || ( defined( $bin_masks[$start_bin] ) | |
1465 # && $bin_masks[$start_bin] != 1 ) ) { | |
1466 # # Mask the $start_bin from the start of the feature to the end | |
1467 # # of the bin, or to the end of the feature (whichever occurs | |
1468 # # first). | |
1469 # my $bin_start = int( $start_bin*$bin_length ); | |
1470 # my $bin_end = int( ( $start_bin + 1 )*$bin_length - 1 ); | |
1471 # for ( my $pos = $feature_start; | |
1472 # $pos <= $bin_end && $pos <= $feature_end ; | |
1473 # ++$pos ) { | |
1474 # $bin_masks[$start_bin][ $pos - $bin_start ] = 1; | |
1475 # } | |
1476 # } | |
1477 # | |
1478 # for ( my $bin_index = $start_bin + 1 ; | |
1479 # $bin_index <= $end_bin - 1 ; | |
1480 # ++$bin_index ) { | |
1481 # # Mark the middle bins between $start_bin and $end_bin as fully | |
1482 # # masked out. | |
1483 # $bin_masks[$bin_index] = 1; | |
1484 # } | |
1485 # | |
1486 # if ( $end_bin != $start_bin ) { | |
1487 # | |
1488 # if ( !defined( $bin_masks[$end_bin] ) | |
1489 # || ( defined( $bin_masks[$end_bin] ) | |
1490 # && $bin_masks[$end_bin] != 1 ) ) { | |
1491 # # Mask the $end_bin from the start of the bin to the end of | |
1492 # # the feature, or to the end of the bin (whichever occurs | |
1493 # # first). | |
1494 # my $bin_start = int( $end_bin*$bin_length ); | |
1495 # my $bin_end = int( ( $end_bin + 1 )*$bin_length - 1 ); | |
1496 # for ( my $pos = $bin_start ; | |
1497 # $pos <= $feature_end && $pos <= $bin_end ; | |
1498 # ++$pos ) { | |
1499 # $bin_masks[$end_bin][ $pos - $bin_start ] = 1; | |
1500 # } | |
1501 # } | |
1502 # } | |
1503 # } ## end elsif ( $method == 4 ) | |
1504 | |
1505 =cut | |
1506 | |
1507 | |
1508 elsif ( $method == 5 ) { | |
1509 #$self->$method($bin_index, $start_bin, $end_bin, $wsize, \%bins, \%bin_counts); | |
1510 | |
1511 | |
1512 #average score | |
1513 #This is simple an average of all the scores for features which overlap this bin | |
1514 #No weighting with respect to the bin or the feature | |
1515 | |
1516 for ( $bin_index = $start_bin ; | |
1517 $bin_index <= $end_bin ; | |
1518 ++$bin_index ) { | |
1519 | |
1520 #we should really push onto array here so we can have median or mean. | |
1521 $bins{$wsize}->[$bin_index] += $this->get_score_by_Feature($feature); | |
1522 $bin_counts{$wsize}->[$bin_index]++; | |
1523 } | |
1524 } | |
1525 elsif( $method == 6){ | |
1526 #Max magnitude | |
1527 #Take the highest value +ve or -ve score | |
1528 for ( $bin_index = $start_bin ; | |
1529 $bin_index <= $end_bin ; | |
1530 ++$bin_index ) { | |
1531 | |
1532 #we really need to capture the lowest -ve and higest +ve scores here and post process | |
1533 #To pick between them | |
1534 | |
1535 my $score = $this->get_score_by_Feature($feature); | |
1536 #Write score method as wrapper to scores? | |
1537 | |
1538 $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve | |
1539 | |
1540 | |
1541 #warn "Comparing wsize $wsize bin $bin_index score $score to ". $bins{$wsize}->[$bin_index]->[0].' '.$bins{$wsize}->[$bin_index]->[1]."\n"; | |
1542 | |
1543 if($score < $bins{$wsize}->[$bin_index]->[0]){ | |
1544 #warn "setting -ve bin to $score\n"; | |
1545 $bins{$wsize}->[$bin_index]->[0] = $score; | |
1546 } | |
1547 elsif($score > $bins{$wsize}->[$bin_index][1]){ | |
1548 #warn "setting +ve bin to $score\n"; | |
1549 $bins{$wsize}->[$bin_index]->[1] = $score; | |
1550 } | |
1551 } | |
1552 } | |
1553 else { | |
1554 throw("Only accomodates average score method"); | |
1555 } | |
1556 | |
1557 | |
1558 } | |
1559 | |
1560 } ## end foreach my $feature ( @{$features... | |
1561 | |
1562 | |
1563 #Now do post processing of bins | |
1564 | |
1565 =pod | |
1566 | |
1567 if ( $method == 4 ) { | |
1568 | |
1569 # ------------------------------------------------------------------ | |
1570 # For the 'coverage' method: Finish up by going through @bin_masks | |
1571 # and sum up the arrays. | |
1572 | |
1573 for ( my $bin_index = 0 ; $bin_index < $nbins ; ++$bin_index ) { | |
1574 if ( defined( $bin_masks[$bin_index] ) ) { | |
1575 if ( !ref( $bin_masks[$bin_index] ) ) { | |
1576 $bins[$bin_index] = 1; | |
1577 } else { | |
1578 $bins[$bin_index] = | |
1579 scalar( grep ( defined($_), @{ $bin_masks[$bin_index] } ) )/ | |
1580 $bin_length; | |
1581 } | |
1582 } | |
1583 } | |
1584 } | |
1585 | |
1586 =cut | |
1587 | |
1588 if( $method == 5){ | |
1589 #For average score, need to divide bins by bin_counts | |
1590 | |
1591 foreach my $wsize(keys %bins){ | |
1592 | |
1593 foreach my $bin_index(0..$#{$bins{$wsize}}){ | |
1594 | |
1595 if($bin_counts{$wsize}->[$bin_index]){ | |
1596 $bins{$wsize}->[$bin_index] /= $bin_counts{$wsize}->[$bin_index]; | |
1597 } | |
1598 #warn "bin_index $wsize:$bin_index has score ".$bins{$wsize}->[$bin_index]; | |
1599 } | |
1600 } | |
1601 } | |
1602 elsif( $method == 6){ | |
1603 #Max magnitude | |
1604 #Take the highest value +ve or -ve score | |
1605 | |
1606 foreach my $wsize(keys %bins){ | |
1607 | |
1608 foreach my $bin_index(0..$#{$bins{$wsize}}){ | |
1609 | |
1610 #So we have the potential that we have no listref in a given bin | |
1611 | |
1612 #default value if we haven't seen anything is 0 | |
1613 #we actually want an array of -ve +ve values | |
1614 | |
1615 #warn "Are we storing 0 values for absent data?"; | |
1616 #Not for max_magnitude, but maybe for others? | |
1617 | |
1618 if($bins{$wsize}->[$bin_index]){ | |
1619 #warn $wsize.':'.$bin_index.':'.$bins{$wsize}->[$bin_index]->[0].'-'.$bins{$wsize}->[$bin_index]->[1]; | |
1620 my $tmp_minus = $bins{$wsize}->[$bin_index]->[0] * -1; | |
1621 | |
1622 if($tmp_minus > $bins{$wsize}->[$bin_index]->[1]){ | |
1623 $bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[0]; | |
1624 } | |
1625 else{ | |
1626 $bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[1]; | |
1627 } | |
1628 | |
1629 #warn "bin $bin_index now ". $bins{$wsize}->[$bin_index]; | |
1630 } | |
1631 } | |
1632 } | |
1633 } | |
1634 elsif($method != 0){#Do no post processing for count(0) | |
1635 throw('Collector currently only accomodates average_score, count and max magnitude methods'); | |
1636 } | |
1637 | |
1638 | |
1639 #Could return bin_counts too summary reporting in zmenu | |
1640 #Could also do counting of specific type | |
1641 | |
1642 #warn "returning bins ".Data::Dumper::Dumper(\%bins); | |
1643 | |
1644 return \%bins; | |
1645 } ## end sub _bin_features | |
1646 | |
1647 | |
1648 =pod | |
1649 | |
1650 #These could potentially be used as code refs to avoid having the if else block | |
1651 #This way we can also define new methods in the descendant Collector? | |
1652 #Would have to have pass args and refs to bin hashes | |
1653 #This would slow things down over direct access here | |
1654 #But speed is no longer that critical as we do not use the Collector for display | |
1655 #purposes, only to build the Collections which are then used for display directly. | |
1656 | |
1657 sub calculate_average_score{ | |
1658 my $self = shift; | |
1659 | |
1660 if ( $method == 5 ) { | |
1661 #average score | |
1662 #This is simple an average of all the scores for features which overlap this bin | |
1663 #No weighting with respect to the bin or the feature | |
1664 | |
1665 for ( my $bin_index = $start_bin ; | |
1666 $bin_index <= $end_bin ; | |
1667 ++$bin_index ) { | |
1668 | |
1669 #we should really push onto array here so we can have median or mean. | |
1670 $bins{$wsize}->[$bin_index] += $feature->score; | |
1671 $bin_counts{$wsize}->[$bin_index]++; | |
1672 } | |
1673 } | |
1674 | |
1675 | |
1676 | |
1677 } | |
1678 | |
1679 | |
1680 sub post_process_average_score{ | |
1681 | |
1682 } | |
1683 | |
1684 | |
1685 sub calculate_max_magnitude{ | |
1686 my $self = shift; | |
1687 | |
1688 #Max magnitude | |
1689 #Take the highest value +ve or -ve score | |
1690 for ( my $bin_index = $start_bin ; | |
1691 $bin_index <= $end_bin ; | |
1692 ++$bin_index ) { | |
1693 | |
1694 #we really need to capture the lowest -ve and higest +ve scores here and post process | |
1695 #To pick between them | |
1696 | |
1697 my $score = $feature->score; | |
1698 $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve | |
1699 | |
1700 if($score < $bins{$wsize}->[$bin_index]->[0]){ | |
1701 $bins{$wsize}->[$bin_index]->[0] = $score; | |
1702 } | |
1703 elsif($score > $bins{$wsize}->[$bin_index][1]){ | |
1704 $bins{$wsize}->[$bin_index]->[1] = $score; | |
1705 } | |
1706 } | |
1707 } | |
1708 | |
1709 | |
1710 sub post_process_max_magnitude{ | |
1711 | |
1712 } | |
1713 | |
1714 =cut | |
1715 | |
1716 #separated to allow addition of non-standard methods | |
1717 #Could potentially add these in new | |
1718 #and put this back in _bin_features | |
1719 | |
1720 | |
1721 sub validate_bin_method{ | |
1722 my ($self, $method) = @_; | |
1723 | |
1724 | |
1725 #change this to set the coderefs | |
1726 #Just set anonymous sub to immediately return for non post processed methods | |
1727 #No need for coderef, just set the method name? | |
1728 | |
1729 #if(! $self->can('calculate_'.$method)){ | |
1730 #throw("$method method does not have a valid calculate_${method} method"); | |
1731 #} | |
1732 | |
1733 #if($self->can('post_process_'.$method)){ | |
1734 ##set post process flag? | |
1735 #or simply do this can in line in the _bin_features sub? | |
1736 #} | |
1737 | |
1738 | |
1739 | |
1740 | |
1741 #Add average_score to avoid changing Collection.pm | |
1742 my $class = ref($self); | |
1743 ${$class::VALID_BINNING_METHODS}{'average_score'} = 5; | |
1744 ${$class::VALID_BINNING_METHODS}{'max_magnitude'} = 6; | |
1745 ${$class::VALID_BINNING_METHODS}{'count'} = 0; | |
1746 | |
1747 | |
1748 | |
1749 #foreach my $method_name(keys %{$class::VALID_BINNING_METHODS}){ | |
1750 # warn "valid method is $method name"; | |
1751 # } | |
1752 | |
1753 | |
1754 if ( ! exists( ${$class::VALID_BINNING_METHODS}{$method} ) ) { | |
1755 throw( | |
1756 sprintf( | |
1757 "Invalid binning method '%s', valid methods are:\n\t%s\n", | |
1758 $method, | |
1759 join( "\n\t", sort( keys(%{$class::VALID_BINNING_METHODS}) ) ) ) ); | |
1760 } | |
1761 else{ | |
1762 #warn "found valid method $method with index ".${$class::VALID_BINNING_METHODS}{$method}; | |
1763 } | |
1764 | |
1765 return ${$class::VALID_BINNING_METHODS}{$method}; | |
1766 } | |
1767 | |
1768 | |
1769 | |
1770 1; |