comparison variant_effect_predictor/Bio/EnsEMBL/Utils/Collector.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Utils::Collector
24
25 =head1 SYNOPSIS
26
27 # Inherit this base module in your feature specific Collector
28 # instance:
29
30 package Bio::EnsEMBL::Funcgen::Collector::ResultFeature;
31 use base('Bio::EnsEMBL::Utils::Collector');
32
33 # ... and define package config variables
34 $Bio::EnsEMBL::Funcgen::Collector::bin_model = 'SIMPLE';
35 $Bio::EnsEMBL::Funcgen::Collector::window_sizes =
36 [ 30, 65, 130, 260, 450, 648, 950, 1296 ];
37 # Could replace 30 with 0 here for low density data at natural resolution
38
39 $Bio::EnsEMBL::Utils::Collector::bin_method =
40 'count'; # only used by collector
41
42 $Bio::EnsEMBL::Utils::Collector::packed_size = 2;
43
44 # ... or simply use this module in a script either defining package
45 # config variables, or passing as parameters to the constructor:
46
47 my $collector =
48 Bio::EnsEMBL::Utils::BaseCollector->new( -pack_template => 'v' );
49
50 $Bio::EnsEMBL::Funcgen::Collector::pack_template = 'v';
51
52 # Config variables can also be over-ridden by passing a config hash to
53 # the store_window_bins_by_Slice() method:
54
55 $collector->store_window_bins_by_Slice( $slice, (
56 -pack_template => 'v',
57 -packed_size => 2 ) );
58
59 # NOTE: Over-riding default config variables can cause problems when
60 # storing or fetching data. e.g. Fetch may revert to using defaults or
61 # table partitions may not match window sizes.
62
63 =head1 DESCRIPTION
64
65 This package is the base Collector class which contains generic
66 getter/setter methods along with the main 'collecting' methods which
67 perform the majority of the work in generating compressed data
68 collections optimised for web display. The bins produced are aimed at
69 storage in a BLOB representing an entire seq_region i.e. even bins with
70 no features/null data are encoded as a 0 score. Non-BLOB collections
71 are currently not supported.
72
73 If your Collection class defines a Bio::EnsEMBL::Feature, then its
74 adaptor should inherit from the relevant Collection class.
75
76 The minimum prerequisites of the input features/data are that they have
77 a start() and end() method. For instance a Bio::EnsEMBL::Features
78 generated from a database or parsed from a flat file.
79
80 NOTE: This Collector does not have a lightweight mode previously used
81 for dynamic/on the fly collecting i.e. it does not take advantage of
82 bypassing object creation via the related BaseFeatureAdaptor method.
83
84 =cut
85
86 package Bio::EnsEMBL::Utils::Collector;
87
88 use Bio::EnsEMBL::Utils::Argument ('rearrange');
89 use Bio::EnsEMBL::Utils::Exception ('throw');
90 use strict;
91 use warnings;
92
93 ### Global package config vars
94
95 # Defaults
96 our $max_view_width = 1000000; # Max bp width in location/detailed view
97 our $max_data_type_size = 16777216; # Default is 16MB for long BLOB
98 # This is really a guide value as this should be set in the inheriting
99 # Collector class by deducting the rest of the row size from this value.
100 # Is is upto the inheritor to handle checking whether this size has been
101 # exceeded.
102
103 # NOTE: Theoretically the min window size is: slice_length/(16777216/2)
104 # So for human chr1: 249,250,621/(16777216/2) = 29.7 => 30. However,
105 # this size does not seem to directly translate to the MySQL
106 # max_allowed_packet_size. Increasing max_allowed_packet_size to 64MB
107 # solves this issue, and substr operation doesn't appear to incur any of
108 # the potential memory(4*) usage issues.
109
110 # Others global package variables which are set in the inheriting
111 # Collector class.
112 our ( $bin_model, $bin_method, $pack_template,
113 $packed_size, $window_sizes );
114
115 =head2 new
116
117 Args : None
118 Example :
119
120 my $collector = Bio::EnsEMBL::XXX::Collector::FEATURE->new();
121 $collector->store_windows_by_Slice($slice);
122
123 # Where XXX is, e.g. Compara, FuncGen etc.
124
125 Description: Simple new method to enable use of collector
126 when not inherited by a descendant of
127 Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor
128
129 Returntype : Bio::EnsEMBL::XXX::Collector
130 Exceptions : None
131 Caller : Collector script
132 Status : At Risk
133
134 =cut
135
136 sub new {
137 return bless {}, $_[0]; # Simply blesses this class as an empty hash.
138
139 # Do not set anything here, as will not be first in ISA for
140 # FeatureAdaptors. Hence, not guaranteed to be called.
141 }
142
143 =head2 new_assembly
144
145 Args : optional - string assembly version e.g. GRCh37
146 Example : $collector->new_assembly('GRCh37');
147 Description: Getter/Setter for new assembly version which should be
148 used to project only 0 wsize Collections.
149 Returntype : string
150 Exceptions : None
151 Caller : store_window_bins_by_Slice() or
152 write_collection() in inheriting Collector class.
153 Status : At Risk
154
155 =cut
156
157 sub new_assembly {
158 my ( $self, $new_asm ) = @_;
159
160 if ( defined($new_asm) ) {
161 $self->{'new_assembly'} = $new_asm;
162 }
163
164 return $self->{'new_assembly'};
165 }
166
167 ### Setter/Getter methods for basic/mandatory config
168 # Can also be set using package variables in the inheriting
169 # Collector/Adaptor or run script. Allows over-riding of defaults set
170 # in Adaptor/Collector.
171
172 # Package variables used here instead of attrs to enable easy
173 # default config in inheriting class/script method. Provided
174 # for easy/standardised fetch access outside of this package
175 # i.e. Collectors/Adaptors
176
177 =head2 max_data_type_size
178
179 Args : optional - int Maximum size of collection in bytes
180 Example : $collector->max_data_type_size($new_max_size);
181 Description: Getter/Setter for max_data_type_size, default is
182 currently set at in this class as 16777216 (16MB), for
183 long BLOB. This is used by the write_collection()
184 method to determine when to build and store a compressed
185 collection.
186 Returntype : int
187 Exceptions : None
188 Caller : bins_per_record() and
189 write_collection() in inheriting Collector class.
190 Status : At Risk
191
192 =cut
193
194 sub max_data_type_size {
195 my ( $self, $size ) = @_;
196
197 # Validate is sensible integer
198
199 if ( defined($size) ) {
200 if ( $size !~ /^\d+$/ ) {
201 throw("max_data_type_size must be a integer of bytes, not $size");
202 }
203 $max_data_type_size = $size;
204 } elsif ( !defined($max_data_type_size) ) {
205 # This should never happen as we have defaults in this module.
206 throw( 'You must define a '
207 . '$Bio::EnsEMBL::Utils::Collector::max_data_type_size '
208 . 'or pass -max_data_type_size config' );
209 }
210
211 return $max_data_type_size;
212 }
213
214 =head2 max_view_width
215
216 Args : optional - int Maximum width of view
217 Example : $collector->max_view_width($new_max_width);
218 Description: Getter/Setter for max_view_width, default is currently
219 set at in this class as 500000bp, for maximum level of
220 zoom permitted by location view.
221 Returntype : int
222 Exceptions : None
223 Caller : general
224 Status : At Risk
225
226 =cut
227
228 sub max_view_width {
229 my ( $self, $size ) = @_;
230
231 # Validate is sensible integer
232
233 if ( defined($size) ) {
234 if ( $size !~ /^\d+$/ ) {
235 throw("max_view_width must be a integer, not $size");
236 }
237 $max_view_width = $size;
238 } elsif ( !defined $max_view_width ) {
239 # This should never happen as we have defaults in this module.
240 throw( 'You must define a '
241 . '$Bio::EnsEMBL::Utils::Collector::max_view_width '
242 . 'or pass -max_view_width config' );
243 }
244
245 return $max_view_width;
246 }
247
248 =head2 bin_method
249
250 Args[0] : optional - string name of bin method e.g. 'max_magnitude'
251 Args[1] : optional - Bio::EnsEMBL::Funcgen::Parsers::InputSet
252 Example : my $bin_method = $self->bin_method();
253 Description: Getter/Setter for bin_method, default is normally set in
254 the inheriting Collector class either by package variable
255 or by passing a config hash via the store methods.
256 Returntype : string
257 Exceptions : Throws if cannot set by package variable
258 Caller : general
259 Status : At Risk
260
261 =cut
262
263 sub bin_method {
264 my ( $self, $bmethod, $config ) = @_;
265
266 if ( defined($bmethod) ) {
267 $bin_method = $bmethod;
268 }
269
270 if ( !defined($bin_method) ) {
271 throw( 'You must define a '
272 . '$Bio::EnsEMBL::Utils::Collector::bin_method '
273 . 'or pass -bin_method config' );
274 }
275
276 if ( !$self->can( "_calculate_" . $bin_method ) ) {
277 throw("$bin_method is not a valid/available binning method");
278 }
279
280 my $set_up_method = "_set_up_" . $bin_method;
281 if ( $self->can($set_up_method) ) {
282 $self->$set_up_method($config);
283 }
284
285 return $bin_method;
286 }
287
288 =head2 bin_model
289
290 Args : optional - string bin model e.g. SIMPLE or COMPLEX
291 Example : my $bin_model = $self->bin_model;
292 Description: Getter/Setter for bin_model, default should be set in
293 inheriting Collector class. Currently only supports
294 'SIMPLE' bin model.
295 Returntype : string
296 Exceptions : Throws if bin_model is not SIMPLE
297 Caller : general
298 Status : At Risk
299
300 =cut
301
302 sub bin_model {
303 my ( $self, $bmodel ) = @_;
304
305 if ( defined($bmodel) ) {
306 $bin_model = $bmodel;
307 }
308
309 if ( !defined($bin_model) ) {
310 throw( 'You must define a '
311 . '$Bio::EnsEMBL::Utils::Collector::bin_model '
312 . 'or pass -bin_model config' );
313 }
314
315 if ( $bin_model ne 'SIMPLE' ) {
316 throw( 'Bio::EnsEMBL::Utils::Collector does not yet support '
317 . 'non-SIMPLE bin models' );
318 }
319
320 return $bin_model;
321 }
322
323
324 =head2 window_sizes
325
326 Args : optional - arrayref of window sizes
327 Example :
328
329 foreach my $wsize ( @{ $collector->window_sizes } )
330 { # Do some collecting
331 }
332
333 Description: Getter/Setter for window_sizes. Default should be set
334 in inheriting Collector (if the config is dynamic),
335 FeatureAdaptor class or script using package variable or
336 this method.
337 NOTE: Redefining these may cause a mismatch with the
338 table partition definition.
339 Returntype : arrayref of ints
340 Exceptions : Throws if cannot set a valid array of int window sizes
341 Caller : general
342 Status : At Risk - rename bin_sizes?
343
344 =cut
345
346 sub window_sizes {
347 my ( $self, $sizes ) = @_;
348
349 if ( defined($sizes) ) {
350 $window_sizes = $sizes;
351 }
352
353 if ( !( ref($window_sizes)
354 && ( ref($window_sizes) eq 'ARRAY' )
355 && ( scalar(@$window_sizes) > 0 ) ) )
356 {
357 throw('Must pass -windows_sizes in the config '
358 . 'or define $Bio::EnsEMBL::Utils::Collector::window_sizes '
359 . 'in your Collector as an array ref of integer window_sizes' );
360 }
361
362 return $window_sizes;
363 }
364
365
366
367
368 =head2 has_window_size
369
370 Args : int - window size to validate
371 Example : if( $collector->has_window_size('30') ){
372 #Do something wrt to 30bp window size
373 }
374
375 Description: Simple utility method to validate whether this Collector
376 has a given window_size
377 Returntype : Boolean
378 Exceptions : Throws if window size not specified
379 Caller : general
380 Status : At Risk
381
382 =cut
383
384
385 sub has_window_size{
386 my ( $self, $size ) = @_;
387
388 if(! defined $size){
389 throw('You must pass a window size to validate');
390 }
391
392 return grep(/$size/, @$window_sizes);
393 }
394
395
396
397
398 ### Getter/Setters for BLOB collection config
399 # NOTE: Overriding the defaults here may cause a mismatch when the data
400 # is retrieved.
401
402 =head2 pack_template
403
404 Args : optional - string perl 'pack' template
405 Example : $self->pack_template('v');
406 Description: Getter/Setter for pack_template. Default should be set
407 in inheriting Collector (if the config is dynamic),
408 FeatureAdaptor class or script using package variable or
409 this method.
410 Returntype : string
411 Exceptions : Throws if cannot set pack_template from package variable
412 Caller : FeatureAdaptor::_obj_from_sth
413 Status : At Risk
414
415 =cut
416
417 sub pack_template {
418 my ( $self, $template ) = @_;
419
420 if ( defined($template) ) {
421 $pack_template = $template;
422 }
423
424 if ( !defined($pack_template) ) {
425 throw( 'Must pass a per score '
426 . '-pack_template in the config '
427 . 'or define $Bio::EnsEMBL::Utils::Collector::pack_template '
428 . 'in your Collector' );
429 }
430
431 return $pack_template;
432 }
433
434 =head2 packed_size
435
436 Args : optional - int size of perl 'pack' template in bytes
437 Example : $self->packed_size(2);
438 Description: Getter/Setter for packed_size. Default should be set
439 in inheriting Collector (if the config is dynamic),
440 FeatureAdaptor class or script using package variable or
441 this method.
442 Returntype : string
443 Exceptions : Throws if cannot set pack_template from pacakge variable
444 Caller : current_packed_size() and
445 FeatureAdaptor::_obj_from_sth()
446 Status : At Risk
447
448 =cut
449
450 sub packed_size {
451 my ( $self, $size ) = @_;
452
453 if ( defined($size) ) {
454 $packed_size = $size;
455 }
456
457 if ( !defined($packed_size) ) {
458 throw( 'Must pass -packed_size(wrt to pack_template) config '
459 . 'or define $Bio::EnsEMBL::Utils::Collector::packed_size '
460 . 'in your Collector' );
461 }
462
463 if ( $packed_size !~ /^\d+$/ ) {
464 throw( "$packed_size is not an integer, "
465 . "must pass a size integer for packed_size "
466 . "which specifies size of pack_template:\t"
467 . $pack_template );
468 }
469
470 return $packed_size;
471 }
472
473 =head2 bins_per_record
474
475 Example : my $bin_per_records = $self->bin_per_record
476 Description: Simple method to calculate the max number of bins
477 allowed per record given the current config.
478 Returntype : int
479 Exceptions : None
480 Caller :
481 Status : At Risk
482
483 =cut
484
485 sub bins_per_record {
486 return int( $max_data_type_size/$packed_size );
487 }
488
489
490 =head2 current_packed_size
491
492 Arg[0] : int - window size
493 Example : my $cps = $self->current_packed_size($wsize);
494 Description: Simple method to calculate the max number of bins
495 allowed per record given the current config.
496 Returntype : int
497 Exceptions : None
498 Caller :
499 Status : At Risk
500
501 =cut
502
503 sub current_packed_size {
504 my ( $self, $wsize ) = @_;
505 return ( scalar( @{ $self->score_cache($wsize) } )*$packed_size );
506 }
507
508
509 =head2 score_cache
510
511 Arg[0] : int - window size
512 Example : my $cps = $self->current_packed_size($wsize);
513 Description: Handles caching of bin scores for each window size
514 Returntype : arrayref
515 Exceptions : Throws if no window size defined
516 Caller : current_packed_size() and store_collection()
517 methods
518 Status : At Risk
519
520 =cut
521
522 sub score_cache {
523 my ( $self, $wsize, $scores ) = @_;
524
525 if ( !defined($wsize) ) {
526 throw('Must pass a window size argument');
527 }
528
529 $self->{'score_cache'}{$wsize} ||= [];
530
531 if ( defined($scores) ) {
532 push( @{ $self->{'score_cache'}{$wsize} }, @{$scores} );
533 }
534
535 return $self->{'score_cache'}{$wsize};
536 }
537
538
539 =head2 collection_start
540
541 Arg[0] : int - window_size
542 Arg[1] : optional int - seq_region_start
543 Example : my $coll_start->(150);
544 Description: Getter/Setter collection seq_region_start
545 Returntype : int
546 Exceptions : Throws if no window size defined
547 Caller : store_window_bin_by_Slice() and write_collection()
548 Status : At Risk
549
550 =cut
551
552 sub collection_start {
553 my ( $self, $wsize, $sr_start ) = @_;
554
555 if ( !defined($wsize) ) {
556 throw('Must pass a window size argument');
557 }
558
559 if ( defined($sr_start) ) {
560 $self->{'collection_start'}{$wsize} = $sr_start;
561 }
562
563 return $self->{'collection_start'}{$wsize};
564 }
565
566
567 =head2 collection_end
568
569 Arg[0] : int - window_size
570 Arg[1] : optional int - seq_region_end
571 Example : my $coll_end->(150);
572 Description: Getter/Setter collection seq_region_end
573 Returntype : int
574 Exceptions : Throws if no window size defined
575 Caller : inheriting Collector write_collection method
576 Status : At Risk
577
578 =cut
579
580 sub collection_end{
581 my ($self, $wsize, $sr_end) = @_;
582 throw('Must pass a window size argument') if ! defined $wsize;
583
584 if(defined $sr_end){
585 $self->{'collection_end'}{$wsize} = $sr_end;
586 }
587 else{
588 return $self->{'collection_end'}{$wsize};
589 }
590 }
591
592
593 =head2 collection_strand
594
595 Arg[0] : int - window_size
596 Arg[1] : optional int - seq_region_strand
597 Example : my $coll_start->(0);
598 Description: Getter/Setter collection seq_region_strand
599 Returntype : int
600 Exceptions : Throws if no window size defined
601 Caller : inheriting Collector write_collection method
602 Status : At Risk - Collections are currently strandless
603
604 =cut
605
606 sub collection_strand {
607 my ( $self, $wsize, $strand ) = @_;
608
609 if ( !defined($wsize) ) {
610 throw('Must pass a window size argument');
611 }
612
613 if ( defined $strand ) {
614 $self->{'collection_strand'}{$wsize} = $strand;
615 }
616
617 return $self->{'collection_strand'}{$wsize};
618 }
619
620
621 ### Here follows the actual working methods
622
623 =head2 _get_Slice_chunks
624
625 Description: Defines the optimal set of slice chunks to use for
626 generating collections such that redundant fetches
627 are minimized.
628 Returntype : hashref of window_size chunk size pairs
629 Exceptions : Throws if no window sizes or max_view_width defined
630 Caller : store_window_bin_by_Slice()
631 Status : At Risk
632
633 =cut
634
635 sub _get_Slice_chunks {
636 my $self = shift;
637
638 if ( !defined($window_sizes) || !defined($max_view_width) ) {
639 throw( 'You must pass both a window_size array ref '
640 . 'and max_view_width arguments' );
641 }
642
643 if ( !defined( $self->{'_slice_chunks'} ) ) {
644 # Calulate sensible slice length based on window sizes
645 my @wsizes = sort { $a <=> $b } @$window_sizes;
646
647 # Handle calculating only 0 wsize
648 if ( scalar(@wsizes) == 1
649 && $wsizes[0] == 0 )
650 {
651 return { $max_view_width => [0] };
652 }
653
654 my $multiplier = int( $max_view_width/$wsizes[$#wsizes] );
655
656 my $chunk_length = $multiplier*$wsizes[$#wsizes];
657 my $not_divisible = 1;
658
659 my %chunk_windows; # Registry of chunk lengths to run with windows
660 my %workable_chunks = map { $_ => {} } @wsizes;
661
662 # get rid of natural resolution as this will always work
663 delete $workable_chunks{'0'};
664
665 while ( $not_divisible && $chunk_length != 0 ) {
666 $not_divisible = 0;
667
668 foreach my $wsize (@wsizes) {
669 if ( $wsize == 0 ) {
670 # Special wsize for normal data
671 next;
672 }
673
674 # Set not divisible if modulus is true
675 if ( $chunk_length % $wsize ) {
676 $not_divisible = 1;
677 } else {
678 $workable_chunks{$wsize}{$chunk_length} = [];
679 }
680 }
681
682 # Gradually shrink the length until we find a workable slice
683 # length for all windows.
684 if ($not_divisible) {
685 $chunk_length -= $wsizes[$#wsizes];
686 }
687 }
688
689 my %chunk_sets;
690
691 if ( $chunk_length == 0 ) {
692 print "Could not find chunk length "
693 . "for all window sizes, "
694 . "attempting to subset windows "
695 . "using alternate slice length\n";
696
697 foreach my $wsize ( keys(%workable_chunks) ) {
698 # Loop through windows, seeing if they are workable in the other
699 # windows.
700
701 foreach my $chunk ( keys( %{ $workable_chunks{$wsize} } ) ) {
702
703 foreach my $other_wsize ( keys %workable_chunks ) {
704 next if $wsize == $other_wsize;
705
706 if ( exists( $workable_chunks{$other_wsize}{$chunk} ) ) {
707 # only push it onto the other wsize, as we will do the
708 # reverse later
709 $chunk_sets{$chunk}{$wsize} = undef;
710 }
711 }
712 }
713 }
714
715 # %chunk_sets represents co-occurence of wsizes with repect to
716 # chunks. Take the set which has the most windows and the longest
717 # chunk. Then get the largest which handles the rest.
718
719 # define possible set lengths
720 my $i = 0;
721 my %set_lengths;
722
723 map { $set_lengths{$i} = []; $i++ } @wsizes;
724
725 # get rid of natural resolution as this will always work
726 delete $set_lengths{'0'};
727
728 # Store chunks lengths for each set size
729 foreach my $chunk ( keys(%chunk_sets) ) {
730 my $set_size = scalar( values( %{ $chunk_sets{$chunk} } ) );
731 push( @{ $set_lengths{$set_size} }, $chunk );
732 }
733
734 # Get the biggest set with the longest length;
735
736 # Scalar here as we are disregarding natural resolution of 0 in
737 # loop.
738 my $largest_size = scalar(@wsizes);
739 my $found_largest_set = 0;
740
741 while ( !$found_largest_set ) {
742 $largest_size--;
743
744 if ( scalar( @{ $set_lengths{$largest_size} } ) > 0 ) {
745 $found_largest_set = 1;
746 }
747 }
748
749 my ($largest_chunk) =
750 sort { $b <=> $a } @{ $set_lengths{$largest_size} };
751
752 my @largest_windows = keys %{ $chunk_sets{$largest_chunk} };
753 @{ $chunk_windows{$largest_chunk} } = @largest_windows;
754
755 print "Largest chunk $largest_chunk($largest_size) "
756 . "contains windows: @largest_windows\n";
757
758 my %remaining_windows = map { $_ => {} } @wsizes;
759
760 # get rid of natural resolution as this will always work
761 delete $remaining_windows{'0'};
762
763 map { delete $remaining_windows{$_} } @largest_windows;
764 my $remaining_set_size = scalar( keys(%remaining_windows) );
765
766 # Use array here for practicality, would need to maintain hash if
767 # we need to iterate.
768 my @rwindows = keys(%remaining_windows);
769
770 # Could be one window, but this will not be in the co-occurence
771 # hash %chunk_sets.
772 my $next_chunk;
773
774 if ( scalar(@rwindows) == 1 ) {
775 my ($last_window) = @rwindows;
776 # Find a suitably large chunk for this one window.
777 $multiplier = int( 500000/$last_window );
778 $next_chunk = $multiplier*$last_window;
779 } else {
780
781 foreach my $chunk ( sort { $b <=> $a }
782 @{ $set_lengths{$remaining_set_size} } )
783 {
784 my $seen_count = 0;
785
786 foreach my $rwindow (@rwindows) {
787 if ( grep /$rwindow/,
788 ( values( %{ $chunk_sets{$chunk} } ) ) )
789 {
790 $seen_count++;
791 }
792 }
793
794 if ( $seen_count == $remaining_set_size ) {
795 $next_chunk = $chunk;
796 last;
797 }
798 }
799
800 }
801
802 @{ $chunk_windows{$next_chunk} } = @rwindows;
803
804 if ( defined($next_chunk) ) {
805 print "Found next chunk length $next_chunk "
806 . "contains remaining windows:\t@rwindows\n";
807 } else {
808 warn "Need to write iterative method for set definition";
809 throw( 'Could not find workable slice length '
810 . 'for remaining windows: '
811 . join( ', ', @rwindows ) );
812 }
813 } else {
814 @{ $chunk_windows{$chunk_length} } = keys(%workable_chunks);
815 print "Found workable chunk length $chunk_length "
816 . "for all window sizes:\t"
817 . join( ' ', @{ $chunk_windows{$chunk_length} } ) . "\n";
818 }
819
820 $self->{'_slice_chunks'} = \%chunk_windows;
821 } ## end if ( !defined( $self->...))
822
823 return $self->{'_slice_chunks'};
824 } ## end sub _get_Slice_chunks
825
826
827
828
829 =head2 set_config
830
831 Arg[0] : optional hash - parameter hash(see above methods for more info):
832
833 WINDOW_SIZES => array ref - subset of defined window
834 sizes
835 BIN_METHOD => string
836 MAX_VIEW_WIDTH => int
837 MAX_DATA_TYPE_SIZE => int
838 PACK_TEMPLATE => string
839 PACKED_SIZE => int
840 BIN_MODEL => string
841 NEW_ASSEMBLY => string
842 METHOD_CONFIG => hash of method specific config params
843 SKIP_ZERO_WINDOW => boolean - skips generation of 0 wsize
844 this is used if already generated
845 from an assembly projection.
846
847 NOTE: Over-riding any of the default config may cause
848 problems when storing or retrieving Collection data,
849 except sub sets of default window sizes.
850
851 Description: This method replaces the constructor as new will not be
852 called for Adaptor based Collectors.
853 Separating this from the store method is currently
854 redundant as jobs are normally submitetd in Slice based
855 jobs. However, this will be required if the store method
856 is further seaprated into fetch/generate and store methods
857 Returntype : None
858 Exceptions : Throws if no window sizes or max_view_width defined
859 Caller : Inheritor Collector e.g. Bio::EnsEMBL::Funcgen:Collector::ResultFeature
860 or script.
861 Status : At Risk
862
863 =cut
864
865 sub set_config {
866 my ( $self, %config ) = @_;
867
868 my ( $wsizes, $bmethod, $mv_width,
869 $md_type_size, $template, $psize,
870 $bmodel, $new_assm, $skip_zero_window,
871 $method_config )
872 = rearrange( [ 'WINDOW_SIZES', 'BIN_METHOD',
873 'MAX_VIEW_WIDTH', 'MAX_DATA_TYPE_SIZE',
874 'PACK_TEMPLATE', 'PACKED_SIZE',
875 'BIN_MODEL', 'NEW_ASSEMBLY',
876 'SKIP_ZERO_WINDOW', 'METHOD_CONFIG' ],
877 %config );
878
879 ### VAILDATE/SET VARS/CONFIG
880
881 # Attrs used in this method
882 $self->bin_method( $bmethod, $method_config );
883 $self->bin_model($bmodel);
884 $self->window_sizes($wsizes);
885
886 # Set to undef if we have empty array? To change this we need to
887 # pass the config hash -window_sizes conditionally
888 # This currently overwrite the defaults!
889 # if ( ref($window_sizes) eq 'ARRAY'
890 # && scalar( @{$window_sizes} ) == 0 )
891 # {
892 # $window_sizes = undef;
893 # }
894
895 # Attrs used in other (store) methods
896 $self->pack_template($template);
897 $self->packed_size($psize);
898 $self->max_data_type_size($md_type_size);
899 $self->max_view_width($mv_width);
900
901 # Other vars
902 $self->new_assembly($new_assm);
903 $self->{'_only_natural'} = 0;
904 $self->{'_store_natural'} = grep /^0$/, @$window_sizes;
905
906 ### Set window_sizes
907
908 if ( $self->new_assembly() ) {
909 print "Assembly projection may cause problems "
910 . "for large Collections, "
911 . "defaulting to window_sizes = (0)\n";
912
913
914 if ( $skip_zero_window ) {
915 throw( "You cannot -skip_zero_window or "
916 . "omit 0 from -window_sizes "
917 . "when projecting to a new assembly($new_assm) "
918 . "which should only be generated using window_size=0" );
919 }
920
921
922
923
924 # Then build the bins on the projected 0 level single Features
925
926 # Test we haven't explicity set window_sizes to be something else
927 if ( defined($wsizes)
928 && !( scalar(@$wsizes) == 1 && $wsizes->[0] == 0 ) )
929 {
930 throw( "You have set window_sizes config "
931 . "which are not safe when projecting to "
932 . "a new assembly($new_assm), "
933 . "please omit window_sizes config or set to 0" );
934 }
935
936 $self->window_sizes( [0] );
937 } else {
938
939 if ( $wsizes && $skip_zero_window &&
940 ( grep /^0$/, @$wsizes )) {
941 #Only test passed params not default config
942
943 throw( "You have specied skip_zero_window "
944 . "and window_size 0 in your parameters, "
945 . "please remove one of these" );
946 }
947 elsif ( defined($window_sizes) && !grep /^0$/, @$window_sizes ) {
948 $skip_zero_window = 1;
949 # re-add 0 window as we need this to build the collections
950 # see ...
951 unshift( @{$window_sizes}, 0 );
952 }
953 }
954
955
956 if ( $self->{'_store_natural'} && scalar( @{$window_sizes} ) == 1 ) {
957 $self->{'_only_natural'} = 1;
958 }
959 if ($skip_zero_window) {
960 $self->{'_store_natural'} = 0;
961 }
962
963 return;
964 } ## end sub set_config
965
966 =head2 store_window_bins_by_Slice
967
968 Arg[0] : Bio::EnsEMBL:Slice
969 Example : $collector->store_window_bins_by_Slice($slice);
970 Description: This is the main run method, it loops through
971 optimal slice chunks from _define_window_chunks,
972 calls _bin_features_by_Slice as appropriate and
973 calls write_collection in the inheriting Collector
974 class/script.
975 Returntype : None
976 Exceptions : Throws if Bio::EnsEMBL::Slice is not defined
977 Caller : store methods in inheriting Collector class/script
978 Status : At Risk
979
980 =cut
981
982 sub store_window_bins_by_Slice {
983 my ( $self, $slice ) = @_;
984
985 warn "Need to be careful here "
986 . "about cleaning start end strand caches between "
987 . "serially run slices";
988
989 if ( !( defined($slice)
990 && ref($slice)
991 && $slice->isa('Bio::EnsEMBL::Slice') ) )
992 {
993 throw('You must pass a valid Bio::EnsEMBL::Slice');
994 }
995
996 # Rollback previously stored features.
997 # Change 'can' to empty method stubb with pod ???
998 if ( $self->can('rollback_Features_by_Slice') ) {
999 $self->rollback_Features_by_Slice($slice);
1000 } else {
1001 warn ref($self)
1002 . " cannot rollback_Features_by_Slice. "
1003 . "This may result in storage failure "
1004 . "or duplicate Collections if there is pre-existing data";
1005 }
1006
1007 ### PROCESS CHUNKS
1008 my %chunk_windows = %{ $self->_get_Slice_chunks };
1009 my (%counts);
1010 my $store_natural = $self->{'_store_natural'};
1011 my $only_natural = $self->{'_only_natural'};
1012 $counts{0} = 0; # Set natural res count to 0
1013 my $slice_end = $slice->end;
1014 my $orig_start = $slice->start;
1015 my $region = $slice->coord_system_name;
1016 my $version = $slice->coord_system->version;
1017 my $seq_region_name = $slice->seq_region_name;
1018 my $strand = $slice->strand;
1019
1020 # Warn if this is not a full slice. Version needed in case we are
1021 # projecting from a non-default version slice
1022 my $full_slice =
1023 $slice->adaptor->fetch_by_region( $region, $seq_region_name, undef,
1024 undef, undef, $version );
1025
1026 if ( ( $full_slice->start() != $orig_start )
1027 || ( $full_slice->end() != $slice_end ) )
1028 {
1029 warn "Generating collections using sub-Slices "
1030 . "can result in data issues/artifacts";
1031 # Last chunk might not be the correct window length. Test
1032 # slices less than chunk length can cause failures in
1033 # _bin_features_by_window_sizes others?
1034 }
1035
1036 # Set the initial collection_start to orig_start. This is not the
1037 # case for 0 wsize where it must always be the true feature start.
1038 for my $wsize (@$window_sizes) {
1039 if ( $wsize == 0 ) { next }
1040 $self->collection_start( $wsize, $orig_start );
1041
1042 # Also reset collection end and score cache in case we are running
1043 # serially.
1044 $self->{collection_end}{$wsize} = undef;
1045 $self->{'score_cache'}{$wsize} = [];
1046 }
1047
1048 my $first_chunk_length = 1;
1049
1050 foreach my $chunk_length ( sort keys %chunk_windows ) {
1051 print "Processing windows "
1052 . join( ', ', @{ $chunk_windows{$chunk_length} } )
1053 . " with chunk length $chunk_length\n";
1054
1055 # Set window counts to 0
1056 map $counts{$_} = 0, @{ $chunk_windows{$chunk_length} };
1057
1058 # May need to reset flat file parser handle or other caches via
1059 # inheriting Collector
1060 if ( !$first_chunk_length ) {
1061 # Change 'can' to empty method stubb with pod???
1062 if ( $self->can('reinitialise_input') ) {
1063 $self->reinitialise_input();
1064 }
1065 }
1066
1067 $first_chunk_length = 0;
1068
1069 # Now walk through slice using slice length chunks and build all
1070 # windows in each chunk.
1071 my $in_slice = 1;
1072 my $start_adj = 0;
1073 my ( $sub_slice, $sub_end, $features, $bins );
1074 my $sub_start = 1;
1075 my $slice_length = $slice->length();
1076
1077 # Always create in local coords for fetch
1078 # Then change to seq_region coords for store if required
1079
1080 while ($in_slice) {
1081 $sub_start += $start_adj;
1082 $sub_end = $sub_start + $chunk_length - 1;
1083
1084 if ( $sub_end >= $slice_length ) {
1085 # Surplus bins are removed in store/write_collection in caller
1086 $in_slice = 0;
1087 }
1088
1089 $sub_slice =
1090 $slice->adaptor->fetch_by_region( $region, $seq_region_name,
1091 $sub_start + $orig_start - 1,
1092 $sub_end + $orig_start - 1,
1093 $strand, $version );
1094
1095 # Can't subslice as this will not clip if we go over the length of
1096 # the slice, unlike normal slice fetching. Will clipping the end
1097 # to the slice end cause any problems here? How will this affect
1098 # bin clipping?
1099
1100 ### Grab features and shift chunk coords
1101 $features = $self->get_Features_by_Slice($sub_slice);
1102
1103 # warn "Binning "
1104 # . scalar(@$features)
1105 # . " Features for chunk length $chunk_length, on Slice "
1106 # . $sub_slice->name;
1107
1108 if ( ( @{$features} )
1109 && ref( $features->[0] ) =~ /Bio::EnsEMBL::Utils::Collection/ )
1110 {
1111 # Would need to create base module with generic methods:
1112 # window_size, ...
1113
1114 # Check that the returned feature/collections support window_size.
1115 # All Collections should be able to
1116
1117 if ( $features->[0]->can('window_size') ) {
1118 if ( $features->[0]->window_size != 0 ) {
1119 throw( "You are trying to generated Collections from "
1120 . "a non-zero window sized Collection:\t"
1121 . $features->[1]->{'window_size'} );
1122 }
1123
1124 # This should never happen
1125 # if ( !$skip_zero_window ) {
1126 # throw( 'You have retrieved data from a Collection '
1127 # . 'which without using -skip_zero_window '
1128 # . 'i.e. you are trying to generate overwrite '
1129 # . 'the data you are generating the Collections from' );
1130 # }
1131
1132 } else {
1133 throw( 'Something is wrong, '
1134 . 'the Collection you have retrieved '
1135 . 'does not support the method window_size' );
1136 }
1137 } ## end if ( ( @{$features} ) ...)
1138
1139 # Set collection start here for 0 window_size
1140 if ( @{$features}
1141 && $store_natural
1142 && !defined( $self->collection_start(0) ) )
1143 {
1144 $self->collection_start( 0,
1145 $features->[0]->start + $sub_start );
1146 }
1147
1148 if ($in_slice) {
1149 $start_adj = $chunk_length;
1150 }
1151
1152 # Collect features into wsize bins
1153 if ( !$only_natural ) {
1154 # Get hashref of wsize=>bin array pairs
1155 $bins =
1156 $self->_bin_features_by_Slice_window_sizes(
1157 -slice => $sub_slice,
1158 -window_sizes => $chunk_windows{$chunk_length},
1159 -features => $features, );
1160 }
1161
1162 # Handle 0 wsize
1163 if ($store_natural) {
1164 foreach my $feature ( @{$features} ) {
1165 $counts{0}++;
1166
1167 if ( $bin_model eq 'SIMPLE' ) {
1168 $self->collection_start( 0, $feature->start + $sub_start );
1169
1170 $self->write_collection(
1171 0,
1172 $slice, # Pass Slice to sub-slice when storing
1173 $feature->end + $sub_start,
1174 $feature->strand, # Need to pass strand for 0 resolution
1175 $feature->scores, );
1176 }
1177 }
1178
1179 print "Window size 0 (natural resolution) has "
1180 . scalar( @{$features} )
1181 . " feature bins for:\t"
1182 . $sub_slice->name . "\n";
1183 }
1184
1185 # Now store collections for wsizes >0
1186 my $num_bins;
1187
1188 foreach my $wsize ( sort keys( %{$bins} ) ) {
1189 $num_bins = scalar( @{ $bins->{$wsize} } );
1190 $counts{$wsize} += $num_bins;
1191
1192 if ( $bin_model eq 'SIMPLE' ) {
1193 $self->write_collection(
1194 $wsize,
1195 $slice,
1196 #$sub_start,
1197 $sub_end,
1198 $slice->strand, # This is most likely 1!
1199 # Override this woth 0 in descendant Collector if required.
1200 $bins->{$wsize}, );
1201
1202 } else {
1203 throw( 'Bio::EnsEMBL::Utils::Collector '
1204 . 'does not yet support non-SIMPLE bin models' );
1205 # i.e. More than one score
1206 }
1207 }
1208 } ## end while ($in_slice)
1209
1210 # Turn off storing of natural resolution for next chunk length sets
1211 $store_natural = 0;
1212 } ## end foreach my $chunk_length ( ...)
1213
1214 # Write last collections for each wsize
1215
1216 foreach my $wsize (@$window_sizes) {
1217
1218 if ( ( $wsize == 0 && !$store_natural )
1219 || ( $wsize != 0 && $only_natural ) )
1220 {
1221 next;
1222 }
1223
1224 print "Writing final $wsize window_size collection, "
1225 . "this may result in slightly different "
1226 . "bin numbers from counts due to removing "
1227 . "overhanging bins past end of slice\n";
1228 $self->write_collection( $wsize, $slice );
1229 }
1230
1231 # Print some counts
1232 foreach my $wsize ( sort ( keys %counts ) ) {
1233 print "Generated "
1234 . $counts{$wsize}
1235 . " bins for window size $wsize for "
1236 . $slice->name . "\n";
1237 # Some may have failed to store if we are projecting to a new
1238 # assembly.
1239 }
1240
1241 return;
1242 } ## end sub store_window_bins_by_Slice
1243
1244 =head2 _bin_features_by_Slice_window_sizes
1245
1246 Args[0] : Bio::EnsEMBL::Slice
1247 Args[1] : ARRAYREF of window sizes
1248 Args[2] : ARRAYREF of features with start and end method
1249 e.g. Bio::EnsEMBL::Features
1250 Example :
1251
1252 $bins =
1253 $self->_bin_features_by_window_sizes(
1254 -slice => $slice,
1255 -window_sizes => $chunk_windows{$chunk_length},
1256 -features => $features, );
1257
1258 Description: Bins feature scores for a given list of window sizes and
1259 predefined method.
1260 Returntype : HASHREF of scores per bin per window size
1261 Exceptions : None
1262 Caller : store_window_bins_by_Slice
1263 Status : At Risk
1264
1265 =cut
1266
1267 sub _bin_features_by_Slice_window_sizes {
1268 my ( $self, @args ) = @_;
1269
1270 my ( $slice, $wsizes, $features ) =
1271 rearrange( [ 'SLICE', 'WINDOW_SIZES', 'FEATURES' ], @args );
1272
1273 # Generate these once in caller?
1274 my $calc_method = '_calculate_' . $bin_method;
1275 my $post_method = '_post_process_' . $bin_method;
1276
1277 # Do this conditional on the Collection type i.e. is
1278 # collection seq_region blob then no else yes Would need
1279 # $Bio::EnsEMBL::Utils::Collector::collection_format=BLOB|STANDARD
1280 # if ( !defined($features) || !@{$features} ) { return {} }
1281
1282 # Set up some hashes to store data by window_size
1283 my ( %bins, %nbins, %bin_counts );
1284 my $slice_start = $slice->start();
1285 my $slice_length = $slice->length();
1286
1287 # Set up some bin data for the windows
1288 foreach my $wsize (@$wsizes) {
1289 $nbins{$wsize} = int( $slice_length/$wsize ); # int rounds down
1290 # nbins is index of the bin not the 'number'
1291 # Unless $slice_length is a multiple!
1292 if ( !( $slice_length % $wsize ) ) { $nbins{$wsize}-- }
1293
1294 # Create default bins with 0
1295 $bins{$wsize} = [];
1296 map { $bins{$wsize}->[$_] = 0 } ( 0 .. $nbins{$wsize} );
1297
1298 # Set bin counts to 0 for each bin
1299 $bin_counts{$wsize} = [];
1300
1301 # This is adding an undef to the start of the array!?
1302 map { $bin_counts{$wsize}->[ ($_) ] = 0 } @{ $bins{$wsize} };
1303
1304 foreach my $bin ( @{ $bins{$wsize} } ) {
1305 $bin_counts{$wsize}->[$bin] = 0;
1306 }
1307 }
1308
1309 my $feature_index = 0;
1310 my ( $bin_index, @bin_masks );
1311
1312 foreach my $feature ( @{$features} ) {
1313 # Set up the bins for each window size
1314
1315 foreach my $wsize (@$wsizes) {
1316 my $start_bin = int( ( $feature->start )/$wsize );
1317 my $end_bin = int( ( $feature->end )/$wsize );
1318
1319 if ( $end_bin > $nbins{$wsize} ) {
1320 $end_bin = $nbins{$wsize};
1321 }
1322
1323 $self->$calc_method( $feature, $start_bin, $end_bin,
1324 $wsize, \%bins, \%bin_counts );
1325 }
1326
1327 }
1328
1329 # Now do post processing of bins if required
1330 if ( $self->can($post_method) ) {
1331 $self->$post_method( \%bins, \%bin_counts );
1332 }
1333
1334 return \%bins;
1335 } ## end sub _bin_features_by_Slice_window_sizes
1336 # end sub _bin_features_by_Slice
1337
1338
1339 ### Here follows the bin methods
1340 # These may also be defined in the inheriting Collector class. No tests
1341 # as these are internal and require speed.
1342
1343
1344 =head2 _calculate_count
1345
1346 Args[0] : feature e.g. Bio::EnsEMBL::Feature
1347 Args[1] : int - start bin
1348 Args[2] : int - end bin
1349 Args[3] : int - window_size
1350 Args[4] : hashref - score bins
1351 Example : $self->$calc_method
1352 Description: Adds count to bins which this feature overlaps
1353 Returntype : None
1354 Exceptions : None
1355 Caller : _bin_features_by_window_sizes
1356 Status : At Risk
1357
1358 =cut
1359
1360 sub _calculate_count {
1361 my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref ) = @_;
1362
1363 my $bin_index;
1364
1365 for ( $bin_index = $start_bin; $bin_index <= $end_bin; ++$bin_index )
1366 {
1367 $bins_ref->{$wsize}->[$bin_index]++;
1368 }
1369
1370 return;
1371 }
1372
1373
1374 =head2 _calculate_average_score
1375
1376 Args[0] : feature e.g. Bio::EnsEMBL::Feature
1377 Args[1] : int - start bin
1378 Args[2] : int - end bin
1379 Args[3] : int - window_size
1380 Args[4] : hashref - score bins
1381 Example : $self->$calc_method
1382 Description: Adds score to bins which this feature overlaps
1383 Returntype : None
1384 Exceptions : None
1385 Caller : _bin_features_by_window_sizes
1386 Status : At Risk
1387
1388 =cut
1389
1390
1391 sub _calculate_average_score {
1392 my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref,
1393 $bin_counts_ref )
1394 = @_;
1395
1396 # This is simple an average of all the scores for features which
1397 # overlap this bin. No weighting with respect to the bin or the
1398 # feature.
1399
1400 my $score = $self->get_score_by_Feature($feature);
1401
1402 for ( my $bin_index = $start_bin;
1403 $bin_index <= $end_bin;
1404 ++$bin_index )
1405 {
1406 # We should really push onto array here so we can have median or
1407 # mean.
1408
1409 $bins_ref->{$wsize}->[$bin_index] += $score;
1410 $bin_counts_ref->{$wsize}->[$bin_index]++;
1411 }
1412
1413 return;
1414 }
1415
1416
1417 =head2 _post_process_average_score
1418
1419 Args[0] : hashref - score bins
1420 Args[1] : hashref - count bins
1421 Example : $self->$post_method
1422 Description: Post processes bins to calculate average score
1423 Returntype : None
1424 Exceptions : None
1425 Caller : _bin_features_by_window_sizes
1426 Status : At Risk
1427
1428 =cut
1429
1430 sub _post_process_average_score {
1431 my ( $self, $bins_ref, $bin_counts_ref ) = @_;
1432
1433 foreach my $wsize ( keys %{$bins_ref} ) {
1434 foreach my $bin_index ( 0 .. $#{ $bins_ref->{$wsize} } ) {
1435
1436 if ( $bin_counts_ref->{$wsize}->[$bin_index] ) {
1437 $bins_ref->{$wsize}->[$bin_index] /=
1438 $bin_counts_ref->{$wsize}->[$bin_index];
1439 }
1440
1441 }
1442 }
1443
1444 return;
1445 }
1446
1447
1448 =head2 _calculate_max_magnitude
1449
1450 Args[0] : feature e.g. Bio::EnsEMBL::Feature
1451 Args[1] : int - start bin
1452 Args[2] : int - end bin
1453 Args[3] : int - window_size
1454 Args[4] : hashref - score bins
1455 Example : $self->$calc_method
1456 Description: Sets max +/-ve scores for bins which this feature overlaps
1457 Returntype : None
1458 Exceptions : None
1459 Caller : _bin_features_by_window_sizes
1460 Status : At Risk
1461
1462 =cut
1463
1464 sub _calculate_max_magnitude {
1465 my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref ) = @_;
1466
1467 my $score = $self->get_score_by_Feature($feature);
1468
1469 # Max magnitude
1470 # Take the highest value +ve or -ve score
1471 for ( my $bin_index = $start_bin;
1472 $bin_index <= $end_bin;
1473 ++$bin_index )
1474 {
1475
1476 # We really need to capture the lowest -ve and higest +ve scores
1477 # here and post process to pick between them.
1478
1479 $bins_ref->{$wsize}->[$bin_index] ||= [ 0, 0 ]; #-ve, +ve
1480
1481 if ( $score < $bins_ref->{$wsize}->[$bin_index]->[0] ) {
1482 $bins_ref->{$wsize}->[$bin_index]->[0] = $score;
1483 } elsif ( $score > $bins_ref->{$wsize}->[$bin_index][1] ) {
1484 $bins_ref->{$wsize}->[$bin_index]->[1] = $score;
1485 }
1486 }
1487
1488 return;
1489 } ## end sub _calculate_max_magnitude
1490
1491
1492 =head2 _post_process_max_magnitude
1493
1494 Args[0] : hashref - score bins
1495 Args[1] : hashref - count bins
1496 Example : $self->$post_method
1497 Description: Post processes bins to pick largest +ve or -ve score
1498 Returntype : None
1499 Exceptions : None
1500 Caller : _bin_features_by_window_sizes
1501 Status : At Risk
1502
1503 =cut
1504
1505 sub _post_process_max_magnitude {
1506 my ( $self, $bins_ref ) = @_;
1507
1508 # Take the highest value +ve or -ve score
1509
1510 foreach my $wsize ( keys %{$bins_ref} ) {
1511 foreach my $bin_index ( 0 .. $#{ $bins_ref->{$wsize} } ) {
1512
1513 # Have potential for no listref in a given bin
1514
1515 # default value if we haven't seen anything is 0
1516 # Actually want an array of -ve +ve values
1517
1518 if ( $bins_ref->{$wsize}->[$bin_index] ) {
1519 my $tmp_minus = -$bins_ref->{$wsize}->[$bin_index]->[0];
1520
1521 if ( $tmp_minus > $bins_ref->{$wsize}->[$bin_index]->[1] ) {
1522 $bins_ref->{$wsize}->[$bin_index] =
1523 $bins_ref->{$wsize}->[$bin_index]->[0];
1524 } else {
1525 $bins_ref->{$wsize}->[$bin_index] =
1526 $bins_ref->{$wsize}->[$bin_index]->[1];
1527 }
1528
1529 }
1530
1531 }
1532 }
1533 return;
1534 } ## end sub _post_process_max_magnitude
1535
1536
1537 =head2 _calculate_RPKM
1538
1539 Args[0] : feature e.g. Bio::EnsEMBL::Feature
1540 Args[1] : int - start bin
1541 Args[2] : int - end bin
1542 Args[3] : int - window_size
1543 Args[4] : hashref - score bins
1544 Example : $self->$calc_method
1545 Description: Stores counts to calculate Read Per Kb per Million(RPKM)
1546 Returntype : None
1547 Exceptions : None
1548 Caller : _bin_features_by_window_sizes
1549 Status : At Risk
1550
1551 =cut
1552
1553 sub _calculate_RPKM {
1554 my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref ) = @_;
1555
1556 $self->_calculate_count( $feature, $start_bin, $end_bin,
1557 $wsize, $bins_ref );
1558
1559 return;
1560 }
1561
1562
1563 =head2 _post_process_RPKM
1564
1565 Args[0] : hashref - score bins
1566 Args[1] : hashref - count bins
1567 Example : $self->$post_method
1568 Description: Post processes bins to calculate average score
1569 Returntype : None
1570 Exceptions : None
1571 Caller : _bin_features_by_window_sizes
1572 Status : At Risk
1573
1574 =cut
1575
1576 sub _post_process_RPKM {
1577 my ( $self, $bins_ref ) = @_;
1578
1579 #10^9 x C / NGB
1580 #C = Reads overlapping bin
1581 #N = Total reads in the experiment
1582 #G = Length of bin in bps
1583 #(don't really have to account for non-ref/HAPs or gender here
1584 #as should be close enough, CellTypes/gender differences will be miniscule)
1585 #B = length of each bin
1586
1587 foreach my $wsize(keys %{$bins_ref}){
1588
1589 foreach my $bin_index(0..$#{$bins_ref->{$wsize}}){
1590 $bins_ref->{$wsize}->[$bin_index] =
1591 ((10**9) *
1592 $bins_ref->{$wsize}->[$bin_index])/(($self->_RPKM_factor($wsize)) * $wsize);
1593 }
1594 }
1595
1596 return;
1597
1598 }
1599
1600
1601 =head2 _set_up_RPKM
1602
1603 Args[0] : hashref - method config e.g
1604 {
1605 DNADB => Bio::EnsEMBL::DBSQL::DBAdaptor,
1606 TOTAL_FEATURE => $total_feature_count,
1607 }
1608
1609 Example : $self->$set_up_method($config);
1610 Description: Sets the RPKM factor
1611 Returntype : None
1612 Exceptions : Throws is required config params are not set
1613 Caller : bin_method
1614 Status : At Risk
1615
1616 =cut
1617
1618
1619 sub _set_up_RPKM{
1620 my ($self, $config) = @_;
1621
1622 my ($dnadb, $total_features) = rearrange([ 'DNADB', 'TOTAL_FEATURES'], %{$config});
1623
1624 #Test specifically here to notify about config hash
1625 if(! $total_features){
1626 throw("For RPKM you must pass a valid 'total_features' ".
1627 "as part of the method config hash.");
1628 }
1629
1630 if(! $dnadb){
1631 throw("For RPKM you must pass 'dnadb' as part of the method config hash.");
1632 }
1633
1634 foreach my $wsize(@{$self->window_sizes}){
1635 #Should never have 0 here
1636 $self->_RPKM_factor($wsize, ($wsize * $total_features)); #N*G
1637
1638 warn "setting $wsize RPKM factor($wsize * $total_features) to ".
1639 $self->_RPKM_factor($wsize);
1640 }
1641
1642 return;
1643 } ## end sub _set_up_RPKM
1644
1645
1646 =head2 _RPKM_factor
1647
1648 Args[0] : int - RPKM factor i.e. (Total reads in the experiment *
1649 Genome length)
1650 Example : $self->_RPKM_factor($wsize, $factor);
1651 Description: Gets/Sets the RPKM factor
1652 Returntype : int
1653 Exceptions : None
1654 Caller : _set_up_RPKM, _post_process_RPKM
1655 Status : At Risk
1656
1657 =cut
1658
1659 sub _RPKM_factor{
1660 my ($self, $wsize, $factor) = @_;
1661
1662 if (! defined $wsize){
1663 throw("You must pass at least window_size to get or set the RPKM factor");
1664 }
1665
1666 if(defined $factor){
1667 $self->{'RPKM_factor'}{$wsize} = $factor;
1668 }
1669 elsif(! exists $self->{'RPKM_factor'}{$wsize}){
1670 #This should never happen unless the window sizes
1671 #are redefined after initialisation
1672 throw("You have requested an RPKM factor for a window_size".
1673 " which has not been set:\t$wsize");
1674 }
1675
1676 return $self->{'RPKM_factor'}{$wsize};
1677 }
1678
1679 =head2 get_diploid_genome_length_by_gender
1680
1681 Args[0] : string - RPKM factor i.e. (Total reads in the experiment *
1682 Genome length)
1683 Args[1] : string - gender e.g. male or female
1684 Example :
1685
1686 my $glength =
1687 $self->get_diploid_genome_length_by_gender( $dnadb, $gender );
1688
1689 Description: Gets the gender specific diploid genome length,
1690 including non-ref but not including haplotypes. Only
1691 handles species with X/Y sex chromosomes.
1692 Returntype : int
1693 Exceptions : None
1694 Caller : _set_up_RPKM, _post_process_RPKM
1695 Status : At Risk - Move to and export from generic Utils Slice module???
1696
1697 =cut
1698
1699 sub get_diploid_genome_length_by_gender {
1700 my ( $dnadb, $gender ) = @_;
1701
1702 my %sex_chrs = ( 'Y' => 'male',
1703 'X' => 'female', );
1704
1705 my $dip_length = 0;
1706
1707 if (!(( ref($dnadb) && $dnadb->isa('Bio::EnsEMBL::DBSQL::DBAdaptor') )
1708 && $dnadb->grou() eq 'core'
1709 && ( defined $gender && $gender =~ /(male|female)/ ) ) )
1710 {
1711 throw( "Must provide valid "
1712 . "Bio::EnsEMBL::DBSQL::DBAdaptor($dnadb) and "
1713 . "gender ($gender) arguments" );
1714 }
1715
1716 my @ref_slices = $dnadb->get_SliceAdaptor->fetch_all('toplevel');
1717
1718 # Include non-ref(unassembled), but omit haps/lrgs(i.e. redundant)
1719
1720 foreach my $slice (
1721 @{ $dnadb->get_SliceAdaptor->fetch_all( 'toplevel', undef, 1, 1 ) }
1722 )
1723 {
1724 # Include duplicated region for true diploid length
1725
1726 # Skip haps/lrgs
1727 if ( ( $slice->coord_system->name() eq 'chromosome'
1728 && !$slice->is_reference() )
1729 || $slice->coord_system->name() eq 'lrg' )
1730 {
1731 next;
1732 }
1733
1734 if ( exists( $sex_chrs{ $slice->seq_region_name() } ) ) {
1735 if ( $gender eq 'male' ) {
1736 $dip_length += $slice->length;
1737 } elsif ( $sex_chrs{ $slice->seq_region_name } eq 'male' ) {
1738 next;
1739 }
1740 }
1741
1742 $dip_length += 2*$slice->length;
1743 }
1744
1745 return $dip_length;
1746 } ## end sub get_diploid_genome_length_by_gender
1747
1748
1749 1;