Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,1770 @@
+# $Id: Collector.pm,v 1.7 2011/01/10 11:27:34 nj1 Exp $
+
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+#Your Bio::Ensembl::Collection::Feature defs module should inherit from here
+#This could be a local defs file which you have created and require'd into your script
+
+#If your collections defs module refers to a Bio::EnsEMBL::Feature, 
+#then it's adaptor should inherit from the collections defs module
+
+
+
+package Bio::EnsEMBL::Funcgen::Collector;
+#Move this to Bio::EnsEMBL::Utils::Collector for 59?
+
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::Utils::Argument  ('rearrange');
+use Bio::EnsEMBL::Utils::Exception ('throw');
+use Bio::EnsEMBL::Funcgen::ResultFeature;
+
+#use base('Bio::EnsEMBL::Collection');#ISA
+
+our ($pack_template, $packed_size, @window_sizes); #These get set in the FeatureAdaptor
+#Make these constants and remove setter functionality in methods?
+#Only really important for pack template and windows, maybe these if we are going to start
+
+
+our $max_data_type_size = 16777216; #Default is 16MB for long blob
+#we need to deduct the size of the rest of the record here!
+#For a 2byte packet the smallest window size possible is:
+#(slice->length/(16777216/2)
+#so int(bin_size)+1
+#Obviously have to use the largest slice here, for human chr1:
+#249,250,621/(16777216/2) = 29.7???
+#We may need to up this slightly to account for larger chrs?
+#Implications on memory usage? Is it 4 times for blob manipulation?
+#Does substr require this manipulation?
+#This max_allowed_packet_size does not seem to translate directly to the size of the
+#data being stored e.g. quite a bit more is needed.  ISG haven't got to the bottom of this yet.
+#But have simply upped the config to 67108864 to handle the largest human chr.
+
+our $max_view_width     = 500000;#Max width in Region In Detail;
+
+
+#our %VALID_BINNING_METHODS
+#Remove this in favour of can->('calculate_.$method) and coderefs?
+
+
+
+
+#To do
+# 1 DONE Merge in Collection code, (no need to do this, removed inheritance)
+# 2 Write simple BED input to flat file output.
+# 3 Separate store method so we can simply get, then wrap store around this
+# 4 Test get method with slice adjusts
+# 5 separate set_config? 
+# 6 optimise generate_bin_chunks to handle just one window size for display?
+# 7 Handle packed_size pack_template as methods  constants
+# 8 Provide override method in basefeature adaptor which will use package constant in feature adaptor
+# This is because these are really adaptor config, the collector only needs to know the
+# packed_size, and in the absence of an feature adaptor also provides the default methods for both.
+# If we substr in the API then we need to set sensible limits on blob size, otherwise we will have to unpack a lot of data
+# to get at the slice we want.
+# OR
+# Change adaptor to substr in DB based on known blob ranges/window size
+# and stitch together any which cross boundaries. This depends on speed of substr at end of large blob TEST!
+# Load with current code first and test this before making either change!
+# Delete empty (non-0) collections? i.e. For seq_regions which do not have any source features.
+#
+# 9 Handle PAR/HAP regions using fetch_normalised_slice_projections This has to be done in the feature adaptor! Then restrict  to non_dup regions in calling script
+
+
+
+=head2 new
+
+  Args       : None
+  Example    : my $collector = Bio::EnsEMBL::(Funcgen|Compara|Variation::)Collector::FEATURE->new;
+               $collector->store_windows_by_Slice($slice);
+  Description: Simple new method to enable use of collector when not inherited by
+               a descendant of Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor
+  Returntype : Bio::EnsEMBL::Funcgen::Collector
+  Exceptions : None
+  Caller     : Collector script
+  Status     : At Risk
+
+=cut
+
+sub new{
+  return bless {}, $_[0];#Simple blesses this class as an empty hash
+
+  #Do not set anything here
+  #As will not be first in ISA for feature adaptors
+  #Hence not guaranteed to be called
+
+}
+
+#Setter/Getter methods if we don't have a dedicated Collector module
+#to set the package variables in? Also to allow overriding of defaults.
+#This can be used by the write_collection method
+#to determine when to build and store a compressed collection
+#Effectively the max size of the data type you are using to store 
+#a compressed score. defaults to max for long blob 16MB
+
+
+
+#Generic method, but only ever called by write_collection in descendant
+
+sub new_assembly{
+  my ($self, $new_ass) = @_;
+
+  if($new_ass){
+	#Validate new assm to project to
+
+
+	$self->{'new_assembly'} = $new_ass;
+  }
+
+  return $self->{'new_assembly'};
+}
+
+
+sub max_data_type_size{
+  my ($self, $size) = @_;
+
+  #Validate is sensible integer?
+
+  if($size && ! int($size)){
+	throw("max_data_type_size must be a integer of bytes, not $size");
+  }
+  elsif($size){
+	$self->{'max_data_type_size'} = $size;
+  }
+  elsif(! defined $self->{'max_data_type_size'}){
+	#default set at head of this module or in descendant Collector
+	$self->{'max_data_type_size'} = $Bio::EnsEMBL::Funcgen::Collector::max_data_type_size;
+  }
+
+  return $self->{'max_data_type_size'};
+}
+
+sub max_view_width{
+  my ($self, $size) = @_;
+
+  #Validate is sensible integer?
+
+  if($size && ! int($size)){
+	throw("max_view_width must be a integer, not $size");
+  }
+  elsif($size){
+	$self->{'max_view_width'} = $size;
+  }
+  elsif(! defined $self->{'max_view_width'}){
+	#default set at head of this module or in descendant Collector
+	$self->{'max_view_width'} = $Bio::EnsEMBL::Funcgen::Collector::max_view_width;
+  }
+
+  return $self->{'max_view_width'};
+}
+
+
+sub bins_per_record(){
+#$collector_class::bins_per_record = ($collector_class::max_data_type_size/$collector_class::packed_size);#This should be done dynamically as we may redefine either of these variables?
+
+  my ($self) = shift;
+
+  return int($self->max_data_type_size/$self->packed_size);
+}
+
+
+#The defaults for these should be defined in the feature/format specific Collector descendant
+#either by specifying the package variables or using config attrs to set methods?
+#general config should be parsed here.
+#rename bin_method?
+
+sub bin_method{
+  my ($self, $method) = @_;
+
+  if($method || ! $self->{'bin_method'}){
+
+	if($method){
+	  $self->{'bin_method'} = $method;
+	  #should test can here? or validate versus hash?
+	}
+	elsif(! $self->{'bin_method'}){
+	  
+	  if (! defined  $Bio::EnsEMBL::Funcgen::Collector::bin_method){
+		throw('Must pass a bin_method in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_method in your Collector');
+	  }
+	  
+	  $self->{'bin_method'} = $Bio::EnsEMBL::Funcgen::Collector::bin_method;
+	}
+
+	#or current validate method if we are keeping the method in the if/else block
+	
+
+  	#if(! $self->can("calculate_${method}"))){
+	#throw("$method is no a valid a valid binning method");
+	#}
+  
+  }
+
+  return $self->{'bin_method'};
+}
+
+#We could replace this with a hash of bin_methods and models?
+#This could then be used to validate
+#Altho if we are going to commodotise the bin methods, then we need to be able to 
+#define this in the child Collector. Could still do this by modifying the method/model
+#hash from the child Collector
+
+sub bin_model{
+  my ($self, $bin_model) = @_;
+  
+  if($bin_model || ! $self->{'bin_model'}){
+
+	if($bin_model){
+	  $self->{'bin_model'} = $bin_model;
+	}
+	elsif(! $self->{'bin_model'}){
+	  
+	  #Set as global constant defined in descendant Collector
+	  if (! defined  $Bio::EnsEMBL::Funcgen::Collector::bin_model){
+		throw('Must pass -bin_model in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_model in your Collector');
+	  }
+	  
+	  $self->{'bin_model'} = $Bio::EnsEMBL::Funcgen::Collector::bin_model;
+	}
+	
+	#Need to validate bin models here
+	throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models')	if $self->{'bin_model'} ne 'SIMPLE';
+  }
+
+  return $self->{'bin_model'};
+}
+
+#This can be overridden by adaptor method
+#At present this could cause problems as we can pass window sizes in the config, but they will never be set
+#as adaptor method is not a setter. Adaptor method should throw if we try and set them as this could cause problems when fetching and not knowing the custom sizes?
+
+sub window_sizes{
+  my ($self, $sizes) = @_;
+
+  if($sizes || ! $self->{'window_sizes'}){
+
+	if($sizes){
+	  $self->{'window_sizes'} = $sizes;
+	}
+	else{#! $self->{'windows_sizes'
+
+	  if (! @window_sizes){
+		throw('Must pass -windows_sizes in the config or define @Bio::EnsEMBL::Funcgen::Collector::window_sizes in your Collector');
+	  }
+	  
+ 	  @{$self->{'window_sizes'}} = @window_sizes;
+	}
+	
+	if(ref($self->{'window_sizes'}) ne 'ARRAY' ||
+	   scalar(@{$self->{'window_sizes'}}) == 0){
+	  throw('window_sizes must be an arrayref of at least one window size');
+	}
+  }
+
+  return $self->{'window_sizes'};
+}
+
+
+
+
+#Optional attrs dependant on whether Collection is packed
+#Can be redefined in the adaptor but becareful never to redefine the actual values
+#As these should really be constants for a given Collector
+#What is best here? We only want pack methods for storing/fetching compressed collections
+#Move this to base feature adaptor and define attrs as constants using
+#package variable? Or directly in new?
+#Then direct modification will be caught.
+#Just leave here for now.
+
+#Caller _obj_from_sth/store
+
+sub pack_template{
+  my ($self, $template) = @_;
+
+  if($template){
+	$self->{'pack_template'} = $template;
+  }
+  elsif(! $self->{'pack_template'}){
+
+	#Set as global constant defined in descendant Collector
+	
+	if (! defined  $Bio::EnsEMBL::Funcgen::Collector::pack_template){
+	  throw('Must pass a per score pack_template in the config or define $Bio::EnsEMBL::Funcgen::Collector::pack_template in your Collector');
+	}
+
+	$self->{'pack_template'} = $Bio::EnsEMBL::Funcgen::Collector::pack_template;
+  }
+
+  return $self->{'pack_template'};
+
+}
+
+#Caller _obj_from_sth/store & current_packed_size
+
+sub packed_size{
+  my ($self, $size) = @_;
+
+  if($size){
+
+	if(! int($size)){
+	  throw("$size is not an integer, must pass a size integer for packed_size which specifies size of pack_template:\t".$self->pack_template);
+	}
+
+	$self->{'packed_size'} = $size;
+  }
+  elsif(! $self->{'packed_size'}){
+
+	#Set as global constant defined in descendant Collector
+	
+	if (! defined  $Bio::EnsEMBL::Funcgen::Collector::packed_size){
+	  throw('Must pass a packed_size(wrt to pack_template) in the config or define $Bio::EnsEMBL::Funcgen::Collector::packed_size in your Collector');
+	}
+
+	$self->{'packed_size'} = $Bio::EnsEMBL::Funcgen::Collector::packed_size;
+  }
+
+  return $self->{'packed_size'};
+
+}
+
+#These methods are used by the descendant Collector
+#For caching infor whilst building collections
+#This is used to log how big a collection has grown before storing
+
+sub current_packed_size{
+  my ($self, $wsize) = @_;
+  
+  #$self->{'current_packed_size'}{$wsize} ||= 0;
+
+  #if(defined $cps){
+#	$self->{'current_packed_size'}{$wsize} = $cps;
+#  }
+#  else{
+#	return $self->{'current_packed_size'}{$wsize};
+#  }
+
+  return (scalar(@{$self->score_cache($wsize)})*$self->packed_size);
+
+}
+
+
+sub score_cache{
+  my ($self, $wsize, $scores) = @_;
+
+  $self->{'score_cache'}{$wsize} ||= [];
+
+  if(defined $scores){
+	push @{$self->{'score_cache'}{$wsize}}, @{$scores};
+  }
+  else{
+	#Do this here to stop passing the ref everytime
+	#Will this be faster?
+	#Would certainly be faster if we were not returning a ref
+	return $self->{'score_cache'}{$wsize};
+  }
+}
+
+#These last methods are only used for the 0 wsize
+#natural resolution and ar wrt the orig_slice passed
+#to store_windows_by_Slice
+
+sub collection_start{
+  my ($self, $wsize, $sr_start) = @_;
+
+  if(defined $sr_start){
+	$self->{'collection_start'}{$wsize} = $sr_start;
+  }
+  else{
+	return $self->{'collection_start'}{$wsize};
+  }
+}
+
+sub collection_end{
+  my ($self, $wsize, $sr_end) = @_;
+
+  if(defined $sr_end){
+	$self->{'collection_end'}{$wsize} = $sr_end;
+  }
+  else{
+	return $self->{'collection_end'}{$wsize};
+  }
+}
+
+sub collection_strand{
+   my ($self, $wsize, $strand) = @_;
+
+   if(defined $strand){
+	 $self->{'collection_strand'}{$wsize} = $strand;
+   }
+   else{
+	 return $self->{'collection_strand'}{$wsize};
+  }
+}
+
+
+
+=pod
+
+sub _create_feature {
+  my ( $this, $feature_type, $args ) = @_;
+
+  my $feature = $this->SUPER::_create_feature( $feature_type, $args );
+
+  if ( !$this->_lightweight() ) {
+    my ( $phase, $end_phase, $stable_id, $version, $created_date,
+         $modified_date, $is_current )
+      = rearrange( [ 'PHASE',        'END_PHASE',
+                     'STABLE_ID',    'VERSION',
+                     'CREATED_DATE', 'MODIFIED_DATE',
+                     'IS_CURRENT'
+                   ],
+                   %{$args} );
+
+    push( @{$feature},
+          $phase, $end_phase, $stable_id, $version, $created_date,
+          $modified_date, $is_current );
+  }
+
+  return $feature;
+}
+
+sub _create_feature_fast {
+  my ( $this, $feature_type, $args ) = @_;
+
+  my $feature =
+    $this->SUPER::_create_feature_fast( $feature_type, $args );
+
+  return $feature;
+}
+
+
+#This might not be sensible for Features which are split across tables
+
+sub _tables {
+  my ($this) = @_;
+
+  my @tables = $this->SUPER::_tables();
+
+  if ( $this->_lightweight() ) {
+    return ( $tables[0] );
+  }
+
+  return @tables;
+}
+
+
+sub _columns {
+  my ($this) = @_;
+
+  my @columns = $this->SUPER::_columns();
+
+  if ( $this->_lightweight() ) {
+
+	#What is this doing?
+	#Probably not sensible for ResultFeature
+    @columns[ 5 .. $#columns ] = map( 1, 5 .. $#columns );
+  }
+
+  return @columns;
+}
+
+#Also not sensible for objects spread across several tables
+
+sub _default_where_clause {
+  my ($this) = @_;
+
+  if ( $this->_lightweight() ) {
+    return '';
+  }
+
+  return $this->SUPER::_default_where_clause();
+}
+
+=cut
+
+
+
+#This need to be generic
+#Again we need to pass an accessor method/reference?
+#Will be some sort of generic fetch for feature adaptors
+#or while loop for in flat file accessor
+#rollback to be handled in caller?
+
+# To do
+#
+# 1 Allow variable chunks lengths (so we only have one resolution of windows?)
+#   This will allow SNP collections which currently define classification i.e colour
+#   Density of SNPs within window will define shading. Count will be displayed in zmenu
+#   This maybe something we have to do in the descendant
+#
+# 2 Implement collection param definition in/from descendant
+
+
+# return collection config from adaptor fetch
+# window size
+# fixed width?
+# render/collection style?
+# This chould be implemented in BaseFeatureAdaptor::generic_fetch?
+# Or could be done in the calling fetchmethod?
+#
+
+#need to change this to get_window_bin_by_Slice
+#to enable generating bins on uncompressed data
+#Need to remove all counts and store based code to store caller
+#this would mean removing any pack based code too
+#separate set_config method
+
+
+#Probelm here is size of slice?
+#We need to generate bins all in one go, but also need to store at interval
+#so as not to explode memory
+#Do we need to separate the window generation from the bin generation code?
+
+
+#Define the optimal way to generate windowed data by
+#finding the most common denominator
+
+sub _define_window_chunks{
+  my ($self, $window_sizes, $max_view_size) = @_;
+
+  ### DEFINE CHUNKS WRT WINDOWS
+
+  #Shortcut for on the fly uncompressed collection retrieval
+  #if(scalar(@wsizes) = 1){
+  #
+  #}
+  #else{
+   
+  #Calulate sensible slice length based on window sizes
+  my @wsizes = sort {$a <=> $b} @$window_sizes;
+  
+  #We need a default when only calculating 0 resolution
+  #Will binning code work with only 0 resolution?
+  if((scalar(@wsizes) == 1) &&
+	 $wsizes[0] == 0){
+	return { $self->max_view_width => [0] };
+  }
+  
+
+  my $multiplier = int($max_view_size/$wsizes[$#wsizes]);
+  my $chunk_length = $multiplier * $wsizes[$#wsizes];
+  my $not_divisible = 1;
+  my %chunk_windows;#Registry of chunk lengths to run with windows
+  my %workable_chunks = map {$_ => {}} @wsizes;
+  delete $workable_chunks{'0'};#get rid of natural resolution as this will always work
+
+
+  while($not_divisible && $chunk_length != 0){
+	$not_divisible = 0;
+	
+	foreach my $wsize(@wsizes){
+	  next if $wsize == 0;#Special wsize for normal data
+	  
+	  #Set not divisible if modulus is true
+	  if($chunk_length % $wsize){
+		$not_divisible = 1;
+	  }
+	  else{
+		#No need to listref here?
+		$workable_chunks{$wsize}{$chunk_length} = [];
+	  }
+	  
+	  #warn "chunk length is $chunk_length and not_divisible is $not_divisible";
+	}
+	
+	#Gradually shrink the length until we find a workable slice length for all windows
+	$chunk_length -= $wsizes[$#wsizes] if $not_divisible;
+  }
+  
+  my %chunk_sets;
+  
+  
+  if($chunk_length == 0){
+	print "Could not find chunk length for all window sizes, attempting to subset windows using alternate slice length\n";
+	
+	foreach my $wsize(keys %workable_chunks){
+	  #Loop through windows, seeing if they are workable in the other windows
+	  
+	  foreach my $chunk(keys %{$workable_chunks{$wsize}}){
+		
+		foreach my $other_wsize(keys %workable_chunks){
+		  next if $wsize == $other_wsize;
+		  
+		  if(exists $workable_chunks{$other_wsize}{$chunk}){
+			#only push it onto the other wsize, as we will do the reverse later			
+			$chunk_sets{$chunk}{$wsize} = undef;
+		  }
+		}
+	  }
+	}
+	
+	#Now we have a register of co-occurence of wsizes with repect to chunks
+	#Loop through finding the least amount of sets with the longest chunk length?
+	#There is no way to decide which is best?
+	#we could calculate the number of loops? Factored by the chunk length?
+	#Let's just print out and see what we get
+	
+	#warn "chunk sets are :\n".Data::Dumper::Dumper(\%chunk_sets);
+
+
+	#For now let's just take the one which has the most windows and the longest chunk
+	#Then we just get the largest which handles the rest.
+
+	#define possible set lengths
+	my $i = 0;
+	my %set_lengths;
+	map {$set_lengths{$i} = []; $i++} @wsizes;
+	delete $set_lengths{'0'};#get rid of natural resolution as this will always work
+
+
+	#store chunks lengths for each set size
+	foreach my $chunk(keys %chunk_sets){
+	  my $set_size = scalar(values %{$chunk_sets{$chunk}});
+	  push @{$set_lengths{$set_size}}, $chunk;
+	}		
+  
+	#Now we get the biggest set with the longest length;
+	my $largest_size = scalar(@wsizes);#scalar here as we are disregarding natural resolution of 0 in loop
+	my $found_largest_set = 0;
+
+	while(! $found_largest_set){
+	  $largest_size--;
+
+	  if(scalar(@{$set_lengths{$largest_size}}>0)){
+		$found_largest_set = 1;
+	  }
+	}
+
+
+	#We should be able to loop this bit, to find all the biggest sets.
+	my ($largest_chunk) = sort {$b<=>$a} @{$set_lengths{$largest_size}};
+	#we could even be selective here, but let's just take the first one for now
+
+	my @largest_windows = keys %{$chunk_sets{$largest_chunk}};
+	@{$chunk_windows{$largest_chunk}} = @largest_windows;
+	print "Largest chunk $largest_chunk($largest_size) contains windows: @largest_windows\n";
+
+	my %remaining_windows = map {$_ => {}} @wsizes;
+	delete $remaining_windows{'0'};#get rid of natural resolution as this will always work
+	map { delete $remaining_windows{$_} } @largest_windows;	
+	my $remaining_set_size = scalar(keys %remaining_windows);
+
+	#swapping to array here for practicality, would need to maintain hash if we need to iterate
+	my @rwindows = keys %remaining_windows;
+
+	#This could just be one window, but this will not be inthe co-occurence hash %chunk_sets
+	#Hence the normal approach will not work. and we just want to find a suitably large chunk for this one window.
+	my $next_chunk;
+
+	if(scalar(@rwindows) == 1){
+	  #we just want to find a suitably large chunk for this one window.
+	  my ($last_window) = @rwindows;
+
+	  $multiplier = int(500000/$last_window);
+	  $next_chunk = $multiplier * $last_window;
+	}
+	else{
+	  #Now were are doing something very similar to above
+	  #populating a set_size chunk length registry
+	  #my %seen_hash;
+	 	  
+
+	  foreach my $chunk(sort {$b<=>$a} @{$set_lengths{$remaining_set_size}}){
+	  	my $seen_count = 0;
+		
+		foreach my $rwindow(@rwindows){
+		  
+		  $seen_count++ if grep/$rwindow/, (values %{$chunk_sets{$chunk}});
+		}
+		
+		if ($seen_count == $remaining_set_size){
+		  
+		  $next_chunk = $chunk;
+		  last;
+		}
+	  }
+	}
+
+	@{$chunk_windows{$next_chunk}} = @rwindows;
+
+
+	if($next_chunk){
+
+	  print "Found next chunk length $next_chunk contains remaining windows:\t@rwindows\n";
+
+	#Now we want to cycle through all the set lengths which could contain the ones not in the first
+	#so we need to
+	}
+	else{
+	  warn "Need to write iterative sub for set definition";
+	  throw('Could not find workable slice length for remaining windows: '.
+			join(', ', @rwindows));
+
+	}
+  }
+  else{
+	@{$chunk_windows{$chunk_length}} = keys(%workable_chunks);
+   	print "Found workable chunk length($chunk_length) for all window sizes:\t".
+	  join(' ', @{$chunk_windows{$chunk_length}})."\n";
+  }
+
+  return \%chunk_windows;
+}
+
+
+#Let's concentrate on store function first before we split out into store and fetch methods
+#How will this work with the Bed parser?
+#The descendant collector will sort the input and detect the current slice before calling 
+#store_window_bins_by_Slice.  This may require some caching of line or seeking as we will see the next slice before we have a chance to set it.
+#This will store as ResultFeature collections, so maybe we need to separate the input from output code?
+#i.e. Bed parser/wrapper
+#     ResultFeatureAdaptor wrapper
+#These 
+
+#Problem with passing window_sizes here
+#We need to check that they aren't already defined a class variables as this could potentially
+#screw up retrieval, expect for only 0 or all but 0
+#Should we remove this config and force the class variable to be set in the 'adaptor'
+#Method is then only used internally, make private or only getter? Set by changing class vars?
+
+sub store_window_bins_by_Slice{
+  my ($self, $slice, %config) = @_;
+
+  my ($window_sizes, $logic_name, $bin_method, $fetch_method_ref, $max_view_width, 
+	  $max_data_type_size, $pack_template, $packed_size, $bin_model, $new_assm, $skip_zero_window) =
+    rearrange( [ 'WINDOW_SIZES', 'LOGIC_NAME', 'BIN_METHOD', 'FETCH_METHOD_REF', 'MAX_VIEW_WIDTH', 'MAX_DATA_TYPE_SIZE', 'PACK_TEMPLATE', 'PACKED_SIZE', 'BIN_MODEL', 'NEW_ASSEMBLY', 'SKIP_ZERO_WINDOW'], %config );
+
+  warn "Need to be careful here about cleaning start end strand caches between serially run slices";
+
+
+
+  ### VAILDATE VARS/CONFIG
+
+  #This could be done once in set_config, could then remove setter bahviour from attr methods?
+  #All default defs params/methods can be overridden by config params
+  #Attrs used in this method
+  $bin_method   = $self->bin_method($bin_method);
+  $bin_model    = $self->bin_model($bin_model);
+  #$window_sizes = $self->window_sizes($window_sizes);#Now done below
+  #Set to undef if we ave empty array
+  $window_sizes = undef if (ref($window_sizes) eq 'ARRAY' && scalar(@$window_sizes) == 0);
+  #Attrs used in other (store) methods
+  $self->pack_template($pack_template);
+  $self->packed_size($packed_size);
+  $self->max_data_type_size($max_data_type_size);
+  $self->max_view_width($max_view_width);
+
+  #Other vars
+  $self->new_assembly($new_assm);
+
+  #Need to validate slice here
+
+  warn "temp hack for bin_method validation";
+  $bin_method = $self->validate_bin_method($bin_method);
+    
+  ### Set window_sizes
+  
+  if($self->new_assembly){
+	print "Assembly projection may cause problems for large Collections, defaulting to window_sizes = (0)\n";
+	#Then build the bins on the projected 0 level single ResultFeatures
+
+	#Test we haven't explicity set window_sizes to be soemthing else
+
+	if($window_sizes &&
+	   ! ( scalar(@$window_sizes) == 1 && $window_sizes[0] == 0)){
+
+	  throw("You have set window_sizes config which are not safe when projecting to a new assembly($new_assm), please omit window_sizes config or set to 0");
+
+	}
+	$window_sizes = $self->window_sizes([0]);
+  }
+  else{
+
+	if($window_sizes && $skip_zero_window && grep/^0$/,@$window_sizes){
+	  throw("You have specied skip_zero_window and window_size 0 in your config, please remove one of these");
+	}
+	elsif($window_sizes && ! grep/^0$/,@$window_sizes){
+	  $skip_zero_window = 1;
+	  unshift @$window_sizes, 0;#re-add 0 window as we need this to build the collections
+	}
+
+	$window_sizes = $self->window_sizes($window_sizes);
+  }
+  
+
+  #This is already done in the script
+  if($skip_zero_window && $new_assm){
+	throw("You cannot -skip_zero_window or omit 0 from -window_sizes when projecting to a new assembly($new_assm) which should only be generated using window_size=0");
+  }
+ 
+
+
+  ### Rollback previously stored features
+  
+  if($self->can('rollback_Features_by_Slice')){
+	$self->rollback_Features_by_Slice($slice);
+  }
+  else{
+	#This is currently the only warn output we can't get rid off
+	warn ref($self)." cannot rollback_Features_by_Slice. This may result in duplicate Collections being stored if there is pre-existing data";
+  }
+
+
+
+  ### PROCESS CHUNKS
+
+  #Not lightweight as we will be storing them
+  # Temporarily set the collection to be lightweight???
+  #my $old_value = $this->_lightweight();
+  #if   ( defined($lightweight) ) { $this->_lightweight($lightweight) }
+  #else                           { $this->_lightweight(1) }
+
+  my %chunk_windows = %{$self->_define_window_chunks($self->window_sizes, $self->max_view_width)};
+  my (%counts, $store_natural);
+  $store_natural = grep/^0/, @$window_sizes;
+  $counts{0}=0;#Set natural res count to 0
+  my $slice_end        = $slice->end;
+  my $orig_slice       = $slice;
+  my $orig_start       = $slice->start;
+  #my $slice_adj = $slice->start - 1;#Removed this as we are now generating features local to orig_slice
+  #start/end conversion will be done in write/store_collection
+  my $region           = $slice->coord_system_name;
+  my $version          = $slice->coord_system->version;
+  my $seq_region_name  = $slice->seq_region_name;
+  my $strand           = $slice->strand;
+  my $only_natural     = 0;
+  #my $slice_adj = 0;
+
+
+  #We need to account for only 0 here when doing projection
+  #The chunk window is set to max_view_widht in _define_chunk_windows
+
+  $only_natural = 1 if $store_natural && scalar(@$window_sizes) == 1;
+  $store_natural = 0 if $skip_zero_window;
+  #SHould really test these two, but should already be caught by now
+
+  #Set the initial  collection_start to orig_start
+  #Could default to 1, but we may not be starting from 1
+  #This is not the case for 0 wsize where it must always be 
+  #The first feature start
+
+
+  for my $wsize(@{$self->window_sizes}){
+
+	next if $wsize == 0;# && $skip_zero_window;#We never want to assume start of 0 window collection
+	$self->collection_start($wsize, $orig_start);
+  }
+
+
+ 
+  foreach my $chunk_length(sort keys %chunk_windows){
+
+	print "Processing windows ".join(', ', @{$chunk_windows{$chunk_length}}).
+	  " with chunk length $chunk_length\n";
+	map  $counts{$_} = 0, @{$chunk_windows{$chunk_length}};	#Set window counts to 0
+
+	#Now walk through slice using slice length chunks and build all windows in each chunk
+	my $in_slice     = 1;
+	my $start_adj    = 0;
+	my ($sub_end, $features, $bins);
+	my $sub_start    = 1;
+	my $slice_length = $slice->length;
+	
+
+	#Can we subslice and then exclusivly use bin_start(local to orig_slice)
+	#Then we never have to deal with sr coord until we store
+	#This should me we never have to do the sr conversion unless we 
+	#use a slice which doesn't start at 1(PAR or test)
+	#Always create in local coords for fetch
+	#Then change to seq_region coords for store if required
+
+
+	while($in_slice){
+	  #$sr_start       = $slice_start + $start_adj;
+	  $sub_start += $start_adj;
+
+	  #$slice_start = $sr_start;#Keep for next slice
+	  #$sr_end   = $sr_start + $chunk_length - 1;
+	  $sub_end   = $sub_start + $chunk_length - 1;
+
+	  #Last chunk might not be the correct window length
+	  #Hence why we should do this on whole chromosomes
+	  if($sub_end >= $slice_length){
+		#$sub_end = $slice_end;
+		#No longer set to slice end, as we don't want to corrupt the bin definition?
+		#Surplus bins are removed in store/write_collection in caller
+		#We could simply add the largest window the the end of the slice?
+		#Then we will only build the minimum of excess bins?
+		#This should be okay for bin calcs
+		#But may screw up bin trimming in caller as we currently expect $ub_end to be a valid bin end
+		#for all wsizes
+		#bin trimming should handle this, but this will corrupt the bin definition???
+		#bin definition is depedant on method
+		#So this method need to be agnostic
+		#And deal with the rest in descendant
+		$in_slice = 0;
+	  }
+	  
+	
+	  $slice = $slice->adaptor->fetch_by_region($region, $seq_region_name, ($sub_start + $orig_start -1), ($sub_end + $orig_start - 1), $strand, $version);
+	  #Can't subslice as this will not clip if we go over the length of the slice, unlike normal slice fetching
+	  #hence we cannot rely on this
+	  #$slice = $orig_slice->sub_Slice($sub_start, $sub_end, $orig_slice->strand);
+	  #warn "got sub slice $slice as $sub_start - $sub_end from ".$orig_slice->name;
+
+	  
+	  ### Grab features and shift chunk coords
+	  #features may already be a 0 wsize collection if we have projected from an old assembly
+	  #Could move this check to get_Features_by_Slice?
+	  
+	  #e.g. [ $features, \%config ]
+	  $features = $self->get_Features_by_Slice($slice);
+	  #next if scalar(@$features) == 0;#We want to store values for all windows
+
+	  if( (@$features) &&
+		  (ref($features->[0]) =~ /Bio::EnsEMBL::Funcgen::Collection/) ){#Change to isa 'Bio::EnsEMBL::Collection
+		
+		#Check that the returned feature/collections support window_size
+
+		if($features->[0]->can('window_size')){
+		  
+		  if($features->[0]->window_size != 0){
+			throw("You are trying to generated Collections from a non-zero window sized Collection:\t".$features->[1]->{'window_size'});
+		  }
+		  
+		  #This should never happen
+		  if(! $skip_zero_window){ 
+			throw('You have retrieved data from a Collection which without using -skip_zero_window i.e. you are trying to generate overwrite the data you are generating the Collections from');
+		  }
+		}
+		else{
+		  throw('Something si wrong, the Collection you have retrieved does not support the method window_size');
+		}
+	  }
+
+
+
+	  #Set collection start here for 0 window_size
+	  if(@$features && $store_natural && ! defined $self->collection_start(0)){
+		$self->collection_start(0, ($features->[0]->start + $sub_start));
+	  }
+
+	  
+
+	  $start_adj = $chunk_length if($in_slice);
+	  	  
+	
+	  
+	  #This should return a hash of window size => bin array pairs
+	  if(! $only_natural){
+		
+		$bins = $self->_bin_features_by_window_sizes(
+													 -slice         => $slice,
+													 -window_sizes  => $chunk_windows{$chunk_length},
+													 -bin_method        => $bin_method,
+													 -features      => $features,
+													);
+
+
+		
+
+	  }
+
+	  #my $bin_start = $sr_start + $slice_adj;#This was only required for storing individual bins
+	  #Could calc bin_start + slice_adjust ahere for all features
+	  #Doing this will break old code for single window collections
+	  
+	  #This is sr start and should be local to orig_slice!
+	  
+
+
+
+	  #We need to handle strandedness of slice!?	  
+	  
+	  #Store all normal features in result_feature
+
+
+	  
+	  if($store_natural){
+		
+		foreach my $feature(@$features){
+		  $counts{0}++;
+		  #warn "storing ".join(', ', ($feature->start, $feature->end, $feature->strand, $feature->scores->[0]));
+		  
+
+		  #Should we handle bin trimming here for overhanging slices
+		  #Then counts wil be correct and wont have to do in caller
+
+		  #We could stop here if the feature seq_region start > orig_slice end
+		  #Current done in write/store_collection
+		  #This may mean working in seq_region values rather than slice values
+
+
+		  #write_collection is implemented in descendant e.g. Bio::EnsEMBL::Funcgen::Collector::ResultFeature
+		  #as wrapper to adaptor store method or print to file
+		  
+		  #These params need to be generated in a way defined by the descendant
+		  #
+		  
+		  if($bin_model eq 'SIMPLE'){
+			#We need to pass the slice with this so we can sub slice when storing
+			#the collection and set the start/end to 1 and length of slice
+			#we still need to store the first start to be able to sub slice correctly
+			
+			$self->collection_start(0, ($feature->start + $sub_start));
+
+			#Need to pass strand for 0 resolution
+			$self->write_collection(0,
+									$orig_slice,
+									#These are now wrt orig_slice
+									#($feature->start + $sub_start),
+									($feature->end   + $sub_start),
+									$feature->strand,
+									$feature->scores,
+									);
+
+			#We can have problems here if the original score type
+			#does not match the collected score type
+			#For max magnitude this is not an issue
+			#as we take the larget value from the bin
+			#But for other methods this may not be true
+			#e.g. count
+			#Hence, if we want to preserve the 0 window
+			#We must account for this in the feature collector
+			#e.g. set_collection_defs_by_ResultSet_window_size?
+			#Just omit 0 window for reads
+
+		  }
+		}
+		
+		print "Window size 0 (natural resolution) has ".scalar(@{$features})." feature bins for:\t".$slice->name."\n";	
+	  }
+
+	  #Now store bins
+	  #	  my ($bin_end, $bin_scores);
+	  my $num_bins;
+
+	  foreach my $wsize(sort keys %{$bins}){
+		$num_bins = scalar(@{$bins->{$wsize}});
+		#warn "$num_bins bin scores for $wsize:\t".join(',', @{$bins->{$wsize}});
+
+		#Should we handle bin trimming here for overhanging slices
+		#Then counts wil be correct and wont have to do in caller
+
+
+		$counts{$wsize}+= $num_bins;	
+		
+	   
+
+		#We don't need this loop for collections as we can simply push all the scores at once
+		#Just use the slice start and end
+		if($bin_model eq 'SIMPLE'){
+
+		  $self->write_collection($wsize,
+								  $orig_slice,
+								  #$sub_start,
+								  $sub_end,
+								  $orig_slice->strand,#This is most likely 1!
+								  #Override this woth 0 in descendant Collector if required.
+								  $bins->{$wsize},
+								 );
+
+		}
+		else{
+		  throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models');
+		  #i.e. More than one score
+		}
+	  
+
+		
+#		#Reset start and end for new wsize
+#		$bin_start = $slice->start;
+#		$bin_end   = $slice->start;
+#
+#
+#
+#		#We don't need this loop for collections as we can simply push all the scores at once
+#		
+#
+#		foreach my $bin_index(0..$#{$bins->{$wsize}}){
+#
+#
+#
+#		  #default method to handle simple fixed width bin?
+#		  #bin_end need to be defined dependant on the bin type
+#		  #($bin_start) = $self->process_default_bin($bins->{$wsize}->[$bin_index], $wsize);#?
+#
+#		  
+#
+#		  #either define default bin method in descendant
+#		  #Or can we set a process_bin_method var?
+#		  #No just pass all this info to write collection and handle it there?
+#		  
+#		  #Can we have just predefined rotueines handling different bin types? 
+#		  #Simple
+#		  #Simple compressed
+#		  #Clipped
+#		  #This will prevent hanving to make attrs/method for storing persistent start/end/score info
+#		  
+#
+#
+#		  #Need validate bin_type method
+#		  #Could convert these to numbers for speed as with binning methods
+#
+#		  if($bin_model eq 'SIMPLE'){
+#
+#			$bin_scores = $bins->{$wsize}->[$bin_index];
+#
+#			warn "bin scores is $bin_scores";
+#
+#			
+#			#next if ! $bin_score;#No we're no inc'ing the start ends for bins with no scores
+#
+#			$bin_end += $wsize;
+#			
+#			#if($bin_score){#Removed this as we always want to write the score even if it is 0
+#			  
+#			  #This is a little backwards as we are generating the object to store it
+#			  #If we are aiming for speed the maybe we could also commodotise the store method
+#			  #store by args arrays? store_fast?
+#			  #Speed not essential for storing!
+#			  
+#			  #Note: list ref passed
+#			  			
+#			  #Don't need to pass all this info for fixed width blob collections
+#			  #Need to write some default handlers depedant on the collection type
+#			  #Simple(original)
+#			  #Simple compressed
+#			  #Multi compressed
+#			  #Clipped uncompressed?
+#
+#		
+#			  $self->write_collection($wsize,
+#									  $orig_slice,
+#									  ($bin_start + $slice_adj), 
+#									  ($bin_end   + $slice_adj),
+#									  $orig_slice->strand,#This is most likely 0
+#									  $bin_scores,
+#									 );
+#
+#			  #Only count if we have a stored(projected?) feature
+#			$count++;#Change this to attr/method?
+#			#}
+#			
+#			$bin_start += $wsize;
+#		  }
+#		  else{
+#			throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models');
+		#	  }
+		#	}
+		
+		#warn "Window size $wsize has ".scalar(@{$bins->{$wsize}})." bins";
+		#$counts{$wsize}+= $count;	
+	  }
+	}
+
+	$store_natural = 0;	#Turn off storing of natural resolution for next chunk length sets
+  }
+  
+  #Now need to write last collections for each wsize
+  
+  foreach my $wsize(@{$self->window_sizes}){
+
+	next if $wsize == 0 && ! $store_natural;
+	next if $wsize != 0 && $only_natural;
+
+	print "Writing final $wsize window_size collection, this may result in slightly different bin numbers from counts due to removing overhanging bins past end of slice\n";
+
+	$self->write_collection($wsize, $orig_slice);#store last collection
+  }
+
+
+  #Print some counts here 
+  foreach my $wsize(sort (keys %counts)){
+	print "Generated ".$counts{$wsize}." bins for window size $wsize for ".$orig_slice->name."\n";
+	#Some may have failed to store if we are projecting to a new assembly
+	#Need collection count here too, but would need methods for this?
+  }
+
+  #Return this counts hash so we can print/log from the caller, hence we don't print in here?
+  
+  return;
+}
+
+
+
+=head2 _bin_features_by_window_sizes
+
+  Args[0]    : Bio::EnsEMBL::Slice
+  Args[1]    : ARRAYREF of window sizes
+  Args[2]    : int - bin method, currently defined by validate_bin_methods
+  Args[3]    : ARRAYREF of Bio::EnsEMBL::Features
+  Example    : $bins = $self->_bin_features_by_window_sizes(
+													 -slice         => $slice,
+													 -window_sizes  => $chunk_windows{$chunk_length},
+													 -bin_method    => $bin_method,
+													 -features      => $features,
+													);
+  Description: Bins feature scores for a given list of window sizes and predefined method number
+  Returntype : HASHREF of scores per bin per window size
+  Exceptions : Throws if bin method not supported
+  Caller     : store_window_bins_by_Slice
+  Status     : At Risk
+
+=cut
+
+
+#To do
+# 1 Remove Bio::EnsEMBL::Feature dependancy? Or just create Features for non adaptor Collectors.
+#   Is there a way we can skip the object generation in the adaptor completely and just 
+#   pass the values we need?
+# 2 Separate methods, so we can define custom methods in descendants?
+# 3 Expand %bins model to optionally be one of
+#   the following dependant on binning method
+#   Simple:  fixed width containing arrays of scores for each window
+#   Multi:   fixed width containing multiple arrays of scores for each window
+#   Non-simple?: Separate aggregated features, either fixed width or not, not BLOB!
+#   Clipped: default fixed width with option to clip start and end.  Needs start/end attrs
+#         Can't store this in a blob due to non-standard start ends?
+#         Most likely want more than one score here? Count/Density SNPs?
+#         Removes data skew from standard window bins, would need to store each bin and post
+#         process. Or do in line to avoid 2nd post-processing loop,requires awareness of when 
+#         we have moved to a new bin between features.  This holds for overlapping and 
+#         non-overlapping features. Once we have observed a gap we need to clip the end of the
+#         last bin and clip the start of the new bin. This requires knowing the greatest end 
+#         values from the last bin's feature. what if two overlapping features had the same 
+#         start and different end, would we see the longest last? Check default slice_fetch sort
+
+sub _bin_features_by_window_sizes{
+  my $this = shift;
+  my ( $slice, $window_sizes, $method, $features ) =
+    rearrange( [ 'SLICE', 'WINDOW_SIZES', 'BIN_METHOD', 'FEATURES' ], @_ );
+
+  
+  #Do this conditional on the Collection type
+  #i.e. is collection seq_region blob then no else yes
+  #if ( !defined($features) || !@{$features} ) { return {} }
+
+  #warn 'Processing '.scalar(@$features).' features for window sizes '.join(', ',@$window_sizes).' for slice '.$slice->name."\n";	 
+  
+  #Set up some hashes to store data by window_size
+  my (%bins, %nbins, %bin_counts);
+  my $slice_start = $slice->start();
+
+   #Default handlers for
+  #my($first_bin);
+  #if ( $method == 0 ||    # 'count' or 'density'
+  #     $method == 3 ||    # 'fractional_count' or 'weight'
+  #     $method == 4       # 'coverage'
+  #  ){
+  #  # For binning methods where each bin contain numerical values.
+  #	$first_bin = 0;
+  #  } 
+  #  else {
+  #	# For binning methods where each bin does not contain numerical
+  #   # values.
+  #
+  #	#Remove this
+  #	$first_bin = undef;
+  #  }
+
+
+  #Set up some bin data for the windows
+  my $slice_length = $slice->length;
+
+  foreach my $wsize (@$window_sizes) {
+	#TO DO: Need to modify this block if default 0's are undesirable for collection type
+	#i.e. should it be undef instead? May have prolbems representing undef in blob
+
+	$nbins{$wsize}         = int($slice_length / $wsize); #int rounds down
+	#nbins is actually the index of the bin not the 'number'
+	#Unless slice_Length is a multiple!
+	$nbins{$wsize}-- if(! ($slice_length % $wsize));
+
+	#Create default bins with 0
+	@{$bins{$wsize}} = ();
+	map {$bins{$wsize}->[$_] = 0} (0 .. $nbins{$wsize});		
+	
+	#Set bin counts to 0 for each bin
+	@{$bin_counts{$wsize}}    = ();
+
+	#This is adding an undef to the start of the array!?
+    map { $bin_counts{$wsize}->[($_)] = 0 } @{$bins{$wsize}};
+
+	foreach my $bin(@{$bins{$wsize}}){
+	  $bin_counts{$wsize}->[$bin] = 0;
+	}	
+  }
+
+  #warn "bin_counts are :\n".Data::Dumper::Dumper(\%bin_counts);
+  #This fails for slices which are smaller than the chunk length; 
+  my $feature_index = 0;
+  my ($bin_index, @bin_masks);
+ 
+  foreach my $feature ( @{$features} ) {
+	#Set up the bins for each window size
+
+	#Omit test for Bio::EnsEMBL::Feature here for speed
+	#Only needs start/end methods
+
+	foreach my $wsize (@$window_sizes) {
+	
+	  #We have already highjacked the object creation by here
+	  #This is done in core BaseFeatureAdaptor
+	  #We probably don't want to do this for ResultFeatures as we don't use the
+	  #standard feature implementation
+	  #we already use an array and we don't store the slice
+	  #as this is already known by the caller
+	  #and we always build on top level so we don't need to remap
+
+	  #We do however need the slice to store, as we only store local starts when generating
+	  #We need a store by Slice method?
+	  #This will remove the need to inherit from Feature.
+	  #These will need to be regenerated everytime we import a new build
+	  #As we do with the probe_features themselves
+	  #This also mean the result_feature status has to be associated with a coord_system_id
+	  
+	  #Which bins do the start and end lie in for this feature?
+	  #Already dealing with local starts, so no slice subtraction
+	  #Could wrap these start/end methods via the descendant Collector
+	  #to remove the Feature dependancy? Or just create Features when parsing in the caller
+	  my $start_bin =  int(($feature->start ) / $wsize);
+	  my $end_bin   =  int(($feature->end) / $wsize );
+   	  $end_bin = $nbins{$wsize} if $end_bin > $nbins{$wsize};
+
+
+
+	  #Slightly obfuscated code to match method number(faster)
+	  #by avoiding string comparisons.
+	  #Could call methods directly using coderef set in validate_bin_method
+	  #Accessor may slow things down, but should be uniform for all methods
+	  #rather than being dependant on position in if/else block below
+
+	  #reserve 0 for descendant defined method?
+	  #There fore always fastest in this block, or use coderefs?
+	  if ( $method == 0 ) {
+		# ----------------------------------------------------------------
+		# For 'count' and 'density'.
+		
+		for ( $bin_index = $start_bin ;
+			  $bin_index <= $end_bin ;
+			  ++$bin_index ) {
+
+		  $bins{$wsize}->[$bin_index]++;
+
+		  #warn "setting $wsize bin $bin_index to ". $bins{$wsize}->[$bin_index];
+
+		}
+	  }
+
+=pod
+	
+	  } elsif ( $method == 1 ) {
+		# ----------------------------------------------------------------
+		# For 'indices' and 'index'
+
+	  
+		#How is this useful?
+		#Is this not just count per bin?
+		#No this is a list of the feature indices
+		#So forms a distribution?
+
+		throw('Not implemented for method for index'); 
+
+		for ( my $bin_index = $start_bin ;
+			  $bin_index <= $end_bin ;
+			  ++$bin_index ) {
+		  push( @{ $bins[$bin_index] }, $feature_index );
+		}
+
+		++$feature_index;
+
+	  } elsif ( $method == 2 ) {
+		# ----------------------------------------------------------------
+		# For 'features' and 'feature'.
+
+		throw('Not implemented for method for feature/features'); 
+		
+		for ( my $bin_index = $start_bin ;
+			  $bin_index <= $end_bin ;
+			  ++$bin_index ) {
+		  push( @{ $bins[$bin_index] }, $feature );
+		}
+		
+	  } elsif ( $method == 3 ) {
+		# ----------------------------------------------------------------
+		# For 'fractional_count' and 'weight'.
+		
+
+		throw('Not implemented for method for fractional_count/weight'); 
+		
+		if ( $start_bin == $end_bin ) {
+		  ++$bins[$start_bin];
+		} else {
+		  
+		  my $feature_length =
+			$feature->[FEATURE_END] - $feature->[FEATURE_START] + 1;
+		  
+		  # The first bin...
+		  $bins[$start_bin] +=
+			( ( $start_bin + 1 )*$bin_length -
+			  ( $feature->[FEATURE_START] - $slice_start ) )/
+				$feature_length;
+
+		  # The intermediate bins (if there are any)...
+		  for ( my $bin_index = $start_bin + 1 ;
+				$bin_index <= $end_bin - 1 ;
+				++$bin_index ) {
+			$bins[$bin_index] += $bin_length/$feature_length;
+		  }
+		  
+		  # The last bin...
+		  $bins[$end_bin] +=
+			( ( $feature->[FEATURE_END] - $slice_start ) -
+			  $end_bin*$bin_length +
+			  1 )/$feature_length;
+		  
+		}						## end else [ if ( $start_bin == $end_bin)
+
+	  } 
+	  elsif ( $method == 4 ) {
+		# ----------------------------------------------------------------
+		# For 'coverage'.
+		
+		#What exactly is this doing?
+		#This is coverage of bin
+		#Rather than coverage of feature as in fractional_count
+	  
+		
+	#	my $feature_start = $feature->[FEATURE_START] - $slice_start;
+	#	my $feature_end   = $feature->[FEATURE_END] - $slice_start;
+	#	
+  	#	if ( !defined( $bin_masks[$start_bin] )
+	# 		 || ( defined( $bin_masks[$start_bin] )
+	#	 		  && $bin_masks[$start_bin] != 1 ) ) {
+ 	#	  # Mask the $start_bin from the start of the feature to the end
+ 	#	  # of the bin, or to the end of the feature (whichever occurs
+	#	  # first).
+ 	#	  my $bin_start = int( $start_bin*$bin_length );
+ 	#	  my $bin_end = int( ( $start_bin + 1 )*$bin_length - 1 ); 
+	#	  for ( my $pos = $feature_start;
+ 	#			$pos <= $bin_end && $pos <= $feature_end ;
+ 	#			++$pos ) {
+ 	#		$bin_masks[$start_bin][ $pos - $bin_start ] = 1;
+ 	#	  }
+ 	#	}
+ 	#	
+ 	#	for ( my $bin_index = $start_bin + 1 ;
+ 	#		  $bin_index <= $end_bin - 1 ;
+ 	#		  ++$bin_index ) {
+ 	#	  # Mark the middle bins between $start_bin and $end_bin as fully
+ 	#	  # masked out.
+ 	#	  $bin_masks[$bin_index] = 1;
+ 	#	}
+ 	#	
+ 	#	if ( $end_bin != $start_bin ) {
+ 	#	
+ 	#	  if ( !defined( $bin_masks[$end_bin] )
+ 	#		   || ( defined( $bin_masks[$end_bin] )
+ 	#				&& $bin_masks[$end_bin] != 1 ) ) {
+ 	#		# Mask the $end_bin from the start of the bin to the end of
+ 	#		# the feature, or to the end of the bin (whichever occurs
+ 	#		# first).
+ 	#		my $bin_start = int( $end_bin*$bin_length );
+ 	#		my $bin_end = int( ( $end_bin + 1 )*$bin_length - 1 );
+ 	#		for ( my $pos = $bin_start ;
+ 	#			  $pos <= $feature_end && $pos <= $bin_end ;
+ 	#			  ++$pos ) {
+ 	#		  $bin_masks[$end_bin][ $pos - $bin_start ] = 1;
+ 	#		}
+ 	#	  }
+  	#	}
+  	 # }							## end elsif ( $method == 4 )
+
+=cut
+
+	  
+	  elsif ( $method == 5 ) {
+		#$self->$method($bin_index, $start_bin, $end_bin, $wsize, \%bins, \%bin_counts);
+
+
+		#average score
+		#This is simple an average of all the scores for features which overlap this bin
+		#No weighting with respect to the bin or the feature
+		
+		for ( $bin_index = $start_bin ;
+			  $bin_index <= $end_bin ;
+			  ++$bin_index ) {
+
+		  #we should really push onto array here so we can have median or mean.
+		  $bins{$wsize}->[$bin_index] += $this->get_score_by_Feature($feature);
+		  $bin_counts{$wsize}->[$bin_index]++;
+		}
+	  }  
+	  elsif( $method == 6){
+		#Max magnitude
+		#Take the highest value +ve or -ve score
+		for ( $bin_index = $start_bin ;
+			  $bin_index <= $end_bin ;
+			  ++$bin_index ) {
+
+		  #we really need to capture the lowest -ve and higest +ve scores here and post process
+		  #To pick between them
+		  
+		  my $score = $this->get_score_by_Feature($feature);
+		  #Write score method as wrapper to scores?
+
+		  $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve
+		  
+
+		  #warn "Comparing wsize $wsize bin $bin_index score $score to ".  $bins{$wsize}->[$bin_index]->[0].' '.$bins{$wsize}->[$bin_index]->[1]."\n";
+
+		  if($score <  $bins{$wsize}->[$bin_index]->[0]){
+			#warn "setting -ve bin to $score\n";
+			$bins{$wsize}->[$bin_index]->[0] = $score;
+		  }
+		  elsif($score > $bins{$wsize}->[$bin_index][1]){
+			#warn "setting +ve bin to $score\n";
+			$bins{$wsize}->[$bin_index]->[1] = $score;
+		  }
+		}
+	  }
+	  else {
+		throw("Only accomodates average score method");
+	  }
+	
+	
+	}
+				
+  }	## end foreach my $feature ( @{$features...
+
+
+  #Now do post processing of bins
+
+=pod
+
+  if ( $method == 4 ) {
+
+    # ------------------------------------------------------------------
+    # For the 'coverage' method: Finish up by going through @bin_masks
+    # and sum up the arrays.
+
+    for ( my $bin_index = 0 ; $bin_index < $nbins ; ++$bin_index ) {
+       if ( defined( $bin_masks[$bin_index] ) ) {
+        if ( !ref( $bin_masks[$bin_index] ) ) {
+          $bins[$bin_index] = 1;
+        } else {
+          $bins[$bin_index] =
+            scalar( grep ( defined($_), @{ $bin_masks[$bin_index] } ) )/
+            $bin_length;
+        }
+      }
+    }
+  }
+
+=cut
+
+  if( $method == 5){
+	#For average score, need to divide bins by bin_counts
+
+	foreach my $wsize(keys %bins){
+
+	  foreach my $bin_index(0..$#{$bins{$wsize}}){
+
+		if($bin_counts{$wsize}->[$bin_index]){
+		  $bins{$wsize}->[$bin_index] /= $bin_counts{$wsize}->[$bin_index];
+		}
+		#warn "bin_index $wsize:$bin_index has score ".$bins{$wsize}->[$bin_index];
+	  }
+	}
+  }
+  elsif( $method == 6){
+	#Max magnitude
+	#Take the highest value +ve or -ve score
+
+	foreach my $wsize(keys %bins){
+
+	  foreach my $bin_index(0..$#{$bins{$wsize}}){
+
+		#So we have the potential that we have no listref in a given bin
+
+		#default value if we haven't seen anything is 0
+		#we actually want an array of -ve +ve values
+
+		#warn "Are we storing 0 values for absent data?";
+		#Not for max_magnitude, but maybe for others?
+
+		if($bins{$wsize}->[$bin_index]){
+		  #warn $wsize.':'.$bin_index.':'.$bins{$wsize}->[$bin_index]->[0].'-'.$bins{$wsize}->[$bin_index]->[1];
+		  my $tmp_minus = $bins{$wsize}->[$bin_index]->[0] * -1;
+		  
+		  if($tmp_minus > $bins{$wsize}->[$bin_index]->[1]){
+			$bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[0];
+		  }
+		  else{
+			$bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[1];
+		  }
+
+		  #warn "bin $bin_index now ".	$bins{$wsize}->[$bin_index];
+		}
+	  }
+	}
+  }
+  elsif($method != 0){#Do no post processing for count(0)
+	throw('Collector currently only accomodates average_score, count and max magnitude methods');
+  }
+
+
+  #Could return bin_counts too summary reporting in zmenu
+  #Could also do counting of specific type
+
+  #warn "returning bins ".Data::Dumper::Dumper(\%bins);
+
+  return \%bins;
+} ## end sub _bin_features
+
+
+=pod
+
+#These could potentially be used as code refs to avoid having the if else block
+#This way we can also define new methods in the descendant Collector?
+#Would have to have pass args and refs to bin hashes
+#This would slow things down over direct access here
+#But speed is no longer that critical as we do not use the Collector for display
+#purposes, only to build the Collections which are then used for display directly.
+
+sub calculate_average_score{
+  my $self = shift;
+
+  if ( $method == 5 ) {
+		#average score
+		#This is simple an average of all the scores for features which overlap this bin
+		#No weighting with respect to the bin or the feature
+		
+		for ( my $bin_index = $start_bin ;
+			  $bin_index <= $end_bin ;
+			  ++$bin_index ) {
+
+		  #we should really push onto array here so we can have median or mean.
+		  $bins{$wsize}->[$bin_index] += $feature->score;
+		  $bin_counts{$wsize}->[$bin_index]++;
+		}
+	  }  
+
+
+
+}
+	
+
+sub post_process_average_score{
+
+}
+
+
+sub calculate_max_magnitude{
+  my $self = shift;
+
+  #Max magnitude
+  #Take the highest value +ve or -ve score
+  for ( my $bin_index = $start_bin ;
+		$bin_index <= $end_bin ;
+		++$bin_index ) {
+	
+	#we really need to capture the lowest -ve and higest +ve scores here and post process
+	#To pick between them
+	
+	my $score = $feature->score;
+	$bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve
+	
+	if($score <  $bins{$wsize}->[$bin_index]->[0]){
+	  $bins{$wsize}->[$bin_index]->[0] = $score;
+	}
+	elsif($score > $bins{$wsize}->[$bin_index][1]){
+	  $bins{$wsize}->[$bin_index]->[1] = $score;
+	}
+  }
+}
+
+
+sub post_process_max_magnitude{
+
+}
+
+=cut
+
+#separated to allow addition of non-standard methods
+#Could potentially add these in new
+#and put this back in _bin_features
+
+
+sub validate_bin_method{
+  my ($self, $method) = @_;
+
+
+  #change this to set the coderefs
+  #Just set anonymous sub to immediately return for non post processed methods
+  #No need for coderef, just set the method name?
+
+  #if(! $self->can('calculate_'.$method)){
+  #throw("$method method does not have a valid calculate_${method} method");
+  #}
+
+  #if($self->can('post_process_'.$method)){
+  ##set post process flag?
+  #or simply do this can in line in the _bin_features sub?
+  #}
+  
+
+
+
+  #Add average_score to avoid changing Collection.pm
+  my $class = ref($self);
+  ${$class::VALID_BINNING_METHODS}{'average_score'} = 5;
+  ${$class::VALID_BINNING_METHODS}{'max_magnitude'} = 6;
+  ${$class::VALID_BINNING_METHODS}{'count'} = 0;
+  
+
+
+    #foreach my $method_name(keys %{$class::VALID_BINNING_METHODS}){
+#	warn "valid method is $method name";
+#  }
+
+
+  if ( ! exists( ${$class::VALID_BINNING_METHODS}{$method} ) ) {
+    throw(
+		  sprintf(
+				  "Invalid binning method '%s', valid methods are:\n\t%s\n",
+				  $method,
+				  join( "\n\t", sort( keys(%{$class::VALID_BINNING_METHODS}) ) ) ) );
+  }
+  else{
+	#warn "found valid method $method with index ".${$class::VALID_BINNING_METHODS}{$method};
+  }
+  
+  return ${$class::VALID_BINNING_METHODS}{$method};
+}
+
+
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children