ensembl: variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector.pm comparison

comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector.pm @ 0:1f6dce3d34e0

Uploaded

author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1f6dce3d34e0
+# $Id: Collector.pm,v 1.7 2011/01/10 11:27:34 nj1 Exp $
+=head1 LICENSE
+Copyright (c) 1999-2011 The European Bioinformatics Institute and
+Genome Research Limited.  All rights reserved.
+This software is distributed under a modified Apache license.
+For license details, please see
+http://www.ensembl.org/info/about/code_licence.html
+=head1 CONTACT
+Please email comments or questions to the public Ensembl
+developers list at <ensembl-dev@ebi.ac.uk>.
+Questions may also be sent to the Ensembl help desk at
+<helpdesk@ensembl.org>.
+=cut
+#Your Bio::Ensembl::Collection::Feature defs module should inherit from here
+#This could be a local defs file which you have created and require'd into your script
+#If your collections defs module refers to a Bio::EnsEMBL::Feature,
+#then its adaptor should inherit from the collections defs module
+package Bio::EnsEMBL::Funcgen::Collector;
+#Move this to Bio::EnsEMBL::Utils::Collector for 59?
+use strict;
+use warnings;
+use Bio::EnsEMBL::Utils::Argument  ('rearrange');
+use Bio::EnsEMBL::Utils::Exception ('throw');
+use Bio::EnsEMBL::Funcgen::ResultFeature;
+#use base('Bio::EnsEMBL::Collection');#ISA
+our ($pack_template, $packed_size, @window_sizes); #These get set in the FeatureAdaptor
+#Make these constants and remove setter functionality in methods?
+#Only really important for pack template and windows, maybe these if we are going to start
+our $max_data_type_size = 16777216; #Default is 16MB for long blob
+#we need to deduct the size of the rest of the record here!
+#For a 2byte packet the smallest window size possible is:
+#(slice->length/(16777216/2)
+#so int(bin_size)+1
+#Obviously have to use the largest slice here, for human chr1:
+#249,250,621/(16777216/2) = 29.7???
+#We may need to up this slightly to account for larger chrs?
+#Implications on memory usage? Is it 4 times for blob manipulation?
+#Does substr require this manipulation?
+#This max_allowed_packet_size does not seem to translate directly to the size of the
+#data being stored e.g. quite a bit more is needed.  ISG haven't got to the bottom of this yet.
+#But have simply upped the config to 67108864 to handle the largest human chr.
+our $max_view_width     = 500000;#Max width in Region In Detail;
+#our %VALID_BINNING_METHODS
+#Remove this in favour of can->('calculate_.$method) and coderefs?
+#To do
+# 1 DONE Merge in Collection code, (no need to do this, removed inheritance)
+# 2 Write simple BED input to flat file output.
+# 3 Separate store method so we can simply get, then wrap store around this
+# 4 Test get method with slice adjusts
+# 5 separate set_config?
+# 6 optimise generate_bin_chunks to handle just one window size for display?
+# 7 Handle packed_size pack_template as methods  constants
+# 8 Provide override method in basefeature adaptor which will use package constant in feature adaptor
+# This is because these are really adaptor config, the collector only needs to know the
+# packed_size, and in the absence of an feature adaptor also provides the default methods for both.
+# If we substr in the API then we need to set sensible limits on blob size, otherwise we will have to unpack a lot of data
+# to get at the slice we want.
+# OR
+# Change adaptor to substr in DB based on known blob ranges/window size
+# and stitch together any which cross boundaries. This depends on speed of substr at end of large blob TEST!
+# Load with current code first and test this before making either change!
+# Delete empty (non-0) collections? i.e. For seq_regions which do not have any source features.
+#
+# 9 Handle PAR/HAP regions using fetch_normalised_slice_projections This has to be done in the feature adaptor! Then restrict  to non_dup regions in calling script
+=head2 new
+Args       : None
+Example    : my $collector = Bio::EnsEMBL::(Funcgen|Compara|Variation::)Collector::FEATURE->new;
+$collector->store_windows_by_Slice($slice);
+Description: Simple new method to enable use of collector when not inherited by
+a descendant of Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor
+Returntype : Bio::EnsEMBL::Funcgen::Collector
+Exceptions : None
+Caller     : Collector script
+Status     : At Risk
+=cut
+# new
+#   Arg [1]    : String - the invocant; the class the new object is blessed into
+#   Description: Minimal constructor, for use when the Collector is not inherited
+#                by a feature adaptor.
+#   Returntype : Object blessed into Arg [1], backed by an empty hash
+#   Exceptions : None
+sub new {
+  my ($class) = @_;
+
+  # Deliberately sets no attributes: this class will not be first in ISA for
+  # feature adaptors, so this constructor is not guaranteed to be called.
+  my %attrs = ();
+
+  return bless \%attrs, $class;
+}
+#Setter/Getter methods if we don't have a dedicated Collector module
+#to set the package variables in? Also to allow overriding of defaults.
+#This can be used by the write_collection method
+#to determine when to build and store a compressed collection
+#Effectively the max size of the data type you are using to store
+#a compressed score. defaults to max for long blob 16MB
+#Generic method, but only ever called by write_collection in descendant
+# new_assembly
+#   Arg [1]    : (optional) new assembly value to cache; ignored when false
+#   Description: Getter/setter for the 'new_assembly' attribute.
+#   Returntype : The cached value, or undef if it has never been set
+#   Exceptions : None
+sub new_assembly {
+  my ($self, $new_ass) = @_;
+
+  # NOTE: the value is not validated here (outstanding TODO in the original code).
+  $self->{'new_assembly'} = $new_ass if $new_ass;
+
+  return $self->{'new_assembly'};
+}
+# max_data_type_size
+#   Arg [1]    : (optional) Int - maximum size in bytes; must be a positive whole number
+#   Description: Getter/setter for the 'max_data_type_size' attribute. When nothing
+#                has been set, the package default
+#                $Bio::EnsEMBL::Funcgen::Collector::max_data_type_size is cached
+#                and returned.
+#   Returntype : The cached size
+#   Exceptions : Throws if Arg [1] is true but not a positive whole number
+sub max_data_type_size {
+  my ($self, $size) = @_;
+
+  if ($size) {
+    # int() would silently accept values such as 1.5, '12abc' or a reference,
+    # so match the digits explicitly.
+    if ($size !~ /\A[0-9]+\z/ || $size == 0) {
+      throw("max_data_type_size must be a integer of bytes, not $size");
+    }
+
+    $self->{'max_data_type_size'} = $size;
+  }
+  elsif (! defined $self->{'max_data_type_size'}) {
+    # Default set at head of this module
+    $self->{'max_data_type_size'} = $Bio::EnsEMBL::Funcgen::Collector::max_data_type_size;
+  }
+
+  return $self->{'max_data_type_size'};
+}
+# max_view_width
+#   Arg [1]    : (optional) Int - maximum view width; must be a positive whole number
+#   Description: Getter/setter for the 'max_view_width' attribute. When nothing has
+#                been set, the package default
+#                $Bio::EnsEMBL::Funcgen::Collector::max_view_width is cached and
+#                returned.
+#   Returntype : The cached width
+#   Exceptions : Throws if Arg [1] is true but not a positive whole number
+sub max_view_width {
+  my ($self, $size) = @_;
+
+  if ($size) {
+    # int() would silently accept values such as 1.5, '12abc' or a reference,
+    # so match the digits explicitly.
+    if ($size !~ /\A[0-9]+\z/ || $size == 0) {
+      throw("max_view_width must be a integer, not $size");
+    }
+
+    $self->{'max_view_width'} = $size;
+  }
+  elsif (! defined $self->{'max_view_width'}) {
+    # Default set at head of this module
+    $self->{'max_view_width'} = $Bio::EnsEMBL::Funcgen::Collector::max_view_width;
+  }
+
+  return $self->{'max_view_width'};
+}
+# bins_per_record
+#   Args       : None
+#   Description: Number of whole packed bins that fit into one record, i.e.
+#                max_data_type_size divided by packed_size, rounded down.
+#                Calculated on every call as either value may be redefined.
+#   Returntype : Int
+#   Exceptions : Throws if packed_size evaluates to zero
+sub bins_per_record {
+  # No prototype: prototypes are ignored for method calls and an empty one
+  # wrongly advertises a sub which takes no arguments.
+  my ($self) = @_;
+
+  my $max_size    = $self->max_data_type_size;
+  my $packed_size = $self->packed_size;
+
+  if (! $packed_size) {
+    throw('Cannot calculate bins_per_record with a packed_size of 0');
+  }
+
+  return int($max_size / $packed_size);
+}
+#The defaults for these should be defined in the feature/format specific Collector descendant
+#either by specifying the package variables or using config attrs to set methods?
+#general config should be parsed here.
+#rename bin_method?
+# bin_method
+#   Arg [1]    : (optional) String - name of the binning method to cache
+#   Description: Getter/setter for the 'bin_method' attribute. When nothing is
+#                passed and nothing is cached, the package variable
+#                $Bio::EnsEMBL::Funcgen::Collector::bin_method is used.
+#                The method name itself is not validated here.
+#   Returntype : The cached bin method
+#   Exceptions : Throws if no method is passed, none is cached and the package
+#                variable is undefined
+sub bin_method {
+  my ($self, $method) = @_;
+
+  # Explicit value always wins
+  return $self->{'bin_method'} = $method if $method;
+
+  # Already cached
+  return $self->{'bin_method'} if $self->{'bin_method'};
+
+  # Fall back to the package level default
+  my $default = $Bio::EnsEMBL::Funcgen::Collector::bin_method;
+
+  if (! defined $default) {
+    throw('Must pass a bin_method in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_method in your Collector');
+  }
+
+  $self->{'bin_method'} = $default;
+
+  return $self->{'bin_method'};
+}
+#We could replace this with a hash of bin_methods and models?
+#This could then be used to validate
+#Altho if we are going to commodotise the bin methods, then we need to be able to
+#define this in the child Collector. Could still do this by modifying the method/model
+#hash from the child Collector
+# bin_model
+#   Arg [1]    : (optional) String - bin model to cache
+#   Description: Getter/setter for the 'bin_model' attribute. When nothing is
+#                passed and nothing is cached, the package variable
+#                $Bio::EnsEMBL::Funcgen::Collector::bin_model is used. The model
+#                is validated whenever it is (re)set; only 'SIMPLE' is accepted.
+#   Returntype : The cached bin model
+#   Exceptions : Throws if no model is available, or if the model is not 'SIMPLE'
+sub bin_model {
+  my ($self, $bin_model) = @_;
+
+  my $cached = $self->{'bin_model'};
+
+  # Plain getter call with a model already cached; nothing to set or validate
+  return $cached if ! $bin_model && $cached;
+
+  if ($bin_model) {
+    $self->{'bin_model'} = $bin_model;
+  }
+  else {
+    # Fall back to the package level default
+    throw('Must pass -bin_model in the config or define $Bio::EnsEMBL::Funcgen::Collector::bin_model in your Collector')
+      if ! defined $Bio::EnsEMBL::Funcgen::Collector::bin_model;
+
+    $self->{'bin_model'} = $Bio::EnsEMBL::Funcgen::Collector::bin_model;
+  }
+
+  # Validate after setting, as in the original (the value stays cached on failure)
+  if ($self->{'bin_model'} ne 'SIMPLE') {
+    throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models');
+  }
+
+  return $self->{'bin_model'};
+}
+#This can be overridden by adaptor method
+#At present this could cause problems as we can pass window sizes in the config, but they will never be set
+#as adaptor method is not a setter. Adaptor method should throw if we try and set them as this could cause problems when fetching and not knowing the custom sizes?
+# window_sizes
+#   Arg [1]    : (optional) ARRAYREF - window sizes to cache; stored as passed (not copied)
+#   Description: Getter/setter for the 'window_sizes' attribute. When nothing is
+#                passed and nothing is cached, a copy of the package array
+#                @Bio::EnsEMBL::Funcgen::Collector::window_sizes is cached.
+#   Returntype : ARRAYREF of window sizes
+#   Exceptions : Throws if no sizes are available, or if the sizes are not a
+#                non-empty ARRAYREF
+sub window_sizes {
+  my ($self, $sizes) = @_;
+
+  if ($sizes || ! $self->{'window_sizes'}) {
+    my $candidate;
+
+    if ($sizes) {
+      $candidate = $sizes;
+    }
+    else {
+      # Fall back to the package level default
+      if (! @Bio::EnsEMBL::Funcgen::Collector::window_sizes) {
+        throw('Must pass -windows_sizes in the config or define @Bio::EnsEMBL::Funcgen::Collector::window_sizes in your Collector');
+      }
+
+      $candidate = [ @Bio::EnsEMBL::Funcgen::Collector::window_sizes ];
+    }
+
+    # Validate before caching, so that an invalid value is never left on the
+    # object for later getter calls to return unchecked.
+    if (ref($candidate) ne 'ARRAY' ||
+        scalar(@{$candidate}) == 0) {
+      throw('window_sizes must be an arrayref of at least one window size');
+    }
+
+    $self->{'window_sizes'} = $candidate;
+  }
+
+  return $self->{'window_sizes'};
+}
+#Optional attrs dependent on whether the Collection is packed
+#Can be redefined in the adaptor, but be careful never to redefine the actual values,
+#as these should really be constants for a given Collector
+#What is best here? We only want pack methods for storing/fetching compressed collections
+#Move this to base feature adaptor and define attrs as constants using
+#package variable? Or directly in new?
+#Then direct modification will be caught.
+#Just leave here for now.
+#Caller _obj_from_sth/store
+# pack_template
+#   Arg [1]    : (optional) String - per score pack template to cache
+#   Description: Getter/setter for the 'pack_template' attribute. When nothing is
+#                passed and nothing is cached, the package variable
+#                $Bio::EnsEMBL::Funcgen::Collector::pack_template is used.
+#   Returntype : The cached pack template
+#   Exceptions : Throws if no template is passed, none is cached and the package
+#                variable is undefined
+sub pack_template {
+  my ($self, $template) = @_;
+
+  # Explicit value always wins
+  return $self->{'pack_template'} = $template if $template;
+
+  if (! $self->{'pack_template'}) {
+    # Fall back to the package level default
+    my $default = $Bio::EnsEMBL::Funcgen::Collector::pack_template;
+
+    throw('Must pass a per score pack_template in the config or define $Bio::EnsEMBL::Funcgen::Collector::pack_template in your Collector')
+      if ! defined $default;
+
+    $self->{'pack_template'} = $default;
+  }
+
+  return $self->{'pack_template'};
+}
+#Caller _obj_from_sth/store & current_packed_size
+# packed_size
+#   Arg [1]    : (optional) Int - size of one packed score (with respect to
+#                pack_template); must be a positive whole number
+#   Description: Getter/setter for the 'packed_size' attribute. When nothing is
+#                passed and nothing is cached, the package variable
+#                $Bio::EnsEMBL::Funcgen::Collector::packed_size is used.
+#   Returntype : The cached packed size
+#   Exceptions : Throws if Arg [1] is true but not a positive whole number, or
+#                if no size is available at all
+sub packed_size {
+  my ($self, $size) = @_;
+
+  if ($size) {
+    # int() would silently accept values such as 1.5, '2bytes' or a reference,
+    # so match the digits explicitly.
+    if ($size !~ /\A[0-9]+\z/ || $size == 0) {
+      throw("$size is not an integer, must pass a size integer for packed_size which specifies size of pack_template:\t".$self->pack_template);
+    }
+
+    $self->{'packed_size'} = $size;
+  }
+  elsif (! $self->{'packed_size'}) {
+    # Fall back to the package level default
+    if (! defined $Bio::EnsEMBL::Funcgen::Collector::packed_size) {
+      throw('Must pass a packed_size(wrt to pack_template) in the config or define $Bio::EnsEMBL::Funcgen::Collector::packed_size in your Collector');
+    }
+
+    $self->{'packed_size'} = $Bio::EnsEMBL::Funcgen::Collector::packed_size;
+  }
+
+  return $self->{'packed_size'};
+}
+#These methods are used by the descendant Collector
+#for caching information whilst building collections
+#This is used to log how big a collection has grown before storing
+# current_packed_size
+#   Arg [1]    : window size whose score cache is measured
+#   Description: Size of the scores currently cached for a window size, i.e. the
+#                number of cached scores multiplied by packed_size. The value is
+#                always derived from the score cache rather than stored.
+#   Returntype : Int
+sub current_packed_size {
+  my ($self, $wsize) = @_;
+
+  my $cached_scores = $self->score_cache($wsize);
+  my $num_scores    = scalar(@{$cached_scores});
+
+  return ($num_scores * $self->packed_size);
+}
+# score_cache
+#   Arg [1]    : window size the cache belongs to
+#   Arg [2]    : (optional) ARRAYREF - scores to append to the cache
+#   Description: Appends scores to, or returns, the per window size score cache.
+#                The cache is created as an empty array on first access.
+#   Returntype : Without Arg [2]: ARRAYREF, the live cache (not a copy).
+#                With Arg [2]: the value returned by push, i.e. the new number
+#                of cached scores.
+sub score_cache {
+  my ($self, $wsize, $scores) = @_;
+
+  my $cache = ($self->{'score_cache'}{$wsize} ||= []);
+
+  # Getter: hand back the reference itself, so callers do not pay for a copy
+  return $cache if ! defined $scores;
+
+  return push @{$cache}, @{$scores};
+}
+#These last methods are only used for the 0 wsize
+#(natural resolution) and are with respect to the orig_slice passed
+#to store_window_bins_by_Slice
+# collection_start
+#   Arg [1]    : window size the collection belongs to
+#   Arg [2]    : (optional) start value to cache for this window size
+#   Description: Getter/setter for the per window size collection start.
+#   Returntype : The cached start for Arg [1]; undef if never set
+sub collection_start {
+  my ($self, $wsize, $sr_start) = @_;
+
+  my $starts = ($self->{'collection_start'} ||= {});
+
+  # defined test, as a start of 0 is still a value to store
+  $starts->{$wsize} = $sr_start if defined $sr_start;
+
+  return $starts->{$wsize};
+}
+# collection_end
+#   Arg [1]    : window size the collection belongs to
+#   Arg [2]    : (optional) end value to cache for this window size
+#   Description: Getter/setter for the per window size collection end.
+#   Returntype : The cached end for Arg [1]; undef if never set
+sub collection_end {
+  my ($self, $wsize, $sr_end) = @_;
+
+  # Getter
+  return $self->{'collection_end'}{$wsize} unless defined $sr_end;
+
+  # Setter
+  return $self->{'collection_end'}{$wsize} = $sr_end;
+}
+# collection_strand
+#   Arg [1]    : window size the collection belongs to
+#   Arg [2]    : (optional) strand value to cache for this window size
+#   Description: Getter/setter for the per window size collection strand.
+#   Returntype : The cached strand for Arg [1]; undef if never set
+sub collection_strand {
+  my ($self, $wsize, $strand) = @_;
+
+  # defined test, as a strand of 0 is still a value to store
+  return defined $strand
+    ? ($self->{'collection_strand'}{$wsize} = $strand)
+    : $self->{'collection_strand'}{$wsize};
+}
+=pod
+sub _create_feature {
+my ( $this, $feature_type, $args ) = @_;
+my $feature = $this->SUPER::_create_feature( $feature_type, $args );
+if ( !$this->_lightweight() ) {
+my ( $phase, $end_phase, $stable_id, $version, $created_date,
+$modified_date, $is_current )
+= rearrange( [ 'PHASE',        'END_PHASE',
+'STABLE_ID',    'VERSION',
+'CREATED_DATE', 'MODIFIED_DATE',
+'IS_CURRENT'
+],
+%{$args} );
+push( @{$feature},
+$phase, $end_phase, $stable_id, $version, $created_date,
+$modified_date, $is_current );
+}
+return $feature;
+}
+sub _create_feature_fast {
+my ( $this, $feature_type, $args ) = @_;
+my $feature =
+$this->SUPER::_create_feature_fast( $feature_type, $args );
+return $feature;
+}
+#This might not be sensible for Features which are split across tables
+sub _tables {
+my ($this) = @_;
+my @tables = $this->SUPER::_tables();
+if ( $this->_lightweight() ) {
+return ( $tables[0] );
+}
+return @tables;
+}
+sub _columns {
+my ($this) = @_;
+my @columns = $this->SUPER::_columns();
+if ( $this->_lightweight() ) {
+	#What is this doing?
+	#Probably not sensible for ResultFeature
+@columns[ 5 .. $#columns ] = map( 1, 5 .. $#columns );
+}
+return @columns;
+}
+#Also not sensible for objects spread across several tables
+sub _default_where_clause {
+my ($this) = @_;
+if ( $this->_lightweight() ) {
+return '';
+}
+return $this->SUPER::_default_where_clause();
+}
+=cut
+#This need to be generic
+#Again we need to pass an accessor method/reference?
+#Will be some sort of generic fetch for feature adaptors
+#or while loop for in flat file accessor
+#rollback to be handled in caller?
+# To do
+#
+# 1 Allow variable chunks lengths (so we only have one resolution of windows?)
+#   This will allow SNP collections which currently define classification i.e colour
+#   Density of SNPs within window will define shading. Count will be displayed in zmenu
+#   This maybe something we have to do in the descendant
+#
+# 2 Implement collection param definition in/from descendant
+# return collection config from adaptor fetch
+# window size
+# fixed width?
+# render/collection style?
+# This chould be implemented in BaseFeatureAdaptor::generic_fetch?
+# Or could be done in the calling fetchmethod?
+#
+#need to change this to get_window_bin_by_Slice
+#to enable generating bins on uncompressed data
+#Need to remove all counts and store based code to store caller
+#this would mean removing any pack based code too
+#separate set_config method
+#Probelm here is size of slice?
+#We need to generate bins all in one go, but also need to store at interval
+#so as not to explode memory
+#Do we need to separate the window generation from the bin generation code?
+#Define the optimal way to generate windowed data by
+#finding the most common denominator
+# _define_window_chunks
+#   Arg [1]    : ARRAYREF - window sizes; 0 denotes the natural resolution
+#   Arg [2]    : Int - maximum chunk length to consider
+#   Description: Works out which chunk (slice) lengths to walk along a slice with,
+#                such that every non-zero window size divides the chunk length it
+#                is assigned to.
+#                Candidate lengths are the multiples of the largest window size
+#                which are <= Arg [2], tried longest first. If one of them is
+#                divisible by every window size it is used for all windows.
+#                Otherwise the windows are split into sets: repeatedly take the
+#                candidate which serves the most outstanding windows (longest
+#                wins a tie); windows which share no candidate get the longest
+#                multiple of their own size which is <= Arg [2].
+#                When only the natural resolution (0) is requested, the
+#                object's max_view_width is returned as the single chunk length.
+#   Returntype : HASHREF of chunk length => ARRAYREF of window sizes (ascending)
+#   Exceptions : Throws if no window sizes are passed, or if a window size has
+#                no workable chunk length (e.g. it is larger than Arg [2])
+sub _define_window_chunks {
+  my ($self, $window_sizes, $max_view_size) = @_;
+
+  my @wsizes = sort { $a <=> $b } @{$window_sizes};
+
+  if (! @wsizes) {
+    throw('Must provide at least one window size to define window chunks');
+  }
+
+  # Registry of window size => { workable chunk length => [] }
+  # The natural resolution (0) is left out as it works with any chunk length
+  my %workable_chunks = map  { $_ => {} } grep { $_ != 0 } @wsizes;
+  my @bin_wsizes      = sort { $a <=> $b } keys %workable_chunks;
+
+  # Only natural resolution requested, so default to the max view width
+  if (! @bin_wsizes) {
+    my %natural_only = ($self->max_view_width => [0]);
+    return \%natural_only;
+  }
+
+  ### FIND ONE CHUNK LENGTH FOR ALL WINDOWS
+  my $largest_wsize = $bin_wsizes[-1];
+  my $chunk_length  = int($max_view_size / $largest_wsize) * $largest_wsize;
+  my $not_divisible = 1;
+
+  while ($not_divisible && $chunk_length != 0) {
+    $not_divisible = 0;
+
+    foreach my $wsize (@bin_wsizes) {
+
+      if ($chunk_length % $wsize) {
+        $not_divisible = 1;
+      }
+      else {
+        # Record every window this length works for; used below if no
+        # single length works for all windows
+        $workable_chunks{$wsize}{$chunk_length} = [];
+      }
+    }
+
+    # Gradually shrink the length until we find a workable length for all windows
+    $chunk_length -= $largest_wsize if $not_divisible;
+  }
+
+  my %chunk_windows; # Registry of chunk lengths to run with windows
+
+  if ($chunk_length != 0) {
+    $chunk_windows{$chunk_length} = [ @bin_wsizes ];
+    print "Found workable chunk length($chunk_length) for all window sizes:\t".
+      join(' ', @{$chunk_windows{$chunk_length}})."\n";
+
+    return \%chunk_windows;
+  }
+
+  ### SPLIT WINDOWS INTO SETS, EACH WITH ITS OWN CHUNK LENGTH
+  print "Could not find chunk length for all window sizes, attempting to subset windows using alternate slice length\n";
+
+  # All the candidate lengths tried above, longest first
+  my %tried        = map  { $_ => 1 } map { keys %{$_} } values %workable_chunks;
+  my @tried_chunks = sort { $b <=> $a } keys %tried;
+  my %remaining    = map  { $_ => 1 } @bin_wsizes;
+  my $num_sets     = 0;
+
+  while (%remaining) {
+    my @rwindows = sort { $a <=> $b } keys %remaining;
+    my ($next_chunk, @served);
+
+    # Candidate serving the most outstanding windows; as candidates are
+    # sorted longest first, the strict > keeps the longest on a tie
+    foreach my $chunk (@tried_chunks) {
+      my @divisible = grep { exists $workable_chunks{$_}{$chunk} } @rwindows;
+
+      if (scalar(@divisible) > scalar(@served)) {
+        ($next_chunk, @served) = ($chunk, @divisible);
+      }
+    }
+
+    if (scalar(@served) < 2) {
+      # No shared candidate, so find a suitably large chunk for just this
+      # window, bounded by the max view size passed rather than a fixed value
+      my $wsize   = $rwindows[0];
+      $next_chunk = int($max_view_size / $wsize) * $wsize;
+
+      if (! $next_chunk) {
+        throw('Could not find workable slice length for remaining windows: '.
+              join(', ', @rwindows));
+      }
+
+      # Pick up any other outstanding windows which happen to fit this length
+      @served = grep { $next_chunk % $_ == 0 } @rwindows;
+    }
+
+    push @{$chunk_windows{$next_chunk}}, @served;
+    delete @remaining{@served};
+
+    if ($num_sets == 0) {
+      print "Largest chunk $next_chunk(".scalar(@served).") contains windows: @served\n";
+    }
+    else {
+      print "Found next chunk length $next_chunk contains remaining windows:\t@served\n";
+    }
+
+    $num_sets++;
+  }
+
+  return \%chunk_windows;
+}
+#Let's concentrate on store function first before we split out into store and fetch methods
+#How will this work with the Bed parser?
+#The descendant collector will sort the input and detect the current slice before calling
+#store_window_bins_by_Slice.  This may require some caching of line or seeking as we will see the next slice before we have a chance to set it.
+#This will store as ResultFeature collections, so maybe we need to separate the input from output code?
+#i.e. Bed parser/wrapper
+#     ResultFeatureAdaptor wrapper
+#These
+#Problem with passing window_sizes here
+#We need to check that they aren't already defined a class variables as this could potentially
+#screw up retrieval, expect for only 0 or all but 0
+#Should we remove this config and force the class variable to be set in the 'adaptor'
+#Method is then only used internally, make private or only getter? Set by changing class vars?
+sub store_window_bins_by_Slice{
+my ($self, $slice, %config) = @_;
+my ($window_sizes, $logic_name, $bin_method, $fetch_method_ref, $max_view_width,
+	  $max_data_type_size, $pack_template, $packed_size, $bin_model, $new_assm, $skip_zero_window) =
+rearrange( [ 'WINDOW_SIZES', 'LOGIC_NAME', 'BIN_METHOD', 'FETCH_METHOD_REF', 'MAX_VIEW_WIDTH', 'MAX_DATA_TYPE_SIZE', 'PACK_TEMPLATE', 'PACKED_SIZE', 'BIN_MODEL', 'NEW_ASSEMBLY', 'SKIP_ZERO_WINDOW'], %config );
+warn "Need to be careful here about cleaning start end strand caches between serially run slices";
+### VAILDATE VARS/CONFIG
+#This could be done once in set_config, could then remove setter bahviour from attr methods?
+#All default defs params/methods can be overridden by config params
+#Attrs used in this method
+$bin_method   = $self->bin_method($bin_method);
+$bin_model    = $self->bin_model($bin_model);
+#$window_sizes = $self->window_sizes($window_sizes);#Now done below
+#Set to undef if we ave empty array
+$window_sizes = undef if (ref($window_sizes) eq 'ARRAY' && scalar(@$window_sizes) == 0);
+#Attrs used in other (store) methods
+$self->pack_template($pack_template);
+$self->packed_size($packed_size);
+$self->max_data_type_size($max_data_type_size);
+$self->max_view_width($max_view_width);
+#Other vars
+$self->new_assembly($new_assm);
+#Need to validate slice here
+warn "temp hack for bin_method validation";
+$bin_method = $self->validate_bin_method($bin_method);
+### Set window_sizes
+if($self->new_assembly){
+	print "Assembly projection may cause problems for large Collections, defaulting to window_sizes = (0)\n";
+	#Then build the bins on the projected 0 level single ResultFeatures
+	#Test we haven't explicity set window_sizes to be soemthing else
+	if($window_sizes &&
+	   ! ( scalar(@$window_sizes) == 1 && $window_sizes[0] == 0)){
+	  throw("You have set window_sizes config which are not safe when projecting to a new assembly($new_assm), please omit window_sizes config or set to 0");
+	}
+	$window_sizes = $self->window_sizes([0]);
+}
+else{
+	if($window_sizes && $skip_zero_window && grep/^0$/,@$window_sizes){
+	  throw("You have specied skip_zero_window and window_size 0 in your config, please remove one of these");
+	}
+	elsif($window_sizes && ! grep/^0$/,@$window_sizes){
+	  $skip_zero_window = 1;
+	  unshift @$window_sizes, 0;#re-add 0 window as we need this to build the collections
+	}
+	$window_sizes = $self->window_sizes($window_sizes);
+}
+#This is already done in the script
+if($skip_zero_window && $new_assm){
+	throw("You cannot -skip_zero_window or omit 0 from -window_sizes when projecting to a new assembly($new_assm) which should only be generated using window_size=0");
+}
+### Rollback previously stored features
+if($self->can('rollback_Features_by_Slice')){
+	$self->rollback_Features_by_Slice($slice);
+}
+else{
+	#This is currently the only warn output we can't get rid off
+	warn ref($self)." cannot rollback_Features_by_Slice. This may result in duplicate Collections being stored if there is pre-existing data";
+}
+### PROCESS CHUNKS
+#Not lightweight as we will be storing them
+# Temporarily set the collection to be lightweight???
+#my $old_value = $this->_lightweight();
+#if   ( defined($lightweight) ) { $this->_lightweight($lightweight) }
+#else                           { $this->_lightweight(1) }
+my %chunk_windows = %{$self->_define_window_chunks($self->window_sizes, $self->max_view_width)};
+my (%counts, $store_natural);
+$store_natural = grep/^0/, @$window_sizes;
+$counts{0}=0;#Set natural res count to 0
+my $slice_end        = $slice->end;
+my $orig_slice       = $slice;
+my $orig_start       = $slice->start;
+#my $slice_adj = $slice->start - 1;#Removed this as we are now generating features local to orig_slice
+#start/end conversion will be done in write/store_collection
+my $region           = $slice->coord_system_name;
+my $version          = $slice->coord_system->version;
+my $seq_region_name  = $slice->seq_region_name;
+my $strand           = $slice->strand;
+my $only_natural     = 0;
+#my $slice_adj = 0;
+#We need to account for only 0 here when doing projection
+#The chunk window is set to max_view_widht in _define_chunk_windows
+$only_natural = 1 if $store_natural && scalar(@$window_sizes) == 1;
+$store_natural = 0 if $skip_zero_window;
+#SHould really test these two, but should already be caught by now
+#Set the initial  collection_start to orig_start
+#Could default to 1, but we may not be starting from 1
+#This is not the case for 0 wsize where it must always be
+#The first feature start
+for my $wsize(@{$self->window_sizes}){
+	next if $wsize == 0;# && $skip_zero_window;#We never want to assume start of 0 window collection
+	$self->collection_start($wsize, $orig_start);
+}
+foreach my $chunk_length(sort keys %chunk_windows){
+	print "Processing windows ".join(', ', @{$chunk_windows{$chunk_length}}).
+	  " with chunk length $chunk_length\n";
+	map  $counts{$_} = 0, @{$chunk_windows{$chunk_length}};	#Set window counts to 0
+	#Now walk through slice using slice length chunks and build all windows in each chunk
+	my $in_slice     = 1;
+	my $start_adj    = 0;
+	my ($sub_end, $features, $bins);
+	my $sub_start    = 1;
+	my $slice_length = $slice->length;
+	#Can we subslice and then exclusivly use bin_start(local to orig_slice)
+	#Then we never have to deal with sr coord until we store
+	#This should me we never have to do the sr conversion unless we
+	#use a slice which doesn't start at 1(PAR or test)
+	#Always create in local coords for fetch
+	#Then change to seq_region coords for store if required
+	while($in_slice){
+	  #$sr_start       = $slice_start + $start_adj;
+	  $sub_start += $start_adj;
+	  #$slice_start = $sr_start;#Keep for next slice
+	  #$sr_end   = $sr_start + $chunk_length - 1;
+	  $sub_end   = $sub_start + $chunk_length - 1;
+	  #Last chunk might not be the correct window length
+	  #Hence why we should do this on whole chromosomes
+	  if($sub_end >= $slice_length){
+		#$sub_end = $slice_end;
+		#No longer set to slice end, as we don't want to corrupt the bin definition?
+		#Surplus bins are removed in store/write_collection in caller
+		#We could simply add the largest window the the end of the slice?
+		#Then we will only build the minimum of excess bins?
+		#This should be okay for bin calcs
+		#But may screw up bin trimming in caller as we currently expect $ub_end to be a valid bin end
+		#for all wsizes
+		#bin trimming should handle this, but this will corrupt the bin definition???
+		#bin definition is depedant on method
+		#So this method need to be agnostic
+		#And deal with the rest in descendant
+		$in_slice = 0;
+	  }
+	  $slice = $slice->adaptor->fetch_by_region($region, $seq_region_name, ($sub_start + $orig_start -1), ($sub_end + $orig_start - 1), $strand, $version);
+	  #Can't subslice as this will not clip if we go over the length of the slice, unlike normal slice fetching
+	  #hence we cannot rely on this
+	  #$slice = $orig_slice->sub_Slice($sub_start, $sub_end, $orig_slice->strand);
+	  #warn "got sub slice $slice as $sub_start - $sub_end from ".$orig_slice->name;
+	  ### Grab features and shift chunk coords
+	  #features may already be a 0 wsize collection if we have projected from an old assembly
+	  #Could move this check to get_Features_by_Slice?
+	  #e.g. [ $features, \%config ]
+	  $features = $self->get_Features_by_Slice($slice);
+	  #next if scalar(@$features) == 0;#We want to store values for all windows
+	  if( (@$features) &&
+		  (ref($features->[0]) =~ /Bio::EnsEMBL::Funcgen::Collection/) ){#Change to isa 'Bio::EnsEMBL::Collection
+		#Check that the returned feature/collections support window_size
+		if($features->[0]->can('window_size')){
+		  if($features->[0]->window_size != 0){
+			throw("You are trying to generated Collections from a non-zero window sized Collection:\t".$features->[1]->{'window_size'});
+		  }
+		  #This should never happen
+		  if(! $skip_zero_window){
+			throw('You have retrieved data from a Collection which without using -skip_zero_window i.e. you are trying to generate overwrite the data you are generating the Collections from');
+		  }
+		}
+		else{
+		  throw('Something si wrong, the Collection you have retrieved does not support the method window_size');
+		}
+	  }
+	  #Set collection start here for 0 window_size
+	  if(@$features && $store_natural && ! defined $self->collection_start(0)){
+		$self->collection_start(0, ($features->[0]->start + $sub_start));
+	  }
+	  $start_adj = $chunk_length if($in_slice);
+	  #This should return a hash of window size => bin array pairs
+	  if(! $only_natural){
+		$bins = $self->_bin_features_by_window_sizes(
+													 -slice         => $slice,
+													 -window_sizes  => $chunk_windows{$chunk_length},
+													 -bin_method        => $bin_method,
+													 -features      => $features,
+													);
+	  }
+	  #my $bin_start = $sr_start + $slice_adj;#This was only required for storing individual bins
+	  #Could calc bin_start + slice_adjust ahere for all features
+	  #Doing this will break old code for single window collections
+	  #This is sr start and should be local to orig_slice!
+	  #We need to handle strandedness of slice!?
+	  #Store all normal features in result_feature
+	  if($store_natural){
+		foreach my $feature(@$features){
+		  $counts{0}++;
+		  #warn "storing ".join(', ', ($feature->start, $feature->end, $feature->strand, $feature->scores->[0]));
+		  #Should we handle bin trimming here for overhanging slices
+		  #Then counts wil be correct and wont have to do in caller
+		  #We could stop here if the feature seq_region start > orig_slice end
+		  #Current done in write/store_collection
+		  #This may mean working in seq_region values rather than slice values
+		  #write_collection is implemented in descendant e.g. Bio::EnsEMBL::Funcgen::Collector::ResultFeature
+		  #as wrapper to adaptor store method or print to file
+		  #These params need to be generated in a way defined by the descendant
+		  #
+		  if($bin_model eq 'SIMPLE'){
+			#We need to pass the slice with this so we can sub slice when storing
+			#the collection and set the start/end to 1 and length of slice
+			#we still need to store the first start to be able to sub slice correctly
+			$self->collection_start(0, ($feature->start + $sub_start));
+			#Need to pass strand for 0 resolution
+			$self->write_collection(0,
+									$orig_slice,
+									#These are now wrt orig_slice
+									#($feature->start + $sub_start),
+									($feature->end   + $sub_start),
+									$feature->strand,
+									$feature->scores,
+									);
+			#We can have problems here if the original score type
+			#does not match the collected score type
+			#For max magnitude this is not an issue
+			#as we take the larget value from the bin
+			#But for other methods this may not be true
+			#e.g. count
+			#Hence, if we want to preserve the 0 window
+			#We must account for this in the feature collector
+			#e.g. set_collection_defs_by_ResultSet_window_size?
+			#Just omit 0 window for reads
+		  }
+		}
+		print "Window size 0 (natural resolution) has ".scalar(@{$features})." feature bins for:\t".$slice->name."\n";
+	  }
+	  #Now store bins
+	  #	  my ($bin_end, $bin_scores);
+	  my $num_bins;
+	  foreach my $wsize(sort keys %{$bins}){
+		$num_bins = scalar(@{$bins->{$wsize}});
+		#warn "$num_bins bin scores for $wsize:\t".join(',', @{$bins->{$wsize}});
+		#Should we handle bin trimming here for overhanging slices
+		#Then counts wil be correct and wont have to do in caller
+		$counts{$wsize}+= $num_bins;
+		#We don't need this loop for collections as we can simply push all the scores at once
+		#Just use the slice start and end
+		if($bin_model eq 'SIMPLE'){
+		  $self->write_collection($wsize,
+								  $orig_slice,
+								  #$sub_start,
+								  $sub_end,
+								  $orig_slice->strand,#This is most likely 1!
+								  #Override this woth 0 in descendant Collector if required.
+								  $bins->{$wsize},
+								 );
+		}
+		else{
+		  throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models');
+		  #i.e. More than one score
+		}
+#		#Reset start and end for new wsize
+#		$bin_start = $slice->start;
+#		$bin_end   = $slice->start;
+#
+#
+#
+#		#We don't need this loop for collections as we can simply push all the scores at once
+#
+#
+#		foreach my $bin_index(0..$#{$bins->{$wsize}}){
+#
+#
+#
+#		  #default method to handle simple fixed width bin?
+#		  #bin_end need to be defined dependant on the bin type
+#		  #($bin_start) = $self->process_default_bin($bins->{$wsize}->[$bin_index], $wsize);#?
+#
+#
+#
+#		  #either define default bin method in descendant
+#		  #Or can we set a process_bin_method var?
+#		  #No just pass all this info to write collection and handle it there?
+#
+#		  #Can we have just predefined rotueines handling different bin types?
+#		  #Simple
+#		  #Simple compressed
+#		  #Clipped
+#		  #This will prevent hanving to make attrs/method for storing persistent start/end/score info
+#
+#
+#
+#		  #Need validate bin_type method
+#		  #Could convert these to numbers for speed as with binning methods
+#
+#		  if($bin_model eq 'SIMPLE'){
+#
+#			$bin_scores = $bins->{$wsize}->[$bin_index];
+#
+#			warn "bin scores is $bin_scores";
+#
+#
+#			#next if ! $bin_score;#No we're no inc'ing the start ends for bins with no scores
+#
+#			$bin_end += $wsize;
+#
+#			#if($bin_score){#Removed this as we always want to write the score even if it is 0
+#
+#			  #This is a little backwards as we are generating the object to store it
+#			  #If we are aiming for speed the maybe we could also commodotise the store method
+#			  #store by args arrays? store_fast?
+#			  #Speed not essential for storing!
+#
+#			  #Note: list ref passed
+#
+#			  #Don't need to pass all this info for fixed width blob collections
+#			  #Need to write some default handlers depedant on the collection type
+#			  #Simple(original)
+#			  #Simple compressed
+#			  #Multi compressed
+#			  #Clipped uncompressed?
+#
+#
+#			  $self->write_collection($wsize,
+#									  $orig_slice,
+#									  ($bin_start + $slice_adj),
+#									  ($bin_end   + $slice_adj),
+#									  $orig_slice->strand,#This is most likely 0
+#									  $bin_scores,
+#									 );
+#
+#			  #Only count if we have a stored(projected?) feature
+#			$count++;#Change this to attr/method?
+#			#}
+#
+#			$bin_start += $wsize;
+#		  }
+#		  else{
+#			throw('Bio::EnsEMBL::Funcgen::Collector does not yet support non-SIMPLE bin models');
+		#	  }
+		#	}
+		#warn "Window size $wsize has ".scalar(@{$bins->{$wsize}})." bins";
+		#$counts{$wsize}+= $count;
+	  }
+	}
+	$store_natural = 0;	#Turn off storing of natural resolution for next chunk length sets
+}
+#Now need to write last collections for each wsize
+foreach my $wsize(@{$self->window_sizes}){
+	next if $wsize == 0 && ! $store_natural;
+	next if $wsize != 0 && $only_natural;
+	print "Writing final $wsize window_size collection, this may result in slightly different bin numbers from counts due to removing overhanging bins past end of slice\n";
+	$self->write_collection($wsize, $orig_slice);#store last collection
+}
+#Print some counts here
+foreach my $wsize(sort (keys %counts)){
+	print "Generated ".$counts{$wsize}." bins for window size $wsize for ".$orig_slice->name."\n";
+	#Some may have failed to store if we are projecting to a new assembly
+	#Need collection count here too, but would need methods for this?
+}
+#Return this counts hash so we can print/log from the caller, hence we don't print in here?
+return;
+}
+=head2 _bin_features_by_window_sizes
+Args[0]    : Bio::EnsEMBL::Slice
+Args[1]    : ARRAYREF of window sizes
+Args[2]    : int - bin method number, as returned by validate_bin_method
+Args[3]    : ARRAYREF of Bio::EnsEMBL::Features
+Example    : $bins = $self->_bin_features_by_window_sizes(
+													 -slice         => $slice,
+													 -window_sizes  => $chunk_windows{$chunk_length},
+													 -bin_method    => $bin_method,
+													 -features      => $features,
+													);
+Description: Bins feature scores for a given list of window sizes and predefined method number
+Returntype : HASHREF of scores per bin per window size
+Exceptions : Throws if bin method not supported
+Caller     : store_window_bins_by_Slice
+Status     : At Risk
+=cut
+#To do
+# 1 Remove Bio::EnsEMBL::Feature dependency? Or just create Features for non adaptor Collectors.
+#   Is there a way we can skip the object generation in the adaptor completely and just
+#   pass the values we need?
+# 2 Separate methods, so we can define custom methods in descendants?
+# 3 Expand %bins model to optionally be one of
+#   the following dependant on binning method
+#   Simple:  fixed width containing arrays of scores for each window
+#   Multi:   fixed width containing multiple arrays of scores for each window
+#   Non-simple?: Separate aggregated features, either fixed width or not, not BLOB!
+#   Clipped: default fixed width with option to clip start and end.  Needs start/end attrs
+#         Can't store this in a blob due to non-standard start ends?
+#         Most likely want more than one score here? Count/Density SNPs?
+#         Removes data skew from standard window bins, would need to store each bin and post
+#         process. Or do in line to avoid 2nd post-processing loop,requires awareness of when
+#         we have moved to a new bin between features.  This holds for overlapping and
+#         non-overlapping features. Once we have observed a gap we need to clip the end of the
+#         last bin and clip the start of the new bin. This requires knowing the greatest end
+#         values from the last bin's feature. what if two overlapping features had the same
+#         start and different end, would we see the longest last? Check default slice_fetch sort
+sub _bin_features_by_window_sizes{
+  #Bins the given features into fixed width windows, once for each requested window size.
+  #
+  #Named args (parsed by rearrange):
+  #  -slice         object providing length(), which fixes the number of bins per window size
+  #  -window_sizes  ARRAYREF of window sizes to bin by
+  #  -bin_method    method number: 0 (count), 5 (average score) or 6 (max magnitude)
+  #  -features      ARRAYREF of objects providing start() and end(), local to the slice
+  #
+  #Scores are read via $this->get_score_by_Feature($feature) for methods 5 and 6.
+  #Returns a HASHREF of window size => ARRAYREF containing one value per bin.
+  #Throws if the bin method is not one of those listed above.
+  my $this = shift;
+  my ( $slice, $window_sizes, $method, $features ) =
+    rearrange( [ 'SLICE', 'WINDOW_SIZES', 'BIN_METHOD', 'FEATURES' ], @_ );
+  #Do this conditional on the Collection type
+  #i.e. is collection seq_region blob then no else yes
+  #if ( !defined($features) || !@{$features} ) { return {} }
+  #Set up some hashes to store data by window_size
+  my (%bins, %nbins, %bin_counts);
+  #Set up some bin data for the windows
+  my $slice_length = $slice->length;
+  foreach my $wsize (@$window_sizes) {
+    #TO DO: Need to modify this block if default 0's are undesirable for collection type
+    #i.e. should it be undef instead? May have problems representing undef in blob
+    #nbins is actually the index of the last bin, not the 'number' of bins
+    $nbins{$wsize} = int($slice_length / $wsize); #int rounds down
+    #An exact multiple of the window size has no partial trailing bin
+    $nbins{$wsize}-- if(! ($slice_length % $wsize));
+    #Create default bins and bin counts with 0, one element per bin index
+    my $num_bins = $nbins{$wsize} + 1;
+    $bins{$wsize}       = [ (0) x $num_bins ];
+    $bin_counts{$wsize} = [ (0) x $num_bins ];
+  }
+  #This fails for slices which are smaller than the chunk length;
+  foreach my $feature ( @{$features} ) {
+    #Omit test for Bio::EnsEMBL::Feature here for speed
+    #Only needs start/end methods
+    foreach my $wsize (@$window_sizes) {
+      #Which bins do the start and end lie in for this feature?
+      #Already dealing with local starts, so no slice subtraction
+      my $start_bin = int(($feature->start) / $wsize);
+      my $end_bin   = int(($feature->end) / $wsize);
+      #A feature starting before the slice has a negative local start. A negative
+      #array index counts back from the end of the array, so without this clamp
+      #such a feature would be written into the last bins.
+      $start_bin = 0 if $start_bin < 0;
+      #Likewise trim features which overhang the end of the slice
+      $end_bin = $nbins{$wsize} if $end_bin > $nbins{$wsize};
+      #Match on method number (faster) to avoid string comparisons.
+      #Method numbers 1-4 (index, feature, fractional_count and coverage)
+      #are not implemented here and fall through to the throw below.
+      if ( $method == 0 ) {
+        # For 'count' and 'density'.
+        for my $bin_index ($start_bin .. $end_bin) {
+          $bins{$wsize}->[$bin_index]++;
+        }
+      }
+      elsif ( $method == 5 ) {
+        #average score
+        #This is simply an average of all the scores for features which overlap this bin
+        #No weighting with respect to the bin or the feature
+        for my $bin_index ($start_bin .. $end_bin) {
+          #we should really push onto array here so we can have median or mean.
+          $bins{$wsize}->[$bin_index] += $this->get_score_by_Feature($feature);
+          $bin_counts{$wsize}->[$bin_index]++;
+        }
+      }
+      elsif ( $method == 6 ) {
+        #Max magnitude
+        #Capture the lowest -ve and highest +ve scores here and
+        #pick between them when post processing
+        for my $bin_index ($start_bin .. $end_bin) {
+          my $score = $this->get_score_by_Feature($feature);
+          #The default bin value of 0 is replaced by a pair on first use
+          $bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve
+          my $extremes = $bins{$wsize}->[$bin_index];
+          if($score < $extremes->[0]){
+            $extremes->[0] = $score;
+          }
+          elsif($score > $extremes->[1]){
+            $extremes->[1] = $score;
+          }
+        }
+      }
+      else {
+        throw("Only accomodates average score method");
+      }
+    }
+  } ## end foreach my $feature ( @{$features...
+  #Now do post processing of bins
+  if( $method == 5){
+    #For average score, need to divide bins by bin_counts
+    foreach my $wsize(keys %bins){
+      foreach my $bin_index(0..$#{$bins{$wsize}}){
+        #Bins which no feature overlapped keep their default 0
+        if($bin_counts{$wsize}->[$bin_index]){
+          $bins{$wsize}->[$bin_index] /= $bin_counts{$wsize}->[$bin_index];
+        }
+      }
+    }
+  }
+  elsif( $method == 6){
+    #Max magnitude
+    #Take the highest value +ve or -ve score
+    foreach my $wsize(keys %bins){
+      foreach my $bin_index(0..$#{$bins{$wsize}}){
+        #Bins which no feature overlapped hold the default 0 rather than a
+        #[-ve, +ve] pair, and are left as they are
+        if($bins{$wsize}->[$bin_index]){
+          my ($lowest, $highest) = @{$bins{$wsize}->[$bin_index]};
+          #Collapse the pair to whichever score has the greater magnitude
+          if(($lowest * -1) > $highest){
+            $bins{$wsize}->[$bin_index] = $lowest;
+          }
+          else{
+            $bins{$wsize}->[$bin_index] = $highest;
+          }
+        }
+      }
+    }
+  }
+  elsif($method != 0){#Do no post processing for count(0)
+    throw('Collector currently only accomodates average_score, count and max magnitude methods');
+  }
+  #Could return bin_counts too summary reporting in zmenu
+  #Could also do counting of specific type
+  return \%bins;
+} ## end sub _bin_features_by_window_sizes
+=pod
+#These could potentially be used as code refs to avoid having the if else block
+#This way we can also define new methods in the descendant Collector?
+#Would have to have pass args and refs to bin hashes
+#This would slow things down over direct access here
+#But speed is no longer that critical as we do not use the Collector for display
+#purposes, only to build the Collections which are then used for display directly.
+sub calculate_average_score{
+my $self = shift;
+if ( $method == 5 ) {
+		#average score
+		#This is simple an average of all the scores for features which overlap this bin
+		#No weighting with respect to the bin or the feature
+		for ( my $bin_index = $start_bin ;
+			  $bin_index <= $end_bin ;
+			  ++$bin_index ) {
+		  #we should really push onto array here so we can have median or mean.
+		  $bins{$wsize}->[$bin_index] += $feature->score;
+		  $bin_counts{$wsize}->[$bin_index]++;
+		}
+	  }
+}
+sub post_process_average_score{
+}
+sub calculate_max_magnitude{
+my $self = shift;
+#Max magnitude
+#Take the highest value +ve or -ve score
+for ( my $bin_index = $start_bin ;
+		$bin_index <= $end_bin ;
+		++$bin_index ) {
+	#we really need to capture the lowest -ve and higest +ve scores here and post process
+	#To pick between them
+	my $score = $feature->score;
+	$bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve
+	if($score <  $bins{$wsize}->[$bin_index]->[0]){
+	  $bins{$wsize}->[$bin_index]->[0] = $score;
+	}
+	elsif($score > $bins{$wsize}->[$bin_index][1]){
+	  $bins{$wsize}->[$bin_index]->[1] = $score;
+	}
+}
+}
+sub post_process_max_magnitude{
+}
+=cut
+#separated to allow addition of non-standard methods
+#Could potentially add these in new
+#and put this back in _bin_features
+sub validate_bin_method{
+  #Checks a binning method name and returns its method number
+  #(count => 0, average_score => 5, max_magnitude => 6).
+  #Throws, listing the valid method names, if the name is not recognised.
+  my ($self, $method) = @_;
+  #change this to set the coderefs?
+  #i.e. check $self->can('calculate_'.$method) and $self->can('post_process_'.$method)
+  #NOTE(review): $class::VALID_BINNING_METHODS is a scalar in the literal package 'class',
+  #it is not built from the class of $self - confirm whether a per class table was intended
+  $class::VALID_BINNING_METHODS ||= {};
+  my $valid_methods = $class::VALID_BINNING_METHODS;
+  #Add average_score to avoid changing Collection.pm
+  @{$valid_methods}{qw(average_score max_magnitude count)} = (5, 6, 0);
+  exists $valid_methods->{$method}
+    or throw(
+      sprintf(
+        "Invalid binning method '%s', valid methods are:\n\t%s\n",
+        $method,
+        join( "\n\t", sort( keys(%{$valid_methods}) ) ) ) );
+  return $valid_methods->{$method};
+}
+1;

Mercurial > repos > mahtabm > ensembl

comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Collector.pm @ 0:1f6dce3d34e0