Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/Collector.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 4 Genome Research Limited. All rights reserved. | |
| 5 | |
| 6 This software is distributed under a modified Apache license. | |
| 7 For license details, please see | |
| 8 | |
| 9 http://www.ensembl.org/info/about/code_licence.html | |
| 10 | |
| 11 =head1 CONTACT | |
| 12 | |
| 13 Please email comments or questions to the public Ensembl | |
| 14 developers list at <dev@ensembl.org>. | |
| 15 | |
| 16 Questions may also be sent to the Ensembl help desk at | |
| 17 <helpdesk@ensembl.org>. | |
| 18 | |
| 19 =cut | |
| 20 | |
| 21 =head1 NAME | |
| 22 | |
| 23 Bio::EnsEMBL::Utils::Collector | |
| 24 | |
| 25 =head1 SYNOPSIS | |
| 26 | |
| 27 # Inherit this base module in your feature specific Collector | |
| 28 # instance: | |
| 29 | |
| 30 package Bio::EnsEMBL::Funcgen::Collector::ResultFeature; | |
| 31 use base('Bio::EnsEMBL::Utils::Collector'); | |
| 32 | |
| 33 # ... and define package config variables | |
| 34 $Bio::EnsEMBL::Funcgen::Collector::bin_model = 'SIMPLE'; | |
| 35 $Bio::EnsEMBL::Funcgen::Collector::window_sizes = | |
| 36 [ 30, 65, 130, 260, 450, 648, 950, 1296 ]; | |
| 37 # Could replace 30 with 0 here for low density data at natural resolution | |
| 38 | |
| 39 $Bio::EnsEMBL::Utils::Collector::bin_method = | |
| 40 'count'; # only used by collector | |
| 41 | |
| 42 $Bio::EnsEMBL::Utils::Collector::packed_size = 2; | |
| 43 | |
| 44 # ... or simply use this module in a script either defining package | |
| 45 # config variables, or passing as parameters to the constructor: | |
| 46 | |
| 47 my $collector = | |
| 48 Bio::EnsEMBL::Utils::BaseCollector->new( -pack_template => 'v' ); | |
| 49 | |
| 50 $Bio::EnsEMBL::Funcgen::Collector::pack_template = 'v'; | |
| 51 | |
| 52 # Config variables can also be over-ridden by passing a config hash to | |
| 53 # the store_window_bins_by_Slice() method: | |
| 54 | |
| 55 $collector->store_window_bins_by_Slice( $slice, ( | |
| 56 -pack_template => 'v', | |
| 57 -packed_size => 2 ) ); | |
| 58 | |
| 59 # NOTE: Over-riding default config variables can cause problems when | |
| 60 # storing or fetching data. e.g. Fetch may revert to using defaults or | |
| 61 # table partitions may not match window sizes. | |
| 62 | |
| 63 =head1 DESCRIPTION | |
| 64 | |
| 65 This package is the base Collector class which contains generic | |
| 66 getter/setter methods along with the main 'collecting' methods which | |
| 67 perform the majority of the work in generating compressed data | |
| 68 collections optimised for web display. The bins produced are aimed at | |
| 69 storage in a BLOB representing an entire seq_region i.e. even bins with | |
| 70 no features/null data are encoded as a 0 score. Non-BLOB collections | |
| 71 are currently not supported. | |
| 72 | |
| 73 If your Collection class defines a Bio::EnsEMBL::Feature, then its | |
| 74 adaptor should inherit from the relevant Collection class. | |
| 75 | |
| 76 The minimum prerequisites of the input features/data are that they have | |
| 77 a start() and end() method. For instance a Bio::EnsEMBL::Features | |
| 78 generated from a database or parsed from a flat file. | |
| 79 | |
| 80 NOTE: This Collector does not have a lightweight mode previously used | |
| 81 for dynamic/on the fly collecting i.e. it does not take advantage of | |
| 82 bypassing object creation via the related BaseFeatureAdaptor method. | |
| 83 | |
| 84 =cut | |
| 85 | |
| 86 package Bio::EnsEMBL::Utils::Collector; | |
| 87 | |
| 88 use Bio::EnsEMBL::Utils::Argument ('rearrange'); | |
| 89 use Bio::EnsEMBL::Utils::Exception ('throw'); | |
| 90 use strict; | |
| 91 use warnings; | |
| 92 | |
| 93 ### Global package config vars | |
| 94 | |
| 95 # Defaults | |
| 96 our $max_view_width = 1000000; # Max bp width in location/detailed view | |
| 97 our $max_data_type_size = 16777216; # Default is 16MB for long BLOB | |
| 98 # This is really a guide value as this should be set in the inheriting | |
| 99 # Collector class by deducting the rest of the row size from this value. | |
| 100 # Is is upto the inheritor to handle checking whether this size has been | |
| 101 # exceeded. | |
| 102 | |
| 103 # NOTE: Theoretically the min window size is: slice_length/(16777216/2) | |
| 104 # So for human chr1: 249,250,621/(16777216/2) = 29.7 => 30. However, | |
| 105 # this size does not seem to directly translate to the MySQL | |
| 106 # max_allowed_packet_size. Increasing max_allowed_packet_size to 64MB | |
| 107 # solves this issue, and substr operation doesn't appear to incur any of | |
| 108 # the potential memory(4*) usage issues. | |
| 109 | |
| 110 # Others global package variables which are set in the inheriting | |
| 111 # Collector class. | |
| 112 our ( $bin_model, $bin_method, $pack_template, | |
| 113 $packed_size, $window_sizes ); | |
| 114 | |
| 115 =head2 new | |
| 116 | |
| 117 Args : None | |
| 118 Example : | |
| 119 | |
| 120 my $collector = Bio::EnsEMBL::XXX::Collector::FEATURE->new(); | |
| 121 $collector->store_windows_by_Slice($slice); | |
| 122 | |
| 123 # Where XXX is, e.g. Compara, FuncGen etc. | |
| 124 | |
| 125 Description: Simple new method to enable use of collector | |
| 126 when not inherited by a descendant of | |
| 127 Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor | |
| 128 | |
| 129 Returntype : Bio::EnsEMBL::XXX::Collector | |
| 130 Exceptions : None | |
| 131 Caller : Collector script | |
| 132 Status : At Risk | |
| 133 | |
| 134 =cut | |
| 135 | |
| 136 sub new { | |
| 137 return bless {}, $_[0]; # Simply blesses this class as an empty hash. | |
| 138 | |
| 139 # Do not set anything here, as will not be first in ISA for | |
| 140 # FeatureAdaptors. Hence, not guaranteed to be called. | |
| 141 } | |
| 142 | |
| 143 =head2 new_assembly | |
| 144 | |
| 145 Args : optional - string assembly version e.g. GRCh37 | |
| 146 Example : $collector->new_assembly('GRCh37'); | |
| 147 Description: Getter/Setter for new assembly version which should be | |
| 148 used to project only 0 wsize Collections. | |
| 149 Returntype : string | |
| 150 Exceptions : None | |
| 151 Caller : store_window_bins_by_Slice() or | |
| 152 write_collection() in inheriting Collector class. | |
| 153 Status : At Risk | |
| 154 | |
| 155 =cut | |
| 156 | |
| 157 sub new_assembly { | |
| 158 my ( $self, $new_asm ) = @_; | |
| 159 | |
| 160 if ( defined($new_asm) ) { | |
| 161 $self->{'new_assembly'} = $new_asm; | |
| 162 } | |
| 163 | |
| 164 return $self->{'new_assembly'}; | |
| 165 } | |
| 166 | |
| 167 ### Setter/Getter methods for basic/mandatory config | |
| 168 # Can also be set using package variables in the inheriting | |
| 169 # Collector/Adaptor or run script. Allows over-riding of defaults set | |
| 170 # in Adaptor/Collector. | |
| 171 | |
| 172 # Package variables used here instead of attrs to enable easy | |
| 173 # default config in inheriting class/script method. Provided | |
| 174 # for easy/standardised fetch access outside of this package | |
| 175 # i.e. Collectors/Adaptors | |
| 176 | |
| 177 =head2 max_data_type_size | |
| 178 | |
| 179 Args : optional - int Maximum size of collection in bytes | |
| 180 Example : $collector->max_data_type_size($new_max_size); | |
| 181 Description: Getter/Setter for max_data_type_size, default is | |
| 182 currently set at in this class as 16777216 (16MB), for | |
| 183 long BLOB. This is used by the write_collection() | |
| 184 method to determine when to build and store a compressed | |
| 185 collection. | |
| 186 Returntype : int | |
| 187 Exceptions : None | |
| 188 Caller : bins_per_record() and | |
| 189 write_collection() in inheriting Collector class. | |
| 190 Status : At Risk | |
| 191 | |
| 192 =cut | |
| 193 | |
| 194 sub max_data_type_size { | |
| 195 my ( $self, $size ) = @_; | |
| 196 | |
| 197 # Validate is sensible integer | |
| 198 | |
| 199 if ( defined($size) ) { | |
| 200 if ( $size !~ /^\d+$/ ) { | |
| 201 throw("max_data_type_size must be a integer of bytes, not $size"); | |
| 202 } | |
| 203 $max_data_type_size = $size; | |
| 204 } elsif ( !defined($max_data_type_size) ) { | |
| 205 # This should never happen as we have defaults in this module. | |
| 206 throw( 'You must define a ' | |
| 207 . '$Bio::EnsEMBL::Utils::Collector::max_data_type_size ' | |
| 208 . 'or pass -max_data_type_size config' ); | |
| 209 } | |
| 210 | |
| 211 return $max_data_type_size; | |
| 212 } | |
| 213 | |
| 214 =head2 max_view_width | |
| 215 | |
| 216 Args : optional - int Maximum width of view | |
| 217 Example : $collector->max_view_width($new_max_width); | |
| 218 Description: Getter/Setter for max_view_width, default is currently | |
| 219 set at in this class as 500000bp, for maximum level of | |
| 220 zoom permitted by location view. | |
| 221 Returntype : int | |
| 222 Exceptions : None | |
| 223 Caller : general | |
| 224 Status : At Risk | |
| 225 | |
| 226 =cut | |
| 227 | |
| 228 sub max_view_width { | |
| 229 my ( $self, $size ) = @_; | |
| 230 | |
| 231 # Validate is sensible integer | |
| 232 | |
| 233 if ( defined($size) ) { | |
| 234 if ( $size !~ /^\d+$/ ) { | |
| 235 throw("max_view_width must be a integer, not $size"); | |
| 236 } | |
| 237 $max_view_width = $size; | |
| 238 } elsif ( !defined $max_view_width ) { | |
| 239 # This should never happen as we have defaults in this module. | |
| 240 throw( 'You must define a ' | |
| 241 . '$Bio::EnsEMBL::Utils::Collector::max_view_width ' | |
| 242 . 'or pass -max_view_width config' ); | |
| 243 } | |
| 244 | |
| 245 return $max_view_width; | |
| 246 } | |
| 247 | |
| 248 =head2 bin_method | |
| 249 | |
| 250 Args[0] : optional - string name of bin method e.g. 'max_magnitude' | |
| 251 Args[1] : optional - Bio::EnsEMBL::Funcgen::Parsers::InputSet | |
| 252 Example : my $bin_method = $self->bin_method(); | |
| 253 Description: Getter/Setter for bin_method, default is normally set in | |
| 254 the inheriting Collector class either by package variable | |
| 255 or by passing a config hash via the store methods. | |
| 256 Returntype : string | |
| 257 Exceptions : Throws if cannot set by package variable | |
| 258 Caller : general | |
| 259 Status : At Risk | |
| 260 | |
| 261 =cut | |
| 262 | |
| 263 sub bin_method { | |
| 264 my ( $self, $bmethod, $config ) = @_; | |
| 265 | |
| 266 if ( defined($bmethod) ) { | |
| 267 $bin_method = $bmethod; | |
| 268 } | |
| 269 | |
| 270 if ( !defined($bin_method) ) { | |
| 271 throw( 'You must define a ' | |
| 272 . '$Bio::EnsEMBL::Utils::Collector::bin_method ' | |
| 273 . 'or pass -bin_method config' ); | |
| 274 } | |
| 275 | |
| 276 if ( !$self->can( "_calculate_" . $bin_method ) ) { | |
| 277 throw("$bin_method is not a valid/available binning method"); | |
| 278 } | |
| 279 | |
| 280 my $set_up_method = "_set_up_" . $bin_method; | |
| 281 if ( $self->can($set_up_method) ) { | |
| 282 $self->$set_up_method($config); | |
| 283 } | |
| 284 | |
| 285 return $bin_method; | |
| 286 } | |
| 287 | |
| 288 =head2 bin_model | |
| 289 | |
| 290 Args : optional - string bin model e.g. SIMPLE or COMPLEX | |
| 291 Example : my $bin_model = $self->bin_model; | |
| 292 Description: Getter/Setter for bin_model, default should be set in | |
| 293 inheriting Collector class. Currently only supports | |
| 294 'SIMPLE' bin model. | |
| 295 Returntype : string | |
| 296 Exceptions : Throws if bin_model is not SIMPLE | |
| 297 Caller : general | |
| 298 Status : At Risk | |
| 299 | |
| 300 =cut | |
| 301 | |
| 302 sub bin_model { | |
| 303 my ( $self, $bmodel ) = @_; | |
| 304 | |
| 305 if ( defined($bmodel) ) { | |
| 306 $bin_model = $bmodel; | |
| 307 } | |
| 308 | |
| 309 if ( !defined($bin_model) ) { | |
| 310 throw( 'You must define a ' | |
| 311 . '$Bio::EnsEMBL::Utils::Collector::bin_model ' | |
| 312 . 'or pass -bin_model config' ); | |
| 313 } | |
| 314 | |
| 315 if ( $bin_model ne 'SIMPLE' ) { | |
| 316 throw( 'Bio::EnsEMBL::Utils::Collector does not yet support ' | |
| 317 . 'non-SIMPLE bin models' ); | |
| 318 } | |
| 319 | |
| 320 return $bin_model; | |
| 321 } | |
| 322 | |
| 323 | |
| 324 =head2 window_sizes | |
| 325 | |
| 326 Args : optional - arrayref of window sizes | |
| 327 Example : | |
| 328 | |
| 329 foreach my $wsize ( @{ $collector->window_sizes } ) | |
| 330 { # Do some collecting | |
| 331 } | |
| 332 | |
| 333 Description: Getter/Setter for window_sizes. Default should be set | |
| 334 in inheriting Collector (if the config is dynamic), | |
| 335 FeatureAdaptor class or script using package variable or | |
| 336 this method. | |
| 337 NOTE: Redefining these may cause a mismatch with the | |
| 338 table partition definition. | |
| 339 Returntype : arrayref of ints | |
| 340 Exceptions : Throws if cannot set a valid array of int window sizes | |
| 341 Caller : general | |
| 342 Status : At Risk - rename bin_sizes? | |
| 343 | |
| 344 =cut | |
| 345 | |
| 346 sub window_sizes { | |
| 347 my ( $self, $sizes ) = @_; | |
| 348 | |
| 349 if ( defined($sizes) ) { | |
| 350 $window_sizes = $sizes; | |
| 351 } | |
| 352 | |
| 353 if ( !( ref($window_sizes) | |
| 354 && ( ref($window_sizes) eq 'ARRAY' ) | |
| 355 && ( scalar(@$window_sizes) > 0 ) ) ) | |
| 356 { | |
| 357 throw('Must pass -windows_sizes in the config ' | |
| 358 . 'or define $Bio::EnsEMBL::Utils::Collector::window_sizes ' | |
| 359 . 'in your Collector as an array ref of integer window_sizes' ); | |
| 360 } | |
| 361 | |
| 362 return $window_sizes; | |
| 363 } | |
| 364 | |
| 365 | |
| 366 | |
| 367 | |
| 368 =head2 has_window_size | |
| 369 | |
| 370 Args : int - window size to validate | |
| 371 Example : if( $collector->has_window_size('30') ){ | |
| 372 #Do something wrt to 30bp window size | |
| 373 } | |
| 374 | |
| 375 Description: Simple utility method to validate whether this Collector | |
| 376 has a given window_size | |
| 377 Returntype : Boolean | |
| 378 Exceptions : Throws if window size not specified | |
| 379 Caller : general | |
| 380 Status : At Risk | |
| 381 | |
| 382 =cut | |
| 383 | |
| 384 | |
| 385 sub has_window_size{ | |
| 386 my ( $self, $size ) = @_; | |
| 387 | |
| 388 if(! defined $size){ | |
| 389 throw('You must pass a window size to validate'); | |
| 390 } | |
| 391 | |
| 392 return grep(/$size/, @$window_sizes); | |
| 393 } | |
| 394 | |
| 395 | |
| 396 | |
| 397 | |
| 398 ### Getter/Setters for BLOB collection config | |
| 399 # NOTE: Overriding the defaults here may cause a mismatch when the data | |
| 400 # is retrieved. | |
| 401 | |
| 402 =head2 pack_template | |
| 403 | |
| 404 Args : optional - string perl 'pack' template | |
| 405 Example : $self->pack_template('v'); | |
| 406 Description: Getter/Setter for pack_template. Default should be set | |
| 407 in inheriting Collector (if the config is dynamic), | |
| 408 FeatureAdaptor class or script using package variable or | |
| 409 this method. | |
| 410 Returntype : string | |
| 411 Exceptions : Throws if cannot set pack_template from package variable | |
| 412 Caller : FeatureAdaptor::_obj_from_sth | |
| 413 Status : At Risk | |
| 414 | |
| 415 =cut | |
| 416 | |
| 417 sub pack_template { | |
| 418 my ( $self, $template ) = @_; | |
| 419 | |
| 420 if ( defined($template) ) { | |
| 421 $pack_template = $template; | |
| 422 } | |
| 423 | |
| 424 if ( !defined($pack_template) ) { | |
| 425 throw( 'Must pass a per score ' | |
| 426 . '-pack_template in the config ' | |
| 427 . 'or define $Bio::EnsEMBL::Utils::Collector::pack_template ' | |
| 428 . 'in your Collector' ); | |
| 429 } | |
| 430 | |
| 431 return $pack_template; | |
| 432 } | |
| 433 | |
| 434 =head2 packed_size | |
| 435 | |
| 436 Args : optional - int size of perl 'pack' template in bytes | |
| 437 Example : $self->packed_size(2); | |
| 438 Description: Getter/Setter for packed_size. Default should be set | |
| 439 in inheriting Collector (if the config is dynamic), | |
| 440 FeatureAdaptor class or script using package variable or | |
| 441 this method. | |
| 442 Returntype : string | |
| 443 Exceptions : Throws if cannot set pack_template from pacakge variable | |
| 444 Caller : current_packed_size() and | |
| 445 FeatureAdaptor::_obj_from_sth() | |
| 446 Status : At Risk | |
| 447 | |
| 448 =cut | |
| 449 | |
| 450 sub packed_size { | |
| 451 my ( $self, $size ) = @_; | |
| 452 | |
| 453 if ( defined($size) ) { | |
| 454 $packed_size = $size; | |
| 455 } | |
| 456 | |
| 457 if ( !defined($packed_size) ) { | |
| 458 throw( 'Must pass -packed_size(wrt to pack_template) config ' | |
| 459 . 'or define $Bio::EnsEMBL::Utils::Collector::packed_size ' | |
| 460 . 'in your Collector' ); | |
| 461 } | |
| 462 | |
| 463 if ( $packed_size !~ /^\d+$/ ) { | |
| 464 throw( "$packed_size is not an integer, " | |
| 465 . "must pass a size integer for packed_size " | |
| 466 . "which specifies size of pack_template:\t" | |
| 467 . $pack_template ); | |
| 468 } | |
| 469 | |
| 470 return $packed_size; | |
| 471 } | |
| 472 | |
| 473 =head2 bins_per_record | |
| 474 | |
| 475 Example : my $bin_per_records = $self->bin_per_record | |
| 476 Description: Simple method to calculate the max number of bins | |
| 477 allowed per record given the current config. | |
| 478 Returntype : int | |
| 479 Exceptions : None | |
| 480 Caller : | |
| 481 Status : At Risk | |
| 482 | |
| 483 =cut | |
| 484 | |
| 485 sub bins_per_record { | |
| 486 return int( $max_data_type_size/$packed_size ); | |
| 487 } | |
| 488 | |
| 489 | |
| 490 =head2 current_packed_size | |
| 491 | |
| 492 Arg[0] : int - window size | |
| 493 Example : my $cps = $self->current_packed_size($wsize); | |
| 494 Description: Simple method to calculate the max number of bins | |
| 495 allowed per record given the current config. | |
| 496 Returntype : int | |
| 497 Exceptions : None | |
| 498 Caller : | |
| 499 Status : At Risk | |
| 500 | |
| 501 =cut | |
| 502 | |
| 503 sub current_packed_size { | |
| 504 my ( $self, $wsize ) = @_; | |
| 505 return ( scalar( @{ $self->score_cache($wsize) } )*$packed_size ); | |
| 506 } | |
| 507 | |
| 508 | |
| 509 =head2 score_cache | |
| 510 | |
| 511 Arg[0] : int - window size | |
| 512 Example : my $cps = $self->current_packed_size($wsize); | |
| 513 Description: Handles caching of bin scores for each window size | |
| 514 Returntype : arrayref | |
| 515 Exceptions : Throws if no window size defined | |
| 516 Caller : current_packed_size() and store_collection() | |
| 517 methods | |
| 518 Status : At Risk | |
| 519 | |
| 520 =cut | |
| 521 | |
| 522 sub score_cache { | |
| 523 my ( $self, $wsize, $scores ) = @_; | |
| 524 | |
| 525 if ( !defined($wsize) ) { | |
| 526 throw('Must pass a window size argument'); | |
| 527 } | |
| 528 | |
| 529 $self->{'score_cache'}{$wsize} ||= []; | |
| 530 | |
| 531 if ( defined($scores) ) { | |
| 532 push( @{ $self->{'score_cache'}{$wsize} }, @{$scores} ); | |
| 533 } | |
| 534 | |
| 535 return $self->{'score_cache'}{$wsize}; | |
| 536 } | |
| 537 | |
| 538 | |
| 539 =head2 collection_start | |
| 540 | |
| 541 Arg[0] : int - window_size | |
| 542 Arg[1] : optional int - seq_region_start | |
| 543 Example : my $coll_start->(150); | |
| 544 Description: Getter/Setter collection seq_region_start | |
| 545 Returntype : int | |
| 546 Exceptions : Throws if no window size defined | |
| 547 Caller : store_window_bin_by_Slice() and write_collection() | |
| 548 Status : At Risk | |
| 549 | |
| 550 =cut | |
| 551 | |
| 552 sub collection_start { | |
| 553 my ( $self, $wsize, $sr_start ) = @_; | |
| 554 | |
| 555 if ( !defined($wsize) ) { | |
| 556 throw('Must pass a window size argument'); | |
| 557 } | |
| 558 | |
| 559 if ( defined($sr_start) ) { | |
| 560 $self->{'collection_start'}{$wsize} = $sr_start; | |
| 561 } | |
| 562 | |
| 563 return $self->{'collection_start'}{$wsize}; | |
| 564 } | |
| 565 | |
| 566 | |
| 567 =head2 collection_end | |
| 568 | |
| 569 Arg[0] : int - window_size | |
| 570 Arg[1] : optional int - seq_region_end | |
| 571 Example : my $coll_end->(150); | |
| 572 Description: Getter/Setter collection seq_region_end | |
| 573 Returntype : int | |
| 574 Exceptions : Throws if no window size defined | |
| 575 Caller : inheriting Collector write_collection method | |
| 576 Status : At Risk | |
| 577 | |
| 578 =cut | |
| 579 | |
| 580 sub collection_end{ | |
| 581 my ($self, $wsize, $sr_end) = @_; | |
| 582 throw('Must pass a window size argument') if ! defined $wsize; | |
| 583 | |
| 584 if(defined $sr_end){ | |
| 585 $self->{'collection_end'}{$wsize} = $sr_end; | |
| 586 } | |
| 587 else{ | |
| 588 return $self->{'collection_end'}{$wsize}; | |
| 589 } | |
| 590 } | |
| 591 | |
| 592 | |
| 593 =head2 collection_strand | |
| 594 | |
| 595 Arg[0] : int - window_size | |
| 596 Arg[1] : optional int - seq_region_strand | |
| 597 Example : my $coll_start->(0); | |
| 598 Description: Getter/Setter collection seq_region_strand | |
| 599 Returntype : int | |
| 600 Exceptions : Throws if no window size defined | |
| 601 Caller : inheriting Collector write_collection method | |
| 602 Status : At Risk - Collections are currently strandless | |
| 603 | |
| 604 =cut | |
| 605 | |
| 606 sub collection_strand { | |
| 607 my ( $self, $wsize, $strand ) = @_; | |
| 608 | |
| 609 if ( !defined($wsize) ) { | |
| 610 throw('Must pass a window size argument'); | |
| 611 } | |
| 612 | |
| 613 if ( defined $strand ) { | |
| 614 $self->{'collection_strand'}{$wsize} = $strand; | |
| 615 } | |
| 616 | |
| 617 return $self->{'collection_strand'}{$wsize}; | |
| 618 } | |
| 619 | |
| 620 | |
| 621 ### Here follows the actual working methods | |
| 622 | |
| 623 =head2 _get_Slice_chunks | |
| 624 | |
| 625 Description: Defines the optimal set of slice chunks to use for | |
| 626 generating collections such that redundant fetches | |
| 627 are minimized. | |
| 628 Returntype : hashref of window_size chunk size pairs | |
| 629 Exceptions : Throws if no window sizes or max_view_width defined | |
| 630 Caller : store_window_bin_by_Slice() | |
| 631 Status : At Risk | |
| 632 | |
| 633 =cut | |
| 634 | |
| 635 sub _get_Slice_chunks { | |
| 636 my $self = shift; | |
| 637 | |
| 638 if ( !defined($window_sizes) || !defined($max_view_width) ) { | |
| 639 throw( 'You must pass both a window_size array ref ' | |
| 640 . 'and max_view_width arguments' ); | |
| 641 } | |
| 642 | |
| 643 if ( !defined( $self->{'_slice_chunks'} ) ) { | |
| 644 # Calulate sensible slice length based on window sizes | |
| 645 my @wsizes = sort { $a <=> $b } @$window_sizes; | |
| 646 | |
| 647 # Handle calculating only 0 wsize | |
| 648 if ( scalar(@wsizes) == 1 | |
| 649 && $wsizes[0] == 0 ) | |
| 650 { | |
| 651 return { $max_view_width => [0] }; | |
| 652 } | |
| 653 | |
| 654 my $multiplier = int( $max_view_width/$wsizes[$#wsizes] ); | |
| 655 | |
| 656 my $chunk_length = $multiplier*$wsizes[$#wsizes]; | |
| 657 my $not_divisible = 1; | |
| 658 | |
| 659 my %chunk_windows; # Registry of chunk lengths to run with windows | |
| 660 my %workable_chunks = map { $_ => {} } @wsizes; | |
| 661 | |
| 662 # get rid of natural resolution as this will always work | |
| 663 delete $workable_chunks{'0'}; | |
| 664 | |
| 665 while ( $not_divisible && $chunk_length != 0 ) { | |
| 666 $not_divisible = 0; | |
| 667 | |
| 668 foreach my $wsize (@wsizes) { | |
| 669 if ( $wsize == 0 ) { | |
| 670 # Special wsize for normal data | |
| 671 next; | |
| 672 } | |
| 673 | |
| 674 # Set not divisible if modulus is true | |
| 675 if ( $chunk_length % $wsize ) { | |
| 676 $not_divisible = 1; | |
| 677 } else { | |
| 678 $workable_chunks{$wsize}{$chunk_length} = []; | |
| 679 } | |
| 680 } | |
| 681 | |
| 682 # Gradually shrink the length until we find a workable slice | |
| 683 # length for all windows. | |
| 684 if ($not_divisible) { | |
| 685 $chunk_length -= $wsizes[$#wsizes]; | |
| 686 } | |
| 687 } | |
| 688 | |
| 689 my %chunk_sets; | |
| 690 | |
| 691 if ( $chunk_length == 0 ) { | |
| 692 print "Could not find chunk length " | |
| 693 . "for all window sizes, " | |
| 694 . "attempting to subset windows " | |
| 695 . "using alternate slice length\n"; | |
| 696 | |
| 697 foreach my $wsize ( keys(%workable_chunks) ) { | |
| 698 # Loop through windows, seeing if they are workable in the other | |
| 699 # windows. | |
| 700 | |
| 701 foreach my $chunk ( keys( %{ $workable_chunks{$wsize} } ) ) { | |
| 702 | |
| 703 foreach my $other_wsize ( keys %workable_chunks ) { | |
| 704 next if $wsize == $other_wsize; | |
| 705 | |
| 706 if ( exists( $workable_chunks{$other_wsize}{$chunk} ) ) { | |
| 707 # only push it onto the other wsize, as we will do the | |
| 708 # reverse later | |
| 709 $chunk_sets{$chunk}{$wsize} = undef; | |
| 710 } | |
| 711 } | |
| 712 } | |
| 713 } | |
| 714 | |
| 715 # %chunk_sets represents co-occurence of wsizes with repect to | |
| 716 # chunks. Take the set which has the most windows and the longest | |
| 717 # chunk. Then get the largest which handles the rest. | |
| 718 | |
| 719 # define possible set lengths | |
| 720 my $i = 0; | |
| 721 my %set_lengths; | |
| 722 | |
| 723 map { $set_lengths{$i} = []; $i++ } @wsizes; | |
| 724 | |
| 725 # get rid of natural resolution as this will always work | |
| 726 delete $set_lengths{'0'}; | |
| 727 | |
| 728 # Store chunks lengths for each set size | |
| 729 foreach my $chunk ( keys(%chunk_sets) ) { | |
| 730 my $set_size = scalar( values( %{ $chunk_sets{$chunk} } ) ); | |
| 731 push( @{ $set_lengths{$set_size} }, $chunk ); | |
| 732 } | |
| 733 | |
| 734 # Get the biggest set with the longest length; | |
| 735 | |
| 736 # Scalar here as we are disregarding natural resolution of 0 in | |
| 737 # loop. | |
| 738 my $largest_size = scalar(@wsizes); | |
| 739 my $found_largest_set = 0; | |
| 740 | |
| 741 while ( !$found_largest_set ) { | |
| 742 $largest_size--; | |
| 743 | |
| 744 if ( scalar( @{ $set_lengths{$largest_size} } ) > 0 ) { | |
| 745 $found_largest_set = 1; | |
| 746 } | |
| 747 } | |
| 748 | |
| 749 my ($largest_chunk) = | |
| 750 sort { $b <=> $a } @{ $set_lengths{$largest_size} }; | |
| 751 | |
| 752 my @largest_windows = keys %{ $chunk_sets{$largest_chunk} }; | |
| 753 @{ $chunk_windows{$largest_chunk} } = @largest_windows; | |
| 754 | |
| 755 print "Largest chunk $largest_chunk($largest_size) " | |
| 756 . "contains windows: @largest_windows\n"; | |
| 757 | |
| 758 my %remaining_windows = map { $_ => {} } @wsizes; | |
| 759 | |
| 760 # get rid of natural resolution as this will always work | |
| 761 delete $remaining_windows{'0'}; | |
| 762 | |
| 763 map { delete $remaining_windows{$_} } @largest_windows; | |
| 764 my $remaining_set_size = scalar( keys(%remaining_windows) ); | |
| 765 | |
| 766 # Use array here for practicality, would need to maintain hash if | |
| 767 # we need to iterate. | |
| 768 my @rwindows = keys(%remaining_windows); | |
| 769 | |
| 770 # Could be one window, but this will not be in the co-occurence | |
| 771 # hash %chunk_sets. | |
| 772 my $next_chunk; | |
| 773 | |
| 774 if ( scalar(@rwindows) == 1 ) { | |
| 775 my ($last_window) = @rwindows; | |
| 776 # Find a suitably large chunk for this one window. | |
| 777 $multiplier = int( 500000/$last_window ); | |
| 778 $next_chunk = $multiplier*$last_window; | |
| 779 } else { | |
| 780 | |
| 781 foreach my $chunk ( sort { $b <=> $a } | |
| 782 @{ $set_lengths{$remaining_set_size} } ) | |
| 783 { | |
| 784 my $seen_count = 0; | |
| 785 | |
| 786 foreach my $rwindow (@rwindows) { | |
| 787 if ( grep /$rwindow/, | |
| 788 ( values( %{ $chunk_sets{$chunk} } ) ) ) | |
| 789 { | |
| 790 $seen_count++; | |
| 791 } | |
| 792 } | |
| 793 | |
| 794 if ( $seen_count == $remaining_set_size ) { | |
| 795 $next_chunk = $chunk; | |
| 796 last; | |
| 797 } | |
| 798 } | |
| 799 | |
| 800 } | |
| 801 | |
| 802 @{ $chunk_windows{$next_chunk} } = @rwindows; | |
| 803 | |
| 804 if ( defined($next_chunk) ) { | |
| 805 print "Found next chunk length $next_chunk " | |
| 806 . "contains remaining windows:\t@rwindows\n"; | |
| 807 } else { | |
| 808 warn "Need to write iterative method for set definition"; | |
| 809 throw( 'Could not find workable slice length ' | |
| 810 . 'for remaining windows: ' | |
| 811 . join( ', ', @rwindows ) ); | |
| 812 } | |
| 813 } else { | |
| 814 @{ $chunk_windows{$chunk_length} } = keys(%workable_chunks); | |
| 815 print "Found workable chunk length $chunk_length " | |
| 816 . "for all window sizes:\t" | |
| 817 . join( ' ', @{ $chunk_windows{$chunk_length} } ) . "\n"; | |
| 818 } | |
| 819 | |
| 820 $self->{'_slice_chunks'} = \%chunk_windows; | |
| 821 } ## end if ( !defined( $self->...)) | |
| 822 | |
| 823 return $self->{'_slice_chunks'}; | |
| 824 } ## end sub _get_Slice_chunks | |
| 825 | |
| 826 | |
| 827 | |
| 828 | |
| 829 =head2 set_config | |
| 830 | |
| 831 Arg[0] : optional hash - parameter hash(see above methods for more info): | |
| 832 | |
| 833 WINDOW_SIZES => array ref - subset of defined window | |
| 834 sizes | |
| 835 BIN_METHOD => string | |
| 836 MAX_VIEW_WIDTH => int | |
| 837 MAX_DATA_TYPE_SIZE => int | |
| 838 PACK_TEMPLATE => string | |
| 839 PACKED_SIZE => int | |
| 840 BIN_MODEL => string | |
| 841 NEW_ASSEMBLY => string | |
| 842 METHOD_CONFIG => hash of method specific config params | |
| 843 SKIP_ZERO_WINDOW => boolean - skips generation of 0 wsize | |
| 844 this is used if already generated | |
| 845 from an assembly projection. | |
| 846 | |
| 847 NOTE: Over-riding any of the default config may cause | |
| 848 problems when storing or retrieving Collection data, | |
| 849 except sub sets of default window sizes. | |
| 850 | |
| 851 Description: This method replaces the constructor as new will not be | |
| 852 called for Adaptor based Collectors. | |
| 853 Separating this from the store method is currently | |
| 854 redundant as jobs are normally submitetd in Slice based | |
| 855 jobs. However, this will be required if the store method | |
| 856 is further seaprated into fetch/generate and store methods | |
| 857 Returntype : None | |
| 858 Exceptions : Throws if no window sizes or max_view_width defined | |
| 859 Caller : Inheritor Collector e.g. Bio::EnsEMBL::Funcgen:Collector::ResultFeature | |
| 860 or script. | |
| 861 Status : At Risk | |
| 862 | |
| 863 =cut | |
| 864 | |
| 865 sub set_config { | |
| 866 my ( $self, %config ) = @_; | |
| 867 | |
| 868 my ( $wsizes, $bmethod, $mv_width, | |
| 869 $md_type_size, $template, $psize, | |
| 870 $bmodel, $new_assm, $skip_zero_window, | |
| 871 $method_config ) | |
| 872 = rearrange( [ 'WINDOW_SIZES', 'BIN_METHOD', | |
| 873 'MAX_VIEW_WIDTH', 'MAX_DATA_TYPE_SIZE', | |
| 874 'PACK_TEMPLATE', 'PACKED_SIZE', | |
| 875 'BIN_MODEL', 'NEW_ASSEMBLY', | |
| 876 'SKIP_ZERO_WINDOW', 'METHOD_CONFIG' ], | |
| 877 %config ); | |
| 878 | |
| 879 ### VAILDATE/SET VARS/CONFIG | |
| 880 | |
| 881 # Attrs used in this method | |
| 882 $self->bin_method( $bmethod, $method_config ); | |
| 883 $self->bin_model($bmodel); | |
| 884 $self->window_sizes($wsizes); | |
| 885 | |
| 886 # Set to undef if we have empty array? To change this we need to | |
| 887 # pass the config hash -window_sizes conditionally | |
| 888 # This currently overwrite the defaults! | |
| 889 # if ( ref($window_sizes) eq 'ARRAY' | |
| 890 # && scalar( @{$window_sizes} ) == 0 ) | |
| 891 # { | |
| 892 # $window_sizes = undef; | |
| 893 # } | |
| 894 | |
| 895 # Attrs used in other (store) methods | |
| 896 $self->pack_template($template); | |
| 897 $self->packed_size($psize); | |
| 898 $self->max_data_type_size($md_type_size); | |
| 899 $self->max_view_width($mv_width); | |
| 900 | |
| 901 # Other vars | |
| 902 $self->new_assembly($new_assm); | |
| 903 $self->{'_only_natural'} = 0; | |
| 904 $self->{'_store_natural'} = grep /^0$/, @$window_sizes; | |
| 905 | |
| 906 ### Set window_sizes | |
| 907 | |
| 908 if ( $self->new_assembly() ) { | |
| 909 print "Assembly projection may cause problems " | |
| 910 . "for large Collections, " | |
| 911 . "defaulting to window_sizes = (0)\n"; | |
| 912 | |
| 913 | |
| 914 if ( $skip_zero_window ) { | |
| 915 throw( "You cannot -skip_zero_window or " | |
| 916 . "omit 0 from -window_sizes " | |
| 917 . "when projecting to a new assembly($new_assm) " | |
| 918 . "which should only be generated using window_size=0" ); | |
| 919 } | |
| 920 | |
| 921 | |
| 922 | |
| 923 | |
| 924 # Then build the bins on the projected 0 level single Features | |
| 925 | |
| 926 # Test we haven't explicity set window_sizes to be something else | |
| 927 if ( defined($wsizes) | |
| 928 && !( scalar(@$wsizes) == 1 && $wsizes->[0] == 0 ) ) | |
| 929 { | |
| 930 throw( "You have set window_sizes config " | |
| 931 . "which are not safe when projecting to " | |
| 932 . "a new assembly($new_assm), " | |
| 933 . "please omit window_sizes config or set to 0" ); | |
| 934 } | |
| 935 | |
| 936 $self->window_sizes( [0] ); | |
| 937 } else { | |
| 938 | |
| 939 if ( $wsizes && $skip_zero_window && | |
| 940 ( grep /^0$/, @$wsizes )) { | |
| 941 #Only test passed params not default config | |
| 942 | |
| 943 throw( "You have specied skip_zero_window " | |
| 944 . "and window_size 0 in your parameters, " | |
| 945 . "please remove one of these" ); | |
| 946 } | |
| 947 elsif ( defined($window_sizes) && !grep /^0$/, @$window_sizes ) { | |
| 948 $skip_zero_window = 1; | |
| 949 # re-add 0 window as we need this to build the collections | |
| 950 # see ... | |
| 951 unshift( @{$window_sizes}, 0 ); | |
| 952 } | |
| 953 } | |
| 954 | |
| 955 | |
| 956 if ( $self->{'_store_natural'} && scalar( @{$window_sizes} ) == 1 ) { | |
| 957 $self->{'_only_natural'} = 1; | |
| 958 } | |
| 959 if ($skip_zero_window) { | |
| 960 $self->{'_store_natural'} = 0; | |
| 961 } | |
| 962 | |
| 963 return; | |
| 964 } ## end sub set_config | |
| 965 | |
| 966 =head2 store_window_bins_by_Slice | |
| 967 | |
| 968 Arg[0] : Bio::EnsEMBL:Slice | |
| 969 Example : $collector->store_window_bins_by_Slice($slice); | |
| 970 Description: This is the main run method, it loops through | |
| 971 optimal slice chunks from _define_window_chunks, | |
| 972 calls _bin_features_by_Slice as appropriate and | |
| 973 calls write_collection in the inheriting Collector | |
| 974 class/script. | |
| 975 Returntype : None | |
| 976 Exceptions : Throws if Bio::EnsEMBL::Slice is not defined | |
| 977 Caller : store methods in inheriting Collector class/script | |
| 978 Status : At Risk | |
| 979 | |
| 980 =cut | |
| 981 | |
| 982 sub store_window_bins_by_Slice { | |
| 983 my ( $self, $slice ) = @_; | |
| 984 | |
| 985 warn "Need to be careful here " | |
| 986 . "about cleaning start end strand caches between " | |
| 987 . "serially run slices"; | |
| 988 | |
| 989 if ( !( defined($slice) | |
| 990 && ref($slice) | |
| 991 && $slice->isa('Bio::EnsEMBL::Slice') ) ) | |
| 992 { | |
| 993 throw('You must pass a valid Bio::EnsEMBL::Slice'); | |
| 994 } | |
| 995 | |
| 996 # Rollback previously stored features. | |
| 997 # Change 'can' to empty method stubb with pod ??? | |
| 998 if ( $self->can('rollback_Features_by_Slice') ) { | |
| 999 $self->rollback_Features_by_Slice($slice); | |
| 1000 } else { | |
| 1001 warn ref($self) | |
| 1002 . " cannot rollback_Features_by_Slice. " | |
| 1003 . "This may result in storage failure " | |
| 1004 . "or duplicate Collections if there is pre-existing data"; | |
| 1005 } | |
| 1006 | |
| 1007 ### PROCESS CHUNKS | |
| 1008 my %chunk_windows = %{ $self->_get_Slice_chunks }; | |
| 1009 my (%counts); | |
| 1010 my $store_natural = $self->{'_store_natural'}; | |
| 1011 my $only_natural = $self->{'_only_natural'}; | |
| 1012 $counts{0} = 0; # Set natural res count to 0 | |
| 1013 my $slice_end = $slice->end; | |
| 1014 my $orig_start = $slice->start; | |
| 1015 my $region = $slice->coord_system_name; | |
| 1016 my $version = $slice->coord_system->version; | |
| 1017 my $seq_region_name = $slice->seq_region_name; | |
| 1018 my $strand = $slice->strand; | |
| 1019 | |
| 1020 # Warn if this is not a full slice. Version needed in case we are | |
| 1021 # projecting from a non-default version slice | |
| 1022 my $full_slice = | |
| 1023 $slice->adaptor->fetch_by_region( $region, $seq_region_name, undef, | |
| 1024 undef, undef, $version ); | |
| 1025 | |
| 1026 if ( ( $full_slice->start() != $orig_start ) | |
| 1027 || ( $full_slice->end() != $slice_end ) ) | |
| 1028 { | |
| 1029 warn "Generating collections using sub-Slices " | |
| 1030 . "can result in data issues/artifacts"; | |
| 1031 # Last chunk might not be the correct window length. Test | |
| 1032 # slices less than chunk length can cause failures in | |
| 1033 # _bin_features_by_window_sizes others? | |
| 1034 } | |
| 1035 | |
| 1036 # Set the initial collection_start to orig_start. This is not the | |
| 1037 # case for 0 wsize where it must always be the true feature start. | |
| 1038 for my $wsize (@$window_sizes) { | |
| 1039 if ( $wsize == 0 ) { next } | |
| 1040 $self->collection_start( $wsize, $orig_start ); | |
| 1041 | |
| 1042 # Also reset collection end and score cache in case we are running | |
| 1043 # serially. | |
| 1044 $self->{collection_end}{$wsize} = undef; | |
| 1045 $self->{'score_cache'}{$wsize} = []; | |
| 1046 } | |
| 1047 | |
| 1048 my $first_chunk_length = 1; | |
| 1049 | |
| 1050 foreach my $chunk_length ( sort keys %chunk_windows ) { | |
| 1051 print "Processing windows " | |
| 1052 . join( ', ', @{ $chunk_windows{$chunk_length} } ) | |
| 1053 . " with chunk length $chunk_length\n"; | |
| 1054 | |
| 1055 # Set window counts to 0 | |
| 1056 map $counts{$_} = 0, @{ $chunk_windows{$chunk_length} }; | |
| 1057 | |
| 1058 # May need to reset flat file parser handle or other caches via | |
| 1059 # inheriting Collector | |
| 1060 if ( !$first_chunk_length ) { | |
| 1061 # Change 'can' to empty method stubb with pod??? | |
| 1062 if ( $self->can('reinitialise_input') ) { | |
| 1063 $self->reinitialise_input(); | |
| 1064 } | |
| 1065 } | |
| 1066 | |
| 1067 $first_chunk_length = 0; | |
| 1068 | |
| 1069 # Now walk through slice using slice length chunks and build all | |
| 1070 # windows in each chunk. | |
| 1071 my $in_slice = 1; | |
| 1072 my $start_adj = 0; | |
| 1073 my ( $sub_slice, $sub_end, $features, $bins ); | |
| 1074 my $sub_start = 1; | |
| 1075 my $slice_length = $slice->length(); | |
| 1076 | |
| 1077 # Always create in local coords for fetch | |
| 1078 # Then change to seq_region coords for store if required | |
| 1079 | |
| 1080 while ($in_slice) { | |
| 1081 $sub_start += $start_adj; | |
| 1082 $sub_end = $sub_start + $chunk_length - 1; | |
| 1083 | |
| 1084 if ( $sub_end >= $slice_length ) { | |
| 1085 # Surplus bins are removed in store/write_collection in caller | |
| 1086 $in_slice = 0; | |
| 1087 } | |
| 1088 | |
| 1089 $sub_slice = | |
| 1090 $slice->adaptor->fetch_by_region( $region, $seq_region_name, | |
| 1091 $sub_start + $orig_start - 1, | |
| 1092 $sub_end + $orig_start - 1, | |
| 1093 $strand, $version ); | |
| 1094 | |
| 1095 # Can't subslice as this will not clip if we go over the length of | |
| 1096 # the slice, unlike normal slice fetching. Will clipping the end | |
| 1097 # to the slice end cause any problems here? How will this affect | |
| 1098 # bin clipping? | |
| 1099 | |
| 1100 ### Grab features and shift chunk coords | |
| 1101 $features = $self->get_Features_by_Slice($sub_slice); | |
| 1102 | |
| 1103 # warn "Binning " | |
| 1104 # . scalar(@$features) | |
| 1105 # . " Features for chunk length $chunk_length, on Slice " | |
| 1106 # . $sub_slice->name; | |
| 1107 | |
| 1108 if ( ( @{$features} ) | |
| 1109 && ref( $features->[0] ) =~ /Bio::EnsEMBL::Utils::Collection/ ) | |
| 1110 { | |
| 1111 # Would need to create base module with generic methods: | |
| 1112 # window_size, ... | |
| 1113 | |
| 1114 # Check that the returned feature/collections support window_size. | |
| 1115 # All Collections should be able to | |
| 1116 | |
| 1117 if ( $features->[0]->can('window_size') ) { | |
| 1118 if ( $features->[0]->window_size != 0 ) { | |
| 1119 throw( "You are trying to generated Collections from " | |
| 1120 . "a non-zero window sized Collection:\t" | |
| 1121 . $features->[1]->{'window_size'} ); | |
| 1122 } | |
| 1123 | |
| 1124 # This should never happen | |
| 1125 # if ( !$skip_zero_window ) { | |
| 1126 # throw( 'You have retrieved data from a Collection ' | |
| 1127 # . 'which without using -skip_zero_window ' | |
| 1128 # . 'i.e. you are trying to generate overwrite ' | |
| 1129 # . 'the data you are generating the Collections from' ); | |
| 1130 # } | |
| 1131 | |
| 1132 } else { | |
| 1133 throw( 'Something is wrong, ' | |
| 1134 . 'the Collection you have retrieved ' | |
| 1135 . 'does not support the method window_size' ); | |
| 1136 } | |
| 1137 } ## end if ( ( @{$features} ) ...) | |
| 1138 | |
| 1139 # Set collection start here for 0 window_size | |
| 1140 if ( @{$features} | |
| 1141 && $store_natural | |
| 1142 && !defined( $self->collection_start(0) ) ) | |
| 1143 { | |
| 1144 $self->collection_start( 0, | |
| 1145 $features->[0]->start + $sub_start ); | |
| 1146 } | |
| 1147 | |
| 1148 if ($in_slice) { | |
| 1149 $start_adj = $chunk_length; | |
| 1150 } | |
| 1151 | |
| 1152 # Collect features into wsize bins | |
| 1153 if ( !$only_natural ) { | |
| 1154 # Get hashref of wsize=>bin array pairs | |
| 1155 $bins = | |
| 1156 $self->_bin_features_by_Slice_window_sizes( | |
| 1157 -slice => $sub_slice, | |
| 1158 -window_sizes => $chunk_windows{$chunk_length}, | |
| 1159 -features => $features, ); | |
| 1160 } | |
| 1161 | |
| 1162 # Handle 0 wsize | |
| 1163 if ($store_natural) { | |
| 1164 foreach my $feature ( @{$features} ) { | |
| 1165 $counts{0}++; | |
| 1166 | |
| 1167 if ( $bin_model eq 'SIMPLE' ) { | |
| 1168 $self->collection_start( 0, $feature->start + $sub_start ); | |
| 1169 | |
| 1170 $self->write_collection( | |
| 1171 0, | |
| 1172 $slice, # Pass Slice to sub-slice when storing | |
| 1173 $feature->end + $sub_start, | |
| 1174 $feature->strand, # Need to pass strand for 0 resolution | |
| 1175 $feature->scores, ); | |
| 1176 } | |
| 1177 } | |
| 1178 | |
| 1179 print "Window size 0 (natural resolution) has " | |
| 1180 . scalar( @{$features} ) | |
| 1181 . " feature bins for:\t" | |
| 1182 . $sub_slice->name . "\n"; | |
| 1183 } | |
| 1184 | |
| 1185 # Now store collections for wsizes >0 | |
| 1186 my $num_bins; | |
| 1187 | |
| 1188 foreach my $wsize ( sort keys( %{$bins} ) ) { | |
| 1189 $num_bins = scalar( @{ $bins->{$wsize} } ); | |
| 1190 $counts{$wsize} += $num_bins; | |
| 1191 | |
| 1192 if ( $bin_model eq 'SIMPLE' ) { | |
| 1193 $self->write_collection( | |
| 1194 $wsize, | |
| 1195 $slice, | |
| 1196 #$sub_start, | |
| 1197 $sub_end, | |
| 1198 $slice->strand, # This is most likely 1! | |
| 1199 # Override this woth 0 in descendant Collector if required. | |
| 1200 $bins->{$wsize}, ); | |
| 1201 | |
| 1202 } else { | |
| 1203 throw( 'Bio::EnsEMBL::Utils::Collector ' | |
| 1204 . 'does not yet support non-SIMPLE bin models' ); | |
| 1205 # i.e. More than one score | |
| 1206 } | |
| 1207 } | |
| 1208 } ## end while ($in_slice) | |
| 1209 | |
| 1210 # Turn off storing of natural resolution for next chunk length sets | |
| 1211 $store_natural = 0; | |
| 1212 } ## end foreach my $chunk_length ( ...) | |
| 1213 | |
| 1214 # Write last collections for each wsize | |
| 1215 | |
| 1216 foreach my $wsize (@$window_sizes) { | |
| 1217 | |
| 1218 if ( ( $wsize == 0 && !$store_natural ) | |
| 1219 || ( $wsize != 0 && $only_natural ) ) | |
| 1220 { | |
| 1221 next; | |
| 1222 } | |
| 1223 | |
| 1224 print "Writing final $wsize window_size collection, " | |
| 1225 . "this may result in slightly different " | |
| 1226 . "bin numbers from counts due to removing " | |
| 1227 . "overhanging bins past end of slice\n"; | |
| 1228 $self->write_collection( $wsize, $slice ); | |
| 1229 } | |
| 1230 | |
| 1231 # Print some counts | |
| 1232 foreach my $wsize ( sort ( keys %counts ) ) { | |
| 1233 print "Generated " | |
| 1234 . $counts{$wsize} | |
| 1235 . " bins for window size $wsize for " | |
| 1236 . $slice->name . "\n"; | |
| 1237 # Some may have failed to store if we are projecting to a new | |
| 1238 # assembly. | |
| 1239 } | |
| 1240 | |
| 1241 return; | |
| 1242 } ## end sub store_window_bins_by_Slice | |
| 1243 | |
| 1244 =head2 _bin_features_by_Slice_window_sizes | |
| 1245 | |
| 1246 Args[0] : Bio::EnsEMBL::Slice | |
| 1247 Args[1] : ARRAYREF of window sizes | |
| 1248 Args[2] : ARRAYREF of features with start and end method | |
| 1249 e.g. Bio::EnsEMBL::Features | |
| 1250 Example : | |
| 1251 | |
| 1252 $bins = | |
| 1253 $self->_bin_features_by_window_sizes( | |
| 1254 -slice => $slice, | |
| 1255 -window_sizes => $chunk_windows{$chunk_length}, | |
| 1256 -features => $features, ); | |
| 1257 | |
| 1258 Description: Bins feature scores for a given list of window sizes and | |
| 1259 predefined method. | |
| 1260 Returntype : HASHREF of scores per bin per window size | |
| 1261 Exceptions : None | |
| 1262 Caller : store_window_bins_by_Slice | |
| 1263 Status : At Risk | |
| 1264 | |
| 1265 =cut | |
| 1266 | |
| 1267 sub _bin_features_by_Slice_window_sizes { | |
| 1268 my ( $self, @args ) = @_; | |
| 1269 | |
| 1270 my ( $slice, $wsizes, $features ) = | |
| 1271 rearrange( [ 'SLICE', 'WINDOW_SIZES', 'FEATURES' ], @args ); | |
| 1272 | |
| 1273 # Generate these once in caller? | |
| 1274 my $calc_method = '_calculate_' . $bin_method; | |
| 1275 my $post_method = '_post_process_' . $bin_method; | |
| 1276 | |
| 1277 # Do this conditional on the Collection type i.e. is | |
| 1278 # collection seq_region blob then no else yes Would need | |
| 1279 # $Bio::EnsEMBL::Utils::Collector::collection_format=BLOB|STANDARD | |
| 1280 # if ( !defined($features) || !@{$features} ) { return {} } | |
| 1281 | |
| 1282 # Set up some hashes to store data by window_size | |
| 1283 my ( %bins, %nbins, %bin_counts ); | |
| 1284 my $slice_start = $slice->start(); | |
| 1285 my $slice_length = $slice->length(); | |
| 1286 | |
| 1287 # Set up some bin data for the windows | |
| 1288 foreach my $wsize (@$wsizes) { | |
| 1289 $nbins{$wsize} = int( $slice_length/$wsize ); # int rounds down | |
| 1290 # nbins is index of the bin not the 'number' | |
| 1291 # Unless $slice_length is a multiple! | |
| 1292 if ( !( $slice_length % $wsize ) ) { $nbins{$wsize}-- } | |
| 1293 | |
| 1294 # Create default bins with 0 | |
| 1295 $bins{$wsize} = []; | |
| 1296 map { $bins{$wsize}->[$_] = 0 } ( 0 .. $nbins{$wsize} ); | |
| 1297 | |
| 1298 # Set bin counts to 0 for each bin | |
| 1299 $bin_counts{$wsize} = []; | |
| 1300 | |
| 1301 # This is adding an undef to the start of the array!? | |
| 1302 map { $bin_counts{$wsize}->[ ($_) ] = 0 } @{ $bins{$wsize} }; | |
| 1303 | |
| 1304 foreach my $bin ( @{ $bins{$wsize} } ) { | |
| 1305 $bin_counts{$wsize}->[$bin] = 0; | |
| 1306 } | |
| 1307 } | |
| 1308 | |
| 1309 my $feature_index = 0; | |
| 1310 my ( $bin_index, @bin_masks ); | |
| 1311 | |
| 1312 foreach my $feature ( @{$features} ) { | |
| 1313 # Set up the bins for each window size | |
| 1314 | |
| 1315 foreach my $wsize (@$wsizes) { | |
| 1316 my $start_bin = int( ( $feature->start )/$wsize ); | |
| 1317 my $end_bin = int( ( $feature->end )/$wsize ); | |
| 1318 | |
| 1319 if ( $end_bin > $nbins{$wsize} ) { | |
| 1320 $end_bin = $nbins{$wsize}; | |
| 1321 } | |
| 1322 | |
| 1323 $self->$calc_method( $feature, $start_bin, $end_bin, | |
| 1324 $wsize, \%bins, \%bin_counts ); | |
| 1325 } | |
| 1326 | |
| 1327 } | |
| 1328 | |
| 1329 # Now do post processing of bins if required | |
| 1330 if ( $self->can($post_method) ) { | |
| 1331 $self->$post_method( \%bins, \%bin_counts ); | |
| 1332 } | |
| 1333 | |
| 1334 return \%bins; | |
| 1335 } ## end sub _bin_features_by_Slice_window_sizes | |
| 1336 # end sub _bin_features_by_Slice | |
| 1337 | |
| 1338 | |
| 1339 ### Here follows the bin methods | |
| 1340 # These may also be defined in the inheriting Collector class. No tests | |
| 1341 # as these are internal and require speed. | |
| 1342 | |
| 1343 | |
| 1344 =head2 _calculate_count | |
| 1345 | |
| 1346 Args[0] : feature e.g. Bio::EnsEMBL::Feature | |
| 1347 Args[1] : int - start bin | |
| 1348 Args[2] : int - end bin | |
| 1349 Args[3] : int - window_size | |
| 1350 Args[4] : hashref - score bins | |
| 1351 Example : $self->$calc_method | |
| 1352 Description: Adds count to bins which this feature overlaps | |
| 1353 Returntype : None | |
| 1354 Exceptions : None | |
| 1355 Caller : _bin_features_by_window_sizes | |
| 1356 Status : At Risk | |
| 1357 | |
| 1358 =cut | |
| 1359 | |
| 1360 sub _calculate_count { | |
| 1361 my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref ) = @_; | |
| 1362 | |
| 1363 my $bin_index; | |
| 1364 | |
| 1365 for ( $bin_index = $start_bin; $bin_index <= $end_bin; ++$bin_index ) | |
| 1366 { | |
| 1367 $bins_ref->{$wsize}->[$bin_index]++; | |
| 1368 } | |
| 1369 | |
| 1370 return; | |
| 1371 } | |
| 1372 | |
| 1373 | |
| 1374 =head2 _calculate_average_score | |
| 1375 | |
| 1376 Args[0] : feature e.g. Bio::EnsEMBL::Feature | |
| 1377 Args[1] : int - start bin | |
| 1378 Args[2] : int - end bin | |
| 1379 Args[3] : int - window_size | |
| 1380 Args[4] : hashref - score bins | |
| 1381 Example : $self->$calc_method | |
| 1382 Description: Adds score to bins which this feature overlaps | |
| 1383 Returntype : None | |
| 1384 Exceptions : None | |
| 1385 Caller : _bin_features_by_window_sizes | |
| 1386 Status : At Risk | |
| 1387 | |
| 1388 =cut | |
| 1389 | |
| 1390 | |
| 1391 sub _calculate_average_score { | |
| 1392 my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref, | |
| 1393 $bin_counts_ref ) | |
| 1394 = @_; | |
| 1395 | |
| 1396 # This is simple an average of all the scores for features which | |
| 1397 # overlap this bin. No weighting with respect to the bin or the | |
| 1398 # feature. | |
| 1399 | |
| 1400 my $score = $self->get_score_by_Feature($feature); | |
| 1401 | |
| 1402 for ( my $bin_index = $start_bin; | |
| 1403 $bin_index <= $end_bin; | |
| 1404 ++$bin_index ) | |
| 1405 { | |
| 1406 # We should really push onto array here so we can have median or | |
| 1407 # mean. | |
| 1408 | |
| 1409 $bins_ref->{$wsize}->[$bin_index] += $score; | |
| 1410 $bin_counts_ref->{$wsize}->[$bin_index]++; | |
| 1411 } | |
| 1412 | |
| 1413 return; | |
| 1414 } | |
| 1415 | |
| 1416 | |
| 1417 =head2 _post_process_average_score | |
| 1418 | |
| 1419 Args[0] : hashref - score bins | |
| 1420 Args[1] : hashref - count bins | |
| 1421 Example : $self->$post_method | |
| 1422 Description: Post processes bins to calculate average score | |
| 1423 Returntype : None | |
| 1424 Exceptions : None | |
| 1425 Caller : _bin_features_by_window_sizes | |
| 1426 Status : At Risk | |
| 1427 | |
| 1428 =cut | |
| 1429 | |
| 1430 sub _post_process_average_score { | |
| 1431 my ( $self, $bins_ref, $bin_counts_ref ) = @_; | |
| 1432 | |
| 1433 foreach my $wsize ( keys %{$bins_ref} ) { | |
| 1434 foreach my $bin_index ( 0 .. $#{ $bins_ref->{$wsize} } ) { | |
| 1435 | |
| 1436 if ( $bin_counts_ref->{$wsize}->[$bin_index] ) { | |
| 1437 $bins_ref->{$wsize}->[$bin_index] /= | |
| 1438 $bin_counts_ref->{$wsize}->[$bin_index]; | |
| 1439 } | |
| 1440 | |
| 1441 } | |
| 1442 } | |
| 1443 | |
| 1444 return; | |
| 1445 } | |
| 1446 | |
| 1447 | |
| 1448 =head2 _calculate_max_magnitude | |
| 1449 | |
| 1450 Args[0] : feature e.g. Bio::EnsEMBL::Feature | |
| 1451 Args[1] : int - start bin | |
| 1452 Args[2] : int - end bin | |
| 1453 Args[3] : int - window_size | |
| 1454 Args[4] : hashref - score bins | |
| 1455 Example : $self->$calc_method | |
| 1456 Description: Sets max +/-ve scores for bins which this feature overlaps | |
| 1457 Returntype : None | |
| 1458 Exceptions : None | |
| 1459 Caller : _bin_features_by_window_sizes | |
| 1460 Status : At Risk | |
| 1461 | |
| 1462 =cut | |
| 1463 | |
| 1464 sub _calculate_max_magnitude { | |
| 1465 my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref ) = @_; | |
| 1466 | |
| 1467 my $score = $self->get_score_by_Feature($feature); | |
| 1468 | |
| 1469 # Max magnitude | |
| 1470 # Take the highest value +ve or -ve score | |
| 1471 for ( my $bin_index = $start_bin; | |
| 1472 $bin_index <= $end_bin; | |
| 1473 ++$bin_index ) | |
| 1474 { | |
| 1475 | |
| 1476 # We really need to capture the lowest -ve and higest +ve scores | |
| 1477 # here and post process to pick between them. | |
| 1478 | |
| 1479 $bins_ref->{$wsize}->[$bin_index] ||= [ 0, 0 ]; #-ve, +ve | |
| 1480 | |
| 1481 if ( $score < $bins_ref->{$wsize}->[$bin_index]->[0] ) { | |
| 1482 $bins_ref->{$wsize}->[$bin_index]->[0] = $score; | |
| 1483 } elsif ( $score > $bins_ref->{$wsize}->[$bin_index][1] ) { | |
| 1484 $bins_ref->{$wsize}->[$bin_index]->[1] = $score; | |
| 1485 } | |
| 1486 } | |
| 1487 | |
| 1488 return; | |
| 1489 } ## end sub _calculate_max_magnitude | |
| 1490 | |
| 1491 | |
| 1492 =head2 _post_process_max_magnitude | |
| 1493 | |
| 1494 Args[0] : hashref - score bins | |
| 1495 Args[1] : hashref - count bins | |
| 1496 Example : $self->$post_method | |
| 1497 Description: Post processes bins to pick largest +ve or -ve score | |
| 1498 Returntype : None | |
| 1499 Exceptions : None | |
| 1500 Caller : _bin_features_by_window_sizes | |
| 1501 Status : At Risk | |
| 1502 | |
| 1503 =cut | |
| 1504 | |
| 1505 sub _post_process_max_magnitude { | |
| 1506 my ( $self, $bins_ref ) = @_; | |
| 1507 | |
| 1508 # Take the highest value +ve or -ve score | |
| 1509 | |
| 1510 foreach my $wsize ( keys %{$bins_ref} ) { | |
| 1511 foreach my $bin_index ( 0 .. $#{ $bins_ref->{$wsize} } ) { | |
| 1512 | |
| 1513 # Have potential for no listref in a given bin | |
| 1514 | |
| 1515 # default value if we haven't seen anything is 0 | |
| 1516 # Actually want an array of -ve +ve values | |
| 1517 | |
| 1518 if ( $bins_ref->{$wsize}->[$bin_index] ) { | |
| 1519 my $tmp_minus = -$bins_ref->{$wsize}->[$bin_index]->[0]; | |
| 1520 | |
| 1521 if ( $tmp_minus > $bins_ref->{$wsize}->[$bin_index]->[1] ) { | |
| 1522 $bins_ref->{$wsize}->[$bin_index] = | |
| 1523 $bins_ref->{$wsize}->[$bin_index]->[0]; | |
| 1524 } else { | |
| 1525 $bins_ref->{$wsize}->[$bin_index] = | |
| 1526 $bins_ref->{$wsize}->[$bin_index]->[1]; | |
| 1527 } | |
| 1528 | |
| 1529 } | |
| 1530 | |
| 1531 } | |
| 1532 } | |
| 1533 return; | |
| 1534 } ## end sub _post_process_max_magnitude | |
| 1535 | |
| 1536 | |
| 1537 =head2 _calculate_RPKM | |
| 1538 | |
| 1539 Args[0] : feature e.g. Bio::EnsEMBL::Feature | |
| 1540 Args[1] : int - start bin | |
| 1541 Args[2] : int - end bin | |
| 1542 Args[3] : int - window_size | |
| 1543 Args[4] : hashref - score bins | |
| 1544 Example : $self->$calc_method | |
| 1545 Description: Stores counts to calculate Read Per Kb per Million(RPKM) | |
| 1546 Returntype : None | |
| 1547 Exceptions : None | |
| 1548 Caller : _bin_features_by_window_sizes | |
| 1549 Status : At Risk | |
| 1550 | |
| 1551 =cut | |
| 1552 | |
| 1553 sub _calculate_RPKM { | |
| 1554 my ( $self, $feature, $start_bin, $end_bin, $wsize, $bins_ref ) = @_; | |
| 1555 | |
| 1556 $self->_calculate_count( $feature, $start_bin, $end_bin, | |
| 1557 $wsize, $bins_ref ); | |
| 1558 | |
| 1559 return; | |
| 1560 } | |
| 1561 | |
| 1562 | |
| 1563 =head2 _post_process_RPKM | |
| 1564 | |
| 1565 Args[0] : hashref - score bins | |
| 1566 Args[1] : hashref - count bins | |
| 1567 Example : $self->$post_method | |
| 1568 Description: Post processes bins to calculate average score | |
| 1569 Returntype : None | |
| 1570 Exceptions : None | |
| 1571 Caller : _bin_features_by_window_sizes | |
| 1572 Status : At Risk | |
| 1573 | |
| 1574 =cut | |
| 1575 | |
| 1576 sub _post_process_RPKM { | |
| 1577 my ( $self, $bins_ref ) = @_; | |
| 1578 | |
| 1579 #10^9 x C / NGB | |
| 1580 #C = Reads overlapping bin | |
| 1581 #N = Total reads in the experiment | |
| 1582 #G = Length of bin in bps | |
| 1583 #(don't really have to account for non-ref/HAPs or gender here | |
| 1584 #as should be close enough, CellTypes/gender differences will be miniscule) | |
| 1585 #B = length of each bin | |
| 1586 | |
| 1587 foreach my $wsize(keys %{$bins_ref}){ | |
| 1588 | |
| 1589 foreach my $bin_index(0..$#{$bins_ref->{$wsize}}){ | |
| 1590 $bins_ref->{$wsize}->[$bin_index] = | |
| 1591 ((10**9) * | |
| 1592 $bins_ref->{$wsize}->[$bin_index])/(($self->_RPKM_factor($wsize)) * $wsize); | |
| 1593 } | |
| 1594 } | |
| 1595 | |
| 1596 return; | |
| 1597 | |
| 1598 } | |
| 1599 | |
| 1600 | |
| 1601 =head2 _set_up_RPKM | |
| 1602 | |
| 1603 Args[0] : hashref - method config e.g | |
| 1604 { | |
| 1605 DNADB => Bio::EnsEMBL::DBSQL::DBAdaptor, | |
| 1606 TOTAL_FEATURE => $total_feature_count, | |
| 1607 } | |
| 1608 | |
| 1609 Example : $self->$set_up_method($config); | |
| 1610 Description: Sets the RPKM factor | |
| 1611 Returntype : None | |
| 1612 Exceptions : Throws is required config params are not set | |
| 1613 Caller : bin_method | |
| 1614 Status : At Risk | |
| 1615 | |
| 1616 =cut | |
| 1617 | |
| 1618 | |
| 1619 sub _set_up_RPKM{ | |
| 1620 my ($self, $config) = @_; | |
| 1621 | |
| 1622 my ($dnadb, $total_features) = rearrange([ 'DNADB', 'TOTAL_FEATURES'], %{$config}); | |
| 1623 | |
| 1624 #Test specifically here to notify about config hash | |
| 1625 if(! $total_features){ | |
| 1626 throw("For RPKM you must pass a valid 'total_features' ". | |
| 1627 "as part of the method config hash."); | |
| 1628 } | |
| 1629 | |
| 1630 if(! $dnadb){ | |
| 1631 throw("For RPKM you must pass 'dnadb' as part of the method config hash."); | |
| 1632 } | |
| 1633 | |
| 1634 foreach my $wsize(@{$self->window_sizes}){ | |
| 1635 #Should never have 0 here | |
| 1636 $self->_RPKM_factor($wsize, ($wsize * $total_features)); #N*G | |
| 1637 | |
| 1638 warn "setting $wsize RPKM factor($wsize * $total_features) to ". | |
| 1639 $self->_RPKM_factor($wsize); | |
| 1640 } | |
| 1641 | |
| 1642 return; | |
| 1643 } ## end sub _set_up_RPKM | |
| 1644 | |
| 1645 | |
| 1646 =head2 _RPKM_factor | |
| 1647 | |
| 1648 Args[0] : int - RPKM factor i.e. (Total reads in the experiment * | |
| 1649 Genome length) | |
| 1650 Example : $self->_RPKM_factor($wsize, $factor); | |
| 1651 Description: Gets/Sets the RPKM factor | |
| 1652 Returntype : int | |
| 1653 Exceptions : None | |
| 1654 Caller : _set_up_RPKM, _post_process_RPKM | |
| 1655 Status : At Risk | |
| 1656 | |
| 1657 =cut | |
| 1658 | |
| 1659 sub _RPKM_factor{ | |
| 1660 my ($self, $wsize, $factor) = @_; | |
| 1661 | |
| 1662 if (! defined $wsize){ | |
| 1663 throw("You must pass at least window_size to get or set the RPKM factor"); | |
| 1664 } | |
| 1665 | |
| 1666 if(defined $factor){ | |
| 1667 $self->{'RPKM_factor'}{$wsize} = $factor; | |
| 1668 } | |
| 1669 elsif(! exists $self->{'RPKM_factor'}{$wsize}){ | |
| 1670 #This should never happen unless the window sizes | |
| 1671 #are redefined after initialisation | |
| 1672 throw("You have requested an RPKM factor for a window_size". | |
| 1673 " which has not been set:\t$wsize"); | |
| 1674 } | |
| 1675 | |
| 1676 return $self->{'RPKM_factor'}{$wsize}; | |
| 1677 } | |
| 1678 | |
| 1679 =head2 get_diploid_genome_length_by_gender | |
| 1680 | |
| 1681 Args[0] : string - RPKM factor i.e. (Total reads in the experiment * | |
| 1682 Genome length) | |
| 1683 Args[1] : string - gender e.g. male or female | |
| 1684 Example : | |
| 1685 | |
| 1686 my $glength = | |
| 1687 $self->get_diploid_genome_length_by_gender( $dnadb, $gender ); | |
| 1688 | |
| 1689 Description: Gets the gender specific diploid genome length, | |
| 1690 including non-ref but not including haplotypes. Only | |
| 1691 handles species with X/Y sex chromosomes. | |
| 1692 Returntype : int | |
| 1693 Exceptions : None | |
| 1694 Caller : _set_up_RPKM, _post_process_RPKM | |
| 1695 Status : At Risk - Move to and export from generic Utils Slice module??? | |
| 1696 | |
| 1697 =cut | |
| 1698 | |
| 1699 sub get_diploid_genome_length_by_gender { | |
| 1700 my ( $dnadb, $gender ) = @_; | |
| 1701 | |
| 1702 my %sex_chrs = ( 'Y' => 'male', | |
| 1703 'X' => 'female', ); | |
| 1704 | |
| 1705 my $dip_length = 0; | |
| 1706 | |
| 1707 if (!(( ref($dnadb) && $dnadb->isa('Bio::EnsEMBL::DBSQL::DBAdaptor') ) | |
| 1708 && $dnadb->grou() eq 'core' | |
| 1709 && ( defined $gender && $gender =~ /(male|female)/ ) ) ) | |
| 1710 { | |
| 1711 throw( "Must provide valid " | |
| 1712 . "Bio::EnsEMBL::DBSQL::DBAdaptor($dnadb) and " | |
| 1713 . "gender ($gender) arguments" ); | |
| 1714 } | |
| 1715 | |
| 1716 my @ref_slices = $dnadb->get_SliceAdaptor->fetch_all('toplevel'); | |
| 1717 | |
| 1718 # Include non-ref(unassembled), but omit haps/lrgs(i.e. redundant) | |
| 1719 | |
| 1720 foreach my $slice ( | |
| 1721 @{ $dnadb->get_SliceAdaptor->fetch_all( 'toplevel', undef, 1, 1 ) } | |
| 1722 ) | |
| 1723 { | |
| 1724 # Include duplicated region for true diploid length | |
| 1725 | |
| 1726 # Skip haps/lrgs | |
| 1727 if ( ( $slice->coord_system->name() eq 'chromosome' | |
| 1728 && !$slice->is_reference() ) | |
| 1729 || $slice->coord_system->name() eq 'lrg' ) | |
| 1730 { | |
| 1731 next; | |
| 1732 } | |
| 1733 | |
| 1734 if ( exists( $sex_chrs{ $slice->seq_region_name() } ) ) { | |
| 1735 if ( $gender eq 'male' ) { | |
| 1736 $dip_length += $slice->length; | |
| 1737 } elsif ( $sex_chrs{ $slice->seq_region_name } eq 'male' ) { | |
| 1738 next; | |
| 1739 } | |
| 1740 } | |
| 1741 | |
| 1742 $dip_length += 2*$slice->length; | |
| 1743 } | |
| 1744 | |
| 1745 return $dip_length; | |
| 1746 } ## end sub get_diploid_genome_length_by_gender | |
| 1747 | |
| 1748 | |
| 1749 1; |
