comparison variant_effect_predictor/Bio/EnsEMBL/DBSQL/DensityFeatureAdaptor.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor
24
25 =head1 SYNOPSIS
26
27 my $dfa = $database_adaptor->get_DensityFeatureAdaptor();
28
29 my $interpolate = 1;
30 my $blocks_wanted = 50;
31
32 @dense_feats = @{
33 $dfa->fetch_all_by_Slice( $slice, 'SNPDensity', $blocks_wanted,
34 $interpolate );
35 }
36
37 =head1 DESCRIPTION
38
39 Density Feature Adaptor - An adaptor responsible for the creation of density
40 features from the database.
41
42 =head1 METHODS
43
44 =cut
45
46 package Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor;
47 use vars qw(@ISA);
48 use strict;
49
50
51 use POSIX;
52 use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
53 use Bio::EnsEMBL::Utils::Cache;
54 use Bio::EnsEMBL::DensityFeature;
55 use Bio::EnsEMBL::DensityFeatureSet;
56 use Bio::EnsEMBL::DensityType;
57 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
58
59 @ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor);
60
61 our $DENSITY_FEATURE_CACHE_SIZE = 20;
62
63 =head2 new
64
65 Arg [1] : list of args @args
66 Superclass constructor arguments
67 Example : none
68 Description: Constructor which just initializes internal cache structures
69 Returntype : Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor
70 Exceptions : none
71 Caller : implementing subclass constructors
72 Status : Stable
73
74 =cut
75
76 sub new {
77 my $caller = shift;
78
79 my $class = ref($caller) || $caller;
80
81 my $self = $class->SUPER::new(@_);
82
83 #initialize an LRU cache
84 my %cache;
85 tie(%cache, 'Bio::EnsEMBL::Utils::Cache', $DENSITY_FEATURE_CACHE_SIZE);
86 $self->{'_density_feature_cache'} = \%cache;
87
88 return $self;
89 }
90
91 =head2 fetch_all_by_Slice
92
93 Arg [1] : Bio::EnsEMBL::Slice $slice - The slice representing the region
94 to retrieve density features from.
95 Arg [2] : string $logic_name - The logic name of the density features to
96 retrieve.
97 Arg [3] : int $num_blocks (optional; default = 50) - The number of
98 features that are desired. The ratio between the size of these
99 features and the size of the features in the database will be
100 used to determine which database features will be used.
101 Arg [4] : boolean $interpolate (optional; default = 0) - A flag indicating
102 whether the features in the database should be interpolated to
103 fit them to the requested number of features. If true the
104 features will be interpolated to provide $num_blocks features.
105 This will not guarantee that exactly $num_blocks features are
106 returned due to rounding etc. but it will be close.
107 Arg [5] : float $max_ratio - The maximum ratio between the size of the
108 requested features (as determined by $num_blocks) and the actual
109 size of the features in the database. If this value is exceeded
110 then an empty list will be returned. This can be used to
111 prevent excessive interpolation of the database values.
112 Example : #interpolate:
113 $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 10, 1);
114 #do not interpoloate, get what is in the database:
115 $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 50);
116 #interpolate, but not too much
117 $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity',50,1,5.0);
118 Description: Retrieves a set of density features which overlap the region
119 of this slice. Density features are a discrete representation
120 of a continuous value along a sequence, such as a density or
121 percent coverage. Density Features may be stored in chunks of
122 different sizes in the database, and interpolated to fit the
123 desired size for the requested region. For example the database
124 may store a single density value for each 1KB and also for each
125 1MB. When fetching for an entire chromosome the 1MB density
126 chunks will be used if the requested number of blocks is not
127 very high.
128 Note that features which have been interpolated are not stored
129 in the database and as such will have no dbID or adaptor set.
130 Returntype : Bio::EnsEMBL::DensityFeature
131 Exceptions : warning on invalid $num_blocks argument
132 warning if no features with logic_name $logic_name exist
133 warning if density_type table has invalid block_size value
134 Caller : general
135 Status : Stable
136
137 =cut
138
139 sub fetch_all_by_Slice {
140 my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;
141
142 if(defined($num_blocks) && $num_blocks < 1) {
143 warning("Invalid number of density blocks [$num_blocks] requested.\n" .
144 "Returning empty list.");
145 return [];
146 }
147
148 $num_blocks ||= 50;
149 my $length = $slice->length();
150
151 my $wanted_block_size = POSIX::ceil($length/$num_blocks);
152
153 #
154 # get out all of the density types and choose the one with the
155 # block size closest to our desired block size
156 #
157
158 my $dta = $self->db()->get_DensityTypeAdaptor();
159
160 my @dtypes = @{$dta->fetch_all_by_logic_name($logic_name)};
161 if( ! @dtypes ){
162 my @all_dtypes = @{ $dta->fetch_all() };
163 @all_dtypes or warning( "No DensityTypes in $dta" ) && return [];
164 my $valid_list = join( ", ", map{$_->analysis->logic_name} @all_dtypes );
165 warning( "No DensityTypes for logic name $logic_name. ".
166 "Select from $valid_list" );
167 return [];
168 }
169
170 my $best_ratio = undef;
171 my $density_type = undef;
172 my $best_ratio_large = undef;
173 my $density_type_large = undef;
174 my %dt_ratio_hash;
175
176 foreach my $dt (@dtypes) {
177
178 my $ratio;
179 if( $dt->block_size() > 0 ) {
180 $ratio = $wanted_block_size/$dt->block_size();
181 } else {
182 # This is only valid if the requested seq_region is the one the
183 # features are stored on. Please use sensibly. Or find better implementation.
184
185 my $block_size = $slice->seq_region_length() / $dt->region_features();
186 $ratio = $wanted_block_size / $block_size;
187 }
188
189 # we prefer to use a block size that's smaller than the required one
190 # (better results on interpolation).
191 # give larger bits a disadvantage and make them comparable
192 if( $ratio < 1 ) {
193 $ratio = 5/$ratio;
194 }
195
196 $dt_ratio_hash{ $ratio } = $dt;
197 }
198
199 $best_ratio = (sort {$a<=>$b} (keys %dt_ratio_hash))[0];
200
201 #the ratio was not good enough, or this logic name was not in the
202 #density type table, return an empty list
203 if(!defined($best_ratio) ||
204 (defined($max_ratio) && $best_ratio > $max_ratio)) {
205 return [];
206 }
207
208 $density_type = $dt_ratio_hash{$best_ratio};
209
210 my $constraint = "df.density_type_id = " . $density_type->dbID();
211
212 my @features =
213 @{$self->fetch_all_by_Slice_constraint($slice,$constraint)};
214
215 return \@features if(!$interpolate);
216
217 #interpolate the features into new features of a different size
218 my @out;
219 #sort the features on start position
220 @features = sort( { $a->start() <=> $b->start() } @features );
221
222 #resize the features that were returned
223 my $start = 1;
224 my $end = $start+$wanted_block_size-1;
225
226 my $value_type = $density_type->value_type();
227
228 # create a new density type object for the interpolated features that
229 # is not stored in the database
230 $density_type = Bio::EnsEMBL::DensityType->new
231 (-VALUE_TYPE => $value_type,
232 -ANALYSIS => $density_type->analysis(),
233 -BLOCK_SIZE => $wanted_block_size);
234
235 while($start < $length) {
236 # $end = $length if($end > $length);
237
238 my $density_value = 0.0;
239 my ($f, $fstart, $fend, $portion);
240 my @dvalues;
241
242 #construct a new feature using all of the old density features that
243 #we overlapped
244 while(($f = shift(@features)) && $end > $f->{'start'}) {
245
246 #what portion of this feature are we using to construct our new block?
247 $fstart = ($f->{'start'} < $start) ? $start : $f->{'start'};
248 $fend = ($f->{'end'} > $end ) ? $end : $f->{'end'};
249 $fend = $length if($fend > $length);
250
251 if($value_type eq 'sum') {
252
253 $portion = ($fend-$fstart+1)/$f->length();
254
255 #take a percentage of density value, depending on how much of the
256 #feature we overlapped
257 $density_value += $portion * $f->{'density_value'};
258
259 } elsif($value_type eq 'ratio') {
260
261 #maintain a running total of the length used from each feature
262 #and its value
263 push(@dvalues,[$fend-$fstart+1,$f->{'density_value'}]);
264
265 } else {
266 throw("Unknown density value type [$value_type].");
267 }
268
269 #do not want to look at next feature if we only used part of this one:
270 last if($fend < $f->{'end'});
271 }
272
273 #if we did not completely overlap the last feature, put it back on so
274 #it can be partially used by the next block
275 if(defined($f) && (!defined($fend) || $fend < $f->{'end'})) {
276 unshift(@features, $f);
277 }
278
279 if($value_type eq 'ratio') {
280 #take a weighted average of the all the density values of the features
281 #used to construct this one
282 my $total_len = $end - $start + 1;
283 if($total_len > 0) {
284 foreach my $pair (@dvalues) {
285 my ($dlen, $dval) = @$pair;
286 $density_value += $dval * ($dlen/$total_len);
287 }
288 }
289 }
290
291 # interpolated features are not stored in the db so do not set the dbID
292 # or the adaptor
293 push @out, Bio::EnsEMBL::DensityFeature->new
294 (-seq_region => $slice,
295 -start => $start,
296 -end => $end,
297 -density_type => $density_type,
298 -density_value => $density_value);
299
300 $start = $end + 1;
301 $end += $wanted_block_size;
302 }
303
304 return \@out;
305 }
306
307
308 sub _tables {
309 my $self = shift;
310
311 return (['density_feature', 'df']);
312 }
313
314
315 sub _columns {
316 my $self = shift;
317
318 return qw( df.density_feature_id
319 df.seq_region_id df.seq_region_start df.seq_region_end
320 df.density_value df.density_type_id );
321 }
322
323
324
325 sub _objs_from_sth {
326 my ($self, $sth, $mapper, $dest_slice) = @_;
327
328 #
329 # This code is ugly because an attempt has been made to remove as many
330 # function calls as possible for speed purposes. Thus many caches,
331 # and a fair bit of gymnastics have been used.
332 #
333
334 my $sa = $self->db()->get_SliceAdaptor();
335 my $dta = $self->db()->get_DensityTypeAdaptor();
336
337 my @features;
338 my %dtype_hash;
339 my %slice_hash;
340 my %sr_name_hash;
341 my %sr_cs_hash;
342
343 my($density_feature_id, $seq_region_id, $seq_region_start, $seq_region_end,
344 $density_value, $density_type_id );
345
346 $sth->bind_columns(\$density_feature_id, \$seq_region_id, \$seq_region_start,
347 \$seq_region_end, \$density_value, \$density_type_id);
348
349 my $asm_cs;
350 my $cmp_cs;
351 my $asm_cs_vers;
352 my $asm_cs_name;
353 my $cmp_cs_vers;
354 my $cmp_cs_name;
355 if($mapper) {
356 $asm_cs = $mapper->assembled_CoordSystem();
357 $cmp_cs = $mapper->component_CoordSystem();
358 $asm_cs_name = $asm_cs->name();
359 $asm_cs_vers = $asm_cs->version();
360 $cmp_cs_name = $cmp_cs->name();
361 $cmp_cs_vers = $cmp_cs->version();
362 }
363
364 my $dest_slice_start;
365 my $dest_slice_end;
366 my $dest_slice_strand;
367 my $dest_slice_length;
368 my $dest_slice_sr_name;
369 my $dest_slice_sr_id;
370
371 if($dest_slice) {
372 $dest_slice_start = $dest_slice->start();
373 $dest_slice_end = $dest_slice->end();
374 $dest_slice_strand = $dest_slice->strand();
375 $dest_slice_length = $dest_slice->length();
376 $dest_slice_sr_name = $dest_slice->seq_region_name();
377 $dest_slice_sr_id = $dest_slice->get_seq_region_id();
378 }
379
380 FEATURE: while($sth->fetch()) {
381 #get the density type object
382 my $density_type = $dtype_hash{$density_type_id} ||=
383 $dta->fetch_by_dbID($density_type_id);
384
385 #get the slice object
386 #need to get the internal_seq_region, if present
387 $seq_region_id = $self->get_seq_region_id_internal($seq_region_id);
388 my $slice = $slice_hash{"ID:".$seq_region_id};
389 if(!$slice) {
390 $slice = $sa->fetch_by_seq_region_id($seq_region_id);
391 $slice_hash{"ID:".$seq_region_id} = $slice;
392 $sr_name_hash{$seq_region_id} = $slice->seq_region_name();
393 $sr_cs_hash{$seq_region_id} = $slice->coord_system();
394 }
395
396 my $sr_name = $sr_name_hash{$seq_region_id};
397 my $sr_cs = $sr_cs_hash{$seq_region_id};
398
399 #
400 # remap the feature coordinates to another coord system
401 # if a mapper was provided
402 #
403 if($mapper) {
404
405 my $len = $seq_region_end - $seq_region_start + 1;
406
407 my @coords;
408
409 if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) {
410
411 @coords = $mapper->map( $sr_name, $seq_region_start, $seq_region_end,
412 1, $sr_cs, 0, $dest_slice);
413
414 } else {
415 @coords = $mapper->map($sr_name, $seq_region_start, $seq_region_end,1, $sr_cs);
416 }
417
418 #filter out gaps
419 @coords = grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @coords;
420
421 #throw out density features mapped to gaps, or split
422 next FEATURE if(@coords != 1);
423
424 $seq_region_start = $coords[0]->{'start'};
425 $seq_region_end = $coords[0]->{'end'};
426 $seq_region_id = $coords[0]->{'id'};
427
428 if($density_type->value_type() eq 'sum') {
429 #adjust density value so it reflects length of feature actually used
430 my $newlen = $seq_region_end - $seq_region_start +1;
431 $density_value *= $newlen/$len if($newlen != $len);
432 }
433
434 #get a slice in the coord system we just mapped to
435 # if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) {
436 $slice = $slice_hash{"ID:".$seq_region_id} ||=
437 $sa->fetch_by_seq_region_id($seq_region_id);
438 # } else {
439 # $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||=
440 # $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef,
441 # $asm_cs_vers);
442 # }
443 }
444
445 #
446 # If a destination slice was provided convert the coords
447 # If the dest_slice starts at 1 and is foward strand, nothing needs doing
448 #
449 if($dest_slice) {
450 if($dest_slice_start != 1 || $dest_slice_strand != 1) {
451 if($dest_slice_strand == 1) {
452 $seq_region_start = $seq_region_start - $dest_slice_start + 1;
453 $seq_region_end = $seq_region_end - $dest_slice_start + 1;
454 } else {
455 my $tmp_seq_region_start = $seq_region_start;
456 $seq_region_start = $dest_slice_end - $seq_region_end + 1;
457 $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1;
458 }
459 }
460
461 #throw away features entirely off the end of the requested slice
462 if($seq_region_end < 1 || $seq_region_start > $dest_slice_length ||
463 ( $dest_slice_sr_id ne $seq_region_id )) {
464 next FEATURE;
465 }
466 $slice = $dest_slice;
467 }
468
469 push( @features,
470 $self->_create_feature( 'Bio::EnsEMBL::DensityFeature', {
471 -dbID => $density_feature_id,
472 -adaptor => $self,
473 -start => $seq_region_start,
474 -end => $seq_region_end,
475 -seq_region => $slice,
476 -density_value => $density_value,
477 -density_type => $density_type
478 } ) );
479
480 }
481
482 return \@features;
483 }
484
485
486 =head2 list_dbIDs
487
488 Arg [1] : none
489 Example : @feature_ids = @{$density_feature_adaptor->list_dbIDs()};
490 Description: Gets an array of internal ids for all density features in the
491 current db
492 Arg[1] : <optional> int. not set to 0 for the ids to be sorted by the seq_region.
493 Returntype : list of ints
494 Exceptions : none
495 Caller : ?
496 Status : Stable
497
498 =cut
499
500 sub list_dbIDs {
501 my ($self, $ordered) = @_;
502
503 return $self->_list_dbIDs("density_feature",undef, $ordered);
504 }
505
506
507 =head2 store
508
509 Arg [1] : list of Bio::EnsEMBL::DensityFeatures @df
510 the simple features to store in the database
511 Example : $density_feature_adaptor->store(1234, @density_feats);
512 Description: Stores a list of density feature objects in the database
513 Returntype : none
514 Exceptions : thrown if @df is not defined, if any of the features do not
515 have an attached slice.
516 or if any elements of @df are not Bio::EnsEMBL::SeqFeatures
517 Caller : general
518 Status : Stable
519
520 =cut
521
522 sub store{
523 my ($self,@df) = @_;
524
525 if( scalar(@df) == 0 ) {
526 throw("Must call store with list of DensityFeatures");
527 }
528 #mysql> desc density_feature;
529 #+--------------------+---------+------+-----+---------+----------------+
530 #| Field | Type | Null | Key | Default | Extra |
531 #+--------------------+---------+------+-----+---------+----------------+
532 #| density_feature_id | int(11) | | PRI | NULL | auto_increment |
533 #| density_type_id | int(11) | | MUL | 0 | |
534 #| seq_region_id | int(11) | | | 0 | |
535 #| seq_region_start | int(11) | | | 0 | |
536 #| seq_region_end | int(11) | | | 0 | |
537 #| density_value | float | | | 0 | |
538 #+--------------------+---------+------+-----+---------+----------------+
539
540 my $sth = $self->prepare
541 ("INSERT INTO density_feature (seq_region_id, seq_region_start, " .
542 "seq_region_end, density_type_id, " .
543 "density_value) " .
544 "VALUES (?,?,?,?,?)");
545
546 my $db = $self->db();
547 my $analysis_adaptor = $db->get_AnalysisAdaptor();
548
549 FEATURE: foreach my $df ( @df ) {
550
551 if( !ref $df || !$df->isa("Bio::EnsEMBL::DensityFeature") ) {
552 throw("DensityFeature must be an Ensembl DensityFeature, " .
553 "not a [".ref($df)."]");
554 }
555
556 # we dont store 0 value density features
557 next if( $df->density_value == 0 );
558 if($df->is_stored($db)) {
559 warning("DensityFeature [".$df->dbID."] is already stored" .
560 " in this database.");
561 next FEATURE;
562 }
563
564 if(!defined($df->density_type)) {
565 throw("A density type must be attached to the features to be stored.");
566 }
567
568 #store the density_type if it has not been stored yet
569
570 if(!$df->density_type->is_stored($db)) {
571 my $dta = $db->get_DensityTypeAdaptor();
572 $dta->store($df->density_type());
573 }
574
575 my $original = $df;
576 my $seq_region_id;
577 ($df, $seq_region_id) = $self->_pre_store($df);
578
579 $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
580 $sth->bind_param(2,$df->start,SQL_INTEGER);
581 $sth->bind_param(3,$df->end,SQL_INTEGER);
582 $sth->bind_param(4,$df->density_type->dbID,SQL_INTEGER);
583 $sth->bind_param(5,$df->density_value,SQL_FLOAT);
584 $sth->execute();
585
586 $original->dbID($sth->{'mysql_insertid'});
587 $original->adaptor($self);
588 }
589 }
590
591 =head2 fetch_Featureset_by_Slice
592
593 Arg [1-5] : see
594 Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
595 for argument documentation
596 Example : $featureset = $dfa->fetch_FeatureSet_by_Slice($slice,'SNPDensity', 10, 1);
597 Description: wrapper around
598 Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
599 which returns a Bio::EnsEMBL::DensityFeatureSet and also caches
600 results
601 Returntype : Bio::EnsEMBL::DensityFeatureSet
602 Exceptions : none
603 Caller : general
604 Status : Stable
605
606 =cut
607
608 sub fetch_Featureset_by_Slice {
609 my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;
610
611 my $key = join(":", $slice->name,
612 $logic_name,
613 $num_blocks || 50,
614 $interpolate || 0,
615 $max_ratio);
616
617 unless ($self->{'_density_feature_cache'}->{$key}) {
618 my $dfeats = $self->fetch_all_by_Slice($slice, $logic_name, $num_blocks,
619 $interpolate, $max_ratio);
620 $self->{'_density_feature_cache'}->{$key} =
621 new Bio::EnsEMBL::DensityFeatureSet($dfeats);
622 }
623 return $self->{'_density_feature_cache'}->{$key};
624 }
625
626
627 1;