diff variant_effect_predictor/Bio/EnsEMBL/DBSQL/DensityFeatureAdaptor.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/DensityFeatureAdaptor.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,627 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor
+
+=head1 SYNOPSIS
+
+  my $dfa = $database_adaptor->get_DensityFeatureAdaptor();
+
+  my $interpolate   = 1;
+  my $blocks_wanted = 50;
+
+  @dense_feats = @{
+    $dfa->fetch_all_by_Slice( $slice, 'SNPDensity', $blocks_wanted,
+      $interpolate );
+    }
+
+=head1 DESCRIPTION
+
+Density Feature Adaptor - An adaptor responsible for the creation of density
+features from the database.
+
+=head1 METHODS
+
+=cut
+
+package Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor;
+use vars qw(@ISA);
+use strict;
+
+
+use POSIX;
+use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
+use Bio::EnsEMBL::Utils::Cache;
+use Bio::EnsEMBL::DensityFeature;
+use Bio::EnsEMBL::DensityFeatureSet;
+use Bio::EnsEMBL::DensityType;
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+
+@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor);
+
+our $DENSITY_FEATURE_CACHE_SIZE = 20;
+
+=head2 new
+
+  Arg [1]    : list of args @args
+               Superclass constructor arguments
+  Example    : none
+  Description: Constructor which just initializes internal cache structures
+  Returntype : Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor
+  Exceptions : none
+  Caller     : implementing subclass constructors
+  Status     : Stable
+
+=cut
+
+sub new {
+  my $caller = shift;
+
+  my $class = ref($caller) || $caller;
+
+  my $self = $class->SUPER::new(@_);
+
+  #initialize an LRU cache
+  my %cache;
+  tie(%cache, 'Bio::EnsEMBL::Utils::Cache', $DENSITY_FEATURE_CACHE_SIZE);
+  $self->{'_density_feature_cache'} = \%cache;
+
+  return $self;
+}
+
+=head2 fetch_all_by_Slice
+
+  Arg [1]    : Bio::EnsEMBL::Slice $slice - The slice representing the region
+               to retrieve density features from.
+  Arg [2]    : string $logic_name - The logic name of the density features to
+               retrieve.
+  Arg [3]    : int $num_blocks (optional; default = 50) - The number of
+               features that are desired. The ratio between the size of these
+               features and the size of the features in the database will be
+               used to determine which database features will be used.
+  Arg [4]    : boolean $interpolate (optional; default = 0) - A flag indicating
+               whether the features in the database should be interpolated to
+               fit them to the requested number of features.  If true the
+               features will be interpolated to provide $num_blocks features.
+               This will not guarantee that exactly $num_blocks features are
+               returned due to rounding etc. but it will be close.
+  Arg [5]    : float $max_ratio - The maximum ratio between the size of the
+               requested features (as determined by $num_blocks) and the actual
+               size of the features in the database.  If this value is exceeded
+               then an empty list will be returned.  This can be used to
+               prevent excessive interpolation of the database values.
+  Example    : #interpolate:
+               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 10, 1);
+               #do not interpoloate, get what is in the database:
+               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 50);
+               #interpolate, but not too much
+               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity',50,1,5.0);
+  Description: Retrieves a set of density features which overlap the region
+               of this slice. Density features are a discrete representation
+               of a continuous value along a sequence, such as a density or
+               percent coverage.  Density Features may be stored in chunks of
+               different sizes in the database, and interpolated to fit the
+               desired size for the requested region.  For example the database
+               may store a single density value for each 1KB and also for each
+               1MB.  When fetching for an entire chromosome the 1MB density
+               chunks will be used if the requested number of blocks is not
+               very high.
+               Note that features which have been interpolated are not stored
+               in the database and as such will have no dbID or adaptor set.
+  Returntype : Bio::EnsEMBL::DensityFeature
+  Exceptions : warning on invalid $num_blocks argument
+               warning if no features with logic_name $logic_name exist
+               warning if density_type table has invalid block_size value
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub fetch_all_by_Slice {
+  my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;
+
+  if(defined($num_blocks) && $num_blocks < 1) {
+    warning("Invalid number of density blocks [$num_blocks] requested.\n" .
+	   "Returning empty list.");
+    return [];
+  }
+
+  $num_blocks ||= 50;
+  my $length = $slice->length();
+
+  my $wanted_block_size = POSIX::ceil($length/$num_blocks);
+
+  #
+  # get out all of the density types and choose the one with the
+  # block size closest to our desired block size
+  #
+
+  my $dta = $self->db()->get_DensityTypeAdaptor();
+
+  my @dtypes = @{$dta->fetch_all_by_logic_name($logic_name)};
+  if( ! @dtypes ){
+    my @all_dtypes =  @{ $dta->fetch_all() };
+    @all_dtypes or warning( "No DensityTypes in $dta" ) && return [];
+    my $valid_list = join( ", ", map{$_->analysis->logic_name} @all_dtypes );
+    warning( "No DensityTypes for logic name $logic_name. ".
+	     "Select from $valid_list" );
+    return [];
+  }
+
+  my $best_ratio   = undef;
+  my $density_type = undef;
+  my $best_ratio_large = undef;
+  my $density_type_large = undef;
+  my %dt_ratio_hash;
+
+  foreach my $dt (@dtypes) {
+
+    my $ratio;
+    if( $dt->block_size() > 0 ) {
+      $ratio = $wanted_block_size/$dt->block_size();
+    } else {
+      # This is only valid if the requested seq_region is the one the 
+      # features are stored on. Please use sensibly. Or find better implementation.
+
+      my $block_size = $slice->seq_region_length() / $dt->region_features();
+      $ratio = $wanted_block_size / $block_size;
+    }
+    
+    # we prefer to use a block size that's smaller than the required one
+    # (better results on interpolation).
+    # give larger bits a disadvantage and make them comparable
+    if( $ratio < 1 ) {
+      $ratio = 5/$ratio;
+    }
+
+    $dt_ratio_hash{ $ratio } = $dt;
+  }
+
+  $best_ratio = (sort {$a<=>$b} (keys %dt_ratio_hash))[0];
+  
+  #the ratio was not good enough, or this logic name was not in the
+  #density type table, return an empty list
+  if(!defined($best_ratio) ||
+     (defined($max_ratio) && $best_ratio > $max_ratio)) {
+    return [];
+  }
+
+  $density_type = $dt_ratio_hash{$best_ratio};
+
+  my $constraint = "df.density_type_id = " . $density_type->dbID();
+
+  my @features =
+    @{$self->fetch_all_by_Slice_constraint($slice,$constraint)};
+
+  return \@features if(!$interpolate);
+
+  #interpolate the features into new features of a different size
+  my @out;
+  #sort the features on start position
+  @features = sort( { $a->start() <=> $b->start() } @features );
+
+  #resize the features that were returned
+  my $start = 1;
+  my $end   = $start+$wanted_block_size-1;
+
+  my $value_type = $density_type->value_type();
+
+  # create a new density type object for the interpolated features that
+  # is not stored in the database
+  $density_type = Bio::EnsEMBL::DensityType->new
+    (-VALUE_TYPE => $value_type,
+     -ANALYSIS   => $density_type->analysis(),
+     -BLOCK_SIZE => $wanted_block_size);
+
+  while($start < $length) {
+#    $end = $length if($end > $length);
+
+    my $density_value = 0.0;
+    my ($f, $fstart, $fend, $portion);
+    my @dvalues;
+
+    #construct a new feature using all of the old density features that
+    #we overlapped
+    while(($f = shift(@features)) && $end > $f->{'start'}) {
+
+      #what portion of this feature are we using to construct our new block?
+      $fstart = ($f->{'start'} < $start) ? $start : $f->{'start'};
+      $fend   = ($f->{'end'}   > $end  ) ? $end   : $f->{'end'};
+      $fend   = $length if($fend > $length);
+
+      if($value_type eq 'sum') {
+
+        $portion = ($fend-$fstart+1)/$f->length();
+
+        #take a percentage of density value, depending on how much of the
+        #feature we overlapped
+        $density_value += $portion * $f->{'density_value'};
+
+      } elsif($value_type eq 'ratio') {
+
+        #maintain a running total of the length used from each feature
+        #and its value
+        push(@dvalues,[$fend-$fstart+1,$f->{'density_value'}]);
+
+      } else {
+        throw("Unknown density value type [$value_type].");
+      }
+
+      #do not want to look at next feature if we only used part of this one:
+      last if($fend < $f->{'end'});
+    }
+
+    #if we did not completely overlap the last feature, put it back on so
+    #it can be partially used by the next block
+    if(defined($f) && (!defined($fend) || $fend < $f->{'end'})) {
+      unshift(@features, $f);
+    }
+
+    if($value_type eq 'ratio') {
+      #take a weighted average of the all the density values of the features
+      #used to construct this one
+      my $total_len = $end - $start + 1;
+      if($total_len > 0) {
+        foreach my $pair (@dvalues) {
+          my ($dlen, $dval) = @$pair;
+          $density_value += $dval * ($dlen/$total_len);
+        }
+      }
+    }
+
+    # interpolated features are not stored in the db so do not set the dbID
+    # or the adaptor
+    push @out, Bio::EnsEMBL::DensityFeature->new
+      (-seq_region    => $slice,
+       -start         => $start,
+       -end           => $end,
+       -density_type  => $density_type,
+       -density_value => $density_value);
+
+    $start = $end + 1;
+    $end  += $wanted_block_size;
+  }
+
+  return \@out;
+}
+
+
+sub _tables {
+  my $self = shift;
+
+  return (['density_feature', 'df']);
+}
+
+
+sub _columns {
+  my $self = shift;
+
+  return qw( df.density_feature_id
+             df.seq_region_id df.seq_region_start df.seq_region_end
+             df.density_value df.density_type_id );
+}
+
+
+
+sub _objs_from_sth {
+  my ($self, $sth, $mapper, $dest_slice) = @_;
+
+  #
+  # This code is ugly because an attempt has been made to remove as many
+  # function calls as possible for speed purposes.  Thus many caches,
+  # and a fair bit of gymnastics have been used.
+  #
+
+  my $sa = $self->db()->get_SliceAdaptor();
+  my $dta = $self->db()->get_DensityTypeAdaptor();
+
+  my @features;
+  my %dtype_hash;
+  my %slice_hash;
+  my %sr_name_hash;
+  my %sr_cs_hash;
+
+  my($density_feature_id, $seq_region_id, $seq_region_start, $seq_region_end,
+     $density_value, $density_type_id );
+
+  $sth->bind_columns(\$density_feature_id, \$seq_region_id, \$seq_region_start,
+                     \$seq_region_end, \$density_value, \$density_type_id);
+
+  my $asm_cs;
+  my $cmp_cs;
+  my $asm_cs_vers;
+  my $asm_cs_name;
+  my $cmp_cs_vers;
+  my $cmp_cs_name;
+  if($mapper) {
+    $asm_cs = $mapper->assembled_CoordSystem();
+    $cmp_cs = $mapper->component_CoordSystem();
+    $asm_cs_name = $asm_cs->name();
+    $asm_cs_vers = $asm_cs->version();
+    $cmp_cs_name = $cmp_cs->name();
+    $cmp_cs_vers = $cmp_cs->version();
+  }
+
+  my $dest_slice_start;
+  my $dest_slice_end;
+  my $dest_slice_strand;
+  my $dest_slice_length;
+  my $dest_slice_sr_name;
+  my $dest_slice_sr_id;
+
+  if($dest_slice) {
+    $dest_slice_start  = $dest_slice->start();
+    $dest_slice_end    = $dest_slice->end();
+    $dest_slice_strand = $dest_slice->strand();
+    $dest_slice_length = $dest_slice->length();
+    $dest_slice_sr_name = $dest_slice->seq_region_name();
+    $dest_slice_sr_id  = $dest_slice->get_seq_region_id();
+  }
+
+  FEATURE: while($sth->fetch()) {
+    #get the density type object
+    my $density_type = $dtype_hash{$density_type_id} ||=
+      $dta->fetch_by_dbID($density_type_id);
+
+    #get the slice object
+    #need to get the internal_seq_region, if present
+    $seq_region_id = $self->get_seq_region_id_internal($seq_region_id);
+    my $slice = $slice_hash{"ID:".$seq_region_id};
+    if(!$slice) {
+      $slice = $sa->fetch_by_seq_region_id($seq_region_id);
+      $slice_hash{"ID:".$seq_region_id} = $slice;
+      $sr_name_hash{$seq_region_id} = $slice->seq_region_name();
+      $sr_cs_hash{$seq_region_id} = $slice->coord_system();
+    }
+
+    my $sr_name = $sr_name_hash{$seq_region_id};
+    my $sr_cs   = $sr_cs_hash{$seq_region_id};
+ 
+   #
+    # remap the feature coordinates to another coord system
+    # if a mapper was provided
+    #
+    if($mapper) {
+
+      my $len = $seq_region_end - $seq_region_start + 1;
+
+      my @coords;
+
+      if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper')  ) {
+	    
+	  @coords = $mapper->map( $sr_name, $seq_region_start, $seq_region_end,
+                          1, $sr_cs, 0, $dest_slice);
+
+      } else {
+	  @coords = $mapper->map($sr_name, $seq_region_start, $seq_region_end,1, $sr_cs);
+      }
+
+      #filter out gaps
+      @coords = grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @coords;
+
+      #throw out density features mapped to gaps, or split
+      next FEATURE if(@coords != 1);
+
+      $seq_region_start  = $coords[0]->{'start'};
+      $seq_region_end    = $coords[0]->{'end'};
+      $seq_region_id     = $coords[0]->{'id'};
+
+      if($density_type->value_type() eq 'sum') {
+        #adjust density value so it reflects length of feature actually used
+        my $newlen = $seq_region_end - $seq_region_start +1;
+        $density_value *= $newlen/$len if($newlen != $len);
+      }
+
+      #get a slice in the coord system we just mapped to
+#      if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) {
+        $slice = $slice_hash{"ID:".$seq_region_id} ||=
+          $sa->fetch_by_seq_region_id($seq_region_id);
+#      } else {
+#        $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||=
+#          $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef,
+#                               $asm_cs_vers);
+#      }
+    }
+
+    #
+    # If a destination slice was provided convert the coords
+    # If the dest_slice starts at 1 and is foward strand, nothing needs doing
+    #
+    if($dest_slice) {
+      if($dest_slice_start != 1 || $dest_slice_strand != 1) {
+        if($dest_slice_strand == 1) {
+          $seq_region_start = $seq_region_start - $dest_slice_start + 1;
+          $seq_region_end   = $seq_region_end   - $dest_slice_start + 1;
+        } else {
+          my $tmp_seq_region_start = $seq_region_start;
+          $seq_region_start = $dest_slice_end - $seq_region_end + 1;
+          $seq_region_end   = $dest_slice_end - $tmp_seq_region_start + 1;
+        }
+      }
+
+      #throw away features entirely off the end of the requested slice
+      if($seq_region_end < 1 || $seq_region_start > $dest_slice_length ||
+	 ( $dest_slice_sr_id ne $seq_region_id )) {
+	next FEATURE;
+      }
+      $slice = $dest_slice;
+    }
+
+    push( @features,
+          $self->_create_feature( 'Bio::EnsEMBL::DensityFeature', {
+                                    -dbID       => $density_feature_id,
+                                    -adaptor    => $self,
+                                    -start      => $seq_region_start,
+                                    -end        => $seq_region_end,
+                                    -seq_region => $slice,
+                                    -density_value => $density_value,
+                                    -density_type  => $density_type
+                                  } ) );
+
+  }
+
+  return \@features;
+}
+
+
+=head2 list_dbIDs
+
+  Arg [1]    : none
+  Example    : @feature_ids = @{$density_feature_adaptor->list_dbIDs()};
+  Description: Gets an array of internal ids for all density features in the
+               current db
+  Arg[1]     : <optional> int. not set to 0 for the ids to be sorted by the seq_region.
+  Returntype : list of ints
+  Exceptions : none
+  Caller     : ?
+  Status     : Stable
+
+=cut
+
+sub list_dbIDs {
+   my ($self, $ordered) = @_;
+
+   return $self->_list_dbIDs("density_feature",undef, $ordered);
+}
+
+
+=head2 store
+
+  Arg [1]    : list of Bio::EnsEMBL::DensityFeatures @df
+               the simple features to store in the database
+  Example    : $density_feature_adaptor->store(1234, @density_feats);
+  Description: Stores a list of density feature objects in the database
+  Returntype : none
+  Exceptions : thrown if @df is not defined, if any of the features do not
+               have an attached slice.
+               or if any elements of @df are not Bio::EnsEMBL::SeqFeatures 
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub store{
+  my ($self,@df) = @_;
+
+  if( scalar(@df) == 0 ) {
+    throw("Must call store with list of DensityFeatures");
+  }
+#mysql> desc density_feature;
+#+--------------------+---------+------+-----+---------+----------------+
+#| Field              | Type    | Null | Key | Default | Extra          |
+#+--------------------+---------+------+-----+---------+----------------+
+#| density_feature_id | int(11) |      | PRI | NULL    | auto_increment |
+#| density_type_id    | int(11) |      | MUL | 0       |                |
+#| seq_region_id      | int(11) |      |     | 0       |                |
+#| seq_region_start   | int(11) |      |     | 0       |                |
+#| seq_region_end     | int(11) |      |     | 0       |                |
+#| density_value      | float   |      |     | 0       |                |
+#+--------------------+---------+------+-----+---------+----------------+
+
+  my $sth = $self->prepare
+    ("INSERT INTO density_feature (seq_region_id, seq_region_start, " .
+                                  "seq_region_end, density_type_id, " .
+                                  "density_value) " .
+     "VALUES (?,?,?,?,?)");
+
+  my $db = $self->db();
+  my $analysis_adaptor = $db->get_AnalysisAdaptor();
+
+ FEATURE: foreach my $df ( @df ) {
+
+    if( !ref $df || !$df->isa("Bio::EnsEMBL::DensityFeature") ) {
+      throw("DensityFeature must be an Ensembl DensityFeature, " .
+            "not a [".ref($df)."]");
+    }
+    
+    # we dont store 0 value density features
+    next if( $df->density_value == 0 );
+    if($df->is_stored($db)) {
+      warning("DensityFeature [".$df->dbID."] is already stored" .
+              " in this database.");
+      next FEATURE;
+    }
+
+    if(!defined($df->density_type)) {
+      throw("A density type must be attached to the features to be stored.");
+    }
+
+    #store the density_type if it has not been stored yet
+
+    if(!$df->density_type->is_stored($db)) {
+      my $dta = $db->get_DensityTypeAdaptor();
+      $dta->store($df->density_type());
+    }
+
+    my $original = $df;
+    my $seq_region_id;
+    ($df, $seq_region_id) = $self->_pre_store($df);
+
+    $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
+    $sth->bind_param(2,$df->start,SQL_INTEGER);
+    $sth->bind_param(3,$df->end,SQL_INTEGER);
+    $sth->bind_param(4,$df->density_type->dbID,SQL_INTEGER);
+    $sth->bind_param(5,$df->density_value,SQL_FLOAT);
+    $sth->execute();
+
+    $original->dbID($sth->{'mysql_insertid'});
+    $original->adaptor($self);
+  }
+}
+
+=head2 fetch_Featureset_by_Slice 
+
+  Arg [1-5]  : see
+               Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
+               for argument documentation
+  Example    : $featureset = $dfa->fetch_FeatureSet_by_Slice($slice,'SNPDensity', 10, 1);
+  Description: wrapper around
+               Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
+               which returns a Bio::EnsEMBL::DensityFeatureSet and also caches
+               results
+  Returntype : Bio::EnsEMBL::DensityFeatureSet
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub fetch_Featureset_by_Slice {
+  my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;
+
+  my $key = join(":", $slice->name,
+                      $logic_name,
+                      $num_blocks || 50,
+                      $interpolate || 0,
+                      $max_ratio);
+                      
+  unless ($self->{'_density_feature_cache'}->{$key}) {
+    my $dfeats = $self->fetch_all_by_Slice($slice, $logic_name, $num_blocks,
+                                           $interpolate, $max_ratio);
+    $self->{'_density_feature_cache'}->{$key} =
+      new Bio::EnsEMBL::DensityFeatureSet($dfeats);
+  }
+  return $self->{'_density_feature_cache'}->{$key};
+}
+
+
+1;