diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,528 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::IdMapping::SyntenyRegion - object representing syntenic regions
+
+=head1 SYNOPSIS
+
+  # create a new SyntenyRegion from a source and a target gene
+  my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast( [
+      $source_gene->start,  $source_gene->end,
+      $source_gene->strand, $source_gene->seq_region_name,
+      $target_gene->start,  $target_gene->end,
+      $target_gene->strand, $target_gene->seq_region_name,
+      $entry->score,
+  ] );
+
+  # merge with another SyntenyRegion
+  my $merged_sr = $sr->merge($sr1);
+
+  # score a gene pair against this SyntenyRegion
+  my $score =
+    $sr->score_location_relationship( $source_gene1, $target_gene1 );
+
+=head1 DESCRIPTION
+
+This object represents a synteny between a source and a target location.
+SyntenyRegions are built from mapped genes, and the their score is
+defined as the score of the gene mapping. For merged SyntenyRegions,
+scores are combined.
+
+=head1 METHODS
+
+  new_fast
+  source_start
+  source_end
+  source_strand
+  source_seq_region_name
+  target_start
+  target_end
+  target_strand
+  target_seq_region_name
+  score
+  merge
+  stretch
+  score_location_relationship
+  to_string
+
+=cut
+
+package Bio::EnsEMBL::IdMapping::SyntenyRegion;
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+
+
+=head2 new_fast
+
+  Arg[1]      : Arrayref $array_ref - the arrayref to bless into the
+                SyntenyRegion object 
+  Example     : my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
+                  ]);
+  Description : Constructor. On instantiation, source and target regions are
+                reverse complemented so that source is always on forward strand.
+  Return type : a Bio::EnsEMBL::IdMapping::SyntenyRegion object
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub new_fast {
+  my $class = shift;
+  my $array_ref = shift;
+
+  # reverse complement source and target so that source is always on forward
+  # strand; this will make merging and other comparison operations easier
+  # at later stages
+  if ($array_ref->[2] == -1) {
+    $array_ref->[2] = 1;
+    $array_ref->[6] = -1 * $array_ref->[6];
+  }
+  
+  return bless $array_ref, $class;
+}
+
+
+=head2 source_start
+
+  Arg[1]      : (optional) Int - source location start coordinate
+  Description : Getter/setter for source location start coordinate.
+  Return type : Int
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub source_start {
+  my $self = shift;
+  $self->[0] = shift if (@_);
+  return $self->[0];
+}
+
+
+=head2 source_end
+
+  Arg[1]      : (optional) Int - source location end coordinate
+  Description : Getter/setter for source location end coordinate.
+  Return type : Int
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub source_end {
+  my $self = shift;
+  $self->[1] = shift if (@_);
+  return $self->[1];
+}
+
+
+=head2 source_strand
+
+  Arg[1]      : (optional) Int - source location strand
+  Description : Getter/setter for source location strand.
+  Return type : Int
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub source_strand {
+  my $self = shift;
+  $self->[2] = shift if (@_);
+  return $self->[2];
+}
+
+
+=head2 source_seq_region_name
+
+  Arg[1]      : (optional) String - source location seq_region name
+  Description : Getter/setter for source location seq_region name.
+  Return type : String
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub source_seq_region_name {
+  my $self = shift;
+  $self->[3] = shift if (@_);
+  return $self->[3];
+}
+
+
+=head2 target_start
+
+  Arg[1]      : (optional) Int - target location start coordinate
+  Description : Getter/setter for target location start coordinate.
+  Return type : Int
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub target_start {
+  my $self = shift;
+  $self->[4] = shift if (@_);
+  return $self->[4];
+}
+
+
+=head2 target_end
+
+  Arg[1]      : (optional) Int - target location end coordinate
+  Description : Getter/setter for target location end coordinate.
+  Return type : Int
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub target_end {
+  my $self = shift;
+  $self->[5] = shift if (@_);
+  return $self->[5];
+}
+
+
+=head2 target_strand
+
+  Arg[1]      : (optional) Int - target location strand
+  Description : Getter/setter for target location strand.
+  Return type : Int
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub target_strand {
+  my $self = shift;
+  $self->[6] = shift if (@_);
+  return $self->[6];
+}
+
+
+=head2 target_seq_region_name
+
+  Arg[1]      : (optional) String - target location seq_region name
+  Description : Getter/setter for target location seq_region name.
+  Return type : String
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub target_seq_region_name {
+  my $self = shift;
+  $self->[7] = shift if (@_);
+  return $self->[7];
+}
+
+
+=head2 score
+
+  Arg[1]      : (optional) Float - score
+  Description : Getter/setter for the score between source and target location.
+  Return type : Int
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub score {
+  my $self = shift;
+  $self->[8] = shift if (@_);
+  return $self->[8];
+}
+
+
+=head2 merge
+
+  Arg[1]      : Bio::EnsEMBL::IdMapping::SyntenyRegion $sr - another
+                SyntenyRegion
+  Example     : $merged_sr = $sr->merge($other_sr);
+  Description : Merges two overlapping SyntenyRegions if they meet certain
+                criteria (see documentation in the code for details). Score is
+                calculated as a combined distance score. If the two
+                SyntenyRegions aren't mergeable, this method returns undef.
+  Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion or undef
+  Exceptions  : warns on bad scores
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub merge {
+  my ($self, $sr) = @_;
+
+  # must be on same seq_region
+  if ($self->source_seq_region_name ne $sr->source_seq_region_name or
+      $self->target_seq_region_name ne $sr->target_seq_region_name) {
+    return 0;
+  }
+
+  # target must be on same strand
+  return 0 unless ($self->target_strand == $sr->target_strand);
+
+  # find the distance of source and target pair and compare
+  my $source_dist = $sr->source_start - $self->source_start;
+  my $target_dist;
+  if ($self->target_strand == 1) {
+    $target_dist = $sr->target_start - $self->target_start;
+  } else {
+    $target_dist = $self->target_end - $sr->target_end;
+  }
+
+  # prevent division by zero error
+  if ($source_dist == 0 or $target_dist == 0) {
+    warn("WARNING: source_dist ($source_dist) and/or target_dist ($target_dist) is zero.\n");
+    return 0;
+  }
+
+  # calculate a distance score
+  my $dist = $source_dist - $target_dist;
+  $dist = -$dist if ($dist < 0);
+  my $d1 = $dist/$source_dist;
+  $d1 = -$d1 if ($d1 < 0);
+  my $d2 = $dist/$target_dist;
+  $d2 = -$d2 if ($d2 < 0);
+  my $dist_score = 1 - $d1 - $d2;
+
+  # distance score must be more than 50%
+  return 0 if ($dist_score < 0.5);
+
+  my $new_score = $dist_score * ($sr->score + $self->score)/2;
+
+  if ($new_score > 1) {
+    warn("WARNING: Bad merge score: $new_score\n");
+  }
+
+  # extend SyntenyRegion to cover both sources and targets, set merged score
+  # and return
+  if ($sr->source_start < $self->source_start) {
+    $self->source_start($sr->source_start);
+  }
+  if ($sr->source_end > $self->source_end) {
+    $self->source_end($sr->source_end);
+  }
+  
+  if ($sr->target_start < $self->target_start) {
+    $self->target_start($sr->target_start);
+  }
+  if ($sr->target_end > $self->target_end) {
+    $self->target_end($sr->target_end);
+  }
+
+  $self->score($new_score);
+
+  return $self;
+}
+
+
+=head2 stretch
+
+  Arg[1]      : Float $factor - stretching factor
+  Example     : $stretched_sr = $sr->stretch(2);
+  Description : Extends this SyntenyRegion to span a $factor * $score more area.
+  Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub stretch {
+  my ($self, $factor) = @_;
+
+  my $source_adjust = int(($self->source_end - $self->source_start + 1) *
+    $factor * $self->score);
+  $self->source_start($self->source_start - $source_adjust);
+  $self->source_end($self->source_end + $source_adjust);
+  #warn sprintf("  sss %d %d %d\n", $source_adjust, $self->source_start,
+  #  $self->source_end);
+  
+  my $target_adjust = int(($self->target_end - $self->target_start + 1) *
+    $factor * $self->score);
+  $self->target_start($self->target_start - $target_adjust);
+  $self->target_end($self->target_end + $target_adjust);
+
+  return $self;
+}
+
+
+=head2 score_location_relationship
+
+  Arg[1]      : Bio::EnsEMBL::IdMapping::TinyGene $source_gene - source gene
+  Arg[2]      : Bio::EnsEMBL::IdMapping::TinyGene $target_gene - target gene
+  Example     : my $score = $sr->score_location_relationship($source_gene,
+                  $target_gene);
+  Description : This function calculates how well the given source location
+                interpolates on given target location inside this SyntenyRegion.
+
+                Scoring is done the following way: Source and target location
+                are normalized with respect to this Regions source and target.
+                Source range will then be somewhere close to 0.0-1.0 and target
+                range anything around that.
+
+                The extend of the covered area between source and target range
+                is a measurement of how well they agree (smaller extend is
+                better). The extend (actually 2*extend) is reduced by the size
+                of the regions. This will result in 0.0 if they overlap
+                perfectly and bigger values if they dont.
+
+                This is substracted from 1.0 to give the score. The score is
+                likely to be below zero, but is cut off at 0.0f.
+
+                Finally, the score is multiplied with the score of the synteny
+                itself.
+  Return type : Float
+  Exceptions  : warns if score out of range
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+
+
+sub score_location_relationship {
+  my ($self, $source_gene, $target_gene) = @_;
+
+  # must be on same seq_region
+  if (($self->source_seq_region_name ne $source_gene->seq_region_name) or
+      ($self->target_seq_region_name ne $target_gene->seq_region_name)) {
+    return 0;
+  }
+
+  # strand relationship must be the same (use logical XOR to find out)
+  if (($self->source_strand == $source_gene->strand) xor
+      ($self->target_strand == $target_gene->strand)) {
+    return 0;
+  }
+
+  # normalise source location
+  my $source_rel_start = ($source_gene->start - $self->source_start) /
+                         ($self->source_end - $self->source_start + 1);
+
+  my $source_rel_end = ($source_gene->end - $self->source_start + 1) /
+                       ($self->source_end - $self->source_start + 1);
+
+  #warn "  aaa ".$self->to_string."\n";
+  #warn sprintf("  bbb %.6f %.6f\n", $source_rel_start, $source_rel_end);
+
+  # cut off if the source location is completely outside
+  return 0 if ($source_rel_start > 1.1 or $source_rel_end < -0.1);
+  
+  # normalise target location
+  my ($target_rel_start, $target_rel_end);
+  my $t_length = $self->target_end - $self->target_start + 1;
+
+  if ($self->target_strand == 1) {
+
+    $target_rel_start = ($target_gene->start - $self->target_start) / $t_length;
+
+    $target_rel_end = ($target_gene->end - $self->target_start + 1) / $t_length;
+
+  } else {
+    $target_rel_start = ($self->target_end - $target_gene->end) / $t_length;
+    $target_rel_end = ($self->target_end - $target_gene->start + 1) / $t_length;
+  }
+
+  my $added_range = (($target_rel_end > $source_rel_end) ? $target_rel_end :
+                                                           $source_rel_end) -
+              (($target_rel_start < $source_rel_start) ? $target_rel_start :
+                                                         $source_rel_start);
+
+  my $score = $self->score * (1 - (2 * $added_range - $target_rel_end -
+    $source_rel_end + $target_rel_start + $source_rel_start));
+
+  #warn "  ccc ".sprintf("%.6f:%.6f:%.6f:%.6f:%.6f\n", $added_range,
+  #  $source_rel_start, $source_rel_end, $target_rel_start, $target_rel_end);
+
+  $score = 0 if ($score < 0);
+
+  # sanity check
+  if ($score > 1) {
+    warn "Out of range score ($score) for ".$source_gene->id.":".
+      $target_gene->id."\n";
+  }
+
+  return $score;
+}
+
+
+=head2 to_string
+
+  Example     : print LOG $sr->to_string, "\n";
+  Description : Returns a string representation of the SyntenyRegion object.
+                Useful for debugging and logging.
+  Return type : String
+  Exceptions  : none
+  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub to_string {
+  my $self = shift;
+  return sprintf("%s:%s-%s:%s %s:%s-%s:%s %.6f",
+    $self->source_seq_region_name,
+    $self->source_start,
+    $self->source_end,
+    $self->source_strand,
+    $self->target_seq_region_name,
+    $self->target_start,
+    $self->target_end,
+    $self->target_strand,
+    $self->score
+  );
+}
+
+
+1;
+