view variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

Bio::EnsEMBL::IdMapping::SyntenyRegion - object representing syntenic regions

=head1 SYNOPSIS

  # create a new SyntenyRegion from a source and a target gene
  my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast( [
      $source_gene->start,  $source_gene->end,
      $source_gene->strand, $source_gene->seq_region_name,
      $target_gene->start,  $target_gene->end,
      $target_gene->strand, $target_gene->seq_region_name,
      $entry->score,
  ] );

  # merge with another SyntenyRegion
  my $merged_sr = $sr->merge($sr1);

  # score a gene pair against this SyntenyRegion
  my $score =
    $sr->score_location_relationship( $source_gene1, $target_gene1 );

=head1 DESCRIPTION

This object represents a synteny between a source and a target location.
SyntenyRegions are built from mapped genes, and the their score is
defined as the score of the gene mapping. For merged SyntenyRegions,
scores are combined.

=head1 METHODS

  new_fast
  source_start
  source_end
  source_strand
  source_seq_region_name
  target_start
  target_end
  target_strand
  target_seq_region_name
  score
  merge
  stretch
  score_location_relationship
  to_string

=cut

package Bio::EnsEMBL::IdMapping::SyntenyRegion;

use strict;
use warnings;
no warnings 'uninitialized';

use Bio::EnsEMBL::Utils::Exception qw(throw warning);


=head2 new_fast

  Arg[1]      : Arrayref $array_ref - the arrayref to bless into the
                SyntenyRegion object 
  Example     : my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
                  ]);
  Description : Constructor. On instantiation, source and target regions are
                reverse complemented so that source is always on forward strand.
  Return type : a Bio::EnsEMBL::IdMapping::SyntenyRegion object
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub new_fast {
  my $class = shift;
  my $array_ref = shift;

  # reverse complement source and target so that source is always on forward
  # strand; this will make merging and other comparison operations easier
  # at later stages
  if ($array_ref->[2] == -1) {
    $array_ref->[2] = 1;
    $array_ref->[6] = -1 * $array_ref->[6];
  }
  
  return bless $array_ref, $class;
}


=head2 source_start

  Arg[1]      : (optional) Int - source location start coordinate
  Description : Getter/setter for source location start coordinate.
  Return type : Int
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub source_start {
  my $self = shift;
  $self->[0] = shift if (@_);
  return $self->[0];
}


=head2 source_end

  Arg[1]      : (optional) Int - source location end coordinate
  Description : Getter/setter for source location end coordinate.
  Return type : Int
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub source_end {
  my $self = shift;
  $self->[1] = shift if (@_);
  return $self->[1];
}


=head2 source_strand

  Arg[1]      : (optional) Int - source location strand
  Description : Getter/setter for source location strand.
  Return type : Int
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub source_strand {
  my $self = shift;
  $self->[2] = shift if (@_);
  return $self->[2];
}


=head2 source_seq_region_name

  Arg[1]      : (optional) String - source location seq_region name
  Description : Getter/setter for source location seq_region name.
  Return type : String
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub source_seq_region_name {
  my $self = shift;
  $self->[3] = shift if (@_);
  return $self->[3];
}


=head2 target_start

  Arg[1]      : (optional) Int - target location start coordinate
  Description : Getter/setter for target location start coordinate.
  Return type : Int
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub target_start {
  my $self = shift;
  $self->[4] = shift if (@_);
  return $self->[4];
}


=head2 target_end

  Arg[1]      : (optional) Int - target location end coordinate
  Description : Getter/setter for target location end coordinate.
  Return type : Int
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub target_end {
  my $self = shift;
  $self->[5] = shift if (@_);
  return $self->[5];
}


=head2 target_strand

  Arg[1]      : (optional) Int - target location strand
  Description : Getter/setter for target location strand.
  Return type : Int
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub target_strand {
  my $self = shift;
  $self->[6] = shift if (@_);
  return $self->[6];
}


=head2 target_seq_region_name

  Arg[1]      : (optional) String - target location seq_region name
  Description : Getter/setter for target location seq_region name.
  Return type : String
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub target_seq_region_name {
  my $self = shift;
  $self->[7] = shift if (@_);
  return $self->[7];
}


=head2 score

  Arg[1]      : (optional) Float - score
  Description : Getter/setter for the score between source and target location.
  Return type : Int
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub score {
  my $self = shift;
  $self->[8] = shift if (@_);
  return $self->[8];
}


=head2 merge

  Arg[1]      : Bio::EnsEMBL::IdMapping::SyntenyRegion $sr - another
                SyntenyRegion
  Example     : $merged_sr = $sr->merge($other_sr);
  Description : Merges two overlapping SyntenyRegions if they meet certain
                criteria (see documentation in the code for details). Score is
                calculated as a combined distance score. If the two
                SyntenyRegions aren't mergeable, this method returns undef.
  Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion or undef
  Exceptions  : warns on bad scores
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub merge {
  my ($self, $sr) = @_;

  # must be on same seq_region
  if ($self->source_seq_region_name ne $sr->source_seq_region_name or
      $self->target_seq_region_name ne $sr->target_seq_region_name) {
    return 0;
  }

  # target must be on same strand
  return 0 unless ($self->target_strand == $sr->target_strand);

  # find the distance of source and target pair and compare
  my $source_dist = $sr->source_start - $self->source_start;
  my $target_dist;
  if ($self->target_strand == 1) {
    $target_dist = $sr->target_start - $self->target_start;
  } else {
    $target_dist = $self->target_end - $sr->target_end;
  }

  # prevent division by zero error
  if ($source_dist == 0 or $target_dist == 0) {
    warn("WARNING: source_dist ($source_dist) and/or target_dist ($target_dist) is zero.\n");
    return 0;
  }

  # calculate a distance score
  my $dist = $source_dist - $target_dist;
  $dist = -$dist if ($dist < 0);
  my $d1 = $dist/$source_dist;
  $d1 = -$d1 if ($d1 < 0);
  my $d2 = $dist/$target_dist;
  $d2 = -$d2 if ($d2 < 0);
  my $dist_score = 1 - $d1 - $d2;

  # distance score must be more than 50%
  return 0 if ($dist_score < 0.5);

  my $new_score = $dist_score * ($sr->score + $self->score)/2;

  if ($new_score > 1) {
    warn("WARNING: Bad merge score: $new_score\n");
  }

  # extend SyntenyRegion to cover both sources and targets, set merged score
  # and return
  if ($sr->source_start < $self->source_start) {
    $self->source_start($sr->source_start);
  }
  if ($sr->source_end > $self->source_end) {
    $self->source_end($sr->source_end);
  }
  
  if ($sr->target_start < $self->target_start) {
    $self->target_start($sr->target_start);
  }
  if ($sr->target_end > $self->target_end) {
    $self->target_end($sr->target_end);
  }

  $self->score($new_score);

  return $self;
}


=head2 stretch

  Arg[1]      : Float $factor - stretching factor
  Example     : $stretched_sr = $sr->stretch(2);
  Description : Extends this SyntenyRegion to span a $factor * $score more area.
  Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub stretch {
  my ($self, $factor) = @_;

  my $source_adjust = int(($self->source_end - $self->source_start + 1) *
    $factor * $self->score);
  $self->source_start($self->source_start - $source_adjust);
  $self->source_end($self->source_end + $source_adjust);
  #warn sprintf("  sss %d %d %d\n", $source_adjust, $self->source_start,
  #  $self->source_end);
  
  my $target_adjust = int(($self->target_end - $self->target_start + 1) *
    $factor * $self->score);
  $self->target_start($self->target_start - $target_adjust);
  $self->target_end($self->target_end + $target_adjust);

  return $self;
}


=head2 score_location_relationship

  Arg[1]      : Bio::EnsEMBL::IdMapping::TinyGene $source_gene - source gene
  Arg[2]      : Bio::EnsEMBL::IdMapping::TinyGene $target_gene - target gene
  Example     : my $score = $sr->score_location_relationship($source_gene,
                  $target_gene);
  Description : This function calculates how well the given source location
                interpolates on given target location inside this SyntenyRegion.

                Scoring is done the following way: Source and target location
                are normalized with respect to this Regions source and target.
                Source range will then be somewhere close to 0.0-1.0 and target
                range anything around that.

                The extend of the covered area between source and target range
                is a measurement of how well they agree (smaller extend is
                better). The extend (actually 2*extend) is reduced by the size
                of the regions. This will result in 0.0 if they overlap
                perfectly and bigger values if they dont.

                This is substracted from 1.0 to give the score. The score is
                likely to be below zero, but is cut off at 0.0f.

                Finally, the score is multiplied with the score of the synteny
                itself.
  Return type : Float
  Exceptions  : warns if score out of range
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut



sub score_location_relationship {
  my ($self, $source_gene, $target_gene) = @_;

  # must be on same seq_region
  if (($self->source_seq_region_name ne $source_gene->seq_region_name) or
      ($self->target_seq_region_name ne $target_gene->seq_region_name)) {
    return 0;
  }

  # strand relationship must be the same (use logical XOR to find out)
  if (($self->source_strand == $source_gene->strand) xor
      ($self->target_strand == $target_gene->strand)) {
    return 0;
  }

  # normalise source location
  my $source_rel_start = ($source_gene->start - $self->source_start) /
                         ($self->source_end - $self->source_start + 1);

  my $source_rel_end = ($source_gene->end - $self->source_start + 1) /
                       ($self->source_end - $self->source_start + 1);

  #warn "  aaa ".$self->to_string."\n";
  #warn sprintf("  bbb %.6f %.6f\n", $source_rel_start, $source_rel_end);

  # cut off if the source location is completely outside
  return 0 if ($source_rel_start > 1.1 or $source_rel_end < -0.1);
  
  # normalise target location
  my ($target_rel_start, $target_rel_end);
  my $t_length = $self->target_end - $self->target_start + 1;

  if ($self->target_strand == 1) {

    $target_rel_start = ($target_gene->start - $self->target_start) / $t_length;

    $target_rel_end = ($target_gene->end - $self->target_start + 1) / $t_length;

  } else {
    $target_rel_start = ($self->target_end - $target_gene->end) / $t_length;
    $target_rel_end = ($self->target_end - $target_gene->start + 1) / $t_length;
  }

  my $added_range = (($target_rel_end > $source_rel_end) ? $target_rel_end :
                                                           $source_rel_end) -
              (($target_rel_start < $source_rel_start) ? $target_rel_start :
                                                         $source_rel_start);

  my $score = $self->score * (1 - (2 * $added_range - $target_rel_end -
    $source_rel_end + $target_rel_start + $source_rel_start));

  #warn "  ccc ".sprintf("%.6f:%.6f:%.6f:%.6f:%.6f\n", $added_range,
  #  $source_rel_start, $source_rel_end, $target_rel_start, $target_rel_end);

  $score = 0 if ($score < 0);

  # sanity check
  if ($score > 1) {
    warn "Out of range score ($score) for ".$source_gene->id.":".
      $target_gene->id."\n";
  }

  return $score;
}


=head2 to_string

  Example     : print LOG $sr->to_string, "\n";
  Description : Returns a string representation of the SyntenyRegion object.
                Useful for debugging and logging.
  Return type : String
  Exceptions  : none
  Caller      : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Status      : At Risk
              : under development

=cut

sub to_string {
  my $self = shift;
  return sprintf("%s:%s-%s:%s %s:%s-%s:%s %.6f",
    $self->source_seq_region_name,
    $self->source_start,
    $self->source_end,
    $self->source_strand,
    $self->target_seq_region_name,
    $self->target_start,
    $self->target_end,
    $self->target_strand,
    $self->score
  );
}


1;