view variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source


=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

=head1 SYNOPSIS

=head1 DESCRIPTION

=head1 METHODS

=cut

package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;

use strict;
use warnings;
no warnings 'uninitialized';

use Bio::EnsEMBL::IdMapping::BaseObject;
our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);

use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
use Bio::EnsEMBL::IdMapping::MappingList;

# scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this
use constant SIMILAR_SCORE_RATIO => 0.01;

#
# find the highest unambiguous score for all sources and targets in a scoring
# matrix
#
sub basic_mapping {
  my $self         = shift;
  my $matrix       = shift;
  my $mapping_name = shift;

  # argument checks
  unless ($matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  throw('Need a name for serialising the mapping.')
    unless ($mapping_name);

  # Create a new MappingList object. Specify AUTO_LOAD to load
  # serialised existing mappings if found
  my $dump_path =
    path_append( $self->conf->param('basedir'), 'mapping' );

  my $mappings =
    Bio::EnsEMBL::IdMapping::MappingList->new(
                                   -DUMP_PATH  => $dump_path,
                                   -CACHE_FILE => "${mapping_name}.ser",
                                   -AUTO_LOAD  => 1, );

  # checkpoint test: return a previously stored MappingList
  if ( $mappings->loaded ) {
    $self->logger->info(
                  "Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  my $sources_done = {};
  my $targets_done = {};

  # sort scoring matrix entries by descending score
  my @sorted_entries =
    sort { $b->score <=> $a->score } @{ $matrix->get_all_Entries };

  # debug
  #my $idx = substr($mapping_name, -1);

  while ( my $entry = shift(@sorted_entries) ) {

    #$self->logger->debug("\nxxx$idx ".$entry->to_string." ");

    # we already found a mapping for either source or target
    next
      if (    $sources_done->{ $entry->source }
           or $targets_done->{ $entry->target } );

    #$self->logger->debug('d');

    # there's a better mapping for either source or target
    next
      if ( $self->higher_score_exists(
                           $entry, $matrix, $sources_done, $targets_done
           ) );

    #$self->logger->debug('h');

    # check for ambiguous mappings; they are dealt with later
    my $other_sources = [];
    my $other_targets = [];

    if ( $self->ambiguous_mapping( $entry,         $matrix,
                                   $other_sources, $other_targets ) )
    {
      #$self->logger->debug('a');

      $other_sources =
        $self->filter_sources( $other_sources, $sources_done );
      $other_targets =
        $self->filter_targets( $other_targets, $targets_done );

      next if ( scalar(@$other_sources) or scalar(@$other_targets) );
    }

    #$self->logger->debug('A');

    # this is the best mapping, add it
    $mappings->add_Entry($entry);

    $sources_done->{ $entry->source } = 1;
    $targets_done->{ $entry->target } = 1;
  } ## end while ( my $entry = shift...)

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
} ## end sub basic_mapping

sub higher_score_exists {
  my ( $self, $entry, $matrix, $sources_done, $targets_done ) = @_;

  my $source = $entry->source;
  my $target = $entry->target;
  my $score  = $entry->score;

  foreach
    my $other_source ( @{ $matrix->get_sources_for_target($target) } )
  {
    if (     $other_source != $source
         and !$sources_done->{$other_source}
         and $score < $matrix->get_score( $other_source, $target ) )
    {
      return 1;
    }
  }

  foreach
    my $other_target ( @{ $matrix->get_targets_for_source($source) } )
  {
    if (     $other_target != $target
         and !$targets_done->{$other_target}
         and $score < $matrix->get_score( $source, $other_target ) )
    {
      return 1;
    }
  }

  return 0;
} ## end sub higher_score_exists

#
# find ambiguous mappings (see scores_similar() for definition)
#
sub ambiguous_mapping {
  my ( $self, $entry, $matrix, $other_sources, $other_targets ) = @_;

  my $source = $entry->source;
  my $target = $entry->target;
  my $score  = $entry->score;

  my $retval = 0;

  foreach
    my $other_source ( @{ $matrix->get_sources_for_target($target) } )
  {
    my $other_score = $matrix->get_score( $other_source, $target );

    if ( $other_source != $source
         and (    $self->scores_similar( $score, $other_score )
               or $score < $other_score ) )
    {
      $retval = 1;
      push @{$other_sources}, $other_source;
    }
  }

  foreach
    my $other_target ( @{ $matrix->get_targets_for_source($source) } )
  {
    my $other_score = $matrix->get_score( $source, $other_target );

    if ( $other_target != $target
         and (    $self->scores_similar( $score, $other_score )
               or $score < $other_score ) )
    {
      $retval = 1;
      push @{$other_targets}, $other_target;
    }
  }

  return $retval;
} ## end sub ambiguous_mapping

#
# rule for similarity taken from java code...
#
sub scores_similar {
  my ( $self, $s1, $s2 ) = @_;

  # always give priority to exact matches over very similar ones
  return 0 if ( $s1 == 1 and $s2 < 1 );

  my $diff = $s1 - $s2;
  $diff = -$diff if ( $diff < 0 );

  my $pc = 2*$diff/( $s1 + $s2 );

  return ( $pc < SIMILAR_SCORE_RATIO );
}

sub filter_sources {
  my ( $self, $other_sources, $sources_done ) = @_;

  unless (     scalar( @{$other_sources} )
           and scalar( keys %{$sources_done} ) )
  {
    return $other_sources;
  }

  my @tmp = ();

  foreach my $e ( @{$other_sources} ) {
    push @tmp, $e unless ( $sources_done->{$e} );
  }

  return \@tmp;
}

sub filter_targets {
  my ( $self, $other_targets, $targets_done ) = @_;

  unless (     scalar( @{$other_targets} )
           and scalar( keys %{$targets_done} ) )
  {
    return $other_targets;
  }

  my @tmp = ();

  foreach my $e ( @{$other_targets} ) {
    push @tmp, $e unless ( $targets_done->{$e} );
  }

  return \@tmp;
}

1;