diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,274 @@
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+=head1 METHODS
+
+=cut
+
+package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use Bio::EnsEMBL::IdMapping::BaseObject;
+our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
+use Bio::EnsEMBL::IdMapping::MappingList;
+
+# scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this
+use constant SIMILAR_SCORE_RATIO => 0.01;
+
+#
+# find the highest unambiguous score for all sources and targets in a scoring
+# matrix
+#
+sub basic_mapping {
+  my $self         = shift;
+  my $matrix       = shift;
+  my $mapping_name = shift;
+
+  # argument checks
+  unless ($matrix
+      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
+  {
+    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
+  }
+
+  throw('Need a name for serialising the mapping.')
+    unless ($mapping_name);
+
+  # Create a new MappingList object. Specify AUTO_LOAD to load
+  # serialised existing mappings if found
+  my $dump_path =
+    path_append( $self->conf->param('basedir'), 'mapping' );
+
+  my $mappings =
+    Bio::EnsEMBL::IdMapping::MappingList->new(
+                                   -DUMP_PATH  => $dump_path,
+                                   -CACHE_FILE => "${mapping_name}.ser",
+                                   -AUTO_LOAD  => 1, );
+
+  # checkpoint test: return a previously stored MappingList
+  if ( $mappings->loaded ) {
+    $self->logger->info(
+                  "Read existing mappings from ${mapping_name}.ser.\n");
+    return $mappings;
+  }
+
+  my $sources_done = {};
+  my $targets_done = {};
+
+  # sort scoring matrix entries by descending score
+  my @sorted_entries =
+    sort { $b->score <=> $a->score } @{ $matrix->get_all_Entries };
+
+  # debug
+  #my $idx = substr($mapping_name, -1);
+
+  while ( my $entry = shift(@sorted_entries) ) {
+
+    #$self->logger->debug("\nxxx$idx ".$entry->to_string." ");
+
+    # we already found a mapping for either source or target
+    next
+      if (    $sources_done->{ $entry->source }
+           or $targets_done->{ $entry->target } );
+
+    #$self->logger->debug('d');
+
+    # there's a better mapping for either source or target
+    next
+      if ( $self->higher_score_exists(
+                           $entry, $matrix, $sources_done, $targets_done
+           ) );
+
+    #$self->logger->debug('h');
+
+    # check for ambiguous mappings; they are dealt with later
+    my $other_sources = [];
+    my $other_targets = [];
+
+    if ( $self->ambiguous_mapping( $entry,         $matrix,
+                                   $other_sources, $other_targets ) )
+    {
+      #$self->logger->debug('a');
+
+      $other_sources =
+        $self->filter_sources( $other_sources, $sources_done );
+      $other_targets =
+        $self->filter_targets( $other_targets, $targets_done );
+
+      next if ( scalar(@$other_sources) or scalar(@$other_targets) );
+    }
+
+    #$self->logger->debug('A');
+
+    # this is the best mapping, add it
+    $mappings->add_Entry($entry);
+
+    $sources_done->{ $entry->source } = 1;
+    $targets_done->{ $entry->target } = 1;
+  } ## end while ( my $entry = shift...)
+
+  # create checkpoint
+  $mappings->write_to_file;
+
+  return $mappings;
+} ## end sub basic_mapping
+
+sub higher_score_exists {
+  my ( $self, $entry, $matrix, $sources_done, $targets_done ) = @_;
+
+  my $source = $entry->source;
+  my $target = $entry->target;
+  my $score  = $entry->score;
+
+  foreach
+    my $other_source ( @{ $matrix->get_sources_for_target($target) } )
+  {
+    if (     $other_source != $source
+         and !$sources_done->{$other_source}
+         and $score < $matrix->get_score( $other_source, $target ) )
+    {
+      return 1;
+    }
+  }
+
+  foreach
+    my $other_target ( @{ $matrix->get_targets_for_source($source) } )
+  {
+    if (     $other_target != $target
+         and !$targets_done->{$other_target}
+         and $score < $matrix->get_score( $source, $other_target ) )
+    {
+      return 1;
+    }
+  }
+
+  return 0;
+} ## end sub higher_score_exists
+
+#
+# find ambiguous mappings (see scores_similar() for definition)
+#
+sub ambiguous_mapping {
+  my ( $self, $entry, $matrix, $other_sources, $other_targets ) = @_;
+
+  my $source = $entry->source;
+  my $target = $entry->target;
+  my $score  = $entry->score;
+
+  my $retval = 0;
+
+  foreach
+    my $other_source ( @{ $matrix->get_sources_for_target($target) } )
+  {
+    my $other_score = $matrix->get_score( $other_source, $target );
+
+    if ( $other_source != $source
+         and (    $self->scores_similar( $score, $other_score )
+               or $score < $other_score ) )
+    {
+      $retval = 1;
+      push @{$other_sources}, $other_source;
+    }
+  }
+
+  foreach
+    my $other_target ( @{ $matrix->get_targets_for_source($source) } )
+  {
+    my $other_score = $matrix->get_score( $source, $other_target );
+
+    if ( $other_target != $target
+         and (    $self->scores_similar( $score, $other_score )
+               or $score < $other_score ) )
+    {
+      $retval = 1;
+      push @{$other_targets}, $other_target;
+    }
+  }
+
+  return $retval;
+} ## end sub ambiguous_mapping
+
+#
+# rule for similarity taken from java code...
+#
+sub scores_similar {
+  my ( $self, $s1, $s2 ) = @_;
+
+  # always give priority to exact matches over very similar ones
+  return 0 if ( $s1 == 1 and $s2 < 1 );
+
+  my $diff = $s1 - $s2;
+  $diff = -$diff if ( $diff < 0 );
+
+  my $pc = 2*$diff/( $s1 + $s2 );
+
+  return ( $pc < SIMILAR_SCORE_RATIO );
+}
+
+sub filter_sources {
+  my ( $self, $other_sources, $sources_done ) = @_;
+
+  unless (     scalar( @{$other_sources} )
+           and scalar( keys %{$sources_done} ) )
+  {
+    return $other_sources;
+  }
+
+  my @tmp = ();
+
+  foreach my $e ( @{$other_sources} ) {
+    push @tmp, $e unless ( $sources_done->{$e} );
+  }
+
+  return \@tmp;
+}
+
+sub filter_targets {
+  my ( $self, $other_targets, $targets_done ) = @_;
+
+  unless (     scalar( @{$other_targets} )
+           and scalar( keys %{$targets_done} ) )
+  {
+    return $other_targets;
+  }
+
+  my @tmp = ();
+
+  foreach my $e ( @{$other_targets} ) {
+    push @tmp, $e unless ( $targets_done->{$e} );
+  }
+
+  return \@tmp;
+}
+
+1;