diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,652 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+=head1 METHODS
+
+=cut
+
+
+package Bio::EnsEMBL::IdMapping::InternalIdMapper;
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use Bio::EnsEMBL::IdMapping::BaseObject;
+our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append);
+use Bio::EnsEMBL::IdMapping::Entry;
+use Bio::EnsEMBL::IdMapping::MappingList;
+use Bio::EnsEMBL::IdMapping::SyntenyFramework;
+
+
+# scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this
+use constant SIMILAR_SCORE_RATIO => 0.01;
+
+    
+sub map_genes {
+  my $self = shift;
+  my $gene_scores = shift;
+  my $transcript_scores = shift;
+  my $gsb = shift;
+
+  # argument checks
+  unless ($gene_scores and
+          $gene_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
+    throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
+  }
+  
+  unless ($transcript_scores and
+          $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
+    throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
+  }
+
+  unless ($gsb and
+          $gsb->isa('Bio::EnsEMBL::IdMapping::GeneScoreBuilder')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::GeneScoreBuilder.');
+  }
+  
+  $self->logger->info("== Internal ID mapping for genes...\n\n", 0, 'stamped');
+
+  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
+
+  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
+    -DUMP_PATH   => $dump_path,
+    -CACHE_FILE  => 'gene_mappings.ser',
+  );
+
+  my $mapping_cache = $mappings->cache_file;
+
+  if (-s $mapping_cache) {
+    
+    # read from file
+    $self->logger->info("Reading gene mappings from file...\n", 0, 'stamped');
+    $self->logger->debug("Cache file $mapping_cache.\n", 1);
+    $mappings->read_from_file;
+    $self->logger->info("Done.\n\n", 0, 'stamped');
+    
+  } else {
+    
+    # create gene mappings
+    $self->logger->info("No gene mappings found. Will calculate them now.\n");
+
+    # determine which plugin methods to run
+    my @default_plugins = (qw(
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id
+    ));
+
+    my @plugins = $self->conf->param('plugin_internal_id_mappers_gene');
+    @plugins = @default_plugins unless (defined($plugins[0]));
+
+    my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
+      -DUMP_PATH   => $dump_path,
+      -CACHE_FILE  => 'gene_mappings0.ser',
+    );
+    my @mappings = ();
+    my $i = 0;
+
+    #
+    # run the scoring chain
+    #
+    foreach my $plugin (@plugins) {
+      ($gene_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
+        $gsb, $new_mappings, $gene_scores, $transcript_scores);
+
+      push(@mappings, $new_mappings);
+    }
+
+    # report remaining ambiguities
+    $self->logger->info($gene_scores->get_source_count.
+      " source genes are ambiguous with ".
+      $gene_scores->get_target_count." target genes.\n\n");
+
+    $self->log_ambiguous($gene_scores, 'gene');
+    
+    # merge mappings and write to file
+    $mappings->add_all(@mappings);
+    $mappings->write_to_file;
+
+    if ($self->logger->loglevel eq 'debug') {
+      $mappings->log('gene', $self->conf->param('basedir'));
+    }
+
+    $self->logger->info("Done.\n\n", 0, 'stamped');
+
+  }
+
+  return $mappings;
+}
+
+
+sub map_transcripts {
+  my $self = shift;
+  my $transcript_scores = shift;
+  my $gene_mappings = shift;
+  my $tsb = shift;
+
+  # argument checks
+  unless ($transcript_scores and
+      $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
+    throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
+  }
+
+  unless ($gene_mappings and
+          $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
+    throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
+  }
+  
+  unless ($tsb and
+          $tsb->isa('Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder.');
+  }
+  
+  $self->logger->info("== Internal ID mapping for transcripts...\n\n", 0, 'stamped');
+
+  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
+
+  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
+    -DUMP_PATH   => $dump_path,
+    -CACHE_FILE  => 'transcript_mappings.ser',
+  );
+
+  my $mapping_cache = $mappings->cache_file;
+
+  if (-s $mapping_cache) {
+    
+    # read from file
+    $self->logger->info("Reading transcript mappings from file...\n", 0,
+      'stamped');
+    $self->logger->debug("Cache file $mapping_cache.\n", 1);
+    $mappings->read_from_file;
+    $self->logger->info("Done.\n\n", 0, 'stamped');
+    
+  } else {
+    
+    # create transcript mappings
+    $self->logger->info("No transcript mappings found. Will calculate them now.\n");
+
+    # determine which plugin methods to run
+    my @default_plugins = (qw(
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::biotype
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id
+    ));
+
+    my @plugins = $self->conf->param('plugin_internal_id_mappers_transcript');
+    @plugins = @default_plugins unless (defined($plugins[0]));
+
+    my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
+      -DUMP_PATH   => $dump_path,
+      -CACHE_FILE  => 'transcript_mappings0.ser',
+    );
+    my @mappings = ();
+    my $i = 0;
+
+    #
+    # run the scoring chain
+    #
+    foreach my $plugin (@plugins) {
+      ($transcript_scores, $new_mappings) = $self->delegate_to_plugin($plugin,
+        $i++, $tsb, $new_mappings, $transcript_scores, $gene_mappings);
+
+      push(@mappings, $new_mappings);
+    }
+
+    # report remaining ambiguities
+    $self->logger->info($transcript_scores->get_source_count.
+      " source transcripts are ambiguous with ".
+      $transcript_scores->get_target_count." target transcripts.\n\n");
+
+    $self->log_ambiguous($transcript_scores, 'transcript');
+
+    # merge mappings and write to file
+    $mappings->add_all(@mappings);
+    $mappings->write_to_file;
+
+    if ($self->logger->loglevel eq 'debug') {
+      $mappings->log('transcript', $self->conf->param('basedir'));
+    }
+
+    $self->logger->info("Done.\n\n", 0, 'stamped');
+
+  }
+
+  return $mappings;
+
+}
+
+
+sub map_exons {
+  my $self = shift;
+  my $exon_scores = shift;
+  my $transcript_mappings = shift;
+  my $esb = shift;
+
+  # argument checks
+  unless ($exon_scores and
+      $exon_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
+  }
+
+  unless ($transcript_mappings and
+          $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
+  }
+  
+  unless ($esb and
+          $esb->isa('Bio::EnsEMBL::IdMapping::ExonScoreBuilder')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::ExonScoreBuilder.');
+  }
+  
+  $self->logger->info("== Internal ID mapping for exons...\n\n", 0, 'stamped');
+
+  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
+
+  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
+    -DUMP_PATH   => $dump_path,
+    -CACHE_FILE  => 'exon_mappings.ser',
+  );
+
+  my $mapping_cache = $mappings->cache_file;
+
+  if (-s $mapping_cache) {
+    
+    # read from file
+    $self->logger->info("Reading exon mappings from file...\n", 0,
+      'stamped');
+    $self->logger->debug("Cache file $mapping_cache.\n", 1);
+    $mappings->read_from_file;
+    $self->logger->info("Done.\n\n", 0, 'stamped');
+    
+  } else {
+    
+    # create exon mappings
+    $self->logger->info("No exon mappings found. Will calculate them now.\n");
+
+    # determine which plugin methods to run
+    my @default_plugins = (qw(
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript
+      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::internal_id
+    ));
+
+    my @plugins = $self->conf->param('plugin_internal_id_mappers_exon');
+    @plugins = @default_plugins unless (defined($plugins[0]));
+
+    my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
+      -DUMP_PATH   => $dump_path,
+      -CACHE_FILE  => 'exon_mappings0.ser',
+    );
+    my @mappings = ();
+    my $i = 0;
+
+    #
+    # run the scoring chain
+    #
+    foreach my $plugin (@plugins) {
+      ($exon_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
+        $esb, $new_mappings, $exon_scores);
+
+      push(@mappings, $new_mappings);
+    }
+
+    # report remaining ambiguities
+    $self->logger->info($exon_scores->get_source_count.
+      " source exons are ambiguous with ".
+      $exon_scores->get_target_count." target exons.\n\n");
+
+    $self->log_ambiguous($exon_scores, 'exon');
+
+    # merge mappings and write to file
+    $mappings->add_all(@mappings);
+    $mappings->write_to_file;
+
+    if ($self->logger->loglevel eq 'debug') {
+      $mappings->log('exon', $self->conf->param('basedir'));
+    }
+
+    $self->logger->info("Done.\n\n", 0, 'stamped');
+
+  }
+
+  return $mappings;
+
+}
+
+
+#
+# this is not implemented as a plugin, since a) it's too simple and b) it's
+# tied to transcripts so there are no translation scores or score builder.
+#
+sub map_translations {
+  my $self = shift;
+  my $transcript_mappings = shift;
+
+  # argument checks
+  unless ($transcript_mappings and
+          $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
+  }
+  
+  $self->logger->info("== Internal ID mapping for translations...\n\n", 0, 'stamped');
+
+  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
+
+  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
+    -DUMP_PATH   => $dump_path,
+    -CACHE_FILE  => 'translation_mappings.ser',
+  );
+
+  my $mapping_cache = $mappings->cache_file;
+
+  if (-s $mapping_cache) {
+    
+    # read from file
+    $self->logger->info("Reading translation mappings from file...\n", 0,
+      'stamped');
+    $self->logger->debug("Cache file $mapping_cache.\n", 1);
+    $mappings->read_from_file;
+    $self->logger->info("Done.\n\n", 0, 'stamped');
+    
+  } else {
+    
+    # create translation mappings
+    $self->logger->info("No translation mappings found. Will calculate them now.\n");
+
+    $self->logger->info("Translation mapping...\n", 0, 'stamped');
+
+    #
+    # map translations for mapped transcripts
+    #
+    my $i = 0;
+
+    foreach my $entry (@{ $transcript_mappings->get_all_Entries }) {
+
+      my $source_tl = $self->cache->get_by_key('transcripts_by_id',
+        'source', $entry->source)->translation;
+      my $target_tl = $self->cache->get_by_key('transcripts_by_id',
+        'target', $entry->target)->translation;
+
+      if ($source_tl and $target_tl) {
+      
+        # add mapping for the translations; note that the score is taken from
+        # the transcript mapping
+        my $tl_entry = Bio::EnsEMBL::IdMapping::Entry->new_fast([
+          $source_tl->id, $target_tl->id, $entry->score
+        ]);
+        $mappings->add_Entry($tl_entry);
+      
+      } else {
+        $i++;
+      }
+
+    }
+
+    $self->logger->debug("Skipped transcripts without translation: $i\n", 1);
+    $self->logger->info("New mappings: ".$mappings->get_entry_count."\n\n");
+
+    $mappings->write_to_file;
+
+    if ($self->logger->loglevel eq 'debug') {
+      $mappings->log('translation', $self->conf->param('basedir'));
+    }
+
+    $self->logger->info("Done.\n\n", 0, 'stamped');
+
+  }
+
+  return $mappings;
+
+}
+
+
+sub delegate_to_plugin {
+  my $self = shift;
+  my $plugin = shift;
+  my $num = shift;
+  my $score_builder = shift;
+  my $mappings = shift;
+  my $scores = shift;
+
+  # argument checks
+  unless ($score_builder and
+          $score_builder->isa('Bio::EnsEMBL::IdMapping::ScoreBuilder')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::ScoreBuilder.');
+  }
+
+  unless ($mappings and
+          $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::MappingList.');
+  }
+  
+  unless ($scores and
+          $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
+  }
+
+  # split plugin name into module and method
+  $plugin =~ /(.*)::(\w+)$/;
+  my $module = $1;
+  my $method = $2;
+
+  unless ($module and $method) {
+    throw("Unable to determine module and method name from $plugin.\n");
+  }
+
+  # instantiate the plugin unless we already have an instance
+  my $plugin_instance;
+  if ($self->has_plugin($module)) {
+    
+    # re-use an existing plugin instance
+    $plugin_instance = $self->get_plugin($module);
+  
+  } else {
+    
+    # inject and instantiate the plugin module
+    inject($module);
+    $plugin_instance = $module->new(
+        -LOGGER       => $self->logger,
+        -CONF         => $self->conf,
+        -CACHE        => $self->cache
+    );
+    $self->add_plugin($plugin_instance);
+
+  }
+
+  # run the method on the plugin
+  #
+  # pass in a sequence number (number of method run, used for generating
+  # checkpoint files), the scores used for determining the mapping, and all
+  # other arguments passed to this method (these will vary for different object
+  # types)
+  #
+  # return the scores and mappings to feed into the next plugin in the chain
+  return $plugin_instance->$method($num, $score_builder, $mappings, $scores, @_);
+}
+
+
+sub has_plugin {
+  my $self = shift;
+  my $module = shift;
+
+  defined($self->{'_plugins'}->{$module}) ? (return 1) : (return 0);
+}
+
+
+sub get_plugin {
+  my $self = shift;
+  my $module = shift;
+
+  return $self->{'_plugins'}->{$module};
+}
+
+
+sub add_plugin {
+  my $self = shift;
+  my $plugin_instance = shift;
+
+  $self->{'_plugins'}->{ref($plugin_instance)} = $plugin_instance;
+}
+
+
+sub log_ambiguous {
+  my $self = shift;
+  my $matrix = shift;
+  my $type = shift;
+
+  unless ($matrix and
+          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
+  }
+  
+  # create dump directory if it doesn't exist
+  my $debug_path = $self->conf->param('basedir').'/debug';
+  unless (-d $debug_path) {
+    system("mkdir -p $debug_path") == 0 or
+      throw("Unable to create directory $debug_path.\n");
+  }
+  
+  my $logfile = "$debug_path/ambiguous_${type}.txt";
+  
+  open(my $fh, '>', $logfile) or
+    throw("Unable to open $logfile for writing: $!");
+
+  my @low_scoring = ();
+  my @high_scoring = ();
+  my $last_id;
+
+  # log by source
+  foreach my $entry (sort { $a->source <=> $b->source }
+                        @{ $matrix->get_all_Entries }) {
+    
+    $last_id ||= $entry->target;
+
+    if ($last_id != $entry->source) {
+      $self->write_ambiguous($type, 'source', $fh, \@low_scoring,
+        \@high_scoring);
+      $last_id = $entry->source;
+    }
+    
+    if ($entry->score < 0.5) {
+      push @low_scoring, $entry;
+    } else {
+      push @high_scoring, $entry;
+    }
+  }
+
+  # write last source
+  $self->write_ambiguous($type, 'source', $fh, \@low_scoring, \@high_scoring);
+
+  # now do the same by target
+  $last_id = undef;
+  foreach my $entry (sort { $a->target <=> $b->target }
+                        @{ $matrix->get_all_Entries }) {
+
+    $last_id ||= $entry->target;
+
+    if ($last_id != $entry->target) {
+      $self->write_ambiguous($type, 'target', $fh, \@low_scoring,
+        \@high_scoring);
+      $last_id = $entry->target;
+    }
+    
+    if ($entry->score < 0.5) {
+      push @low_scoring, $entry;
+    } else {
+      push @high_scoring, $entry;
+    }
+  }
+
+  # write last target
+  $self->write_ambiguous($type, 'target', $fh, \@low_scoring, \@high_scoring);
+
+  close($fh);
+}
+
+
+sub write_ambiguous {
+  my ($self, $type, $db_type, $fh, $low, $high) = @_;
+  
+  # if only source or target are ambiguous (i.e. you have only one mapping from
+  # this perspective) then log from the other perspective
+  if (scalar(@$low) + scalar(@$high) <= 1) {
+    @$low = ();
+    @$high = ();
+    return;
+  }
+
+  my $first_id;
+  if (@$low) {
+    $first_id = $low->[0]->$db_type;
+  } else {
+    $first_id = $high->[0]->$db_type;
+  }
+
+  my $other_db_type;
+  if ($db_type eq 'source') {
+    $other_db_type = 'target';
+  } else {
+    $other_db_type = 'source';
+  }
+
+  print $fh "$db_type $type $first_id scores ambiguously:\n";
+
+  # high scorers
+  if (@$high) {
+    print $fh "  high scoring ${other_db_type}s\n";
+
+    while (my $e = shift(@$high)) {
+      print $fh "    ", $e->$other_db_type, " ", $e->score, "\n";
+    }
+  }
+
+  # low scorers
+  if (@$low) {
+    print $fh "  low scoring ${other_db_type}s\n    ";
+
+    my $i = 1;
+
+    while (my $e = shift(@$low)) {
+      print $fh "\n    " unless (($i++)%10);
+      print $fh $e->$other_db_type, ", ";
+    }
+  }
+
+  print $fh "\n";
+}
+
+
+1;
+