diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,305 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+=head1 SYNOPSIS
+
+=head1 DESCRIPTION
+
+=head1 METHODS
+
+=cut
+
+
+package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric;
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;
+our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper);
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
+
+  
+#
+# basic mapping
+#
+sub init_basic {
+  my $self = shift;
+  my $num = shift;
+  my $tsb = shift;
+  my $mappings = shift;
+  my $transcript_scores = shift;
+
+  $self->logger->info("Basic transcript mapping...\n", 0, 'stamped');
+
+  $mappings = $self->basic_mapping($transcript_scores,
+    "transcript_mappings$num");
+  $num++;
+  my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings,
+    "transcript_matrix$num");
+
+  return ($new_scores, $mappings);
+}
+
+
+#
+# handle cases with exact match but different translation
+#
+sub non_exact_translation {
+  my $self = shift;
+  my $num = shift;
+  my $tsb = shift;
+  my $mappings = shift;
+  my $transcript_scores = shift;
+
+  $self->logger->info("Exact Transcript non-exact Translation...\n", 0, 'stamped');
+  
+  unless ($transcript_scores->loaded) {
+    $tsb->different_translation_rescore($transcript_scores);
+    $transcript_scores->write_to_file;
+  }
+  
+  $mappings = $self->basic_mapping($transcript_scores,
+    "transcript_mappings$num");
+  $num++;
+  my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings,
+    "transcript_matrix$num");
+
+  return ($new_scores, $mappings);
+}
+
+
+#
+# reduce score for mappings of transcripts which do not belong to mapped
+# genes
+#
+sub mapped_gene {
+  my $self = shift;
+  my $num = shift;
+  my $tsb = shift;
+  my $mappings = shift;
+  my $transcript_scores = shift;
+  my $gene_mappings = shift;
+
+  $self->logger->info("Transcripts in mapped genes...\n", 0, 'stamped');
+  
+  unless ($transcript_scores->loaded) {
+  $tsb->non_mapped_gene_rescore($transcript_scores, $gene_mappings);
+    $transcript_scores->write_to_file;
+  }
+  
+  $mappings = $self->basic_mapping($transcript_scores,
+    "transcript_mappings$num");
+  $num++;
+  my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings,
+    "transcript_matrix$num");
+
+  return ($new_scores, $mappings);
+}
+
+#
+# rescore by penalising scores between transcripts with different biotypes
+#
+sub biotype {
+  my $self              = shift;
+  my $num               = shift;
+  my $tsb               = shift;
+  my $mappings          = shift;
+  my $transcript_scores = shift;
+
+  $self->logger->info( "Retry with biotype disambiguation...\n",
+                       0, 'stamped' );
+
+  unless ( $transcript_scores->loaded() ) {
+    $tsb->biotype_transcript_rescore($transcript_scores);
+    $transcript_scores->write_to_file();
+  }
+
+  my $new_mappings = $self->basic_mapping( $transcript_scores,
+                                           "transcript_mappings$num" );
+  $num++;
+  my $new_scores =
+    $tsb->create_shrinked_matrix( $transcript_scores, $new_mappings,
+                                  "transcript_matrix$num" );
+
+  return ( $new_scores, $new_mappings );
+}
+
+#
+# selectively rescore by penalising scores between transcripts with
+# different internalIDs  
+#
+sub internal_id {
+  my $self = shift;
+  my $num = shift;
+  my $tsb = shift;
+  my $mappings = shift;
+  my $transcript_scores = shift;
+
+  $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped');
+  
+  unless ($transcript_scores->loaded) {
+    $tsb->internal_id_rescore($transcript_scores);
+    $transcript_scores->write_to_file;
+  }
+
+  $mappings = $self->basic_mapping($transcript_scores,
+    "transcript_mappings$num");
+  $num++;
+  my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings,
+    "transcript_matrix$num");
+
+  return ($new_scores, $mappings);
+}
+
+
+#
+# handle ambiguities between transcripts in single genes
+#
+sub single_gene {
+  my $self = shift;
+  my $num = shift;
+  my $tsb = shift;
+  my $mappings = shift;
+  my $transcript_scores = shift;
+
+  $self->logger->info("Transcripts in single genes...\n", 0, 'stamped');
+  
+  unless ($transcript_scores->loaded) {
+    $transcript_scores->write_to_file;
+  }
+  
+  $mappings = $self->same_gene_transcript_mapping($transcript_scores,
+    "transcript_mappings$num");
+  $num++;
+  my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings,
+    "transcript_matrix$num");
+
+  return ($new_scores, $mappings);
+}
+
+
+#
+# modified basic mapper that maps transcripts that are ambiguous within one gene
+#
+sub same_gene_transcript_mapping {
+  my $self = shift;
+  my $matrix = shift;
+  my $mapping_name = shift;
+
+  # argument checks
+  unless ($matrix and
+          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
+  }
+
+  throw('Need a name for serialising the mapping.') unless ($mapping_name);
+
+  # Create a new MappingList object. Specify AUTO_LOAD to load serialised
+  # existing mappings if found
+  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
+  
+  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
+    -DUMP_PATH   => $dump_path,
+    -CACHE_FILE  => "${mapping_name}.ser",
+    -AUTO_LOAD   => 1,
+  );
+  
+  # checkpoint test: return a previously stored MappingList
+  if ($mappings->loaded) {
+    $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n");
+    return $mappings;
+  }
+
+  my $sources_done = {};
+  my $targets_done = {};
+
+  # sort scoring matrix entries by descending score
+  my @sorted_entries = sort { $b->score <=> $a->score ||
+    $a->source <=> $b->source || $a->target <=> $b->target }
+      @{ $matrix->get_all_Entries };
+
+  while (my $entry = shift(@sorted_entries)) {
+    
+    # $self->logger->debug("\nxxx4 ".$entry->to_string." ");
+
+    # we already found a mapping for either source or target yet
+    next if ($sources_done->{$entry->source} or
+             $targets_done->{$entry->target});
+
+    #$self->logger->debug('d');
+
+    my $other_sources = [];
+    my $other_targets = [];
+    my %source_genes = ();
+    my %target_genes = ();
+
+    if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) {
+      #$self->logger->debug('a');
+
+      $other_sources = $self->filter_sources($other_sources, $sources_done);
+      $other_targets = $self->filter_targets($other_targets, $targets_done);
+
+      $source_genes{$self->cache->get_by_key('genes_by_transcript_id',
+        'source', $entry->source)} = 1;
+      $target_genes{$self->cache->get_by_key('genes_by_transcript_id',
+        'target', $entry->target)} = 1;
+
+      foreach my $other_source (@{ $other_sources }) {
+        $source_genes{$self->cache->get_by_key('genes_by_transcript_id',
+          'source', $other_source)} = 1;
+      }
+        
+      foreach my $other_target (@{ $other_targets }) {
+        $target_genes{$self->cache->get_by_key('genes_by_transcript_id',
+          'target', $other_target)} = 1;
+      }
+      
+      # only add mapping if only one source and target gene involved
+      if (scalar(keys %source_genes) == 1 and scalar(keys %target_genes) == 1) {
+        #$self->logger->debug('O');
+        $mappings->add_Entry($entry);
+      }
+
+    } else {
+      #$self->logger->debug('A');
+
+      # this is the best mapping, add it
+      $mappings->add_Entry($entry);
+    }
+
+    $sources_done->{$entry->source} = 1;
+    $targets_done->{$entry->target} = 1;
+  }
+
+  # create checkpoint
+  $mappings->write_to_file;
+
+  return $mappings;
+}
+
+
+1;
+