diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,561 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::IdMapping::SyntenyFramework - framework representing syntenic
+regions across the genome
+
+=head1 SYNOPSIS
+
+  # build the SyntenyFramework from unambiguous gene mappings
+  my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
+    -DUMP_PATH  => $dump_path,
+    -CACHE_FILE => 'synteny_framework.ser',
+    -LOGGER     => $self->logger,
+    -CONF       => $self->conf,
+    -CACHE      => $self->cache,
+  );
+  $sf->build_synteny($gene_mappings);
+
+  # use it to rescore the genes
+  $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
+
+=head1 DESCRIPTION
+
+The SyntenyFramework is a set of SyntenyRegions. These are pairs of
+locations very analoguous to the information in the assembly table (the
+locations dont have to be the same length though). They are built from
+genes that map uniquely between source and target.
+
+Once built, the SyntenyFramework is used to score source and target gene
+pairs to determine whether they are similar. This process is slow (it
+involves testing all gene pairs against all SyntenyRegions), this module
+therefor has built-in support to run the process in parallel via LSF.
+
+=head1 METHODS
+
+  new
+  build_synteny
+  _by_overlap
+  add_SyntenyRegion
+  get_all_SyntenyRegions
+  rescore_gene_matrix_lsf
+  rescore_gene_matrix
+  logger
+  conf
+  cache
+
+=cut
+
+package Bio::EnsEMBL::IdMapping::SyntenyFramework;
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use Bio::EnsEMBL::IdMapping::Serialisable;
+our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable);
+
+use Bio::EnsEMBL::Utils::Argument qw(rearrange);
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
+use Bio::EnsEMBL::IdMapping::SyntenyRegion;
+use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
+
+use FindBin qw($Bin);
+FindBin->again;
+
+
+=head2 new
+
+  Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
+  Arg [CONF]  : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
+  Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object
+  Arg [DUMP_PATH] : String - path for object serialisation
+  Arg [CACHE_FILE] : String - filename of serialised object
+  Example     : my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
+                  -DUMP_PATH    => $dump_path,
+                  -CACHE_FILE   => 'synteny_framework.ser',
+                  -LOGGER       => $self->logger,
+                  -CONF         => $self->conf,
+                  -CACHE        => $self->cache,
+                );
+  Description : Constructor.
+  Return type : Bio::EnsEMBL::IdMapping::SyntenyFramework
+  Exceptions  : thrown on wrong or missing arguments
+  Caller      : InternalIdMapper plugins
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub new {
+  my $caller = shift;
+  my $class = ref($caller) || $caller;
+  my $self = $class->SUPER::new(@_);
+
+  my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_);
+
+  unless ($logger and ref($logger) and
+          $logger->isa('Bio::EnsEMBL::Utils::Logger')) {
+    throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
+  }
+  
+  unless ($conf and ref($conf) and
+          $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
+    throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
+  }
+  
+  unless ($cache and ref($cache) and
+          $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) {
+    throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object.");
+  }
+  
+  # initialise
+  $self->logger($logger);
+  $self->conf($conf);
+  $self->cache($cache);
+  $self->{'cache'} = [];
+
+  return $self;
+}
+
+
+=head2 build_synteny
+
+  Arg[1]      : Bio::EnsEMBL::IdMapping::MappingList $mappings - gene mappings
+                to build the SyntenyFramework from
+  Example     : $synteny_framework->build_synteny($gene_mappings);
+  Description : Builds the SyntenyFramework from unambiguous gene mappings.
+                SyntenyRegions are allowed to overlap. At most two overlapping
+                SyntenyRegions are merged (otherwise we'd get too large
+                SyntenyRegions with little information content).
+  Return type : none
+  Exceptions  : thrown on wrong or missing argument
+  Caller      : InternalIdMapper plugins
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub build_synteny {
+  my $self = shift;
+  my $mappings = shift;
+  
+  unless ($mappings and
+          $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
+    throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
+  }
+
+  # create a synteny region for each mapping
+  my @synteny_regions = ();
+
+  foreach my $entry (@{ $mappings->get_all_Entries }) {
+    
+    my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
+      $entry->source);
+    my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
+      $entry->target);
+
+    my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
+      $source_gene->start,
+      $source_gene->end,
+      $source_gene->strand,
+      $source_gene->seq_region_name,
+      $target_gene->start,
+      $target_gene->end,
+      $target_gene->strand,
+      $target_gene->seq_region_name,
+      $entry->score,
+    ]);
+
+    push @synteny_regions, $sr;
+  }
+
+  unless (@synteny_regions) {
+    $self->logger->warning("No synteny regions could be identified.\n");
+    return;
+  }
+
+  # sort synteny regions
+  #my @sorted = sort _by_overlap @synteny_regions;
+  my @sorted = reverse sort {
+    $a->source_seq_region_name cmp $b->source_seq_region_name ||
+    $a->source_start <=> $b->source_start ||
+    $a->source_end <=> $b->source_end } @synteny_regions;
+
+  $self->logger->info("SyntenyRegions before merging: ".scalar(@sorted)."\n");
+  
+  # now create merged regions from overlapping syntenies, but only merge a
+  # maximum of 2 regions (otherwise you end up with large synteny blocks which
+  # won't contain much information in this context)
+  my $last_merged = 0;
+  my $last_sr = shift(@sorted);
+
+  while (my $sr = shift(@sorted)) {
+    #$self->logger->debug("this ".$sr->to_string."\n");
+  
+    my $merged_sr = $last_sr->merge($sr);
+
+    if (! $merged_sr) {
+      unless ($last_merged) {
+        $self->add_SyntenyRegion($last_sr->stretch(2));
+        #$self->logger->debug("nnn  ".$last_sr->to_string."\n");
+      }
+      $last_merged = 0;
+    } else {
+      $self->add_SyntenyRegion($merged_sr->stretch(2));
+      #$self->logger->debug("mmm  ".$merged_sr->to_string."\n");
+      $last_merged = 1;
+    }
+    
+    $last_sr = $sr;
+  }
+
+  # deal with last synteny region in @sorted
+  unless ($last_merged) {
+    $self->add_SyntenyRegion($last_sr->stretch(2));
+    $last_merged = 0;
+  }
+
+  #foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
+  #  $self->logger->debug("SRs ".$sr->to_string."\n");
+  #}
+  
+  $self->logger->info("SyntenyRegions after merging: ".scalar(@{ $self->get_all_SyntenyRegions })."\n");
+
+}
+
+
+#
+# sort SyntenyRegions by overlap
+#
+sub _by_overlap {
+  # first sort by seq_region
+  my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name);
+  return $retval if ($retval);
+
+  # then sort by overlap:
+  # return -1 if $a is downstream, 1 if it's upstream, 0 if they overlap
+  if ($a->source_end < $b->source_start) { return 1; }
+  if ($a->source_start < $b->source_end) { return -1; }
+  return 0;
+}
+
+
+=head2 add_SyntenyRegion
+
+  Arg[1]      : Bio::EnsEMBL::IdMaping::SyntenyRegion - SyntenyRegion to add
+  Example     : $synteny_framework->add_SyntenyRegion($synteny_region);
+  Description : Adds a SyntenyRegion to the framework. For speed reasons (and
+                since this is an internal method), no argument check is done.
+  Return type : none
+  Exceptions  : none
+  Caller      : internal
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub add_SyntenyRegion {
+  push @{ $_[0]->{'cache'} }, $_[1];
+}
+
+
+=head2 get_all_SyntenyRegions
+
+  Example     : foreach my $sr (@{ $sf->get_all_SyntenyRegions }) {
+                  # do something with the SyntenyRegion
+                }
+  Description : Get a list of all SyntenyRegions in the framework.
+  Return type : Arrayref of Bio::EnsEMBL::IdMapping::SyntenyRegion
+  Exceptions  : none
+  Caller      : general
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub get_all_SyntenyRegions {
+  return $_[0]->{'cache'};
+}
+
+
+=head2 rescore_gene_matrix_lsf
+
+  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
+                scores to rescore
+  Example     : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
+  Description : This method runs rescore_gene_matrix() (via the
+                synteny_resocre.pl script) in parallel with lsf, then combines
+                the results to return a single rescored scoring matrix.
+                Parallelisation is done by chunking the scoring matrix into
+                several pieces (determined by the --synteny_rescore_jobs
+                configuration option).
+  Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
+  Exceptions  : thrown on wrong or missing argument
+                thrown on filesystem I/O error
+                thrown on failure of one or mor lsf jobs
+  Caller      : InternalIdMapper plugins
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub rescore_gene_matrix_lsf {
+  my $self = shift;
+  my $matrix = shift;
+
+  unless ($matrix and
+          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
+  }
+
+  # serialise SyntenyFramework to disk
+  $self->logger->debug("Serialising SyntenyFramework...\n", 0, 'stamped');
+  $self->write_to_file;
+  $self->logger->debug("Done.\n", 0, 'stamped');
+
+  # split the ScoredMappingMatrix into chunks and write to disk
+  my $matrix_size = $matrix->size;
+  $self->logger->debug("Scores before rescoring: $matrix_size.\n");
+
+  my $num_jobs = $self->conf->param('synteny_rescore_jobs') || 20;
+  $num_jobs++;
+
+  my $dump_path = path_append($self->conf->param('basedir'),
+    'matrix/synteny_rescore');
+
+  $self->logger->debug("Creating sub-matrices...\n", 0, 'stamped');
+  foreach my $i (1..$num_jobs) {
+    my $start = (int($matrix_size/($num_jobs-1)) * ($i - 1)) + 1;
+    my $end = int($matrix_size/($num_jobs-1)) * $i;
+    $self->logger->debug("$start-$end\n", 1);
+    my $sub_matrix = $matrix->sub_matrix($start, $end);
+    
+    $sub_matrix->cache_file_name("gene_matrix_synteny$i.ser");
+    $sub_matrix->dump_path($dump_path);
+    
+    $sub_matrix->write_to_file;
+  }
+  $self->logger->debug("Done.\n", 0, 'stamped');
+
+  # create an empty lsf log directory
+  my $logpath = path_append($self->logger->logpath, 'synteny_rescore');
+  system("rm -rf $logpath") == 0 or
+    $self->logger->error("Unable to delete lsf log dir $logpath: $!\n");
+  system("mkdir -p $logpath") == 0 or
+    $self->logger->error("Can't create lsf log dir $logpath: $!\n");
+
+  # build lsf command
+  my $lsf_name = 'idmapping_synteny_rescore_'.time;
+
+  my $options = $self->conf->create_commandline_options(
+      logauto       => 1,
+      logautobase   => "synteny_rescore",
+      logpath       => $logpath,
+      interactive   => 0,
+      is_component  => 1,
+  );
+
+  my $cmd = qq{$Bin/synteny_rescore.pl $options --index \$LSB_JOBINDEX};
+
+  my $bsub_cmd =
+    sprintf( "|bsub -J%s[1-%d] "
+                            . "-o %s/synteny_rescore.%%I.out "
+                            . "-e %s/synteny_rescore.%%I.err %s",
+             $lsf_name, $num_jobs, $logpath, $logpath,
+             $self->conf()->param('lsf_opt_synteny_rescore') );
+
+  # run lsf job array
+  $self->logger->info("Submitting $num_jobs jobs to lsf.\n");
+  $self->logger->debug("$cmd\n\n");
+
+  local *BSUB;
+  open( BSUB, $bsub_cmd )
+    or $self->logger->error("Could not open open pipe to bsub: $!\n");
+
+  print BSUB $cmd;
+  $self->logger->error("Error submitting synteny rescoring jobs: $!\n")
+    unless ($? == 0); 
+  close BSUB;
+
+  # submit dependent job to monitor finishing of jobs
+  $self->logger->info("Waiting for jobs to finish...\n", 0, 'stamped');
+
+  my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
+    qq{-o $logpath/synteny_rescore_depend.out /bin/true};
+
+  system($dependent_job) == 0 or
+    $self->logger->error("Error submitting dependent job: $!\n");
+
+  $self->logger->info("All jobs finished.\n", 0, 'stamped');
+
+  # check for lsf errors
+  sleep(5);
+  my $err;
+  foreach my $i (1..$num_jobs) {
+    $err++ unless (-e "$logpath/synteny_rescore.$i.success");
+  }
+
+  if ($err) {
+    $self->logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n");
+  }
+
+  # merge and return matrix
+  $self->logger->debug("Merging rescored matrices...\n");
+  $matrix->flush;
+
+  foreach my $i (1..$num_jobs) {
+    # read partial matrix created by lsf job from file
+    my $sub_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
+      -DUMP_PATH   => $dump_path,
+      -CACHE_FILE  => "gene_matrix_synteny$i.ser",
+    );
+    $sub_matrix->read_from_file;
+
+    # merge with main matrix
+    $matrix->merge($sub_matrix);
+  }
+
+  $self->logger->debug("Done.\n");
+  $self->logger->debug("Scores after rescoring: ".$matrix->size.".\n");
+  
+  return $matrix;
+}
+
+
+# 
+#
+=head2 rescore_gene_matrix
+
+  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
+                scores to rescore
+  Example     : my $new_scores = $sf->rescore_gene_matrix($gene_scores);
+  Description : Rescores a gene matrix. Retains 70% of old score and builds
+                other 30% from the synteny match.
+  Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
+  Exceptions  : thrown on wrong or missing argument
+  Caller      : InternalIdMapper plugins
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub rescore_gene_matrix {
+  my $self = shift;
+  my $matrix = shift;
+
+  unless ($matrix and
+          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
+    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
+  }
+
+  my $retain_factor = 0.7;
+
+  foreach my $entry (@{ $matrix->get_all_Entries }) {
+    my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
+      $entry->source);
+
+    my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
+      $entry->target);
+
+    my $highest_score = 0;
+
+    foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
+      my $score = $sr->score_location_relationship($source_gene, $target_gene);
+      $highest_score = $score if ($score > $highest_score);
+    }
+
+    #$self->logger->debug("highscore ".$entry->to_string." ".
+    #  sprintf("%.6f\n", $highest_score));
+
+    $matrix->set_score($entry->source, $entry->target,
+      ($entry->score * 0.7 + $highest_score * 0.3));
+  }
+
+  return $matrix;
+}
+
+
+=head2 logger
+
+  Arg[1]      : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set
+  Example     : $object->logger->info("Starting ID mapping.\n");
+  Description : Getter/setter for logger object
+  Return type : Bio::EnsEMBL::Utils::Logger
+  Exceptions  : none
+  Caller      : constructor
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub logger {
+  my $self = shift;
+  $self->{'_logger'} = shift if (@_);
+  return $self->{'_logger'};
+}
+
+
+=head2 conf
+
+  Arg[1]      : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration
+                to set
+  Example     : my $basedir = $object->conf->param('basedir');
+  Description : Getter/setter for configuration object
+  Return type : Bio::EnsEMBL::Utils::ConfParser
+  Exceptions  : none
+  Caller      : constructor
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub conf {
+  my $self = shift;
+  $self->{'_conf'} = shift if (@_);
+  return $self->{'_conf'};
+}
+
+
+=head2 cache
+
+  Arg[1]      : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set
+  Example     : $object->cache->read_from_file('source');
+  Description : Getter/setter for cache object
+  Return type : Bio::EnsEMBL::IdMapping::Cache
+  Exceptions  : none
+  Caller      : constructor
+  Status      : At Risk
+              : under development
+
+=cut
+
+sub cache {
+  my $self = shift;
+  $self->{'_cache'} = shift if (@_);
+  return $self->{'_cache'};
+}
+
+
+1;
+