Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,652 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::IdMapping::InternalIdMapper; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append); +use Bio::EnsEMBL::IdMapping::Entry; +use Bio::EnsEMBL::IdMapping::MappingList; +use Bio::EnsEMBL::IdMapping::SyntenyFramework; + + +# scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this +use constant SIMILAR_SCORE_RATIO => 0.01; + + +sub map_genes { + my $self = shift; + my $gene_scores = shift; + my $transcript_scores = shift; + my $gsb = shift; + + # argument checks + unless ($gene_scores and + $gene_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($transcript_scores and + $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($gsb and + $gsb->isa('Bio::EnsEMBL::IdMapping::GeneScoreBuilder')) { + throw('Need a Bio::EnsEMBL::IdMapping::GeneScoreBuilder.'); + } + + $self->logger->info("== Internal ID mapping for genes...\n\n", 0, 'stamped'); + + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings.ser', + ); + + my $mapping_cache = $mappings->cache_file; + + if (-s $mapping_cache) { + + # read from file + $self->logger->info("Reading gene mappings from file...\n", 0, 'stamped'); + $self->logger->debug("Cache file $mapping_cache.\n", 1); + $mappings->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # create gene mappings + $self->logger->info("No gene mappings found. Will calculate them now.\n"); + + # determine which plugin methods to run + my @default_plugins = (qw( + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id + )); + + my @plugins = $self->conf->param('plugin_internal_id_mappers_gene'); + @plugins = @default_plugins unless (defined($plugins[0])); + + my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings0.ser', + ); + my @mappings = (); + my $i = 0; + + # + # run the scoring chain + # + foreach my $plugin (@plugins) { + ($gene_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++, + $gsb, $new_mappings, $gene_scores, $transcript_scores); + + push(@mappings, $new_mappings); + } + + # report remaining ambiguities + $self->logger->info($gene_scores->get_source_count. + " source genes are ambiguous with ". + $gene_scores->get_target_count." target genes.\n\n"); + + $self->log_ambiguous($gene_scores, 'gene'); + + # merge mappings and write to file + $mappings->add_all(@mappings); + $mappings->write_to_file; + + if ($self->logger->loglevel eq 'debug') { + $mappings->log('gene', $self->conf->param('basedir')); + } + + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } + + return $mappings; +} + + +sub map_transcripts { + my $self = shift; + my $transcript_scores = shift; + my $gene_mappings = shift; + my $tsb = shift; + + # argument checks + unless ($transcript_scores and + $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + unless ($gene_mappings and + $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); + } + + unless ($tsb and + $tsb->isa('Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder')) { + throw('Need a Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder.'); + } + + $self->logger->info("== Internal ID mapping for transcripts...\n\n", 0, 'stamped'); + + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'transcript_mappings.ser', + ); + + my $mapping_cache = $mappings->cache_file; + + if (-s $mapping_cache) { + + # read from file + $self->logger->info("Reading transcript mappings from file...\n", 0, + 'stamped'); + $self->logger->debug("Cache file $mapping_cache.\n", 1); + $mappings->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # create transcript mappings + $self->logger->info("No transcript mappings found. Will calculate them now.\n"); + + # determine which plugin methods to run + my @default_plugins = (qw( + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::biotype + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id + )); + + my @plugins = $self->conf->param('plugin_internal_id_mappers_transcript'); + @plugins = @default_plugins unless (defined($plugins[0])); + + my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'transcript_mappings0.ser', + ); + my @mappings = (); + my $i = 0; + + # + # run the scoring chain + # + foreach my $plugin (@plugins) { + ($transcript_scores, $new_mappings) = $self->delegate_to_plugin($plugin, + $i++, $tsb, $new_mappings, $transcript_scores, $gene_mappings); + + push(@mappings, $new_mappings); + } + + # report remaining ambiguities + $self->logger->info($transcript_scores->get_source_count. + " source transcripts are ambiguous with ". + $transcript_scores->get_target_count." target transcripts.\n\n"); + + $self->log_ambiguous($transcript_scores, 'transcript'); + + # merge mappings and write to file + $mappings->add_all(@mappings); + $mappings->write_to_file; + + if ($self->logger->loglevel eq 'debug') { + $mappings->log('transcript', $self->conf->param('basedir')); + } + + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } + + return $mappings; + +} + + +sub map_exons { + my $self = shift; + my $exon_scores = shift; + my $transcript_mappings = shift; + my $esb = shift; + + # argument checks + unless ($exon_scores and + $exon_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.'); + } + + unless ($transcript_mappings and + $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.'); + } + + unless ($esb and + $esb->isa('Bio::EnsEMBL::IdMapping::ExonScoreBuilder')) { + throw('Need a Bio::EnsEMBL::IdMapping::ExonScoreBuilder.'); + } + + $self->logger->info("== Internal ID mapping for exons...\n\n", 0, 'stamped'); + + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'exon_mappings.ser', + ); + + my $mapping_cache = $mappings->cache_file; + + if (-s $mapping_cache) { + + # read from file + $self->logger->info("Reading exon mappings from file...\n", 0, + 'stamped'); + $self->logger->debug("Cache file $mapping_cache.\n", 1); + $mappings->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # create exon mappings + $self->logger->info("No exon mappings found. Will calculate them now.\n"); + + # determine which plugin methods to run + my @default_plugins = (qw( + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::internal_id + )); + + my @plugins = $self->conf->param('plugin_internal_id_mappers_exon'); + @plugins = @default_plugins unless (defined($plugins[0])); + + my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'exon_mappings0.ser', + ); + my @mappings = (); + my $i = 0; + + # + # run the scoring chain + # + foreach my $plugin (@plugins) { + ($exon_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++, + $esb, $new_mappings, $exon_scores); + + push(@mappings, $new_mappings); + } + + # report remaining ambiguities + $self->logger->info($exon_scores->get_source_count. + " source exons are ambiguous with ". + $exon_scores->get_target_count." target exons.\n\n"); + + $self->log_ambiguous($exon_scores, 'exon'); + + # merge mappings and write to file + $mappings->add_all(@mappings); + $mappings->write_to_file; + + if ($self->logger->loglevel eq 'debug') { + $mappings->log('exon', $self->conf->param('basedir')); + } + + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } + + return $mappings; + +} + + +# +# this is not implemented as a plugin, since a) it's too simple and b) it's +# tied to transcripts so there are no translation scores or score builder. +# +sub map_translations { + my $self = shift; + my $transcript_mappings = shift; + + # argument checks + unless ($transcript_mappings and + $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.'); + } + + $self->logger->info("== Internal ID mapping for translations...\n\n", 0, 'stamped'); + + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'translation_mappings.ser', + ); + + my $mapping_cache = $mappings->cache_file; + + if (-s $mapping_cache) { + + # read from file + $self->logger->info("Reading translation mappings from file...\n", 0, + 'stamped'); + $self->logger->debug("Cache file $mapping_cache.\n", 1); + $mappings->read_from_file; + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } else { + + # create translation mappings + $self->logger->info("No translation mappings found. Will calculate them now.\n"); + + $self->logger->info("Translation mapping...\n", 0, 'stamped'); + + # + # map translations for mapped transcripts + # + my $i = 0; + + foreach my $entry (@{ $transcript_mappings->get_all_Entries }) { + + my $source_tl = $self->cache->get_by_key('transcripts_by_id', + 'source', $entry->source)->translation; + my $target_tl = $self->cache->get_by_key('transcripts_by_id', + 'target', $entry->target)->translation; + + if ($source_tl and $target_tl) { + + # add mapping for the translations; note that the score is taken from + # the transcript mapping + my $tl_entry = Bio::EnsEMBL::IdMapping::Entry->new_fast([ + $source_tl->id, $target_tl->id, $entry->score + ]); + $mappings->add_Entry($tl_entry); + + } else { + $i++; + } + + } + + $self->logger->debug("Skipped transcripts without translation: $i\n", 1); + $self->logger->info("New mappings: ".$mappings->get_entry_count."\n\n"); + + $mappings->write_to_file; + + if ($self->logger->loglevel eq 'debug') { + $mappings->log('translation', $self->conf->param('basedir')); + } + + $self->logger->info("Done.\n\n", 0, 'stamped'); + + } + + return $mappings; + +} + + +sub delegate_to_plugin { + my $self = shift; + my $plugin = shift; + my $num = shift; + my $score_builder = shift; + my $mappings = shift; + my $scores = shift; + + # argument checks + unless ($score_builder and + $score_builder->isa('Bio::EnsEMBL::IdMapping::ScoreBuilder')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoreBuilder.'); + } + + unless ($mappings and + $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a Bio::EnsEMBL::IdMapping::MappingList.'); + } + + unless ($scores and + $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # split plugin name into module and method + $plugin =~ /(.*)::(\w+)$/; + my $module = $1; + my $method = $2; + + unless ($module and $method) { + throw("Unable to determine module and method name from $plugin.\n"); + } + + # instantiate the plugin unless we already have an instance + my $plugin_instance; + if ($self->has_plugin($module)) { + + # re-use an existing plugin instance + $plugin_instance = $self->get_plugin($module); + + } else { + + # inject and instantiate the plugin module + inject($module); + $plugin_instance = $module->new( + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache + ); + $self->add_plugin($plugin_instance); + + } + + # run the method on the plugin + # + # pass in a sequence number (number of method run, used for generating + # checkpoint files), the scores used for determining the mapping, and all + # other arguments passed to this method (these will vary for different object + # types) + # + # return the scores and mappings to feed into the next plugin in the chain + return $plugin_instance->$method($num, $score_builder, $mappings, $scores, @_); +} + + +sub has_plugin { + my $self = shift; + my $module = shift; + + defined($self->{'_plugins'}->{$module}) ? (return 1) : (return 0); +} + + +sub get_plugin { + my $self = shift; + my $module = shift; + + return $self->{'_plugins'}->{$module}; +} + + +sub add_plugin { + my $self = shift; + my $plugin_instance = shift; + + $self->{'_plugins'}->{ref($plugin_instance)} = $plugin_instance; +} + + +sub log_ambiguous { + my $self = shift; + my $matrix = shift; + my $type = shift; + + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + # create dump directory if it doesn't exist + my $debug_path = $self->conf->param('basedir').'/debug'; + unless (-d $debug_path) { + system("mkdir -p $debug_path") == 0 or + throw("Unable to create directory $debug_path.\n"); + } + + my $logfile = "$debug_path/ambiguous_${type}.txt"; + + open(my $fh, '>', $logfile) or + throw("Unable to open $logfile for writing: $!"); + + my @low_scoring = (); + my @high_scoring = (); + my $last_id; + + # log by source + foreach my $entry (sort { $a->source <=> $b->source } + @{ $matrix->get_all_Entries }) { + + $last_id ||= $entry->target; + + if ($last_id != $entry->source) { + $self->write_ambiguous($type, 'source', $fh, \@low_scoring, + \@high_scoring); + $last_id = $entry->source; + } + + if ($entry->score < 0.5) { + push @low_scoring, $entry; + } else { + push @high_scoring, $entry; + } + } + + # write last source + $self->write_ambiguous($type, 'source', $fh, \@low_scoring, \@high_scoring); + + # now do the same by target + $last_id = undef; + foreach my $entry (sort { $a->target <=> $b->target } + @{ $matrix->get_all_Entries }) { + + $last_id ||= $entry->target; + + if ($last_id != $entry->target) { + $self->write_ambiguous($type, 'target', $fh, \@low_scoring, + \@high_scoring); + $last_id = $entry->target; + } + + if ($entry->score < 0.5) { + push @low_scoring, $entry; + } else { + push @high_scoring, $entry; + } + } + + # write last target + $self->write_ambiguous($type, 'target', $fh, \@low_scoring, \@high_scoring); + + close($fh); +} + + +sub write_ambiguous { + my ($self, $type, $db_type, $fh, $low, $high) = @_; + + # if only source or target are ambiguous (i.e. you have only one mapping from + # this perspective) then log from the other perspective + if (scalar(@$low) + scalar(@$high) <= 1) { + @$low = (); + @$high = (); + return; + } + + my $first_id; + if (@$low) { + $first_id = $low->[0]->$db_type; + } else { + $first_id = $high->[0]->$db_type; + } + + my $other_db_type; + if ($db_type eq 'source') { + $other_db_type = 'target'; + } else { + $other_db_type = 'source'; + } + + print $fh "$db_type $type $first_id scores ambiguously:\n"; + + # high scorers + if (@$high) { + print $fh " high scoring ${other_db_type}s\n"; + + while (my $e = shift(@$high)) { + print $fh " ", $e->$other_db_type, " ", $e->score, "\n"; + } + } + + # low scorers + if (@$low) { + print $fh " low scoring ${other_db_type}s\n "; + + my $i = 1; + + while (my $e = shift(@$low)) { + print $fh "\n " unless (($i++)%10); + print $fh $e->$other_db_type, ", "; + } + } + + print $fh "\n"; +} + + +1; +