comparison variant_effect_predictor/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 =head1 SYNOPSIS
24
25 =head1 DESCRIPTION
26
27 =head1 METHODS
28
29 =cut
30
31
32 package Bio::EnsEMBL::IdMapping::InternalIdMapper;
33
34 use strict;
35 use warnings;
36 no warnings 'uninitialized';
37
38 use Bio::EnsEMBL::IdMapping::BaseObject;
39 our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
40
41 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
42 use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append);
43 use Bio::EnsEMBL::IdMapping::Entry;
44 use Bio::EnsEMBL::IdMapping::MappingList;
45 use Bio::EnsEMBL::IdMapping::SyntenyFramework;
46
47
48 # scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this
49 use constant SIMILAR_SCORE_RATIO => 0.01;
50
51
52 sub map_genes {
53 my $self = shift;
54 my $gene_scores = shift;
55 my $transcript_scores = shift;
56 my $gsb = shift;
57
58 # argument checks
59 unless ($gene_scores and
60 $gene_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
61 throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
62 }
63
64 unless ($transcript_scores and
65 $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
66 throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
67 }
68
69 unless ($gsb and
70 $gsb->isa('Bio::EnsEMBL::IdMapping::GeneScoreBuilder')) {
71 throw('Need a Bio::EnsEMBL::IdMapping::GeneScoreBuilder.');
72 }
73
74 $self->logger->info("== Internal ID mapping for genes...\n\n", 0, 'stamped');
75
76 my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
77
78 my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
79 -DUMP_PATH => $dump_path,
80 -CACHE_FILE => 'gene_mappings.ser',
81 );
82
83 my $mapping_cache = $mappings->cache_file;
84
85 if (-s $mapping_cache) {
86
87 # read from file
88 $self->logger->info("Reading gene mappings from file...\n", 0, 'stamped');
89 $self->logger->debug("Cache file $mapping_cache.\n", 1);
90 $mappings->read_from_file;
91 $self->logger->info("Done.\n\n", 0, 'stamped');
92
93 } else {
94
95 # create gene mappings
96 $self->logger->info("No gene mappings found. Will calculate them now.\n");
97
98 # determine which plugin methods to run
99 my @default_plugins = (qw(
100 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic
101 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript
102 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype
103 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny
104 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id
105 ));
106
107 my @plugins = $self->conf->param('plugin_internal_id_mappers_gene');
108 @plugins = @default_plugins unless (defined($plugins[0]));
109
110 my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
111 -DUMP_PATH => $dump_path,
112 -CACHE_FILE => 'gene_mappings0.ser',
113 );
114 my @mappings = ();
115 my $i = 0;
116
117 #
118 # run the scoring chain
119 #
120 foreach my $plugin (@plugins) {
121 ($gene_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
122 $gsb, $new_mappings, $gene_scores, $transcript_scores);
123
124 push(@mappings, $new_mappings);
125 }
126
127 # report remaining ambiguities
128 $self->logger->info($gene_scores->get_source_count.
129 " source genes are ambiguous with ".
130 $gene_scores->get_target_count." target genes.\n\n");
131
132 $self->log_ambiguous($gene_scores, 'gene');
133
134 # merge mappings and write to file
135 $mappings->add_all(@mappings);
136 $mappings->write_to_file;
137
138 if ($self->logger->loglevel eq 'debug') {
139 $mappings->log('gene', $self->conf->param('basedir'));
140 }
141
142 $self->logger->info("Done.\n\n", 0, 'stamped');
143
144 }
145
146 return $mappings;
147 }
148
149
150 sub map_transcripts {
151 my $self = shift;
152 my $transcript_scores = shift;
153 my $gene_mappings = shift;
154 my $tsb = shift;
155
156 # argument checks
157 unless ($transcript_scores and
158 $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
159 throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
160 }
161
162 unless ($gene_mappings and
163 $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
164 throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
165 }
166
167 unless ($tsb and
168 $tsb->isa('Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder')) {
169 throw('Need a Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder.');
170 }
171
172 $self->logger->info("== Internal ID mapping for transcripts...\n\n", 0, 'stamped');
173
174 my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
175
176 my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
177 -DUMP_PATH => $dump_path,
178 -CACHE_FILE => 'transcript_mappings.ser',
179 );
180
181 my $mapping_cache = $mappings->cache_file;
182
183 if (-s $mapping_cache) {
184
185 # read from file
186 $self->logger->info("Reading transcript mappings from file...\n", 0,
187 'stamped');
188 $self->logger->debug("Cache file $mapping_cache.\n", 1);
189 $mappings->read_from_file;
190 $self->logger->info("Done.\n\n", 0, 'stamped');
191
192 } else {
193
194 # create transcript mappings
195 $self->logger->info("No transcript mappings found. Will calculate them now.\n");
196
197 # determine which plugin methods to run
198 my @default_plugins = (qw(
199 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic
200 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation
201 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::biotype
202 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene
203 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene
204 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id
205 ));
206
207 my @plugins = $self->conf->param('plugin_internal_id_mappers_transcript');
208 @plugins = @default_plugins unless (defined($plugins[0]));
209
210 my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
211 -DUMP_PATH => $dump_path,
212 -CACHE_FILE => 'transcript_mappings0.ser',
213 );
214 my @mappings = ();
215 my $i = 0;
216
217 #
218 # run the scoring chain
219 #
220 foreach my $plugin (@plugins) {
221 ($transcript_scores, $new_mappings) = $self->delegate_to_plugin($plugin,
222 $i++, $tsb, $new_mappings, $transcript_scores, $gene_mappings);
223
224 push(@mappings, $new_mappings);
225 }
226
227 # report remaining ambiguities
228 $self->logger->info($transcript_scores->get_source_count.
229 " source transcripts are ambiguous with ".
230 $transcript_scores->get_target_count." target transcripts.\n\n");
231
232 $self->log_ambiguous($transcript_scores, 'transcript');
233
234 # merge mappings and write to file
235 $mappings->add_all(@mappings);
236 $mappings->write_to_file;
237
238 if ($self->logger->loglevel eq 'debug') {
239 $mappings->log('transcript', $self->conf->param('basedir'));
240 }
241
242 $self->logger->info("Done.\n\n", 0, 'stamped');
243
244 }
245
246 return $mappings;
247
248 }
249
250
251 sub map_exons {
252 my $self = shift;
253 my $exon_scores = shift;
254 my $transcript_mappings = shift;
255 my $esb = shift;
256
257 # argument checks
258 unless ($exon_scores and
259 $exon_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
260 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
261 }
262
263 unless ($transcript_mappings and
264 $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
265 throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
266 }
267
268 unless ($esb and
269 $esb->isa('Bio::EnsEMBL::IdMapping::ExonScoreBuilder')) {
270 throw('Need a Bio::EnsEMBL::IdMapping::ExonScoreBuilder.');
271 }
272
273 $self->logger->info("== Internal ID mapping for exons...\n\n", 0, 'stamped');
274
275 my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
276
277 my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
278 -DUMP_PATH => $dump_path,
279 -CACHE_FILE => 'exon_mappings.ser',
280 );
281
282 my $mapping_cache = $mappings->cache_file;
283
284 if (-s $mapping_cache) {
285
286 # read from file
287 $self->logger->info("Reading exon mappings from file...\n", 0,
288 'stamped');
289 $self->logger->debug("Cache file $mapping_cache.\n", 1);
290 $mappings->read_from_file;
291 $self->logger->info("Done.\n\n", 0, 'stamped');
292
293 } else {
294
295 # create exon mappings
296 $self->logger->info("No exon mappings found. Will calculate them now.\n");
297
298 # determine which plugin methods to run
299 my @default_plugins = (qw(
300 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic
301 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript
302 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::internal_id
303 ));
304
305 my @plugins = $self->conf->param('plugin_internal_id_mappers_exon');
306 @plugins = @default_plugins unless (defined($plugins[0]));
307
308 my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
309 -DUMP_PATH => $dump_path,
310 -CACHE_FILE => 'exon_mappings0.ser',
311 );
312 my @mappings = ();
313 my $i = 0;
314
315 #
316 # run the scoring chain
317 #
318 foreach my $plugin (@plugins) {
319 ($exon_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
320 $esb, $new_mappings, $exon_scores);
321
322 push(@mappings, $new_mappings);
323 }
324
325 # report remaining ambiguities
326 $self->logger->info($exon_scores->get_source_count.
327 " source exons are ambiguous with ".
328 $exon_scores->get_target_count." target exons.\n\n");
329
330 $self->log_ambiguous($exon_scores, 'exon');
331
332 # merge mappings and write to file
333 $mappings->add_all(@mappings);
334 $mappings->write_to_file;
335
336 if ($self->logger->loglevel eq 'debug') {
337 $mappings->log('exon', $self->conf->param('basedir'));
338 }
339
340 $self->logger->info("Done.\n\n", 0, 'stamped');
341
342 }
343
344 return $mappings;
345
346 }
347
348
349 #
350 # this is not implemented as a plugin, since a) it's too simple and b) it's
351 # tied to transcripts so there are no translation scores or score builder.
352 #
353 sub map_translations {
354 my $self = shift;
355 my $transcript_mappings = shift;
356
357 # argument checks
358 unless ($transcript_mappings and
359 $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
360 throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
361 }
362
363 $self->logger->info("== Internal ID mapping for translations...\n\n", 0, 'stamped');
364
365 my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
366
367 my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
368 -DUMP_PATH => $dump_path,
369 -CACHE_FILE => 'translation_mappings.ser',
370 );
371
372 my $mapping_cache = $mappings->cache_file;
373
374 if (-s $mapping_cache) {
375
376 # read from file
377 $self->logger->info("Reading translation mappings from file...\n", 0,
378 'stamped');
379 $self->logger->debug("Cache file $mapping_cache.\n", 1);
380 $mappings->read_from_file;
381 $self->logger->info("Done.\n\n", 0, 'stamped');
382
383 } else {
384
385 # create translation mappings
386 $self->logger->info("No translation mappings found. Will calculate them now.\n");
387
388 $self->logger->info("Translation mapping...\n", 0, 'stamped');
389
390 #
391 # map translations for mapped transcripts
392 #
393 my $i = 0;
394
395 foreach my $entry (@{ $transcript_mappings->get_all_Entries }) {
396
397 my $source_tl = $self->cache->get_by_key('transcripts_by_id',
398 'source', $entry->source)->translation;
399 my $target_tl = $self->cache->get_by_key('transcripts_by_id',
400 'target', $entry->target)->translation;
401
402 if ($source_tl and $target_tl) {
403
404 # add mapping for the translations; note that the score is taken from
405 # the transcript mapping
406 my $tl_entry = Bio::EnsEMBL::IdMapping::Entry->new_fast([
407 $source_tl->id, $target_tl->id, $entry->score
408 ]);
409 $mappings->add_Entry($tl_entry);
410
411 } else {
412 $i++;
413 }
414
415 }
416
417 $self->logger->debug("Skipped transcripts without translation: $i\n", 1);
418 $self->logger->info("New mappings: ".$mappings->get_entry_count."\n\n");
419
420 $mappings->write_to_file;
421
422 if ($self->logger->loglevel eq 'debug') {
423 $mappings->log('translation', $self->conf->param('basedir'));
424 }
425
426 $self->logger->info("Done.\n\n", 0, 'stamped');
427
428 }
429
430 return $mappings;
431
432 }
433
434
435 sub delegate_to_plugin {
436 my $self = shift;
437 my $plugin = shift;
438 my $num = shift;
439 my $score_builder = shift;
440 my $mappings = shift;
441 my $scores = shift;
442
443 # argument checks
444 unless ($score_builder and
445 $score_builder->isa('Bio::EnsEMBL::IdMapping::ScoreBuilder')) {
446 throw('Need a Bio::EnsEMBL::IdMapping::ScoreBuilder.');
447 }
448
449 unless ($mappings and
450 $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
451 throw('Need a Bio::EnsEMBL::IdMapping::MappingList.');
452 }
453
454 unless ($scores and
455 $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
456 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
457 }
458
459 # split plugin name into module and method
460 $plugin =~ /(.*)::(\w+)$/;
461 my $module = $1;
462 my $method = $2;
463
464 unless ($module and $method) {
465 throw("Unable to determine module and method name from $plugin.\n");
466 }
467
468 # instantiate the plugin unless we already have an instance
469 my $plugin_instance;
470 if ($self->has_plugin($module)) {
471
472 # re-use an existing plugin instance
473 $plugin_instance = $self->get_plugin($module);
474
475 } else {
476
477 # inject and instantiate the plugin module
478 inject($module);
479 $plugin_instance = $module->new(
480 -LOGGER => $self->logger,
481 -CONF => $self->conf,
482 -CACHE => $self->cache
483 );
484 $self->add_plugin($plugin_instance);
485
486 }
487
488 # run the method on the plugin
489 #
490 # pass in a sequence number (number of method run, used for generating
491 # checkpoint files), the scores used for determining the mapping, and all
492 # other arguments passed to this method (these will vary for different object
493 # types)
494 #
495 # return the scores and mappings to feed into the next plugin in the chain
496 return $plugin_instance->$method($num, $score_builder, $mappings, $scores, @_);
497 }
498
499
500 sub has_plugin {
501 my $self = shift;
502 my $module = shift;
503
504 defined($self->{'_plugins'}->{$module}) ? (return 1) : (return 0);
505 }
506
507
508 sub get_plugin {
509 my $self = shift;
510 my $module = shift;
511
512 return $self->{'_plugins'}->{$module};
513 }
514
515
516 sub add_plugin {
517 my $self = shift;
518 my $plugin_instance = shift;
519
520 $self->{'_plugins'}->{ref($plugin_instance)} = $plugin_instance;
521 }
522
523
524 sub log_ambiguous {
525 my $self = shift;
526 my $matrix = shift;
527 my $type = shift;
528
529 unless ($matrix and
530 $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
531 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
532 }
533
534 # create dump directory if it doesn't exist
535 my $debug_path = $self->conf->param('basedir').'/debug';
536 unless (-d $debug_path) {
537 system("mkdir -p $debug_path") == 0 or
538 throw("Unable to create directory $debug_path.\n");
539 }
540
541 my $logfile = "$debug_path/ambiguous_${type}.txt";
542
543 open(my $fh, '>', $logfile) or
544 throw("Unable to open $logfile for writing: $!");
545
546 my @low_scoring = ();
547 my @high_scoring = ();
548 my $last_id;
549
550 # log by source
551 foreach my $entry (sort { $a->source <=> $b->source }
552 @{ $matrix->get_all_Entries }) {
553
554 $last_id ||= $entry->target;
555
556 if ($last_id != $entry->source) {
557 $self->write_ambiguous($type, 'source', $fh, \@low_scoring,
558 \@high_scoring);
559 $last_id = $entry->source;
560 }
561
562 if ($entry->score < 0.5) {
563 push @low_scoring, $entry;
564 } else {
565 push @high_scoring, $entry;
566 }
567 }
568
569 # write last source
570 $self->write_ambiguous($type, 'source', $fh, \@low_scoring, \@high_scoring);
571
572 # now do the same by target
573 $last_id = undef;
574 foreach my $entry (sort { $a->target <=> $b->target }
575 @{ $matrix->get_all_Entries }) {
576
577 $last_id ||= $entry->target;
578
579 if ($last_id != $entry->target) {
580 $self->write_ambiguous($type, 'target', $fh, \@low_scoring,
581 \@high_scoring);
582 $last_id = $entry->target;
583 }
584
585 if ($entry->score < 0.5) {
586 push @low_scoring, $entry;
587 } else {
588 push @high_scoring, $entry;
589 }
590 }
591
592 # write last target
593 $self->write_ambiguous($type, 'target', $fh, \@low_scoring, \@high_scoring);
594
595 close($fh);
596 }
597
598
599 sub write_ambiguous {
600 my ($self, $type, $db_type, $fh, $low, $high) = @_;
601
602 # if only source or target are ambiguous (i.e. you have only one mapping from
603 # this perspective) then log from the other perspective
604 if (scalar(@$low) + scalar(@$high) <= 1) {
605 @$low = ();
606 @$high = ();
607 return;
608 }
609
610 my $first_id;
611 if (@$low) {
612 $first_id = $low->[0]->$db_type;
613 } else {
614 $first_id = $high->[0]->$db_type;
615 }
616
617 my $other_db_type;
618 if ($db_type eq 'source') {
619 $other_db_type = 'target';
620 } else {
621 $other_db_type = 'source';
622 }
623
624 print $fh "$db_type $type $first_id scores ambiguously:\n";
625
626 # high scorers
627 if (@$high) {
628 print $fh " high scoring ${other_db_type}s\n";
629
630 while (my $e = shift(@$high)) {
631 print $fh " ", $e->$other_db_type, " ", $e->score, "\n";
632 }
633 }
634
635 # low scorers
636 if (@$low) {
637 print $fh " low scoring ${other_db_type}s\n ";
638
639 my $i = 1;
640
641 while (my $e = shift(@$low)) {
642 print $fh "\n " unless (($i++)%10);
643 print $fh $e->$other_db_type, ", ";
644 }
645 }
646
647 print $fh "\n";
648 }
649
650
651 1;
652