0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 =head1 SYNOPSIS
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27 =head1 METHODS
|
|
28
|
|
29 =cut
|
|
30
|
|
31
|
|
32 package Bio::EnsEMBL::IdMapping::InternalIdMapper;
|
|
33
|
|
34 use strict;
|
|
35 use warnings;
|
|
36 no warnings 'uninitialized';
|
|
37
|
|
38 use Bio::EnsEMBL::IdMapping::BaseObject;
|
|
39 our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
|
|
40
|
|
41 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
42 use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append);
|
|
43 use Bio::EnsEMBL::IdMapping::Entry;
|
|
44 use Bio::EnsEMBL::IdMapping::MappingList;
|
|
45 use Bio::EnsEMBL::IdMapping::SyntenyFramework;
|
|
46
|
|
47
|
|
48 # scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this
|
|
49 use constant SIMILAR_SCORE_RATIO => 0.01;
|
|
50
|
|
51
|
|
52 sub map_genes {
|
|
53 my $self = shift;
|
|
54 my $gene_scores = shift;
|
|
55 my $transcript_scores = shift;
|
|
56 my $gsb = shift;
|
|
57
|
|
58 # argument checks
|
|
59 unless ($gene_scores and
|
|
60 $gene_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
|
|
61 throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
|
|
62 }
|
|
63
|
|
64 unless ($transcript_scores and
|
|
65 $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
|
|
66 throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
|
|
67 }
|
|
68
|
|
69 unless ($gsb and
|
|
70 $gsb->isa('Bio::EnsEMBL::IdMapping::GeneScoreBuilder')) {
|
|
71 throw('Need a Bio::EnsEMBL::IdMapping::GeneScoreBuilder.');
|
|
72 }
|
|
73
|
|
74 $self->logger->info("== Internal ID mapping for genes...\n\n", 0, 'stamped');
|
|
75
|
|
76 my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
|
|
77
|
|
78 my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
|
|
79 -DUMP_PATH => $dump_path,
|
|
80 -CACHE_FILE => 'gene_mappings.ser',
|
|
81 );
|
|
82
|
|
83 my $mapping_cache = $mappings->cache_file;
|
|
84
|
|
85 if (-s $mapping_cache) {
|
|
86
|
|
87 # read from file
|
|
88 $self->logger->info("Reading gene mappings from file...\n", 0, 'stamped');
|
|
89 $self->logger->debug("Cache file $mapping_cache.\n", 1);
|
|
90 $mappings->read_from_file;
|
|
91 $self->logger->info("Done.\n\n", 0, 'stamped');
|
|
92
|
|
93 } else {
|
|
94
|
|
95 # create gene mappings
|
|
96 $self->logger->info("No gene mappings found. Will calculate them now.\n");
|
|
97
|
|
98 # determine which plugin methods to run
|
|
99 my @default_plugins = (qw(
|
|
100 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic
|
|
101 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript
|
|
102 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype
|
|
103 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny
|
|
104 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id
|
|
105 ));
|
|
106
|
|
107 my @plugins = $self->conf->param('plugin_internal_id_mappers_gene');
|
|
108 @plugins = @default_plugins unless (defined($plugins[0]));
|
|
109
|
|
110 my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
|
|
111 -DUMP_PATH => $dump_path,
|
|
112 -CACHE_FILE => 'gene_mappings0.ser',
|
|
113 );
|
|
114 my @mappings = ();
|
|
115 my $i = 0;
|
|
116
|
|
117 #
|
|
118 # run the scoring chain
|
|
119 #
|
|
120 foreach my $plugin (@plugins) {
|
|
121 ($gene_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
|
|
122 $gsb, $new_mappings, $gene_scores, $transcript_scores);
|
|
123
|
|
124 push(@mappings, $new_mappings);
|
|
125 }
|
|
126
|
|
127 # report remaining ambiguities
|
|
128 $self->logger->info($gene_scores->get_source_count.
|
|
129 " source genes are ambiguous with ".
|
|
130 $gene_scores->get_target_count." target genes.\n\n");
|
|
131
|
|
132 $self->log_ambiguous($gene_scores, 'gene');
|
|
133
|
|
134 # merge mappings and write to file
|
|
135 $mappings->add_all(@mappings);
|
|
136 $mappings->write_to_file;
|
|
137
|
|
138 if ($self->logger->loglevel eq 'debug') {
|
|
139 $mappings->log('gene', $self->conf->param('basedir'));
|
|
140 }
|
|
141
|
|
142 $self->logger->info("Done.\n\n", 0, 'stamped');
|
|
143
|
|
144 }
|
|
145
|
|
146 return $mappings;
|
|
147 }
|
|
148
|
|
149
|
|
150 sub map_transcripts {
|
|
151 my $self = shift;
|
|
152 my $transcript_scores = shift;
|
|
153 my $gene_mappings = shift;
|
|
154 my $tsb = shift;
|
|
155
|
|
156 # argument checks
|
|
157 unless ($transcript_scores and
|
|
158 $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
|
|
159 throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
|
|
160 }
|
|
161
|
|
162 unless ($gene_mappings and
|
|
163 $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
|
|
164 throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
|
|
165 }
|
|
166
|
|
167 unless ($tsb and
|
|
168 $tsb->isa('Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder')) {
|
|
169 throw('Need a Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder.');
|
|
170 }
|
|
171
|
|
172 $self->logger->info("== Internal ID mapping for transcripts...\n\n", 0, 'stamped');
|
|
173
|
|
174 my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
|
|
175
|
|
176 my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
|
|
177 -DUMP_PATH => $dump_path,
|
|
178 -CACHE_FILE => 'transcript_mappings.ser',
|
|
179 );
|
|
180
|
|
181 my $mapping_cache = $mappings->cache_file;
|
|
182
|
|
183 if (-s $mapping_cache) {
|
|
184
|
|
185 # read from file
|
|
186 $self->logger->info("Reading transcript mappings from file...\n", 0,
|
|
187 'stamped');
|
|
188 $self->logger->debug("Cache file $mapping_cache.\n", 1);
|
|
189 $mappings->read_from_file;
|
|
190 $self->logger->info("Done.\n\n", 0, 'stamped');
|
|
191
|
|
192 } else {
|
|
193
|
|
194 # create transcript mappings
|
|
195 $self->logger->info("No transcript mappings found. Will calculate them now.\n");
|
|
196
|
|
197 # determine which plugin methods to run
|
|
198 my @default_plugins = (qw(
|
|
199 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic
|
|
200 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation
|
|
201 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::biotype
|
|
202 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene
|
|
203 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene
|
|
204 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id
|
|
205 ));
|
|
206
|
|
207 my @plugins = $self->conf->param('plugin_internal_id_mappers_transcript');
|
|
208 @plugins = @default_plugins unless (defined($plugins[0]));
|
|
209
|
|
210 my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
|
|
211 -DUMP_PATH => $dump_path,
|
|
212 -CACHE_FILE => 'transcript_mappings0.ser',
|
|
213 );
|
|
214 my @mappings = ();
|
|
215 my $i = 0;
|
|
216
|
|
217 #
|
|
218 # run the scoring chain
|
|
219 #
|
|
220 foreach my $plugin (@plugins) {
|
|
221 ($transcript_scores, $new_mappings) = $self->delegate_to_plugin($plugin,
|
|
222 $i++, $tsb, $new_mappings, $transcript_scores, $gene_mappings);
|
|
223
|
|
224 push(@mappings, $new_mappings);
|
|
225 }
|
|
226
|
|
227 # report remaining ambiguities
|
|
228 $self->logger->info($transcript_scores->get_source_count.
|
|
229 " source transcripts are ambiguous with ".
|
|
230 $transcript_scores->get_target_count." target transcripts.\n\n");
|
|
231
|
|
232 $self->log_ambiguous($transcript_scores, 'transcript');
|
|
233
|
|
234 # merge mappings and write to file
|
|
235 $mappings->add_all(@mappings);
|
|
236 $mappings->write_to_file;
|
|
237
|
|
238 if ($self->logger->loglevel eq 'debug') {
|
|
239 $mappings->log('transcript', $self->conf->param('basedir'));
|
|
240 }
|
|
241
|
|
242 $self->logger->info("Done.\n\n", 0, 'stamped');
|
|
243
|
|
244 }
|
|
245
|
|
246 return $mappings;
|
|
247
|
|
248 }
|
|
249
|
|
250
|
|
251 sub map_exons {
|
|
252 my $self = shift;
|
|
253 my $exon_scores = shift;
|
|
254 my $transcript_mappings = shift;
|
|
255 my $esb = shift;
|
|
256
|
|
257 # argument checks
|
|
258 unless ($exon_scores and
|
|
259 $exon_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
|
|
260 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
|
|
261 }
|
|
262
|
|
263 unless ($transcript_mappings and
|
|
264 $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
|
|
265 throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
|
|
266 }
|
|
267
|
|
268 unless ($esb and
|
|
269 $esb->isa('Bio::EnsEMBL::IdMapping::ExonScoreBuilder')) {
|
|
270 throw('Need a Bio::EnsEMBL::IdMapping::ExonScoreBuilder.');
|
|
271 }
|
|
272
|
|
273 $self->logger->info("== Internal ID mapping for exons...\n\n", 0, 'stamped');
|
|
274
|
|
275 my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
|
|
276
|
|
277 my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
|
|
278 -DUMP_PATH => $dump_path,
|
|
279 -CACHE_FILE => 'exon_mappings.ser',
|
|
280 );
|
|
281
|
|
282 my $mapping_cache = $mappings->cache_file;
|
|
283
|
|
284 if (-s $mapping_cache) {
|
|
285
|
|
286 # read from file
|
|
287 $self->logger->info("Reading exon mappings from file...\n", 0,
|
|
288 'stamped');
|
|
289 $self->logger->debug("Cache file $mapping_cache.\n", 1);
|
|
290 $mappings->read_from_file;
|
|
291 $self->logger->info("Done.\n\n", 0, 'stamped');
|
|
292
|
|
293 } else {
|
|
294
|
|
295 # create exon mappings
|
|
296 $self->logger->info("No exon mappings found. Will calculate them now.\n");
|
|
297
|
|
298 # determine which plugin methods to run
|
|
299 my @default_plugins = (qw(
|
|
300 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic
|
|
301 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript
|
|
302 Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::internal_id
|
|
303 ));
|
|
304
|
|
305 my @plugins = $self->conf->param('plugin_internal_id_mappers_exon');
|
|
306 @plugins = @default_plugins unless (defined($plugins[0]));
|
|
307
|
|
308 my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
|
|
309 -DUMP_PATH => $dump_path,
|
|
310 -CACHE_FILE => 'exon_mappings0.ser',
|
|
311 );
|
|
312 my @mappings = ();
|
|
313 my $i = 0;
|
|
314
|
|
315 #
|
|
316 # run the scoring chain
|
|
317 #
|
|
318 foreach my $plugin (@plugins) {
|
|
319 ($exon_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
|
|
320 $esb, $new_mappings, $exon_scores);
|
|
321
|
|
322 push(@mappings, $new_mappings);
|
|
323 }
|
|
324
|
|
325 # report remaining ambiguities
|
|
326 $self->logger->info($exon_scores->get_source_count.
|
|
327 " source exons are ambiguous with ".
|
|
328 $exon_scores->get_target_count." target exons.\n\n");
|
|
329
|
|
330 $self->log_ambiguous($exon_scores, 'exon');
|
|
331
|
|
332 # merge mappings and write to file
|
|
333 $mappings->add_all(@mappings);
|
|
334 $mappings->write_to_file;
|
|
335
|
|
336 if ($self->logger->loglevel eq 'debug') {
|
|
337 $mappings->log('exon', $self->conf->param('basedir'));
|
|
338 }
|
|
339
|
|
340 $self->logger->info("Done.\n\n", 0, 'stamped');
|
|
341
|
|
342 }
|
|
343
|
|
344 return $mappings;
|
|
345
|
|
346 }
|
|
347
|
|
348
|
|
349 #
|
|
350 # this is not implemented as a plugin, since a) it's too simple and b) it's
|
|
351 # tied to transcripts so there are no translation scores or score builder.
|
|
352 #
|
|
353 sub map_translations {
|
|
354 my $self = shift;
|
|
355 my $transcript_mappings = shift;
|
|
356
|
|
357 # argument checks
|
|
358 unless ($transcript_mappings and
|
|
359 $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
|
|
360 throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
|
|
361 }
|
|
362
|
|
363 $self->logger->info("== Internal ID mapping for translations...\n\n", 0, 'stamped');
|
|
364
|
|
365 my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
|
|
366
|
|
367 my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
|
|
368 -DUMP_PATH => $dump_path,
|
|
369 -CACHE_FILE => 'translation_mappings.ser',
|
|
370 );
|
|
371
|
|
372 my $mapping_cache = $mappings->cache_file;
|
|
373
|
|
374 if (-s $mapping_cache) {
|
|
375
|
|
376 # read from file
|
|
377 $self->logger->info("Reading translation mappings from file...\n", 0,
|
|
378 'stamped');
|
|
379 $self->logger->debug("Cache file $mapping_cache.\n", 1);
|
|
380 $mappings->read_from_file;
|
|
381 $self->logger->info("Done.\n\n", 0, 'stamped');
|
|
382
|
|
383 } else {
|
|
384
|
|
385 # create translation mappings
|
|
386 $self->logger->info("No translation mappings found. Will calculate them now.\n");
|
|
387
|
|
388 $self->logger->info("Translation mapping...\n", 0, 'stamped');
|
|
389
|
|
390 #
|
|
391 # map translations for mapped transcripts
|
|
392 #
|
|
393 my $i = 0;
|
|
394
|
|
395 foreach my $entry (@{ $transcript_mappings->get_all_Entries }) {
|
|
396
|
|
397 my $source_tl = $self->cache->get_by_key('transcripts_by_id',
|
|
398 'source', $entry->source)->translation;
|
|
399 my $target_tl = $self->cache->get_by_key('transcripts_by_id',
|
|
400 'target', $entry->target)->translation;
|
|
401
|
|
402 if ($source_tl and $target_tl) {
|
|
403
|
|
404 # add mapping for the translations; note that the score is taken from
|
|
405 # the transcript mapping
|
|
406 my $tl_entry = Bio::EnsEMBL::IdMapping::Entry->new_fast([
|
|
407 $source_tl->id, $target_tl->id, $entry->score
|
|
408 ]);
|
|
409 $mappings->add_Entry($tl_entry);
|
|
410
|
|
411 } else {
|
|
412 $i++;
|
|
413 }
|
|
414
|
|
415 }
|
|
416
|
|
417 $self->logger->debug("Skipped transcripts without translation: $i\n", 1);
|
|
418 $self->logger->info("New mappings: ".$mappings->get_entry_count."\n\n");
|
|
419
|
|
420 $mappings->write_to_file;
|
|
421
|
|
422 if ($self->logger->loglevel eq 'debug') {
|
|
423 $mappings->log('translation', $self->conf->param('basedir'));
|
|
424 }
|
|
425
|
|
426 $self->logger->info("Done.\n\n", 0, 'stamped');
|
|
427
|
|
428 }
|
|
429
|
|
430 return $mappings;
|
|
431
|
|
432 }
|
|
433
|
|
434
|
|
435 sub delegate_to_plugin {
|
|
436 my $self = shift;
|
|
437 my $plugin = shift;
|
|
438 my $num = shift;
|
|
439 my $score_builder = shift;
|
|
440 my $mappings = shift;
|
|
441 my $scores = shift;
|
|
442
|
|
443 # argument checks
|
|
444 unless ($score_builder and
|
|
445 $score_builder->isa('Bio::EnsEMBL::IdMapping::ScoreBuilder')) {
|
|
446 throw('Need a Bio::EnsEMBL::IdMapping::ScoreBuilder.');
|
|
447 }
|
|
448
|
|
449 unless ($mappings and
|
|
450 $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
|
|
451 throw('Need a Bio::EnsEMBL::IdMapping::MappingList.');
|
|
452 }
|
|
453
|
|
454 unless ($scores and
|
|
455 $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
|
|
456 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
|
|
457 }
|
|
458
|
|
459 # split plugin name into module and method
|
|
460 $plugin =~ /(.*)::(\w+)$/;
|
|
461 my $module = $1;
|
|
462 my $method = $2;
|
|
463
|
|
464 unless ($module and $method) {
|
|
465 throw("Unable to determine module and method name from $plugin.\n");
|
|
466 }
|
|
467
|
|
468 # instantiate the plugin unless we already have an instance
|
|
469 my $plugin_instance;
|
|
470 if ($self->has_plugin($module)) {
|
|
471
|
|
472 # re-use an existing plugin instance
|
|
473 $plugin_instance = $self->get_plugin($module);
|
|
474
|
|
475 } else {
|
|
476
|
|
477 # inject and instantiate the plugin module
|
|
478 inject($module);
|
|
479 $plugin_instance = $module->new(
|
|
480 -LOGGER => $self->logger,
|
|
481 -CONF => $self->conf,
|
|
482 -CACHE => $self->cache
|
|
483 );
|
|
484 $self->add_plugin($plugin_instance);
|
|
485
|
|
486 }
|
|
487
|
|
488 # run the method on the plugin
|
|
489 #
|
|
490 # pass in a sequence number (number of method run, used for generating
|
|
491 # checkpoint files), the scores used for determining the mapping, and all
|
|
492 # other arguments passed to this method (these will vary for different object
|
|
493 # types)
|
|
494 #
|
|
495 # return the scores and mappings to feed into the next plugin in the chain
|
|
496 return $plugin_instance->$method($num, $score_builder, $mappings, $scores, @_);
|
|
497 }
|
|
498
|
|
499
|
|
500 sub has_plugin {
|
|
501 my $self = shift;
|
|
502 my $module = shift;
|
|
503
|
|
504 defined($self->{'_plugins'}->{$module}) ? (return 1) : (return 0);
|
|
505 }
|
|
506
|
|
507
|
|
508 sub get_plugin {
|
|
509 my $self = shift;
|
|
510 my $module = shift;
|
|
511
|
|
512 return $self->{'_plugins'}->{$module};
|
|
513 }
|
|
514
|
|
515
|
|
516 sub add_plugin {
|
|
517 my $self = shift;
|
|
518 my $plugin_instance = shift;
|
|
519
|
|
520 $self->{'_plugins'}->{ref($plugin_instance)} = $plugin_instance;
|
|
521 }
|
|
522
|
|
523
|
|
524 sub log_ambiguous {
|
|
525 my $self = shift;
|
|
526 my $matrix = shift;
|
|
527 my $type = shift;
|
|
528
|
|
529 unless ($matrix and
|
|
530 $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
|
|
531 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
|
|
532 }
|
|
533
|
|
534 # create dump directory if it doesn't exist
|
|
535 my $debug_path = $self->conf->param('basedir').'/debug';
|
|
536 unless (-d $debug_path) {
|
|
537 system("mkdir -p $debug_path") == 0 or
|
|
538 throw("Unable to create directory $debug_path.\n");
|
|
539 }
|
|
540
|
|
541 my $logfile = "$debug_path/ambiguous_${type}.txt";
|
|
542
|
|
543 open(my $fh, '>', $logfile) or
|
|
544 throw("Unable to open $logfile for writing: $!");
|
|
545
|
|
546 my @low_scoring = ();
|
|
547 my @high_scoring = ();
|
|
548 my $last_id;
|
|
549
|
|
550 # log by source
|
|
551 foreach my $entry (sort { $a->source <=> $b->source }
|
|
552 @{ $matrix->get_all_Entries }) {
|
|
553
|
|
554 $last_id ||= $entry->target;
|
|
555
|
|
556 if ($last_id != $entry->source) {
|
|
557 $self->write_ambiguous($type, 'source', $fh, \@low_scoring,
|
|
558 \@high_scoring);
|
|
559 $last_id = $entry->source;
|
|
560 }
|
|
561
|
|
562 if ($entry->score < 0.5) {
|
|
563 push @low_scoring, $entry;
|
|
564 } else {
|
|
565 push @high_scoring, $entry;
|
|
566 }
|
|
567 }
|
|
568
|
|
569 # write last source
|
|
570 $self->write_ambiguous($type, 'source', $fh, \@low_scoring, \@high_scoring);
|
|
571
|
|
572 # now do the same by target
|
|
573 $last_id = undef;
|
|
574 foreach my $entry (sort { $a->target <=> $b->target }
|
|
575 @{ $matrix->get_all_Entries }) {
|
|
576
|
|
577 $last_id ||= $entry->target;
|
|
578
|
|
579 if ($last_id != $entry->target) {
|
|
580 $self->write_ambiguous($type, 'target', $fh, \@low_scoring,
|
|
581 \@high_scoring);
|
|
582 $last_id = $entry->target;
|
|
583 }
|
|
584
|
|
585 if ($entry->score < 0.5) {
|
|
586 push @low_scoring, $entry;
|
|
587 } else {
|
|
588 push @high_scoring, $entry;
|
|
589 }
|
|
590 }
|
|
591
|
|
592 # write last target
|
|
593 $self->write_ambiguous($type, 'target', $fh, \@low_scoring, \@high_scoring);
|
|
594
|
|
595 close($fh);
|
|
596 }
|
|
597
|
|
598
|
|
599 sub write_ambiguous {
|
|
600 my ($self, $type, $db_type, $fh, $low, $high) = @_;
|
|
601
|
|
602 # if only source or target are ambiguous (i.e. you have only one mapping from
|
|
603 # this perspective) then log from the other perspective
|
|
604 if (scalar(@$low) + scalar(@$high) <= 1) {
|
|
605 @$low = ();
|
|
606 @$high = ();
|
|
607 return;
|
|
608 }
|
|
609
|
|
610 my $first_id;
|
|
611 if (@$low) {
|
|
612 $first_id = $low->[0]->$db_type;
|
|
613 } else {
|
|
614 $first_id = $high->[0]->$db_type;
|
|
615 }
|
|
616
|
|
617 my $other_db_type;
|
|
618 if ($db_type eq 'source') {
|
|
619 $other_db_type = 'target';
|
|
620 } else {
|
|
621 $other_db_type = 'source';
|
|
622 }
|
|
623
|
|
624 print $fh "$db_type $type $first_id scores ambiguously:\n";
|
|
625
|
|
626 # high scorers
|
|
627 if (@$high) {
|
|
628 print $fh " high scoring ${other_db_type}s\n";
|
|
629
|
|
630 while (my $e = shift(@$high)) {
|
|
631 print $fh " ", $e->$other_db_type, " ", $e->score, "\n";
|
|
632 }
|
|
633 }
|
|
634
|
|
635 # low scorers
|
|
636 if (@$low) {
|
|
637 print $fh " low scoring ${other_db_type}s\n ";
|
|
638
|
|
639 my $i = 1;
|
|
640
|
|
641 while (my $e = shift(@$low)) {
|
|
642 print $fh "\n " unless (($i++)%10);
|
|
643 print $fh $e->$other_db_type, ", ";
|
|
644 }
|
|
645 }
|
|
646
|
|
647 print $fh "\n";
|
|
648 }
|
|
649
|
|
650
|
|
651 1;
|
|
652
|