0
|
1
|
|
2 =head1 LICENSE
|
|
3
|
|
4 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
5 Genome Research Limited. All rights reserved.
|
|
6
|
|
7 This software is distributed under a modified Apache license.
|
|
8 For license details, please see
|
|
9
|
|
10 http://www.ensembl.org/info/about/code_licence.html
|
|
11
|
|
12 =head1 CONTACT
|
|
13
|
|
14 Please email comments or questions to the public Ensembl
|
|
15 developers list at <dev@ensembl.org>.
|
|
16
|
|
17 Questions may also be sent to the Ensembl help desk at
|
|
18 <helpdesk@ensembl.org>.
|
|
19
|
|
20 =cut
|
|
21
|
|
22 =head1 NAME
|
|
23
|
|
24 =head1 SYNOPSIS
|
|
25
|
|
26 =head1 DESCRIPTION
|
|
27
|
|
28 =head1 METHODS
|
|
29
|
|
30 =cut
|
|
31
|
|
32 package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;
|
|
33
|
|
34 use strict;
|
|
35 use warnings;
|
|
36 no warnings 'uninitialized';
|
|
37
|
|
38 use Bio::EnsEMBL::IdMapping::BaseObject;
|
|
39 our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
|
|
40
|
|
41 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
42 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
|
|
43 use Bio::EnsEMBL::IdMapping::MappingList;
|
|
44
|
|
45 # two scores are considered similar if 2 * |s1 - s2| / (s1 + s2) < this ratio
|
|
46 use constant SIMILAR_SCORE_RATIO => 0.01;
|
|
47
|
|
#
# Find the highest unambiguous score for all sources and targets in a
# scoring matrix.
#
# Arguments:
#   $matrix       - a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#   $mapping_name - name used to build the "<name>.ser" cache file name
#
# Returns a Bio::EnsEMBL::IdMapping::MappingList. If a serialised list
# was auto-loaded it is returned as is; otherwise the list is built from
# the matrix and written to file before being returned.
#
# Throws if $matrix is missing or not a ScoredMappingMatrix, or if
# $mapping_name is false.
#
sub basic_mapping {
  my ( $self, $matrix, $mapping_name ) = @_;

  # argument checks
  if ( !$matrix
    or !$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  $mapping_name
    or throw('Need a name for serialising the mapping.');

  # AUTO_LOAD makes the MappingList pick up a previously serialised
  # mapping from the cache file, if there is one
  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH  => path_append( $self->conf->param('basedir'), 'mapping' ),
    -CACHE_FILE => "${mapping_name}.ser",
    -AUTO_LOAD  => 1,
  );

  # checkpoint test: hand back a previously stored MappingList
  if ( $mappings->loaded ) {
    $self->logger->info(
      "Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  # sources and targets that have already been assigned a mapping
  my %sources_done;
  my %targets_done;

  # work through the matrix entries from highest to lowest score
  my @queue =
    sort { $b->score <=> $a->score } @{ $matrix->get_all_Entries };

ENTRY:
  while ( my $entry = shift(@queue) ) {

    # a mapping was already found for this source or this target
    next ENTRY if ( $sources_done{ $entry->source } );
    next ENTRY if ( $targets_done{ $entry->target } );

    # a better mapping is still available for the source or the target
    next ENTRY
      if (
      $self->higher_score_exists(
        $entry, $matrix, \%sources_done, \%targets_done
      ) );

    # ambiguous mappings are skipped here; they are dealt with later
    my @rival_sources;
    my @rival_targets;

    if (
      $self->ambiguous_mapping(
        $entry, $matrix, \@rival_sources, \@rival_targets
      ) )
    {
      # rivals that are already mapped no longer compete
      my $open_sources =
        $self->filter_sources( \@rival_sources, \%sources_done );
      my $open_targets =
        $self->filter_targets( \@rival_targets, \%targets_done );

      next ENTRY if ( @{$open_sources} or @{$open_targets} );
    }

    # this is the best mapping, keep it and mark both ends as done
    $mappings->add_Entry($entry);

    $sources_done{ $entry->source } = 1;
    $targets_done{ $entry->target } = 1;
  }

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
} ## end sub basic_mapping
|
|
145
|
|
#
# Check whether the source or the target of $entry has a strictly higher
# score with some other partner that has not been mapped yet.
#
# Arguments:
#   $entry        - object providing source(), target() and score()
#   $matrix       - object providing get_sources_for_target(),
#                   get_targets_for_source() and get_score()
#   $sources_done - hashref, true values mark sources already mapped
#   $targets_done - hashref, true values mark targets already mapped
#
# Returns 1 if a higher score exists, 0 otherwise.
#
sub higher_score_exists {
  my ( $self, $entry, $matrix, $sources_done, $targets_done ) = @_;

  my $this_source = $entry->source;
  my $this_target = $entry->target;
  my $this_score  = $entry->score;

  # other, still unmapped sources competing for the same target
  for my $rival ( @{ $matrix->get_sources_for_target($this_target) } ) {
    next if ( $rival == $this_source );
    next if ( $sources_done->{$rival} );

    return 1
      if ( $this_score < $matrix->get_score( $rival, $this_target ) );
  }

  # other, still unmapped targets competing for the same source
  for my $rival ( @{ $matrix->get_targets_for_source($this_source) } ) {
    next if ( $rival == $this_target );
    next if ( $targets_done->{$rival} );

    return 1
      if ( $this_score < $matrix->get_score( $this_source, $rival ) );
  }

  return 0;
} ## end sub higher_score_exists
|
|
177
|
|
#
# Find ambiguous mappings (see scores_similar() for the definition of
# "similar").
#
# A competing source (for the entry's target) or competing target (for
# the entry's source) is recorded when its score is similar to, or
# higher than, the score of $entry.
#
# Arguments:
#   $entry         - object providing source(), target() and score()
#   $matrix        - object providing get_sources_for_target(),
#                    get_targets_for_source() and get_score()
#   $other_sources - arrayref; competing sources are pushed onto it
#   $other_targets - arrayref; competing targets are pushed onto it
#
# Returns 1 if at least one competitor was recorded, 0 otherwise.
#
sub ambiguous_mapping {
  my ( $self, $entry, $matrix, $other_sources, $other_targets ) = @_;

  my $this_source = $entry->source;
  my $this_target = $entry->target;
  my $this_score  = $entry->score;

  my $is_ambiguous = 0;

  # sources competing for the same target
  for my $rival ( @{ $matrix->get_sources_for_target($this_target) } ) {
    my $rival_score = $matrix->get_score( $rival, $this_target );

    next if ( $rival == $this_source );
    next
      unless ( $self->scores_similar( $this_score, $rival_score )
      or $this_score < $rival_score );

    push @{$other_sources}, $rival;
    $is_ambiguous = 1;
  }

  # targets competing for the same source
  for my $rival ( @{ $matrix->get_targets_for_source($this_source) } ) {
    my $rival_score = $matrix->get_score( $this_source, $rival );

    next if ( $rival == $this_target );
    next
      unless ( $self->scores_similar( $this_score, $rival_score )
      or $this_score < $rival_score );

    push @{$other_targets}, $rival;
    $is_ambiguous = 1;
  }

  return $is_ambiguous;
} ## end sub ambiguous_mapping
|
|
220
|
|
#
# Decide whether two scores are close enough to be treated as the same
# (rule for similarity taken from the Java code).
#
# Arguments:
#   $s1, $s2 - the two scores to compare
#
# Returns a true value if the relative difference
# 2 * |s1 - s2| / (s1 + s2) is below SIMILAR_SCORE_RATIO, a false value
# otherwise. An exact match ($s1 == 1) is never similar to a lower score.
#
sub scores_similar {
  my ( $self, $s1, $s2 ) = @_;

  # always give priority to exact matches over very similar ones
  return 0 if ( $s1 == 1 and $s2 < 1 );

  my $diff = abs( $s1 - $s2 );

  # Identical scores are trivially similar. Returning here also keeps
  # two zero scores from triggering a division by zero below.
  return 1 if ( $diff == 0 );

  my $sum = $s1 + $s2;

  # Different scores that cancel each other out have no meaningful
  # relative difference; treat them as not similar instead of dying.
  return 0 if ( $sum == 0 );

  my $pc = 2*$diff/$sum;

  return ( $pc < SIMILAR_SCORE_RATIO );
}
|
|
237
|
|
#
# Remove sources that already have a mapping from a list of sources.
#
# Arguments:
#   $other_sources - arrayref of sources to filter
#   $sources_done  - hashref, true values mark sources already mapped
#
# Returns $other_sources itself when it is empty or when nothing has
# been marked as done yet; otherwise a new arrayref holding, in the
# original order, the sources that are not marked as done.
#
sub filter_sources {
  my ( $self, $other_sources, $sources_done ) = @_;

  # nothing to filter, or nothing to filter against
  return $other_sources
    unless ( @{$other_sources} and keys %{$sources_done} );

  return [ grep { !$sources_done->{$_} } @{$other_sources} ];
}
|
|
255
|
|
#
# Remove targets that already have a mapping from a list of targets.
#
# Arguments:
#   $other_targets - arrayref of targets to filter
#   $targets_done  - hashref, true values mark targets already mapped
#
# Returns $other_targets itself when it is empty or when nothing has
# been marked as done yet; otherwise a new arrayref holding, in the
# original order, the targets that are not marked as done.
#
sub filter_targets {
  my ( $self, $other_targets, $targets_done ) = @_;

  # nothing to filter, or nothing to filter against
  return $other_targets
    unless ( @{$other_targets} and keys %{$targets_done} );

  return [ grep { !$targets_done->{$_} } @{$other_targets} ];
}
|
|
273
|
|
274 1;
|