0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 =head1 SYNOPSIS
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27 =head1 METHODS
|
|
28
|
|
29 =cut
|
|
30
|
|
31
|
|
32 package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric;
|
|
33
|
|
34 use strict;
|
|
35 use warnings;
|
|
36 no warnings 'uninitialized';
|
|
37
|
|
38 use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;
|
|
39 our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper);
|
|
40
|
|
41 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
42 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
|
|
43
|
|
44
|
|
45 #
|
|
46 # basic mapping
|
|
47 #
|
|
sub init_basic {
  # First mapping pass: run basic_mapping() over the transcript scoring
  # matrix as it stands, without any rescoring.
  #
  # Args    : $num               - sequence number used in the serialised
  #                                mapping name; incremented before the
  #                                shrunken matrix is named
  #           $tsb               - object providing create_shrinked_matrix()
  #           $mappings          - accepted so all mapping steps share one
  #                                signature; the incoming value is not read
  #           $transcript_scores - scoring matrix passed to basic_mapping()
  # Returns : list of (result of create_shrinked_matrix(), result of
  #           basic_mapping())
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Basic transcript mapping...\n", 0, 'stamped');

  my $found_mappings =
    $self->basic_mapping($transcript_scores, "transcript_mappings$num");

  # the shrunken matrix is named with the next sequence number
  $num++;
  my $remaining_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $found_mappings,
                                 "transcript_matrix$num");

  return ($remaining_scores, $found_mappings);
}
|
|
65
|
|
66
|
|
67 #
|
|
68 # handle cases with exact match but different translation
|
|
69 #
|
|
sub non_exact_translation {
  # Mapping pass for transcripts that match exactly but whose translation
  # differs: rescore via $tsb->different_translation_rescore(), then map.
  #
  # Args    : $num               - sequence number used in the serialised
  #                                mapping name; incremented before the
  #                                shrunken matrix is named
  #           $tsb               - object providing
  #                                different_translation_rescore() and
  #                                create_shrinked_matrix()
  #           $mappings          - accepted for a uniform signature; the
  #                                incoming value is not read
  #           $transcript_scores - scoring matrix to rescore and map
  # Returns : list of (result of create_shrinked_matrix(), result of
  #           basic_mapping())
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Exact Transcript non-exact Translation...\n", 0, 'stamped');

  # rescore and write the matrix out only if it does not report itself
  # as loaded
  if (!$transcript_scores->loaded) {
    $tsb->different_translation_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  my $found_mappings =
    $self->basic_mapping($transcript_scores, "transcript_mappings$num");

  # the shrunken matrix is named with the next sequence number
  $num++;
  my $remaining_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $found_mappings,
                                 "transcript_matrix$num");

  return ($remaining_scores, $found_mappings);
}
|
|
92
|
|
93
|
|
94 #
|
|
95 # reduce score for mappings of transcripts which do not belong to mapped
|
|
96 # genes
|
|
97 #
|
|
sub mapped_gene {
  # Mapping pass that first rescores the matrix with
  # $tsb->non_mapped_gene_rescore(), using the supplied gene mappings,
  # and then runs basic_mapping().
  #
  # Args    : $num               - sequence number used in the serialised
  #                                mapping name; incremented before the
  #                                shrunken matrix is named
  #           $tsb               - object providing
  #                                non_mapped_gene_rescore() and
  #                                create_shrinked_matrix()
  #           $mappings          - accepted for a uniform signature; the
  #                                incoming value is not read
  #           $transcript_scores - scoring matrix to rescore and map
  #           $gene_mappings     - gene mappings handed to the rescorer
  # Returns : list of (result of create_shrinked_matrix(), result of
  #           basic_mapping())
  my ($self, $num, $tsb, $mappings, $transcript_scores, $gene_mappings) = @_;

  $self->logger->info("Transcripts in mapped genes...\n", 0, 'stamped');

  # rescore and write the matrix out only if it does not report itself
  # as loaded
  if (!$transcript_scores->loaded) {
    $tsb->non_mapped_gene_rescore($transcript_scores, $gene_mappings);
    $transcript_scores->write_to_file;
  }

  my $found_mappings =
    $self->basic_mapping($transcript_scores, "transcript_mappings$num");

  # the shrunken matrix is named with the next sequence number
  $num++;
  my $remaining_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $found_mappings,
                                 "transcript_matrix$num");

  return ($remaining_scores, $found_mappings);
}
|
|
121
|
|
122 #
|
|
123 # rescore by penalising scores between transcripts with different biotypes
|
|
124 #
|
|
sub biotype {
  # Mapping pass that first rescores the matrix with
  # $tsb->biotype_transcript_rescore() and then runs basic_mapping().
  #
  # Args    : $num               - sequence number used in the serialised
  #                                mapping name; incremented before the
  #                                shrunken matrix is named
  #           $tsb               - object providing
  #                                biotype_transcript_rescore() and
  #                                create_shrinked_matrix()
  #           $mappings          - accepted for a uniform signature; the
  #                                incoming value is not read
  #           $transcript_scores - scoring matrix to rescore and map
  # Returns : list of (result of create_shrinked_matrix(), result of
  #           basic_mapping())
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Retry with biotype disambiguation...\n", 0, 'stamped');

  # rescore and write the matrix out only if it does not report itself
  # as loaded
  if (!$transcript_scores->loaded()) {
    $tsb->biotype_transcript_rescore($transcript_scores);
    $transcript_scores->write_to_file();
  }

  my $found_mappings =
    $self->basic_mapping($transcript_scores, "transcript_mappings$num");

  # the shrunken matrix is named with the next sequence number
  $num++;
  my $remaining_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $found_mappings,
                                 "transcript_matrix$num");

  return ($remaining_scores, $found_mappings);
}
|
|
149
|
|
150 #
|
|
151 # selectively rescore by penalising scores between transcripts with
|
|
152 # different internalIDs
|
|
153 #
|
|
sub internal_id {
  # Mapping pass that first rescores the matrix with
  # $tsb->internal_id_rescore() and then runs basic_mapping().
  #
  # Args    : $num               - sequence number used in the serialised
  #                                mapping name; incremented before the
  #                                shrunken matrix is named
  #           $tsb               - object providing internal_id_rescore()
  #                                and create_shrinked_matrix()
  #           $mappings          - accepted for a uniform signature; the
  #                                incoming value is not read
  #           $transcript_scores - scoring matrix to rescore and map
  # Returns : list of (result of create_shrinked_matrix(), result of
  #           basic_mapping())
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped');

  # rescore and write the matrix out only if it does not report itself
  # as loaded
  if (!$transcript_scores->loaded) {
    $tsb->internal_id_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  my $found_mappings =
    $self->basic_mapping($transcript_scores, "transcript_mappings$num");

  # the shrunken matrix is named with the next sequence number
  $num++;
  my $remaining_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $found_mappings,
                                 "transcript_matrix$num");

  return ($remaining_scores, $found_mappings);
}
|
|
176
|
|
177
|
|
178 #
|
|
179 # handle ambiguities between transcripts in single genes
|
|
180 #
|
|
sub single_gene {
  # Mapping pass for ambiguities between transcripts of single genes.
  # No rescoring is done here; the matrix is handed to
  # same_gene_transcript_mapping() instead of basic_mapping().
  #
  # Args    : $num               - sequence number used in the serialised
  #                                mapping name; incremented before the
  #                                shrunken matrix is named
  #           $tsb               - object providing create_shrinked_matrix()
  #           $mappings          - accepted for a uniform signature; the
  #                                incoming value is not read
  #           $transcript_scores - scoring matrix to map
  # Returns : list of (result of create_shrinked_matrix(), result of
  #           same_gene_transcript_mapping())
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  $self->logger->info("Transcripts in single genes...\n", 0, 'stamped');

  # write the matrix out only if it does not report itself as loaded
  $transcript_scores->write_to_file if !$transcript_scores->loaded;

  my $found_mappings =
    $self->same_gene_transcript_mapping($transcript_scores,
                                        "transcript_mappings$num");

  # the shrunken matrix is named with the next sequence number
  $num++;
  my $remaining_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $found_mappings,
                                 "transcript_matrix$num");

  return ($remaining_scores, $found_mappings);
}
|
|
202
|
|
203
|
|
204 #
|
|
205 # modified basic mapper that maps transcripts that are ambiguous within one gene
|
|
206 #
|
|
sub same_gene_transcript_mapping {
  # Variant of the basic mapper that also accepts an ambiguous best match
  # when every transcript involved in the ambiguity belongs to one source
  # gene and one target gene.
  #
  # Args    : $matrix       - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
  #           $mapping_name - name under which the mapping is serialised
  #                           ("<name>.ser" in the 'mapping' directory
  #                           below the configured basedir)
  # Returns : Bio::EnsEMBL::IdMapping::MappingList; if the list reports
  #           itself as loaded after construction it is returned as is
  # Throws  : if $matrix is false or not a ScoredMappingMatrix, or if
  #           $mapping_name is false
  my ($self, $matrix, $mapping_name) = @_;

  # argument checks
  if (!$matrix
      || !$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }
  if (!$mapping_name) {
    throw('Need a name for serialising the mapping.');
  }

  # Create a new MappingList object. Specify AUTO_LOAD to load serialised
  # existing mappings if found
  my $mapping_dir = path_append($self->conf->param('basedir'), 'mapping');

  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH  => $mapping_dir,
    -CACHE_FILE => "${mapping_name}.ser",
    -AUTO_LOAD  => 1,
  );

  # checkpoint test: hand back a previously stored MappingList
  if ($mappings->loaded) {
    $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  # look up the gene for a transcript; arguments are (side, transcript id)
  # and are passed through to the cache unchanged
  my $gene_of = sub {
    return $self->cache->get_by_key('genes_by_transcript_id', @_);
  };

  # sources and targets that have already been dealt with
  my $seen_sources = {};
  my $seen_targets = {};

  # highest score first; ties broken by ascending source, then target
  my @ranked = sort {
         $b->score  <=> $a->score
      || $a->source <=> $b->source
      || $a->target <=> $b->target
  } @{ $matrix->get_all_Entries };

  while (my $entry = shift @ranked) {

    # skip entries whose source or target has already been handled
    next if $seen_sources->{ $entry->source }
         || $seen_targets->{ $entry->target };

    my $accept = 1;

    # ambiguous_mapping() receives these array refs along with the entry
    my $other_sources = [];
    my $other_targets = [];

    if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) {

      $other_sources = $self->filter_sources($other_sources, $seen_sources);
      $other_targets = $self->filter_targets($other_targets, $seen_targets);

      # collect the genes of all transcripts taking part in the ambiguity
      my %source_genes;
      my %target_genes;

      $source_genes{ $gene_of->('source', $entry->source) } = 1;
      $target_genes{ $gene_of->('target', $entry->target) } = 1;

      foreach my $other_source (@{ $other_sources }) {
        $source_genes{ $gene_of->('source', $other_source) } = 1;
      }

      foreach my $other_target (@{ $other_targets }) {
        $target_genes{ $gene_of->('target', $other_target) } = 1;
      }

      # only keep the mapping if a single source gene and a single target
      # gene are involved
      $accept = (keys(%source_genes) == 1 and keys(%target_genes) == 1);
    }

    # an unambiguous entry is the best mapping and is always added
    $mappings->add_Entry($entry) if $accept;

    # source and target are marked as done whether or not the entry was
    # added
    $seen_sources->{ $entry->source } = 1;
    $seen_targets->{ $entry->target } = 1;
  }

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
}
|
|
302
|
|
303
|
|
304 1;
|
|
305
|