0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::IdMapping::SyntenyRegion - object representing syntenic regions
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 # create a new SyntenyRegion from a source and a target gene
|
|
28 my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast( [
|
|
29 $source_gene->start, $source_gene->end,
|
|
30 $source_gene->strand, $source_gene->seq_region_name,
|
|
31 $target_gene->start, $target_gene->end,
|
|
32 $target_gene->strand, $target_gene->seq_region_name,
|
|
33 $entry->score,
|
|
34 ] );
|
|
35
|
|
36 # merge with another SyntenyRegion
|
|
37 my $merged_sr = $sr->merge($sr1);
|
|
38
|
|
39 # score a gene pair against this SyntenyRegion
|
|
40 my $score =
|
|
41 $sr->score_location_relationship( $source_gene1, $target_gene1 );
|
|
42
|
|
43 =head1 DESCRIPTION
|
|
44
|
|
45 This object represents a synteny between a source and a target location.
|
|
46 SyntenyRegions are built from mapped genes, and the their score is
|
|
47 defined as the score of the gene mapping. For merged SyntenyRegions,
|
|
48 scores are combined.
|
|
49
|
|
50 =head1 METHODS
|
|
51
|
|
52 new_fast
|
|
53 source_start
|
|
54 source_end
|
|
55 source_strand
|
|
56 source_seq_region_name
|
|
57 target_start
|
|
58 target_end
|
|
59 target_strand
|
|
60 target_seq_region_name
|
|
61 score
|
|
62 merge
|
|
63 stretch
|
|
64 score_location_relationship
|
|
65 to_string
|
|
66
|
|
67 =cut
|
|
68
|
|
69 package Bio::EnsEMBL::IdMapping::SyntenyRegion;
|
|
70
|
|
71 use strict;
|
|
72 use warnings;
|
|
73 no warnings 'uninitialized';
|
|
74
|
|
75 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
76
|
|
77
|
|
78 =head2 new_fast
|
|
79
|
|
80 Arg[1] : Arrayref $array_ref - the arrayref to bless into the
|
|
81 SyntenyRegion object
|
|
82 Example : my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
|
|
83 ]);
|
|
84 Description : Constructor. On instantiation, source and target regions are
|
|
85 reverse complemented so that source is always on forward strand.
|
|
86 Return type : a Bio::EnsEMBL::IdMapping::SyntenyRegion object
|
|
87 Exceptions : none
|
|
88 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
89 Status : At Risk
|
|
90 : under development
|
|
91
|
|
92 =cut
|
|
93
|
|
94 sub new_fast {
|
|
95 my $class = shift;
|
|
96 my $array_ref = shift;
|
|
97
|
|
98 # reverse complement source and target so that source is always on forward
|
|
99 # strand; this will make merging and other comparison operations easier
|
|
100 # at later stages
|
|
101 if ($array_ref->[2] == -1) {
|
|
102 $array_ref->[2] = 1;
|
|
103 $array_ref->[6] = -1 * $array_ref->[6];
|
|
104 }
|
|
105
|
|
106 return bless $array_ref, $class;
|
|
107 }
|
|
108
|
|
109
|
|
110 =head2 source_start
|
|
111
|
|
112 Arg[1] : (optional) Int - source location start coordinate
|
|
113 Description : Getter/setter for source location start coordinate.
|
|
114 Return type : Int
|
|
115 Exceptions : none
|
|
116 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
117 Status : At Risk
|
|
118 : under development
|
|
119
|
|
120 =cut
|
|
121
|
|
122 sub source_start {
|
|
123 my $self = shift;
|
|
124 $self->[0] = shift if (@_);
|
|
125 return $self->[0];
|
|
126 }
|
|
127
|
|
128
|
|
129 =head2 source_end
|
|
130
|
|
131 Arg[1] : (optional) Int - source location end coordinate
|
|
132 Description : Getter/setter for source location end coordinate.
|
|
133 Return type : Int
|
|
134 Exceptions : none
|
|
135 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
136 Status : At Risk
|
|
137 : under development
|
|
138
|
|
139 =cut
|
|
140
|
|
141 sub source_end {
|
|
142 my $self = shift;
|
|
143 $self->[1] = shift if (@_);
|
|
144 return $self->[1];
|
|
145 }
|
|
146
|
|
147
|
|
148 =head2 source_strand
|
|
149
|
|
150 Arg[1] : (optional) Int - source location strand
|
|
151 Description : Getter/setter for source location strand.
|
|
152 Return type : Int
|
|
153 Exceptions : none
|
|
154 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
155 Status : At Risk
|
|
156 : under development
|
|
157
|
|
158 =cut
|
|
159
|
|
160 sub source_strand {
|
|
161 my $self = shift;
|
|
162 $self->[2] = shift if (@_);
|
|
163 return $self->[2];
|
|
164 }
|
|
165
|
|
166
|
|
167 =head2 source_seq_region_name
|
|
168
|
|
169 Arg[1] : (optional) String - source location seq_region name
|
|
170 Description : Getter/setter for source location seq_region name.
|
|
171 Return type : String
|
|
172 Exceptions : none
|
|
173 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
174 Status : At Risk
|
|
175 : under development
|
|
176
|
|
177 =cut
|
|
178
|
|
179 sub source_seq_region_name {
|
|
180 my $self = shift;
|
|
181 $self->[3] = shift if (@_);
|
|
182 return $self->[3];
|
|
183 }
|
|
184
|
|
185
|
|
186 =head2 target_start
|
|
187
|
|
188 Arg[1] : (optional) Int - target location start coordinate
|
|
189 Description : Getter/setter for target location start coordinate.
|
|
190 Return type : Int
|
|
191 Exceptions : none
|
|
192 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
193 Status : At Risk
|
|
194 : under development
|
|
195
|
|
196 =cut
|
|
197
|
|
198 sub target_start {
|
|
199 my $self = shift;
|
|
200 $self->[4] = shift if (@_);
|
|
201 return $self->[4];
|
|
202 }
|
|
203
|
|
204
|
|
205 =head2 target_end
|
|
206
|
|
207 Arg[1] : (optional) Int - target location end coordinate
|
|
208 Description : Getter/setter for target location end coordinate.
|
|
209 Return type : Int
|
|
210 Exceptions : none
|
|
211 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
212 Status : At Risk
|
|
213 : under development
|
|
214
|
|
215 =cut
|
|
216
|
|
217 sub target_end {
|
|
218 my $self = shift;
|
|
219 $self->[5] = shift if (@_);
|
|
220 return $self->[5];
|
|
221 }
|
|
222
|
|
223
|
|
224 =head2 target_strand
|
|
225
|
|
226 Arg[1] : (optional) Int - target location strand
|
|
227 Description : Getter/setter for target location strand.
|
|
228 Return type : Int
|
|
229 Exceptions : none
|
|
230 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
231 Status : At Risk
|
|
232 : under development
|
|
233
|
|
234 =cut
|
|
235
|
|
236 sub target_strand {
|
|
237 my $self = shift;
|
|
238 $self->[6] = shift if (@_);
|
|
239 return $self->[6];
|
|
240 }
|
|
241
|
|
242
|
|
243 =head2 target_seq_region_name
|
|
244
|
|
245 Arg[1] : (optional) String - target location seq_region name
|
|
246 Description : Getter/setter for target location seq_region name.
|
|
247 Return type : String
|
|
248 Exceptions : none
|
|
249 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
250 Status : At Risk
|
|
251 : under development
|
|
252
|
|
253 =cut
|
|
254
|
|
255 sub target_seq_region_name {
|
|
256 my $self = shift;
|
|
257 $self->[7] = shift if (@_);
|
|
258 return $self->[7];
|
|
259 }
|
|
260
|
|
261
|
|
262 =head2 score
|
|
263
|
|
264 Arg[1] : (optional) Float - score
|
|
265 Description : Getter/setter for the score between source and target location.
|
|
266 Return type : Int
|
|
267 Exceptions : none
|
|
268 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
269 Status : At Risk
|
|
270 : under development
|
|
271
|
|
272 =cut
|
|
273
|
|
274 sub score {
|
|
275 my $self = shift;
|
|
276 $self->[8] = shift if (@_);
|
|
277 return $self->[8];
|
|
278 }
|
|
279
|
|
280
|
|
281 =head2 merge
|
|
282
|
|
283 Arg[1] : Bio::EnsEMBL::IdMapping::SyntenyRegion $sr - another
|
|
284 SyntenyRegion
|
|
285 Example : $merged_sr = $sr->merge($other_sr);
|
|
286 Description : Merges two overlapping SyntenyRegions if they meet certain
|
|
287 criteria (see documentation in the code for details). Score is
|
|
288 calculated as a combined distance score. If the two
|
|
289 SyntenyRegions aren't mergeable, this method returns undef.
|
|
290 Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion or undef
|
|
291 Exceptions : warns on bad scores
|
|
292 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
293 Status : At Risk
|
|
294 : under development
|
|
295
|
|
296 =cut
|
|
297
|
|
298 sub merge {
|
|
299 my ($self, $sr) = @_;
|
|
300
|
|
301 # must be on same seq_region
|
|
302 if ($self->source_seq_region_name ne $sr->source_seq_region_name or
|
|
303 $self->target_seq_region_name ne $sr->target_seq_region_name) {
|
|
304 return 0;
|
|
305 }
|
|
306
|
|
307 # target must be on same strand
|
|
308 return 0 unless ($self->target_strand == $sr->target_strand);
|
|
309
|
|
310 # find the distance of source and target pair and compare
|
|
311 my $source_dist = $sr->source_start - $self->source_start;
|
|
312 my $target_dist;
|
|
313 if ($self->target_strand == 1) {
|
|
314 $target_dist = $sr->target_start - $self->target_start;
|
|
315 } else {
|
|
316 $target_dist = $self->target_end - $sr->target_end;
|
|
317 }
|
|
318
|
|
319 # prevent division by zero error
|
|
320 if ($source_dist == 0 or $target_dist == 0) {
|
|
321 warn("WARNING: source_dist ($source_dist) and/or target_dist ($target_dist) is zero.\n");
|
|
322 return 0;
|
|
323 }
|
|
324
|
|
325 # calculate a distance score
|
|
326 my $dist = $source_dist - $target_dist;
|
|
327 $dist = -$dist if ($dist < 0);
|
|
328 my $d1 = $dist/$source_dist;
|
|
329 $d1 = -$d1 if ($d1 < 0);
|
|
330 my $d2 = $dist/$target_dist;
|
|
331 $d2 = -$d2 if ($d2 < 0);
|
|
332 my $dist_score = 1 - $d1 - $d2;
|
|
333
|
|
334 # distance score must be more than 50%
|
|
335 return 0 if ($dist_score < 0.5);
|
|
336
|
|
337 my $new_score = $dist_score * ($sr->score + $self->score)/2;
|
|
338
|
|
339 if ($new_score > 1) {
|
|
340 warn("WARNING: Bad merge score: $new_score\n");
|
|
341 }
|
|
342
|
|
343 # extend SyntenyRegion to cover both sources and targets, set merged score
|
|
344 # and return
|
|
345 if ($sr->source_start < $self->source_start) {
|
|
346 $self->source_start($sr->source_start);
|
|
347 }
|
|
348 if ($sr->source_end > $self->source_end) {
|
|
349 $self->source_end($sr->source_end);
|
|
350 }
|
|
351
|
|
352 if ($sr->target_start < $self->target_start) {
|
|
353 $self->target_start($sr->target_start);
|
|
354 }
|
|
355 if ($sr->target_end > $self->target_end) {
|
|
356 $self->target_end($sr->target_end);
|
|
357 }
|
|
358
|
|
359 $self->score($new_score);
|
|
360
|
|
361 return $self;
|
|
362 }
|
|
363
|
|
364
|
|
365 =head2 stretch
|
|
366
|
|
367 Arg[1] : Float $factor - stretching factor
|
|
368 Example : $stretched_sr = $sr->stretch(2);
|
|
369 Description : Extends this SyntenyRegion to span a $factor * $score more area.
|
|
370 Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion
|
|
371 Exceptions : none
|
|
372 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
373 Status : At Risk
|
|
374 : under development
|
|
375
|
|
376 =cut
|
|
377
|
|
378 sub stretch {
|
|
379 my ($self, $factor) = @_;
|
|
380
|
|
381 my $source_adjust = int(($self->source_end - $self->source_start + 1) *
|
|
382 $factor * $self->score);
|
|
383 $self->source_start($self->source_start - $source_adjust);
|
|
384 $self->source_end($self->source_end + $source_adjust);
|
|
385 #warn sprintf(" sss %d %d %d\n", $source_adjust, $self->source_start,
|
|
386 # $self->source_end);
|
|
387
|
|
388 my $target_adjust = int(($self->target_end - $self->target_start + 1) *
|
|
389 $factor * $self->score);
|
|
390 $self->target_start($self->target_start - $target_adjust);
|
|
391 $self->target_end($self->target_end + $target_adjust);
|
|
392
|
|
393 return $self;
|
|
394 }
|
|
395
|
|
396
|
|
397 =head2 score_location_relationship
|
|
398
|
|
399 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $source_gene - source gene
|
|
400 Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $target_gene - target gene
|
|
401 Example : my $score = $sr->score_location_relationship($source_gene,
|
|
402 $target_gene);
|
|
403 Description : This function calculates how well the given source location
|
|
404 interpolates on given target location inside this SyntenyRegion.
|
|
405
|
|
406 Scoring is done the following way: Source and target location
|
|
407 are normalized with respect to this Regions source and target.
|
|
408 Source range will then be somewhere close to 0.0-1.0 and target
|
|
409 range anything around that.
|
|
410
|
|
411 The extend of the covered area between source and target range
|
|
412 is a measurement of how well they agree (smaller extend is
|
|
413 better). The extend (actually 2*extend) is reduced by the size
|
|
414 of the regions. This will result in 0.0 if they overlap
|
|
415 perfectly and bigger values if they dont.
|
|
416
|
|
417 This is substracted from 1.0 to give the score. The score is
|
|
418 likely to be below zero, but is cut off at 0.0f.
|
|
419
|
|
420 Finally, the score is multiplied with the score of the synteny
|
|
421 itself.
|
|
422 Return type : Float
|
|
423 Exceptions : warns if score out of range
|
|
424 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
425 Status : At Risk
|
|
426 : under development
|
|
427
|
|
428 =cut
|
|
429
|
|
430
|
|
431
|
|
432 sub score_location_relationship {
|
|
433 my ($self, $source_gene, $target_gene) = @_;
|
|
434
|
|
435 # must be on same seq_region
|
|
436 if (($self->source_seq_region_name ne $source_gene->seq_region_name) or
|
|
437 ($self->target_seq_region_name ne $target_gene->seq_region_name)) {
|
|
438 return 0;
|
|
439 }
|
|
440
|
|
441 # strand relationship must be the same (use logical XOR to find out)
|
|
442 if (($self->source_strand == $source_gene->strand) xor
|
|
443 ($self->target_strand == $target_gene->strand)) {
|
|
444 return 0;
|
|
445 }
|
|
446
|
|
447 # normalise source location
|
|
448 my $source_rel_start = ($source_gene->start - $self->source_start) /
|
|
449 ($self->source_end - $self->source_start + 1);
|
|
450
|
|
451 my $source_rel_end = ($source_gene->end - $self->source_start + 1) /
|
|
452 ($self->source_end - $self->source_start + 1);
|
|
453
|
|
454 #warn " aaa ".$self->to_string."\n";
|
|
455 #warn sprintf(" bbb %.6f %.6f\n", $source_rel_start, $source_rel_end);
|
|
456
|
|
457 # cut off if the source location is completely outside
|
|
458 return 0 if ($source_rel_start > 1.1 or $source_rel_end < -0.1);
|
|
459
|
|
460 # normalise target location
|
|
461 my ($target_rel_start, $target_rel_end);
|
|
462 my $t_length = $self->target_end - $self->target_start + 1;
|
|
463
|
|
464 if ($self->target_strand == 1) {
|
|
465
|
|
466 $target_rel_start = ($target_gene->start - $self->target_start) / $t_length;
|
|
467
|
|
468 $target_rel_end = ($target_gene->end - $self->target_start + 1) / $t_length;
|
|
469
|
|
470 } else {
|
|
471 $target_rel_start = ($self->target_end - $target_gene->end) / $t_length;
|
|
472 $target_rel_end = ($self->target_end - $target_gene->start + 1) / $t_length;
|
|
473 }
|
|
474
|
|
475 my $added_range = (($target_rel_end > $source_rel_end) ? $target_rel_end :
|
|
476 $source_rel_end) -
|
|
477 (($target_rel_start < $source_rel_start) ? $target_rel_start :
|
|
478 $source_rel_start);
|
|
479
|
|
480 my $score = $self->score * (1 - (2 * $added_range - $target_rel_end -
|
|
481 $source_rel_end + $target_rel_start + $source_rel_start));
|
|
482
|
|
483 #warn " ccc ".sprintf("%.6f:%.6f:%.6f:%.6f:%.6f\n", $added_range,
|
|
484 # $source_rel_start, $source_rel_end, $target_rel_start, $target_rel_end);
|
|
485
|
|
486 $score = 0 if ($score < 0);
|
|
487
|
|
488 # sanity check
|
|
489 if ($score > 1) {
|
|
490 warn "Out of range score ($score) for ".$source_gene->id.":".
|
|
491 $target_gene->id."\n";
|
|
492 }
|
|
493
|
|
494 return $score;
|
|
495 }
|
|
496
|
|
497
|
|
498 =head2 to_string
|
|
499
|
|
500 Example : print LOG $sr->to_string, "\n";
|
|
501 Description : Returns a string representation of the SyntenyRegion object.
|
|
502 Useful for debugging and logging.
|
|
503 Return type : String
|
|
504 Exceptions : none
|
|
505 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
|
|
506 Status : At Risk
|
|
507 : under development
|
|
508
|
|
509 =cut
|
|
510
|
|
511 sub to_string {
|
|
512 my $self = shift;
|
|
513 return sprintf("%s:%s-%s:%s %s:%s-%s:%s %.6f",
|
|
514 $self->source_seq_region_name,
|
|
515 $self->source_start,
|
|
516 $self->source_end,
|
|
517 $self->source_strand,
|
|
518 $self->target_seq_region_name,
|
|
519 $self->target_start,
|
|
520 $self->target_end,
|
|
521 $self->target_strand,
|
|
522 $self->score
|
|
523 );
|
|
524 }
|
|
525
|
|
526
|
|
527 1;
|
|
528
|