comparison variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::IdMapping::SyntenyRegion - object representing syntenic regions
24
25 =head1 SYNOPSIS
26
27 # create a new SyntenyRegion from a source and a target gene
28 my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast( [
29 $source_gene->start, $source_gene->end,
30 $source_gene->strand, $source_gene->seq_region_name,
31 $target_gene->start, $target_gene->end,
32 $target_gene->strand, $target_gene->seq_region_name,
33 $entry->score,
34 ] );
35
36 # merge with another SyntenyRegion
37 my $merged_sr = $sr->merge($sr1);
38
39 # score a gene pair against this SyntenyRegion
40 my $score =
41 $sr->score_location_relationship( $source_gene1, $target_gene1 );
42
43 =head1 DESCRIPTION
44
45 This object represents a synteny between a source and a target location.
46 SyntenyRegions are built from mapped genes, and the their score is
47 defined as the score of the gene mapping. For merged SyntenyRegions,
48 scores are combined.
49
50 =head1 METHODS
51
52 new_fast
53 source_start
54 source_end
55 source_strand
56 source_seq_region_name
57 target_start
58 target_end
59 target_strand
60 target_seq_region_name
61 score
62 merge
63 stretch
64 score_location_relationship
65 to_string
66
67 =cut
68
69 package Bio::EnsEMBL::IdMapping::SyntenyRegion;
70
71 use strict;
72 use warnings;
73 no warnings 'uninitialized';
74
75 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
76
77
78 =head2 new_fast
79
80 Arg[1] : Arrayref $array_ref - the arrayref to bless into the
81 SyntenyRegion object
82 Example : my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
83 ]);
84 Description : Constructor. On instantiation, source and target regions are
85 reverse complemented so that source is always on forward strand.
86 Return type : a Bio::EnsEMBL::IdMapping::SyntenyRegion object
87 Exceptions : none
88 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
89 Status : At Risk
90 : under development
91
92 =cut
93
94 sub new_fast {
95 my $class = shift;
96 my $array_ref = shift;
97
98 # reverse complement source and target so that source is always on forward
99 # strand; this will make merging and other comparison operations easier
100 # at later stages
101 if ($array_ref->[2] == -1) {
102 $array_ref->[2] = 1;
103 $array_ref->[6] = -1 * $array_ref->[6];
104 }
105
106 return bless $array_ref, $class;
107 }
108
109
110 =head2 source_start
111
112 Arg[1] : (optional) Int - source location start coordinate
113 Description : Getter/setter for source location start coordinate.
114 Return type : Int
115 Exceptions : none
116 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
117 Status : At Risk
118 : under development
119
120 =cut
121
122 sub source_start {
123 my $self = shift;
124 $self->[0] = shift if (@_);
125 return $self->[0];
126 }
127
128
129 =head2 source_end
130
131 Arg[1] : (optional) Int - source location end coordinate
132 Description : Getter/setter for source location end coordinate.
133 Return type : Int
134 Exceptions : none
135 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
136 Status : At Risk
137 : under development
138
139 =cut
140
141 sub source_end {
142 my $self = shift;
143 $self->[1] = shift if (@_);
144 return $self->[1];
145 }
146
147
148 =head2 source_strand
149
150 Arg[1] : (optional) Int - source location strand
151 Description : Getter/setter for source location strand.
152 Return type : Int
153 Exceptions : none
154 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
155 Status : At Risk
156 : under development
157
158 =cut
159
160 sub source_strand {
161 my $self = shift;
162 $self->[2] = shift if (@_);
163 return $self->[2];
164 }
165
166
167 =head2 source_seq_region_name
168
169 Arg[1] : (optional) String - source location seq_region name
170 Description : Getter/setter for source location seq_region name.
171 Return type : String
172 Exceptions : none
173 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
174 Status : At Risk
175 : under development
176
177 =cut
178
179 sub source_seq_region_name {
180 my $self = shift;
181 $self->[3] = shift if (@_);
182 return $self->[3];
183 }
184
185
186 =head2 target_start
187
188 Arg[1] : (optional) Int - target location start coordinate
189 Description : Getter/setter for target location start coordinate.
190 Return type : Int
191 Exceptions : none
192 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
193 Status : At Risk
194 : under development
195
196 =cut
197
198 sub target_start {
199 my $self = shift;
200 $self->[4] = shift if (@_);
201 return $self->[4];
202 }
203
204
205 =head2 target_end
206
207 Arg[1] : (optional) Int - target location end coordinate
208 Description : Getter/setter for target location end coordinate.
209 Return type : Int
210 Exceptions : none
211 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
212 Status : At Risk
213 : under development
214
215 =cut
216
217 sub target_end {
218 my $self = shift;
219 $self->[5] = shift if (@_);
220 return $self->[5];
221 }
222
223
224 =head2 target_strand
225
226 Arg[1] : (optional) Int - target location strand
227 Description : Getter/setter for target location strand.
228 Return type : Int
229 Exceptions : none
230 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
231 Status : At Risk
232 : under development
233
234 =cut
235
236 sub target_strand {
237 my $self = shift;
238 $self->[6] = shift if (@_);
239 return $self->[6];
240 }
241
242
243 =head2 target_seq_region_name
244
245 Arg[1] : (optional) String - target location seq_region name
246 Description : Getter/setter for target location seq_region name.
247 Return type : String
248 Exceptions : none
249 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
250 Status : At Risk
251 : under development
252
253 =cut
254
255 sub target_seq_region_name {
256 my $self = shift;
257 $self->[7] = shift if (@_);
258 return $self->[7];
259 }
260
261
262 =head2 score
263
264 Arg[1] : (optional) Float - score
265 Description : Getter/setter for the score between source and target location.
266 Return type : Int
267 Exceptions : none
268 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
269 Status : At Risk
270 : under development
271
272 =cut
273
274 sub score {
275 my $self = shift;
276 $self->[8] = shift if (@_);
277 return $self->[8];
278 }
279
280
281 =head2 merge
282
283 Arg[1] : Bio::EnsEMBL::IdMapping::SyntenyRegion $sr - another
284 SyntenyRegion
285 Example : $merged_sr = $sr->merge($other_sr);
286 Description : Merges two overlapping SyntenyRegions if they meet certain
287 criteria (see documentation in the code for details). Score is
288 calculated as a combined distance score. If the two
289 SyntenyRegions aren't mergeable, this method returns undef.
290 Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion or undef
291 Exceptions : warns on bad scores
292 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
293 Status : At Risk
294 : under development
295
296 =cut
297
298 sub merge {
299 my ($self, $sr) = @_;
300
301 # must be on same seq_region
302 if ($self->source_seq_region_name ne $sr->source_seq_region_name or
303 $self->target_seq_region_name ne $sr->target_seq_region_name) {
304 return 0;
305 }
306
307 # target must be on same strand
308 return 0 unless ($self->target_strand == $sr->target_strand);
309
310 # find the distance of source and target pair and compare
311 my $source_dist = $sr->source_start - $self->source_start;
312 my $target_dist;
313 if ($self->target_strand == 1) {
314 $target_dist = $sr->target_start - $self->target_start;
315 } else {
316 $target_dist = $self->target_end - $sr->target_end;
317 }
318
319 # prevent division by zero error
320 if ($source_dist == 0 or $target_dist == 0) {
321 warn("WARNING: source_dist ($source_dist) and/or target_dist ($target_dist) is zero.\n");
322 return 0;
323 }
324
325 # calculate a distance score
326 my $dist = $source_dist - $target_dist;
327 $dist = -$dist if ($dist < 0);
328 my $d1 = $dist/$source_dist;
329 $d1 = -$d1 if ($d1 < 0);
330 my $d2 = $dist/$target_dist;
331 $d2 = -$d2 if ($d2 < 0);
332 my $dist_score = 1 - $d1 - $d2;
333
334 # distance score must be more than 50%
335 return 0 if ($dist_score < 0.5);
336
337 my $new_score = $dist_score * ($sr->score + $self->score)/2;
338
339 if ($new_score > 1) {
340 warn("WARNING: Bad merge score: $new_score\n");
341 }
342
343 # extend SyntenyRegion to cover both sources and targets, set merged score
344 # and return
345 if ($sr->source_start < $self->source_start) {
346 $self->source_start($sr->source_start);
347 }
348 if ($sr->source_end > $self->source_end) {
349 $self->source_end($sr->source_end);
350 }
351
352 if ($sr->target_start < $self->target_start) {
353 $self->target_start($sr->target_start);
354 }
355 if ($sr->target_end > $self->target_end) {
356 $self->target_end($sr->target_end);
357 }
358
359 $self->score($new_score);
360
361 return $self;
362 }
363
364
365 =head2 stretch
366
367 Arg[1] : Float $factor - stretching factor
368 Example : $stretched_sr = $sr->stretch(2);
369 Description : Extends this SyntenyRegion to span a $factor * $score more area.
370 Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion
371 Exceptions : none
372 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
373 Status : At Risk
374 : under development
375
376 =cut
377
378 sub stretch {
379 my ($self, $factor) = @_;
380
381 my $source_adjust = int(($self->source_end - $self->source_start + 1) *
382 $factor * $self->score);
383 $self->source_start($self->source_start - $source_adjust);
384 $self->source_end($self->source_end + $source_adjust);
385 #warn sprintf(" sss %d %d %d\n", $source_adjust, $self->source_start,
386 # $self->source_end);
387
388 my $target_adjust = int(($self->target_end - $self->target_start + 1) *
389 $factor * $self->score);
390 $self->target_start($self->target_start - $target_adjust);
391 $self->target_end($self->target_end + $target_adjust);
392
393 return $self;
394 }
395
396
397 =head2 score_location_relationship
398
399 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $source_gene - source gene
400 Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $target_gene - target gene
401 Example : my $score = $sr->score_location_relationship($source_gene,
402 $target_gene);
403 Description : This function calculates how well the given source location
404 interpolates on given target location inside this SyntenyRegion.
405
406 Scoring is done the following way: Source and target location
407 are normalized with respect to this Regions source and target.
408 Source range will then be somewhere close to 0.0-1.0 and target
409 range anything around that.
410
411 The extend of the covered area between source and target range
412 is a measurement of how well they agree (smaller extend is
413 better). The extend (actually 2*extend) is reduced by the size
414 of the regions. This will result in 0.0 if they overlap
415 perfectly and bigger values if they dont.
416
417 This is substracted from 1.0 to give the score. The score is
418 likely to be below zero, but is cut off at 0.0f.
419
420 Finally, the score is multiplied with the score of the synteny
421 itself.
422 Return type : Float
423 Exceptions : warns if score out of range
424 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
425 Status : At Risk
426 : under development
427
428 =cut
429
430
431
432 sub score_location_relationship {
433 my ($self, $source_gene, $target_gene) = @_;
434
435 # must be on same seq_region
436 if (($self->source_seq_region_name ne $source_gene->seq_region_name) or
437 ($self->target_seq_region_name ne $target_gene->seq_region_name)) {
438 return 0;
439 }
440
441 # strand relationship must be the same (use logical XOR to find out)
442 if (($self->source_strand == $source_gene->strand) xor
443 ($self->target_strand == $target_gene->strand)) {
444 return 0;
445 }
446
447 # normalise source location
448 my $source_rel_start = ($source_gene->start - $self->source_start) /
449 ($self->source_end - $self->source_start + 1);
450
451 my $source_rel_end = ($source_gene->end - $self->source_start + 1) /
452 ($self->source_end - $self->source_start + 1);
453
454 #warn " aaa ".$self->to_string."\n";
455 #warn sprintf(" bbb %.6f %.6f\n", $source_rel_start, $source_rel_end);
456
457 # cut off if the source location is completely outside
458 return 0 if ($source_rel_start > 1.1 or $source_rel_end < -0.1);
459
460 # normalise target location
461 my ($target_rel_start, $target_rel_end);
462 my $t_length = $self->target_end - $self->target_start + 1;
463
464 if ($self->target_strand == 1) {
465
466 $target_rel_start = ($target_gene->start - $self->target_start) / $t_length;
467
468 $target_rel_end = ($target_gene->end - $self->target_start + 1) / $t_length;
469
470 } else {
471 $target_rel_start = ($self->target_end - $target_gene->end) / $t_length;
472 $target_rel_end = ($self->target_end - $target_gene->start + 1) / $t_length;
473 }
474
475 my $added_range = (($target_rel_end > $source_rel_end) ? $target_rel_end :
476 $source_rel_end) -
477 (($target_rel_start < $source_rel_start) ? $target_rel_start :
478 $source_rel_start);
479
480 my $score = $self->score * (1 - (2 * $added_range - $target_rel_end -
481 $source_rel_end + $target_rel_start + $source_rel_start));
482
483 #warn " ccc ".sprintf("%.6f:%.6f:%.6f:%.6f:%.6f\n", $added_range,
484 # $source_rel_start, $source_rel_end, $target_rel_start, $target_rel_end);
485
486 $score = 0 if ($score < 0);
487
488 # sanity check
489 if ($score > 1) {
490 warn "Out of range score ($score) for ".$source_gene->id.":".
491 $target_gene->id."\n";
492 }
493
494 return $score;
495 }
496
497
498 =head2 to_string
499
500 Example : print LOG $sr->to_string, "\n";
501 Description : Returns a string representation of the SyntenyRegion object.
502 Useful for debugging and logging.
503 Return type : String
504 Exceptions : none
505 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
506 Status : At Risk
507 : under development
508
509 =cut
510
511 sub to_string {
512 my $self = shift;
513 return sprintf("%s:%s-%s:%s %s:%s-%s:%s %.6f",
514 $self->source_seq_region_name,
515 $self->source_start,
516 $self->source_end,
517 $self->source_strand,
518 $self->target_seq_region_name,
519 $self->target_start,
520 $self->target_end,
521 $self->target_strand,
522 $self->score
523 );
524 }
525
526
527 1;
528