Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::IdMapping::SyntenyRegion - object representing syntenic regions | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 # create a new SyntenyRegion from a source and a target gene | |
28 my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast( [ | |
29 $source_gene->start, $source_gene->end, | |
30 $source_gene->strand, $source_gene->seq_region_name, | |
31 $target_gene->start, $target_gene->end, | |
32 $target_gene->strand, $target_gene->seq_region_name, | |
33 $entry->score, | |
34 ] ); | |
35 | |
36 # merge with another SyntenyRegion | |
37 my $merged_sr = $sr->merge($sr1); | |
38 | |
39 # score a gene pair against this SyntenyRegion | |
40 my $score = | |
41 $sr->score_location_relationship( $source_gene1, $target_gene1 ); | |
42 | |
43 =head1 DESCRIPTION | |
44 | |
45 This object represents a synteny between a source and a target location. | |
46 SyntenyRegions are built from mapped genes, and the their score is | |
47 defined as the score of the gene mapping. For merged SyntenyRegions, | |
48 scores are combined. | |
49 | |
50 =head1 METHODS | |
51 | |
52 new_fast | |
53 source_start | |
54 source_end | |
55 source_strand | |
56 source_seq_region_name | |
57 target_start | |
58 target_end | |
59 target_strand | |
60 target_seq_region_name | |
61 score | |
62 merge | |
63 stretch | |
64 score_location_relationship | |
65 to_string | |
66 | |
67 =cut | |
68 | |
69 package Bio::EnsEMBL::IdMapping::SyntenyRegion; | |
70 | |
71 use strict; | |
72 use warnings; | |
73 no warnings 'uninitialized'; | |
74 | |
75 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
76 | |
77 | |
78 =head2 new_fast | |
79 | |
80 Arg[1] : Arrayref $array_ref - the arrayref to bless into the | |
81 SyntenyRegion object | |
82 Example : my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([ | |
83 ]); | |
84 Description : Constructor. On instantiation, source and target regions are | |
85 reverse complemented so that source is always on forward strand. | |
86 Return type : a Bio::EnsEMBL::IdMapping::SyntenyRegion object | |
87 Exceptions : none | |
88 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
89 Status : At Risk | |
90 : under development | |
91 | |
92 =cut | |
93 | |
94 sub new_fast { | |
95 my $class = shift; | |
96 my $array_ref = shift; | |
97 | |
98 # reverse complement source and target so that source is always on forward | |
99 # strand; this will make merging and other comparison operations easier | |
100 # at later stages | |
101 if ($array_ref->[2] == -1) { | |
102 $array_ref->[2] = 1; | |
103 $array_ref->[6] = -1 * $array_ref->[6]; | |
104 } | |
105 | |
106 return bless $array_ref, $class; | |
107 } | |
108 | |
109 | |
110 =head2 source_start | |
111 | |
112 Arg[1] : (optional) Int - source location start coordinate | |
113 Description : Getter/setter for source location start coordinate. | |
114 Return type : Int | |
115 Exceptions : none | |
116 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
117 Status : At Risk | |
118 : under development | |
119 | |
120 =cut | |
121 | |
122 sub source_start { | |
123 my $self = shift; | |
124 $self->[0] = shift if (@_); | |
125 return $self->[0]; | |
126 } | |
127 | |
128 | |
129 =head2 source_end | |
130 | |
131 Arg[1] : (optional) Int - source location end coordinate | |
132 Description : Getter/setter for source location end coordinate. | |
133 Return type : Int | |
134 Exceptions : none | |
135 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
136 Status : At Risk | |
137 : under development | |
138 | |
139 =cut | |
140 | |
141 sub source_end { | |
142 my $self = shift; | |
143 $self->[1] = shift if (@_); | |
144 return $self->[1]; | |
145 } | |
146 | |
147 | |
148 =head2 source_strand | |
149 | |
150 Arg[1] : (optional) Int - source location strand | |
151 Description : Getter/setter for source location strand. | |
152 Return type : Int | |
153 Exceptions : none | |
154 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
155 Status : At Risk | |
156 : under development | |
157 | |
158 =cut | |
159 | |
160 sub source_strand { | |
161 my $self = shift; | |
162 $self->[2] = shift if (@_); | |
163 return $self->[2]; | |
164 } | |
165 | |
166 | |
167 =head2 source_seq_region_name | |
168 | |
169 Arg[1] : (optional) String - source location seq_region name | |
170 Description : Getter/setter for source location seq_region name. | |
171 Return type : String | |
172 Exceptions : none | |
173 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
174 Status : At Risk | |
175 : under development | |
176 | |
177 =cut | |
178 | |
179 sub source_seq_region_name { | |
180 my $self = shift; | |
181 $self->[3] = shift if (@_); | |
182 return $self->[3]; | |
183 } | |
184 | |
185 | |
186 =head2 target_start | |
187 | |
188 Arg[1] : (optional) Int - target location start coordinate | |
189 Description : Getter/setter for target location start coordinate. | |
190 Return type : Int | |
191 Exceptions : none | |
192 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
193 Status : At Risk | |
194 : under development | |
195 | |
196 =cut | |
197 | |
198 sub target_start { | |
199 my $self = shift; | |
200 $self->[4] = shift if (@_); | |
201 return $self->[4]; | |
202 } | |
203 | |
204 | |
205 =head2 target_end | |
206 | |
207 Arg[1] : (optional) Int - target location end coordinate | |
208 Description : Getter/setter for target location end coordinate. | |
209 Return type : Int | |
210 Exceptions : none | |
211 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
212 Status : At Risk | |
213 : under development | |
214 | |
215 =cut | |
216 | |
217 sub target_end { | |
218 my $self = shift; | |
219 $self->[5] = shift if (@_); | |
220 return $self->[5]; | |
221 } | |
222 | |
223 | |
224 =head2 target_strand | |
225 | |
226 Arg[1] : (optional) Int - target location strand | |
227 Description : Getter/setter for target location strand. | |
228 Return type : Int | |
229 Exceptions : none | |
230 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
231 Status : At Risk | |
232 : under development | |
233 | |
234 =cut | |
235 | |
236 sub target_strand { | |
237 my $self = shift; | |
238 $self->[6] = shift if (@_); | |
239 return $self->[6]; | |
240 } | |
241 | |
242 | |
243 =head2 target_seq_region_name | |
244 | |
245 Arg[1] : (optional) String - target location seq_region name | |
246 Description : Getter/setter for target location seq_region name. | |
247 Return type : String | |
248 Exceptions : none | |
249 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
250 Status : At Risk | |
251 : under development | |
252 | |
253 =cut | |
254 | |
255 sub target_seq_region_name { | |
256 my $self = shift; | |
257 $self->[7] = shift if (@_); | |
258 return $self->[7]; | |
259 } | |
260 | |
261 | |
262 =head2 score | |
263 | |
264 Arg[1] : (optional) Float - score | |
265 Description : Getter/setter for the score between source and target location. | |
266 Return type : Int | |
267 Exceptions : none | |
268 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
269 Status : At Risk | |
270 : under development | |
271 | |
272 =cut | |
273 | |
274 sub score { | |
275 my $self = shift; | |
276 $self->[8] = shift if (@_); | |
277 return $self->[8]; | |
278 } | |
279 | |
280 | |
281 =head2 merge | |
282 | |
283 Arg[1] : Bio::EnsEMBL::IdMapping::SyntenyRegion $sr - another | |
284 SyntenyRegion | |
285 Example : $merged_sr = $sr->merge($other_sr); | |
286 Description : Merges two overlapping SyntenyRegions if they meet certain | |
287 criteria (see documentation in the code for details). Score is | |
288 calculated as a combined distance score. If the two | |
289 SyntenyRegions aren't mergeable, this method returns undef. | |
290 Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion or undef | |
291 Exceptions : warns on bad scores | |
292 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
293 Status : At Risk | |
294 : under development | |
295 | |
296 =cut | |
297 | |
298 sub merge { | |
299 my ($self, $sr) = @_; | |
300 | |
301 # must be on same seq_region | |
302 if ($self->source_seq_region_name ne $sr->source_seq_region_name or | |
303 $self->target_seq_region_name ne $sr->target_seq_region_name) { | |
304 return 0; | |
305 } | |
306 | |
307 # target must be on same strand | |
308 return 0 unless ($self->target_strand == $sr->target_strand); | |
309 | |
310 # find the distance of source and target pair and compare | |
311 my $source_dist = $sr->source_start - $self->source_start; | |
312 my $target_dist; | |
313 if ($self->target_strand == 1) { | |
314 $target_dist = $sr->target_start - $self->target_start; | |
315 } else { | |
316 $target_dist = $self->target_end - $sr->target_end; | |
317 } | |
318 | |
319 # prevent division by zero error | |
320 if ($source_dist == 0 or $target_dist == 0) { | |
321 warn("WARNING: source_dist ($source_dist) and/or target_dist ($target_dist) is zero.\n"); | |
322 return 0; | |
323 } | |
324 | |
325 # calculate a distance score | |
326 my $dist = $source_dist - $target_dist; | |
327 $dist = -$dist if ($dist < 0); | |
328 my $d1 = $dist/$source_dist; | |
329 $d1 = -$d1 if ($d1 < 0); | |
330 my $d2 = $dist/$target_dist; | |
331 $d2 = -$d2 if ($d2 < 0); | |
332 my $dist_score = 1 - $d1 - $d2; | |
333 | |
334 # distance score must be more than 50% | |
335 return 0 if ($dist_score < 0.5); | |
336 | |
337 my $new_score = $dist_score * ($sr->score + $self->score)/2; | |
338 | |
339 if ($new_score > 1) { | |
340 warn("WARNING: Bad merge score: $new_score\n"); | |
341 } | |
342 | |
343 # extend SyntenyRegion to cover both sources and targets, set merged score | |
344 # and return | |
345 if ($sr->source_start < $self->source_start) { | |
346 $self->source_start($sr->source_start); | |
347 } | |
348 if ($sr->source_end > $self->source_end) { | |
349 $self->source_end($sr->source_end); | |
350 } | |
351 | |
352 if ($sr->target_start < $self->target_start) { | |
353 $self->target_start($sr->target_start); | |
354 } | |
355 if ($sr->target_end > $self->target_end) { | |
356 $self->target_end($sr->target_end); | |
357 } | |
358 | |
359 $self->score($new_score); | |
360 | |
361 return $self; | |
362 } | |
363 | |
364 | |
365 =head2 stretch | |
366 | |
367 Arg[1] : Float $factor - stretching factor | |
368 Example : $stretched_sr = $sr->stretch(2); | |
369 Description : Extends this SyntenyRegion to span a $factor * $score more area. | |
370 Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion | |
371 Exceptions : none | |
372 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
373 Status : At Risk | |
374 : under development | |
375 | |
376 =cut | |
377 | |
378 sub stretch { | |
379 my ($self, $factor) = @_; | |
380 | |
381 my $source_adjust = int(($self->source_end - $self->source_start + 1) * | |
382 $factor * $self->score); | |
383 $self->source_start($self->source_start - $source_adjust); | |
384 $self->source_end($self->source_end + $source_adjust); | |
385 #warn sprintf(" sss %d %d %d\n", $source_adjust, $self->source_start, | |
386 # $self->source_end); | |
387 | |
388 my $target_adjust = int(($self->target_end - $self->target_start + 1) * | |
389 $factor * $self->score); | |
390 $self->target_start($self->target_start - $target_adjust); | |
391 $self->target_end($self->target_end + $target_adjust); | |
392 | |
393 return $self; | |
394 } | |
395 | |
396 | |
397 =head2 score_location_relationship | |
398 | |
399 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $source_gene - source gene | |
400 Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $target_gene - target gene | |
401 Example : my $score = $sr->score_location_relationship($source_gene, | |
402 $target_gene); | |
403 Description : This function calculates how well the given source location | |
404 interpolates on given target location inside this SyntenyRegion. | |
405 | |
406 Scoring is done the following way: Source and target location | |
407 are normalized with respect to this Regions source and target. | |
408 Source range will then be somewhere close to 0.0-1.0 and target | |
409 range anything around that. | |
410 | |
411 The extend of the covered area between source and target range | |
412 is a measurement of how well they agree (smaller extend is | |
413 better). The extend (actually 2*extend) is reduced by the size | |
414 of the regions. This will result in 0.0 if they overlap | |
415 perfectly and bigger values if they dont. | |
416 | |
417 This is substracted from 1.0 to give the score. The score is | |
418 likely to be below zero, but is cut off at 0.0f. | |
419 | |
420 Finally, the score is multiplied with the score of the synteny | |
421 itself. | |
422 Return type : Float | |
423 Exceptions : warns if score out of range | |
424 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
425 Status : At Risk | |
426 : under development | |
427 | |
428 =cut | |
429 | |
430 | |
431 | |
432 sub score_location_relationship { | |
433 my ($self, $source_gene, $target_gene) = @_; | |
434 | |
435 # must be on same seq_region | |
436 if (($self->source_seq_region_name ne $source_gene->seq_region_name) or | |
437 ($self->target_seq_region_name ne $target_gene->seq_region_name)) { | |
438 return 0; | |
439 } | |
440 | |
441 # strand relationship must be the same (use logical XOR to find out) | |
442 if (($self->source_strand == $source_gene->strand) xor | |
443 ($self->target_strand == $target_gene->strand)) { | |
444 return 0; | |
445 } | |
446 | |
447 # normalise source location | |
448 my $source_rel_start = ($source_gene->start - $self->source_start) / | |
449 ($self->source_end - $self->source_start + 1); | |
450 | |
451 my $source_rel_end = ($source_gene->end - $self->source_start + 1) / | |
452 ($self->source_end - $self->source_start + 1); | |
453 | |
454 #warn " aaa ".$self->to_string."\n"; | |
455 #warn sprintf(" bbb %.6f %.6f\n", $source_rel_start, $source_rel_end); | |
456 | |
457 # cut off if the source location is completely outside | |
458 return 0 if ($source_rel_start > 1.1 or $source_rel_end < -0.1); | |
459 | |
460 # normalise target location | |
461 my ($target_rel_start, $target_rel_end); | |
462 my $t_length = $self->target_end - $self->target_start + 1; | |
463 | |
464 if ($self->target_strand == 1) { | |
465 | |
466 $target_rel_start = ($target_gene->start - $self->target_start) / $t_length; | |
467 | |
468 $target_rel_end = ($target_gene->end - $self->target_start + 1) / $t_length; | |
469 | |
470 } else { | |
471 $target_rel_start = ($self->target_end - $target_gene->end) / $t_length; | |
472 $target_rel_end = ($self->target_end - $target_gene->start + 1) / $t_length; | |
473 } | |
474 | |
475 my $added_range = (($target_rel_end > $source_rel_end) ? $target_rel_end : | |
476 $source_rel_end) - | |
477 (($target_rel_start < $source_rel_start) ? $target_rel_start : | |
478 $source_rel_start); | |
479 | |
480 my $score = $self->score * (1 - (2 * $added_range - $target_rel_end - | |
481 $source_rel_end + $target_rel_start + $source_rel_start)); | |
482 | |
483 #warn " ccc ".sprintf("%.6f:%.6f:%.6f:%.6f:%.6f\n", $added_range, | |
484 # $source_rel_start, $source_rel_end, $target_rel_start, $target_rel_end); | |
485 | |
486 $score = 0 if ($score < 0); | |
487 | |
488 # sanity check | |
489 if ($score > 1) { | |
490 warn "Out of range score ($score) for ".$source_gene->id.":". | |
491 $target_gene->id."\n"; | |
492 } | |
493 | |
494 return $score; | |
495 } | |
496 | |
497 | |
498 =head2 to_string | |
499 | |
500 Example : print LOG $sr->to_string, "\n"; | |
501 Description : Returns a string representation of the SyntenyRegion object. | |
502 Useful for debugging and logging. | |
503 Return type : String | |
504 Exceptions : none | |
505 Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
506 Status : At Risk | |
507 : under development | |
508 | |
509 =cut | |
510 | |
511 sub to_string { | |
512 my $self = shift; | |
513 return sprintf("%s:%s-%s:%s %s:%s-%s:%s %.6f", | |
514 $self->source_seq_region_name, | |
515 $self->source_start, | |
516 $self->source_end, | |
517 $self->source_strand, | |
518 $self->target_seq_region_name, | |
519 $self->target_start, | |
520 $self->target_end, | |
521 $self->target_strand, | |
522 $self->score | |
523 ); | |
524 } | |
525 | |
526 | |
527 1; | |
528 |