comparison variant_effect_predictor/Bio/EnsEMBL/Variation/BaseTranscriptVariation.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Variation::BaseTranscriptVariation
24
25 =head1 SYNOPSIS
26
27 use Bio::EnsEMBL::Variation::BaseTranscriptVariation;
28
29 =head1 DESCRIPTION
30
31 A helper class for representing an overlap of a Transcript and a
32 Variation (either sequence or structural). Should not be invoked directly.
33
34 =cut
35
36 package Bio::EnsEMBL::Variation::BaseTranscriptVariation;
37
38 use strict;
39 use warnings;
40
41 use Digest::MD5 qw(md5_hex);
42
43 use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref);
44 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap within_cds);
45
46 use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlap);
47
48 =head2 transcript_stable_id
49
50 Description: Returns the stable_id of the associated Transcript
51 Returntype : string
52 Exceptions : none
53 Status : At Risk
54
55 =cut
56
57 sub transcript_stable_id {
58 my $self = shift;
59 return $self->SUPER::_feature_stable_id(@_);
60 }
61
62 =head2 transcript
63
64 Arg [1] : (optional) Bio::EnsEMBL::Transcript
65 Description: Get/set the associated Transcript
66 Returntype : Bio::EnsEMBL::Transcript
67 Exceptions : throws if argument is wrong type
68 Status : At Risk
69
70 =cut
71
72 sub transcript {
73 my ($self, $transcript) = @_;
74 assert_ref($transcript, 'Bio::EnsEMBL::Transcript') if $transcript;
75 return $self->SUPER::feature($transcript, 'Transcript');
76 }
77
78 =head2 feature
79
80 Arg [1] : (optional) Bio::EnsEMBL::Transcript
81 Description: Get/set the associated Transcript (overriding the superclass feature method)
82 Returntype : Bio::EnsEMBL::Transcript
83 Exceptions : throws if argument is wrong type
84 Status : At Risk
85
86 =cut
87
88 sub feature {
89 my $self = shift;
90 return $self->transcript(@_);
91 }
92
93 =head2 cdna_start
94
95 Arg [1] : (optional) int $start
96 Example : $cdna_start = $tv->cdna_start;
97 Description: Getter/Setter for the start position of this variation on the
98 transcript in cDNA coordinates.
99 Returntype : int
100 Exceptions : None
101 Caller : general
102 Status : Stable
103
104 =cut
105
106 sub cdna_start {
107 my ($self, $cdna_start) = @_;
108
109 $self->{cdna_start} = $cdna_start if defined $cdna_start;
110
111 unless (exists $self->{cdna_start}) {
112 my $cdna_coords = $self->cdna_coords;
113
114 my ($first, $last) = ($cdna_coords->[0], $cdna_coords->[-1]);
115
116 $self->{cdna_start} = $first->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $first->start;
117 $self->{cdna_end} = $last->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $last->end;
118 }
119
120 return $self->{cdna_start};
121 }
122
123 =head2 cdna_end
124
125 Arg [1] : (optional) int $end
126 Example : $cdna_end = $tv->cdna_end;
127 Description: Getter/Setter for the end position of this variation on the
128 transcript in cDNA coordinates.
129 Returntype : int
130 Exceptions : None
131 Caller : general
132 Status : Stable
133
134 =cut
135
136 sub cdna_end {
137 my ($self, $cdna_end) = @_;
138
139 $self->{cdna_end} = $cdna_end if defined $cdna_end;
140
141 # call cdna_start to calculate the start and end
142 $self->cdna_start unless exists $self->{cdna_end};
143
144 return $self->{cdna_end};
145 }
146
147 =head2 cds_start
148
149 Arg [1] : (optional) int $start
150 Example : $cds_start = $tv->cds_start;
151 Description: Getter/Setter for the start position of this variation on the
152 transcript in CDS coordinates.
153 Returntype : int
154 Exceptions : None
155 Caller : general
156 Status : Stable
157
158 =cut
159
160 sub cds_start {
161 my ($self, $cds_start) = @_;
162
163 $self->{cds_start} = $cds_start if defined $cds_start;
164
165 unless (exists $self->{cds_start}) {
166 my $cds_coords = $self->cds_coords;
167
168 my ($first, $last) = ($cds_coords->[0], $cds_coords->[-1]);
169 my $exon_phase = $self->transcript->start_Exon->phase;
170
171 $self->{cds_start} = $first->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $first->start + ($exon_phase > 0 ? $exon_phase : 0);
172 $self->{cds_end} = $last->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $last->end + ($exon_phase > 0 ? $exon_phase : 0);
173 }
174
175 return $self->{cds_start};
176 }
177
178 =head2 cds_end
179
180 Arg [1] : (optional) int $end
181 Example : $cds_end = $tv->cds_end;
182 Description: Getter/Setter for the end position of this variation on the
183 transcript in CDS coordinates.
184 Returntype : int
185 Exceptions : None
186 Caller : general
187 Status : Stable
188
189 =cut
190
191 sub cds_end {
192 my ($self, $cds_end) = @_;
193
194 $self->{cds_end} = $cds_end if defined $cds_end;
195
196 # call cds_start to calculate the start and end
197 $self->cds_start unless exists $self->{cds_end};
198
199 return $self->{cds_end};
200 }
201
202 =head2 translation_start
203
204 Arg [1] : (optional) int $start
205 Example : $translation_start = $tv->translation_start;
206 Description: Getter/Setter for the start position of this variation on the
207 transcript in peptide coordinates.
208 Returntype : int
209 Exceptions : None
210 Caller : general
211 Status : Stable
212
213 =cut
214
215 sub translation_start {
216 my ($self, $translation_start) = @_;
217
218 $self->{translation_start} = $translation_start if defined $translation_start;
219
220 unless (exists $self->{translation_start}) {
221 my $translation_coords = $self->translation_coords;
222
223 my ($first, $last) = ($translation_coords->[0], $translation_coords->[-1]);
224
225 $self->{translation_start} = $first->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $first->start;
226 $self->{translation_end} = $last->isa('Bio::EnsEMBL::Mapper::Gap') ? undef : $last->end;
227 }
228
229 return $self->{translation_start};
230 }
231
232
233 =head2 translation_end
234
235 Arg [1] : (optional) int $end
236 Example : $transaltion_end = $tv->translation_end;
237 Description: Getter/Setter for the end position of this variation on the
238 transcript in peptide coordinates.
239 Returntype : int
240 Exceptions : None
241 Caller : general
242 Status : Stable
243
244 =cut
245
246 sub translation_end {
247 my ($self, $translation_end) = @_;
248
249 $self->{translation_end} = $translation_end if defined $translation_end;
250
251 # call translation_start to calculate the start and end
252 $self->translation_start unless exists $self->{translation_end};
253
254 return $self->{translation_end};
255 }
256
257 =head2 cdna_coords
258
259 Description: Use the TranscriptMapper to calculate the cDNA
260 coordinates of this variation
261 Returntype : listref of Bio::EnsEMBL::Coordinate and Bio::EnsEMBL::Gap objects
262 Exceptions : None
263 Caller : general
264 Status : Stable
265
266 =cut
267
268 sub cdna_coords {
269 my $self = shift;
270
271 unless ($self->{_cdna_coords}) {
272 my $vf = $self->base_variation_feature;
273 my $tran = $self->transcript;
274 $self->{_cdna_coords} = [ $self->_mapper->genomic2cdna($vf->start, $vf->end, $tran->strand) ];
275 }
276
277 return $self->{_cdna_coords};
278 }
279
280 =head2 cds_coords
281
282 Description: Use the TranscriptMapper to calculate the CDS
283 coordinates of this variation
284 Returntype : listref of Bio::EnsEMBL::Coordinate and Bio::EnsEMBL::Gap objects
285 Exceptions : None
286 Caller : general
287 Status : Stable
288
289 =cut
290
291 sub cds_coords {
292 my $self = shift;
293
294 unless ($self->{_cds_coords}) {
295 my $vf = $self->base_variation_feature;
296 my $tran = $self->transcript;
297 $self->{_cds_coords} = [ $self->_mapper->genomic2cds($vf->start, $vf->end, $tran->strand) ];
298 }
299
300 return $self->{_cds_coords};
301 }
302
303 =head2 translation_coords
304
305 Description: Use the TranscriptMapper to calculate the peptide
306 coordinates of this variation
307 Returntype : listref of Bio::EnsEMBL::Coordinate and Bio::EnsEMBL::Gap objects
308 Exceptions : None
309 Caller : general
310 Status : Stable
311
312 =cut
313
314 sub translation_coords {
315 my $self = shift;
316
317 unless ($self->{_translation_coords}) {
318 my $vf = $self->base_variation_feature;
319 my $tran = $self->transcript;
320 $self->{_translation_coords} = [ $self->_mapper->genomic2pep($vf->start, $vf->end, $tran->strand) ];
321 }
322
323 return $self->{_translation_coords};
324 }
325
326
327 =head2 distance_to_transcript
328
329 Arg [1] : (optional) int $distance
330 Example : $distance = $tv->distance_to_transcript;
331 Description: Getter/Setter for the distance of this variant to the transcript.
332 This is the shortest distance between variant start/end and
333 transcript start/end, so if a variant falls 5' of a transcript
334 on the forward strand this distance will be that between the
335 variant end and the transcript start; if it falls 3' it will be
336 the distance between the variant start and the transcript end.
337 Returntype : int
338 Exceptions : None
339 Caller : general
340 Status : Stable
341
342 =cut
343
344 sub distance_to_transcript {
345 my ($self, $distance) = @_;
346
347 $self->{distance_to_transcript} = $distance if defined $distance;
348
349 unless (exists $self->{distance_to_transcript}) {
350 my $vf = $self->base_variation_feature;
351 my $tr = $self->transcript;
352
353 my @dists = (
354 $vf->start - $tr->start,
355 $vf->start - $tr->end,
356 $vf->end - $tr->start,
357 $vf->end - $tr->end
358 );
359
360 # make positive if <0 and sort
361 @dists = sort {$a <=> $b} map {$_ < 0 ? 0 - $_ : $_} @dists;
362
363 $self->{distance_to_transcript} = $dists[0];
364 }
365
366 return $self->{distance_to_transcript};
367 }
368
369 =head2 get_overlapping_ProteinFeatures
370
371 Description: Find any ProteinFeatures (e.g. pfam or interpro domains etc.) that
372 the associated variation feature lies in
373 Returntype : listref of Bio::EnsEMBL::ProteinFeatures (possibly empty)
374 Exceptions : None
375 Caller : general
376 Status : At Risk
377
378 =cut
379
380 sub get_overlapping_ProteinFeatures {
381 my $self = shift;
382
383 unless (exists $self->{_protein_features}) {
384
385 $self->{_protein_features } = [];
386
387 my $tl = $self->transcript->translation;
388
389 if (defined $tl) {
390
391 my $tl_start = $self->translation_start;
392 my $tl_end = $self->translation_end;
393
394 if (defined $tl_start && defined $tl_end) {
395 for my $feat (@{ $tl->get_all_ProteinFeatures }) {
396 if (overlap($feat->start, $feat->end, $tl_start, $tl_end)) {
397 push @{ $self->{_protein_features} }, $feat;
398 }
399 }
400 }
401 }
402 }
403
404 return $self->{_protein_features};
405 }
406
407 =head2 exon_number
408
409 Description: Identify which exon(s) this variant falls in
410 Returntype : '/'-separated string containing the exon number(s) and the total
411 number of exons in this transcript, or undef if this variant
412 does not fall in any exons
413 Exceptions : None
414 Caller : general
415 Status : At Risk
416
417 =cut
418
419 sub exon_number {
420 my $self = shift;
421 $self->_exon_intron_number unless exists $self->{exon_number};
422 return $self->{exon_number};
423 }
424
425 =head2 intron_number
426
427 Description: Identify which intron(s) this variant falls in
428 Returntype : '/'-separated string containing the intron number(s) and the total
429 number of introns in this transcript, or undef if this variant
430 does not fall in any introns
431 Exceptions : None
432 Caller : general
433 Status : At Risk
434
435 =cut
436
437 sub intron_number {
438 my $self = shift;
439 $self->_exon_intron_number unless exists $self->{intron_number};
440 return $self->{intron_number};
441 }
442
443 sub _exon_intron_number {
444 my $self = shift;
445
446 # work out which exon or intron this variant falls in
447
448 # ensure the keys exist so even if we don't fall in an exon
449 # or intron we'll only call this method once
450
451 $self->{exon_number} = $self->{intron_number} = undef;
452
453 my $vf = $self->base_variation_feature;
454
455 my $vf_start = $vf->start;
456 my $vf_end = $vf->end;
457
458 my $strand = $self->transcript->strand;
459
460 my $exons = $self->_exons;
461
462 my $tot_exons = scalar(@$exons);
463
464 my $exon_count = 0;
465
466 my $prev_exon;
467
468 my (@overlapped_exons, @overlapped_introns);
469
470 for my $exon (@$exons) {
471
472 $exon_count++;
473
474 if (overlap($vf_start, $vf_end, $exon->start, $exon->end)) {
475 push @overlapped_exons, $exon_count;
476 #$self->{exon_number} = defined($self->{exon_number}) ? $self->{exon_number}.",".$exon_count : $exon_count;
477 }
478
479 if ($prev_exon) {
480 my $intron_start = $strand == 1 ? $prev_exon->end + 1 : $exon->end + 1;
481 my $intron_end = $strand == 1 ? $exon->start - 1 : $prev_exon->start - 1;
482
483 if ($prev_exon && overlap($vf_start, $vf_end, $intron_start, $intron_end)) {
484 push @overlapped_introns, $exon_count - 1;
485 #$self->{intron_number} = defined($self->{intron_number}) ? $self->{intron_number}.",".($exon_count - 1) : ($exon_count - 1);
486 }
487 }
488
489 $prev_exon = $exon;
490 }
491
492 if(@overlapped_exons) {
493 $self->{exon_number} = (scalar @overlapped_exons > 1 ? $overlapped_exons[0].'-'.$overlapped_exons[-1] : $overlapped_exons[0]).'/'.$tot_exons;
494 }
495 if(@overlapped_introns) {
496 $self->{intron_number} = (scalar @overlapped_introns > 1 ? $overlapped_introns[0].'-'.$overlapped_introns[-1] : $overlapped_introns[0]).'/'.($tot_exons - 1);
497 }
498 }
499
500 sub _intron_effects {
501 my $self = shift;
502
503 # internal method used by Bio::EnsEMBL::Variation::Utils::VariationEffect
504 # when calculating various consequence types
505
506 # this method is a major bottle neck in the effect calculation code so
507 # we cache results and use local variables instead of method calls where
508 # possible to speed things up - caveat bug-fixer!
509
510 unless ($self->{_intron_effects}) {
511
512 my $vf = $self->base_variation_feature;
513
514 my $intron_effects = {};
515
516 my $found_effect = 0;
517
518 my $vf_start = $vf->start;
519 my $vf_end = $vf->end;
520
521 my $insertion = $vf_start == $vf_end+1;
522
523 for my $intron (@{ $self->_introns }) {
524
525 my $intron_start = $intron->start;
526 my $intron_end = $intron->end;
527
528 # under various circumstances the genebuild process can introduce
529 # artificial short (<= 12 nucleotide) introns into transcripts
530 # (e.g. to deal with errors in the reference sequence etc.), we
531 # don't want to categorise variations that fall in these introns
532 # as intronic, or as any kind of splice variant
533
534 my $frameshift_intron = ( abs($intron_end - $intron_start) <= 12 );
535
536 if ($frameshift_intron) {
537 if (overlap($vf_start, $vf_end, $intron_start, $intron_end)) {
538 $intron_effects->{within_frameshift_intron} = 1;
539 next;
540 }
541 }
542
543 if (overlap($vf_start, $vf_end, $intron_start, $intron_start+1)) {
544 $intron_effects->{start_splice_site} = 1;
545 }
546
547 if (overlap($vf_start, $vf_end, $intron_end-1, $intron_end)) {
548 $intron_effects->{end_splice_site} = 1;
549 }
550
551 # we need to special case insertions between the donor and acceptor sites
552
553 if (overlap($vf_start, $vf_end, $intron_start+2, $intron_end-2) or
554 ($insertion && ($vf_start == $intron_start+2 || $vf_end == $intron_end-2)) ) {
555 $intron_effects->{intronic} = 1;
556 }
557
558 # the definition of splice_region (SO:0001630) is "within 1-3 bases
559 # of the exon or 3-8 bases of the intron", the intron start is the
560 # first base of the intron so we only need to add or subtract 7 from
561 # it to get the correct coordinate. We also need to special case
562 # insertions between the edge of an exon and a donor or acceptor site
563 # and between a donor or acceptor site and the intron
564
565 if ( overlap($vf_start, $vf_end, $intron_start-3, $intron_start-1) or
566 overlap($vf_start, $vf_end, $intron_start+2, $intron_start+7) or
567 overlap($vf_start, $vf_end, $intron_end-7, $intron_end-2 ) or
568 overlap($vf_start, $vf_end, $intron_end+1, $intron_end+3 ) or
569 ($insertion && (
570 $vf_start == $intron_start ||
571 $vf_end == $intron_end ||
572 $vf_start == $intron_start+2 ||
573 $vf_end == $intron_end-2
574 ) )) {
575
576 $intron_effects->{splice_region} = 1;
577 }
578 }
579
580 $self->{_intron_effects} = $intron_effects;
581 }
582
583 return $self->{_intron_effects};
584 }
585
586 # NB: the methods below all cache their data in the associated transcript itself, this
587 # gives a significant speed up when you are calculating the effect of all variations
588 # on a transcript, and means that the cache will be freed when the transcript itself
589 # is garbage collected rather than us having to maintain a transcript feature cache
590 # ourselves
591
592 sub _introns {
593 my $self = shift;
594
595 my $tran = $self->transcript;
596
597 my $introns = $tran->{_variation_effect_feature_cache}->{introns} ||= $tran->get_all_Introns;
598
599 return $introns;
600 }
601
602 sub _exons {
603 my $self = shift;
604
605 my $tran = $self->transcript;
606
607 my $exons = $tran->{_variation_effect_feature_cache}->{exons} ||= $tran->get_all_Exons;
608
609 return $exons;
610 }
611
612 sub _mapper {
613 my $self = shift;
614
615 my $tran = $self->transcript;
616
617 my $mapper = $tran->{_variation_effect_feature_cache}->{mapper} ||= $tran->get_TranscriptMapper;
618
619 return $mapper;
620 }
621 sub _translateable_seq {
622 my $self = shift;
623
624 my $tran = $self->transcript;
625
626 my $tran_seq = $tran->{_variation_effect_feature_cache}->{translateable_seq} ||= $tran->translateable_seq;
627
628 return $tran_seq;
629 }
630
631 sub _peptide {
632 my $self = shift;
633
634 my $tran = $self->transcript;
635
636 my $peptide = $tran->{_variation_effect_feature_cache}->{peptide};
637
638 unless ($peptide) {
639 my $translation = $tran->translate;
640 $peptide = $translation ? $translation->seq : undef;
641 $tran->{_variation_effect_feature_cache}->{peptide} = $peptide;
642 }
643
644 return $peptide;
645 }
646
647 sub _translation_md5 {
648 my $self = shift;
649
650 my $tran = $self->transcript;
651
652 unless (exists $tran->{_variation_effect_feature_cache}->{translation_md5}) {
653 $tran->{_variation_effect_feature_cache}->{translation_md5} =
654 $self->_peptide ? md5_hex($self->_peptide) : undef;
655 }
656
657 return $tran->{_variation_effect_feature_cache}->{translation_md5};
658 }
659
660 sub _codon_table {
661 my $self = shift;
662
663 my $tran = $self->transcript;
664
665 my $codon_table = $tran->{_variation_effect_feature_cache}->{codon_table};
666
667 unless ($codon_table) {
668 # for mithocondrial dna we need to to use a different codon table
669 my $attrib = $tran->slice->get_all_Attributes('codon_table')->[0];
670
671 # default to the vertebrate codon table which is denoted as 1
672 $codon_table = $attrib ? $attrib->value : 1;
673
674 $tran->{_variation_effect_feature_cache}->{codon_table} = $codon_table
675 }
676
677 return $codon_table;
678 }
679
680 1;