0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Variation::TranscriptVariationAllele
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 use Bio::EnsEMBL::Variation::TranscriptVariationAllele;
|
|
28
|
|
29 my $tva = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new(
|
|
30 -transcript_variation => $tv,
|
|
31 -variation_feature_seq => 'A',
|
|
32 -is_reference => 0,
|
|
33 );
|
|
34
|
|
35 print "sequence with respect to the transcript: ", $tva->feature_seq, "\n";
|
|
36 print "sequence with respect to the variation feature: ", $tva->variation_feature_seq, "\n";
|
|
37 print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $tva->get_all_OverlapConsequences }), "\n";
|
|
38 print "amino acid change: ", $tva->peptide_allele_string, "\n";
|
|
39 print "resulting codon: ", $tva->codon, "\n";
|
|
40 print "reference codon: ", $tva->transcript_variation->get_reference_TranscriptVariationAllele->codon, "\n";
|
|
41 print "PolyPhen prediction: ", $tva->polyphen_prediction, "\n";
|
|
42 print "SIFT prediction: ", $tva->sift_prediction, "\n";
|
|
43
|
|
44 =head1 DESCRIPTION
|
|
45
|
|
46 A TranscriptVariationAllele object represents a single allele of a TranscriptVariation.
|
|
47 It provides methods that are specific to the sequence of the allele, such as codon,
|
|
48 peptide etc. Methods that depend only on position (e.g. CDS start) will be found in
|
|
49 the associated TranscriptVariation. Ordinarily you will not create these objects
|
|
50 yourself, but instead you would create a TranscriptVariation object which will then
|
|
51 construct TranscriptVariationAlleles based on the allele string of the associated
|
|
52 VariationFeature.
|
|
53
|
|
54 Note that any methods that are not specific to Transcripts will be found in the
|
|
55 VariationFeatureOverlapAllele superclass.
|
|
56
|
|
57 =cut
|
|
58
|
|
59 package Bio::EnsEMBL::Variation::TranscriptVariationAllele;
|
|
60
|
|
61 use strict;
|
|
62 use warnings;
|
|
63
|
|
64 use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw($AA_LOOKUP);
|
|
65 use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
|
|
66 use Bio::EnsEMBL::Variation::Utils::Sequence qw(hgvs_variant_notation format_hgvs_string);
|
|
67 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
68 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(within_cds within_intron stop_lost affects_start_codon);
|
|
69
|
|
70 use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele);
|
|
71
|
|
72
|
|
73 our $DEBUG = 0;
|
|
74
|
|
75 sub new_fast {
|
|
76 my ($class, $hashref) = @_;
|
|
77
|
|
78 # swap a transcript_variation argument for a variation_feature_overlap one
|
|
79
|
|
80 if ($hashref->{transcript_variation}) {
|
|
81 $hashref->{variation_feature_overlap} = delete $hashref->{transcript_variation};
|
|
82 }
|
|
83
|
|
84 # and call the superclass
|
|
85
|
|
86 return $class->SUPER::new_fast($hashref);
|
|
87 }
|
|
88
|
|
89 =head2 transcript_variation
|
|
90
|
|
91 Description: Get/set the associated TranscriptVariation
|
|
92 Returntype : Bio::EnsEMBL::Variation::TranscriptVariation
|
|
93 Exceptions : throws if the argument is the wrong type
|
|
94 Status : At Risk
|
|
95
|
|
96 =cut
|
|
97
|
|
98 sub transcript_variation {
|
|
99 my ($self, $tv) = @_;
|
|
100 assert_ref($tv, 'Bio::EnsEMBL::Variation::TranscriptVariation') if $tv;
|
|
101 return $self->variation_feature_overlap($tv);
|
|
102 }
|
|
103
|
|
104 =head2 variation_feature
|
|
105
|
|
106 Description: Get the associated VariationFeature
|
|
107 Returntype : Bio::EnsEMBL::Variation::VariationFeature
|
|
108 Exceptions : none
|
|
109 Status : At Risk
|
|
110
|
|
111 =cut
|
|
112
|
|
113 sub variation_feature {
|
|
114 my $self = shift;
|
|
115 return $self->transcript_variation->variation_feature;
|
|
116 }
|
|
117
|
|
118 =head2 pep_allele_string
|
|
119
|
|
120 Description: Return a '/' delimited string of the reference peptide and the
|
|
121 peptide resulting from this allele, or a single peptide if this
|
|
122 allele does not change the peptide (e.g. because it is synonymous)
|
|
123 Returntype : string or undef if this allele is not in the CDS
|
|
124 Exceptions : none
|
|
125 Status : At Risk
|
|
126
|
|
127 =cut
|
|
128
|
|
129 sub pep_allele_string {
|
|
130 my ($self) = @_;
|
|
131
|
|
132 my $pep = $self->peptide;
|
|
133
|
|
134 return undef unless $pep;
|
|
135
|
|
136 my $ref_pep = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide;
|
|
137
|
|
138 return undef unless $ref_pep;
|
|
139
|
|
140 return $ref_pep ne $pep ? $ref_pep.'/'.$pep : $pep;
|
|
141 }
|
|
142
|
|
143 =head2 codon_allele_string
|
|
144
|
|
145 Description: Return a '/' delimited string of the reference codon and the
|
|
146 codon resulting from this allele
|
|
147 Returntype : string or undef if this allele is not in the CDS
|
|
148 Exceptions : none
|
|
149 Status : At Risk
|
|
150
|
|
151 =cut
|
|
152
|
|
153 sub codon_allele_string {
|
|
154 my ($self) = @_;
|
|
155
|
|
156 my $codon = $self->codon;
|
|
157
|
|
158 return undef unless $codon;
|
|
159
|
|
160 my $ref_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->codon;
|
|
161
|
|
162 return $ref_codon.'/'.$codon;
|
|
163 }
|
|
164
|
|
165 =head2 display_codon_allele_string
|
|
166
|
|
167 Description: Return a '/' delimited string of the reference display_codon and the
|
|
168 display_codon resulting from this allele. The display_codon identifies
|
|
169 the nucleotides affected by this variant in UPPER CASE and other
|
|
170 nucleotides in lower case
|
|
171 Returntype : string or undef if this allele is not in the CDS
|
|
172 Exceptions : none
|
|
173 Status : At Risk
|
|
174
|
|
175 =cut
|
|
176
|
|
177 sub display_codon_allele_string {
|
|
178 my ($self) = @_;
|
|
179
|
|
180 my $display_codon = $self->display_codon;
|
|
181
|
|
182 return undef unless $display_codon;
|
|
183
|
|
184 my $ref_display_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->display_codon;
|
|
185
|
|
186 return undef unless $ref_display_codon;
|
|
187
|
|
188 return $ref_display_codon.'/'.$display_codon;
|
|
189 }
|
|
190
|
|
191 =head2 peptide
|
|
192
|
|
193 Description: Return the amino acid sequence that this allele is predicted to result in
|
|
194 Returntype : string or undef if this allele is not in the CDS or is a frameshift
|
|
195 Exceptions : none
|
|
196 Status : At Risk
|
|
197
|
|
198 =cut
|
|
199
|
|
200 sub peptide {
|
|
201 my ($self, $peptide) = @_;
|
|
202
|
|
203 $self->{peptide} = $peptide if $peptide;
|
|
204
|
|
205 unless ($self->{peptide}) {
|
|
206
|
|
207 return undef unless $self->seq_is_unambiguous_dna;
|
|
208
|
|
209 if (my $codon = $self->codon) {
|
|
210
|
|
211 # the codon method can set the peptide in some circumstances
|
|
212 # so check here before we try an (expensive) translation
|
|
213 return $self->{peptide} if $self->{peptide};
|
|
214
|
|
215 # translate the codon sequence to establish the peptide allele
|
|
216
|
|
217 # allow for partial codons - split the sequence into whole and partial
|
|
218 # e.g. AAAGG split into AAA and GG
|
|
219 my $whole_codon = substr($codon, 0, int(length($codon) / 3) * 3);
|
|
220 my $partial_codon = substr($codon, int(length($codon) / 3) * 3);
|
|
221
|
|
222 my $pep = '';
|
|
223
|
|
224 if($whole_codon) {
|
|
225 # for mithocondrial dna we need to to use a different codon table
|
|
226 my $codon_table = $self->transcript_variation->_codon_table;
|
|
227
|
|
228 my $codon_seq = Bio::Seq->new(
|
|
229 -seq => $whole_codon,
|
|
230 -moltype => 'dna',
|
|
231 -alphabet => 'dna',
|
|
232 );
|
|
233
|
|
234 $pep .= $codon_seq->translate(undef, undef, undef, $codon_table)->seq;
|
|
235 }
|
|
236
|
|
237 if($partial_codon) {
|
|
238 $pep .= 'X';
|
|
239 }
|
|
240
|
|
241 $pep ||= '-';
|
|
242
|
|
243 $self->{peptide} = $pep;
|
|
244 }
|
|
245 }
|
|
246
|
|
247 return $self->{peptide};
|
|
248 }
|
|
249
|
|
250 =head2 codon
|
|
251
|
|
252 Description: Return the codon sequence that this allele is predicted to result in
|
|
253 Returntype : string or undef if this allele is not in the CDS or is a frameshift
|
|
254 Exceptions : none
|
|
255 Status : At Risk
|
|
256
|
|
257 =cut
|
|
258
|
|
259 sub codon {
|
|
260 my ($self, $codon) = @_;
|
|
261
|
|
262 $self->{codon} = $codon if defined $codon;
|
|
263
|
|
264 my $tv = $self->transcript_variation;
|
|
265
|
|
266 return undef unless $tv->translation_start && $tv->translation_end;
|
|
267
|
|
268 return undef unless $self->seq_is_dna;
|
|
269
|
|
270 unless ($self->{codon}) {
|
|
271
|
|
272 # try to calculate the codon sequence
|
|
273
|
|
274 my $seq = $self->feature_seq;
|
|
275
|
|
276 $seq = '' if $seq eq '-';
|
|
277
|
|
278 # calculate necessary coords and lengths
|
|
279
|
|
280 my $codon_cds_start = $tv->translation_start * 3 - 2;
|
|
281 my $codon_cds_end = $tv->translation_end * 3;
|
|
282 my $codon_len = $codon_cds_end - $codon_cds_start + 1;
|
|
283 my $vf_nt_len = $tv->cds_end - $tv->cds_start + 1;
|
|
284 my $allele_len = $self->seq_length;
|
|
285
|
|
286 if ($allele_len != $vf_nt_len) {
|
|
287 if (abs($allele_len - $vf_nt_len) % 3) {
|
|
288 # this is a frameshift variation, we don't attempt to
|
|
289 # calculate the resulting codon or peptide change as this
|
|
290 # could get quite complicated
|
|
291 return undef;
|
|
292 }
|
|
293 }
|
|
294
|
|
295 # splice the allele sequence into the CDS
|
|
296
|
|
297 my $cds = $tv->_translateable_seq;
|
|
298
|
|
299 substr($cds, $tv->cds_start-1, $vf_nt_len) = $seq;
|
|
300
|
|
301 # and extract the codon sequence
|
|
302
|
|
303 my $codon = substr($cds, $codon_cds_start-1, $codon_len + ($allele_len - $vf_nt_len));
|
|
304
|
|
305 if (length($codon) < 1) {
|
|
306 $self->{codon} = '-';
|
|
307 $self->{peptide} = '-';
|
|
308 }
|
|
309 else {
|
|
310 $self->{codon} = $codon;
|
|
311 }
|
|
312 }
|
|
313
|
|
314 return $self->{codon};
|
|
315 }
|
|
316
|
|
317 =head2 display_codon
|
|
318
|
|
319 Description: Return the codon sequence that this allele is predicted to result in
|
|
320 with the affected nucleotides identified in UPPER CASE and other
|
|
321 nucleotides in lower case
|
|
322 Returntype : string or undef if this allele is not in the CDS or is a frameshift
|
|
323 Exceptions : none
|
|
324 Status : At Risk
|
|
325
|
|
326 =cut
|
|
327
|
|
328 sub display_codon {
|
|
329 my $self = shift;
|
|
330
|
|
331 unless ($self->{_display_codon}) {
|
|
332
|
|
333 if ($self->codon && defined $self->transcript_variation->codon_position) {
|
|
334
|
|
335 my $display_codon = lc $self->codon;
|
|
336
|
|
337 # if this allele is an indel then just return all lowercase
|
|
338
|
|
339 if ($self->feature_seq ne '-') {
|
|
340
|
|
341 # codon_position is 1-based, while substr assumes the string starts at 0
|
|
342
|
|
343 my $pos = $self->transcript_variation->codon_position - 1;
|
|
344
|
|
345 my $len = length $self->feature_seq;
|
|
346
|
|
347 substr($display_codon, $pos, $len) = uc substr($display_codon, $pos, $len);
|
|
348 }
|
|
349
|
|
350 $self->{_display_codon} = $display_codon;
|
|
351 }
|
|
352 }
|
|
353
|
|
354 return $self->{_display_codon};
|
|
355 }
|
|
356
|
|
357 =head2 polyphen_prediction
|
|
358
|
|
359 Description: Return the qualitative PolyPhen-2 prediction for the effect of this allele.
|
|
360 (Note that we currently only have PolyPhen predictions for variants that
|
|
361 result in single amino acid substitutions in human)
|
|
362 Returntype : string (one of 'probably damaging', 'possibly damaging', 'benign', 'unknown')
|
|
363 if this is a non-synonymous mutation and a prediction is available, undef
|
|
364 otherwise
|
|
365 Exceptions : none
|
|
366 Status : At Risk
|
|
367
|
|
368 =cut
|
|
369
|
|
370 sub polyphen_prediction {
|
|
371 my ($self, $classifier, $polyphen_prediction) = @_;
|
|
372
|
|
373 $classifier ||= 'humvar';
|
|
374
|
|
375 my $analysis = "polyphen_${classifier}";
|
|
376
|
|
377 $self->{$analysis}->{prediction} = $polyphen_prediction if $polyphen_prediction;
|
|
378
|
|
379 unless ($self->{$analysis}->{prediction}) {
|
|
380 my ($prediction, $score) = $self->_protein_function_prediction($analysis);
|
|
381 $self->{$analysis}->{score} = $score;
|
|
382 $self->{$analysis}->{prediction} = $prediction;
|
|
383 }
|
|
384
|
|
385 return $self->{$analysis}->{prediction};
|
|
386 }
|
|
387
|
|
388 =head2 polyphen_score
|
|
389
|
|
390 Description: Return the PolyPhen-2 probability that this allele is deleterious (Note that we
|
|
391 currently only have PolyPhen predictions for variants that result in single
|
|
392 amino acid substitutions in human)
|
|
393 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is
|
|
394 available, undef otherwise
|
|
395 Exceptions : none
|
|
396 Status : At Risk
|
|
397
|
|
398 =cut
|
|
399
|
|
400 sub polyphen_score {
|
|
401 my ($self, $classifier, $polyphen_score) = @_;
|
|
402
|
|
403 $classifier ||= 'humvar';
|
|
404
|
|
405 my $analysis = "polyphen_${classifier}";
|
|
406
|
|
407 $self->{$analysis}->{score} = $polyphen_score if defined $polyphen_score;
|
|
408
|
|
409 unless ($self->{$analysis}->{score}) {
|
|
410 my ($prediction, $score) = $self->_protein_function_prediction($analysis);
|
|
411 $self->{$analysis}->{score} = $score;
|
|
412 $self->{$analysis}->{prediction} = $prediction;
|
|
413 }
|
|
414
|
|
415 return $self->{$analysis}->{score};
|
|
416 }
|
|
417
|
|
418 =head2 sift_prediction
|
|
419
|
|
420 Description: Return the qualitative SIFT prediction for the effect of this allele.
|
|
421 (Note that we currently only have SIFT predictions for variants that
|
|
422 result in single amino acid substitutions in human)
|
|
423 Returntype : string (one of 'tolerated', 'deleterious') if this is a non-synonymous
|
|
424 mutation and a prediction is available, undef otherwise
|
|
425 Exceptions : none
|
|
426 Status : At Risk
|
|
427
|
|
428 =cut
|
|
429
|
|
430 sub sift_prediction {
|
|
431 my ($self, $sift_prediction) = @_;
|
|
432
|
|
433 $self->{sift_prediction} = $sift_prediction if $sift_prediction;
|
|
434
|
|
435 unless ($self->{sift_prediction}) {
|
|
436 my ($prediction, $score) = $self->_protein_function_prediction('sift');
|
|
437 $self->{sift_score} = $score;
|
|
438 $self->{sift_prediction} = $prediction unless $self->{sift_prediction};
|
|
439 }
|
|
440
|
|
441 return $self->{sift_prediction};
|
|
442 }
|
|
443
|
|
444 =head2 sift_score
|
|
445
|
|
446 Description: Return the SIFT score for this allele (Note that we currently only have SIFT
|
|
447 predictions for variants that result in single amino acid substitutions in human)
|
|
448 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is
|
|
449 available, undef otherwise
|
|
450 Exceptions : none
|
|
451 Status : At Risk
|
|
452
|
|
453 =cut
|
|
454
|
|
455 sub sift_score {
|
|
456 my ($self, $sift_score) = @_;
|
|
457
|
|
458 $self->{sift_score} = $sift_score if defined $sift_score;
|
|
459
|
|
460 unless ($self->{sift_score}) {
|
|
461 my ($prediction, $score) = $self->_protein_function_prediction('sift');
|
|
462 $self->{sift_score} = $score;
|
|
463 $self->{sift_prediction} = $prediction;
|
|
464 }
|
|
465
|
|
466 return $self->{sift_score};
|
|
467 }
|
|
468
|
|
469
|
|
470 sub _protein_function_prediction {
|
|
471 my ($self, $analysis) = @_;
|
|
472
|
|
473 # we can only get results for variants that cause a single amino acid substitution,
|
|
474 # so check the peptide allele string first
|
|
475
|
|
476 if ($self->pep_allele_string && $self->pep_allele_string =~ /^[A-Z]\/[A-Z]$/ && defined $AA_LOOKUP->{$self->peptide}) {
|
|
477
|
|
478 if (my $matrix = $self->transcript_variation->_protein_function_predictions($analysis)) {
|
|
479
|
|
480 my ($prediction, $score) = $matrix->get_prediction(
|
|
481 $self->transcript_variation->translation_start,
|
|
482 $self->peptide,
|
|
483 );
|
|
484
|
|
485 return wantarray ? ($prediction, $score) : $prediction;
|
|
486 }
|
|
487 }
|
|
488
|
|
489 return undef;
|
|
490 }
|
|
491
|
|
492 =head2 hgvs_genomic
|
|
493
|
|
494 Description: Return a string representing the genomic-level effect of this allele in HGVS format
|
|
495 Returntype : string
|
|
496 Exceptions : none
|
|
497 Status : At Risk
|
|
498
|
|
499 =cut
|
|
500
|
|
501 sub hgvs_genomic {
|
|
502 return _hgvs_generic(@_,'genomic');
|
|
503 }
|
|
504
|
|
505 =head2 hgvs_coding
|
|
506
|
|
507 Description: Return a string representing the CDS-level effect of this allele in HGVS format
|
|
508 Returntype : string or undef if this allele is not in the CDS
|
|
509 Exceptions : none
|
|
510 Status : At Risk
|
|
511
|
|
512 =cut
|
|
513
|
|
514 sub hgvs_coding {
|
|
515
|
|
516 deprecate('HGVS coding support has been moved to hgvs_transcript. This method will be removed in the next release.');
|
|
517 return hgvs_transcript(@_);
|
|
518 }
|
|
519
|
|
520
|
|
521 =head2 hgvs_transcript
|
|
522
|
|
523 Description: Return a string representing the CDS-level effect of this allele in HGVS format
|
|
524 Returntype : string or undef if this allele is not in the CDS
|
|
525 Exceptions : none
|
|
526 Status : At Risk
|
|
527
|
|
528 =cut
|
|
529
|
|
530
|
|
531 sub hgvs_transcript {
|
|
532
|
|
533
|
|
534 my $self = shift;
|
|
535 my $notation = shift;
|
|
536
|
|
537 ##### set if string supplied
|
|
538 $self->{hgvs_transcript} = $notation if defined $notation;
|
|
539 ##### return if held
|
|
540 return $self->{hgvs_transcript} if defined $self->{hgvs_transcript} ;
|
|
541 return $self->{hgvs_coding} if defined $self->{hgvs_coding} ;
|
|
542
|
|
543 my $variation_feature_sequence = $self->variation_feature_seq() ;
|
|
544
|
|
545 ### don't try to handle odd characters
|
|
546 return undef if $variation_feature_sequence =~ m/[^ACGT\-]/ig;
|
|
547
|
|
548 ### no result for reference allele
|
|
549 return undef if $self->is_reference ==1;
|
|
550
|
|
551 ### else evaluate
|
|
552
|
|
553 ### get reference sequence strand
|
|
554 my $refseq_strand = $self->transcript_variation->transcript->strand();
|
|
555
|
|
556 if($DEBUG ==1){
|
|
557 my $var_name = $self->transcript_variation->variation_feature->variation_name();
|
|
558 print "\nHGVS transcript: Checking $var_name refseq strand => $refseq_strand seq name : " . $self->transcript_variation->transcript_stable_id() . " var strand " . $self->transcript_variation->variation_feature->strand() . " vf st " . $self->variation_feature->strand() ." seqname: " . $self->variation_feature->seqname() ." seq: " . $self->variation_feature_seq ."\n";
|
|
559 }
|
|
560
|
|
561 my $hgvs_notation ; ### store components of HGVS string in hash
|
|
562
|
|
563 ### vf strand is relative to transcript or transcript slice
|
|
564 if( $self->transcript_variation->variation_feature->strand() <0 && $refseq_strand >0 ||
|
|
565 $self->transcript_variation->variation_feature->strand() >0 && $refseq_strand < 0
|
|
566 ){
|
|
567 reverse_comp(\$variation_feature_sequence);
|
|
568 }
|
|
569
|
|
570 ### need to get ref seq from slice transcript is on for intron labelling
|
|
571 my ($slice_start, $slice_end, $slice) = $self->_var2transcript_slice_coords();
|
|
572
|
|
573
|
|
574 unless($slice_start){
|
|
575 #warn "TVA: VF not within transcript - no HGVS\n";
|
|
576 return undef;
|
|
577 }
|
|
578
|
|
579 ### decide event type from HGVS nomenclature
|
|
580 $hgvs_notation = hgvs_variant_notation( $variation_feature_sequence, ### alt_allele,
|
|
581 $slice->seq(), ### using this to extract ref allele
|
|
582 $slice_start,
|
|
583 $slice_end
|
|
584 );
|
|
585
|
|
586 ### This should not happen
|
|
587 unless($hgvs_notation->{'type'}){
|
|
588 #warn "Error - not continuing; no HGVS annotation\n";
|
|
589 return undef;
|
|
590 }
|
|
591
|
|
592 ### create reference name - transcript name & seq version
|
|
593 $hgvs_notation->{'ref_name'} = $self->transcript_variation->transcript_stable_id() . "." . $self->transcript_variation->transcript->version();
|
|
594
|
|
595
|
|
596 ### get position relative to transcript features [use HGVS coords not variation feature coords due to dups]
|
|
597 $hgvs_notation->{start} = $self->_get_cDNA_position( $hgvs_notation->{start} );
|
|
598 $hgvs_notation->{end} = $self->_get_cDNA_position( $hgvs_notation->{end} );
|
|
599
|
|
600
|
|
601 # Make sure that start is always less than end
|
|
602 my ($exon_start_coord, $intron_start_offset) = $hgvs_notation->{start} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/;
|
|
603 my ($exon_end_coord, $intron_end_offset) = $hgvs_notation->{end} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/;
|
|
604 $intron_start_offset ||= 0;
|
|
605 $intron_end_offset ||= 0;
|
|
606
|
|
607 ($hgvs_notation->{start},$hgvs_notation->{end}) = ($hgvs_notation->{end},$hgvs_notation->{start}) if
|
|
608 (($exon_start_coord + $intron_start_offset) > ($exon_end_coord + $intron_end_offset));
|
|
609
|
|
610
|
|
611
|
|
612 if($self->transcript->cdna_coding_start()){
|
|
613 $hgvs_notation->{'numbering'} = "c"; ### set 'c' if transcript is coding
|
|
614 }
|
|
615 else{
|
|
616 $hgvs_notation->{'numbering'} = "n"; ### set 'n' if transcript non-coding
|
|
617 }
|
|
618 ### generic formatting
|
|
619 $self->{hgvs_transcript} = format_hgvs_string( $hgvs_notation);
|
|
620
|
|
621 if($DEBUG ==1){print "HGVS notation for var " . $self->transcript_variation->variation_feature->variation_name() . " from hgvs transcript : " . $self->{hgvs_transcript} . " \n";}
|
|
622
|
|
623 return $self->{hgvs_transcript};
|
|
624
|
|
625 }
|
|
626
|
|
627
|
|
628 =head2 hgvs_protein
|
|
629
|
|
630 Description: Return a string representing the protein-level effect of this allele in HGVS format
|
|
631 Returntype : string or undef if this allele is not in the CDS
|
|
632 Exceptions : none
|
|
633 Status : At Risk
|
|
634
|
|
635 =cut
|
|
636
|
|
637 sub hgvs_protein {
|
|
638
|
|
639 my $self = shift;
|
|
640 my $notation = shift;
|
|
641 my $hgvs_notation;
|
|
642
|
|
643 if($DEBUG ==1){print "\nStarting hgvs_protein with ". $self->transcript_variation->variation_feature->variation_name() . "\n"; }
|
|
644
|
|
645 ### set if string supplied
|
|
646 $self->{hgvs_protein} = $notation if defined $notation;
|
|
647
|
|
648 ### return if set
|
|
649 return $self->{hgvs_protein} if defined $self->{hgvs_protein} ;
|
|
650
|
|
651 ### don't try to handle odd characters
|
|
652 return undef if $self->variation_feature_seq() =~ m/[^ACGT\-]/ig;
|
|
653
|
|
654 #### else, check viable input and create notation
|
|
655 return if $self->is_reference();
|
|
656
|
|
657 unless ($self->transcript_variation->translation_start() && $self->transcript_variation->translation_end()){
|
|
658 return undef ;
|
|
659 }
|
|
660
|
|
661 #### for debug
|
|
662 #my $var_name = $self->transcript_variation->variation_feature->variation_name();
|
|
663
|
|
664 ### get reference sequence [add seq version to transcript name]
|
|
665 $hgvs_notation->{ref_name} = $self->transcript_variation->transcript->translation()->display_id() . "." . $self->transcript_variation->transcript->translation()->version();
|
|
666
|
|
667 $hgvs_notation->{'numbering'} = 'p';
|
|
668
|
|
669 ### get default reference location [changed later in some cases eg. duplication]
|
|
670 $hgvs_notation->{start} = $self->transcript_variation->translation_start();
|
|
671 $hgvs_notation->{end} = $self->transcript_variation->translation_end();
|
|
672
|
|
673
|
|
674 ## get default reference & alt peptides [changed later to hgvs format]
|
|
675 $hgvs_notation->{alt} = $self->peptide;
|
|
676 $hgvs_notation->{ref} = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide;
|
|
677
|
|
678 if(defined $hgvs_notation->{alt} && defined $hgvs_notation->{ref}){
|
|
679 $hgvs_notation = _clip_alleles( $hgvs_notation);
|
|
680 }
|
|
681
|
|
682 #### define type - types are different for protein numbering
|
|
683 $hgvs_notation = $self->_get_hgvs_protein_type($hgvs_notation);
|
|
684
|
|
685 ##### Convert ref & alt peptides taking into account HGVS rules
|
|
686 $hgvs_notation = $self->_get_hgvs_peptides($hgvs_notation);
|
|
687
|
|
688 ##### String formatting
|
|
689 $self->{hgvs_protein} = $self->_get_hgvs_protein_format($hgvs_notation);
|
|
690 return $self->{hgvs_protein} ;
|
|
691 }
|
|
692
|
|
693 ### HGVS: format protein string
|
|
694 sub _get_hgvs_protein_format{
|
|
695
|
|
696 my $self = shift;
|
|
697 my $hgvs_notation = shift;
|
|
698
|
|
699 if ((defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} &&
|
|
700 $hgvs_notation->{ref} eq $hgvs_notation->{alt}) ||
|
|
701 (defined $hgvs_notation->{type} && $hgvs_notation->{type} eq "=")){
|
|
702
|
|
703 ### no protein change - return transcript nomenclature with flag for neutral protein consequence
|
|
704 $hgvs_notation->{'hgvs'} = $self->hgvs_transcript() . "(p.=)";
|
|
705 return $hgvs_notation->{'hgvs'} ;
|
|
706 }
|
|
707
|
|
708 ### all start with refseq name & numbering type
|
|
709 $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . ".";
|
|
710
|
|
711 ### handle stop_lost seperately regardless of cause by del/delins => p.XposAA1extnum_AA_to_stop
|
|
712 if(stop_lost($self)){ ### if deletion of stop add extX and number of new aa to alt
|
|
713 $hgvs_notation->{alt} = substr($hgvs_notation->{alt}, 0, 3);
|
|
714 if($hgvs_notation->{type} eq "del"){
|
|
715 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "del");
|
|
716 if(defined $aa_til_stop){
|
|
717 $hgvs_notation->{alt} .= "extX" . $aa_til_stop;
|
|
718 }
|
|
719 }
|
|
720 elsif($hgvs_notation->{type} eq ">"){
|
|
721 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "subs");
|
|
722 if(defined $aa_til_stop){
|
|
723 $hgvs_notation->{alt} .= "extX" . $aa_til_stop;
|
|
724 }
|
|
725 }
|
|
726 else{
|
|
727 # warn "TVA: stop loss for type $hgvs_notation->{type} not caught \n";
|
|
728 }
|
|
729 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ;
|
|
730 }
|
|
731
|
|
732 elsif( $hgvs_notation->{type} eq "dup"){
|
|
733
|
|
734 if($hgvs_notation->{start} < $hgvs_notation->{end}){
|
|
735 ### list only first and last peptides in long duplicated string
|
|
736 my $ref_pep_first = substr($hgvs_notation->{alt}, 0, 3);
|
|
737 my $ref_pep_last = substr($hgvs_notation->{alt}, -3, 3);
|
|
738 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} ."dup";
|
|
739
|
|
740 }
|
|
741 else{
|
|
742 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{alt} . $hgvs_notation->{start} . "dup" ;
|
|
743 }
|
|
744
|
|
745 print "formating a dup $hgvs_notation->{hgvs} \n" if $DEBUG==1;
|
|
746 }
|
|
747
|
|
748 elsif($hgvs_notation->{type} eq ">"){
|
|
749 #### substitution
|
|
750
|
|
751 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt};
|
|
752 }
|
|
753
|
|
754 elsif( $hgvs_notation->{type} eq "delins" || $hgvs_notation->{type} eq "ins" ){
|
|
755
|
|
756 #### list first and last aa in reference only
|
|
757 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3);
|
|
758 my $ref_pep_last;
|
|
759 if(substr($hgvs_notation->{ref}, -1, 1) eq "X"){
|
|
760 $ref_pep_last ="X";
|
|
761 }
|
|
762 else{
|
|
763 $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3);
|
|
764 }
|
|
765 if($hgvs_notation->{ref} =~ /X$/){
|
|
766 ### For stops & add extX & distance to next stop to alt pep
|
|
767 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "loss");
|
|
768 if(defined $aa_til_stop){
|
|
769 $hgvs_notation->{alt} .="extX" . $aa_til_stop;
|
|
770 }
|
|
771 }
|
|
772 if($hgvs_notation->{start} == $hgvs_notation->{end} && $hgvs_notation->{type} eq "delins"){
|
|
773 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ;
|
|
774 }
|
|
775 else{
|
|
776 ### correct ordering if needed
|
|
777 if($hgvs_notation->{start} > $hgvs_notation->{end}){
|
|
778 ( $hgvs_notation->{start}, $hgvs_notation->{end}) = ($hgvs_notation->{end}, $hgvs_notation->{start} );
|
|
779 }
|
|
780
|
|
781 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ;
|
|
782 }
|
|
783 }
|
|
784
|
|
785 elsif($hgvs_notation->{type} eq "fs"){
|
|
786 if(defined $hgvs_notation->{alt} && $hgvs_notation->{alt} eq "X"){ ## stop gained
|
|
787 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ;
|
|
788
|
|
789 }
|
|
790 else{ ## not immediate stop - count aa until next
|
|
791
|
|
792 my $aa_til_stop = _stop_loss_extra_AA($self, $hgvs_notation->{start}-1, "fs");
|
|
793 if($aa_til_stop ){
|
|
794 ### use long form if new stop found
|
|
795 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} . "fsX$aa_til_stop" ;
|
|
796 }
|
|
797 else{
|
|
798 ### otherwise use short form
|
|
799 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "fs";
|
|
800 }
|
|
801 }
|
|
802 }
|
|
803
|
|
804 elsif( $hgvs_notation->{type} eq "del"){
|
|
805 $hgvs_notation->{alt} = "del";
|
|
806 if( length($hgvs_notation->{ref}) >3 ){
|
|
807 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3);
|
|
808 my $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3);
|
|
809 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} .$hgvs_notation->{alt} ;
|
|
810 }
|
|
811 else{
|
|
812 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ;
|
|
813 }
|
|
814 }
|
|
815
|
|
816 elsif($hgvs_notation->{start} ne $hgvs_notation->{end} ){
|
|
817 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "_" . $hgvs_notation->{alt} . $hgvs_notation->{end} ;
|
|
818 }
|
|
819
|
|
820 else{
|
|
821 #### default to substitution
|
|
822 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt};
|
|
823 }
|
|
824
|
|
825 if($DEBUG==1){ print "Returning protein format: $hgvs_notation->{'hgvs'}\n";}
|
|
826 return $hgvs_notation->{'hgvs'};
|
|
827 }
|
|
828
|
|
829 ### HGVS: get type of variation event in protein terms
|
|
830 sub _get_hgvs_protein_type{
|
|
831
|
|
832 my $self = shift;
|
|
833 my $hgvs_notation = shift;
|
|
834
|
|
835 ### get allele length
|
|
836 my ($ref_length, $alt_length ) = $self->_get_allele_length();
|
|
837
|
|
838
|
|
839 if( defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} ){
|
|
840 ### Run type checks on peptides if available
|
|
841
|
|
842 if($hgvs_notation->{alt} eq $hgvs_notation->{ref} ){
|
|
843 #### synonymous indicated by =
|
|
844 $hgvs_notation->{type} = "=";
|
|
845 }
|
|
846 elsif( length($hgvs_notation->{ref}) ==1 && length($hgvs_notation->{alt}) ==1 ) {
|
|
847
|
|
848 $hgvs_notation->{type} = ">";
|
|
849 }
|
|
850 elsif($hgvs_notation->{ref} eq "-" || $hgvs_notation->{ref} eq "") {
|
|
851
|
|
852 $hgvs_notation->{type} = "ins";
|
|
853 }
|
|
854 elsif($hgvs_notation->{alt} eq "" ) {
|
|
855
|
|
856 $hgvs_notation->{type} = "del";
|
|
857 }
|
|
858 elsif( (length($hgvs_notation->{alt}) > length($hgvs_notation->{ref}) ) &&
|
|
859 $hgvs_notation->{alt} =~ / $hgvs_notation->{ref}/ ) {
|
|
860 ### capture duplication event described as TTT/TTTTT
|
|
861 $hgvs_notation->{type} = "dup";
|
|
862 }
|
|
863
|
|
864 elsif( (length($hgvs_notation->{alt}) >1 && length($hgvs_notation->{ref}) ==1) ||
|
|
865 (length($hgvs_notation->{alt}) ==1 && length($hgvs_notation->{ref}) >1) ) {
|
|
866 $hgvs_notation->{type} = "delins";
|
|
867 }
|
|
868 else{
|
|
869 $hgvs_notation->{type} = ">";
|
|
870 }
|
|
871 }
|
|
872
|
|
873
|
|
874 elsif($ref_length ne $alt_length && ($ref_length - $alt_length)%3 !=0 ){
|
|
875 $hgvs_notation->{type} = "fs";
|
|
876 }
|
|
877
|
|
878 elsif(length($self->variation_feature_seq()) >1 ){
|
|
879 if($hgvs_notation->{start} == ($hgvs_notation->{end} + 1) ){
|
|
880 ### convention for insertions - end one less than start
|
|
881 $hgvs_notation->{type} = "ins";
|
|
882 }
|
|
883 elsif( $hgvs_notation->{start} != $hgvs_notation->{end} ){
|
|
884 $hgvs_notation->{type} = "delins";
|
|
885 }
|
|
886 else{
|
|
887 $hgvs_notation->{type} = ">";
|
|
888 }
|
|
889 }
|
|
890 else{
|
|
891 #print STDERR "DEBUG ".$self->variation_feature->start."\n";
|
|
892 #warn "Cannot define protein variant type [$ref_length - $alt_length]\n";
|
|
893 }
|
|
894 return $hgvs_notation ;
|
|
895
|
|
896 }
|
|
897
|
|
898 ### HGVS: get reference & alternative peptide
|
|
899 sub _get_hgvs_peptides{
|
|
900
|
|
901 my $self = shift;
|
|
902 my $hgvs_notation = shift;
|
|
903
|
|
904 if($hgvs_notation->{type} eq "fs"){
|
|
905 ### ensembl alt/ref peptides not the same as HGVS alt/ref - look up seperately
|
|
906 $hgvs_notation = $self->_get_fs_peptides($hgvs_notation);
|
|
907 }
|
|
908 elsif($hgvs_notation->{type} eq "ins" ){
|
|
909
|
|
910 ### HGVS ref are peptides flanking insertion
|
|
911 $hgvs_notation->{ref} = $self->_get_surrounding_peptides($hgvs_notation->{start});
|
|
912
|
|
913 if( $hgvs_notation->{alt} =~/\*/){
|
|
914 ## inserted bases after stop irrelevant; consider as substitution gaining stop MAINTAIN PREVIOUS BEHAVIOUR FOR NOW
|
|
915 #$hgvs_notation->{alt} =~ s/\*\w+$/\*/;
|
|
916 }
|
|
917 else{
|
|
918 ### Additional check that inserted bases do not duplicate 3' reference sequence [set to type = dup if so]
|
|
919 $hgvs_notation = $self->_check_for_peptide_duplication($hgvs_notation);
|
|
920 }
|
|
921 }
|
|
922
|
|
923
|
|
924 ### Convert peptide to 3 letter code as used in HGVS
|
|
925 $hgvs_notation->{ref} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{ref}, -id => 'ref', -alphabet => 'protein')) || "";
|
|
926 if( $hgvs_notation->{alt} eq "-"){
|
|
927 $hgvs_notation->{alt} = "del";
|
|
928 }
|
|
929 else{
|
|
930 $hgvs_notation->{alt} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref', -alphabet => 'protein')) || "";
|
|
931 }
|
|
932
|
|
933 ### handle special cases
|
|
934 if( affects_start_codon($self) ){
|
|
935 #### handle initiator loss -> probably no translation => alt allele is '?'
|
|
936 $hgvs_notation->{alt} = "?";
|
|
937 $hgvs_notation->{type} = "";
|
|
938 }
|
|
939
|
|
940 elsif( $hgvs_notation->{type} eq "del"){
|
|
941 #### partial last codon
|
|
942 $hgvs_notation->{alt} = "del";
|
|
943 }
|
|
944 elsif($hgvs_notation->{type} eq "fs"){
|
|
945 ### only quote first ref peptide for frameshift
|
|
946 $hgvs_notation->{ref} = substr($hgvs_notation->{ref},0,3);
|
|
947 }
|
|
948
|
|
949 ### set stop to HGVS prefered "X"
|
|
950 if(defined $hgvs_notation->{ref}){ $hgvs_notation->{ref} =~ s/Ter|Xaa/X/g; }
|
|
951 if(defined $hgvs_notation->{alt}){ $hgvs_notation->{alt} =~ s/Ter|Xaa/X/g; }
|
|
952
|
|
953 return ($hgvs_notation);
|
|
954
|
|
955 }
|
|
956
|
|
957 ### HGVS: remove common peptides from alt and ref strings & shift start/end accordingly
|
|
958 sub _clip_alleles{
|
|
959
|
|
960 my $hgvs_notation = shift;
|
|
961
|
|
962 my $check_alt = $hgvs_notation->{alt} ;
|
|
963 my $check_ref = $hgvs_notation->{ref} ;
|
|
964 my $check_start = $hgvs_notation->{start};
|
|
965
|
|
966 ### strip same bases from start of string
|
|
967 for (my $p =0; $p <length ($hgvs_notation->{ref}); $p++){
|
|
968 my $check_next_ref = substr( $check_ref, 0, 1);
|
|
969 my $check_next_alt = substr( $check_alt, 0, 1);
|
|
970
|
|
971 if($check_next_ref eq "*" && $check_next_alt eq "*"){
|
|
972 ### stop re-created by variant - no protein change
|
|
973 $hgvs_notation->{type} = "=";
|
|
974
|
|
975 return($hgvs_notation);
|
|
976 }
|
|
977
|
|
978 if($check_next_ref eq $check_next_alt){
|
|
979 $check_start++;
|
|
980 $check_ref = substr( $check_ref, 1);
|
|
981 $check_alt = substr( $check_alt, 1);
|
|
982 }
|
|
983 else{
|
|
984 last;
|
|
985 }
|
|
986 }
|
|
987 #### strip same bases from end of string
|
|
988 for (my $q =0; $q < length ($check_ref); $q++){
|
|
989 my $check_next_ref = substr( $check_ref, -1, 1);
|
|
990 my $check_next_alt = substr( $check_alt, -1, 1);
|
|
991 if($check_next_ref eq $check_next_alt){
|
|
992 chop $check_ref;
|
|
993 chop $check_alt;
|
|
994 }
|
|
995 else{
|
|
996 last;
|
|
997 }
|
|
998 }
|
|
999
|
|
1000 $hgvs_notation->{alt} = $check_alt;
|
|
1001 $hgvs_notation->{ref} = $check_ref;
|
|
1002
|
|
1003 ### check if clipping force type change & adjust location
|
|
1004 if(length ($check_ref) == 0 && length ($check_alt) >= 1){
|
|
1005 ### re-set as ins not delins
|
|
1006 $hgvs_notation->{type} ="ins";
|
|
1007 ### insertion between ref base and next => adjust next
|
|
1008 if($hgvs_notation->{end} == $hgvs_notation->{start} ){$hgvs_notation->{end} = $check_start; }
|
|
1009 # $hgvs_notation->{start} = $check_start;
|
|
1010 }
|
|
1011 elsif(length ($check_ref) >=1 && length ($check_alt) == 0){
|
|
1012 ### re-set as del not delins
|
|
1013 $hgvs_notation->{type} ="del";
|
|
1014 $hgvs_notation->{start} = $check_start;
|
|
1015 }
|
|
1016 else{
|
|
1017 #### save trimmed peptide string & increased start position
|
|
1018 $hgvs_notation->{start} = $check_start;
|
|
1019
|
|
1020 }
|
|
1021
|
|
1022 return $hgvs_notation;
|
|
1023 }
|
|
1024
|
|
1025
|
|
1026
|
|
1027
|
|
1028
|
|
1029 #### HGVS: check allele lengths to look for frameshifts
|
|
1030 sub _get_allele_length{
|
|
1031
|
|
1032 my $self = shift;
|
|
1033 my $ref_length = 0;
|
|
1034 my $alt_length = 0;
|
|
1035
|
|
1036 my $al_string = $self->allele_string();
|
|
1037 my $ref_allele = (split/\//, $al_string)[0];
|
|
1038 $ref_allele =~ s/\-//;
|
|
1039 $ref_length = length $ref_allele;
|
|
1040
|
|
1041 my $alt_allele = $self->variation_feature_seq();
|
|
1042 $alt_allele =~ s/\-//;
|
|
1043 $alt_length = length $alt_allele;
|
|
1044
|
|
1045 return ($ref_length, $alt_length );
|
|
1046
|
|
1047 }
|
|
1048
|
|
1049 ### HGVS: list first different peptide [may not be first changed codon]
|
|
1050 sub _get_fs_peptides{
|
|
1051
|
|
1052 my $self = shift;
|
|
1053 my $hgvs_notation = shift;
|
|
1054
|
|
1055 ### get CDS with alt variant
|
|
1056 my $alt_cds = $self->_get_alternate_cds();
|
|
1057 return undef unless defined($alt_cds);
|
|
1058
|
|
1059 #### get new translation
|
|
1060 my $alt_trans = $alt_cds->translate()->seq();
|
|
1061
|
|
1062 ### get changed end (currently in single letter AA coding)
|
|
1063 my $ref_trans = $self->transcript->translate()->seq();
|
|
1064 $ref_trans .= "*"; ## appending ref stop for checking purposes
|
|
1065
|
|
1066 $hgvs_notation->{start} = $self->transcript_variation->translation_start() ;
|
|
1067
|
|
1068 if( $hgvs_notation->{start} > length $alt_trans){ ## deletion of stop, no further AA in alt seq
|
|
1069 $hgvs_notation->{alt} = "del";
|
|
1070 $hgvs_notation->{type} = "del";
|
|
1071 return $hgvs_notation;
|
|
1072 }
|
|
1073
|
|
1074 while ($hgvs_notation->{start} <= length $alt_trans){
|
|
1075 ### frame shift may result in the same AA#
|
|
1076
|
|
1077 $hgvs_notation->{ref} = substr($ref_trans, $hgvs_notation->{start}-1, 1);
|
|
1078 $hgvs_notation->{alt} = substr($alt_trans, $hgvs_notation->{start}-1, 1);
|
|
1079
|
|
1080 if($hgvs_notation->{ref} eq "*" && $hgvs_notation->{alt} eq "*"){
|
|
1081 ### variation at stop codon, but maintains stop codon
|
|
1082 return ($hgvs_notation);
|
|
1083 }
|
|
1084 last if $hgvs_notation->{ref} ne $hgvs_notation->{alt};
|
|
1085 $hgvs_notation->{start}++;
|
|
1086 }
|
|
1087 return ($hgvs_notation);
|
|
1088
|
|
1089 }
|
|
1090
|
|
1091 #### HGVS: if variant is an insertion, ref pep is initially "-", so seek AA before and after insertion
|
|
1092 sub _get_surrounding_peptides{
|
|
1093
|
|
1094 my $self = shift;
|
|
1095 my $ref_pos = shift;
|
|
1096 my $ref_trans = $self->transcript->translate()->seq();
|
|
1097
|
|
1098 my $end = substr($ref_trans, $ref_pos-1);
|
|
1099 my $ref_string = substr($ref_trans, $ref_pos-1, 2);
|
|
1100
|
|
1101 return ($ref_string);
|
|
1102
|
|
1103 }
|
|
1104
|
|
1105
|
|
1106 #### HGVS: alternate CDS needed to check for new stop when variant disrupts 'reference' stop
|
|
1107 sub _get_alternate_cds{
|
|
1108
|
|
1109 my $self = shift;
|
|
1110
|
|
1111 ### get reference sequence
|
|
1112 my $reference_cds_seq = $self->transcript->translateable_seq();
|
|
1113
|
|
1114 return undef unless defined($self->transcript_variation->cds_start) && defined($self->transcript_variation->cds_end());
|
|
1115
|
|
1116 ### get sequences upstream and downstream of variant
|
|
1117 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) );
|
|
1118 my $downstream_seq = substr($reference_cds_seq, ($self->transcript_variation->cds_end() ) );
|
|
1119
|
|
1120 ### fix alternate allele if deletion or on opposite strand
|
|
1121 my $alt_allele = $self->variation_feature_seq();
|
|
1122 $alt_allele =~ s/\-//;
|
|
1123 if( $self->transcript_variation->variation_feature->strand() <0 && $self->transcript_variation->transcript->strand() >0 ||
|
|
1124 $self->transcript_variation->variation_feature->strand() >0 && $self->transcript_variation->transcript->strand() < 0
|
|
1125 ){
|
|
1126 reverse_comp(\$alt_allele) ;
|
|
1127 }
|
|
1128 ### build alternate seq
|
|
1129 my $alternate_seq = $upstream_seq . $alt_allele . $downstream_seq ;
|
|
1130
|
|
1131 ### create seq obj with alternative allele in the CDS sequence
|
|
1132 my $alt_cds =Bio::PrimarySeq->new(-seq => $alternate_seq, -id => 'alt_cds', -alphabet => 'dna');
|
|
1133
|
|
1134 ### append UTR if available as stop may be disrupted
|
|
1135 my $utr = $self->transcript_variation->transcript->three_prime_utr();
|
|
1136
|
|
1137 if (defined $utr) {
|
|
1138 ### append the UTR to the alternative CDS
|
|
1139 $alt_cds->seq($alt_cds->seq() . $utr->seq());
|
|
1140 }
|
|
1141 else{
|
|
1142 ##warn "No UTR available for alternate CDS\n";
|
|
1143 }
|
|
1144
|
|
1145 return $alt_cds;
|
|
1146 }
|
|
1147
|
|
1148 ### HGVS: if inserted string is identical to 3' reference sequence, describe as duplication
|
|
1149 sub _check_for_peptide_duplication{
|
|
1150
|
|
1151 my $self = shift;
|
|
1152 my $hgvs_notation= shift;
|
|
1153
|
|
1154 ##### get reference sequence
|
|
1155 my $reference_cds_seq = $self->transcript->translateable_seq();
|
|
1156
|
|
1157 ##### get sequence upstream of variant
|
|
1158 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) );
|
|
1159
|
|
1160 ##### create translation to check against inserted peptides
|
|
1161 my $upstream_cds =Bio::PrimarySeq->new(-seq => $upstream_seq, -id => 'alt_cds', -alphabet => 'dna');
|
|
1162 my $upstream_trans = $upstream_cds->translate()->seq();
|
|
1163
|
|
1164 ## Test whether alt peptide matches the reference sequence just before the variant
|
|
1165 my $test_new_start = $hgvs_notation->{'start'} - length($hgvs_notation->{'alt'}) -1 ;
|
|
1166 my $test_seq = substr($upstream_trans, $test_new_start, length($hgvs_notation->{'alt'}));
|
|
1167
|
|
1168 if ( $test_new_start >= 0 && $test_seq eq $hgvs_notation->{alt}) {
|
|
1169
|
|
1170 $hgvs_notation->{type} = 'dup';
|
|
1171 $hgvs_notation->{end} = $hgvs_notation->{start} -1;
|
|
1172 $hgvs_notation->{start} -= length($hgvs_notation->{alt});
|
|
1173
|
|
1174 }
|
|
1175
|
|
1176 return $hgvs_notation;
|
|
1177
|
|
1178 }
|
|
1179
|
|
1180 #### HGVS: if a stop is lost, seek the next in transcript & count number of extra AA's
|
|
1181 sub _stop_loss_extra_AA{
|
|
1182
|
|
1183 my $self = shift;
|
|
1184 my $ref_var_pos = shift; ### first effected AA - supply for frameshifts
|
|
1185 my $test = shift;
|
|
1186
|
|
1187 return undef unless $ref_var_pos;
|
|
1188
|
|
1189 my $extra_aa;
|
|
1190
|
|
1191 ### get the sequence with variant added
|
|
1192 my $alt_cds = $self->_get_alternate_cds();
|
|
1193 return undef unless defined($alt_cds);
|
|
1194
|
|
1195 ### get new translation
|
|
1196 my $alt_trans = $alt_cds->translate();
|
|
1197
|
|
1198 my $ref_temp = $self->transcript->translate()->seq;
|
|
1199 my $ref_len = length($ref_temp);
|
|
1200
|
|
1201 if($DEBUG==1){
|
|
1202 print "alt translated:\n" . $alt_trans->seq() . "\n";
|
|
1203 print "ref translated:\n$ref_temp\n";;
|
|
1204 }
|
|
1205
|
|
1206 #### Find the number of residues that are translated until a termination codon is encountered
|
|
1207 if ($alt_trans->seq() =~ m/\*/) {
|
|
1208 if($DEBUG==1){print "Got $+[0] aa before stop, var event at $ref_var_pos \n";}
|
|
1209
|
|
1210 if($test eq "fs" ){
|
|
1211 ### frame shift - count from first AA effected by variant to stop
|
|
1212 $extra_aa = $+[0] - $ref_var_pos;
|
|
1213 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before fs stop [ $+[0] - peptide ref_start: $ref_var_pos )]\n";}
|
|
1214 }
|
|
1215
|
|
1216 else{
|
|
1217 $extra_aa = $+[0] - 1 - $ref_len;
|
|
1218 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before next stop [ $+[0] - 1 -normal stop $ref_len)]\n";}
|
|
1219 }
|
|
1220 }
|
|
1221
|
|
1222 # A special case is if the first aa is a stop codon => don't display the number of residues until the stop codon
|
|
1223 if(defined $extra_aa && $extra_aa >0){
|
|
1224 return $extra_aa ;
|
|
1225 }
|
|
1226 else{
|
|
1227 #warn "No stop found in alternate sequence\n";
|
|
1228 return undef;
|
|
1229 }
|
|
1230
|
|
1231 }
|
|
1232
|
|
1233 =head
|
|
1234 # We haven't implemented support for these methods yet
|
|
1235
|
|
1236 sub hgvs_rna {
|
|
1237 return _hgvs_generic(@_,'rna');
|
|
1238 }
|
|
1239
|
|
1240 sub hgvs_mitochondrial {
|
|
1241 return _hgvs_generic(@_,'mitochondrial');
|
|
1242 }
|
|
1243
|
|
1244 =cut
|
|
1245
|
|
1246 sub _hgvs_generic {
|
|
1247
|
|
1248 my $self = shift;
|
|
1249 my $reference = pop;
|
|
1250 my $notation = shift;
|
|
1251
|
|
1252 #ÊThe rna and mitochondrial modes have not yet been implemented, so return undef in case we get a call to these
|
|
1253 return undef if ($reference =~ m/rna|mitochondrial/);
|
|
1254
|
|
1255 my $sub = qq{hgvs_$reference};
|
|
1256
|
|
1257 $self->{$sub} = $notation if defined $notation;
|
|
1258
|
|
1259 unless ($self->{$sub}) {
|
|
1260
|
|
1261 # Use the transcript this VF is on as the reference feature
|
|
1262 my $reference_feature = $self->transcript;
|
|
1263 # If we want genomic coordinates, the reference_feature should actually be the slice for the underlying seq_region
|
|
1264 $reference_feature = $reference_feature->slice->seq_region_Slice if ($reference eq 'genomic');
|
|
1265
|
|
1266 # Calculate the HGVS notation on-the-fly and pass it to the TranscriptVariation in order to distribute the result to the other alleles
|
|
1267 $self->transcript_variation->$sub($self->variation_feature->get_all_hgvs_notations($reference_feature,substr($reference,0,1),undef,undef,$self->transcript_variation));
|
|
1268 }
|
|
1269
|
|
1270 return $self->{$sub};
|
|
1271 }
|
|
1272
|
|
1273
|
|
1274 ### HGVS: move variant to transcript slice
|
|
1275 sub _var2transcript_slice_coords{
|
|
1276
|
|
1277 my $self = shift;
|
|
1278
|
|
1279 my $ref_slice = $self->transcript->feature_Slice(); #### returns with strand same as feature
|
|
1280 my $tr_vf = $self->variation_feature->transfer($ref_slice);
|
|
1281
|
|
1282 # Return undef if this VariationFeature does not fall within the supplied feature.
|
|
1283 return undef if (!defined $tr_vf ||
|
|
1284 $tr_vf->start < 1 ||
|
|
1285 $tr_vf->end < 1 ||
|
|
1286 $tr_vf->start > ($self->transcript->end - $self->transcript->start + 1) ||
|
|
1287 $tr_vf->end > ($self->transcript->end - $self->transcript->start + 1));
|
|
1288
|
|
1289 return( $tr_vf->start() , $tr_vf->end(), $ref_slice);
|
|
1290 }
|
|
1291
|
|
1292
|
|
1293
|
|
1294 ### HGVS: get variant position in transcript
|
|
1295
|
|
1296 # intron:
|
|
1297 # If the position is in an intron, the boundary position of the closest exon and
|
|
1298 # a + or - offset into the intron is returned.
|
|
1299 # Ordered by genome forward not 5' -> 3'
|
|
1300
|
|
1301 # upstream:
|
|
1302 # If the position is 5' of the start codon, it is reported relative to the start codon
|
|
1303 # (-1 being the last nucleotide before the 'A' of ATG).
|
|
1304
|
|
1305 #downstream:
|
|
1306 # If the position is 3' pf the stop codon, it is reported with a '*' prefix and the offset
|
|
1307 # from the start codon (*1 being the first nucleotide after the last position of the stop codon)
|
|
1308
|
|
1309 sub _get_cDNA_position {
|
|
1310
|
|
1311 my $self = shift;
|
|
1312 my $position = shift; ### start or end of variation
|
|
1313
|
|
1314 my $transcript = $self->transcript();
|
|
1315 my $strand = $transcript->strand();
|
|
1316
|
|
1317 #### TranscriptVariation start/stop coord relative to transcript
|
|
1318 #### Switch to chromosome coordinates taking into account strand
|
|
1319 $position = ( $strand > 0 ?
|
|
1320 ( $self->transcript->start() + $position - 1 ) :
|
|
1321 ( $self->transcript->end() - $position + 1));
|
|
1322
|
|
1323
|
|
1324
|
|
1325 # Get all exons and sort them in positional order
|
|
1326 my @exons = sort {$a->start() <=> $b->start()} @{$transcript->get_all_Exons()};
|
|
1327 my $n_exons = scalar(@exons);
|
|
1328
|
|
1329 my $cdna_position;
|
|
1330 # Loop over the exons and get the coordinates of the variation in exon+intron notation
|
|
1331 for (my $i=0; $i<$n_exons; $i++) {
|
|
1332
|
|
1333 # Skip if the start point is beyond this exon
|
|
1334 next if ($position > $exons[$i]->end());
|
|
1335
|
|
1336
|
|
1337 # EXONIC: If the start coordinate is within this exon
|
|
1338 if ($position >= $exons[$i]->start()) {
|
|
1339 # Get the cDNA start coordinate of the exon and add the number of nucleotides from the exon boundary to the variation
|
|
1340 # If the transcript is in the opposite direction, count from the end instead
|
|
1341 $cdna_position = $exons[$i]->cdna_start($transcript) + ( $strand > 0 ?
|
|
1342 ( $position - $exons[$i]->start ) :
|
|
1343 ( $exons[$i]->end() - $position )
|
|
1344 );
|
|
1345 last; #### last exon checked
|
|
1346 }
|
|
1347 ## INTRONIC
|
|
1348 # Else the start coordinate is between this exon and the previous one, determine which one is closest and get coordinates relative to that one
|
|
1349 else {
|
|
1350
|
|
1351 my $updist = ($position - $exons[$i-1]->end());
|
|
1352 my $downdist = ($exons[$i]->start() - $position);
|
|
1353
|
|
1354 # If the distance to the upstream exon is the shortest, or equal and in the positive orientation, use that
|
|
1355 if ($updist < $downdist || ($updist == $downdist && $strand >= 0)) {
|
|
1356
|
|
1357 # If the orientation is reversed, we should use the cDNA start and a '-' offset
|
|
1358 $cdna_position = ($strand >= 0 ?
|
|
1359 $exons[$i-1]->cdna_end($transcript) . '+' :
|
|
1360 $exons[$i-1]->cdna_start($transcript) . '-')
|
|
1361 . $updist;
|
|
1362 }
|
|
1363 # Else if downstream is shortest...
|
|
1364 else {
|
|
1365 # If the orientation is reversed, we should use the cDNA end and a '+' offset
|
|
1366 $cdna_position = ($strand >= 0 ?
|
|
1367 $exons[$i]->cdna_start($transcript) . '-' :
|
|
1368 $exons[$i]->cdna_end($transcript) . '+')
|
|
1369 . $downdist;
|
|
1370 }
|
|
1371 last; ## last exon checked
|
|
1372 }
|
|
1373 }
|
|
1374
|
|
1375 # Shift the position to make it relative to the start & stop codons
|
|
1376 my $start_codon = $transcript->cdna_coding_start();
|
|
1377 my $stop_codon = $transcript->cdna_coding_end();
|
|
1378
|
|
1379 # Disassemble the cDNA coordinate into the exon and intron parts
|
|
1380 ### just built this now taking it appart again
|
|
1381 my ($cdna_coord, $intron_offset) = $cdna_position =~ m/([0-9]+)([\+\-][0-9]+)?/;
|
|
1382
|
|
1383
|
|
1384 # Start by correcting for the stop codon
|
|
1385 if (defined($stop_codon) && $cdna_coord > $stop_codon) {
|
|
1386 # Get the offset from the stop codon
|
|
1387 $cdna_coord -= $stop_codon;
|
|
1388 # Prepend a * to indicate the position is in the 3' UTR
|
|
1389 $cdna_coord = '*' . $cdna_coord;
|
|
1390 }
|
|
1391 elsif (defined($start_codon)) {
|
|
1392 # If the position is beyond the start codon, add 1 to get the correct offset
|
|
1393 $cdna_coord += ($cdna_coord >= $start_codon);
|
|
1394 # Subtract the position of the start codon
|
|
1395 $cdna_coord -= $start_codon;
|
|
1396 }
|
|
1397
|
|
1398 # Re-assemble the cDNA position [ return exon num & offset & direction for intron eg. 142+363]
|
|
1399 $cdna_position = $cdna_coord . (defined($intron_offset) ? $intron_offset : '');
|
|
1400 return $cdna_position;
|
|
1401 }
|
|
1402
|
|
1403
|
|
1404 1;
|
|
1405
|
|
1406
|