Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Variation::TranscriptVariationAllele | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 use Bio::EnsEMBL::Variation::TranscriptVariationAllele; | |
28 | |
29 my $tva = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new( | |
30 -transcript_variation => $tv, | |
31 -variation_feature_seq => 'A', | |
32 -is_reference => 0, | |
33 ); | |
34 | |
35 print "sequence with respect to the transcript: ", $tva->feature_seq, "\n"; | |
36 print "sequence with respect to the variation feature: ", $tva->variation_feature_seq, "\n"; | |
37 print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $tva->get_all_OverlapConsequences }), "\n"; | |
38 print "amino acid change: ", $tva->peptide_allele_string, "\n"; | |
39 print "resulting codon: ", $tva->codon, "\n"; | |
40 print "reference codon: ", $tva->transcript_variation->get_reference_TranscriptVariationAllele->codon, "\n"; | |
41 print "PolyPhen prediction: ", $tva->polyphen_prediction, "\n"; | |
42 print "SIFT prediction: ", $tva->sift_prediction, "\n"; | |
43 | |
44 =head1 DESCRIPTION | |
45 | |
46 A TranscriptVariationAllele object represents a single allele of a TranscriptVariation. | |
47 It provides methods that are specific to the sequence of the allele, such as codon, | |
48 peptide etc. Methods that depend only on position (e.g. CDS start) will be found in | |
49 the associated TranscriptVariation. Ordinarily you will not create these objects | |
50 yourself, but instead you would create a TranscriptVariation object which will then | |
51 construct TranscriptVariationAlleles based on the allele string of the associated | |
52 VariationFeature. | |
53 | |
54 Note that any methods that are not specific to Transcripts will be found in the | |
55 VariationFeatureOverlapAllele superclass. | |
56 | |
57 =cut | |
58 | |
59 package Bio::EnsEMBL::Variation::TranscriptVariationAllele; | |
60 | |
61 use strict; | |
62 use warnings; | |
63 | |
64 use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw($AA_LOOKUP); | |
65 use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); | |
66 use Bio::EnsEMBL::Variation::Utils::Sequence qw(hgvs_variant_notation format_hgvs_string); | |
67 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
68 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(within_cds within_intron stop_lost affects_start_codon); | |
69 | |
70 use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele); | |
71 | |
72 | |
73 our $DEBUG = 0; | |
74 | |
75 sub new_fast { | |
76 my ($class, $hashref) = @_; | |
77 | |
78 # swap a transcript_variation argument for a variation_feature_overlap one | |
79 | |
80 if ($hashref->{transcript_variation}) { | |
81 $hashref->{variation_feature_overlap} = delete $hashref->{transcript_variation}; | |
82 } | |
83 | |
84 # and call the superclass | |
85 | |
86 return $class->SUPER::new_fast($hashref); | |
87 } | |
88 | |
89 =head2 transcript_variation | |
90 | |
91 Description: Get/set the associated TranscriptVariation | |
92 Returntype : Bio::EnsEMBL::Variation::TranscriptVariation | |
93 Exceptions : throws if the argument is the wrong type | |
94 Status : At Risk | |
95 | |
96 =cut | |
97 | |
98 sub transcript_variation { | |
99 my ($self, $tv) = @_; | |
100 assert_ref($tv, 'Bio::EnsEMBL::Variation::TranscriptVariation') if $tv; | |
101 return $self->variation_feature_overlap($tv); | |
102 } | |
103 | |
104 =head2 variation_feature | |
105 | |
106 Description: Get the associated VariationFeature | |
107 Returntype : Bio::EnsEMBL::Variation::VariationFeature | |
108 Exceptions : none | |
109 Status : At Risk | |
110 | |
111 =cut | |
112 | |
113 sub variation_feature { | |
114 my $self = shift; | |
115 return $self->transcript_variation->variation_feature; | |
116 } | |
117 | |
118 =head2 pep_allele_string | |
119 | |
120 Description: Return a '/' delimited string of the reference peptide and the | |
121 peptide resulting from this allele, or a single peptide if this | |
122 allele does not change the peptide (e.g. because it is synonymous) | |
123 Returntype : string or undef if this allele is not in the CDS | |
124 Exceptions : none | |
125 Status : At Risk | |
126 | |
127 =cut | |
128 | |
129 sub pep_allele_string { | |
130 my ($self) = @_; | |
131 | |
132 my $pep = $self->peptide; | |
133 | |
134 return undef unless $pep; | |
135 | |
136 my $ref_pep = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide; | |
137 | |
138 return undef unless $ref_pep; | |
139 | |
140 return $ref_pep ne $pep ? $ref_pep.'/'.$pep : $pep; | |
141 } | |
142 | |
143 =head2 codon_allele_string | |
144 | |
145 Description: Return a '/' delimited string of the reference codon and the | |
146 codon resulting from this allele | |
147 Returntype : string or undef if this allele is not in the CDS | |
148 Exceptions : none | |
149 Status : At Risk | |
150 | |
151 =cut | |
152 | |
153 sub codon_allele_string { | |
154 my ($self) = @_; | |
155 | |
156 my $codon = $self->codon; | |
157 | |
158 return undef unless $codon; | |
159 | |
160 my $ref_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->codon; | |
161 | |
162 return $ref_codon.'/'.$codon; | |
163 } | |
164 | |
165 =head2 display_codon_allele_string | |
166 | |
167 Description: Return a '/' delimited string of the reference display_codon and the | |
168 display_codon resulting from this allele. The display_codon identifies | |
169 the nucleotides affected by this variant in UPPER CASE and other | |
170 nucleotides in lower case | |
171 Returntype : string or undef if this allele is not in the CDS | |
172 Exceptions : none | |
173 Status : At Risk | |
174 | |
175 =cut | |
176 | |
177 sub display_codon_allele_string { | |
178 my ($self) = @_; | |
179 | |
180 my $display_codon = $self->display_codon; | |
181 | |
182 return undef unless $display_codon; | |
183 | |
184 my $ref_display_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->display_codon; | |
185 | |
186 return $ref_display_codon.'/'.$display_codon; | |
187 } | |
188 | |
189 =head2 peptide | |
190 | |
191 Description: Return the amino acid sequence that this allele is predicted to result in | |
192 Returntype : string or undef if this allele is not in the CDS or is a frameshift | |
193 Exceptions : none | |
194 Status : At Risk | |
195 | |
196 =cut | |
197 | |
198 sub peptide { | |
199 my ($self, $peptide) = @_; | |
200 | |
201 $self->{peptide} = $peptide if $peptide; | |
202 | |
203 unless ($self->{peptide}) { | |
204 | |
205 return undef unless $self->seq_is_unambiguous_dna; | |
206 | |
207 if (my $codon = $self->codon) { | |
208 | |
209 # the codon method can set the peptide in some circumstances | |
210 # so check here before we try an (expensive) translation | |
211 return $self->{peptide} if $self->{peptide}; | |
212 | |
213 # translate the codon sequence to establish the peptide allele | |
214 | |
215 # allow for partial codons - split the sequence into whole and partial | |
216 # e.g. AAAGG split into AAA and GG | |
217 my $whole_codon = substr($codon, 0, int(length($codon) / 3) * 3); | |
218 my $partial_codon = substr($codon, int(length($codon) / 3) * 3); | |
219 | |
220 my $pep = ''; | |
221 | |
222 if($whole_codon) { | |
223 # for mithocondrial dna we need to to use a different codon table | |
224 my $codon_table = $self->transcript_variation->_codon_table; | |
225 | |
226 my $codon_seq = Bio::Seq->new( | |
227 -seq => $whole_codon, | |
228 -moltype => 'dna', | |
229 -alphabet => 'dna', | |
230 ); | |
231 | |
232 $pep .= $codon_seq->translate(undef, undef, undef, $codon_table)->seq; | |
233 } | |
234 | |
235 if($partial_codon) { | |
236 $pep .= 'X'; | |
237 } | |
238 | |
239 $pep ||= '-'; | |
240 | |
241 $self->{peptide} = $pep; | |
242 } | |
243 } | |
244 | |
245 return $self->{peptide}; | |
246 } | |
247 | |
248 =head2 codon | |
249 | |
250 Description: Return the codon sequence that this allele is predicted to result in | |
251 Returntype : string or undef if this allele is not in the CDS or is a frameshift | |
252 Exceptions : none | |
253 Status : At Risk | |
254 | |
255 =cut | |
256 | |
257 sub codon { | |
258 my ($self, $codon) = @_; | |
259 | |
260 $self->{codon} = $codon if defined $codon; | |
261 | |
262 my $tv = $self->transcript_variation; | |
263 | |
264 return undef unless $tv->translation_start && $tv->translation_end; | |
265 | |
266 return undef unless $self->seq_is_dna; | |
267 | |
268 unless ($self->{codon}) { | |
269 | |
270 # try to calculate the codon sequence | |
271 | |
272 my $seq = $self->feature_seq; | |
273 | |
274 $seq = '' if $seq eq '-'; | |
275 | |
276 # calculate necessary coords and lengths | |
277 | |
278 my $codon_cds_start = $tv->translation_start * 3 - 2; | |
279 my $codon_cds_end = $tv->translation_end * 3; | |
280 my $codon_len = $codon_cds_end - $codon_cds_start + 1; | |
281 my $vf_nt_len = $tv->cds_end - $tv->cds_start + 1; | |
282 my $allele_len = $self->seq_length; | |
283 | |
284 if ($allele_len != $vf_nt_len) { | |
285 if (abs($allele_len - $vf_nt_len) % 3) { | |
286 # this is a frameshift variation, we don't attempt to | |
287 # calculate the resulting codon or peptide change as this | |
288 # could get quite complicated | |
289 return undef; | |
290 } | |
291 } | |
292 | |
293 # splice the allele sequence into the CDS | |
294 | |
295 my $cds = $tv->_translateable_seq; | |
296 | |
297 substr($cds, $tv->cds_start-1, $vf_nt_len) = $seq; | |
298 | |
299 # and extract the codon sequence | |
300 | |
301 my $codon = substr($cds, $codon_cds_start-1, $codon_len + ($allele_len - $vf_nt_len)); | |
302 | |
303 if (length($codon) < 1) { | |
304 $self->{codon} = '-'; | |
305 $self->{peptide} = '-'; | |
306 } | |
307 else { | |
308 $self->{codon} = $codon; | |
309 } | |
310 } | |
311 | |
312 return $self->{codon}; | |
313 } | |
314 | |
315 =head2 display_codon | |
316 | |
317 Description: Return the codon sequence that this allele is predicted to result in | |
318 with the affected nucleotides identified in UPPER CASE and other | |
319 nucleotides in lower case | |
320 Returntype : string or undef if this allele is not in the CDS or is a frameshift | |
321 Exceptions : none | |
322 Status : At Risk | |
323 | |
324 =cut | |
325 | |
326 sub display_codon { | |
327 my $self = shift; | |
328 | |
329 unless ($self->{_display_codon}) { | |
330 | |
331 if ($self->codon && defined $self->transcript_variation->codon_position) { | |
332 | |
333 my $display_codon = lc $self->codon; | |
334 | |
335 # if this allele is an indel then just return all lowercase | |
336 | |
337 if ($self->feature_seq ne '-') { | |
338 | |
339 # codon_position is 1-based, while substr assumes the string starts at 0 | |
340 | |
341 my $pos = $self->transcript_variation->codon_position - 1; | |
342 | |
343 my $len = length $self->feature_seq; | |
344 | |
345 substr($display_codon, $pos, $len) = uc substr($display_codon, $pos, $len); | |
346 } | |
347 | |
348 $self->{_display_codon} = $display_codon; | |
349 } | |
350 } | |
351 | |
352 return $self->{_display_codon}; | |
353 } | |
354 | |
355 =head2 polyphen_prediction | |
356 | |
357 Description: Return the qualitative PolyPhen-2 prediction for the effect of this allele. | |
358 (Note that we currently only have PolyPhen predictions for variants that | |
359 result in single amino acid substitutions in human) | |
360 Returntype : string (one of 'probably damaging', 'possibly damaging', 'benign', 'unknown') | |
361 if this is a non-synonymous mutation and a prediction is available, undef | |
362 otherwise | |
363 Exceptions : none | |
364 Status : At Risk | |
365 | |
366 =cut | |
367 | |
368 sub polyphen_prediction { | |
369 my ($self, $classifier, $polyphen_prediction) = @_; | |
370 | |
371 $classifier ||= 'humvar'; | |
372 | |
373 my $analysis = "polyphen_${classifier}"; | |
374 | |
375 $self->{$analysis}->{prediction} = $polyphen_prediction if $polyphen_prediction; | |
376 | |
377 unless ($self->{$analysis}->{prediction}) { | |
378 my ($prediction, $score) = $self->_protein_function_prediction($analysis); | |
379 $self->{$analysis}->{score} = $score; | |
380 $self->{$analysis}->{prediction} = $prediction; | |
381 } | |
382 | |
383 return $self->{$analysis}->{prediction}; | |
384 } | |
385 | |
386 =head2 polyphen_score | |
387 | |
388 Description: Return the PolyPhen-2 probability that this allele is deleterious (Note that we | |
389 currently only have PolyPhen predictions for variants that result in single | |
390 amino acid substitutions in human) | |
391 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is | |
392 available, undef otherwise | |
393 Exceptions : none | |
394 Status : At Risk | |
395 | |
396 =cut | |
397 | |
398 sub polyphen_score { | |
399 my ($self, $classifier, $polyphen_score) = @_; | |
400 | |
401 $classifier ||= 'humvar'; | |
402 | |
403 my $analysis = "polyphen_${classifier}"; | |
404 | |
405 $self->{$analysis}->{score} = $polyphen_score if defined $polyphen_score; | |
406 | |
407 unless ($self->{$analysis}->{score}) { | |
408 my ($prediction, $score) = $self->_protein_function_prediction($analysis); | |
409 $self->{$analysis}->{score} = $score; | |
410 $self->{$analysis}->{prediction} = $prediction; | |
411 } | |
412 | |
413 return $self->{$analysis}->{score}; | |
414 } | |
415 | |
416 =head2 sift_prediction | |
417 | |
418 Description: Return the qualitative SIFT prediction for the effect of this allele. | |
419 (Note that we currently only have SIFT predictions for variants that | |
420 result in single amino acid substitutions in human) | |
421 Returntype : string (one of 'tolerated', 'deleterious') if this is a non-synonymous | |
422 mutation and a prediction is available, undef otherwise | |
423 Exceptions : none | |
424 Status : At Risk | |
425 | |
426 =cut | |
427 | |
428 sub sift_prediction { | |
429 my ($self, $sift_prediction) = @_; | |
430 | |
431 $self->{sift_prediction} = $sift_prediction if $sift_prediction; | |
432 | |
433 unless ($self->{sift_prediction}) { | |
434 my ($prediction, $score) = $self->_protein_function_prediction('sift'); | |
435 $self->{sift_score} = $score; | |
436 $self->{sift_prediction} = $prediction unless $self->{sift_prediction}; | |
437 } | |
438 | |
439 return $self->{sift_prediction}; | |
440 } | |
441 | |
442 =head2 sift_score | |
443 | |
444 Description: Return the SIFT score for this allele (Note that we currently only have SIFT | |
445 predictions for variants that result in single amino acid substitutions in human) | |
446 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is | |
447 available, undef otherwise | |
448 Exceptions : none | |
449 Status : At Risk | |
450 | |
451 =cut | |
452 | |
453 sub sift_score { | |
454 my ($self, $sift_score) = @_; | |
455 | |
456 $self->{sift_score} = $sift_score if defined $sift_score; | |
457 | |
458 unless ($self->{sift_score}) { | |
459 my ($prediction, $score) = $self->_protein_function_prediction('sift'); | |
460 $self->{sift_score} = $score; | |
461 $self->{sift_prediction} = $prediction; | |
462 } | |
463 | |
464 return $self->{sift_score}; | |
465 } | |
466 | |
467 | |
468 sub _protein_function_prediction { | |
469 my ($self, $analysis) = @_; | |
470 | |
471 # we can only get results for variants that cause a single amino acid substitution, | |
472 # so check the peptide allele string first | |
473 | |
474 if ($self->pep_allele_string && $self->pep_allele_string =~ /^[A-Z]\/[A-Z]$/ && defined $AA_LOOKUP->{$self->peptide}) { | |
475 | |
476 if (my $matrix = $self->transcript_variation->_protein_function_predictions($analysis)) { | |
477 | |
478 my ($prediction, $score) = $matrix->get_prediction( | |
479 $self->transcript_variation->translation_start, | |
480 $self->peptide, | |
481 ); | |
482 | |
483 return wantarray ? ($prediction, $score) : $prediction; | |
484 } | |
485 } | |
486 | |
487 return undef; | |
488 } | |
489 | |
490 =head2 hgvs_genomic | |
491 | |
492 Description: Return a string representing the genomic-level effect of this allele in HGVS format | |
493 Returntype : string | |
494 Exceptions : none | |
495 Status : At Risk | |
496 | |
497 =cut | |
498 | |
499 sub hgvs_genomic { | |
500 return _hgvs_generic(@_,'genomic'); | |
501 } | |
502 | |
503 =head2 hgvs_coding | |
504 | |
505 Description: Return a string representing the CDS-level effect of this allele in HGVS format | |
506 Returntype : string or undef if this allele is not in the CDS | |
507 Exceptions : none | |
508 Status : At Risk | |
509 | |
510 =cut | |
511 | |
512 sub hgvs_coding { | |
513 | |
514 deprecate('HGVS coding support has been moved to hgvs_transcript. This method will be removed in the next release.'); | |
515 return hgvs_transcript(@_); | |
516 } | |
517 | |
518 | |
519 =head2 hgvs_transcript | |
520 | |
521 Description: Return a string representing the CDS-level effect of this allele in HGVS format | |
522 Returntype : string or undef if this allele is not in the CDS | |
523 Exceptions : none | |
524 Status : At Risk | |
525 | |
526 =cut | |
527 | |
528 | |
529 sub hgvs_transcript { | |
530 | |
531 | |
532 my $self = shift; | |
533 my $notation = shift; | |
534 | |
535 ##### set if string supplied | |
536 $self->{hgvs_transcript} = $notation if defined $notation; | |
537 ##### return if held | |
538 return $self->{hgvs_transcript} if defined $self->{hgvs_transcript} ; | |
539 return $self->{hgvs_coding} if defined $self->{hgvs_coding} ; | |
540 | |
541 my $variation_feature_sequence = $self->variation_feature_seq() ; | |
542 | |
543 ### don't try to handle odd characters | |
544 return undef if $variation_feature_sequence =~ m/[^ACGT\-]/ig; | |
545 | |
546 ### no result for reference allele | |
547 return undef if $self->is_reference ==1; | |
548 | |
549 ### else evaluate | |
550 | |
551 ### get reference sequence strand | |
552 my $refseq_strand = $self->transcript_variation->transcript->strand(); | |
553 | |
554 if($DEBUG ==1){ | |
555 my $var_name = $self->transcript_variation->variation_feature->variation_name(); | |
556 print "\nHGVS transcript: Checking $var_name refseq strand => $refseq_strand seq name : " . $self->transcript_variation->transcript_stable_id() . " var strand " . $self->transcript_variation->variation_feature->strand() . " vf st " . $self->variation_feature->strand() ." seqname: " . $self->variation_feature->seqname() ." seq: " . $self->variation_feature_seq ."\n"; | |
557 } | |
558 | |
559 my $hgvs_notation ; ### store components of HGVS string in hash | |
560 | |
561 ### vf strand is relative to transcript or transcript slice | |
562 if( $self->transcript_variation->variation_feature->strand() <0 && $refseq_strand >0 || | |
563 $self->transcript_variation->variation_feature->strand() >0 && $refseq_strand < 0 | |
564 ){ | |
565 reverse_comp(\$variation_feature_sequence); | |
566 } | |
567 | |
568 ### need to get ref seq from slice transcript is on for intron labelling | |
569 my ($slice_start, $slice_end, $slice) = $self->_var2transcript_slice_coords(); | |
570 | |
571 | |
572 unless($slice_start){ | |
573 #warn "TVA: VF not within transcript - no HGVS\n"; | |
574 return undef; | |
575 } | |
576 | |
577 ### decide event type from HGVS nomenclature | |
578 $hgvs_notation = hgvs_variant_notation( $variation_feature_sequence, ### alt_allele, | |
579 $slice->seq(), ### using this to extract ref allele | |
580 $slice_start, | |
581 $slice_end | |
582 ); | |
583 | |
584 ### This should not happen | |
585 unless($hgvs_notation->{'type'}){ | |
586 #warn "Error - not continuing; no HGVS annotation\n"; | |
587 return undef; | |
588 } | |
589 | |
590 ### create reference name - transcript name & seq version | |
591 $hgvs_notation->{'ref_name'} = $self->transcript_variation->transcript_stable_id() . "." . $self->transcript_variation->transcript->version(); | |
592 | |
593 | |
594 ### get position relative to transcript features [use HGVS coords not variation feature coords due to dups] | |
595 $hgvs_notation->{start} = $self->_get_cDNA_position( $hgvs_notation->{start} ); | |
596 $hgvs_notation->{end} = $self->_get_cDNA_position( $hgvs_notation->{end} ); | |
597 | |
598 | |
599 # Make sure that start is always less than end | |
600 my ($exon_start_coord, $intron_start_offset) = $hgvs_notation->{start} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/; | |
601 my ($exon_end_coord, $intron_end_offset) = $hgvs_notation->{end} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/; | |
602 $intron_start_offset ||= 0; | |
603 $intron_end_offset ||= 0; | |
604 | |
605 ($hgvs_notation->{start},$hgvs_notation->{end}) = ($hgvs_notation->{end},$hgvs_notation->{start}) if | |
606 (($exon_start_coord + $intron_start_offset) > ($exon_end_coord + $intron_end_offset)); | |
607 | |
608 | |
609 | |
610 if($self->transcript->cdna_coding_start()){ | |
611 $hgvs_notation->{'numbering'} = "c"; ### set 'c' if transcript is coding | |
612 } | |
613 else{ | |
614 $hgvs_notation->{'numbering'} = "n"; ### set 'n' if transcript non-coding | |
615 } | |
616 ### generic formatting | |
617 $self->{hgvs_transcript} = format_hgvs_string( $hgvs_notation); | |
618 | |
619 if($DEBUG ==1){print "HGVS notation for var " . $self->transcript_variation->variation_feature->variation_name() . " from hgvs transcript : " . $self->{hgvs_transcript} . " \n";} | |
620 | |
621 return $self->{hgvs_transcript}; | |
622 | |
623 } | |
624 | |
625 | |
626 =head2 hgvs_protein | |
627 | |
628 Description: Return a string representing the protein-level effect of this allele in HGVS format | |
629 Returntype : string or undef if this allele is not in the CDS | |
630 Exceptions : none | |
631 Status : At Risk | |
632 | |
633 =cut | |
634 | |
635 sub hgvs_protein { | |
636 | |
637 my $self = shift; | |
638 my $notation = shift; | |
639 my $hgvs_notation; | |
640 | |
641 if($DEBUG ==1){print "\nStarting hgvs_protein with ". $self->transcript_variation->variation_feature->variation_name() . "\n"; } | |
642 | |
643 ### set if string supplied | |
644 $self->{hgvs_protein} = $notation if defined $notation; | |
645 | |
646 ### return if set | |
647 return $self->{hgvs_protein} if defined $self->{hgvs_protein} ; | |
648 | |
649 ### don't try to handle odd characters | |
650 return undef if $self->variation_feature_seq() =~ m/[^ACGT\-]/ig; | |
651 | |
652 #### else, check viable input and create notation | |
653 return if $self->is_reference(); | |
654 | |
655 unless ($self->transcript_variation->translation_start()){ | |
656 return undef ; | |
657 } | |
658 | |
659 #### for debug | |
660 #my $var_name = $self->transcript_variation->variation_feature->variation_name(); | |
661 | |
662 ### get reference sequence [add seq version to transcript name] | |
663 $hgvs_notation->{ref_name} = $self->transcript_variation->transcript->translation()->display_id() . "." . $self->transcript_variation->transcript->translation()->version(); | |
664 | |
665 $hgvs_notation->{'numbering'} = 'p'; | |
666 | |
667 ### get default reference location [changed later in some cases eg. duplication] | |
668 $hgvs_notation->{start} = $self->transcript_variation->translation_start(); | |
669 $hgvs_notation->{end} = $self->transcript_variation->translation_end(); | |
670 | |
671 | |
672 ## get default reference & alt peptides [changed later to hgvs format] | |
673 $hgvs_notation->{alt} = $self->peptide; | |
674 $hgvs_notation->{ref} = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide; | |
675 | |
676 if(defined $hgvs_notation->{alt} && defined $hgvs_notation->{ref}){ | |
677 $hgvs_notation = _clip_alleles( $hgvs_notation); | |
678 } | |
679 | |
680 #### define type - types are different for protein numbering | |
681 $hgvs_notation = $self->_get_hgvs_protein_type($hgvs_notation); | |
682 | |
683 ##### Convert ref & alt peptides taking into account HGVS rules | |
684 $hgvs_notation = $self->_get_hgvs_peptides($hgvs_notation); | |
685 | |
686 ##### String formatting | |
687 $self->{hgvs_protein} = $self->_get_hgvs_protein_format($hgvs_notation); | |
688 return $self->{hgvs_protein} ; | |
689 } | |
690 | |
691 ### HGVS: format protein string | |
692 sub _get_hgvs_protein_format{ | |
693 | |
694 my $self = shift; | |
695 my $hgvs_notation = shift; | |
696 | |
697 if ((defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} && | |
698 $hgvs_notation->{ref} eq $hgvs_notation->{alt}) || | |
699 (defined $hgvs_notation->{type} && $hgvs_notation->{type} eq "=")){ | |
700 | |
701 ### no protein change - return transcript nomenclature with flag for neutral protein consequence | |
702 $hgvs_notation->{'hgvs'} = $self->hgvs_transcript() . "(p.=)"; | |
703 return $hgvs_notation->{'hgvs'} ; | |
704 } | |
705 | |
706 ### all start with refseq name & numbering type | |
707 $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . "."; | |
708 | |
709 ### handle stop_lost seperately regardless of cause by del/delins => p.XposAA1extnum_AA_to_stop | |
710 if(stop_lost($self)){ ### if deletion of stop add extX and number of new aa to alt | |
711 $hgvs_notation->{alt} = substr($hgvs_notation->{alt}, 0, 3); | |
712 if($hgvs_notation->{type} eq "del"){ | |
713 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "del"); | |
714 if(defined $aa_til_stop){ | |
715 $hgvs_notation->{alt} .= "extX" . $aa_til_stop; | |
716 } | |
717 } | |
718 elsif($hgvs_notation->{type} eq ">"){ | |
719 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "subs"); | |
720 if(defined $aa_til_stop){ | |
721 $hgvs_notation->{alt} .= "extX" . $aa_til_stop; | |
722 } | |
723 } | |
724 else{ | |
725 # warn "TVA: stop loss for type $hgvs_notation->{type} not caught \n"; | |
726 } | |
727 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; | |
728 } | |
729 | |
730 elsif( $hgvs_notation->{type} eq "dup"){ | |
731 | |
732 if($hgvs_notation->{start} < $hgvs_notation->{end}){ | |
733 ### list only first and last peptides in long duplicated string | |
734 my $ref_pep_first = substr($hgvs_notation->{alt}, 0, 3); | |
735 my $ref_pep_last = substr($hgvs_notation->{alt}, -3, 3); | |
736 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} ."dup"; | |
737 | |
738 } | |
739 else{ | |
740 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{alt} . $hgvs_notation->{start} . "dup" ; | |
741 } | |
742 print "formating a dup $hgvs_notation->{hgvs} \n"; | |
743 } | |
744 | |
745 elsif($hgvs_notation->{type} eq ">"){ | |
746 #### substitution | |
747 | |
748 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt}; | |
749 } | |
750 | |
751 elsif( $hgvs_notation->{type} eq "delins" || $hgvs_notation->{type} eq "ins" ){ | |
752 | |
753 #### list first and last aa in reference only | |
754 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3); | |
755 my $ref_pep_last; | |
756 if(substr($hgvs_notation->{ref}, -1, 1) eq "X"){ | |
757 $ref_pep_last ="X"; | |
758 } | |
759 else{ | |
760 $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3); | |
761 } | |
762 if($hgvs_notation->{ref} =~ /X$/){ | |
763 ### For stops & add extX & distance to next stop to alt pep | |
764 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "loss"); | |
765 if(defined $aa_til_stop){ | |
766 $hgvs_notation->{alt} .="extX" . $aa_til_stop; | |
767 } | |
768 } | |
769 if($hgvs_notation->{start} == $hgvs_notation->{end} && $hgvs_notation->{type} eq "delins"){ | |
770 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ; | |
771 } | |
772 else{ | |
773 ### correct ordering if needed | |
774 if($hgvs_notation->{start} > $hgvs_notation->{end}){ | |
775 ( $hgvs_notation->{start}, $hgvs_notation->{end}) = ($hgvs_notation->{end}, $hgvs_notation->{start} ); | |
776 } | |
777 | |
778 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ; | |
779 } | |
780 } | |
781 | |
782 elsif($hgvs_notation->{type} eq "fs"){ | |
783 if(defined $hgvs_notation->{alt} && $hgvs_notation->{alt} eq "X"){ ## stop gained | |
784 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; | |
785 | |
786 } | |
787 else{ ## not immediate stop - count aa until next | |
788 | |
789 my $aa_til_stop = _stop_loss_extra_AA($self, $hgvs_notation->{start}-1, "fs"); | |
790 if($aa_til_stop ){ | |
791 ### use long form if new stop found | |
792 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} . "fsX$aa_til_stop" ; | |
793 } | |
794 else{ | |
795 ### otherwise use short form | |
796 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "fs"; | |
797 } | |
798 } | |
799 } | |
800 | |
801 elsif( $hgvs_notation->{type} eq "del"){ | |
802 $hgvs_notation->{alt} = "del"; | |
803 if( length($hgvs_notation->{ref}) >3 ){ | |
804 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3); | |
805 my $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3); | |
806 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} .$hgvs_notation->{alt} ; | |
807 } | |
808 else{ | |
809 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; | |
810 } | |
811 } | |
812 | |
813 elsif($hgvs_notation->{start} ne $hgvs_notation->{end} ){ | |
814 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "_" . $hgvs_notation->{alt} . $hgvs_notation->{end} ; | |
815 } | |
816 | |
817 else{ | |
818 #### default to substitution | |
819 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt}; | |
820 } | |
821 | |
822 if($DEBUG==1){ print "Returning protein format: $hgvs_notation->{'hgvs'}\n";} | |
823 return $hgvs_notation->{'hgvs'}; | |
824 } | |
825 | |
826 ### HGVS: get type of variation event in protein terms | |
827 sub _get_hgvs_protein_type{ | |
828 | |
829 my $self = shift; | |
830 my $hgvs_notation = shift; | |
831 | |
832 | |
833 ### get allele length | |
834 my ($ref_length, $alt_length ) = $self->_get_allele_length(); | |
835 | |
836 | |
837 if( defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} ){ | |
838 ### Run type checks on peptides if available | |
839 | |
840 if($hgvs_notation->{alt} eq $hgvs_notation->{ref} ){ | |
841 #### synonymous indicated by = | |
842 $hgvs_notation->{type} = "="; | |
843 } | |
844 elsif( length($hgvs_notation->{ref}) ==1 && length($hgvs_notation->{alt}) ==1 ) { | |
845 | |
846 $hgvs_notation->{type} = ">"; | |
847 } | |
848 elsif($hgvs_notation->{ref} eq "-" || $hgvs_notation->{ref} eq "") { | |
849 | |
850 $hgvs_notation->{type} = "ins"; | |
851 } | |
852 elsif($hgvs_notation->{alt} eq "" ) { | |
853 | |
854 $hgvs_notation->{type} = "del"; | |
855 } | |
856 elsif( (length($hgvs_notation->{alt}) > length($hgvs_notation->{ref}) ) && | |
857 $hgvs_notation->{alt} =~ / $hgvs_notation->{ref}/ ) { | |
858 ### capture duplication event described as TTT/TTTTT | |
859 $hgvs_notation->{type} = "dup"; | |
860 } | |
861 | |
862 elsif( (length($hgvs_notation->{alt}) >1 && length($hgvs_notation->{ref}) ==1) || | |
863 (length($hgvs_notation->{alt}) ==1 && length($hgvs_notation->{ref}) >1) ) { | |
864 $hgvs_notation->{type} = "delins"; | |
865 } | |
866 else{ | |
867 $hgvs_notation->{type} = ">"; | |
868 } | |
869 } | |
870 | |
871 | |
872 elsif($ref_length ne $alt_length && ($ref_length - $alt_length)%3 !=0 ){ | |
873 $hgvs_notation->{type} = "fs"; | |
874 } | |
875 | |
876 elsif(length($self->variation_feature_seq()) >1 ){ | |
877 if($hgvs_notation->{start} == ($hgvs_notation->{end} + 1) ){ | |
878 ### convention for insertions - end one less than start | |
879 $hgvs_notation->{type} = "ins"; | |
880 } | |
881 elsif( $hgvs_notation->{start} != $hgvs_notation->{end} ){ | |
882 $hgvs_notation->{type} = "delins"; | |
883 } | |
884 else{ | |
885 $hgvs_notation->{type} = ">"; | |
886 } | |
887 } | |
888 else{ | |
889 # warn "Cannot define protein variant type [$ref_length - $alt_length]\n"; | |
890 } | |
891 return $hgvs_notation ; | |
892 | |
893 } | |
894 | |
895 ### HGVS: get reference & alternative peptide | |
896 sub _get_hgvs_peptides{ | |
897 | |
898 my $self = shift; | |
899 my $hgvs_notation = shift; | |
900 | |
901 if($hgvs_notation->{type} eq "fs"){ | |
902 ### ensembl alt/ref peptides not the same as HGVS alt/ref - look up seperately | |
903 $hgvs_notation = $self->_get_fs_peptides($hgvs_notation); | |
904 | |
905 } | |
906 elsif($hgvs_notation->{type} eq "ins" ){ | |
907 | |
908 ### HGVS ref are peptides flanking insertion | |
909 $hgvs_notation->{ref} = $self->_get_surrounding_peptides($hgvs_notation->{start}); | |
910 | |
911 if( $hgvs_notation->{alt} =~/\*/){ | |
912 ## inserted bases after stop irrelevant; consider as substitution gaining stop MAINTAIN PREVIOUS BEHAVIOUR FOR NOW | |
913 #$hgvs_notation->{alt} =~ s/\*\w+$/\*/; | |
914 } | |
915 else{ | |
916 ### Additional check that inserted bases do not duplicate 3' reference sequence [set to type = dup if so] | |
917 $hgvs_notation = $self->_check_for_peptide_duplication($hgvs_notation); | |
918 } | |
919 } | |
920 | |
921 | |
922 ### Convert peptide to 3 letter code as used in HGVS | |
923 $hgvs_notation->{ref} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{ref}, -id => 'ref', -alphabet => 'protein')) || ""; | |
924 if( $hgvs_notation->{alt} eq "-"){ | |
925 $hgvs_notation->{alt} = "del"; | |
926 } | |
927 else{ | |
928 $hgvs_notation->{alt} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref', -alphabet => 'protein')) || ""; | |
929 } | |
930 | |
931 ### handle special cases | |
932 if( affects_start_codon($self) ){ | |
933 #### handle initiator loss -> probably no translation => alt allele is '?' | |
934 $hgvs_notation->{alt} = "?"; | |
935 $hgvs_notation->{type} = ""; | |
936 } | |
937 | |
938 elsif( $hgvs_notation->{type} eq "del"){ | |
939 #### partial last codon | |
940 $hgvs_notation->{alt} = "del"; | |
941 } | |
942 elsif($hgvs_notation->{type} eq "fs"){ | |
943 ### only quote first ref peptide for frameshift | |
944 $hgvs_notation->{ref} = substr($hgvs_notation->{ref},0,3); | |
945 } | |
946 | |
947 ### set stop to HGVS prefered "X" | |
948 if(defined $hgvs_notation->{ref}){ $hgvs_notation->{ref} =~ s/Ter|Xaa/X/g; } | |
949 if(defined $hgvs_notation->{alt}){ $hgvs_notation->{alt} =~ s/Ter|Xaa/X/g; } | |
950 | |
951 return ($hgvs_notation); | |
952 | |
953 } | |
954 | |
955 ### HGVS: remove common peptides from alt and ref strings & shift start/end accordingly | |
956 sub _clip_alleles{ | |
957 | |
958 my $hgvs_notation = shift; | |
959 | |
960 my $check_alt = $hgvs_notation->{alt} ; | |
961 my $check_ref = $hgvs_notation->{ref} ; | |
962 my $check_start = $hgvs_notation->{start}; | |
963 | |
964 ### strip same bases from start of string | |
965 for (my $p =0; $p <length ($hgvs_notation->{ref}); $p++){ | |
966 my $check_next_ref = substr( $check_ref, 0, 1); | |
967 my $check_next_alt = substr( $check_alt, 0, 1); | |
968 | |
969 if($check_next_ref eq "*" && $check_next_alt eq "*"){ | |
970 ### stop re-created by variant - no protein change | |
971 $hgvs_notation->{type} = "="; | |
972 | |
973 return($hgvs_notation); | |
974 } | |
975 | |
976 if($check_next_ref eq $check_next_alt){ | |
977 $check_start++; | |
978 $check_ref = substr( $check_ref, 1); | |
979 $check_alt = substr( $check_alt, 1); | |
980 } | |
981 else{ | |
982 last; | |
983 } | |
984 } | |
985 #### strip same bases from end of string | |
986 for (my $q =0; $q < length ($check_ref); $q++){ | |
987 my $check_next_ref = substr( $check_ref, -1, 1); | |
988 my $check_next_alt = substr( $check_alt, -1, 1); | |
989 if($check_next_ref eq $check_next_alt){ | |
990 chop $check_ref; | |
991 chop $check_alt; | |
992 } | |
993 else{ | |
994 last; | |
995 } | |
996 } | |
997 | |
998 $hgvs_notation->{alt} = $check_alt; | |
999 $hgvs_notation->{ref} = $check_ref; | |
1000 | |
1001 ### check if clipping force type change & adjust location | |
1002 if(length ($check_ref) == 0 && length ($check_alt) >= 1){ | |
1003 ### re-set as ins not delins | |
1004 $hgvs_notation->{type} ="ins"; | |
1005 ### insertion between ref base and next => adjust next | |
1006 if($hgvs_notation->{end} == $hgvs_notation->{start} ){$hgvs_notation->{end} = $check_start; } | |
1007 # $hgvs_notation->{start} = $check_start; | |
1008 } | |
1009 elsif(length ($check_ref) >=1 && length ($check_alt) == 0){ | |
1010 ### re-set as del not delins | |
1011 $hgvs_notation->{type} ="del"; | |
1012 $hgvs_notation->{start} = $check_start; | |
1013 } | |
1014 else{ | |
1015 #### save trimmed peptide string & increased start position | |
1016 $hgvs_notation->{start} = $check_start; | |
1017 | |
1018 } | |
1019 | |
1020 return $hgvs_notation; | |
1021 } | |
1022 | |
1023 | |
1024 | |
1025 | |
1026 | |
1027 #### HGVS: check allele lengths to look for frameshifts | |
1028 sub _get_allele_length{ | |
1029 | |
1030 my $self = shift; | |
1031 my $ref_length = 0; | |
1032 my $alt_length = 0; | |
1033 | |
1034 my $al_string = $self->allele_string(); | |
1035 my $ref_allele = (split/\//, $al_string)[0]; | |
1036 $ref_allele =~ s/\-//; | |
1037 $ref_length = length $ref_allele; | |
1038 | |
1039 my $alt_allele = $self->variation_feature_seq(); | |
1040 $alt_allele =~ s/\-//; | |
1041 $alt_length = length $alt_allele; | |
1042 | |
1043 return ($ref_length, $alt_length ); | |
1044 | |
1045 } | |
1046 | |
1047 ### HGVS: list first different peptide [may not be first changed codon] | |
1048 sub _get_fs_peptides{ | |
1049 | |
1050 my $self = shift; | |
1051 my $hgvs_notation = shift; | |
1052 | |
1053 ### get CDS with alt variant | |
1054 my $alt_cds = $self->_get_alternate_cds(); | |
1055 | |
1056 #### get new translation | |
1057 my $alt_trans = $alt_cds->translate()->seq(); | |
1058 | |
1059 ### get changed end (currently in single letter AA coding) | |
1060 my $ref_trans = $self->transcript->translate()->seq(); | |
1061 $ref_trans .= "*"; ## appending ref stop for checking purposes | |
1062 | |
1063 $hgvs_notation->{start} = $self->transcript_variation->translation_start() ; | |
1064 | |
1065 if( $hgvs_notation->{start} > length $alt_trans){ ## deletion of stop, no further AA in alt seq | |
1066 $hgvs_notation->{alt} = "del"; | |
1067 $hgvs_notation->{type} = "del"; | |
1068 return $hgvs_notation; | |
1069 } | |
1070 | |
1071 while ($hgvs_notation->{start} <= length $alt_trans){ | |
1072 ### frame shift may result in the same AA# | |
1073 | |
1074 $hgvs_notation->{ref} = substr($ref_trans, $hgvs_notation->{start}-1, 1); | |
1075 $hgvs_notation->{alt} = substr($alt_trans, $hgvs_notation->{start}-1, 1); | |
1076 | |
1077 if($hgvs_notation->{ref} eq "*" && $hgvs_notation->{alt} eq "*"){ | |
1078 ### variation at stop codon, but maintains stop codon | |
1079 return ($hgvs_notation); | |
1080 } | |
1081 last if $hgvs_notation->{ref} ne $hgvs_notation->{alt}; | |
1082 $hgvs_notation->{start}++; | |
1083 } | |
1084 return ($hgvs_notation); | |
1085 | |
1086 } | |
1087 | |
1088 #### HGVS: if variant is an insertion, ref pep is initially "-", so seek AA before and after insertion | |
1089 sub _get_surrounding_peptides{ | |
1090 | |
1091 my $self = shift; | |
1092 my $ref_pos = shift; | |
1093 my $ref_trans = $self->transcript->translate()->seq(); | |
1094 | |
1095 my $end = substr($ref_trans, $ref_pos-1); | |
1096 my $ref_string = substr($ref_trans, $ref_pos-1, 2); | |
1097 | |
1098 return ($ref_string); | |
1099 | |
1100 } | |
1101 | |
1102 | |
1103 #### HGVS: alternate CDS needed to check for new stop when variant disrupts 'reference' stop | |
1104 sub _get_alternate_cds{ | |
1105 | |
1106 my $self = shift; | |
1107 | |
1108 ### get reference sequence | |
1109 my $reference_cds_seq = $self->transcript->translateable_seq(); | |
1110 | |
1111 ### get sequences upstream and downstream of variant | |
1112 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) ); | |
1113 my $downstream_seq = substr($reference_cds_seq, ($self->transcript_variation->cds_end() ) ); | |
1114 | |
1115 ### fix alternate allele if deletion | |
1116 my $alt_allele = $self->variation_feature_seq(); | |
1117 $alt_allele =~ s/\-//; | |
1118 | |
1119 ### build alternate seq | |
1120 my $alternate_seq = $upstream_seq . $alt_allele . $downstream_seq ; | |
1121 | |
1122 ### create seq obj with alternative allele in the CDS sequence | |
1123 my $alt_cds =Bio::PrimarySeq->new(-seq => $alternate_seq, -id => 'alt_cds', -alphabet => 'dna'); | |
1124 | |
1125 ### append UTR if available as stop may be disrupted | |
1126 my $utr = $self->transcript_variation->transcript->three_prime_utr(); | |
1127 | |
1128 if (defined $utr) { | |
1129 ### append the UTR to the alternative CDS | |
1130 $alt_cds->seq($alt_cds->seq() . $utr->seq()); | |
1131 } | |
1132 else{ | |
1133 ##warn "No UTR available for alternate CDS\n"; | |
1134 } | |
1135 | |
1136 return $alt_cds; | |
1137 } | |
1138 | |
1139 ### HGVS: if inserted string is identical to 3' reference sequence, describe as duplication | |
1140 sub _check_for_peptide_duplication{ | |
1141 | |
1142 my $self = shift; | |
1143 my $hgvs_notation= shift; | |
1144 | |
1145 ##### get reference sequence | |
1146 my $reference_cds_seq = $self->transcript->translateable_seq(); | |
1147 | |
1148 ##### get sequence upstream of variant | |
1149 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) ); | |
1150 | |
1151 ##### create translation to check against inserted peptides | |
1152 my $upstream_cds =Bio::PrimarySeq->new(-seq => $upstream_seq, -id => 'alt_cds', -alphabet => 'dna'); | |
1153 my $upstream_trans = $upstream_cds->translate()->seq(); | |
1154 | |
1155 ## Test whether alt peptide matches the reference sequence just before the variant | |
1156 my $test_new_start = $hgvs_notation->{'start'} - length($hgvs_notation->{'alt'}) -1 ; | |
1157 my $test_seq = substr($upstream_trans, $test_new_start, length($hgvs_notation->{'alt'})); | |
1158 | |
1159 if ( $test_new_start >= 0 && $test_seq eq $hgvs_notation->{alt}) { | |
1160 | |
1161 $hgvs_notation->{type} = 'dup'; | |
1162 $hgvs_notation->{end} = $hgvs_notation->{start} -1; | |
1163 $hgvs_notation->{start} -= length($hgvs_notation->{alt}); | |
1164 | |
1165 } | |
1166 | |
1167 return $hgvs_notation; | |
1168 | |
1169 } | |
1170 | |
1171 #### HGVS: if a stop is lost, seek the next in transcript & count number of extra AA's | |
1172 sub _stop_loss_extra_AA{ | |
1173 | |
1174 my $self = shift; | |
1175 my $ref_var_pos = shift; ### first effected AA - supply for frameshifts | |
1176 my $test = shift; | |
1177 | |
1178 return undef unless $ref_var_pos; | |
1179 | |
1180 my $extra_aa; | |
1181 | |
1182 ### get the sequence with variant added | |
1183 my $alt_cds = $self->_get_alternate_cds(); | |
1184 ### get new translation | |
1185 my $alt_trans = $alt_cds->translate(); | |
1186 | |
1187 my $ref_temp = $self->transcript->translate()->seq; | |
1188 my $ref_len = length($ref_temp); | |
1189 | |
1190 if($DEBUG==1){ | |
1191 print "alt translated:\n" . $alt_trans->seq() . "\n"; | |
1192 print "ref translated:\n$ref_temp\n";; | |
1193 } | |
1194 | |
1195 #### Find the number of residues that are translated until a termination codon is encountered | |
1196 if ($alt_trans->seq() =~ m/\*/) { | |
1197 if($DEBUG==1){print "Got $+[0] aa before stop, var event at $ref_var_pos \n";} | |
1198 | |
1199 if($test eq "fs" ){ | |
1200 ### frame shift - count from first AA effected by variant to stop | |
1201 $extra_aa = $+[0] - $ref_var_pos; | |
1202 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before fs stop [ $+[0] - peptide ref_start: $ref_var_pos )]\n";} | |
1203 } | |
1204 | |
1205 else{ | |
1206 $extra_aa = $+[0] - 1 - $ref_len; | |
1207 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before next stop [ $+[0] - 1 -normal stop $ref_len)]\n";} | |
1208 } | |
1209 } | |
1210 | |
1211 # A special case is if the first aa is a stop codon => don't display the number of residues until the stop codon | |
1212 if(defined $extra_aa && $extra_aa >0){ | |
1213 return $extra_aa ; | |
1214 } | |
1215 else{ | |
1216 #warn "No stop found in alternate sequence\n"; | |
1217 return undef; | |
1218 } | |
1219 | |
1220 } | |
1221 | |
1222 =head | |
1223 # We haven't implemented support for these methods yet | |
1224 | |
1225 sub hgvs_rna { | |
1226 return _hgvs_generic(@_,'rna'); | |
1227 } | |
1228 | |
1229 sub hgvs_mitochondrial { | |
1230 return _hgvs_generic(@_,'mitochondrial'); | |
1231 } | |
1232 | |
1233 =cut | |
1234 | |
1235 sub _hgvs_generic { | |
1236 | |
1237 my $self = shift; | |
1238 my $reference = pop; | |
1239 my $notation = shift; | |
1240 | |
1241 #ĘThe rna and mitochondrial modes have not yet been implemented, so return undef in case we get a call to these | |
1242 return undef if ($reference =~ m/rna|mitochondrial/); | |
1243 | |
1244 my $sub = qq{hgvs_$reference}; | |
1245 | |
1246 $self->{$sub} = $notation if defined $notation; | |
1247 | |
1248 unless ($self->{$sub}) { | |
1249 | |
1250 # Use the transcript this VF is on as the reference feature | |
1251 my $reference_feature = $self->transcript; | |
1252 # If we want genomic coordinates, the reference_feature should actually be the slice for the underlying seq_region | |
1253 $reference_feature = $reference_feature->slice->seq_region_Slice if ($reference eq 'genomic'); | |
1254 | |
1255 # Calculate the HGVS notation on-the-fly and pass it to the TranscriptVariation in order to distribute the result to the other alleles | |
1256 $self->transcript_variation->$sub($self->variation_feature->get_all_hgvs_notations($reference_feature,substr($reference,0,1),undef,undef,$self->transcript_variation)); | |
1257 } | |
1258 | |
1259 return $self->{$sub}; | |
1260 } | |
1261 | |
1262 | |
1263 ### HGVS: move variant to transcript slice | |
1264 sub _var2transcript_slice_coords{ | |
1265 | |
1266 my $self = shift; | |
1267 | |
1268 my $ref_slice = $self->transcript->feature_Slice(); #### returns with strand same as feature | |
1269 my $tr_vf = $self->variation_feature->transfer($ref_slice); | |
1270 | |
1271 # Return undef if this VariationFeature does not fall within the supplied feature. | |
1272 return undef if (!defined $tr_vf || | |
1273 $tr_vf->start < 1 || | |
1274 $tr_vf->end < 1 || | |
1275 $tr_vf->start > ($self->transcript->end - $self->transcript->start + 1) || | |
1276 $tr_vf->end > ($self->transcript->end - $self->transcript->start + 1)); | |
1277 | |
1278 return( $tr_vf->start() , $tr_vf->end(), $ref_slice); | |
1279 } | |
1280 | |
1281 | |
1282 | |
1283 ### HGVS: get variant position in transcript | |
1284 | |
1285 # intron: | |
1286 # If the position is in an intron, the boundary position of the closest exon and | |
1287 # a + or - offset into the intron is returned. | |
1288 # Ordered by genome forward not 5' -> 3' | |
1289 | |
1290 # upstream: | |
1291 # If the position is 5' of the start codon, it is reported relative to the start codon | |
1292 # (-1 being the last nucleotide before the 'A' of ATG). | |
1293 | |
1294 #downstream: | |
1295 # If the position is 3' pf the stop codon, it is reported with a '*' prefix and the offset | |
1296 # from the start codon (*1 being the first nucleotide after the last position of the stop codon) | |
1297 | |
1298 sub _get_cDNA_position { | |
1299 | |
1300 my $self = shift; | |
1301 my $position = shift; ### start or end of variation | |
1302 | |
1303 my $transcript = $self->transcript(); | |
1304 my $strand = $transcript->strand(); | |
1305 | |
1306 #### TranscriptVariation start/stop coord relative to transcript | |
1307 #### Switch to chromosome coordinates taking into account strand | |
1308 $position = ( $strand > 0 ? | |
1309 ( $self->transcript->start() + $position - 1 ) : | |
1310 ( $self->transcript->end() - $position + 1)); | |
1311 | |
1312 | |
1313 | |
1314 # Get all exons and sort them in positional order | |
1315 my @exons = sort {$a->start() <=> $b->start()} @{$transcript->get_all_Exons()}; | |
1316 my $n_exons = scalar(@exons); | |
1317 | |
1318 my $cdna_position; | |
1319 # Loop over the exons and get the coordinates of the variation in exon+intron notation | |
1320 for (my $i=0; $i<$n_exons; $i++) { | |
1321 | |
1322 # Skip if the start point is beyond this exon | |
1323 next if ($position > $exons[$i]->end()); | |
1324 | |
1325 | |
1326 # EXONIC: If the start coordinate is within this exon | |
1327 if ($position >= $exons[$i]->start()) { | |
1328 # Get the cDNA start coordinate of the exon and add the number of nucleotides from the exon boundary to the variation | |
1329 # If the transcript is in the opposite direction, count from the end instead | |
1330 $cdna_position = $exons[$i]->cdna_start($transcript) + ( $strand > 0 ? | |
1331 ( $position - $exons[$i]->start ) : | |
1332 ( $exons[$i]->end() - $position ) | |
1333 ); | |
1334 last; #### last exon checked | |
1335 } | |
1336 ## INTRONIC | |
1337 # Else the start coordinate is between this exon and the previous one, determine which one is closest and get coordinates relative to that one | |
1338 else { | |
1339 | |
1340 my $updist = ($position - $exons[$i-1]->end()); | |
1341 my $downdist = ($exons[$i]->start() - $position); | |
1342 | |
1343 # If the distance to the upstream exon is the shortest, or equal and in the positive orientation, use that | |
1344 if ($updist < $downdist || ($updist == $downdist && $strand >= 0)) { | |
1345 | |
1346 # If the orientation is reversed, we should use the cDNA start and a '-' offset | |
1347 $cdna_position = ($strand >= 0 ? | |
1348 $exons[$i-1]->cdna_end($transcript) . '+' : | |
1349 $exons[$i-1]->cdna_start($transcript) . '-') | |
1350 . $updist; | |
1351 } | |
1352 # Else if downstream is shortest... | |
1353 else { | |
1354 # If the orientation is reversed, we should use the cDNA end and a '+' offset | |
1355 $cdna_position = ($strand >= 0 ? | |
1356 $exons[$i]->cdna_start($transcript) . '-' : | |
1357 $exons[$i]->cdna_end($transcript) . '+') | |
1358 . $downdist; | |
1359 } | |
1360 last; ## last exon checked | |
1361 } | |
1362 } | |
1363 | |
1364 # Shift the position to make it relative to the start & stop codons | |
1365 my $start_codon = $transcript->cdna_coding_start(); | |
1366 my $stop_codon = $transcript->cdna_coding_end(); | |
1367 | |
1368 # Disassemble the cDNA coordinate into the exon and intron parts | |
1369 ### just built this now taking it appart again | |
1370 my ($cdna_coord, $intron_offset) = $cdna_position =~ m/([0-9]+)([\+\-][0-9]+)?/; | |
1371 | |
1372 | |
1373 # Start by correcting for the stop codon | |
1374 if (defined($stop_codon) && $cdna_coord > $stop_codon) { | |
1375 # Get the offset from the stop codon | |
1376 $cdna_coord -= $stop_codon; | |
1377 # Prepend a * to indicate the position is in the 3' UTR | |
1378 $cdna_coord = '*' . $cdna_coord; | |
1379 } | |
1380 elsif (defined($start_codon)) { | |
1381 # If the position is beyond the start codon, add 1 to get the correct offset | |
1382 $cdna_coord += ($cdna_coord >= $start_codon); | |
1383 # Subtract the position of the start codon | |
1384 $cdna_coord -= $start_codon; | |
1385 } | |
1386 | |
1387 # Re-assemble the cDNA position [ return exon num & offset & direction for intron eg. 142+363] | |
1388 $cdna_position = $cdna_coord . (defined($intron_offset) ? $intron_offset : ''); | |
1389 return $cdna_position; | |
1390 } | |
1391 | |
1392 | |
1393 1; | |
1394 | |
1395 |