Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Variation::TranscriptVariationAllele | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 use Bio::EnsEMBL::Variation::TranscriptVariationAllele; | |
28 | |
29 my $tva = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new( | |
30 -transcript_variation => $tv, | |
31 -variation_feature_seq => 'A', | |
32 -is_reference => 0, | |
33 ); | |
34 | |
35 print "sequence with respect to the transcript: ", $tva->feature_seq, "\n"; | |
36 print "sequence with respect to the variation feature: ", $tva->variation_feature_seq, "\n"; | |
37 print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $tva->get_all_OverlapConsequences }), "\n"; | |
38 print "amino acid change: ", $tva->peptide_allele_string, "\n"; | |
39 print "resulting codon: ", $tva->codon, "\n"; | |
40 print "reference codon: ", $tva->transcript_variation->get_reference_TranscriptVariationAllele->codon, "\n"; | |
41 print "PolyPhen prediction: ", $tva->polyphen_prediction, "\n"; | |
42 print "SIFT prediction: ", $tva->sift_prediction, "\n"; | |
43 | |
44 =head1 DESCRIPTION | |
45 | |
46 A TranscriptVariationAllele object represents a single allele of a TranscriptVariation. | |
47 It provides methods that are specific to the sequence of the allele, such as codon, | |
48 peptide etc. Methods that depend only on position (e.g. CDS start) will be found in | |
49 the associated TranscriptVariation. Ordinarily you will not create these objects | |
50 yourself, but instead you would create a TranscriptVariation object which will then | |
51 construct TranscriptVariationAlleles based on the allele string of the associated | |
52 VariationFeature. | |
53 | |
54 Note that any methods that are not specific to Transcripts will be found in the | |
55 VariationFeatureOverlapAllele superclass. | |
56 | |
57 =cut | |
58 | |
59 package Bio::EnsEMBL::Variation::TranscriptVariationAllele; | |
60 | |
61 use strict; | |
62 use warnings; | |
63 | |
64 use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw($AA_LOOKUP); | |
65 use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); | |
66 use Bio::EnsEMBL::Variation::Utils::Sequence qw(hgvs_variant_notation format_hgvs_string); | |
67 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
68 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(within_cds within_intron stop_lost affects_start_codon); | |
69 | |
70 use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele); | |
71 | |
72 | |
73 our $DEBUG = 0; | |
74 | |
75 sub new_fast { | |
76 my ($class, $hashref) = @_; | |
77 | |
78 # swap a transcript_variation argument for a variation_feature_overlap one | |
79 | |
80 if ($hashref->{transcript_variation}) { | |
81 $hashref->{variation_feature_overlap} = delete $hashref->{transcript_variation}; | |
82 } | |
83 | |
84 # and call the superclass | |
85 | |
86 return $class->SUPER::new_fast($hashref); | |
87 } | |
88 | |
89 =head2 transcript_variation | |
90 | |
91 Description: Get/set the associated TranscriptVariation | |
92 Returntype : Bio::EnsEMBL::Variation::TranscriptVariation | |
93 Exceptions : throws if the argument is the wrong type | |
94 Status : At Risk | |
95 | |
96 =cut | |
97 | |
98 sub transcript_variation { | |
99 my ($self, $tv) = @_; | |
100 assert_ref($tv, 'Bio::EnsEMBL::Variation::TranscriptVariation') if $tv; | |
101 return $self->variation_feature_overlap($tv); | |
102 } | |
103 | |
104 =head2 variation_feature | |
105 | |
106 Description: Get the associated VariationFeature | |
107 Returntype : Bio::EnsEMBL::Variation::VariationFeature | |
108 Exceptions : none | |
109 Status : At Risk | |
110 | |
111 =cut | |
112 | |
113 sub variation_feature { | |
114 my $self = shift; | |
115 return $self->transcript_variation->variation_feature; | |
116 } | |
117 | |
118 =head2 pep_allele_string | |
119 | |
120 Description: Return a '/' delimited string of the reference peptide and the | |
121 peptide resulting from this allele, or a single peptide if this | |
122 allele does not change the peptide (e.g. because it is synonymous) | |
123 Returntype : string or undef if this allele is not in the CDS | |
124 Exceptions : none | |
125 Status : At Risk | |
126 | |
127 =cut | |
128 | |
129 sub pep_allele_string { | |
130 my ($self) = @_; | |
131 | |
132 my $pep = $self->peptide; | |
133 | |
134 return undef unless $pep; | |
135 | |
136 my $ref_pep = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide; | |
137 | |
138 return undef unless $ref_pep; | |
139 | |
140 return $ref_pep ne $pep ? $ref_pep.'/'.$pep : $pep; | |
141 } | |
142 | |
143 =head2 codon_allele_string | |
144 | |
145 Description: Return a '/' delimited string of the reference codon and the | |
146 codon resulting from this allele | |
147 Returntype : string or undef if this allele is not in the CDS | |
148 Exceptions : none | |
149 Status : At Risk | |
150 | |
151 =cut | |
152 | |
153 sub codon_allele_string { | |
154 my ($self) = @_; | |
155 | |
156 my $codon = $self->codon; | |
157 | |
158 return undef unless $codon; | |
159 | |
160 my $ref_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->codon; | |
161 | |
162 return $ref_codon.'/'.$codon; | |
163 } | |
164 | |
165 =head2 display_codon_allele_string | |
166 | |
167 Description: Return a '/' delimited string of the reference display_codon and the | |
168 display_codon resulting from this allele. The display_codon identifies | |
169 the nucleotides affected by this variant in UPPER CASE and other | |
170 nucleotides in lower case | |
171 Returntype : string or undef if this allele is not in the CDS | |
172 Exceptions : none | |
173 Status : At Risk | |
174 | |
175 =cut | |
176 | |
177 sub display_codon_allele_string { | |
178 my ($self) = @_; | |
179 | |
180 my $display_codon = $self->display_codon; | |
181 | |
182 return undef unless $display_codon; | |
183 | |
184 my $ref_display_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->display_codon; | |
185 | |
186 return undef unless $ref_display_codon; | |
187 | |
188 return $ref_display_codon.'/'.$display_codon; | |
189 } | |
190 | |
191 =head2 peptide | |
192 | |
193 Description: Return the amino acid sequence that this allele is predicted to result in | |
194 Returntype : string or undef if this allele is not in the CDS or is a frameshift | |
195 Exceptions : none | |
196 Status : At Risk | |
197 | |
198 =cut | |
199 | |
200 sub peptide { | |
201 my ($self, $peptide) = @_; | |
202 | |
203 $self->{peptide} = $peptide if $peptide; | |
204 | |
205 unless ($self->{peptide}) { | |
206 | |
207 return undef unless $self->seq_is_unambiguous_dna; | |
208 | |
209 if (my $codon = $self->codon) { | |
210 | |
211 # the codon method can set the peptide in some circumstances | |
212 # so check here before we try an (expensive) translation | |
213 return $self->{peptide} if $self->{peptide}; | |
214 | |
215 # translate the codon sequence to establish the peptide allele | |
216 | |
217 # allow for partial codons - split the sequence into whole and partial | |
218 # e.g. AAAGG split into AAA and GG | |
219 my $whole_codon = substr($codon, 0, int(length($codon) / 3) * 3); | |
220 my $partial_codon = substr($codon, int(length($codon) / 3) * 3); | |
221 | |
222 my $pep = ''; | |
223 | |
224 if($whole_codon) { | |
225 # for mithocondrial dna we need to to use a different codon table | |
226 my $codon_table = $self->transcript_variation->_codon_table; | |
227 | |
228 my $codon_seq = Bio::Seq->new( | |
229 -seq => $whole_codon, | |
230 -moltype => 'dna', | |
231 -alphabet => 'dna', | |
232 ); | |
233 | |
234 $pep .= $codon_seq->translate(undef, undef, undef, $codon_table)->seq; | |
235 } | |
236 | |
237 if($partial_codon) { | |
238 $pep .= 'X'; | |
239 } | |
240 | |
241 $pep ||= '-'; | |
242 | |
243 $self->{peptide} = $pep; | |
244 } | |
245 } | |
246 | |
247 return $self->{peptide}; | |
248 } | |
249 | |
250 =head2 codon | |
251 | |
252 Description: Return the codon sequence that this allele is predicted to result in | |
253 Returntype : string or undef if this allele is not in the CDS or is a frameshift | |
254 Exceptions : none | |
255 Status : At Risk | |
256 | |
257 =cut | |
258 | |
259 sub codon { | |
260 my ($self, $codon) = @_; | |
261 | |
262 $self->{codon} = $codon if defined $codon; | |
263 | |
264 my $tv = $self->transcript_variation; | |
265 | |
266 return undef unless $tv->translation_start && $tv->translation_end; | |
267 | |
268 return undef unless $self->seq_is_dna; | |
269 | |
270 unless ($self->{codon}) { | |
271 | |
272 # try to calculate the codon sequence | |
273 | |
274 my $seq = $self->feature_seq; | |
275 | |
276 $seq = '' if $seq eq '-'; | |
277 | |
278 # calculate necessary coords and lengths | |
279 | |
280 my $codon_cds_start = $tv->translation_start * 3 - 2; | |
281 my $codon_cds_end = $tv->translation_end * 3; | |
282 my $codon_len = $codon_cds_end - $codon_cds_start + 1; | |
283 my $vf_nt_len = $tv->cds_end - $tv->cds_start + 1; | |
284 my $allele_len = $self->seq_length; | |
285 | |
286 if ($allele_len != $vf_nt_len) { | |
287 if (abs($allele_len - $vf_nt_len) % 3) { | |
288 # this is a frameshift variation, we don't attempt to | |
289 # calculate the resulting codon or peptide change as this | |
290 # could get quite complicated | |
291 return undef; | |
292 } | |
293 } | |
294 | |
295 # splice the allele sequence into the CDS | |
296 | |
297 my $cds = $tv->_translateable_seq; | |
298 | |
299 substr($cds, $tv->cds_start-1, $vf_nt_len) = $seq; | |
300 | |
301 # and extract the codon sequence | |
302 | |
303 my $codon = substr($cds, $codon_cds_start-1, $codon_len + ($allele_len - $vf_nt_len)); | |
304 | |
305 if (length($codon) < 1) { | |
306 $self->{codon} = '-'; | |
307 $self->{peptide} = '-'; | |
308 } | |
309 else { | |
310 $self->{codon} = $codon; | |
311 } | |
312 } | |
313 | |
314 return $self->{codon}; | |
315 } | |
316 | |
317 =head2 display_codon | |
318 | |
319 Description: Return the codon sequence that this allele is predicted to result in | |
320 with the affected nucleotides identified in UPPER CASE and other | |
321 nucleotides in lower case | |
322 Returntype : string or undef if this allele is not in the CDS or is a frameshift | |
323 Exceptions : none | |
324 Status : At Risk | |
325 | |
326 =cut | |
327 | |
328 sub display_codon { | |
329 my $self = shift; | |
330 | |
331 unless ($self->{_display_codon}) { | |
332 | |
333 if ($self->codon && defined $self->transcript_variation->codon_position) { | |
334 | |
335 my $display_codon = lc $self->codon; | |
336 | |
337 # if this allele is an indel then just return all lowercase | |
338 | |
339 if ($self->feature_seq ne '-') { | |
340 | |
341 # codon_position is 1-based, while substr assumes the string starts at 0 | |
342 | |
343 my $pos = $self->transcript_variation->codon_position - 1; | |
344 | |
345 my $len = length $self->feature_seq; | |
346 | |
347 substr($display_codon, $pos, $len) = uc substr($display_codon, $pos, $len); | |
348 } | |
349 | |
350 $self->{_display_codon} = $display_codon; | |
351 } | |
352 } | |
353 | |
354 return $self->{_display_codon}; | |
355 } | |
356 | |
357 =head2 polyphen_prediction | |
358 | |
359 Description: Return the qualitative PolyPhen-2 prediction for the effect of this allele. | |
360 (Note that we currently only have PolyPhen predictions for variants that | |
361 result in single amino acid substitutions in human) | |
362 Returntype : string (one of 'probably damaging', 'possibly damaging', 'benign', 'unknown') | |
363 if this is a non-synonymous mutation and a prediction is available, undef | |
364 otherwise | |
365 Exceptions : none | |
366 Status : At Risk | |
367 | |
368 =cut | |
369 | |
370 sub polyphen_prediction { | |
371 my ($self, $classifier, $polyphen_prediction) = @_; | |
372 | |
373 $classifier ||= 'humvar'; | |
374 | |
375 my $analysis = "polyphen_${classifier}"; | |
376 | |
377 $self->{$analysis}->{prediction} = $polyphen_prediction if $polyphen_prediction; | |
378 | |
379 unless ($self->{$analysis}->{prediction}) { | |
380 my ($prediction, $score) = $self->_protein_function_prediction($analysis); | |
381 $self->{$analysis}->{score} = $score; | |
382 $self->{$analysis}->{prediction} = $prediction; | |
383 } | |
384 | |
385 return $self->{$analysis}->{prediction}; | |
386 } | |
387 | |
388 =head2 polyphen_score | |
389 | |
390 Description: Return the PolyPhen-2 probability that this allele is deleterious (Note that we | |
391 currently only have PolyPhen predictions for variants that result in single | |
392 amino acid substitutions in human) | |
393 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is | |
394 available, undef otherwise | |
395 Exceptions : none | |
396 Status : At Risk | |
397 | |
398 =cut | |
399 | |
400 sub polyphen_score { | |
401 my ($self, $classifier, $polyphen_score) = @_; | |
402 | |
403 $classifier ||= 'humvar'; | |
404 | |
405 my $analysis = "polyphen_${classifier}"; | |
406 | |
407 $self->{$analysis}->{score} = $polyphen_score if defined $polyphen_score; | |
408 | |
409 unless ($self->{$analysis}->{score}) { | |
410 my ($prediction, $score) = $self->_protein_function_prediction($analysis); | |
411 $self->{$analysis}->{score} = $score; | |
412 $self->{$analysis}->{prediction} = $prediction; | |
413 } | |
414 | |
415 return $self->{$analysis}->{score}; | |
416 } | |
417 | |
418 =head2 sift_prediction | |
419 | |
420 Description: Return the qualitative SIFT prediction for the effect of this allele. | |
421 (Note that we currently only have SIFT predictions for variants that | |
422 result in single amino acid substitutions in human) | |
423 Returntype : string (one of 'tolerated', 'deleterious') if this is a non-synonymous | |
424 mutation and a prediction is available, undef otherwise | |
425 Exceptions : none | |
426 Status : At Risk | |
427 | |
428 =cut | |
429 | |
430 sub sift_prediction { | |
431 my ($self, $sift_prediction) = @_; | |
432 | |
433 $self->{sift_prediction} = $sift_prediction if $sift_prediction; | |
434 | |
435 unless ($self->{sift_prediction}) { | |
436 my ($prediction, $score) = $self->_protein_function_prediction('sift'); | |
437 $self->{sift_score} = $score; | |
438 $self->{sift_prediction} = $prediction unless $self->{sift_prediction}; | |
439 } | |
440 | |
441 return $self->{sift_prediction}; | |
442 } | |
443 | |
444 =head2 sift_score | |
445 | |
446 Description: Return the SIFT score for this allele (Note that we currently only have SIFT | |
447 predictions for variants that result in single amino acid substitutions in human) | |
448 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is | |
449 available, undef otherwise | |
450 Exceptions : none | |
451 Status : At Risk | |
452 | |
453 =cut | |
454 | |
455 sub sift_score { | |
456 my ($self, $sift_score) = @_; | |
457 | |
458 $self->{sift_score} = $sift_score if defined $sift_score; | |
459 | |
460 unless ($self->{sift_score}) { | |
461 my ($prediction, $score) = $self->_protein_function_prediction('sift'); | |
462 $self->{sift_score} = $score; | |
463 $self->{sift_prediction} = $prediction; | |
464 } | |
465 | |
466 return $self->{sift_score}; | |
467 } | |
468 | |
469 | |
470 sub _protein_function_prediction { | |
471 my ($self, $analysis) = @_; | |
472 | |
473 # we can only get results for variants that cause a single amino acid substitution, | |
474 # so check the peptide allele string first | |
475 | |
476 if ($self->pep_allele_string && $self->pep_allele_string =~ /^[A-Z]\/[A-Z]$/ && defined $AA_LOOKUP->{$self->peptide}) { | |
477 | |
478 if (my $matrix = $self->transcript_variation->_protein_function_predictions($analysis)) { | |
479 | |
480 my ($prediction, $score) = $matrix->get_prediction( | |
481 $self->transcript_variation->translation_start, | |
482 $self->peptide, | |
483 ); | |
484 | |
485 return wantarray ? ($prediction, $score) : $prediction; | |
486 } | |
487 } | |
488 | |
489 return undef; | |
490 } | |
491 | |
492 =head2 hgvs_genomic | |
493 | |
494 Description: Return a string representing the genomic-level effect of this allele in HGVS format | |
495 Returntype : string | |
496 Exceptions : none | |
497 Status : At Risk | |
498 | |
499 =cut | |
500 | |
501 sub hgvs_genomic { | |
502 return _hgvs_generic(@_,'genomic'); | |
503 } | |
504 | |
505 =head2 hgvs_coding | |
506 | |
507 Description: Return a string representing the CDS-level effect of this allele in HGVS format | |
508 Returntype : string or undef if this allele is not in the CDS | |
509 Exceptions : none | |
510 Status : At Risk | |
511 | |
512 =cut | |
513 | |
514 sub hgvs_coding { | |
515 | |
516 deprecate('HGVS coding support has been moved to hgvs_transcript. This method will be removed in the next release.'); | |
517 return hgvs_transcript(@_); | |
518 } | |
519 | |
520 | |
521 =head2 hgvs_transcript | |
522 | |
523 Description: Return a string representing the CDS-level effect of this allele in HGVS format | |
524 Returntype : string or undef if this allele is not in the CDS | |
525 Exceptions : none | |
526 Status : At Risk | |
527 | |
528 =cut | |
529 | |
530 | |
531 sub hgvs_transcript { | |
532 | |
533 | |
534 my $self = shift; | |
535 my $notation = shift; | |
536 | |
537 ##### set if string supplied | |
538 $self->{hgvs_transcript} = $notation if defined $notation; | |
539 ##### return if held | |
540 return $self->{hgvs_transcript} if defined $self->{hgvs_transcript} ; | |
541 return $self->{hgvs_coding} if defined $self->{hgvs_coding} ; | |
542 | |
543 my $variation_feature_sequence = $self->variation_feature_seq() ; | |
544 | |
545 ### don't try to handle odd characters | |
546 return undef if $variation_feature_sequence =~ m/[^ACGT\-]/ig; | |
547 | |
548 ### no result for reference allele | |
549 return undef if $self->is_reference ==1; | |
550 | |
551 ### else evaluate | |
552 | |
553 ### get reference sequence strand | |
554 my $refseq_strand = $self->transcript_variation->transcript->strand(); | |
555 | |
556 if($DEBUG ==1){ | |
557 my $var_name = $self->transcript_variation->variation_feature->variation_name(); | |
558 print "\nHGVS transcript: Checking $var_name refseq strand => $refseq_strand seq name : " . $self->transcript_variation->transcript_stable_id() . " var strand " . $self->transcript_variation->variation_feature->strand() . " vf st " . $self->variation_feature->strand() ." seqname: " . $self->variation_feature->seqname() ." seq: " . $self->variation_feature_seq ."\n"; | |
559 } | |
560 | |
561 my $hgvs_notation ; ### store components of HGVS string in hash | |
562 | |
563 ### vf strand is relative to transcript or transcript slice | |
564 if( $self->transcript_variation->variation_feature->strand() <0 && $refseq_strand >0 || | |
565 $self->transcript_variation->variation_feature->strand() >0 && $refseq_strand < 0 | |
566 ){ | |
567 reverse_comp(\$variation_feature_sequence); | |
568 } | |
569 | |
570 ### need to get ref seq from slice transcript is on for intron labelling | |
571 my ($slice_start, $slice_end, $slice) = $self->_var2transcript_slice_coords(); | |
572 | |
573 | |
574 unless($slice_start){ | |
575 #warn "TVA: VF not within transcript - no HGVS\n"; | |
576 return undef; | |
577 } | |
578 | |
579 ### decide event type from HGVS nomenclature | |
580 $hgvs_notation = hgvs_variant_notation( $variation_feature_sequence, ### alt_allele, | |
581 $slice->seq(), ### using this to extract ref allele | |
582 $slice_start, | |
583 $slice_end | |
584 ); | |
585 | |
586 ### This should not happen | |
587 unless($hgvs_notation->{'type'}){ | |
588 #warn "Error - not continuing; no HGVS annotation\n"; | |
589 return undef; | |
590 } | |
591 | |
592 ### create reference name - transcript name & seq version | |
593 $hgvs_notation->{'ref_name'} = $self->transcript_variation->transcript_stable_id() . "." . $self->transcript_variation->transcript->version(); | |
594 | |
595 | |
596 ### get position relative to transcript features [use HGVS coords not variation feature coords due to dups] | |
597 $hgvs_notation->{start} = $self->_get_cDNA_position( $hgvs_notation->{start} ); | |
598 $hgvs_notation->{end} = $self->_get_cDNA_position( $hgvs_notation->{end} ); | |
599 | |
600 | |
601 # Make sure that start is always less than end | |
602 my ($exon_start_coord, $intron_start_offset) = $hgvs_notation->{start} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/; | |
603 my ($exon_end_coord, $intron_end_offset) = $hgvs_notation->{end} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/; | |
604 $intron_start_offset ||= 0; | |
605 $intron_end_offset ||= 0; | |
606 | |
607 ($hgvs_notation->{start},$hgvs_notation->{end}) = ($hgvs_notation->{end},$hgvs_notation->{start}) if | |
608 (($exon_start_coord + $intron_start_offset) > ($exon_end_coord + $intron_end_offset)); | |
609 | |
610 | |
611 | |
612 if($self->transcript->cdna_coding_start()){ | |
613 $hgvs_notation->{'numbering'} = "c"; ### set 'c' if transcript is coding | |
614 } | |
615 else{ | |
616 $hgvs_notation->{'numbering'} = "n"; ### set 'n' if transcript non-coding | |
617 } | |
618 ### generic formatting | |
619 $self->{hgvs_transcript} = format_hgvs_string( $hgvs_notation); | |
620 | |
621 if($DEBUG ==1){print "HGVS notation for var " . $self->transcript_variation->variation_feature->variation_name() . " from hgvs transcript : " . $self->{hgvs_transcript} . " \n";} | |
622 | |
623 return $self->{hgvs_transcript}; | |
624 | |
625 } | |
626 | |
627 | |
628 =head2 hgvs_protein | |
629 | |
630 Description: Return a string representing the protein-level effect of this allele in HGVS format | |
631 Returntype : string or undef if this allele is not in the CDS | |
632 Exceptions : none | |
633 Status : At Risk | |
634 | |
635 =cut | |
636 | |
637 sub hgvs_protein { | |
638 | |
639 my $self = shift; | |
640 my $notation = shift; | |
641 my $hgvs_notation; | |
642 | |
643 if($DEBUG ==1){print "\nStarting hgvs_protein with ". $self->transcript_variation->variation_feature->variation_name() . "\n"; } | |
644 | |
645 ### set if string supplied | |
646 $self->{hgvs_protein} = $notation if defined $notation; | |
647 | |
648 ### return if set | |
649 return $self->{hgvs_protein} if defined $self->{hgvs_protein} ; | |
650 | |
651 ### don't try to handle odd characters | |
652 return undef if $self->variation_feature_seq() =~ m/[^ACGT\-]/ig; | |
653 | |
654 #### else, check viable input and create notation | |
655 return if $self->is_reference(); | |
656 | |
657 unless ($self->transcript_variation->translation_start() && $self->transcript_variation->translation_end()){ | |
658 return undef ; | |
659 } | |
660 | |
661 #### for debug | |
662 #my $var_name = $self->transcript_variation->variation_feature->variation_name(); | |
663 | |
664 ### get reference sequence [add seq version to transcript name] | |
665 $hgvs_notation->{ref_name} = $self->transcript_variation->transcript->translation()->display_id() . "." . $self->transcript_variation->transcript->translation()->version(); | |
666 | |
667 $hgvs_notation->{'numbering'} = 'p'; | |
668 | |
669 ### get default reference location [changed later in some cases eg. duplication] | |
670 $hgvs_notation->{start} = $self->transcript_variation->translation_start(); | |
671 $hgvs_notation->{end} = $self->transcript_variation->translation_end(); | |
672 | |
673 | |
674 ## get default reference & alt peptides [changed later to hgvs format] | |
675 $hgvs_notation->{alt} = $self->peptide; | |
676 $hgvs_notation->{ref} = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide; | |
677 | |
678 if(defined $hgvs_notation->{alt} && defined $hgvs_notation->{ref}){ | |
679 $hgvs_notation = _clip_alleles( $hgvs_notation); | |
680 } | |
681 | |
682 #### define type - types are different for protein numbering | |
683 $hgvs_notation = $self->_get_hgvs_protein_type($hgvs_notation); | |
684 | |
685 ##### Convert ref & alt peptides taking into account HGVS rules | |
686 $hgvs_notation = $self->_get_hgvs_peptides($hgvs_notation); | |
687 | |
688 ##### String formatting | |
689 $self->{hgvs_protein} = $self->_get_hgvs_protein_format($hgvs_notation); | |
690 return $self->{hgvs_protein} ; | |
691 } | |
692 | |
693 ### HGVS: format protein string | |
694 sub _get_hgvs_protein_format{ | |
695 | |
696 my $self = shift; | |
697 my $hgvs_notation = shift; | |
698 | |
699 if ((defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} && | |
700 $hgvs_notation->{ref} eq $hgvs_notation->{alt}) || | |
701 (defined $hgvs_notation->{type} && $hgvs_notation->{type} eq "=")){ | |
702 | |
703 ### no protein change - return transcript nomenclature with flag for neutral protein consequence | |
704 $hgvs_notation->{'hgvs'} = $self->hgvs_transcript() . "(p.=)"; | |
705 return $hgvs_notation->{'hgvs'} ; | |
706 } | |
707 | |
708 ### all start with refseq name & numbering type | |
709 $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . "."; | |
710 | |
711 ### handle stop_lost seperately regardless of cause by del/delins => p.XposAA1extnum_AA_to_stop | |
712 if(stop_lost($self)){ ### if deletion of stop add extX and number of new aa to alt | |
713 $hgvs_notation->{alt} = substr($hgvs_notation->{alt}, 0, 3); | |
714 if($hgvs_notation->{type} eq "del"){ | |
715 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "del"); | |
716 if(defined $aa_til_stop){ | |
717 $hgvs_notation->{alt} .= "extX" . $aa_til_stop; | |
718 } | |
719 } | |
720 elsif($hgvs_notation->{type} eq ">"){ | |
721 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "subs"); | |
722 if(defined $aa_til_stop){ | |
723 $hgvs_notation->{alt} .= "extX" . $aa_til_stop; | |
724 } | |
725 } | |
726 else{ | |
727 # warn "TVA: stop loss for type $hgvs_notation->{type} not caught \n"; | |
728 } | |
729 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; | |
730 } | |
731 | |
732 elsif( $hgvs_notation->{type} eq "dup"){ | |
733 | |
734 if($hgvs_notation->{start} < $hgvs_notation->{end}){ | |
735 ### list only first and last peptides in long duplicated string | |
736 my $ref_pep_first = substr($hgvs_notation->{alt}, 0, 3); | |
737 my $ref_pep_last = substr($hgvs_notation->{alt}, -3, 3); | |
738 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} ."dup"; | |
739 | |
740 } | |
741 else{ | |
742 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{alt} . $hgvs_notation->{start} . "dup" ; | |
743 } | |
744 | |
745 print "formating a dup $hgvs_notation->{hgvs} \n" if $DEBUG==1; | |
746 } | |
747 | |
748 elsif($hgvs_notation->{type} eq ">"){ | |
749 #### substitution | |
750 | |
751 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt}; | |
752 } | |
753 | |
754 elsif( $hgvs_notation->{type} eq "delins" || $hgvs_notation->{type} eq "ins" ){ | |
755 | |
756 #### list first and last aa in reference only | |
757 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3); | |
758 my $ref_pep_last; | |
759 if(substr($hgvs_notation->{ref}, -1, 1) eq "X"){ | |
760 $ref_pep_last ="X"; | |
761 } | |
762 else{ | |
763 $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3); | |
764 } | |
765 if($hgvs_notation->{ref} =~ /X$/){ | |
766 ### For stops & add extX & distance to next stop to alt pep | |
767 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "loss"); | |
768 if(defined $aa_til_stop){ | |
769 $hgvs_notation->{alt} .="extX" . $aa_til_stop; | |
770 } | |
771 } | |
772 if($hgvs_notation->{start} == $hgvs_notation->{end} && $hgvs_notation->{type} eq "delins"){ | |
773 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ; | |
774 } | |
775 else{ | |
776 ### correct ordering if needed | |
777 if($hgvs_notation->{start} > $hgvs_notation->{end}){ | |
778 ( $hgvs_notation->{start}, $hgvs_notation->{end}) = ($hgvs_notation->{end}, $hgvs_notation->{start} ); | |
779 } | |
780 | |
781 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ; | |
782 } | |
783 } | |
784 | |
785 elsif($hgvs_notation->{type} eq "fs"){ | |
786 if(defined $hgvs_notation->{alt} && $hgvs_notation->{alt} eq "X"){ ## stop gained | |
787 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; | |
788 | |
789 } | |
790 else{ ## not immediate stop - count aa until next | |
791 | |
792 my $aa_til_stop = _stop_loss_extra_AA($self, $hgvs_notation->{start}-1, "fs"); | |
793 if($aa_til_stop ){ | |
794 ### use long form if new stop found | |
795 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} . "fsX$aa_til_stop" ; | |
796 } | |
797 else{ | |
798 ### otherwise use short form | |
799 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "fs"; | |
800 } | |
801 } | |
802 } | |
803 | |
804 elsif( $hgvs_notation->{type} eq "del"){ | |
805 $hgvs_notation->{alt} = "del"; | |
806 if( length($hgvs_notation->{ref}) >3 ){ | |
807 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3); | |
808 my $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3); | |
809 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} .$hgvs_notation->{alt} ; | |
810 } | |
811 else{ | |
812 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; | |
813 } | |
814 } | |
815 | |
816 elsif($hgvs_notation->{start} ne $hgvs_notation->{end} ){ | |
817 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "_" . $hgvs_notation->{alt} . $hgvs_notation->{end} ; | |
818 } | |
819 | |
820 else{ | |
821 #### default to substitution | |
822 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt}; | |
823 } | |
824 | |
825 if($DEBUG==1){ print "Returning protein format: $hgvs_notation->{'hgvs'}\n";} | |
826 return $hgvs_notation->{'hgvs'}; | |
827 } | |
828 | |
829 ### HGVS: get type of variation event in protein terms | |
830 sub _get_hgvs_protein_type{ | |
831 | |
832 my $self = shift; | |
833 my $hgvs_notation = shift; | |
834 | |
835 ### get allele length | |
836 my ($ref_length, $alt_length ) = $self->_get_allele_length(); | |
837 | |
838 | |
839 if( defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} ){ | |
840 ### Run type checks on peptides if available | |
841 | |
842 if($hgvs_notation->{alt} eq $hgvs_notation->{ref} ){ | |
843 #### synonymous indicated by = | |
844 $hgvs_notation->{type} = "="; | |
845 } | |
846 elsif( length($hgvs_notation->{ref}) ==1 && length($hgvs_notation->{alt}) ==1 ) { | |
847 | |
848 $hgvs_notation->{type} = ">"; | |
849 } | |
850 elsif($hgvs_notation->{ref} eq "-" || $hgvs_notation->{ref} eq "") { | |
851 | |
852 $hgvs_notation->{type} = "ins"; | |
853 } | |
854 elsif($hgvs_notation->{alt} eq "" ) { | |
855 | |
856 $hgvs_notation->{type} = "del"; | |
857 } | |
858 elsif( (length($hgvs_notation->{alt}) > length($hgvs_notation->{ref}) ) && | |
859 $hgvs_notation->{alt} =~ / $hgvs_notation->{ref}/ ) { | |
860 ### capture duplication event described as TTT/TTTTT | |
861 $hgvs_notation->{type} = "dup"; | |
862 } | |
863 | |
864 elsif( (length($hgvs_notation->{alt}) >1 && length($hgvs_notation->{ref}) ==1) || | |
865 (length($hgvs_notation->{alt}) ==1 && length($hgvs_notation->{ref}) >1) ) { | |
866 $hgvs_notation->{type} = "delins"; | |
867 } | |
868 else{ | |
869 $hgvs_notation->{type} = ">"; | |
870 } | |
871 } | |
872 | |
873 | |
874 elsif($ref_length ne $alt_length && ($ref_length - $alt_length)%3 !=0 ){ | |
875 $hgvs_notation->{type} = "fs"; | |
876 } | |
877 | |
878 elsif(length($self->variation_feature_seq()) >1 ){ | |
879 if($hgvs_notation->{start} == ($hgvs_notation->{end} + 1) ){ | |
880 ### convention for insertions - end one less than start | |
881 $hgvs_notation->{type} = "ins"; | |
882 } | |
883 elsif( $hgvs_notation->{start} != $hgvs_notation->{end} ){ | |
884 $hgvs_notation->{type} = "delins"; | |
885 } | |
886 else{ | |
887 $hgvs_notation->{type} = ">"; | |
888 } | |
889 } | |
890 else{ | |
891 #print STDERR "DEBUG ".$self->variation_feature->start."\n"; | |
892 #warn "Cannot define protein variant type [$ref_length - $alt_length]\n"; | |
893 } | |
894 return $hgvs_notation ; | |
895 | |
896 } | |
897 | |
898 ### HGVS: get reference & alternative peptide | |
899 sub _get_hgvs_peptides{ | |
900 | |
901 my $self = shift; | |
902 my $hgvs_notation = shift; | |
903 | |
904 if($hgvs_notation->{type} eq "fs"){ | |
905 ### ensembl alt/ref peptides not the same as HGVS alt/ref - look up seperately | |
906 $hgvs_notation = $self->_get_fs_peptides($hgvs_notation); | |
907 } | |
908 elsif($hgvs_notation->{type} eq "ins" ){ | |
909 | |
910 ### HGVS ref are peptides flanking insertion | |
911 $hgvs_notation->{ref} = $self->_get_surrounding_peptides($hgvs_notation->{start}); | |
912 | |
913 if( $hgvs_notation->{alt} =~/\*/){ | |
914 ## inserted bases after stop irrelevant; consider as substitution gaining stop MAINTAIN PREVIOUS BEHAVIOUR FOR NOW | |
915 #$hgvs_notation->{alt} =~ s/\*\w+$/\*/; | |
916 } | |
917 else{ | |
918 ### Additional check that inserted bases do not duplicate 3' reference sequence [set to type = dup if so] | |
919 $hgvs_notation = $self->_check_for_peptide_duplication($hgvs_notation); | |
920 } | |
921 } | |
922 | |
923 | |
924 ### Convert peptide to 3 letter code as used in HGVS | |
925 $hgvs_notation->{ref} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{ref}, -id => 'ref', -alphabet => 'protein')) || ""; | |
926 if( $hgvs_notation->{alt} eq "-"){ | |
927 $hgvs_notation->{alt} = "del"; | |
928 } | |
929 else{ | |
930 $hgvs_notation->{alt} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref', -alphabet => 'protein')) || ""; | |
931 } | |
932 | |
933 ### handle special cases | |
934 if( affects_start_codon($self) ){ | |
935 #### handle initiator loss -> probably no translation => alt allele is '?' | |
936 $hgvs_notation->{alt} = "?"; | |
937 $hgvs_notation->{type} = ""; | |
938 } | |
939 | |
940 elsif( $hgvs_notation->{type} eq "del"){ | |
941 #### partial last codon | |
942 $hgvs_notation->{alt} = "del"; | |
943 } | |
944 elsif($hgvs_notation->{type} eq "fs"){ | |
945 ### only quote first ref peptide for frameshift | |
946 $hgvs_notation->{ref} = substr($hgvs_notation->{ref},0,3); | |
947 } | |
948 | |
949 ### set stop to HGVS prefered "X" | |
950 if(defined $hgvs_notation->{ref}){ $hgvs_notation->{ref} =~ s/Ter|Xaa/X/g; } | |
951 if(defined $hgvs_notation->{alt}){ $hgvs_notation->{alt} =~ s/Ter|Xaa/X/g; } | |
952 | |
953 return ($hgvs_notation); | |
954 | |
955 } | |
956 | |
957 ### HGVS: remove common peptides from alt and ref strings & shift start/end accordingly | |
958 sub _clip_alleles{ | |
959 | |
960 my $hgvs_notation = shift; | |
961 | |
962 my $check_alt = $hgvs_notation->{alt} ; | |
963 my $check_ref = $hgvs_notation->{ref} ; | |
964 my $check_start = $hgvs_notation->{start}; | |
965 | |
966 ### strip same bases from start of string | |
967 for (my $p =0; $p <length ($hgvs_notation->{ref}); $p++){ | |
968 my $check_next_ref = substr( $check_ref, 0, 1); | |
969 my $check_next_alt = substr( $check_alt, 0, 1); | |
970 | |
971 if($check_next_ref eq "*" && $check_next_alt eq "*"){ | |
972 ### stop re-created by variant - no protein change | |
973 $hgvs_notation->{type} = "="; | |
974 | |
975 return($hgvs_notation); | |
976 } | |
977 | |
978 if($check_next_ref eq $check_next_alt){ | |
979 $check_start++; | |
980 $check_ref = substr( $check_ref, 1); | |
981 $check_alt = substr( $check_alt, 1); | |
982 } | |
983 else{ | |
984 last; | |
985 } | |
986 } | |
987 #### strip same bases from end of string | |
988 for (my $q =0; $q < length ($check_ref); $q++){ | |
989 my $check_next_ref = substr( $check_ref, -1, 1); | |
990 my $check_next_alt = substr( $check_alt, -1, 1); | |
991 if($check_next_ref eq $check_next_alt){ | |
992 chop $check_ref; | |
993 chop $check_alt; | |
994 } | |
995 else{ | |
996 last; | |
997 } | |
998 } | |
999 | |
1000 $hgvs_notation->{alt} = $check_alt; | |
1001 $hgvs_notation->{ref} = $check_ref; | |
1002 | |
1003 ### check if clipping force type change & adjust location | |
1004 if(length ($check_ref) == 0 && length ($check_alt) >= 1){ | |
1005 ### re-set as ins not delins | |
1006 $hgvs_notation->{type} ="ins"; | |
1007 ### insertion between ref base and next => adjust next | |
1008 if($hgvs_notation->{end} == $hgvs_notation->{start} ){$hgvs_notation->{end} = $check_start; } | |
1009 # $hgvs_notation->{start} = $check_start; | |
1010 } | |
1011 elsif(length ($check_ref) >=1 && length ($check_alt) == 0){ | |
1012 ### re-set as del not delins | |
1013 $hgvs_notation->{type} ="del"; | |
1014 $hgvs_notation->{start} = $check_start; | |
1015 } | |
1016 else{ | |
1017 #### save trimmed peptide string & increased start position | |
1018 $hgvs_notation->{start} = $check_start; | |
1019 | |
1020 } | |
1021 | |
1022 return $hgvs_notation; | |
1023 } | |
1024 | |
1025 | |
1026 | |
1027 | |
1028 | |
1029 #### HGVS: check allele lengths to look for frameshifts | |
1030 sub _get_allele_length{ | |
1031 | |
1032 my $self = shift; | |
1033 my $ref_length = 0; | |
1034 my $alt_length = 0; | |
1035 | |
1036 my $al_string = $self->allele_string(); | |
1037 my $ref_allele = (split/\//, $al_string)[0]; | |
1038 $ref_allele =~ s/\-//; | |
1039 $ref_length = length $ref_allele; | |
1040 | |
1041 my $alt_allele = $self->variation_feature_seq(); | |
1042 $alt_allele =~ s/\-//; | |
1043 $alt_length = length $alt_allele; | |
1044 | |
1045 return ($ref_length, $alt_length ); | |
1046 | |
1047 } | |
1048 | |
1049 ### HGVS: list first different peptide [may not be first changed codon] | |
1050 sub _get_fs_peptides{ | |
1051 | |
1052 my $self = shift; | |
1053 my $hgvs_notation = shift; | |
1054 | |
1055 ### get CDS with alt variant | |
1056 my $alt_cds = $self->_get_alternate_cds(); | |
1057 return undef unless defined($alt_cds); | |
1058 | |
1059 #### get new translation | |
1060 my $alt_trans = $alt_cds->translate()->seq(); | |
1061 | |
1062 ### get changed end (currently in single letter AA coding) | |
1063 my $ref_trans = $self->transcript->translate()->seq(); | |
1064 $ref_trans .= "*"; ## appending ref stop for checking purposes | |
1065 | |
1066 $hgvs_notation->{start} = $self->transcript_variation->translation_start() ; | |
1067 | |
1068 if( $hgvs_notation->{start} > length $alt_trans){ ## deletion of stop, no further AA in alt seq | |
1069 $hgvs_notation->{alt} = "del"; | |
1070 $hgvs_notation->{type} = "del"; | |
1071 return $hgvs_notation; | |
1072 } | |
1073 | |
1074 while ($hgvs_notation->{start} <= length $alt_trans){ | |
1075 ### frame shift may result in the same AA# | |
1076 | |
1077 $hgvs_notation->{ref} = substr($ref_trans, $hgvs_notation->{start}-1, 1); | |
1078 $hgvs_notation->{alt} = substr($alt_trans, $hgvs_notation->{start}-1, 1); | |
1079 | |
1080 if($hgvs_notation->{ref} eq "*" && $hgvs_notation->{alt} eq "*"){ | |
1081 ### variation at stop codon, but maintains stop codon | |
1082 return ($hgvs_notation); | |
1083 } | |
1084 last if $hgvs_notation->{ref} ne $hgvs_notation->{alt}; | |
1085 $hgvs_notation->{start}++; | |
1086 } | |
1087 return ($hgvs_notation); | |
1088 | |
1089 } | |
1090 | |
1091 #### HGVS: if variant is an insertion, ref pep is initially "-", so seek AA before and after insertion | |
1092 sub _get_surrounding_peptides{ | |
1093 | |
1094 my $self = shift; | |
1095 my $ref_pos = shift; | |
1096 my $ref_trans = $self->transcript->translate()->seq(); | |
1097 | |
1098 my $end = substr($ref_trans, $ref_pos-1); | |
1099 my $ref_string = substr($ref_trans, $ref_pos-1, 2); | |
1100 | |
1101 return ($ref_string); | |
1102 | |
1103 } | |
1104 | |
1105 | |
1106 #### HGVS: alternate CDS needed to check for new stop when variant disrupts 'reference' stop | |
1107 sub _get_alternate_cds{ | |
1108 | |
1109 my $self = shift; | |
1110 | |
1111 ### get reference sequence | |
1112 my $reference_cds_seq = $self->transcript->translateable_seq(); | |
1113 | |
1114 return undef unless defined($self->transcript_variation->cds_start) && defined($self->transcript_variation->cds_end()); | |
1115 | |
1116 ### get sequences upstream and downstream of variant | |
1117 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) ); | |
1118 my $downstream_seq = substr($reference_cds_seq, ($self->transcript_variation->cds_end() ) ); | |
1119 | |
1120 ### fix alternate allele if deletion or on opposite strand | |
1121 my $alt_allele = $self->variation_feature_seq(); | |
1122 $alt_allele =~ s/\-//; | |
1123 if( $self->transcript_variation->variation_feature->strand() <0 && $self->transcript_variation->transcript->strand() >0 || | |
1124 $self->transcript_variation->variation_feature->strand() >0 && $self->transcript_variation->transcript->strand() < 0 | |
1125 ){ | |
1126 reverse_comp(\$alt_allele) ; | |
1127 } | |
1128 ### build alternate seq | |
1129 my $alternate_seq = $upstream_seq . $alt_allele . $downstream_seq ; | |
1130 | |
1131 ### create seq obj with alternative allele in the CDS sequence | |
1132 my $alt_cds =Bio::PrimarySeq->new(-seq => $alternate_seq, -id => 'alt_cds', -alphabet => 'dna'); | |
1133 | |
1134 ### append UTR if available as stop may be disrupted | |
1135 my $utr = $self->transcript_variation->transcript->three_prime_utr(); | |
1136 | |
1137 if (defined $utr) { | |
1138 ### append the UTR to the alternative CDS | |
1139 $alt_cds->seq($alt_cds->seq() . $utr->seq()); | |
1140 } | |
1141 else{ | |
1142 ##warn "No UTR available for alternate CDS\n"; | |
1143 } | |
1144 | |
1145 return $alt_cds; | |
1146 } | |
1147 | |
1148 ### HGVS: if inserted string is identical to 3' reference sequence, describe as duplication | |
1149 sub _check_for_peptide_duplication{ | |
1150 | |
1151 my $self = shift; | |
1152 my $hgvs_notation= shift; | |
1153 | |
1154 ##### get reference sequence | |
1155 my $reference_cds_seq = $self->transcript->translateable_seq(); | |
1156 | |
1157 ##### get sequence upstream of variant | |
1158 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) ); | |
1159 | |
1160 ##### create translation to check against inserted peptides | |
1161 my $upstream_cds =Bio::PrimarySeq->new(-seq => $upstream_seq, -id => 'alt_cds', -alphabet => 'dna'); | |
1162 my $upstream_trans = $upstream_cds->translate()->seq(); | |
1163 | |
1164 ## Test whether alt peptide matches the reference sequence just before the variant | |
1165 my $test_new_start = $hgvs_notation->{'start'} - length($hgvs_notation->{'alt'}) -1 ; | |
1166 my $test_seq = substr($upstream_trans, $test_new_start, length($hgvs_notation->{'alt'})); | |
1167 | |
1168 if ( $test_new_start >= 0 && $test_seq eq $hgvs_notation->{alt}) { | |
1169 | |
1170 $hgvs_notation->{type} = 'dup'; | |
1171 $hgvs_notation->{end} = $hgvs_notation->{start} -1; | |
1172 $hgvs_notation->{start} -= length($hgvs_notation->{alt}); | |
1173 | |
1174 } | |
1175 | |
1176 return $hgvs_notation; | |
1177 | |
1178 } | |
1179 | |
1180 #### HGVS: if a stop is lost, seek the next in transcript & count number of extra AA's | |
1181 sub _stop_loss_extra_AA{ | |
1182 | |
1183 my $self = shift; | |
1184 my $ref_var_pos = shift; ### first effected AA - supply for frameshifts | |
1185 my $test = shift; | |
1186 | |
1187 return undef unless $ref_var_pos; | |
1188 | |
1189 my $extra_aa; | |
1190 | |
1191 ### get the sequence with variant added | |
1192 my $alt_cds = $self->_get_alternate_cds(); | |
1193 return undef unless defined($alt_cds); | |
1194 | |
1195 ### get new translation | |
1196 my $alt_trans = $alt_cds->translate(); | |
1197 | |
1198 my $ref_temp = $self->transcript->translate()->seq; | |
1199 my $ref_len = length($ref_temp); | |
1200 | |
1201 if($DEBUG==1){ | |
1202 print "alt translated:\n" . $alt_trans->seq() . "\n"; | |
1203 print "ref translated:\n$ref_temp\n";; | |
1204 } | |
1205 | |
1206 #### Find the number of residues that are translated until a termination codon is encountered | |
1207 if ($alt_trans->seq() =~ m/\*/) { | |
1208 if($DEBUG==1){print "Got $+[0] aa before stop, var event at $ref_var_pos \n";} | |
1209 | |
1210 if($test eq "fs" ){ | |
1211 ### frame shift - count from first AA effected by variant to stop | |
1212 $extra_aa = $+[0] - $ref_var_pos; | |
1213 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before fs stop [ $+[0] - peptide ref_start: $ref_var_pos )]\n";} | |
1214 } | |
1215 | |
1216 else{ | |
1217 $extra_aa = $+[0] - 1 - $ref_len; | |
1218 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before next stop [ $+[0] - 1 -normal stop $ref_len)]\n";} | |
1219 } | |
1220 } | |
1221 | |
1222 # A special case is if the first aa is a stop codon => don't display the number of residues until the stop codon | |
1223 if(defined $extra_aa && $extra_aa >0){ | |
1224 return $extra_aa ; | |
1225 } | |
1226 else{ | |
1227 #warn "No stop found in alternate sequence\n"; | |
1228 return undef; | |
1229 } | |
1230 | |
1231 } | |
1232 | |
1233 =head | |
1234 # We haven't implemented support for these methods yet | |
1235 | |
1236 sub hgvs_rna { | |
1237 return _hgvs_generic(@_,'rna'); | |
1238 } | |
1239 | |
1240 sub hgvs_mitochondrial { | |
1241 return _hgvs_generic(@_,'mitochondrial'); | |
1242 } | |
1243 | |
1244 =cut | |
1245 | |
1246 sub _hgvs_generic { | |
1247 | |
1248 my $self = shift; | |
1249 my $reference = pop; | |
1250 my $notation = shift; | |
1251 | |
1252 #ĘThe rna and mitochondrial modes have not yet been implemented, so return undef in case we get a call to these | |
1253 return undef if ($reference =~ m/rna|mitochondrial/); | |
1254 | |
1255 my $sub = qq{hgvs_$reference}; | |
1256 | |
1257 $self->{$sub} = $notation if defined $notation; | |
1258 | |
1259 unless ($self->{$sub}) { | |
1260 | |
1261 # Use the transcript this VF is on as the reference feature | |
1262 my $reference_feature = $self->transcript; | |
1263 # If we want genomic coordinates, the reference_feature should actually be the slice for the underlying seq_region | |
1264 $reference_feature = $reference_feature->slice->seq_region_Slice if ($reference eq 'genomic'); | |
1265 | |
1266 # Calculate the HGVS notation on-the-fly and pass it to the TranscriptVariation in order to distribute the result to the other alleles | |
1267 $self->transcript_variation->$sub($self->variation_feature->get_all_hgvs_notations($reference_feature,substr($reference,0,1),undef,undef,$self->transcript_variation)); | |
1268 } | |
1269 | |
1270 return $self->{$sub}; | |
1271 } | |
1272 | |
1273 | |
1274 ### HGVS: move variant to transcript slice | |
1275 sub _var2transcript_slice_coords{ | |
1276 | |
1277 my $self = shift; | |
1278 | |
1279 my $ref_slice = $self->transcript->feature_Slice(); #### returns with strand same as feature | |
1280 my $tr_vf = $self->variation_feature->transfer($ref_slice); | |
1281 | |
1282 # Return undef if this VariationFeature does not fall within the supplied feature. | |
1283 return undef if (!defined $tr_vf || | |
1284 $tr_vf->start < 1 || | |
1285 $tr_vf->end < 1 || | |
1286 $tr_vf->start > ($self->transcript->end - $self->transcript->start + 1) || | |
1287 $tr_vf->end > ($self->transcript->end - $self->transcript->start + 1)); | |
1288 | |
1289 return( $tr_vf->start() , $tr_vf->end(), $ref_slice); | |
1290 } | |
1291 | |
1292 | |
1293 | |
1294 ### HGVS: get variant position in transcript | |
1295 | |
1296 # intron: | |
1297 # If the position is in an intron, the boundary position of the closest exon and | |
1298 # a + or - offset into the intron is returned. | |
1299 # Ordered by genome forward not 5' -> 3' | |
1300 | |
1301 # upstream: | |
1302 # If the position is 5' of the start codon, it is reported relative to the start codon | |
1303 # (-1 being the last nucleotide before the 'A' of ATG). | |
1304 | |
1305 #downstream: | |
1306 # If the position is 3' pf the stop codon, it is reported with a '*' prefix and the offset | |
1307 # from the start codon (*1 being the first nucleotide after the last position of the stop codon) | |
1308 | |
1309 sub _get_cDNA_position { | |
1310 | |
1311 my $self = shift; | |
1312 my $position = shift; ### start or end of variation | |
1313 | |
1314 my $transcript = $self->transcript(); | |
1315 my $strand = $transcript->strand(); | |
1316 | |
1317 #### TranscriptVariation start/stop coord relative to transcript | |
1318 #### Switch to chromosome coordinates taking into account strand | |
1319 $position = ( $strand > 0 ? | |
1320 ( $self->transcript->start() + $position - 1 ) : | |
1321 ( $self->transcript->end() - $position + 1)); | |
1322 | |
1323 | |
1324 | |
1325 # Get all exons and sort them in positional order | |
1326 my @exons = sort {$a->start() <=> $b->start()} @{$transcript->get_all_Exons()}; | |
1327 my $n_exons = scalar(@exons); | |
1328 | |
1329 my $cdna_position; | |
1330 # Loop over the exons and get the coordinates of the variation in exon+intron notation | |
1331 for (my $i=0; $i<$n_exons; $i++) { | |
1332 | |
1333 # Skip if the start point is beyond this exon | |
1334 next if ($position > $exons[$i]->end()); | |
1335 | |
1336 | |
1337 # EXONIC: If the start coordinate is within this exon | |
1338 if ($position >= $exons[$i]->start()) { | |
1339 # Get the cDNA start coordinate of the exon and add the number of nucleotides from the exon boundary to the variation | |
1340 # If the transcript is in the opposite direction, count from the end instead | |
1341 $cdna_position = $exons[$i]->cdna_start($transcript) + ( $strand > 0 ? | |
1342 ( $position - $exons[$i]->start ) : | |
1343 ( $exons[$i]->end() - $position ) | |
1344 ); | |
1345 last; #### last exon checked | |
1346 } | |
1347 ## INTRONIC | |
1348 # Else the start coordinate is between this exon and the previous one, determine which one is closest and get coordinates relative to that one | |
1349 else { | |
1350 | |
1351 my $updist = ($position - $exons[$i-1]->end()); | |
1352 my $downdist = ($exons[$i]->start() - $position); | |
1353 | |
1354 # If the distance to the upstream exon is the shortest, or equal and in the positive orientation, use that | |
1355 if ($updist < $downdist || ($updist == $downdist && $strand >= 0)) { | |
1356 | |
1357 # If the orientation is reversed, we should use the cDNA start and a '-' offset | |
1358 $cdna_position = ($strand >= 0 ? | |
1359 $exons[$i-1]->cdna_end($transcript) . '+' : | |
1360 $exons[$i-1]->cdna_start($transcript) . '-') | |
1361 . $updist; | |
1362 } | |
1363 # Else if downstream is shortest... | |
1364 else { | |
1365 # If the orientation is reversed, we should use the cDNA end and a '+' offset | |
1366 $cdna_position = ($strand >= 0 ? | |
1367 $exons[$i]->cdna_start($transcript) . '-' : | |
1368 $exons[$i]->cdna_end($transcript) . '+') | |
1369 . $downdist; | |
1370 } | |
1371 last; ## last exon checked | |
1372 } | |
1373 } | |
1374 | |
1375 # Shift the position to make it relative to the start & stop codons | |
1376 my $start_codon = $transcript->cdna_coding_start(); | |
1377 my $stop_codon = $transcript->cdna_coding_end(); | |
1378 | |
1379 # Disassemble the cDNA coordinate into the exon and intron parts | |
1380 ### just built this now taking it appart again | |
1381 my ($cdna_coord, $intron_offset) = $cdna_position =~ m/([0-9]+)([\+\-][0-9]+)?/; | |
1382 | |
1383 | |
1384 # Start by correcting for the stop codon | |
1385 if (defined($stop_codon) && $cdna_coord > $stop_codon) { | |
1386 # Get the offset from the stop codon | |
1387 $cdna_coord -= $stop_codon; | |
1388 # Prepend a * to indicate the position is in the 3' UTR | |
1389 $cdna_coord = '*' . $cdna_coord; | |
1390 } | |
1391 elsif (defined($start_codon)) { | |
1392 # If the position is beyond the start codon, add 1 to get the correct offset | |
1393 $cdna_coord += ($cdna_coord >= $start_codon); | |
1394 # Subtract the position of the start codon | |
1395 $cdna_coord -= $start_codon; | |
1396 } | |
1397 | |
1398 # Re-assemble the cDNA position [ return exon num & offset & direction for intron eg. 142+363] | |
1399 $cdna_position = $cdna_coord . (defined($intron_offset) ? $intron_offset : ''); | |
1400 return $cdna_position; | |
1401 } | |
1402 | |
1403 | |
1404 1; | |
1405 | |
1406 |