comparison variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Variation::TranscriptVariationAllele
24
25 =head1 SYNOPSIS
26
27 use Bio::EnsEMBL::Variation::TranscriptVariationAllele;
28
29 my $tva = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new(
30 -transcript_variation => $tv,
31 -variation_feature_seq => 'A',
32 -is_reference => 0,
33 );
34
35 print "sequence with respect to the transcript: ", $tva->feature_seq, "\n";
36 print "sequence with respect to the variation feature: ", $tva->variation_feature_seq, "\n";
37 print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $tva->get_all_OverlapConsequences }), "\n";
38 print "amino acid change: ", $tva->peptide_allele_string, "\n";
39 print "resulting codon: ", $tva->codon, "\n";
40 print "reference codon: ", $tva->transcript_variation->get_reference_TranscriptVariationAllele->codon, "\n";
41 print "PolyPhen prediction: ", $tva->polyphen_prediction, "\n";
42 print "SIFT prediction: ", $tva->sift_prediction, "\n";
43
44 =head1 DESCRIPTION
45
46 A TranscriptVariationAllele object represents a single allele of a TranscriptVariation.
47 It provides methods that are specific to the sequence of the allele, such as codon,
48 peptide etc. Methods that depend only on position (e.g. CDS start) will be found in
49 the associated TranscriptVariation. Ordinarily you will not create these objects
50 yourself, but instead you would create a TranscriptVariation object which will then
51 construct TranscriptVariationAlleles based on the allele string of the associated
52 VariationFeature.
53
54 Note that any methods that are not specific to Transcripts will be found in the
55 VariationFeatureOverlapAllele superclass.
56
57 =cut
58
59 package Bio::EnsEMBL::Variation::TranscriptVariationAllele;
60
61 use strict;
62 use warnings;
63
64 use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw($AA_LOOKUP);
65 use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
66 use Bio::EnsEMBL::Variation::Utils::Sequence qw(hgvs_variant_notation format_hgvs_string);
67 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
68 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(within_cds within_intron stop_lost affects_start_codon);
69
70 use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele);
71
72
73 our $DEBUG = 0;
74
75 sub new_fast {
76 my ($class, $hashref) = @_;
77
78 # swap a transcript_variation argument for a variation_feature_overlap one
79
80 if ($hashref->{transcript_variation}) {
81 $hashref->{variation_feature_overlap} = delete $hashref->{transcript_variation};
82 }
83
84 # and call the superclass
85
86 return $class->SUPER::new_fast($hashref);
87 }
88
89 =head2 transcript_variation
90
91 Description: Get/set the associated TranscriptVariation
92 Returntype : Bio::EnsEMBL::Variation::TranscriptVariation
93 Exceptions : throws if the argument is the wrong type
94 Status : At Risk
95
96 =cut
97
98 sub transcript_variation {
99 my ($self, $tv) = @_;
100 assert_ref($tv, 'Bio::EnsEMBL::Variation::TranscriptVariation') if $tv;
101 return $self->variation_feature_overlap($tv);
102 }
103
104 =head2 variation_feature
105
106 Description: Get the associated VariationFeature
107 Returntype : Bio::EnsEMBL::Variation::VariationFeature
108 Exceptions : none
109 Status : At Risk
110
111 =cut
112
113 sub variation_feature {
114 my $self = shift;
115 return $self->transcript_variation->variation_feature;
116 }
117
118 =head2 pep_allele_string
119
120 Description: Return a '/' delimited string of the reference peptide and the
121 peptide resulting from this allele, or a single peptide if this
122 allele does not change the peptide (e.g. because it is synonymous)
123 Returntype : string or undef if this allele is not in the CDS
124 Exceptions : none
125 Status : At Risk
126
127 =cut
128
129 sub pep_allele_string {
130 my ($self) = @_;
131
132 my $pep = $self->peptide;
133
134 return undef unless $pep;
135
136 my $ref_pep = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide;
137
138 return undef unless $ref_pep;
139
140 return $ref_pep ne $pep ? $ref_pep.'/'.$pep : $pep;
141 }
142
143 =head2 codon_allele_string
144
145 Description: Return a '/' delimited string of the reference codon and the
146 codon resulting from this allele
147 Returntype : string or undef if this allele is not in the CDS
148 Exceptions : none
149 Status : At Risk
150
151 =cut
152
153 sub codon_allele_string {
154 my ($self) = @_;
155
156 my $codon = $self->codon;
157
158 return undef unless $codon;
159
160 my $ref_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->codon;
161
162 return $ref_codon.'/'.$codon;
163 }
164
165 =head2 display_codon_allele_string
166
167 Description: Return a '/' delimited string of the reference display_codon and the
168 display_codon resulting from this allele. The display_codon identifies
169 the nucleotides affected by this variant in UPPER CASE and other
170 nucleotides in lower case
171 Returntype : string or undef if this allele is not in the CDS
172 Exceptions : none
173 Status : At Risk
174
175 =cut
176
177 sub display_codon_allele_string {
178 my ($self) = @_;
179
180 my $display_codon = $self->display_codon;
181
182 return undef unless $display_codon;
183
184 my $ref_display_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->display_codon;
185
186 return undef unless $ref_display_codon;
187
188 return $ref_display_codon.'/'.$display_codon;
189 }
190
191 =head2 peptide
192
193 Description: Return the amino acid sequence that this allele is predicted to result in
194 Returntype : string or undef if this allele is not in the CDS or is a frameshift
195 Exceptions : none
196 Status : At Risk
197
198 =cut
199
200 sub peptide {
201 my ($self, $peptide) = @_;
202
203 $self->{peptide} = $peptide if $peptide;
204
205 unless ($self->{peptide}) {
206
207 return undef unless $self->seq_is_unambiguous_dna;
208
209 if (my $codon = $self->codon) {
210
211 # the codon method can set the peptide in some circumstances
212 # so check here before we try an (expensive) translation
213 return $self->{peptide} if $self->{peptide};
214
215 # translate the codon sequence to establish the peptide allele
216
217 # allow for partial codons - split the sequence into whole and partial
218 # e.g. AAAGG split into AAA and GG
219 my $whole_codon = substr($codon, 0, int(length($codon) / 3) * 3);
220 my $partial_codon = substr($codon, int(length($codon) / 3) * 3);
221
222 my $pep = '';
223
224 if($whole_codon) {
225 # for mithocondrial dna we need to to use a different codon table
226 my $codon_table = $self->transcript_variation->_codon_table;
227
228 my $codon_seq = Bio::Seq->new(
229 -seq => $whole_codon,
230 -moltype => 'dna',
231 -alphabet => 'dna',
232 );
233
234 $pep .= $codon_seq->translate(undef, undef, undef, $codon_table)->seq;
235 }
236
237 if($partial_codon) {
238 $pep .= 'X';
239 }
240
241 $pep ||= '-';
242
243 $self->{peptide} = $pep;
244 }
245 }
246
247 return $self->{peptide};
248 }
249
250 =head2 codon
251
252 Description: Return the codon sequence that this allele is predicted to result in
253 Returntype : string or undef if this allele is not in the CDS or is a frameshift
254 Exceptions : none
255 Status : At Risk
256
257 =cut
258
259 sub codon {
260 my ($self, $codon) = @_;
261
262 $self->{codon} = $codon if defined $codon;
263
264 my $tv = $self->transcript_variation;
265
266 return undef unless $tv->translation_start && $tv->translation_end;
267
268 return undef unless $self->seq_is_dna;
269
270 unless ($self->{codon}) {
271
272 # try to calculate the codon sequence
273
274 my $seq = $self->feature_seq;
275
276 $seq = '' if $seq eq '-';
277
278 # calculate necessary coords and lengths
279
280 my $codon_cds_start = $tv->translation_start * 3 - 2;
281 my $codon_cds_end = $tv->translation_end * 3;
282 my $codon_len = $codon_cds_end - $codon_cds_start + 1;
283 my $vf_nt_len = $tv->cds_end - $tv->cds_start + 1;
284 my $allele_len = $self->seq_length;
285
286 if ($allele_len != $vf_nt_len) {
287 if (abs($allele_len - $vf_nt_len) % 3) {
288 # this is a frameshift variation, we don't attempt to
289 # calculate the resulting codon or peptide change as this
290 # could get quite complicated
291 return undef;
292 }
293 }
294
295 # splice the allele sequence into the CDS
296
297 my $cds = $tv->_translateable_seq;
298
299 substr($cds, $tv->cds_start-1, $vf_nt_len) = $seq;
300
301 # and extract the codon sequence
302
303 my $codon = substr($cds, $codon_cds_start-1, $codon_len + ($allele_len - $vf_nt_len));
304
305 if (length($codon) < 1) {
306 $self->{codon} = '-';
307 $self->{peptide} = '-';
308 }
309 else {
310 $self->{codon} = $codon;
311 }
312 }
313
314 return $self->{codon};
315 }
316
317 =head2 display_codon
318
319 Description: Return the codon sequence that this allele is predicted to result in
320 with the affected nucleotides identified in UPPER CASE and other
321 nucleotides in lower case
322 Returntype : string or undef if this allele is not in the CDS or is a frameshift
323 Exceptions : none
324 Status : At Risk
325
326 =cut
327
328 sub display_codon {
329 my $self = shift;
330
331 unless ($self->{_display_codon}) {
332
333 if ($self->codon && defined $self->transcript_variation->codon_position) {
334
335 my $display_codon = lc $self->codon;
336
337 # if this allele is an indel then just return all lowercase
338
339 if ($self->feature_seq ne '-') {
340
341 # codon_position is 1-based, while substr assumes the string starts at 0
342
343 my $pos = $self->transcript_variation->codon_position - 1;
344
345 my $len = length $self->feature_seq;
346
347 substr($display_codon, $pos, $len) = uc substr($display_codon, $pos, $len);
348 }
349
350 $self->{_display_codon} = $display_codon;
351 }
352 }
353
354 return $self->{_display_codon};
355 }
356
357 =head2 polyphen_prediction
358
359 Description: Return the qualitative PolyPhen-2 prediction for the effect of this allele.
360 (Note that we currently only have PolyPhen predictions for variants that
361 result in single amino acid substitutions in human)
362 Returntype : string (one of 'probably damaging', 'possibly damaging', 'benign', 'unknown')
363 if this is a non-synonymous mutation and a prediction is available, undef
364 otherwise
365 Exceptions : none
366 Status : At Risk
367
368 =cut
369
370 sub polyphen_prediction {
371 my ($self, $classifier, $polyphen_prediction) = @_;
372
373 $classifier ||= 'humvar';
374
375 my $analysis = "polyphen_${classifier}";
376
377 $self->{$analysis}->{prediction} = $polyphen_prediction if $polyphen_prediction;
378
379 unless ($self->{$analysis}->{prediction}) {
380 my ($prediction, $score) = $self->_protein_function_prediction($analysis);
381 $self->{$analysis}->{score} = $score;
382 $self->{$analysis}->{prediction} = $prediction;
383 }
384
385 return $self->{$analysis}->{prediction};
386 }
387
388 =head2 polyphen_score
389
390 Description: Return the PolyPhen-2 probability that this allele is deleterious (Note that we
391 currently only have PolyPhen predictions for variants that result in single
392 amino acid substitutions in human)
393 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is
394 available, undef otherwise
395 Exceptions : none
396 Status : At Risk
397
398 =cut
399
400 sub polyphen_score {
401 my ($self, $classifier, $polyphen_score) = @_;
402
403 $classifier ||= 'humvar';
404
405 my $analysis = "polyphen_${classifier}";
406
407 $self->{$analysis}->{score} = $polyphen_score if defined $polyphen_score;
408
409 unless ($self->{$analysis}->{score}) {
410 my ($prediction, $score) = $self->_protein_function_prediction($analysis);
411 $self->{$analysis}->{score} = $score;
412 $self->{$analysis}->{prediction} = $prediction;
413 }
414
415 return $self->{$analysis}->{score};
416 }
417
418 =head2 sift_prediction
419
420 Description: Return the qualitative SIFT prediction for the effect of this allele.
421 (Note that we currently only have SIFT predictions for variants that
422 result in single amino acid substitutions in human)
423 Returntype : string (one of 'tolerated', 'deleterious') if this is a non-synonymous
424 mutation and a prediction is available, undef otherwise
425 Exceptions : none
426 Status : At Risk
427
428 =cut
429
430 sub sift_prediction {
431 my ($self, $sift_prediction) = @_;
432
433 $self->{sift_prediction} = $sift_prediction if $sift_prediction;
434
435 unless ($self->{sift_prediction}) {
436 my ($prediction, $score) = $self->_protein_function_prediction('sift');
437 $self->{sift_score} = $score;
438 $self->{sift_prediction} = $prediction unless $self->{sift_prediction};
439 }
440
441 return $self->{sift_prediction};
442 }
443
444 =head2 sift_score
445
446 Description: Return the SIFT score for this allele (Note that we currently only have SIFT
447 predictions for variants that result in single amino acid substitutions in human)
448 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is
449 available, undef otherwise
450 Exceptions : none
451 Status : At Risk
452
453 =cut
454
455 sub sift_score {
456 my ($self, $sift_score) = @_;
457
458 $self->{sift_score} = $sift_score if defined $sift_score;
459
460 unless ($self->{sift_score}) {
461 my ($prediction, $score) = $self->_protein_function_prediction('sift');
462 $self->{sift_score} = $score;
463 $self->{sift_prediction} = $prediction;
464 }
465
466 return $self->{sift_score};
467 }
468
469
470 sub _protein_function_prediction {
471 my ($self, $analysis) = @_;
472
473 # we can only get results for variants that cause a single amino acid substitution,
474 # so check the peptide allele string first
475
476 if ($self->pep_allele_string && $self->pep_allele_string =~ /^[A-Z]\/[A-Z]$/ && defined $AA_LOOKUP->{$self->peptide}) {
477
478 if (my $matrix = $self->transcript_variation->_protein_function_predictions($analysis)) {
479
480 my ($prediction, $score) = $matrix->get_prediction(
481 $self->transcript_variation->translation_start,
482 $self->peptide,
483 );
484
485 return wantarray ? ($prediction, $score) : $prediction;
486 }
487 }
488
489 return undef;
490 }
491
492 =head2 hgvs_genomic
493
494 Description: Return a string representing the genomic-level effect of this allele in HGVS format
495 Returntype : string
496 Exceptions : none
497 Status : At Risk
498
499 =cut
500
501 sub hgvs_genomic {
502 return _hgvs_generic(@_,'genomic');
503 }
504
505 =head2 hgvs_coding
506
507 Description: Return a string representing the CDS-level effect of this allele in HGVS format
508 Returntype : string or undef if this allele is not in the CDS
509 Exceptions : none
510 Status : At Risk
511
512 =cut
513
514 sub hgvs_coding {
515
516 deprecate('HGVS coding support has been moved to hgvs_transcript. This method will be removed in the next release.');
517 return hgvs_transcript(@_);
518 }
519
520
521 =head2 hgvs_transcript
522
523 Description: Return a string representing the CDS-level effect of this allele in HGVS format
524 Returntype : string or undef if this allele is not in the CDS
525 Exceptions : none
526 Status : At Risk
527
528 =cut
529
530
531 sub hgvs_transcript {
532
533
534 my $self = shift;
535 my $notation = shift;
536
537 ##### set if string supplied
538 $self->{hgvs_transcript} = $notation if defined $notation;
539 ##### return if held
540 return $self->{hgvs_transcript} if defined $self->{hgvs_transcript} ;
541 return $self->{hgvs_coding} if defined $self->{hgvs_coding} ;
542
543 my $variation_feature_sequence = $self->variation_feature_seq() ;
544
545 ### don't try to handle odd characters
546 return undef if $variation_feature_sequence =~ m/[^ACGT\-]/ig;
547
548 ### no result for reference allele
549 return undef if $self->is_reference ==1;
550
551 ### else evaluate
552
553 ### get reference sequence strand
554 my $refseq_strand = $self->transcript_variation->transcript->strand();
555
556 if($DEBUG ==1){
557 my $var_name = $self->transcript_variation->variation_feature->variation_name();
558 print "\nHGVS transcript: Checking $var_name refseq strand => $refseq_strand seq name : " . $self->transcript_variation->transcript_stable_id() . " var strand " . $self->transcript_variation->variation_feature->strand() . " vf st " . $self->variation_feature->strand() ." seqname: " . $self->variation_feature->seqname() ." seq: " . $self->variation_feature_seq ."\n";
559 }
560
561 my $hgvs_notation ; ### store components of HGVS string in hash
562
563 ### vf strand is relative to transcript or transcript slice
564 if( $self->transcript_variation->variation_feature->strand() <0 && $refseq_strand >0 ||
565 $self->transcript_variation->variation_feature->strand() >0 && $refseq_strand < 0
566 ){
567 reverse_comp(\$variation_feature_sequence);
568 }
569
570 ### need to get ref seq from slice transcript is on for intron labelling
571 my ($slice_start, $slice_end, $slice) = $self->_var2transcript_slice_coords();
572
573
574 unless($slice_start){
575 #warn "TVA: VF not within transcript - no HGVS\n";
576 return undef;
577 }
578
579 ### decide event type from HGVS nomenclature
580 $hgvs_notation = hgvs_variant_notation( $variation_feature_sequence, ### alt_allele,
581 $slice->seq(), ### using this to extract ref allele
582 $slice_start,
583 $slice_end
584 );
585
586 ### This should not happen
587 unless($hgvs_notation->{'type'}){
588 #warn "Error - not continuing; no HGVS annotation\n";
589 return undef;
590 }
591
592 ### create reference name - transcript name & seq version
593 $hgvs_notation->{'ref_name'} = $self->transcript_variation->transcript_stable_id() . "." . $self->transcript_variation->transcript->version();
594
595
596 ### get position relative to transcript features [use HGVS coords not variation feature coords due to dups]
597 $hgvs_notation->{start} = $self->_get_cDNA_position( $hgvs_notation->{start} );
598 $hgvs_notation->{end} = $self->_get_cDNA_position( $hgvs_notation->{end} );
599
600
601 # Make sure that start is always less than end
602 my ($exon_start_coord, $intron_start_offset) = $hgvs_notation->{start} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/;
603 my ($exon_end_coord, $intron_end_offset) = $hgvs_notation->{end} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/;
604 $intron_start_offset ||= 0;
605 $intron_end_offset ||= 0;
606
607 ($hgvs_notation->{start},$hgvs_notation->{end}) = ($hgvs_notation->{end},$hgvs_notation->{start}) if
608 (($exon_start_coord + $intron_start_offset) > ($exon_end_coord + $intron_end_offset));
609
610
611
612 if($self->transcript->cdna_coding_start()){
613 $hgvs_notation->{'numbering'} = "c"; ### set 'c' if transcript is coding
614 }
615 else{
616 $hgvs_notation->{'numbering'} = "n"; ### set 'n' if transcript non-coding
617 }
618 ### generic formatting
619 $self->{hgvs_transcript} = format_hgvs_string( $hgvs_notation);
620
621 if($DEBUG ==1){print "HGVS notation for var " . $self->transcript_variation->variation_feature->variation_name() . " from hgvs transcript : " . $self->{hgvs_transcript} . " \n";}
622
623 return $self->{hgvs_transcript};
624
625 }
626
627
628 =head2 hgvs_protein
629
630 Description: Return a string representing the protein-level effect of this allele in HGVS format
631 Returntype : string or undef if this allele is not in the CDS
632 Exceptions : none
633 Status : At Risk
634
635 =cut
636
637 sub hgvs_protein {
638
639 my $self = shift;
640 my $notation = shift;
641 my $hgvs_notation;
642
643 if($DEBUG ==1){print "\nStarting hgvs_protein with ". $self->transcript_variation->variation_feature->variation_name() . "\n"; }
644
645 ### set if string supplied
646 $self->{hgvs_protein} = $notation if defined $notation;
647
648 ### return if set
649 return $self->{hgvs_protein} if defined $self->{hgvs_protein} ;
650
651 ### don't try to handle odd characters
652 return undef if $self->variation_feature_seq() =~ m/[^ACGT\-]/ig;
653
654 #### else, check viable input and create notation
655 return if $self->is_reference();
656
657 unless ($self->transcript_variation->translation_start() && $self->transcript_variation->translation_end()){
658 return undef ;
659 }
660
661 #### for debug
662 #my $var_name = $self->transcript_variation->variation_feature->variation_name();
663
664 ### get reference sequence [add seq version to transcript name]
665 $hgvs_notation->{ref_name} = $self->transcript_variation->transcript->translation()->display_id() . "." . $self->transcript_variation->transcript->translation()->version();
666
667 $hgvs_notation->{'numbering'} = 'p';
668
669 ### get default reference location [changed later in some cases eg. duplication]
670 $hgvs_notation->{start} = $self->transcript_variation->translation_start();
671 $hgvs_notation->{end} = $self->transcript_variation->translation_end();
672
673
674 ## get default reference & alt peptides [changed later to hgvs format]
675 $hgvs_notation->{alt} = $self->peptide;
676 $hgvs_notation->{ref} = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide;
677
678 if(defined $hgvs_notation->{alt} && defined $hgvs_notation->{ref}){
679 $hgvs_notation = _clip_alleles( $hgvs_notation);
680 }
681
682 #### define type - types are different for protein numbering
683 $hgvs_notation = $self->_get_hgvs_protein_type($hgvs_notation);
684
685 ##### Convert ref & alt peptides taking into account HGVS rules
686 $hgvs_notation = $self->_get_hgvs_peptides($hgvs_notation);
687
688 ##### String formatting
689 $self->{hgvs_protein} = $self->_get_hgvs_protein_format($hgvs_notation);
690 return $self->{hgvs_protein} ;
691 }
692
693 ### HGVS: format protein string
694 sub _get_hgvs_protein_format{
695
696 my $self = shift;
697 my $hgvs_notation = shift;
698
699 if ((defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} &&
700 $hgvs_notation->{ref} eq $hgvs_notation->{alt}) ||
701 (defined $hgvs_notation->{type} && $hgvs_notation->{type} eq "=")){
702
703 ### no protein change - return transcript nomenclature with flag for neutral protein consequence
704 $hgvs_notation->{'hgvs'} = $self->hgvs_transcript() . "(p.=)";
705 return $hgvs_notation->{'hgvs'} ;
706 }
707
708 ### all start with refseq name & numbering type
709 $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . ".";
710
711 ### handle stop_lost seperately regardless of cause by del/delins => p.XposAA1extnum_AA_to_stop
712 if(stop_lost($self)){ ### if deletion of stop add extX and number of new aa to alt
713 $hgvs_notation->{alt} = substr($hgvs_notation->{alt}, 0, 3);
714 if($hgvs_notation->{type} eq "del"){
715 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "del");
716 if(defined $aa_til_stop){
717 $hgvs_notation->{alt} .= "extX" . $aa_til_stop;
718 }
719 }
720 elsif($hgvs_notation->{type} eq ">"){
721 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "subs");
722 if(defined $aa_til_stop){
723 $hgvs_notation->{alt} .= "extX" . $aa_til_stop;
724 }
725 }
726 else{
727 # warn "TVA: stop loss for type $hgvs_notation->{type} not caught \n";
728 }
729 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ;
730 }
731
732 elsif( $hgvs_notation->{type} eq "dup"){
733
734 if($hgvs_notation->{start} < $hgvs_notation->{end}){
735 ### list only first and last peptides in long duplicated string
736 my $ref_pep_first = substr($hgvs_notation->{alt}, 0, 3);
737 my $ref_pep_last = substr($hgvs_notation->{alt}, -3, 3);
738 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} ."dup";
739
740 }
741 else{
742 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{alt} . $hgvs_notation->{start} . "dup" ;
743 }
744
745 print "formating a dup $hgvs_notation->{hgvs} \n" if $DEBUG==1;
746 }
747
748 elsif($hgvs_notation->{type} eq ">"){
749 #### substitution
750
751 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt};
752 }
753
754 elsif( $hgvs_notation->{type} eq "delins" || $hgvs_notation->{type} eq "ins" ){
755
756 #### list first and last aa in reference only
757 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3);
758 my $ref_pep_last;
759 if(substr($hgvs_notation->{ref}, -1, 1) eq "X"){
760 $ref_pep_last ="X";
761 }
762 else{
763 $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3);
764 }
765 if($hgvs_notation->{ref} =~ /X$/){
766 ### For stops & add extX & distance to next stop to alt pep
767 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "loss");
768 if(defined $aa_til_stop){
769 $hgvs_notation->{alt} .="extX" . $aa_til_stop;
770 }
771 }
772 if($hgvs_notation->{start} == $hgvs_notation->{end} && $hgvs_notation->{type} eq "delins"){
773 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ;
774 }
775 else{
776 ### correct ordering if needed
777 if($hgvs_notation->{start} > $hgvs_notation->{end}){
778 ( $hgvs_notation->{start}, $hgvs_notation->{end}) = ($hgvs_notation->{end}, $hgvs_notation->{start} );
779 }
780
781 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ;
782 }
783 }
784
785 elsif($hgvs_notation->{type} eq "fs"){
786 if(defined $hgvs_notation->{alt} && $hgvs_notation->{alt} eq "X"){ ## stop gained
787 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ;
788
789 }
790 else{ ## not immediate stop - count aa until next
791
792 my $aa_til_stop = _stop_loss_extra_AA($self, $hgvs_notation->{start}-1, "fs");
793 if($aa_til_stop ){
794 ### use long form if new stop found
795 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} . "fsX$aa_til_stop" ;
796 }
797 else{
798 ### otherwise use short form
799 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "fs";
800 }
801 }
802 }
803
804 elsif( $hgvs_notation->{type} eq "del"){
805 $hgvs_notation->{alt} = "del";
806 if( length($hgvs_notation->{ref}) >3 ){
807 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3);
808 my $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3);
809 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} .$hgvs_notation->{alt} ;
810 }
811 else{
812 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ;
813 }
814 }
815
816 elsif($hgvs_notation->{start} ne $hgvs_notation->{end} ){
817 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "_" . $hgvs_notation->{alt} . $hgvs_notation->{end} ;
818 }
819
820 else{
821 #### default to substitution
822 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt};
823 }
824
825 if($DEBUG==1){ print "Returning protein format: $hgvs_notation->{'hgvs'}\n";}
826 return $hgvs_notation->{'hgvs'};
827 }
828
829 ### HGVS: get type of variation event in protein terms
830 sub _get_hgvs_protein_type{
831
832 my $self = shift;
833 my $hgvs_notation = shift;
834
835 ### get allele length
836 my ($ref_length, $alt_length ) = $self->_get_allele_length();
837
838
839 if( defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} ){
840 ### Run type checks on peptides if available
841
842 if($hgvs_notation->{alt} eq $hgvs_notation->{ref} ){
843 #### synonymous indicated by =
844 $hgvs_notation->{type} = "=";
845 }
846 elsif( length($hgvs_notation->{ref}) ==1 && length($hgvs_notation->{alt}) ==1 ) {
847
848 $hgvs_notation->{type} = ">";
849 }
850 elsif($hgvs_notation->{ref} eq "-" || $hgvs_notation->{ref} eq "") {
851
852 $hgvs_notation->{type} = "ins";
853 }
854 elsif($hgvs_notation->{alt} eq "" ) {
855
856 $hgvs_notation->{type} = "del";
857 }
858 elsif( (length($hgvs_notation->{alt}) > length($hgvs_notation->{ref}) ) &&
859 $hgvs_notation->{alt} =~ / $hgvs_notation->{ref}/ ) {
860 ### capture duplication event described as TTT/TTTTT
861 $hgvs_notation->{type} = "dup";
862 }
863
864 elsif( (length($hgvs_notation->{alt}) >1 && length($hgvs_notation->{ref}) ==1) ||
865 (length($hgvs_notation->{alt}) ==1 && length($hgvs_notation->{ref}) >1) ) {
866 $hgvs_notation->{type} = "delins";
867 }
868 else{
869 $hgvs_notation->{type} = ">";
870 }
871 }
872
873
874 elsif($ref_length ne $alt_length && ($ref_length - $alt_length)%3 !=0 ){
875 $hgvs_notation->{type} = "fs";
876 }
877
878 elsif(length($self->variation_feature_seq()) >1 ){
879 if($hgvs_notation->{start} == ($hgvs_notation->{end} + 1) ){
880 ### convention for insertions - end one less than start
881 $hgvs_notation->{type} = "ins";
882 }
883 elsif( $hgvs_notation->{start} != $hgvs_notation->{end} ){
884 $hgvs_notation->{type} = "delins";
885 }
886 else{
887 $hgvs_notation->{type} = ">";
888 }
889 }
890 else{
891 #print STDERR "DEBUG ".$self->variation_feature->start."\n";
892 #warn "Cannot define protein variant type [$ref_length - $alt_length]\n";
893 }
894 return $hgvs_notation ;
895
896 }
897
898 ### HGVS: get reference & alternative peptide
899 sub _get_hgvs_peptides{
900
901 my $self = shift;
902 my $hgvs_notation = shift;
903
904 if($hgvs_notation->{type} eq "fs"){
905 ### ensembl alt/ref peptides not the same as HGVS alt/ref - look up seperately
906 $hgvs_notation = $self->_get_fs_peptides($hgvs_notation);
907 }
908 elsif($hgvs_notation->{type} eq "ins" ){
909
910 ### HGVS ref are peptides flanking insertion
911 $hgvs_notation->{ref} = $self->_get_surrounding_peptides($hgvs_notation->{start});
912
913 if( $hgvs_notation->{alt} =~/\*/){
914 ## inserted bases after stop irrelevant; consider as substitution gaining stop MAINTAIN PREVIOUS BEHAVIOUR FOR NOW
915 #$hgvs_notation->{alt} =~ s/\*\w+$/\*/;
916 }
917 else{
918 ### Additional check that inserted bases do not duplicate 3' reference sequence [set to type = dup if so]
919 $hgvs_notation = $self->_check_for_peptide_duplication($hgvs_notation);
920 }
921 }
922
923
924 ### Convert peptide to 3 letter code as used in HGVS
925 $hgvs_notation->{ref} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{ref}, -id => 'ref', -alphabet => 'protein')) || "";
926 if( $hgvs_notation->{alt} eq "-"){
927 $hgvs_notation->{alt} = "del";
928 }
929 else{
930 $hgvs_notation->{alt} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref', -alphabet => 'protein')) || "";
931 }
932
933 ### handle special cases
934 if( affects_start_codon($self) ){
935 #### handle initiator loss -> probably no translation => alt allele is '?'
936 $hgvs_notation->{alt} = "?";
937 $hgvs_notation->{type} = "";
938 }
939
940 elsif( $hgvs_notation->{type} eq "del"){
941 #### partial last codon
942 $hgvs_notation->{alt} = "del";
943 }
944 elsif($hgvs_notation->{type} eq "fs"){
945 ### only quote first ref peptide for frameshift
946 $hgvs_notation->{ref} = substr($hgvs_notation->{ref},0,3);
947 }
948
949 ### set stop to HGVS prefered "X"
950 if(defined $hgvs_notation->{ref}){ $hgvs_notation->{ref} =~ s/Ter|Xaa/X/g; }
951 if(defined $hgvs_notation->{alt}){ $hgvs_notation->{alt} =~ s/Ter|Xaa/X/g; }
952
953 return ($hgvs_notation);
954
955 }
956
957 ### HGVS: remove common peptides from alt and ref strings & shift start/end accordingly
958 sub _clip_alleles{
959
960 my $hgvs_notation = shift;
961
962 my $check_alt = $hgvs_notation->{alt} ;
963 my $check_ref = $hgvs_notation->{ref} ;
964 my $check_start = $hgvs_notation->{start};
965
966 ### strip same bases from start of string
967 for (my $p =0; $p <length ($hgvs_notation->{ref}); $p++){
968 my $check_next_ref = substr( $check_ref, 0, 1);
969 my $check_next_alt = substr( $check_alt, 0, 1);
970
971 if($check_next_ref eq "*" && $check_next_alt eq "*"){
972 ### stop re-created by variant - no protein change
973 $hgvs_notation->{type} = "=";
974
975 return($hgvs_notation);
976 }
977
978 if($check_next_ref eq $check_next_alt){
979 $check_start++;
980 $check_ref = substr( $check_ref, 1);
981 $check_alt = substr( $check_alt, 1);
982 }
983 else{
984 last;
985 }
986 }
987 #### strip same bases from end of string
988 for (my $q =0; $q < length ($check_ref); $q++){
989 my $check_next_ref = substr( $check_ref, -1, 1);
990 my $check_next_alt = substr( $check_alt, -1, 1);
991 if($check_next_ref eq $check_next_alt){
992 chop $check_ref;
993 chop $check_alt;
994 }
995 else{
996 last;
997 }
998 }
999
1000 $hgvs_notation->{alt} = $check_alt;
1001 $hgvs_notation->{ref} = $check_ref;
1002
1003 ### check if clipping force type change & adjust location
1004 if(length ($check_ref) == 0 && length ($check_alt) >= 1){
1005 ### re-set as ins not delins
1006 $hgvs_notation->{type} ="ins";
1007 ### insertion between ref base and next => adjust next
1008 if($hgvs_notation->{end} == $hgvs_notation->{start} ){$hgvs_notation->{end} = $check_start; }
1009 # $hgvs_notation->{start} = $check_start;
1010 }
1011 elsif(length ($check_ref) >=1 && length ($check_alt) == 0){
1012 ### re-set as del not delins
1013 $hgvs_notation->{type} ="del";
1014 $hgvs_notation->{start} = $check_start;
1015 }
1016 else{
1017 #### save trimmed peptide string & increased start position
1018 $hgvs_notation->{start} = $check_start;
1019
1020 }
1021
1022 return $hgvs_notation;
1023 }
1024
1025
1026
1027
1028
1029 #### HGVS: check allele lengths to look for frameshifts
1030 sub _get_allele_length{
1031
1032 my $self = shift;
1033 my $ref_length = 0;
1034 my $alt_length = 0;
1035
1036 my $al_string = $self->allele_string();
1037 my $ref_allele = (split/\//, $al_string)[0];
1038 $ref_allele =~ s/\-//;
1039 $ref_length = length $ref_allele;
1040
1041 my $alt_allele = $self->variation_feature_seq();
1042 $alt_allele =~ s/\-//;
1043 $alt_length = length $alt_allele;
1044
1045 return ($ref_length, $alt_length );
1046
1047 }
1048
1049 ### HGVS: list first different peptide [may not be first changed codon]
1050 sub _get_fs_peptides{
1051
1052 my $self = shift;
1053 my $hgvs_notation = shift;
1054
1055 ### get CDS with alt variant
1056 my $alt_cds = $self->_get_alternate_cds();
1057 return undef unless defined($alt_cds);
1058
1059 #### get new translation
1060 my $alt_trans = $alt_cds->translate()->seq();
1061
1062 ### get changed end (currently in single letter AA coding)
1063 my $ref_trans = $self->transcript->translate()->seq();
1064 $ref_trans .= "*"; ## appending ref stop for checking purposes
1065
1066 $hgvs_notation->{start} = $self->transcript_variation->translation_start() ;
1067
1068 if( $hgvs_notation->{start} > length $alt_trans){ ## deletion of stop, no further AA in alt seq
1069 $hgvs_notation->{alt} = "del";
1070 $hgvs_notation->{type} = "del";
1071 return $hgvs_notation;
1072 }
1073
1074 while ($hgvs_notation->{start} <= length $alt_trans){
1075 ### frame shift may result in the same AA#
1076
1077 $hgvs_notation->{ref} = substr($ref_trans, $hgvs_notation->{start}-1, 1);
1078 $hgvs_notation->{alt} = substr($alt_trans, $hgvs_notation->{start}-1, 1);
1079
1080 if($hgvs_notation->{ref} eq "*" && $hgvs_notation->{alt} eq "*"){
1081 ### variation at stop codon, but maintains stop codon
1082 return ($hgvs_notation);
1083 }
1084 last if $hgvs_notation->{ref} ne $hgvs_notation->{alt};
1085 $hgvs_notation->{start}++;
1086 }
1087 return ($hgvs_notation);
1088
1089 }
1090
1091 #### HGVS: if variant is an insertion, ref pep is initially "-", so seek AA before and after insertion
1092 sub _get_surrounding_peptides{
1093
1094 my $self = shift;
1095 my $ref_pos = shift;
1096 my $ref_trans = $self->transcript->translate()->seq();
1097
1098 my $end = substr($ref_trans, $ref_pos-1);
1099 my $ref_string = substr($ref_trans, $ref_pos-1, 2);
1100
1101 return ($ref_string);
1102
1103 }
1104
1105
1106 #### HGVS: alternate CDS needed to check for new stop when variant disrupts 'reference' stop
1107 sub _get_alternate_cds{
1108
1109 my $self = shift;
1110
1111 ### get reference sequence
1112 my $reference_cds_seq = $self->transcript->translateable_seq();
1113
1114 return undef unless defined($self->transcript_variation->cds_start) && defined($self->transcript_variation->cds_end());
1115
1116 ### get sequences upstream and downstream of variant
1117 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) );
1118 my $downstream_seq = substr($reference_cds_seq, ($self->transcript_variation->cds_end() ) );
1119
1120 ### fix alternate allele if deletion or on opposite strand
1121 my $alt_allele = $self->variation_feature_seq();
1122 $alt_allele =~ s/\-//;
1123 if( $self->transcript_variation->variation_feature->strand() <0 && $self->transcript_variation->transcript->strand() >0 ||
1124 $self->transcript_variation->variation_feature->strand() >0 && $self->transcript_variation->transcript->strand() < 0
1125 ){
1126 reverse_comp(\$alt_allele) ;
1127 }
1128 ### build alternate seq
1129 my $alternate_seq = $upstream_seq . $alt_allele . $downstream_seq ;
1130
1131 ### create seq obj with alternative allele in the CDS sequence
1132 my $alt_cds =Bio::PrimarySeq->new(-seq => $alternate_seq, -id => 'alt_cds', -alphabet => 'dna');
1133
1134 ### append UTR if available as stop may be disrupted
1135 my $utr = $self->transcript_variation->transcript->three_prime_utr();
1136
1137 if (defined $utr) {
1138 ### append the UTR to the alternative CDS
1139 $alt_cds->seq($alt_cds->seq() . $utr->seq());
1140 }
1141 else{
1142 ##warn "No UTR available for alternate CDS\n";
1143 }
1144
1145 return $alt_cds;
1146 }
1147
1148 ### HGVS: if inserted string is identical to 3' reference sequence, describe as duplication
1149 sub _check_for_peptide_duplication{
1150
1151 my $self = shift;
1152 my $hgvs_notation= shift;
1153
1154 ##### get reference sequence
1155 my $reference_cds_seq = $self->transcript->translateable_seq();
1156
1157 ##### get sequence upstream of variant
1158 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) );
1159
1160 ##### create translation to check against inserted peptides
1161 my $upstream_cds =Bio::PrimarySeq->new(-seq => $upstream_seq, -id => 'alt_cds', -alphabet => 'dna');
1162 my $upstream_trans = $upstream_cds->translate()->seq();
1163
1164 ## Test whether alt peptide matches the reference sequence just before the variant
1165 my $test_new_start = $hgvs_notation->{'start'} - length($hgvs_notation->{'alt'}) -1 ;
1166 my $test_seq = substr($upstream_trans, $test_new_start, length($hgvs_notation->{'alt'}));
1167
1168 if ( $test_new_start >= 0 && $test_seq eq $hgvs_notation->{alt}) {
1169
1170 $hgvs_notation->{type} = 'dup';
1171 $hgvs_notation->{end} = $hgvs_notation->{start} -1;
1172 $hgvs_notation->{start} -= length($hgvs_notation->{alt});
1173
1174 }
1175
1176 return $hgvs_notation;
1177
1178 }
1179
1180 #### HGVS: if a stop is lost, seek the next in transcript & count number of extra AA's
1181 sub _stop_loss_extra_AA{
1182
1183 my $self = shift;
1184 my $ref_var_pos = shift; ### first effected AA - supply for frameshifts
1185 my $test = shift;
1186
1187 return undef unless $ref_var_pos;
1188
1189 my $extra_aa;
1190
1191 ### get the sequence with variant added
1192 my $alt_cds = $self->_get_alternate_cds();
1193 return undef unless defined($alt_cds);
1194
1195 ### get new translation
1196 my $alt_trans = $alt_cds->translate();
1197
1198 my $ref_temp = $self->transcript->translate()->seq;
1199 my $ref_len = length($ref_temp);
1200
1201 if($DEBUG==1){
1202 print "alt translated:\n" . $alt_trans->seq() . "\n";
1203 print "ref translated:\n$ref_temp\n";;
1204 }
1205
1206 #### Find the number of residues that are translated until a termination codon is encountered
1207 if ($alt_trans->seq() =~ m/\*/) {
1208 if($DEBUG==1){print "Got $+[0] aa before stop, var event at $ref_var_pos \n";}
1209
1210 if($test eq "fs" ){
1211 ### frame shift - count from first AA effected by variant to stop
1212 $extra_aa = $+[0] - $ref_var_pos;
1213 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before fs stop [ $+[0] - peptide ref_start: $ref_var_pos )]\n";}
1214 }
1215
1216 else{
1217 $extra_aa = $+[0] - 1 - $ref_len;
1218 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before next stop [ $+[0] - 1 -normal stop $ref_len)]\n";}
1219 }
1220 }
1221
1222 # A special case is if the first aa is a stop codon => don't display the number of residues until the stop codon
1223 if(defined $extra_aa && $extra_aa >0){
1224 return $extra_aa ;
1225 }
1226 else{
1227 #warn "No stop found in alternate sequence\n";
1228 return undef;
1229 }
1230
1231 }
1232
1233 =head
1234 # We haven't implemented support for these methods yet
1235
1236 sub hgvs_rna {
1237 return _hgvs_generic(@_,'rna');
1238 }
1239
1240 sub hgvs_mitochondrial {
1241 return _hgvs_generic(@_,'mitochondrial');
1242 }
1243
1244 =cut
1245
1246 sub _hgvs_generic {
1247
1248 my $self = shift;
1249 my $reference = pop;
1250 my $notation = shift;
1251
1252 #ĘThe rna and mitochondrial modes have not yet been implemented, so return undef in case we get a call to these
1253 return undef if ($reference =~ m/rna|mitochondrial/);
1254
1255 my $sub = qq{hgvs_$reference};
1256
1257 $self->{$sub} = $notation if defined $notation;
1258
1259 unless ($self->{$sub}) {
1260
1261 # Use the transcript this VF is on as the reference feature
1262 my $reference_feature = $self->transcript;
1263 # If we want genomic coordinates, the reference_feature should actually be the slice for the underlying seq_region
1264 $reference_feature = $reference_feature->slice->seq_region_Slice if ($reference eq 'genomic');
1265
1266 # Calculate the HGVS notation on-the-fly and pass it to the TranscriptVariation in order to distribute the result to the other alleles
1267 $self->transcript_variation->$sub($self->variation_feature->get_all_hgvs_notations($reference_feature,substr($reference,0,1),undef,undef,$self->transcript_variation));
1268 }
1269
1270 return $self->{$sub};
1271 }
1272
1273
1274 ### HGVS: move variant to transcript slice
1275 sub _var2transcript_slice_coords{
1276
1277 my $self = shift;
1278
1279 my $ref_slice = $self->transcript->feature_Slice(); #### returns with strand same as feature
1280 my $tr_vf = $self->variation_feature->transfer($ref_slice);
1281
1282 # Return undef if this VariationFeature does not fall within the supplied feature.
1283 return undef if (!defined $tr_vf ||
1284 $tr_vf->start < 1 ||
1285 $tr_vf->end < 1 ||
1286 $tr_vf->start > ($self->transcript->end - $self->transcript->start + 1) ||
1287 $tr_vf->end > ($self->transcript->end - $self->transcript->start + 1));
1288
1289 return( $tr_vf->start() , $tr_vf->end(), $ref_slice);
1290 }
1291
1292
1293
1294 ### HGVS: get variant position in transcript
1295
1296 # intron:
1297 # If the position is in an intron, the boundary position of the closest exon and
1298 # a + or - offset into the intron is returned.
1299 # Ordered by genome forward not 5' -> 3'
1300
1301 # upstream:
1302 # If the position is 5' of the start codon, it is reported relative to the start codon
1303 # (-1 being the last nucleotide before the 'A' of ATG).
1304
1305 #downstream:
1306 # If the position is 3' pf the stop codon, it is reported with a '*' prefix and the offset
1307 # from the start codon (*1 being the first nucleotide after the last position of the stop codon)
1308
1309 sub _get_cDNA_position {
1310
1311 my $self = shift;
1312 my $position = shift; ### start or end of variation
1313
1314 my $transcript = $self->transcript();
1315 my $strand = $transcript->strand();
1316
1317 #### TranscriptVariation start/stop coord relative to transcript
1318 #### Switch to chromosome coordinates taking into account strand
1319 $position = ( $strand > 0 ?
1320 ( $self->transcript->start() + $position - 1 ) :
1321 ( $self->transcript->end() - $position + 1));
1322
1323
1324
1325 # Get all exons and sort them in positional order
1326 my @exons = sort {$a->start() <=> $b->start()} @{$transcript->get_all_Exons()};
1327 my $n_exons = scalar(@exons);
1328
1329 my $cdna_position;
1330 # Loop over the exons and get the coordinates of the variation in exon+intron notation
1331 for (my $i=0; $i<$n_exons; $i++) {
1332
1333 # Skip if the start point is beyond this exon
1334 next if ($position > $exons[$i]->end());
1335
1336
1337 # EXONIC: If the start coordinate is within this exon
1338 if ($position >= $exons[$i]->start()) {
1339 # Get the cDNA start coordinate of the exon and add the number of nucleotides from the exon boundary to the variation
1340 # If the transcript is in the opposite direction, count from the end instead
1341 $cdna_position = $exons[$i]->cdna_start($transcript) + ( $strand > 0 ?
1342 ( $position - $exons[$i]->start ) :
1343 ( $exons[$i]->end() - $position )
1344 );
1345 last; #### last exon checked
1346 }
1347 ## INTRONIC
1348 # Else the start coordinate is between this exon and the previous one, determine which one is closest and get coordinates relative to that one
1349 else {
1350
1351 my $updist = ($position - $exons[$i-1]->end());
1352 my $downdist = ($exons[$i]->start() - $position);
1353
1354 # If the distance to the upstream exon is the shortest, or equal and in the positive orientation, use that
1355 if ($updist < $downdist || ($updist == $downdist && $strand >= 0)) {
1356
1357 # If the orientation is reversed, we should use the cDNA start and a '-' offset
1358 $cdna_position = ($strand >= 0 ?
1359 $exons[$i-1]->cdna_end($transcript) . '+' :
1360 $exons[$i-1]->cdna_start($transcript) . '-')
1361 . $updist;
1362 }
1363 # Else if downstream is shortest...
1364 else {
1365 # If the orientation is reversed, we should use the cDNA end and a '+' offset
1366 $cdna_position = ($strand >= 0 ?
1367 $exons[$i]->cdna_start($transcript) . '-' :
1368 $exons[$i]->cdna_end($transcript) . '+')
1369 . $downdist;
1370 }
1371 last; ## last exon checked
1372 }
1373 }
1374
1375 # Shift the position to make it relative to the start & stop codons
1376 my $start_codon = $transcript->cdna_coding_start();
1377 my $stop_codon = $transcript->cdna_coding_end();
1378
1379 # Disassemble the cDNA coordinate into the exon and intron parts
1380 ### just built this now taking it appart again
1381 my ($cdna_coord, $intron_offset) = $cdna_position =~ m/([0-9]+)([\+\-][0-9]+)?/;
1382
1383
1384 # Start by correcting for the stop codon
1385 if (defined($stop_codon) && $cdna_coord > $stop_codon) {
1386 # Get the offset from the stop codon
1387 $cdna_coord -= $stop_codon;
1388 # Prepend a * to indicate the position is in the 3' UTR
1389 $cdna_coord = '*' . $cdna_coord;
1390 }
1391 elsif (defined($start_codon)) {
1392 # If the position is beyond the start codon, add 1 to get the correct offset
1393 $cdna_coord += ($cdna_coord >= $start_codon);
1394 # Subtract the position of the start codon
1395 $cdna_coord -= $start_codon;
1396 }
1397
1398 # Re-assemble the cDNA position [ return exon num & offset & direction for intron eg. 142+363]
1399 $cdna_position = $cdna_coord . (defined($intron_offset) ? $intron_offset : '');
1400 return $cdna_position;
1401 }
1402
1403
1404 1;
1405
1406