comparison variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Variation::TranscriptVariationAllele
24
25 =head1 SYNOPSIS
26
27 use Bio::EnsEMBL::Variation::TranscriptVariationAllele;
28
29 my $tva = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new(
30 -transcript_variation => $tv,
31 -variation_feature_seq => 'A',
32 -is_reference => 0,
33 );
34
35 print "sequence with respect to the transcript: ", $tva->feature_seq, "\n";
36 print "sequence with respect to the variation feature: ", $tva->variation_feature_seq, "\n";
37 print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $tva->get_all_OverlapConsequences }), "\n";
38 print "amino acid change: ", $tva->peptide_allele_string, "\n";
39 print "resulting codon: ", $tva->codon, "\n";
40 print "reference codon: ", $tva->transcript_variation->get_reference_TranscriptVariationAllele->codon, "\n";
41 print "PolyPhen prediction: ", $tva->polyphen_prediction, "\n";
42 print "SIFT prediction: ", $tva->sift_prediction, "\n";
43
44 =head1 DESCRIPTION
45
46 A TranscriptVariationAllele object represents a single allele of a TranscriptVariation.
47 It provides methods that are specific to the sequence of the allele, such as codon,
48 peptide etc. Methods that depend only on position (e.g. CDS start) will be found in
49 the associated TranscriptVariation. Ordinarily you will not create these objects
50 yourself, but instead you would create a TranscriptVariation object which will then
51 construct TranscriptVariationAlleles based on the allele string of the associated
52 VariationFeature.
53
54 Note that any methods that are not specific to Transcripts will be found in the
55 VariationFeatureOverlapAllele superclass.
56
57 =cut
58
59 package Bio::EnsEMBL::Variation::TranscriptVariationAllele;
60
61 use strict;
62 use warnings;
63
64 use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw($AA_LOOKUP);
65 use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
66 use Bio::EnsEMBL::Variation::Utils::Sequence qw(hgvs_variant_notation format_hgvs_string);
67 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
68 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(within_cds within_intron stop_lost affects_start_codon);
69
70 use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele);
71
72
73 our $DEBUG = 0;
74
75 sub new_fast {
76 my ($class, $hashref) = @_;
77
78 # swap a transcript_variation argument for a variation_feature_overlap one
79
80 if ($hashref->{transcript_variation}) {
81 $hashref->{variation_feature_overlap} = delete $hashref->{transcript_variation};
82 }
83
84 # and call the superclass
85
86 return $class->SUPER::new_fast($hashref);
87 }
88
89 =head2 transcript_variation
90
91 Description: Get/set the associated TranscriptVariation
92 Returntype : Bio::EnsEMBL::Variation::TranscriptVariation
93 Exceptions : throws if the argument is the wrong type
94 Status : At Risk
95
96 =cut
97
98 sub transcript_variation {
99 my ($self, $tv) = @_;
100 assert_ref($tv, 'Bio::EnsEMBL::Variation::TranscriptVariation') if $tv;
101 return $self->variation_feature_overlap($tv);
102 }
103
104 =head2 variation_feature
105
106 Description: Get the associated VariationFeature
107 Returntype : Bio::EnsEMBL::Variation::VariationFeature
108 Exceptions : none
109 Status : At Risk
110
111 =cut
112
113 sub variation_feature {
114 my $self = shift;
115 return $self->transcript_variation->variation_feature;
116 }
117
118 =head2 pep_allele_string
119
120 Description: Return a '/' delimited string of the reference peptide and the
121 peptide resulting from this allele, or a single peptide if this
122 allele does not change the peptide (e.g. because it is synonymous)
123 Returntype : string or undef if this allele is not in the CDS
124 Exceptions : none
125 Status : At Risk
126
127 =cut
128
129 sub pep_allele_string {
130 my ($self) = @_;
131
132 my $pep = $self->peptide;
133
134 return undef unless $pep;
135
136 my $ref_pep = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide;
137
138 return undef unless $ref_pep;
139
140 return $ref_pep ne $pep ? $ref_pep.'/'.$pep : $pep;
141 }
142
143 =head2 codon_allele_string
144
145 Description: Return a '/' delimited string of the reference codon and the
146 codon resulting from this allele
147 Returntype : string or undef if this allele is not in the CDS
148 Exceptions : none
149 Status : At Risk
150
151 =cut
152
153 sub codon_allele_string {
154 my ($self) = @_;
155
156 my $codon = $self->codon;
157
158 return undef unless $codon;
159
160 my $ref_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->codon;
161
162 return $ref_codon.'/'.$codon;
163 }
164
165 =head2 display_codon_allele_string
166
167 Description: Return a '/' delimited string of the reference display_codon and the
168 display_codon resulting from this allele. The display_codon identifies
169 the nucleotides affected by this variant in UPPER CASE and other
170 nucleotides in lower case
171 Returntype : string or undef if this allele is not in the CDS
172 Exceptions : none
173 Status : At Risk
174
175 =cut
176
177 sub display_codon_allele_string {
178 my ($self) = @_;
179
180 my $display_codon = $self->display_codon;
181
182 return undef unless $display_codon;
183
184 my $ref_display_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->display_codon;
185
186 return $ref_display_codon.'/'.$display_codon;
187 }
188
189 =head2 peptide
190
191 Description: Return the amino acid sequence that this allele is predicted to result in
192 Returntype : string or undef if this allele is not in the CDS or is a frameshift
193 Exceptions : none
194 Status : At Risk
195
196 =cut
197
198 sub peptide {
199 my ($self, $peptide) = @_;
200
201 $self->{peptide} = $peptide if $peptide;
202
203 unless ($self->{peptide}) {
204
205 return undef unless $self->seq_is_unambiguous_dna;
206
207 if (my $codon = $self->codon) {
208
209 # the codon method can set the peptide in some circumstances
210 # so check here before we try an (expensive) translation
211 return $self->{peptide} if $self->{peptide};
212
213 # translate the codon sequence to establish the peptide allele
214
215 # allow for partial codons - split the sequence into whole and partial
216 # e.g. AAAGG split into AAA and GG
217 my $whole_codon = substr($codon, 0, int(length($codon) / 3) * 3);
218 my $partial_codon = substr($codon, int(length($codon) / 3) * 3);
219
220 my $pep = '';
221
222 if($whole_codon) {
223 # for mithocondrial dna we need to to use a different codon table
224 my $codon_table = $self->transcript_variation->_codon_table;
225
226 my $codon_seq = Bio::Seq->new(
227 -seq => $whole_codon,
228 -moltype => 'dna',
229 -alphabet => 'dna',
230 );
231
232 $pep .= $codon_seq->translate(undef, undef, undef, $codon_table)->seq;
233 }
234
235 if($partial_codon) {
236 $pep .= 'X';
237 }
238
239 $pep ||= '-';
240
241 $self->{peptide} = $pep;
242 }
243 }
244
245 return $self->{peptide};
246 }
247
248 =head2 codon
249
250 Description: Return the codon sequence that this allele is predicted to result in
251 Returntype : string or undef if this allele is not in the CDS or is a frameshift
252 Exceptions : none
253 Status : At Risk
254
255 =cut
256
257 sub codon {
258 my ($self, $codon) = @_;
259
260 $self->{codon} = $codon if defined $codon;
261
262 my $tv = $self->transcript_variation;
263
264 return undef unless $tv->translation_start && $tv->translation_end;
265
266 return undef unless $self->seq_is_dna;
267
268 unless ($self->{codon}) {
269
270 # try to calculate the codon sequence
271
272 my $seq = $self->feature_seq;
273
274 $seq = '' if $seq eq '-';
275
276 # calculate necessary coords and lengths
277
278 my $codon_cds_start = $tv->translation_start * 3 - 2;
279 my $codon_cds_end = $tv->translation_end * 3;
280 my $codon_len = $codon_cds_end - $codon_cds_start + 1;
281 my $vf_nt_len = $tv->cds_end - $tv->cds_start + 1;
282 my $allele_len = $self->seq_length;
283
284 if ($allele_len != $vf_nt_len) {
285 if (abs($allele_len - $vf_nt_len) % 3) {
286 # this is a frameshift variation, we don't attempt to
287 # calculate the resulting codon or peptide change as this
288 # could get quite complicated
289 return undef;
290 }
291 }
292
293 # splice the allele sequence into the CDS
294
295 my $cds = $tv->_translateable_seq;
296
297 substr($cds, $tv->cds_start-1, $vf_nt_len) = $seq;
298
299 # and extract the codon sequence
300
301 my $codon = substr($cds, $codon_cds_start-1, $codon_len + ($allele_len - $vf_nt_len));
302
303 if (length($codon) < 1) {
304 $self->{codon} = '-';
305 $self->{peptide} = '-';
306 }
307 else {
308 $self->{codon} = $codon;
309 }
310 }
311
312 return $self->{codon};
313 }
314
315 =head2 display_codon
316
317 Description: Return the codon sequence that this allele is predicted to result in
318 with the affected nucleotides identified in UPPER CASE and other
319 nucleotides in lower case
320 Returntype : string or undef if this allele is not in the CDS or is a frameshift
321 Exceptions : none
322 Status : At Risk
323
324 =cut
325
326 sub display_codon {
327 my $self = shift;
328
329 unless ($self->{_display_codon}) {
330
331 if ($self->codon && defined $self->transcript_variation->codon_position) {
332
333 my $display_codon = lc $self->codon;
334
335 # if this allele is an indel then just return all lowercase
336
337 if ($self->feature_seq ne '-') {
338
339 # codon_position is 1-based, while substr assumes the string starts at 0
340
341 my $pos = $self->transcript_variation->codon_position - 1;
342
343 my $len = length $self->feature_seq;
344
345 substr($display_codon, $pos, $len) = uc substr($display_codon, $pos, $len);
346 }
347
348 $self->{_display_codon} = $display_codon;
349 }
350 }
351
352 return $self->{_display_codon};
353 }
354
355 =head2 polyphen_prediction
356
357 Description: Return the qualitative PolyPhen-2 prediction for the effect of this allele.
358 (Note that we currently only have PolyPhen predictions for variants that
359 result in single amino acid substitutions in human)
360 Returntype : string (one of 'probably damaging', 'possibly damaging', 'benign', 'unknown')
361 if this is a non-synonymous mutation and a prediction is available, undef
362 otherwise
363 Exceptions : none
364 Status : At Risk
365
366 =cut
367
368 sub polyphen_prediction {
369 my ($self, $classifier, $polyphen_prediction) = @_;
370
371 $classifier ||= 'humvar';
372
373 my $analysis = "polyphen_${classifier}";
374
375 $self->{$analysis}->{prediction} = $polyphen_prediction if $polyphen_prediction;
376
377 unless ($self->{$analysis}->{prediction}) {
378 my ($prediction, $score) = $self->_protein_function_prediction($analysis);
379 $self->{$analysis}->{score} = $score;
380 $self->{$analysis}->{prediction} = $prediction;
381 }
382
383 return $self->{$analysis}->{prediction};
384 }
385
386 =head2 polyphen_score
387
388 Description: Return the PolyPhen-2 probability that this allele is deleterious (Note that we
389 currently only have PolyPhen predictions for variants that result in single
390 amino acid substitutions in human)
391 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is
392 available, undef otherwise
393 Exceptions : none
394 Status : At Risk
395
396 =cut
397
398 sub polyphen_score {
399 my ($self, $classifier, $polyphen_score) = @_;
400
401 $classifier ||= 'humvar';
402
403 my $analysis = "polyphen_${classifier}";
404
405 $self->{$analysis}->{score} = $polyphen_score if defined $polyphen_score;
406
407 unless ($self->{$analysis}->{score}) {
408 my ($prediction, $score) = $self->_protein_function_prediction($analysis);
409 $self->{$analysis}->{score} = $score;
410 $self->{$analysis}->{prediction} = $prediction;
411 }
412
413 return $self->{$analysis}->{score};
414 }
415
416 =head2 sift_prediction
417
418 Description: Return the qualitative SIFT prediction for the effect of this allele.
419 (Note that we currently only have SIFT predictions for variants that
420 result in single amino acid substitutions in human)
421 Returntype : string (one of 'tolerated', 'deleterious') if this is a non-synonymous
422 mutation and a prediction is available, undef otherwise
423 Exceptions : none
424 Status : At Risk
425
426 =cut
427
428 sub sift_prediction {
429 my ($self, $sift_prediction) = @_;
430
431 $self->{sift_prediction} = $sift_prediction if $sift_prediction;
432
433 unless ($self->{sift_prediction}) {
434 my ($prediction, $score) = $self->_protein_function_prediction('sift');
435 $self->{sift_score} = $score;
436 $self->{sift_prediction} = $prediction unless $self->{sift_prediction};
437 }
438
439 return $self->{sift_prediction};
440 }
441
442 =head2 sift_score
443
444 Description: Return the SIFT score for this allele (Note that we currently only have SIFT
445 predictions for variants that result in single amino acid substitutions in human)
446 Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is
447 available, undef otherwise
448 Exceptions : none
449 Status : At Risk
450
451 =cut
452
453 sub sift_score {
454 my ($self, $sift_score) = @_;
455
456 $self->{sift_score} = $sift_score if defined $sift_score;
457
458 unless ($self->{sift_score}) {
459 my ($prediction, $score) = $self->_protein_function_prediction('sift');
460 $self->{sift_score} = $score;
461 $self->{sift_prediction} = $prediction;
462 }
463
464 return $self->{sift_score};
465 }
466
467
468 sub _protein_function_prediction {
469 my ($self, $analysis) = @_;
470
471 # we can only get results for variants that cause a single amino acid substitution,
472 # so check the peptide allele string first
473
474 if ($self->pep_allele_string && $self->pep_allele_string =~ /^[A-Z]\/[A-Z]$/ && defined $AA_LOOKUP->{$self->peptide}) {
475
476 if (my $matrix = $self->transcript_variation->_protein_function_predictions($analysis)) {
477
478 my ($prediction, $score) = $matrix->get_prediction(
479 $self->transcript_variation->translation_start,
480 $self->peptide,
481 );
482
483 return wantarray ? ($prediction, $score) : $prediction;
484 }
485 }
486
487 return undef;
488 }
489
490 =head2 hgvs_genomic
491
492 Description: Return a string representing the genomic-level effect of this allele in HGVS format
493 Returntype : string
494 Exceptions : none
495 Status : At Risk
496
497 =cut
498
499 sub hgvs_genomic {
500 return _hgvs_generic(@_,'genomic');
501 }
502
503 =head2 hgvs_coding
504
505 Description: Return a string representing the CDS-level effect of this allele in HGVS format
506 Returntype : string or undef if this allele is not in the CDS
507 Exceptions : none
508 Status : At Risk
509
510 =cut
511
512 sub hgvs_coding {
513
514 deprecate('HGVS coding support has been moved to hgvs_transcript. This method will be removed in the next release.');
515 return hgvs_transcript(@_);
516 }
517
518
519 =head2 hgvs_transcript
520
521 Description: Return a string representing the CDS-level effect of this allele in HGVS format
522 Returntype : string or undef if this allele is not in the CDS
523 Exceptions : none
524 Status : At Risk
525
526 =cut
527
528
529 sub hgvs_transcript {
530
531
532 my $self = shift;
533 my $notation = shift;
534
535 ##### set if string supplied
536 $self->{hgvs_transcript} = $notation if defined $notation;
537 ##### return if held
538 return $self->{hgvs_transcript} if defined $self->{hgvs_transcript} ;
539 return $self->{hgvs_coding} if defined $self->{hgvs_coding} ;
540
541 my $variation_feature_sequence = $self->variation_feature_seq() ;
542
543 ### don't try to handle odd characters
544 return undef if $variation_feature_sequence =~ m/[^ACGT\-]/ig;
545
546 ### no result for reference allele
547 return undef if $self->is_reference ==1;
548
549 ### else evaluate
550
551 ### get reference sequence strand
552 my $refseq_strand = $self->transcript_variation->transcript->strand();
553
554 if($DEBUG ==1){
555 my $var_name = $self->transcript_variation->variation_feature->variation_name();
556 print "\nHGVS transcript: Checking $var_name refseq strand => $refseq_strand seq name : " . $self->transcript_variation->transcript_stable_id() . " var strand " . $self->transcript_variation->variation_feature->strand() . " vf st " . $self->variation_feature->strand() ." seqname: " . $self->variation_feature->seqname() ." seq: " . $self->variation_feature_seq ."\n";
557 }
558
559 my $hgvs_notation ; ### store components of HGVS string in hash
560
561 ### vf strand is relative to transcript or transcript slice
562 if( $self->transcript_variation->variation_feature->strand() <0 && $refseq_strand >0 ||
563 $self->transcript_variation->variation_feature->strand() >0 && $refseq_strand < 0
564 ){
565 reverse_comp(\$variation_feature_sequence);
566 }
567
568 ### need to get ref seq from slice transcript is on for intron labelling
569 my ($slice_start, $slice_end, $slice) = $self->_var2transcript_slice_coords();
570
571
572 unless($slice_start){
573 #warn "TVA: VF not within transcript - no HGVS\n";
574 return undef;
575 }
576
577 ### decide event type from HGVS nomenclature
578 $hgvs_notation = hgvs_variant_notation( $variation_feature_sequence, ### alt_allele,
579 $slice->seq(), ### using this to extract ref allele
580 $slice_start,
581 $slice_end
582 );
583
584 ### This should not happen
585 unless($hgvs_notation->{'type'}){
586 #warn "Error - not continuing; no HGVS annotation\n";
587 return undef;
588 }
589
590 ### create reference name - transcript name & seq version
591 $hgvs_notation->{'ref_name'} = $self->transcript_variation->transcript_stable_id() . "." . $self->transcript_variation->transcript->version();
592
593
594 ### get position relative to transcript features [use HGVS coords not variation feature coords due to dups]
595 $hgvs_notation->{start} = $self->_get_cDNA_position( $hgvs_notation->{start} );
596 $hgvs_notation->{end} = $self->_get_cDNA_position( $hgvs_notation->{end} );
597
598
599 # Make sure that start is always less than end
600 my ($exon_start_coord, $intron_start_offset) = $hgvs_notation->{start} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/;
601 my ($exon_end_coord, $intron_end_offset) = $hgvs_notation->{end} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/;
602 $intron_start_offset ||= 0;
603 $intron_end_offset ||= 0;
604
605 ($hgvs_notation->{start},$hgvs_notation->{end}) = ($hgvs_notation->{end},$hgvs_notation->{start}) if
606 (($exon_start_coord + $intron_start_offset) > ($exon_end_coord + $intron_end_offset));
607
608
609
610 if($self->transcript->cdna_coding_start()){
611 $hgvs_notation->{'numbering'} = "c"; ### set 'c' if transcript is coding
612 }
613 else{
614 $hgvs_notation->{'numbering'} = "n"; ### set 'n' if transcript non-coding
615 }
616 ### generic formatting
617 $self->{hgvs_transcript} = format_hgvs_string( $hgvs_notation);
618
619 if($DEBUG ==1){print "HGVS notation for var " . $self->transcript_variation->variation_feature->variation_name() . " from hgvs transcript : " . $self->{hgvs_transcript} . " \n";}
620
621 return $self->{hgvs_transcript};
622
623 }
624
625
626 =head2 hgvs_protein
627
628 Description: Return a string representing the protein-level effect of this allele in HGVS format
629 Returntype : string or undef if this allele is not in the CDS
630 Exceptions : none
631 Status : At Risk
632
633 =cut
634
635 sub hgvs_protein {
636
637 my $self = shift;
638 my $notation = shift;
639 my $hgvs_notation;
640
641 if($DEBUG ==1){print "\nStarting hgvs_protein with ". $self->transcript_variation->variation_feature->variation_name() . "\n"; }
642
643 ### set if string supplied
644 $self->{hgvs_protein} = $notation if defined $notation;
645
646 ### return if set
647 return $self->{hgvs_protein} if defined $self->{hgvs_protein} ;
648
649 ### don't try to handle odd characters
650 return undef if $self->variation_feature_seq() =~ m/[^ACGT\-]/ig;
651
652 #### else, check viable input and create notation
653 return if $self->is_reference();
654
655 unless ($self->transcript_variation->translation_start()){
656 return undef ;
657 }
658
659 #### for debug
660 #my $var_name = $self->transcript_variation->variation_feature->variation_name();
661
662 ### get reference sequence [add seq version to transcript name]
663 $hgvs_notation->{ref_name} = $self->transcript_variation->transcript->translation()->display_id() . "." . $self->transcript_variation->transcript->translation()->version();
664
665 $hgvs_notation->{'numbering'} = 'p';
666
667 ### get default reference location [changed later in some cases eg. duplication]
668 $hgvs_notation->{start} = $self->transcript_variation->translation_start();
669 $hgvs_notation->{end} = $self->transcript_variation->translation_end();
670
671
672 ## get default reference & alt peptides [changed later to hgvs format]
673 $hgvs_notation->{alt} = $self->peptide;
674 $hgvs_notation->{ref} = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide;
675
676 if(defined $hgvs_notation->{alt} && defined $hgvs_notation->{ref}){
677 $hgvs_notation = _clip_alleles( $hgvs_notation);
678 }
679
680 #### define type - types are different for protein numbering
681 $hgvs_notation = $self->_get_hgvs_protein_type($hgvs_notation);
682
683 ##### Convert ref & alt peptides taking into account HGVS rules
684 $hgvs_notation = $self->_get_hgvs_peptides($hgvs_notation);
685
686 ##### String formatting
687 $self->{hgvs_protein} = $self->_get_hgvs_protein_format($hgvs_notation);
688 return $self->{hgvs_protein} ;
689 }
690
691 ### HGVS: format protein string
692 sub _get_hgvs_protein_format{
693
694 my $self = shift;
695 my $hgvs_notation = shift;
696
697 if ((defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} &&
698 $hgvs_notation->{ref} eq $hgvs_notation->{alt}) ||
699 (defined $hgvs_notation->{type} && $hgvs_notation->{type} eq "=")){
700
701 ### no protein change - return transcript nomenclature with flag for neutral protein consequence
702 $hgvs_notation->{'hgvs'} = $self->hgvs_transcript() . "(p.=)";
703 return $hgvs_notation->{'hgvs'} ;
704 }
705
706 ### all start with refseq name & numbering type
707 $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . ".";
708
709 ### handle stop_lost seperately regardless of cause by del/delins => p.XposAA1extnum_AA_to_stop
710 if(stop_lost($self)){ ### if deletion of stop add extX and number of new aa to alt
711 $hgvs_notation->{alt} = substr($hgvs_notation->{alt}, 0, 3);
712 if($hgvs_notation->{type} eq "del"){
713 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "del");
714 if(defined $aa_til_stop){
715 $hgvs_notation->{alt} .= "extX" . $aa_til_stop;
716 }
717 }
718 elsif($hgvs_notation->{type} eq ">"){
719 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "subs");
720 if(defined $aa_til_stop){
721 $hgvs_notation->{alt} .= "extX" . $aa_til_stop;
722 }
723 }
724 else{
725 # warn "TVA: stop loss for type $hgvs_notation->{type} not caught \n";
726 }
727 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ;
728 }
729
730 elsif( $hgvs_notation->{type} eq "dup"){
731
732 if($hgvs_notation->{start} < $hgvs_notation->{end}){
733 ### list only first and last peptides in long duplicated string
734 my $ref_pep_first = substr($hgvs_notation->{alt}, 0, 3);
735 my $ref_pep_last = substr($hgvs_notation->{alt}, -3, 3);
736 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} ."dup";
737
738 }
739 else{
740 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{alt} . $hgvs_notation->{start} . "dup" ;
741 }
742 print "formating a dup $hgvs_notation->{hgvs} \n";
743 }
744
745 elsif($hgvs_notation->{type} eq ">"){
746 #### substitution
747
748 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt};
749 }
750
751 elsif( $hgvs_notation->{type} eq "delins" || $hgvs_notation->{type} eq "ins" ){
752
753 #### list first and last aa in reference only
754 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3);
755 my $ref_pep_last;
756 if(substr($hgvs_notation->{ref}, -1, 1) eq "X"){
757 $ref_pep_last ="X";
758 }
759 else{
760 $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3);
761 }
762 if($hgvs_notation->{ref} =~ /X$/){
763 ### For stops & add extX & distance to next stop to alt pep
764 my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "loss");
765 if(defined $aa_til_stop){
766 $hgvs_notation->{alt} .="extX" . $aa_til_stop;
767 }
768 }
769 if($hgvs_notation->{start} == $hgvs_notation->{end} && $hgvs_notation->{type} eq "delins"){
770 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ;
771 }
772 else{
773 ### correct ordering if needed
774 if($hgvs_notation->{start} > $hgvs_notation->{end}){
775 ( $hgvs_notation->{start}, $hgvs_notation->{end}) = ($hgvs_notation->{end}, $hgvs_notation->{start} );
776 }
777
778 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ;
779 }
780 }
781
782 elsif($hgvs_notation->{type} eq "fs"){
783 if(defined $hgvs_notation->{alt} && $hgvs_notation->{alt} eq "X"){ ## stop gained
784 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ;
785
786 }
787 else{ ## not immediate stop - count aa until next
788
789 my $aa_til_stop = _stop_loss_extra_AA($self, $hgvs_notation->{start}-1, "fs");
790 if($aa_til_stop ){
791 ### use long form if new stop found
792 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} . "fsX$aa_til_stop" ;
793 }
794 else{
795 ### otherwise use short form
796 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "fs";
797 }
798 }
799 }
800
801 elsif( $hgvs_notation->{type} eq "del"){
802 $hgvs_notation->{alt} = "del";
803 if( length($hgvs_notation->{ref}) >3 ){
804 my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3);
805 my $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3);
806 $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} .$hgvs_notation->{alt} ;
807 }
808 else{
809 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ;
810 }
811 }
812
813 elsif($hgvs_notation->{start} ne $hgvs_notation->{end} ){
814 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "_" . $hgvs_notation->{alt} . $hgvs_notation->{end} ;
815 }
816
817 else{
818 #### default to substitution
819 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt};
820 }
821
822 if($DEBUG==1){ print "Returning protein format: $hgvs_notation->{'hgvs'}\n";}
823 return $hgvs_notation->{'hgvs'};
824 }
825
826 ### HGVS: get type of variation event in protein terms
827 sub _get_hgvs_protein_type{
828
829 my $self = shift;
830 my $hgvs_notation = shift;
831
832
833 ### get allele length
834 my ($ref_length, $alt_length ) = $self->_get_allele_length();
835
836
837 if( defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} ){
838 ### Run type checks on peptides if available
839
840 if($hgvs_notation->{alt} eq $hgvs_notation->{ref} ){
841 #### synonymous indicated by =
842 $hgvs_notation->{type} = "=";
843 }
844 elsif( length($hgvs_notation->{ref}) ==1 && length($hgvs_notation->{alt}) ==1 ) {
845
846 $hgvs_notation->{type} = ">";
847 }
848 elsif($hgvs_notation->{ref} eq "-" || $hgvs_notation->{ref} eq "") {
849
850 $hgvs_notation->{type} = "ins";
851 }
852 elsif($hgvs_notation->{alt} eq "" ) {
853
854 $hgvs_notation->{type} = "del";
855 }
856 elsif( (length($hgvs_notation->{alt}) > length($hgvs_notation->{ref}) ) &&
857 $hgvs_notation->{alt} =~ / $hgvs_notation->{ref}/ ) {
858 ### capture duplication event described as TTT/TTTTT
859 $hgvs_notation->{type} = "dup";
860 }
861
862 elsif( (length($hgvs_notation->{alt}) >1 && length($hgvs_notation->{ref}) ==1) ||
863 (length($hgvs_notation->{alt}) ==1 && length($hgvs_notation->{ref}) >1) ) {
864 $hgvs_notation->{type} = "delins";
865 }
866 else{
867 $hgvs_notation->{type} = ">";
868 }
869 }
870
871
872 elsif($ref_length ne $alt_length && ($ref_length - $alt_length)%3 !=0 ){
873 $hgvs_notation->{type} = "fs";
874 }
875
876 elsif(length($self->variation_feature_seq()) >1 ){
877 if($hgvs_notation->{start} == ($hgvs_notation->{end} + 1) ){
878 ### convention for insertions - end one less than start
879 $hgvs_notation->{type} = "ins";
880 }
881 elsif( $hgvs_notation->{start} != $hgvs_notation->{end} ){
882 $hgvs_notation->{type} = "delins";
883 }
884 else{
885 $hgvs_notation->{type} = ">";
886 }
887 }
888 else{
889 # warn "Cannot define protein variant type [$ref_length - $alt_length]\n";
890 }
891 return $hgvs_notation ;
892
893 }
894
895 ### HGVS: get reference & alternative peptide
896 sub _get_hgvs_peptides{
897
898 my $self = shift;
899 my $hgvs_notation = shift;
900
901 if($hgvs_notation->{type} eq "fs"){
902 ### ensembl alt/ref peptides not the same as HGVS alt/ref - look up seperately
903 $hgvs_notation = $self->_get_fs_peptides($hgvs_notation);
904
905 }
906 elsif($hgvs_notation->{type} eq "ins" ){
907
908 ### HGVS ref are peptides flanking insertion
909 $hgvs_notation->{ref} = $self->_get_surrounding_peptides($hgvs_notation->{start});
910
911 if( $hgvs_notation->{alt} =~/\*/){
912 ## inserted bases after stop irrelevant; consider as substitution gaining stop MAINTAIN PREVIOUS BEHAVIOUR FOR NOW
913 #$hgvs_notation->{alt} =~ s/\*\w+$/\*/;
914 }
915 else{
916 ### Additional check that inserted bases do not duplicate 3' reference sequence [set to type = dup if so]
917 $hgvs_notation = $self->_check_for_peptide_duplication($hgvs_notation);
918 }
919 }
920
921
922 ### Convert peptide to 3 letter code as used in HGVS
923 $hgvs_notation->{ref} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{ref}, -id => 'ref', -alphabet => 'protein')) || "";
924 if( $hgvs_notation->{alt} eq "-"){
925 $hgvs_notation->{alt} = "del";
926 }
927 else{
928 $hgvs_notation->{alt} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref', -alphabet => 'protein')) || "";
929 }
930
931 ### handle special cases
932 if( affects_start_codon($self) ){
933 #### handle initiator loss -> probably no translation => alt allele is '?'
934 $hgvs_notation->{alt} = "?";
935 $hgvs_notation->{type} = "";
936 }
937
938 elsif( $hgvs_notation->{type} eq "del"){
939 #### partial last codon
940 $hgvs_notation->{alt} = "del";
941 }
942 elsif($hgvs_notation->{type} eq "fs"){
943 ### only quote first ref peptide for frameshift
944 $hgvs_notation->{ref} = substr($hgvs_notation->{ref},0,3);
945 }
946
947 ### set stop to HGVS prefered "X"
948 if(defined $hgvs_notation->{ref}){ $hgvs_notation->{ref} =~ s/Ter|Xaa/X/g; }
949 if(defined $hgvs_notation->{alt}){ $hgvs_notation->{alt} =~ s/Ter|Xaa/X/g; }
950
951 return ($hgvs_notation);
952
953 }
954
955 ### HGVS: remove common peptides from alt and ref strings & shift start/end accordingly
956 sub _clip_alleles{
957
958 my $hgvs_notation = shift;
959
960 my $check_alt = $hgvs_notation->{alt} ;
961 my $check_ref = $hgvs_notation->{ref} ;
962 my $check_start = $hgvs_notation->{start};
963
964 ### strip same bases from start of string
965 for (my $p =0; $p <length ($hgvs_notation->{ref}); $p++){
966 my $check_next_ref = substr( $check_ref, 0, 1);
967 my $check_next_alt = substr( $check_alt, 0, 1);
968
969 if($check_next_ref eq "*" && $check_next_alt eq "*"){
970 ### stop re-created by variant - no protein change
971 $hgvs_notation->{type} = "=";
972
973 return($hgvs_notation);
974 }
975
976 if($check_next_ref eq $check_next_alt){
977 $check_start++;
978 $check_ref = substr( $check_ref, 1);
979 $check_alt = substr( $check_alt, 1);
980 }
981 else{
982 last;
983 }
984 }
985 #### strip same bases from end of string
986 for (my $q =0; $q < length ($check_ref); $q++){
987 my $check_next_ref = substr( $check_ref, -1, 1);
988 my $check_next_alt = substr( $check_alt, -1, 1);
989 if($check_next_ref eq $check_next_alt){
990 chop $check_ref;
991 chop $check_alt;
992 }
993 else{
994 last;
995 }
996 }
997
998 $hgvs_notation->{alt} = $check_alt;
999 $hgvs_notation->{ref} = $check_ref;
1000
1001 ### check if clipping force type change & adjust location
1002 if(length ($check_ref) == 0 && length ($check_alt) >= 1){
1003 ### re-set as ins not delins
1004 $hgvs_notation->{type} ="ins";
1005 ### insertion between ref base and next => adjust next
1006 if($hgvs_notation->{end} == $hgvs_notation->{start} ){$hgvs_notation->{end} = $check_start; }
1007 # $hgvs_notation->{start} = $check_start;
1008 }
1009 elsif(length ($check_ref) >=1 && length ($check_alt) == 0){
1010 ### re-set as del not delins
1011 $hgvs_notation->{type} ="del";
1012 $hgvs_notation->{start} = $check_start;
1013 }
1014 else{
1015 #### save trimmed peptide string & increased start position
1016 $hgvs_notation->{start} = $check_start;
1017
1018 }
1019
1020 return $hgvs_notation;
1021 }
1022
1023
1024
1025
1026
1027 #### HGVS: check allele lengths to look for frameshifts
1028 sub _get_allele_length{
1029
1030 my $self = shift;
1031 my $ref_length = 0;
1032 my $alt_length = 0;
1033
1034 my $al_string = $self->allele_string();
1035 my $ref_allele = (split/\//, $al_string)[0];
1036 $ref_allele =~ s/\-//;
1037 $ref_length = length $ref_allele;
1038
1039 my $alt_allele = $self->variation_feature_seq();
1040 $alt_allele =~ s/\-//;
1041 $alt_length = length $alt_allele;
1042
1043 return ($ref_length, $alt_length );
1044
1045 }
1046
1047 ### HGVS: list first different peptide [may not be first changed codon]
1048 sub _get_fs_peptides{
1049
1050 my $self = shift;
1051 my $hgvs_notation = shift;
1052
1053 ### get CDS with alt variant
1054 my $alt_cds = $self->_get_alternate_cds();
1055
1056 #### get new translation
1057 my $alt_trans = $alt_cds->translate()->seq();
1058
1059 ### get changed end (currently in single letter AA coding)
1060 my $ref_trans = $self->transcript->translate()->seq();
1061 $ref_trans .= "*"; ## appending ref stop for checking purposes
1062
1063 $hgvs_notation->{start} = $self->transcript_variation->translation_start() ;
1064
1065 if( $hgvs_notation->{start} > length $alt_trans){ ## deletion of stop, no further AA in alt seq
1066 $hgvs_notation->{alt} = "del";
1067 $hgvs_notation->{type} = "del";
1068 return $hgvs_notation;
1069 }
1070
1071 while ($hgvs_notation->{start} <= length $alt_trans){
1072 ### frame shift may result in the same AA#
1073
1074 $hgvs_notation->{ref} = substr($ref_trans, $hgvs_notation->{start}-1, 1);
1075 $hgvs_notation->{alt} = substr($alt_trans, $hgvs_notation->{start}-1, 1);
1076
1077 if($hgvs_notation->{ref} eq "*" && $hgvs_notation->{alt} eq "*"){
1078 ### variation at stop codon, but maintains stop codon
1079 return ($hgvs_notation);
1080 }
1081 last if $hgvs_notation->{ref} ne $hgvs_notation->{alt};
1082 $hgvs_notation->{start}++;
1083 }
1084 return ($hgvs_notation);
1085
1086 }
1087
1088 #### HGVS: if variant is an insertion, ref pep is initially "-", so seek AA before and after insertion
1089 sub _get_surrounding_peptides{
1090
1091 my $self = shift;
1092 my $ref_pos = shift;
1093 my $ref_trans = $self->transcript->translate()->seq();
1094
1095 my $end = substr($ref_trans, $ref_pos-1);
1096 my $ref_string = substr($ref_trans, $ref_pos-1, 2);
1097
1098 return ($ref_string);
1099
1100 }
1101
1102
1103 #### HGVS: alternate CDS needed to check for new stop when variant disrupts 'reference' stop
1104 sub _get_alternate_cds{
1105
1106 my $self = shift;
1107
1108 ### get reference sequence
1109 my $reference_cds_seq = $self->transcript->translateable_seq();
1110
1111 ### get sequences upstream and downstream of variant
1112 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) );
1113 my $downstream_seq = substr($reference_cds_seq, ($self->transcript_variation->cds_end() ) );
1114
1115 ### fix alternate allele if deletion
1116 my $alt_allele = $self->variation_feature_seq();
1117 $alt_allele =~ s/\-//;
1118
1119 ### build alternate seq
1120 my $alternate_seq = $upstream_seq . $alt_allele . $downstream_seq ;
1121
1122 ### create seq obj with alternative allele in the CDS sequence
1123 my $alt_cds =Bio::PrimarySeq->new(-seq => $alternate_seq, -id => 'alt_cds', -alphabet => 'dna');
1124
1125 ### append UTR if available as stop may be disrupted
1126 my $utr = $self->transcript_variation->transcript->three_prime_utr();
1127
1128 if (defined $utr) {
1129 ### append the UTR to the alternative CDS
1130 $alt_cds->seq($alt_cds->seq() . $utr->seq());
1131 }
1132 else{
1133 ##warn "No UTR available for alternate CDS\n";
1134 }
1135
1136 return $alt_cds;
1137 }
1138
1139 ### HGVS: if inserted string is identical to 3' reference sequence, describe as duplication
1140 sub _check_for_peptide_duplication{
1141
1142 my $self = shift;
1143 my $hgvs_notation= shift;
1144
1145 ##### get reference sequence
1146 my $reference_cds_seq = $self->transcript->translateable_seq();
1147
1148 ##### get sequence upstream of variant
1149 my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) );
1150
1151 ##### create translation to check against inserted peptides
1152 my $upstream_cds =Bio::PrimarySeq->new(-seq => $upstream_seq, -id => 'alt_cds', -alphabet => 'dna');
1153 my $upstream_trans = $upstream_cds->translate()->seq();
1154
1155 ## Test whether alt peptide matches the reference sequence just before the variant
1156 my $test_new_start = $hgvs_notation->{'start'} - length($hgvs_notation->{'alt'}) -1 ;
1157 my $test_seq = substr($upstream_trans, $test_new_start, length($hgvs_notation->{'alt'}));
1158
1159 if ( $test_new_start >= 0 && $test_seq eq $hgvs_notation->{alt}) {
1160
1161 $hgvs_notation->{type} = 'dup';
1162 $hgvs_notation->{end} = $hgvs_notation->{start} -1;
1163 $hgvs_notation->{start} -= length($hgvs_notation->{alt});
1164
1165 }
1166
1167 return $hgvs_notation;
1168
1169 }
1170
1171 #### HGVS: if a stop is lost, seek the next in transcript & count number of extra AA's
1172 sub _stop_loss_extra_AA{
1173
1174 my $self = shift;
1175 my $ref_var_pos = shift; ### first effected AA - supply for frameshifts
1176 my $test = shift;
1177
1178 return undef unless $ref_var_pos;
1179
1180 my $extra_aa;
1181
1182 ### get the sequence with variant added
1183 my $alt_cds = $self->_get_alternate_cds();
1184 ### get new translation
1185 my $alt_trans = $alt_cds->translate();
1186
1187 my $ref_temp = $self->transcript->translate()->seq;
1188 my $ref_len = length($ref_temp);
1189
1190 if($DEBUG==1){
1191 print "alt translated:\n" . $alt_trans->seq() . "\n";
1192 print "ref translated:\n$ref_temp\n";;
1193 }
1194
1195 #### Find the number of residues that are translated until a termination codon is encountered
1196 if ($alt_trans->seq() =~ m/\*/) {
1197 if($DEBUG==1){print "Got $+[0] aa before stop, var event at $ref_var_pos \n";}
1198
1199 if($test eq "fs" ){
1200 ### frame shift - count from first AA effected by variant to stop
1201 $extra_aa = $+[0] - $ref_var_pos;
1202 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before fs stop [ $+[0] - peptide ref_start: $ref_var_pos )]\n";}
1203 }
1204
1205 else{
1206 $extra_aa = $+[0] - 1 - $ref_len;
1207 if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before next stop [ $+[0] - 1 -normal stop $ref_len)]\n";}
1208 }
1209 }
1210
1211 # A special case is if the first aa is a stop codon => don't display the number of residues until the stop codon
1212 if(defined $extra_aa && $extra_aa >0){
1213 return $extra_aa ;
1214 }
1215 else{
1216 #warn "No stop found in alternate sequence\n";
1217 return undef;
1218 }
1219
1220 }
1221
1222 =head
1223 # We haven't implemented support for these methods yet
1224
1225 sub hgvs_rna {
1226 return _hgvs_generic(@_,'rna');
1227 }
1228
1229 sub hgvs_mitochondrial {
1230 return _hgvs_generic(@_,'mitochondrial');
1231 }
1232
1233 =cut
1234
1235 sub _hgvs_generic {
1236
1237 my $self = shift;
1238 my $reference = pop;
1239 my $notation = shift;
1240
1241 #ĘThe rna and mitochondrial modes have not yet been implemented, so return undef in case we get a call to these
1242 return undef if ($reference =~ m/rna|mitochondrial/);
1243
1244 my $sub = qq{hgvs_$reference};
1245
1246 $self->{$sub} = $notation if defined $notation;
1247
1248 unless ($self->{$sub}) {
1249
1250 # Use the transcript this VF is on as the reference feature
1251 my $reference_feature = $self->transcript;
1252 # If we want genomic coordinates, the reference_feature should actually be the slice for the underlying seq_region
1253 $reference_feature = $reference_feature->slice->seq_region_Slice if ($reference eq 'genomic');
1254
1255 # Calculate the HGVS notation on-the-fly and pass it to the TranscriptVariation in order to distribute the result to the other alleles
1256 $self->transcript_variation->$sub($self->variation_feature->get_all_hgvs_notations($reference_feature,substr($reference,0,1),undef,undef,$self->transcript_variation));
1257 }
1258
1259 return $self->{$sub};
1260 }
1261
1262
1263 ### HGVS: move variant to transcript slice
1264 sub _var2transcript_slice_coords{
1265
1266 my $self = shift;
1267
1268 my $ref_slice = $self->transcript->feature_Slice(); #### returns with strand same as feature
1269 my $tr_vf = $self->variation_feature->transfer($ref_slice);
1270
1271 # Return undef if this VariationFeature does not fall within the supplied feature.
1272 return undef if (!defined $tr_vf ||
1273 $tr_vf->start < 1 ||
1274 $tr_vf->end < 1 ||
1275 $tr_vf->start > ($self->transcript->end - $self->transcript->start + 1) ||
1276 $tr_vf->end > ($self->transcript->end - $self->transcript->start + 1));
1277
1278 return( $tr_vf->start() , $tr_vf->end(), $ref_slice);
1279 }
1280
1281
1282
1283 ### HGVS: get variant position in transcript
1284
1285 # intron:
1286 # If the position is in an intron, the boundary position of the closest exon and
1287 # a + or - offset into the intron is returned.
1288 # Ordered by genome forward not 5' -> 3'
1289
1290 # upstream:
1291 # If the position is 5' of the start codon, it is reported relative to the start codon
1292 # (-1 being the last nucleotide before the 'A' of ATG).
1293
1294 #downstream:
1295 # If the position is 3' pf the stop codon, it is reported with a '*' prefix and the offset
1296 # from the start codon (*1 being the first nucleotide after the last position of the stop codon)
1297
1298 sub _get_cDNA_position {
1299
1300 my $self = shift;
1301 my $position = shift; ### start or end of variation
1302
1303 my $transcript = $self->transcript();
1304 my $strand = $transcript->strand();
1305
1306 #### TranscriptVariation start/stop coord relative to transcript
1307 #### Switch to chromosome coordinates taking into account strand
1308 $position = ( $strand > 0 ?
1309 ( $self->transcript->start() + $position - 1 ) :
1310 ( $self->transcript->end() - $position + 1));
1311
1312
1313
1314 # Get all exons and sort them in positional order
1315 my @exons = sort {$a->start() <=> $b->start()} @{$transcript->get_all_Exons()};
1316 my $n_exons = scalar(@exons);
1317
1318 my $cdna_position;
1319 # Loop over the exons and get the coordinates of the variation in exon+intron notation
1320 for (my $i=0; $i<$n_exons; $i++) {
1321
1322 # Skip if the start point is beyond this exon
1323 next if ($position > $exons[$i]->end());
1324
1325
1326 # EXONIC: If the start coordinate is within this exon
1327 if ($position >= $exons[$i]->start()) {
1328 # Get the cDNA start coordinate of the exon and add the number of nucleotides from the exon boundary to the variation
1329 # If the transcript is in the opposite direction, count from the end instead
1330 $cdna_position = $exons[$i]->cdna_start($transcript) + ( $strand > 0 ?
1331 ( $position - $exons[$i]->start ) :
1332 ( $exons[$i]->end() - $position )
1333 );
1334 last; #### last exon checked
1335 }
1336 ## INTRONIC
1337 # Else the start coordinate is between this exon and the previous one, determine which one is closest and get coordinates relative to that one
1338 else {
1339
1340 my $updist = ($position - $exons[$i-1]->end());
1341 my $downdist = ($exons[$i]->start() - $position);
1342
1343 # If the distance to the upstream exon is the shortest, or equal and in the positive orientation, use that
1344 if ($updist < $downdist || ($updist == $downdist && $strand >= 0)) {
1345
1346 # If the orientation is reversed, we should use the cDNA start and a '-' offset
1347 $cdna_position = ($strand >= 0 ?
1348 $exons[$i-1]->cdna_end($transcript) . '+' :
1349 $exons[$i-1]->cdna_start($transcript) . '-')
1350 . $updist;
1351 }
1352 # Else if downstream is shortest...
1353 else {
1354 # If the orientation is reversed, we should use the cDNA end and a '+' offset
1355 $cdna_position = ($strand >= 0 ?
1356 $exons[$i]->cdna_start($transcript) . '-' :
1357 $exons[$i]->cdna_end($transcript) . '+')
1358 . $downdist;
1359 }
1360 last; ## last exon checked
1361 }
1362 }
1363
1364 # Shift the position to make it relative to the start & stop codons
1365 my $start_codon = $transcript->cdna_coding_start();
1366 my $stop_codon = $transcript->cdna_coding_end();
1367
1368 # Disassemble the cDNA coordinate into the exon and intron parts
1369 ### just built this now taking it appart again
1370 my ($cdna_coord, $intron_offset) = $cdna_position =~ m/([0-9]+)([\+\-][0-9]+)?/;
1371
1372
1373 # Start by correcting for the stop codon
1374 if (defined($stop_codon) && $cdna_coord > $stop_codon) {
1375 # Get the offset from the stop codon
1376 $cdna_coord -= $stop_codon;
1377 # Prepend a * to indicate the position is in the 3' UTR
1378 $cdna_coord = '*' . $cdna_coord;
1379 }
1380 elsif (defined($start_codon)) {
1381 # If the position is beyond the start codon, add 1 to get the correct offset
1382 $cdna_coord += ($cdna_coord >= $start_codon);
1383 # Subtract the position of the start codon
1384 $cdna_coord -= $start_codon;
1385 }
1386
1387 # Re-assemble the cDNA position [ return exon num & offset & direction for intron eg. 142+363]
1388 $cdna_position = $cdna_coord . (defined($intron_offset) ? $intron_offset : '');
1389 return $cdna_position;
1390 }
1391
1392
1393 1;
1394
1395