Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/TranscriptVariationAllele.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1406 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::TranscriptVariationAllele + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::TranscriptVariationAllele; + + my $tva = Bio::EnsEMBL::Variation::TranscriptVariationAllele->new( + -transcript_variation => $tv, + -variation_feature_seq => 'A', + -is_reference => 0, + ); + + print "sequence with respect to the transcript: ", $tva->feature_seq, "\n"; + print "sequence with respect to the variation feature: ", $tva->variation_feature_seq, "\n"; + print "consequence SO terms: ", (join ",", map { $_->SO_term } @{ $tva->get_all_OverlapConsequences }), "\n"; + print "amino acid change: ", $tva->peptide_allele_string, "\n"; + print "resulting codon: ", $tva->codon, "\n"; + print "reference codon: ", $tva->transcript_variation->get_reference_TranscriptVariationAllele->codon, "\n"; + print "PolyPhen prediction: ", $tva->polyphen_prediction, "\n"; + print "SIFT prediction: ", $tva->sift_prediction, "\n"; + +=head1 DESCRIPTION + +A TranscriptVariationAllele object represents a single allele of a TranscriptVariation. +It provides methods that are specific to the sequence of the allele, such as codon, +peptide etc. Methods that depend only on position (e.g. CDS start) will be found in +the associated TranscriptVariation. Ordinarily you will not create these objects +yourself, but instead you would create a TranscriptVariation object which will then +construct TranscriptVariationAlleles based on the allele string of the associated +VariationFeature. + +Note that any methods that are not specific to Transcripts will be found in the +VariationFeatureOverlapAllele superclass. + +=cut + +package Bio::EnsEMBL::Variation::TranscriptVariationAllele; + +use strict; +use warnings; + +use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix qw($AA_LOOKUP); +use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(hgvs_variant_notation format_hgvs_string); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(within_cds within_intron stop_lost affects_start_codon); + +use base qw(Bio::EnsEMBL::Variation::VariationFeatureOverlapAllele Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele); + + +our $DEBUG = 0; + +sub new_fast { + my ($class, $hashref) = @_; + + # swap a transcript_variation argument for a variation_feature_overlap one + + if ($hashref->{transcript_variation}) { + $hashref->{variation_feature_overlap} = delete $hashref->{transcript_variation}; + } + + # and call the superclass + + return $class->SUPER::new_fast($hashref); +} + +=head2 transcript_variation + + Description: Get/set the associated TranscriptVariation + Returntype : Bio::EnsEMBL::Variation::TranscriptVariation + Exceptions : throws if the argument is the wrong type + Status : At Risk + +=cut + +sub transcript_variation { + my ($self, $tv) = @_; + assert_ref($tv, 'Bio::EnsEMBL::Variation::TranscriptVariation') if $tv; + return $self->variation_feature_overlap($tv); +} + +=head2 variation_feature + + Description: Get the associated VariationFeature + Returntype : Bio::EnsEMBL::Variation::VariationFeature + Exceptions : none + Status : At Risk + +=cut + +sub variation_feature { + my $self = shift; + return $self->transcript_variation->variation_feature; +} + +=head2 pep_allele_string + + Description: Return a '/' delimited string of the reference peptide and the + peptide resulting from this allele, or a single peptide if this + allele does not change the peptide (e.g. because it is synonymous) + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub pep_allele_string { + my ($self) = @_; + + my $pep = $self->peptide; + + return undef unless $pep; + + my $ref_pep = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide; + + return undef unless $ref_pep; + + return $ref_pep ne $pep ? $ref_pep.'/'.$pep : $pep; +} + +=head2 codon_allele_string + + Description: Return a '/' delimited string of the reference codon and the + codon resulting from this allele + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub codon_allele_string { + my ($self) = @_; + + my $codon = $self->codon; + + return undef unless $codon; + + my $ref_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->codon; + + return $ref_codon.'/'.$codon; +} + +=head2 display_codon_allele_string + + Description: Return a '/' delimited string of the reference display_codon and the + display_codon resulting from this allele. The display_codon identifies + the nucleotides affected by this variant in UPPER CASE and other + nucleotides in lower case + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub display_codon_allele_string { + my ($self) = @_; + + my $display_codon = $self->display_codon; + + return undef unless $display_codon; + + my $ref_display_codon = $self->transcript_variation->get_reference_TranscriptVariationAllele->display_codon; + + return undef unless $ref_display_codon; + + return $ref_display_codon.'/'.$display_codon; +} + +=head2 peptide + + Description: Return the amino acid sequence that this allele is predicted to result in + Returntype : string or undef if this allele is not in the CDS or is a frameshift + Exceptions : none + Status : At Risk + +=cut + +sub peptide { + my ($self, $peptide) = @_; + + $self->{peptide} = $peptide if $peptide; + + unless ($self->{peptide}) { + + return undef unless $self->seq_is_unambiguous_dna; + + if (my $codon = $self->codon) { + + # the codon method can set the peptide in some circumstances + # so check here before we try an (expensive) translation + return $self->{peptide} if $self->{peptide}; + + # translate the codon sequence to establish the peptide allele + + # allow for partial codons - split the sequence into whole and partial + # e.g. AAAGG split into AAA and GG + my $whole_codon = substr($codon, 0, int(length($codon) / 3) * 3); + my $partial_codon = substr($codon, int(length($codon) / 3) * 3); + + my $pep = ''; + + if($whole_codon) { + # for mithocondrial dna we need to to use a different codon table + my $codon_table = $self->transcript_variation->_codon_table; + + my $codon_seq = Bio::Seq->new( + -seq => $whole_codon, + -moltype => 'dna', + -alphabet => 'dna', + ); + + $pep .= $codon_seq->translate(undef, undef, undef, $codon_table)->seq; + } + + if($partial_codon) { + $pep .= 'X'; + } + + $pep ||= '-'; + + $self->{peptide} = $pep; + } + } + + return $self->{peptide}; +} + +=head2 codon + + Description: Return the codon sequence that this allele is predicted to result in + Returntype : string or undef if this allele is not in the CDS or is a frameshift + Exceptions : none + Status : At Risk + +=cut + +sub codon { + my ($self, $codon) = @_; + + $self->{codon} = $codon if defined $codon; + + my $tv = $self->transcript_variation; + + return undef unless $tv->translation_start && $tv->translation_end; + + return undef unless $self->seq_is_dna; + + unless ($self->{codon}) { + + # try to calculate the codon sequence + + my $seq = $self->feature_seq; + + $seq = '' if $seq eq '-'; + + # calculate necessary coords and lengths + + my $codon_cds_start = $tv->translation_start * 3 - 2; + my $codon_cds_end = $tv->translation_end * 3; + my $codon_len = $codon_cds_end - $codon_cds_start + 1; + my $vf_nt_len = $tv->cds_end - $tv->cds_start + 1; + my $allele_len = $self->seq_length; + + if ($allele_len != $vf_nt_len) { + if (abs($allele_len - $vf_nt_len) % 3) { + # this is a frameshift variation, we don't attempt to + # calculate the resulting codon or peptide change as this + # could get quite complicated + return undef; + } + } + + # splice the allele sequence into the CDS + + my $cds = $tv->_translateable_seq; + + substr($cds, $tv->cds_start-1, $vf_nt_len) = $seq; + + # and extract the codon sequence + + my $codon = substr($cds, $codon_cds_start-1, $codon_len + ($allele_len - $vf_nt_len)); + + if (length($codon) < 1) { + $self->{codon} = '-'; + $self->{peptide} = '-'; + } + else { + $self->{codon} = $codon; + } + } + + return $self->{codon}; +} + +=head2 display_codon + + Description: Return the codon sequence that this allele is predicted to result in + with the affected nucleotides identified in UPPER CASE and other + nucleotides in lower case + Returntype : string or undef if this allele is not in the CDS or is a frameshift + Exceptions : none + Status : At Risk + +=cut + +sub display_codon { + my $self = shift; + + unless ($self->{_display_codon}) { + + if ($self->codon && defined $self->transcript_variation->codon_position) { + + my $display_codon = lc $self->codon; + + # if this allele is an indel then just return all lowercase + + if ($self->feature_seq ne '-') { + + # codon_position is 1-based, while substr assumes the string starts at 0 + + my $pos = $self->transcript_variation->codon_position - 1; + + my $len = length $self->feature_seq; + + substr($display_codon, $pos, $len) = uc substr($display_codon, $pos, $len); + } + + $self->{_display_codon} = $display_codon; + } + } + + return $self->{_display_codon}; +} + +=head2 polyphen_prediction + + Description: Return the qualitative PolyPhen-2 prediction for the effect of this allele. + (Note that we currently only have PolyPhen predictions for variants that + result in single amino acid substitutions in human) + Returntype : string (one of 'probably damaging', 'possibly damaging', 'benign', 'unknown') + if this is a non-synonymous mutation and a prediction is available, undef + otherwise + Exceptions : none + Status : At Risk + +=cut + +sub polyphen_prediction { + my ($self, $classifier, $polyphen_prediction) = @_; + + $classifier ||= 'humvar'; + + my $analysis = "polyphen_${classifier}"; + + $self->{$analysis}->{prediction} = $polyphen_prediction if $polyphen_prediction; + + unless ($self->{$analysis}->{prediction}) { + my ($prediction, $score) = $self->_protein_function_prediction($analysis); + $self->{$analysis}->{score} = $score; + $self->{$analysis}->{prediction} = $prediction; + } + + return $self->{$analysis}->{prediction}; +} + +=head2 polyphen_score + + Description: Return the PolyPhen-2 probability that this allele is deleterious (Note that we + currently only have PolyPhen predictions for variants that result in single + amino acid substitutions in human) + Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is + available, undef otherwise + Exceptions : none + Status : At Risk + +=cut + +sub polyphen_score { + my ($self, $classifier, $polyphen_score) = @_; + + $classifier ||= 'humvar'; + + my $analysis = "polyphen_${classifier}"; + + $self->{$analysis}->{score} = $polyphen_score if defined $polyphen_score; + + unless ($self->{$analysis}->{score}) { + my ($prediction, $score) = $self->_protein_function_prediction($analysis); + $self->{$analysis}->{score} = $score; + $self->{$analysis}->{prediction} = $prediction; + } + + return $self->{$analysis}->{score}; +} + +=head2 sift_prediction + + Description: Return the qualitative SIFT prediction for the effect of this allele. + (Note that we currently only have SIFT predictions for variants that + result in single amino acid substitutions in human) + Returntype : string (one of 'tolerated', 'deleterious') if this is a non-synonymous + mutation and a prediction is available, undef otherwise + Exceptions : none + Status : At Risk + +=cut + +sub sift_prediction { + my ($self, $sift_prediction) = @_; + + $self->{sift_prediction} = $sift_prediction if $sift_prediction; + + unless ($self->{sift_prediction}) { + my ($prediction, $score) = $self->_protein_function_prediction('sift'); + $self->{sift_score} = $score; + $self->{sift_prediction} = $prediction unless $self->{sift_prediction}; + } + + return $self->{sift_prediction}; +} + +=head2 sift_score + + Description: Return the SIFT score for this allele (Note that we currently only have SIFT + predictions for variants that result in single amino acid substitutions in human) + Returntype : float between 0 and 1 if this is a non-synonymous mutation and a prediction is + available, undef otherwise + Exceptions : none + Status : At Risk + +=cut + +sub sift_score { + my ($self, $sift_score) = @_; + + $self->{sift_score} = $sift_score if defined $sift_score; + + unless ($self->{sift_score}) { + my ($prediction, $score) = $self->_protein_function_prediction('sift'); + $self->{sift_score} = $score; + $self->{sift_prediction} = $prediction; + } + + return $self->{sift_score}; +} + + +sub _protein_function_prediction { + my ($self, $analysis) = @_; + + # we can only get results for variants that cause a single amino acid substitution, + # so check the peptide allele string first + + if ($self->pep_allele_string && $self->pep_allele_string =~ /^[A-Z]\/[A-Z]$/ && defined $AA_LOOKUP->{$self->peptide}) { + + if (my $matrix = $self->transcript_variation->_protein_function_predictions($analysis)) { + + my ($prediction, $score) = $matrix->get_prediction( + $self->transcript_variation->translation_start, + $self->peptide, + ); + + return wantarray ? ($prediction, $score) : $prediction; + } + } + + return undef; +} + +=head2 hgvs_genomic + + Description: Return a string representing the genomic-level effect of this allele in HGVS format + Returntype : string + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_genomic { + return _hgvs_generic(@_,'genomic'); +} + +=head2 hgvs_coding + + Description: Return a string representing the CDS-level effect of this allele in HGVS format + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_coding { + + deprecate('HGVS coding support has been moved to hgvs_transcript. This method will be removed in the next release.'); + return hgvs_transcript(@_); +} + + +=head2 hgvs_transcript + + Description: Return a string representing the CDS-level effect of this allele in HGVS format + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + + +sub hgvs_transcript { + + + my $self = shift; + my $notation = shift; + + ##### set if string supplied + $self->{hgvs_transcript} = $notation if defined $notation; + ##### return if held + return $self->{hgvs_transcript} if defined $self->{hgvs_transcript} ; + return $self->{hgvs_coding} if defined $self->{hgvs_coding} ; + + my $variation_feature_sequence = $self->variation_feature_seq() ; + + ### don't try to handle odd characters + return undef if $variation_feature_sequence =~ m/[^ACGT\-]/ig; + + ### no result for reference allele + return undef if $self->is_reference ==1; + + ### else evaluate + + ### get reference sequence strand + my $refseq_strand = $self->transcript_variation->transcript->strand(); + + if($DEBUG ==1){ + my $var_name = $self->transcript_variation->variation_feature->variation_name(); + print "\nHGVS transcript: Checking $var_name refseq strand => $refseq_strand seq name : " . $self->transcript_variation->transcript_stable_id() . " var strand " . $self->transcript_variation->variation_feature->strand() . " vf st " . $self->variation_feature->strand() ." seqname: " . $self->variation_feature->seqname() ." seq: " . $self->variation_feature_seq ."\n"; + } + + my $hgvs_notation ; ### store components of HGVS string in hash + + ### vf strand is relative to transcript or transcript slice + if( $self->transcript_variation->variation_feature->strand() <0 && $refseq_strand >0 || + $self->transcript_variation->variation_feature->strand() >0 && $refseq_strand < 0 + ){ + reverse_comp(\$variation_feature_sequence); + } + + ### need to get ref seq from slice transcript is on for intron labelling + my ($slice_start, $slice_end, $slice) = $self->_var2transcript_slice_coords(); + + + unless($slice_start){ + #warn "TVA: VF not within transcript - no HGVS\n"; + return undef; + } + + ### decide event type from HGVS nomenclature + $hgvs_notation = hgvs_variant_notation( $variation_feature_sequence, ### alt_allele, + $slice->seq(), ### using this to extract ref allele + $slice_start, + $slice_end + ); + + ### This should not happen + unless($hgvs_notation->{'type'}){ + #warn "Error - not continuing; no HGVS annotation\n"; + return undef; + } + + ### create reference name - transcript name & seq version + $hgvs_notation->{'ref_name'} = $self->transcript_variation->transcript_stable_id() . "." . $self->transcript_variation->transcript->version(); + + + ### get position relative to transcript features [use HGVS coords not variation feature coords due to dups] + $hgvs_notation->{start} = $self->_get_cDNA_position( $hgvs_notation->{start} ); + $hgvs_notation->{end} = $self->_get_cDNA_position( $hgvs_notation->{end} ); + + + # Make sure that start is always less than end + my ($exon_start_coord, $intron_start_offset) = $hgvs_notation->{start} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/; + my ($exon_end_coord, $intron_end_offset) = $hgvs_notation->{end} =~ m/(\-?[0-9]+)\+?(\-?[0-9]+)?/; + $intron_start_offset ||= 0; + $intron_end_offset ||= 0; + + ($hgvs_notation->{start},$hgvs_notation->{end}) = ($hgvs_notation->{end},$hgvs_notation->{start}) if + (($exon_start_coord + $intron_start_offset) > ($exon_end_coord + $intron_end_offset)); + + + + if($self->transcript->cdna_coding_start()){ + $hgvs_notation->{'numbering'} = "c"; ### set 'c' if transcript is coding + } + else{ + $hgvs_notation->{'numbering'} = "n"; ### set 'n' if transcript non-coding + } + ### generic formatting + $self->{hgvs_transcript} = format_hgvs_string( $hgvs_notation); + + if($DEBUG ==1){print "HGVS notation for var " . $self->transcript_variation->variation_feature->variation_name() . " from hgvs transcript : " . $self->{hgvs_transcript} . " \n";} + + return $self->{hgvs_transcript}; + +} + + +=head2 hgvs_protein + + Description: Return a string representing the protein-level effect of this allele in HGVS format + Returntype : string or undef if this allele is not in the CDS + Exceptions : none + Status : At Risk + +=cut + +sub hgvs_protein { + + my $self = shift; + my $notation = shift; + my $hgvs_notation; + + if($DEBUG ==1){print "\nStarting hgvs_protein with ". $self->transcript_variation->variation_feature->variation_name() . "\n"; } + + ### set if string supplied + $self->{hgvs_protein} = $notation if defined $notation; + + ### return if set + return $self->{hgvs_protein} if defined $self->{hgvs_protein} ; + + ### don't try to handle odd characters + return undef if $self->variation_feature_seq() =~ m/[^ACGT\-]/ig; + + #### else, check viable input and create notation + return if $self->is_reference(); + + unless ($self->transcript_variation->translation_start() && $self->transcript_variation->translation_end()){ + return undef ; + } + + #### for debug + #my $var_name = $self->transcript_variation->variation_feature->variation_name(); + + ### get reference sequence [add seq version to transcript name] + $hgvs_notation->{ref_name} = $self->transcript_variation->transcript->translation()->display_id() . "." . $self->transcript_variation->transcript->translation()->version(); + + $hgvs_notation->{'numbering'} = 'p'; + + ### get default reference location [changed later in some cases eg. duplication] + $hgvs_notation->{start} = $self->transcript_variation->translation_start(); + $hgvs_notation->{end} = $self->transcript_variation->translation_end(); + + + ## get default reference & alt peptides [changed later to hgvs format] + $hgvs_notation->{alt} = $self->peptide; + $hgvs_notation->{ref} = $self->transcript_variation->get_reference_TranscriptVariationAllele->peptide; + + if(defined $hgvs_notation->{alt} && defined $hgvs_notation->{ref}){ + $hgvs_notation = _clip_alleles( $hgvs_notation); + } + + #### define type - types are different for protein numbering + $hgvs_notation = $self->_get_hgvs_protein_type($hgvs_notation); + + ##### Convert ref & alt peptides taking into account HGVS rules + $hgvs_notation = $self->_get_hgvs_peptides($hgvs_notation); + + ##### String formatting + $self->{hgvs_protein} = $self->_get_hgvs_protein_format($hgvs_notation); + return $self->{hgvs_protein} ; +} + +### HGVS: format protein string +sub _get_hgvs_protein_format{ + + my $self = shift; + my $hgvs_notation = shift; + + if ((defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} && + $hgvs_notation->{ref} eq $hgvs_notation->{alt}) || + (defined $hgvs_notation->{type} && $hgvs_notation->{type} eq "=")){ + + ### no protein change - return transcript nomenclature with flag for neutral protein consequence + $hgvs_notation->{'hgvs'} = $self->hgvs_transcript() . "(p.=)"; + return $hgvs_notation->{'hgvs'} ; + } + + ### all start with refseq name & numbering type + $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . "."; + + ### handle stop_lost seperately regardless of cause by del/delins => p.XposAA1extnum_AA_to_stop + if(stop_lost($self)){ ### if deletion of stop add extX and number of new aa to alt + $hgvs_notation->{alt} = substr($hgvs_notation->{alt}, 0, 3); + if($hgvs_notation->{type} eq "del"){ + my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "del"); + if(defined $aa_til_stop){ + $hgvs_notation->{alt} .= "extX" . $aa_til_stop; + } + } + elsif($hgvs_notation->{type} eq ">"){ + my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "subs"); + if(defined $aa_til_stop){ + $hgvs_notation->{alt} .= "extX" . $aa_til_stop; + } + } + else{ + # warn "TVA: stop loss for type $hgvs_notation->{type} not caught \n"; + } + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; + } + + elsif( $hgvs_notation->{type} eq "dup"){ + + if($hgvs_notation->{start} < $hgvs_notation->{end}){ + ### list only first and last peptides in long duplicated string + my $ref_pep_first = substr($hgvs_notation->{alt}, 0, 3); + my $ref_pep_last = substr($hgvs_notation->{alt}, -3, 3); + $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} ."dup"; + + } + else{ + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{alt} . $hgvs_notation->{start} . "dup" ; + } + + print "formating a dup $hgvs_notation->{hgvs} \n" if $DEBUG==1; + } + + elsif($hgvs_notation->{type} eq ">"){ + #### substitution + + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt}; + } + + elsif( $hgvs_notation->{type} eq "delins" || $hgvs_notation->{type} eq "ins" ){ + + #### list first and last aa in reference only + my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3); + my $ref_pep_last; + if(substr($hgvs_notation->{ref}, -1, 1) eq "X"){ + $ref_pep_last ="X"; + } + else{ + $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3); + } + if($hgvs_notation->{ref} =~ /X$/){ + ### For stops & add extX & distance to next stop to alt pep + my $aa_til_stop = _stop_loss_extra_AA($self,$hgvs_notation->{start}-1, "loss"); + if(defined $aa_til_stop){ + $hgvs_notation->{alt} .="extX" . $aa_til_stop; + } + } + if($hgvs_notation->{start} == $hgvs_notation->{end} && $hgvs_notation->{type} eq "delins"){ + $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ; + } + else{ + ### correct ordering if needed + if($hgvs_notation->{start} > $hgvs_notation->{end}){ + ( $hgvs_notation->{start}, $hgvs_notation->{end}) = ($hgvs_notation->{end}, $hgvs_notation->{start} ); + } + + $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} . $hgvs_notation->{type} . $hgvs_notation->{alt} ; + } + } + + elsif($hgvs_notation->{type} eq "fs"){ + if(defined $hgvs_notation->{alt} && $hgvs_notation->{alt} eq "X"){ ## stop gained + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; + + } + else{ ## not immediate stop - count aa until next + + my $aa_til_stop = _stop_loss_extra_AA($self, $hgvs_notation->{start}-1, "fs"); + if($aa_til_stop ){ + ### use long form if new stop found + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} . "fsX$aa_til_stop" ; + } + else{ + ### otherwise use short form + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "fs"; + } + } + } + + elsif( $hgvs_notation->{type} eq "del"){ + $hgvs_notation->{alt} = "del"; + if( length($hgvs_notation->{ref}) >3 ){ + my $ref_pep_first = substr($hgvs_notation->{ref}, 0, 3); + my $ref_pep_last = substr($hgvs_notation->{ref}, -3, 3); + $hgvs_notation->{'hgvs'} .= $ref_pep_first . $hgvs_notation->{start} . "_" . $ref_pep_last . $hgvs_notation->{end} .$hgvs_notation->{alt} ; + } + else{ + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . $hgvs_notation->{alt} ; + } + } + + elsif($hgvs_notation->{start} ne $hgvs_notation->{end} ){ + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref} . $hgvs_notation->{start} . "_" . $hgvs_notation->{alt} . $hgvs_notation->{end} ; + } + + else{ + #### default to substitution + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{ref}. $hgvs_notation->{start} . $hgvs_notation->{alt}; + } + + if($DEBUG==1){ print "Returning protein format: $hgvs_notation->{'hgvs'}\n";} + return $hgvs_notation->{'hgvs'}; +} + +### HGVS: get type of variation event in protein terms +sub _get_hgvs_protein_type{ + + my $self = shift; + my $hgvs_notation = shift; + + ### get allele length + my ($ref_length, $alt_length ) = $self->_get_allele_length(); + + + if( defined $hgvs_notation->{ref} && defined $hgvs_notation->{alt} ){ + ### Run type checks on peptides if available + + if($hgvs_notation->{alt} eq $hgvs_notation->{ref} ){ + #### synonymous indicated by = + $hgvs_notation->{type} = "="; + } + elsif( length($hgvs_notation->{ref}) ==1 && length($hgvs_notation->{alt}) ==1 ) { + + $hgvs_notation->{type} = ">"; + } + elsif($hgvs_notation->{ref} eq "-" || $hgvs_notation->{ref} eq "") { + + $hgvs_notation->{type} = "ins"; + } + elsif($hgvs_notation->{alt} eq "" ) { + + $hgvs_notation->{type} = "del"; + } + elsif( (length($hgvs_notation->{alt}) > length($hgvs_notation->{ref}) ) && + $hgvs_notation->{alt} =~ / $hgvs_notation->{ref}/ ) { + ### capture duplication event described as TTT/TTTTT + $hgvs_notation->{type} = "dup"; + } + + elsif( (length($hgvs_notation->{alt}) >1 && length($hgvs_notation->{ref}) ==1) || + (length($hgvs_notation->{alt}) ==1 && length($hgvs_notation->{ref}) >1) ) { + $hgvs_notation->{type} = "delins"; + } + else{ + $hgvs_notation->{type} = ">"; + } + } + + + elsif($ref_length ne $alt_length && ($ref_length - $alt_length)%3 !=0 ){ + $hgvs_notation->{type} = "fs"; + } + + elsif(length($self->variation_feature_seq()) >1 ){ + if($hgvs_notation->{start} == ($hgvs_notation->{end} + 1) ){ + ### convention for insertions - end one less than start + $hgvs_notation->{type} = "ins"; + } + elsif( $hgvs_notation->{start} != $hgvs_notation->{end} ){ + $hgvs_notation->{type} = "delins"; + } + else{ + $hgvs_notation->{type} = ">"; + } + } + else{ + #print STDERR "DEBUG ".$self->variation_feature->start."\n"; + #warn "Cannot define protein variant type [$ref_length - $alt_length]\n"; + } + return $hgvs_notation ; + +} + +### HGVS: get reference & alternative peptide +sub _get_hgvs_peptides{ + + my $self = shift; + my $hgvs_notation = shift; + + if($hgvs_notation->{type} eq "fs"){ + ### ensembl alt/ref peptides not the same as HGVS alt/ref - look up seperately + $hgvs_notation = $self->_get_fs_peptides($hgvs_notation); + } + elsif($hgvs_notation->{type} eq "ins" ){ + + ### HGVS ref are peptides flanking insertion + $hgvs_notation->{ref} = $self->_get_surrounding_peptides($hgvs_notation->{start}); + + if( $hgvs_notation->{alt} =~/\*/){ + ## inserted bases after stop irrelevant; consider as substitution gaining stop MAINTAIN PREVIOUS BEHAVIOUR FOR NOW + #$hgvs_notation->{alt} =~ s/\*\w+$/\*/; + } + else{ + ### Additional check that inserted bases do not duplicate 3' reference sequence [set to type = dup if so] + $hgvs_notation = $self->_check_for_peptide_duplication($hgvs_notation); + } + } + + + ### Convert peptide to 3 letter code as used in HGVS + $hgvs_notation->{ref} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{ref}, -id => 'ref', -alphabet => 'protein')) || ""; + if( $hgvs_notation->{alt} eq "-"){ + $hgvs_notation->{alt} = "del"; + } + else{ + $hgvs_notation->{alt} = Bio::SeqUtils->seq3(Bio::PrimarySeq->new(-seq => $hgvs_notation->{alt}, -id => 'ref', -alphabet => 'protein')) || ""; + } + + ### handle special cases + if( affects_start_codon($self) ){ + #### handle initiator loss -> probably no translation => alt allele is '?' + $hgvs_notation->{alt} = "?"; + $hgvs_notation->{type} = ""; + } + + elsif( $hgvs_notation->{type} eq "del"){ + #### partial last codon + $hgvs_notation->{alt} = "del"; + } + elsif($hgvs_notation->{type} eq "fs"){ + ### only quote first ref peptide for frameshift + $hgvs_notation->{ref} = substr($hgvs_notation->{ref},0,3); + } + + ### set stop to HGVS prefered "X" + if(defined $hgvs_notation->{ref}){ $hgvs_notation->{ref} =~ s/Ter|Xaa/X/g; } + if(defined $hgvs_notation->{alt}){ $hgvs_notation->{alt} =~ s/Ter|Xaa/X/g; } + + return ($hgvs_notation); + +} + +### HGVS: remove common peptides from alt and ref strings & shift start/end accordingly +sub _clip_alleles{ + + my $hgvs_notation = shift; + + my $check_alt = $hgvs_notation->{alt} ; + my $check_ref = $hgvs_notation->{ref} ; + my $check_start = $hgvs_notation->{start}; + + ### strip same bases from start of string + for (my $p =0; $p <length ($hgvs_notation->{ref}); $p++){ + my $check_next_ref = substr( $check_ref, 0, 1); + my $check_next_alt = substr( $check_alt, 0, 1); + + if($check_next_ref eq "*" && $check_next_alt eq "*"){ + ### stop re-created by variant - no protein change + $hgvs_notation->{type} = "="; + + return($hgvs_notation); + } + + if($check_next_ref eq $check_next_alt){ + $check_start++; + $check_ref = substr( $check_ref, 1); + $check_alt = substr( $check_alt, 1); + } + else{ + last; + } + } + #### strip same bases from end of string + for (my $q =0; $q < length ($check_ref); $q++){ + my $check_next_ref = substr( $check_ref, -1, 1); + my $check_next_alt = substr( $check_alt, -1, 1); + if($check_next_ref eq $check_next_alt){ + chop $check_ref; + chop $check_alt; + } + else{ + last; + } + } + + $hgvs_notation->{alt} = $check_alt; + $hgvs_notation->{ref} = $check_ref; + + ### check if clipping force type change & adjust location + if(length ($check_ref) == 0 && length ($check_alt) >= 1){ + ### re-set as ins not delins + $hgvs_notation->{type} ="ins"; + ### insertion between ref base and next => adjust next + if($hgvs_notation->{end} == $hgvs_notation->{start} ){$hgvs_notation->{end} = $check_start; } +# $hgvs_notation->{start} = $check_start; + } + elsif(length ($check_ref) >=1 && length ($check_alt) == 0){ + ### re-set as del not delins + $hgvs_notation->{type} ="del"; + $hgvs_notation->{start} = $check_start; + } + else{ + #### save trimmed peptide string & increased start position + $hgvs_notation->{start} = $check_start; + + } + + return $hgvs_notation; +} + + + + + +#### HGVS: check allele lengths to look for frameshifts +sub _get_allele_length{ + + my $self = shift; + my $ref_length = 0; + my $alt_length = 0; + + my $al_string = $self->allele_string(); + my $ref_allele = (split/\//, $al_string)[0]; + $ref_allele =~ s/\-//; + $ref_length = length $ref_allele; + + my $alt_allele = $self->variation_feature_seq(); + $alt_allele =~ s/\-//; + $alt_length = length $alt_allele; + + return ($ref_length, $alt_length ); + +} + +### HGVS: list first different peptide [may not be first changed codon] +sub _get_fs_peptides{ + + my $self = shift; + my $hgvs_notation = shift; + + ### get CDS with alt variant + my $alt_cds = $self->_get_alternate_cds(); + return undef unless defined($alt_cds); + + #### get new translation + my $alt_trans = $alt_cds->translate()->seq(); + + ### get changed end (currently in single letter AA coding) + my $ref_trans = $self->transcript->translate()->seq(); + $ref_trans .= "*"; ## appending ref stop for checking purposes + + $hgvs_notation->{start} = $self->transcript_variation->translation_start() ; + + if( $hgvs_notation->{start} > length $alt_trans){ ## deletion of stop, no further AA in alt seq + $hgvs_notation->{alt} = "del"; + $hgvs_notation->{type} = "del"; + return $hgvs_notation; + } + + while ($hgvs_notation->{start} <= length $alt_trans){ + ### frame shift may result in the same AA# + + $hgvs_notation->{ref} = substr($ref_trans, $hgvs_notation->{start}-1, 1); + $hgvs_notation->{alt} = substr($alt_trans, $hgvs_notation->{start}-1, 1); + + if($hgvs_notation->{ref} eq "*" && $hgvs_notation->{alt} eq "*"){ + ### variation at stop codon, but maintains stop codon + return ($hgvs_notation); + } + last if $hgvs_notation->{ref} ne $hgvs_notation->{alt}; + $hgvs_notation->{start}++; + } + return ($hgvs_notation); + +} + +#### HGVS: if variant is an insertion, ref pep is initially "-", so seek AA before and after insertion +sub _get_surrounding_peptides{ + + my $self = shift; + my $ref_pos = shift; + my $ref_trans = $self->transcript->translate()->seq(); + + my $end = substr($ref_trans, $ref_pos-1); + my $ref_string = substr($ref_trans, $ref_pos-1, 2); + + return ($ref_string); + +} + + +#### HGVS: alternate CDS needed to check for new stop when variant disrupts 'reference' stop +sub _get_alternate_cds{ + + my $self = shift; + + ### get reference sequence + my $reference_cds_seq = $self->transcript->translateable_seq(); + + return undef unless defined($self->transcript_variation->cds_start) && defined($self->transcript_variation->cds_end()); + + ### get sequences upstream and downstream of variant + my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) ); + my $downstream_seq = substr($reference_cds_seq, ($self->transcript_variation->cds_end() ) ); + + ### fix alternate allele if deletion or on opposite strand + my $alt_allele = $self->variation_feature_seq(); + $alt_allele =~ s/\-//; + if( $self->transcript_variation->variation_feature->strand() <0 && $self->transcript_variation->transcript->strand() >0 || + $self->transcript_variation->variation_feature->strand() >0 && $self->transcript_variation->transcript->strand() < 0 + ){ + reverse_comp(\$alt_allele) ; + } + ### build alternate seq + my $alternate_seq = $upstream_seq . $alt_allele . $downstream_seq ; + + ### create seq obj with alternative allele in the CDS sequence + my $alt_cds =Bio::PrimarySeq->new(-seq => $alternate_seq, -id => 'alt_cds', -alphabet => 'dna'); + + ### append UTR if available as stop may be disrupted + my $utr = $self->transcript_variation->transcript->three_prime_utr(); + + if (defined $utr) { + ### append the UTR to the alternative CDS + $alt_cds->seq($alt_cds->seq() . $utr->seq()); + } + else{ + ##warn "No UTR available for alternate CDS\n"; + } + + return $alt_cds; +} + +### HGVS: if inserted string is identical to 3' reference sequence, describe as duplication +sub _check_for_peptide_duplication{ + + my $self = shift; + my $hgvs_notation= shift; + + ##### get reference sequence + my $reference_cds_seq = $self->transcript->translateable_seq(); + + ##### get sequence upstream of variant + my $upstream_seq = substr($reference_cds_seq, 0, ($self->transcript_variation->cds_start() -1) ); + + ##### create translation to check against inserted peptides + my $upstream_cds =Bio::PrimarySeq->new(-seq => $upstream_seq, -id => 'alt_cds', -alphabet => 'dna'); + my $upstream_trans = $upstream_cds->translate()->seq(); + + ## Test whether alt peptide matches the reference sequence just before the variant + my $test_new_start = $hgvs_notation->{'start'} - length($hgvs_notation->{'alt'}) -1 ; + my $test_seq = substr($upstream_trans, $test_new_start, length($hgvs_notation->{'alt'})); + + if ( $test_new_start >= 0 && $test_seq eq $hgvs_notation->{alt}) { + + $hgvs_notation->{type} = 'dup'; + $hgvs_notation->{end} = $hgvs_notation->{start} -1; + $hgvs_notation->{start} -= length($hgvs_notation->{alt}); + + } + + return $hgvs_notation; + +} + +#### HGVS: if a stop is lost, seek the next in transcript & count number of extra AA's +sub _stop_loss_extra_AA{ + + my $self = shift; + my $ref_var_pos = shift; ### first effected AA - supply for frameshifts + my $test = shift; + + return undef unless $ref_var_pos; + + my $extra_aa; + + ### get the sequence with variant added + my $alt_cds = $self->_get_alternate_cds(); + return undef unless defined($alt_cds); + + ### get new translation + my $alt_trans = $alt_cds->translate(); + + my $ref_temp = $self->transcript->translate()->seq; + my $ref_len = length($ref_temp); + + if($DEBUG==1){ + print "alt translated:\n" . $alt_trans->seq() . "\n"; + print "ref translated:\n$ref_temp\n";; + } + + #### Find the number of residues that are translated until a termination codon is encountered + if ($alt_trans->seq() =~ m/\*/) { + if($DEBUG==1){print "Got $+[0] aa before stop, var event at $ref_var_pos \n";} + + if($test eq "fs" ){ + ### frame shift - count from first AA effected by variant to stop + $extra_aa = $+[0] - $ref_var_pos; + if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before fs stop [ $+[0] - peptide ref_start: $ref_var_pos )]\n";} + } + + else{ + $extra_aa = $+[0] - 1 - $ref_len; + if($DEBUG==1){ print "Stop change ($test): found $extra_aa amino acids before next stop [ $+[0] - 1 -normal stop $ref_len)]\n";} + } + } + + # A special case is if the first aa is a stop codon => don't display the number of residues until the stop codon + if(defined $extra_aa && $extra_aa >0){ + return $extra_aa ; + } + else{ + #warn "No stop found in alternate sequence\n"; + return undef; + } + +} + +=head +# We haven't implemented support for these methods yet + +sub hgvs_rna { + return _hgvs_generic(@_,'rna'); +} + +sub hgvs_mitochondrial { + return _hgvs_generic(@_,'mitochondrial'); +} + +=cut + +sub _hgvs_generic { + + my $self = shift; + my $reference = pop; + my $notation = shift; + + #ĘThe rna and mitochondrial modes have not yet been implemented, so return undef in case we get a call to these + return undef if ($reference =~ m/rna|mitochondrial/); + + my $sub = qq{hgvs_$reference}; + + $self->{$sub} = $notation if defined $notation; + + unless ($self->{$sub}) { + + # Use the transcript this VF is on as the reference feature + my $reference_feature = $self->transcript; + # If we want genomic coordinates, the reference_feature should actually be the slice for the underlying seq_region + $reference_feature = $reference_feature->slice->seq_region_Slice if ($reference eq 'genomic'); + + # Calculate the HGVS notation on-the-fly and pass it to the TranscriptVariation in order to distribute the result to the other alleles + $self->transcript_variation->$sub($self->variation_feature->get_all_hgvs_notations($reference_feature,substr($reference,0,1),undef,undef,$self->transcript_variation)); + } + + return $self->{$sub}; +} + + +### HGVS: move variant to transcript slice +sub _var2transcript_slice_coords{ + + my $self = shift; + + my $ref_slice = $self->transcript->feature_Slice(); #### returns with strand same as feature + my $tr_vf = $self->variation_feature->transfer($ref_slice); + + # Return undef if this VariationFeature does not fall within the supplied feature. + return undef if (!defined $tr_vf || + $tr_vf->start < 1 || + $tr_vf->end < 1 || + $tr_vf->start > ($self->transcript->end - $self->transcript->start + 1) || + $tr_vf->end > ($self->transcript->end - $self->transcript->start + 1)); + + return( $tr_vf->start() , $tr_vf->end(), $ref_slice); +} + + + +### HGVS: get variant position in transcript + +# intron: +# If the position is in an intron, the boundary position of the closest exon and +# a + or - offset into the intron is returned. +# Ordered by genome forward not 5' -> 3' + +# upstream: +# If the position is 5' of the start codon, it is reported relative to the start codon +# (-1 being the last nucleotide before the 'A' of ATG). + +#downstream: +# If the position is 3' pf the stop codon, it is reported with a '*' prefix and the offset +# from the start codon (*1 being the first nucleotide after the last position of the stop codon) + +sub _get_cDNA_position { + + my $self = shift; + my $position = shift; ### start or end of variation + + my $transcript = $self->transcript(); + my $strand = $transcript->strand(); + + #### TranscriptVariation start/stop coord relative to transcript + #### Switch to chromosome coordinates taking into account strand + $position = ( $strand > 0 ? + ( $self->transcript->start() + $position - 1 ) : + ( $self->transcript->end() - $position + 1)); + + + + # Get all exons and sort them in positional order + my @exons = sort {$a->start() <=> $b->start()} @{$transcript->get_all_Exons()}; + my $n_exons = scalar(@exons); + + my $cdna_position; + # Loop over the exons and get the coordinates of the variation in exon+intron notation + for (my $i=0; $i<$n_exons; $i++) { + + # Skip if the start point is beyond this exon + next if ($position > $exons[$i]->end()); + + + # EXONIC: If the start coordinate is within this exon + if ($position >= $exons[$i]->start()) { + # Get the cDNA start coordinate of the exon and add the number of nucleotides from the exon boundary to the variation + # If the transcript is in the opposite direction, count from the end instead + $cdna_position = $exons[$i]->cdna_start($transcript) + ( $strand > 0 ? + ( $position - $exons[$i]->start ) : + ( $exons[$i]->end() - $position ) + ); + last; #### last exon checked + } + ## INTRONIC + # Else the start coordinate is between this exon and the previous one, determine which one is closest and get coordinates relative to that one + else { + + my $updist = ($position - $exons[$i-1]->end()); + my $downdist = ($exons[$i]->start() - $position); + + # If the distance to the upstream exon is the shortest, or equal and in the positive orientation, use that + if ($updist < $downdist || ($updist == $downdist && $strand >= 0)) { + + # If the orientation is reversed, we should use the cDNA start and a '-' offset + $cdna_position = ($strand >= 0 ? + $exons[$i-1]->cdna_end($transcript) . '+' : + $exons[$i-1]->cdna_start($transcript) . '-') + . $updist; + } + # Else if downstream is shortest... + else { + # If the orientation is reversed, we should use the cDNA end and a '+' offset + $cdna_position = ($strand >= 0 ? + $exons[$i]->cdna_start($transcript) . '-' : + $exons[$i]->cdna_end($transcript) . '+') + . $downdist; + } + last; ## last exon checked + } + } + + # Shift the position to make it relative to the start & stop codons + my $start_codon = $transcript->cdna_coding_start(); + my $stop_codon = $transcript->cdna_coding_end(); + + # Disassemble the cDNA coordinate into the exon and intron parts + ### just built this now taking it appart again + my ($cdna_coord, $intron_offset) = $cdna_position =~ m/([0-9]+)([\+\-][0-9]+)?/; + + + # Start by correcting for the stop codon + if (defined($stop_codon) && $cdna_coord > $stop_codon) { + # Get the offset from the stop codon + $cdna_coord -= $stop_codon; + # Prepend a * to indicate the position is in the 3' UTR + $cdna_coord = '*' . $cdna_coord; + } + elsif (defined($start_codon)) { + # If the position is beyond the start codon, add 1 to get the correct offset + $cdna_coord += ($cdna_coord >= $start_codon); + # Subtract the position of the start codon + $cdna_coord -= $start_codon; + } + + # Re-assemble the cDNA position [ return exon num & offset & direction for intron eg. 142+363] + $cdna_position = $cdna_coord . (defined($intron_offset) ? $intron_offset : ''); + return $cdna_position; +} + + +1; + +