diff variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptAlleles.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptAlleles.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,703 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+TranscriptAlleles - A utility class used to obtain information about the
+relationships between a transcript and Alleles
+
+=head1 SYNOPSIS
+
+  use Bio::EnsEMBL::Utils::TranscriptAlleles;
+
+  # get the peptide variations caused by a set of Alleles
+
+  %variations = %{
+    Bio::EnsEMBL::Utils::TranscriptAlleles::get_all_peptide_variations(
+      $transcript, $alleles ) };
+
+=head1 DESCRIPTION
+
+This is a utility class which can be used to find consequence type of an
+AlleleFeature in a transcript, and to determine the amino acid changes
+caused by the AlleleFeature in the Transcript
+
+
+=head1 METHODS
+
+=cut
+
+package Bio::EnsEMBL::Utils::TranscriptAlleles;
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Variation::ConsequenceType;
+use vars qw(@ISA @EXPORT_OK);
+
+use Data::Dumper;
+
+@ISA = qw(Exporter);
+
+@EXPORT_OK = qw(&get_all_ConsequenceType &type_variation);
+
+
+=head2 get_all_ConsequenceType
+
+  Arg [1]    : $transcript the transcript to obtain the peptide variations for
+  Arg [2]    : $alleles listref of AlleleFeatures
+  Example    : $consequence_types = get_all_ConsequenceType($transcript, \@alleles);
+               foreach my $ct (@{$consequence_types}){
+                  print "Allele : ", $ct->allele_string, " has a consequence type of :",$ct->type;
+                  print " and is affecting the transcript with ",@{$ct->aa_alleles}, "in position ", 
+		              $ct->aa_start,"-", $ct->aa_end if (defined $ct->aa_alleles);
+		  print "\n";
+	      }
+  Description: Takes a list of AlleleFeatures and a Transcritpt, and return a list
+                     of ConsequenceType of the alleles in the given Transcript
+  Returntype : listref of Bio::EnsEMBL::Variation::ConsequenceType
+  Exceptions : none
+  Caller     : general 
+
+=cut
+
+sub get_all_ConsequenceType {
+  my $transcript = shift;
+  my $alleles = shift;
+
+  if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
+    throw('Bio::EnsEMBL::Transcript argument is required.');
+  }
+
+  if(!ref($alleles) || (ref($alleles) ne 'ARRAY')) {
+    throw('Reference to a list of Bio::EnsEMBL::Variation::AlleleFeature objects is required');
+  }
+
+
+  my @alleles_ordered = sort { $a->start <=> $b->start} @$alleles; #sort the alleles by the genomic position
+  my @same_codon; #contains up to 3 allele features, that are in the same codon, but each position can contain more than 1 allele
+  my @out; #array containing the consequence types of the alleles in the transcript
+  foreach my $allele (@alleles_ordered) {
+#  foreach my $allele (@{$alleles}) {
+    #get consequence type of the AlleleFeature
+    # my $new_allele = $allele->transform('chromosome');
+    #my $consequence_type = Bio::EnsEMBL::Variation::ConsequenceType->new($transcript->dbID(),'',$allele->start,$allele->end,$allele->strand,[$allele->allele_string]);
+    ### REAL HACK BY js5 because something is borked in TranscriptMapper
+    ### This relies on the Allele being of the form i.e. a SNP! [ACGT-](/[ACGT-])+
+    ### The rest don't work anyway until we have a AlignStrainSlice
+    ### MUST BE SORTED....
+
+    #we have to consider het alleles
+    my $allele_string;
+    if ($allele->allele_string =~ /[\|\\\/]/){
+      my @alleles = split /[\|\\\/]/,$allele->allele_string;
+      if ($alleles[0] ne $allele->ref_allele_string){
+	$allele_string = $alleles[0];
+	  }
+      else{
+	$allele_string = $alleles[1];
+      }
+    }
+    else{
+      $allele_string = $allele->allele_string;  
+    }    
+    my $opposite_strand = 0; #to indicate wether transcript and allele and in different strands
+    my $transcript_allele = $allele_string;
+    if( $transcript->strand != $allele->strand ) {
+      $transcript_allele =~tr/ACGT/TGCA/;
+      $opposite_strand = 1;
+    }
+
+    my $consequence_type = Bio::EnsEMBL::Variation::ConsequenceType->new($transcript->dbID(),'',$allele->start, $allele->end, $transcript->strand, [$transcript_allele]);
+    #calculate the consequence type of the Allele if different from the reference Allele
+    #if (($opposite_strand && $allele->ref_allele_string eq $allele_string) || (!$opposite_strand && $allele->ref_allele_string eq $allele_string)){      	#same allele as reference, there is no consequence, called SARA
+    if ($allele->ref_allele_string eq $allele_string) {      	#same allele as reference, there is no consequence, called SARA
+      	#same allele as reference, there is no consequence, called SARA
+	#we have to calculate if there are more than 2 in the same codon
+	empty_codon(\@out,\@same_codon);	
+	$consequence_type->type('SARA');
+	push @out, $consequence_type;
+	next;
+    }
+	
+    my $ref_consequences = type_variation($transcript,"",$consequence_type);
+    if ($allele->start != $allele->end){
+	empty_codon(\@out,\@same_codon);
+	#do not calculate for indels effects of 2 or more in same codon
+	push @out, @{$ref_consequences};
+	next;
+    }
+
+    my $new_consequence = shift @{$ref_consequences};
+    if (! defined $new_consequence ) {
+	empty_codon(\@out,\@same_codon);
+      push @out, $consequence_type; # should be empty
+      next;
+    }
+
+    if ( !defined $new_consequence->aa_start){
+	empty_codon(\@out,\@same_codon);
+	push @out, $new_consequence;
+	next;
+    }
+    #first element of the codon
+    if (!defined $same_codon[0]){
+	push @{$same_codon[0]}, $new_consequence; #goes to the first position
+	next;
+    }
+    #for alleles with aa effect, find out if they are in the same codon
+    if ($same_codon[-1]->[0]->aa_start == $new_consequence->aa_start){
+	#they are in the same codon, find out if it is the same position
+	if ($same_codon[-1]->[0]->start == $new_consequence->start){
+	    #it is the same position
+	    push @{$same_codon[-1]},$new_consequence; #push in the last 
+	}
+	else{	    
+	    push @{$same_codon[$#same_codon + 1]},$new_consequence; #this is a new element in the codon
+	}
+
+    }
+    else{
+	#if there is more than one element in the same_codon array, calculate the effect of the codon
+	if (@same_codon > 1){
+	    calculate_same_codon(\@same_codon);
+	}
+	map {push @out, @{$_}} @same_codon;
+	@same_codon = ();
+	push @{$same_codon[0]}, $new_consequence; #push the element not in the same codon
+    }
+  }
+  #add last consequence_type
+  empty_codon(\@out,\@same_codon);
+
+  return \@out;
+}
+
+sub empty_codon{
+    my $out = shift;
+    my $same_codon = shift;
+
+    if (@{$same_codon} == 1){
+	map {push @{$out}, @{$_}} @{$same_codon};
+    }
+    elsif (@{$same_codon} > 1){
+	calculate_same_codon($same_codon);
+	map {push @{$out}, @{$_}} @{$same_codon};    
+    }
+    @{$same_codon} = ();
+}
+
+# recalculates the effect of 2 or 3 SNPs in the same codon
+sub calculate_same_codon{
+    my $same_codon = shift;
+    my $new_codon;
+    my $old_aa;
+    my $codon_table = Bio::Tools::CodonTable->new;
+    if (@{$same_codon} == 3){
+	#if there are 3 alleles in the same codon
+	map {$new_codon .= @{$_->[0]->alleles};$old_aa = $_->[0]->aa_alleles()->[0]} @{$same_codon};
+    }
+    else{
+	#if there are 2 alleles affecting the same codon
+	my $first_pos = ($same_codon->[0]->[0]->cdna_start -1) % 3; #position of the first allele in the codon
+	my $second_pos = ($same_codon->[1]->[0]->cdna_start -1)% 3; #position of the second allele in the codon
+	if ($first_pos == 0){
+	    #codon starts with first allele
+	    $new_codon = $same_codon->[0]->[0]->alleles->[0]; #first base in the codon
+	    if ($second_pos == 1){
+		$new_codon .= $same_codon->[1]->[0]->alleles->[0]; #second base in the codon
+		$new_codon .= substr($same_codon->[1]->[0]->codon,2,1); #third base in the codon
+	    }
+	    else{
+		$new_codon .= substr($same_codon->[1]->[0]->codon,1,1); #second base in the codon
+		$new_codon .= $same_codon->[1]->[0]->alleles->[0]; #third base in the codon
+	    }
+	}
+	else{
+	    #alleles are in position 1 and 2 in the codon
+	    $new_codon = substr($same_codon->[1]->[0]->codon,0,1); #first base in the codon
+	    $new_codon .= $same_codon->[0]->[0]->alleles->[0]; #second base in the codon
+	    $new_codon .= $same_codon->[1]->[0]->alleles->[0]; #third base in the codon
+	}
+	$old_aa = $same_codon->[0]->[0]->aa_alleles->[0];	
+    }
+    #calculate the new_aa
+    my $new_aa = $codon_table->translate($new_codon);
+    #and update the aa_alleles field in all the codons
+    foreach my $codon (@{$same_codon}){
+	map {$_->aa_alleles([$old_aa,$new_aa])} @{$codon};
+    }
+
+}
+#
+# Classifies a variation which is in the vicinity of a transcript
+#
+sub type_variation {
+  my $tr  = shift;
+  my $g   = shift;
+  my $var = shift;
+
+  my $UPSTREAM = 5000;
+  my $DOWNSTREAM = 5000;
+
+  #empty type first in the case of recursive call
+  $var->empty_type if defined $var->type;
+  
+  if (!$var->isa('Bio::EnsEMBL::Variation::ConsequenceType')) {
+      throw("Not possible to calculate the consequence type for ",ref($var)," : Bio::EnsEMBL::Variation::ConsequenceType object expected");
+  }
+
+  if (($var->start < $tr->start - $UPSTREAM) || ($var->end  > $tr->end + $DOWNSTREAM)){
+    #since the variation is more than UPSTREAM and DOWNSTREAM of the transcript, there is no effect in the transcript
+    return [];
+  }
+  
+  
+  # check the cache
+  my $tran_features = $tr->{_variation_effect_feature_cache};
+  
+  # populate it if not found 
+  unless ($tran_features) {
+	$tran_features = {
+	  mapper  => $tr->get_TranscriptMapper,
+	};
+
+    my ($attrib) = @{$tr->slice()->get_all_Attributes('codon_table')}; #for mithocondrial dna it is necessary to change the table
+
+    my $codon_table;
+    $codon_table = $attrib->value() if($attrib);
+    $codon_table ||= 1; # default vertebrate codon table 
+
+    if ($tran_features->{translation} = $tr->translate(undef, undef, undef, $codon_table)) {
+	  $tran_features->{translateable_seq} = $tr->translateable_seq;
+      
+      # to include the stop codon we need to translate the Bio::Seq sequence, not just
+      # $tr->translation, this is the source of the missing STOP_LOSTs
+      my $mrna_seqobj = Bio::Seq->new(
+          -seq      =>  $tran_features->{translateable_seq},
+          -moltype  => 'dna',
+          -alphabet => 'dna'
+      );
+
+	  $tran_features->{peptide} = $mrna_seqobj->translate(undef, undef, undef, $codon_table)->seq;
+	}
+	
+	$tr->{_variation_effect_feature_cache} = $tran_features;
+  }
+
+  if ( !defined( $tran_features->{translation} ) )
+  {    # for other biotype rather than coding/IG genes
+        # check if the variation is completely outside the transcript:
+
+    if ( $var->end() < $tr->start() ) {
+      $var->type( ( $tr->strand() == 1 ) ? 'UPSTREAM' : 'DOWNSTREAM' );
+      return [$var];
+    }
+    if ( $var->start() > $tr->end() ) {
+      $var->type( ( $tr->strand() == 1 ) ? 'DOWNSTREAM' : 'UPSTREAM' );
+      return [$var];
+    }
+
+    if ( $var->start() >= $tr->start() and $var->end() <= $tr->end() )
+    {    # within the transcript
+      if ( $tr->biotype() eq "miRNA" ) {
+        my ($attribute) = @{ $tr->get_all_Attributes('miRNA') };
+
+        # the value is the mature miRNA coordinate within miRNA
+        # transcript
+        if ( defined($attribute)
+             && $attribute->value() =~ /(\d+)-(\d+)/ )
+        {
+          # transfer cdna value to genomic coordinates
+          my @mapper_objs = $tr->cdna2genomic( $1, $2, $tr->strand() );
+
+          foreach my $obj (@mapper_objs)
+          {    #Note you can get more than one mature seq per miRNA
+            if ( $obj->isa("Bio::EnsEMBL::Mapper::Coordinate") ) {
+              if (     $var->start() >= $obj->start()
+                   and $var->end() <= $obj->end() )
+              {
+                $var->type("WITHIN_MATURE_miRNA");
+                return [$var];
+              }
+            }
+          }
+        }
+      }
+
+      $var->type("WITHIN_NON_CODING_GENE");
+      return [$var];
+
+    } ## end if ( $var->start() >= ...)
+  } ## end if ( !defined( $tr->translation...))
+
+  # get a transcript mapper object
+  my $tm = $tran_features->{mapper};
+  
+  # map to CDNA coords
+  my @cdna_coords = $tm->genomic2cdna($var->start,$var->end,$var->strand);
+
+  # map to CDS cooords
+  my @cds_coords = $tm->genomic2cds($var->start, $var->end,$var->strand);
+  
+  # map to peptide coords
+  my @pep_coords = $tm->genomic2pep($var->start, $var->end, $var->strand);
+  
+  # get the phase of the first exon
+  my $exon_phase = $tr->start_Exon->phase;
+  
+  # check for partial codon consequence
+  if(
+	 @pep_coords == 1
+	 && @cds_coords == 1
+	 && !($cds_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap'))
+	 && !($pep_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap'))
+  ) {
+	
+	# get the CDS sequence
+	my $cds = $tran_features->{translateable_seq};
+	
+	my $start = $pep_coords[0]->start();
+	my $codon_cds_start = ($start * 3) - 2;
+	
+	my $last_codon_length = length($cds) - ($codon_cds_start - 1);
+	
+	if($last_codon_length < 3 && $last_codon_length > 0) {
+	  $var->type("PARTIAL_CODON");
+	  
+	  # add the CDS coords
+	  $var->cds_start($cds_coords[0]->start + ($exon_phase > 0 ? $exon_phase : 0));
+	  $var->cds_end($cds_coords[0]->end + ($exon_phase > 0 ? $exon_phase : 0));
+	  
+	  # add the cDNA coords
+	  $var->cdna_start($cdna_coords[0]->start);
+	  $var->cdna_end($cdna_coords[0]->end);
+	  
+	  return [$var];
+	}
+  }
+  
+
+  # Handle simple cases where the variation is not split into parts.
+  # Call method recursively with component parts in complicated case.
+  # E.g. a single multi-base variation may be both intronic and coding
+  
+  if(@cdna_coords > 1) {
+    my @out;
+    #this will be a new type, complex_indel
+    $var->type('COMPLEX_INDEL');
+    return [$var];
+#     foreach my $c (@coords) {
+#       my %new_var = %{$var};
+#       $new_var{'end'} = $var->start + $c->length() - 1;
+#       $var->start( $new_var{'end'} + 1);
+#       #empty the type before re-run
+#       $var->empty_type ;  
+#       push @out, @{type_variation($tr, $g, bless \%new_var, ref($var))};
+#     }
+#    return \@out;
+
+
+  }
+
+  # look at different splice distances
+  my @coords_splice_2 = $tm->genomic2cdna($var->start -2, $var->end +2, $var->strand);
+  my @coords_splice_3 = $tm->genomic2cdna($var->start -3, $var->end +3, $var->strand);
+  my @coords_splice_8 = $tm->genomic2cdna($var->start -8, $var->end +8, $var->strand);
+
+  my ($splice_site_2, $splice_site_3, $splice_site_8);
+
+  if (scalar @coords_splice_2 >1) {
+    $splice_site_2=1;
+  }
+  elsif (scalar @coords_splice_3 >1) {
+    $splice_site_3=1;
+  }
+  elsif (scalar @coords_splice_8 >1) {
+    $splice_site_8=1;
+  }
+  
+
+  my $c = $cdna_coords[0];
+  if($c->isa('Bio::EnsEMBL::Mapper::Gap')) {
+
+    # check if the variation is completely outside the transcript:
+
+    if($var->end < $tr->start()) {
+      $var->type( ($tr->strand() == 1) ? 'UPSTREAM' : 'DOWNSTREAM' );
+      return [$var];
+    }
+    if($var->start > $tr->end()) {
+      $var->type( ($tr->strand() == 1) ? 'DOWNSTREAM' : 'UPSTREAM' );
+      return [$var];
+    }
+	
+	# nonsense-mediated decay transcript
+	if($tr->biotype() eq 'nonsense_mediated_decay') {
+	  $var->type("NMD_TRANSCRIPT");
+	  #return [$var];
+	}
+
+    # variation must be intronic since mapped to cdna gap, but is within
+    # transcript, note that ESSENTIAL_SPLICE_SITE only consider first (AG) and last (GT) 2 bases inside the intron.
+    # if variation is in intron, we need to check the lenth of intron, if it's shoter than 6, we call it SYNONYMOUS_CODING rather then INTRONIC
+
+    foreach my $intron (@{$tran_features->{introns}}) {
+      if ($intron->length <=5) {#the length of frameshift intron could be 1,2,4,5 bases
+		if ($var->start>=$intron->start and $var->end<=$intron->end) {
+		  #this is a type of SYNONYMOUS_CODING since changes happen in frameshift intron, which don't change exon structure
+		  $var->type('SYNONYMOUS_CODING');
+		  return [$var];
+		}
+      }
+    }
+    #if it's not in frameshift intron, then it's in normal intron
+    $var->type('INTRONIC');
+
+    if ($splice_site_2) {
+      $var->type('ESSENTIAL_SPLICE_SITE');
+    }
+    elsif ($splice_site_3 or $splice_site_8) {
+      $var->type('SPLICE_SITE');
+    }
+    return [$var];
+  }
+  
+  # nonsense-mediated decay transcript
+  if($tr->biotype() eq 'nonsense_mediated_decay') {
+	$var->type("NMD_TRANSCRIPT");
+	#return [$var];
+  }
+
+  #now variation must be in exons, the first 3 bs into exon could be splice_site
+
+  if ($splice_site_2 or $splice_site_3) {
+	
+	my ($se_s, $se_e, $ee_s, $ee_e) = ($tr->start_Exon->start, $tr->start_Exon->end, $tr->end_Exon->start, $tr->end_Exon->end);
+	($se_s, $se_e, $ee_s, $ee_e) = ($se_e, $se_s, $ee_e, $ee_s) if $tr->strand < 0;
+	
+	# check coord relative to first exon
+	# near beginning of first exon is obv not a splice site
+	if($var->start <= $se_e) {
+	  if(abs($se_e - $var->start) <= 3) {
+		$var->type('SPLICE_SITE');
+	  }
+	}
+	
+	# also check relative to last exon
+	# near end of last exon is also not a splice site
+	elsif($var->start >= $ee_s) {
+	  if(abs($ee_s - $var->start) <= 3) {
+		$var->type('SPLICE_SITE');
+	  }
+	}
+	
+	# if not near either end of transcript, then it is definitely a splice site
+	else {
+	  $var->type('SPLICE_SITE');
+	}
+  }
+  
+  $var->cdna_start( $c->start() );
+  $var->cdna_end( $c->end() );
+
+  if(@cds_coords > 1) {
+#    my @out;
+      #this is a new type, complex_indel
+      $var->type('COMPLEX_INDEL');
+      return [$var];
+#     foreach my $c (@coords) {
+#       my %new_var = %{$var};
+#       $new_var{'end'} = $var->start + $c->length() - 1;
+#       $var->start( $new_var{'end'} + 1);
+#       #empty the type before re-run       
+#       $var->empty_type ;
+#       push @out, @{type_variation($tr, $g, bless \%new_var, ref($var))};
+#     }
+#     return \@out;
+  }
+
+  $c = $cds_coords[0];
+
+  if($c->isa('Bio::EnsEMBL::Mapper::Gap')) {
+    # mapped successfully to CDNA but not to CDS, must be UTR
+
+    if($var->end < $tr->coding_region_start()) {
+      $var->type( ($tr->strand() == 1) ? '5PRIME_UTR' : '3PRIME_UTR' );
+    }
+    elsif($var->start > $tr->coding_region_end()) {
+      $var->type( ($tr->strand() == 1) ? '3PRIME_UTR' : '5PRIME_UTR');
+    }
+    else {
+      throw('Unexpected: CDNA variation which is not in CDS is not in UTR');
+    }
+    return [$var];
+  }
+  
+  # we need to add the exon phase on in case of weird transcripts
+  # where the first exon is not in normal phase
+  $var->cds_start( $c->start() + ($exon_phase > 0 ? $exon_phase : 0));
+  $var->cds_end( $c->end() + ($exon_phase > 0 ? $exon_phase : 0));
+
+
+  if(@pep_coords != 1 || $pep_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) {
+    throw("Unexpected: Could map to CDS but not to peptide coordinates.");
+  }
+
+  $c = $pep_coords[0];
+
+  $var->aa_start( $c->start());
+  $var->aa_end( $c->end());
+
+  apply_aa_change($tr, $var);
+
+  return [$var];
+}
+
+#
+# Determines the effect of a coding variation on the peptide sequence
+#
+
+sub apply_aa_change {
+  my $tr = shift;
+  my $var = shift;
+  
+  my ($attrib) = @{$tr->slice()->get_all_Attributes('codon_table')}; #for mithocondrial dna it is necessary to change the table
+
+  my $codon_table;
+  $codon_table = $attrib->value() if($attrib);
+  $codon_table ||= 1; # default vertebrate codon table 
+
+  # check the cache
+  my $tran_features = $tr->{_variation_effect_feature_cache};
+  
+  # populate it if not found
+  unless ($tran_features) {
+	$tran_features = {
+	  mapper  => $tr->get_TranscriptMapper,
+	};
+
+    if ($tran_features->{translation} = $tr->translate(undef, undef, undef, $codon_table)) {
+	  $tran_features->{translateable_seq} = $tr->translateable_seq;
+      
+      # to include the stop codon we need to translate the Bio::Seq sequence, not just
+      # $tr->translation, this is the source of the missing STOP_LOSTs
+      my $mrna_seqobj = Bio::Seq->new(
+          -seq      =>  $tran_features->{translateable_seq},
+          -moltype  => 'dna',
+          -alphabet => 'dna'
+      );
+
+	  $tran_features->{peptide} = $mrna_seqobj->translate(undef, undef, undef, $codon_table)->seq;
+	}
+	
+	$tr->{_variation_effect_feature_cache} = $tran_features;
+  }
+
+  my $mrna = $tran_features->{translateable_seq}; # get from cache
+
+  my $peptide = $tran_features->{peptide}; # get from cache
+  
+  my $len = $var->aa_end - $var->aa_start + 1;
+  my $old_aa = substr($peptide, $var->aa_start -1 , $len);
+
+  my $codon_cds_start = $var->aa_start * 3 - 2;
+  my $codon_cds_end   = $var->aa_end   * 3;
+  my $codon_len = $codon_cds_end - $codon_cds_start + 1;
+
+  my @alleles = @{$var->alleles};
+
+  my $var_len = $var->cds_end - $var->cds_start + 1;
+
+  my @aa_alleles = ($old_aa);
+  
+  my $ref_codon = substr($mrna, $codon_cds_start-1, $codon_len);
+  my @codons;
+  push @codons, $ref_codon;
+
+  #here could generate multi type if have multi-allele change: "ACTAGT/-/T"
+  foreach my $a (@alleles) {
+    $a =~ s/\-//;
+    my $cds = $mrna;
+	
+    if($var_len != length($a)) {
+      if(abs(length($a) - $var_len) % 3) {
+        # frameshifting variation, do not set peptide_allele string
+        # since too complicated and could be very long
+	
+	$var->type('FRAMESHIFT_CODING');
+        return [$var];
+      }
+
+      if($codon_len == 0) { # insertion
+        $aa_alleles[0] = '-';
+        $old_aa    = '-';
+      }
+    }
+
+    my $new_aa;
+	
+	# change sequence
+    substr($cds, $var->cds_start-1, $var_len) = $a;
+	
+	# get the new codon
+    my $codon_str = substr($cds, $codon_cds_start-1, $codon_len + length($a)-$var_len);
+    
+	push @codons, $codon_str;
+    $var->codon($codon_str); #add the codon to the ConsequenceType object
+    my $codon_seq = Bio::Seq->new(-seq      => $codon_str,
+				  -moltype  => 'dna',
+				  -alphabet => 'dna');
+
+    $new_aa = $codon_seq->translate(undef,undef,undef,$codon_table)->seq();
+
+    if(length($new_aa)<1){
+      $new_aa='-';
+    }
+    
+    if(uc($new_aa) ne uc($old_aa)) {
+      push @aa_alleles, $new_aa;
+      if ($new_aa =~ /\*/) {
+	$var->type('STOP_GAINED');
+      }
+      elsif ($old_aa =~ /\*/) {
+	$var->type('STOP_LOST');
+      }
+    }
+  }
+  
+  #note if type is already defined as SOTP_GAINED OR STOP_LOST, then even @aa_alleles > 1, we are not given type
+  # of 'NON_SYNONYMOUS_CODING'
+  if(@aa_alleles > 1) {
+    if (!$var->type or (join ' ',@{$var->type}) !~ /STOP/) {
+      $var->type('NON_SYNONYMOUS_CODING');
+    }
+  }
+  else {
+    $var->type('SYNONYMOUS_CODING');
+  }
+
+  #$var->codons(\@codons);
+  $var->aa_alleles(\@aa_alleles);
+}
+
+
+1;