view variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptSNPs.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

TranscriptSNPs - A utility class used to obtain information about the
relationships between a transcript and SNPs

=head1 SYNOPSIS

  use Bio::EnsEMBL::Utils::TranscriptSNPs;

  # get and type all snps in the region of the transcript

  %snps = %{
    Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs( $transcript,
      $flanking ) };

  # get all snps overlapping the transcript in cdna coordinates

  %snps =
    %{ Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs(
      $transcript) };

  # get the peptide variations caused by a set of SNPs

  %variations = %{
    Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations(
      $transcript, $snps ) };

=head1 DESCRIPTION

This is a utility class which can be used to get snps associated with a
transcript, and to determine the amino acid changes caused by the SNPs

=head1 METHODS

=cut

package Bio::EnsEMBL::Utils::TranscriptSNPs;

use strict;
use warnings;
no warnings 'uninitialized';



use Bio::EnsEMBL::Utils::Exception qw(throw warning);


=head2 get_all_peptide_variations

  Arg [1]    : $transcript the transcript to obtain the peptide variations for
  Arg [2]    : $snps listref of coding snps in cdna coordinates
  Example    : $pep_hash = get_all_peptide_variations($transcript, \@snps);
  Description: Takes a list of coding snps on this transcript in
               which are in cdna coordinates and returns a hash with peptide
               coordinate keys and listrefs of alternative amino acids as
               values.  The SNPs must additionally have a strand of 1 for the
               sake of simplicity.  Normally these could be generated using the
               get_all_cdna_SNPs method.

               Note that the peptide encoded by the reference sequence is
               also present in the results and that duplicate peptides
               (e.g. resulting from synonomous mutations) are discarded.
               It is possible to have greated than two peptides variations
               at a given location given adjacent or overlapping snps.
               Insertion/deletion variations are ignored by this method.
               Example of a data structure that could be returned:
               {  1  => ['I', 'M'],
                 10  => ['I', 'T'],
                 37  => ['N', 'D'],
                 56  => ['G', 'E'],
                 118 => ['R', 'K'],
                 159 => ['D', 'E'],
                 167 => ['Q', 'R'],
                 173 => ['H', 'Q'] }
  Returntype : hashref
  Exceptions : none
  Caller     : general

=cut

sub get_all_peptide_variations {
  my $transcript = shift;
  my $snps = shift;

  if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
    throw('Bio::EnsEMBL::Transcript argument is required.');
  }

  if(!ref($snps) eq 'ARRAY') {
    throw('Reference to a list of Bio::EnsEMBL::SNP objects is required');
  }

  my $codon_table = Bio::Tools::CodonTable->new;
  my $codon_length = 3;
  my $cdna = $transcript->spliced_seq;

  my $variant_alleles;
  my $translation_start = $transcript->cdna_coding_start;
  foreach my $snp (@$snps) {
    #skip variations not on a single base
    next if ($snp->start != $snp->end);

    my $start = $snp->start;
    my $strand = $snp->strand;

    #calculate offset of the nucleotide from codon start (0|1|2)
    my $codon_pos = ($start - $translation_start) % $codon_length;

    #calculate the peptide coordinate of the snp
    my $peptide = ($start - $translation_start +
		   ($codon_length - $codon_pos)) / $codon_length;
	
	# skip this SNP if it falls in a partial codon
	next if $start - $codon_pos + $codon_length > length($cdna);

    #retrieve the codon
    my $codon = substr($cdna, $start - $codon_pos-1, $codon_length);

    #store each alternative allele by its location in the peptide
    my @alleles = split(/\/|\|/, lc($snp->allele_string));
    #my @alleles = split(/\/|\|/, lc($snp->alleles));

    foreach my $allele (@alleles) {
      next if $allele eq '-';       #skip deletions
      next if CORE::length($allele) != 1; #skip insertions

      #get_all_cdna_SNPs always gives strand of 1 now
      #if($strand == -1) {
      #  #complement the allele if the snp is on the reverse strand
      #  $allele =~ 
      #  tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
      #}

      #create a data structure of variant alleles sorted by both their
      #peptide position and their position within the peptides codon
      $variant_alleles ||= {};
      if(exists $variant_alleles->{$peptide}) {
        my $alleles_arr = $variant_alleles->{$peptide}->[1];
        push @{$alleles_arr->[$codon_pos]}, $allele;
      } else {
        #create a list of 3 lists (one list for each codon position)
        my $alleles_arr = [[],[],[]];
        push @{$alleles_arr->[$codon_pos]}, $allele;
        $variant_alleles->{$peptide} = [$codon, $alleles_arr];
      }
    }
  }

  my %out;
  #now generate all possible codons for each peptide and translate them
  foreach my $peptide (keys %$variant_alleles) {
    my ($codon, $alleles) = @{$variant_alleles->{$peptide}};

    #need to push original nucleotides onto each position
    #so that all possible combinations can be generated
    push @{$alleles->[0]}, substr($codon,0,1);
    push @{$alleles->[1]}, substr($codon,1,1);
    push @{$alleles->[2]}, substr($codon,2,1);

    my %alt_amino_acids;
    foreach my $a1 (@{$alleles->[0]}) {
      substr($codon, 0, 1) = $a1;
      foreach my $a2 (@{$alleles->[1]}) {
        substr($codon, 1, 1) = $a2;
        foreach my $a3 (@{$alleles->[2]}) {
          substr($codon, 2, 1) = $a3;
          my $aa = $codon_table->translate($codon);
          #print "$codon translation is $aa\n";
          $alt_amino_acids{$aa} = 1;
        }
      }
    }

    my @aas = keys %alt_amino_acids;
    $out{$peptide} = \@aas;
  }

  return \%out;
}



=head2 get_all_SNPs

  Arg [1]    : Bio::EnsEMBL::Transcript $transcript

  Arg [2]    : (optional) int $flanking
               The number of basepairs of transcript flanking sequence to
               retrieve snps from (default 0)
  Arg [3]    : $source type of database source (dbSNP, Glovar)
  Example    : $snp_hashref = get_all_transcript_SNPs($transcript)
  Description: Retrieves all snps found within the region of the 
               provided transcript
               The snps are returned in a hash with keys corresponding
               to the region the snp was found in.  Possible keys are:
               'three prime UTR', 'five prime UTR', 'coding', 'intronic',
               'three prime flanking', 'five prime flanking'
               If no flanking argument is provided no flanking snps will be
               obtained.
               The listrefs which are the values of the returned hash
               contain snps in coordinates of the transcript region 
               (i.e. first base = first base of the first exon on the
               postive strand - flanking bases + 1)

               Multiple base variations and inserts/deletes are discarded
               by this method and not used.

  Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
               values
  Exceptions : none
  Caller     : general

=cut

sub get_all_SNPs {
  my $transcript = shift;
  my $flanking = shift || 0;
  my $source = shift;

  if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
    throw('Bio::EnsEMBL::Transcript argument required.');
  }

  my $slice = $transcript->slice();

  if(!$slice) {
    warning("Cannot obtain SNPs for transcript without attached Slice.");
    return {};
  }

  my $sa = $slice->adaptor();

  if(!$sa) {
    warning('Cannot obtain SNPs for transcript unless attached slice ' .
            'has attached adaptor');
    return {};
  }

  my %snp_hash;

  # retrieve slice in the region of the transcript
  $slice = $sa->fetch_by_Feature($transcript, $flanking );

  # copy transcript, to work in coord system we are interested in
  $transcript = $transcript->transfer( $slice );

  # get all snps in the transcript region
  my $snps;
  if ($source eq 'glovar') {
    $snps = $slice->get_all_ExternalFeatures('GlovarSNP');
  }
  elsif ($source eq 'variation') {
    $snps = $slice->get_all_VariationFeatures;
  }
  else {
    $snps = $slice->get_all_SNPs;   # dont need once use new snp api (i think)
  }

  my $trans_start  = $flanking + 1;
  my $trans_end    = $slice->length - $flanking;
  my $trans_strand = $transcript->get_all_Exons->[0]->strand;

  # classify each snp
  foreach my $snp (@$snps) {
    my $key;

    if(($trans_strand == 1 && $snp->end < $trans_start) ||
       ($trans_strand == -1 && $snp->start > $trans_end)) {
      # this snp is upstream from the transcript
      $key = 'five prime flanking';
    }

    elsif(($trans_strand == 1 && $snp->start > $trans_end) ||
	  ($trans_strand == -1 && $snp->start < $trans_start)) {
      # this snp is downstream from the transcript
      $key = 'three prime flanking';
    }

    else {
      #snp is inside transcript region check if it overlaps an exon
      foreach my $e (@{$transcript->get_all_Exons}) {
        if($snp->end >= $e->start && $snp->start <= $e->end) {
          # this snp is in an exon
	  
          if(($trans_strand == 1 && 
              $snp->end < $transcript->coding_region_start) ||
             ($trans_strand == -1 && 
              $snp->start > $transcript->coding_region_end)) {
            # this snp is in the 5' UTR
            $key = 'five prime UTR';
          }

          elsif(($trans_strand == 1 && 
                 $snp->start > $transcript->coding_region_end)||
                ($trans_strand == -1 && 
                 $snp->end < $transcript->coding_region_start)) {
            # this snp is in the 3' UTR
            $key = 'three prime UTR';
          }

          else {
            # snp is coding
            $key = 'coding';
          }
          last;
        }
      }
      unless($key) {
        # snp was not in an exon and is therefore intronic
        $key = 'intronic';
      }
    }

    unless($key) {
      #warning('SNP could not be mapped. In/Dels not supported yet...');
      next;
    }

    if(exists $snp_hash{$key}) {
      push @{$snp_hash{$key}}, $snp;
    } 
    else {
      $snp_hash{$key} = [$snp];
    }
  }

  return \%snp_hash;
}



=head2 get_all_cdna_SNPs

  Arg [1]    : Bio::EnsEMBL::Transcript $transcript
  Arg [2]    : $source type of database source (dbSNP, Glovar)
  Example    : $cdna_snp_hasref = $transcript->get_all_cdna_SNPs;
  Description: Retrieves all snps found within exons of the provided
               transcript.
               The snps are returned in a hash with three keys corresponding
               to the region the snp was found in.  Valid keys are:
               'three prime UTR', 'five prime UTR', 'coding'
               The listrefs which are the values of the returned hash
               contain snps in CDNA coordinates.

               Multiple base variations and insertions/deletions are not
               used by this function and are discarded.
  Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for 
               values
  Exceptions : none
  Caller     : general

=cut

sub get_all_cdna_SNPs {
  my ($transcript, $source) = @_;

  #retrieve all of the snps from this transcript
  my $all_snps = get_all_SNPs($transcript, 0, $source);
  my %snp_hash;

  my @cdna_types = ('three prime UTR', 'five prime UTR','coding');

  my $slice = $transcript->slice();
  my $sa    = $slice->adaptor();

  $slice = $sa->fetch_by_Feature($transcript);

  # copy transcript in order to work in coord system of interest
  $transcript = $transcript->transfer($slice);

  foreach my $type (@cdna_types) {
    $snp_hash{$type} = [];
    foreach my $snp (@{$all_snps->{$type}}) {
      my @coords = $transcript->genomic2cdna($snp->start, $snp->end,
                                             $snp->strand);

      #skip snps that don't map cleanly (possibly an indel...)
      if(scalar(@coords) != 1) {
        #warning("snp of type $type does not map cleanly\n");
        next;
      }

      my ($coord) = @coords;

      unless($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
        #warning("snp of type $type maps to gap\n");
        next;
      }

      my $alleles;
      my $ambicode;

      # get alleles and ambig_code (with fallback to old snp API)
      $alleles = $snp->allele_string || $snp->{'alleles'};
      $ambicode = $snp->ambig_code || $snp->{'_ambiguity_code'};

      #we arbitrarily put the SNP on the +ve strand because it is easier to
      #work with in the webcode
      if($coord->strand == -1) {
        $alleles =~
         tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
        $ambicode =~
         tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
      }
      #copy the snp and convert to cdna coords...
      my $new_snp;
      %$new_snp = %$snp;
      bless $new_snp, ref $snp;
      $new_snp->start($coord->start);
      $new_snp->end($coord->end);
      $new_snp->strand(1);
      $new_snp->allele_string($alleles);
      $new_snp->ambig_code($ambicode);
      push @{$snp_hash{$type}}, $new_snp;
    }
  }

  return \%snp_hash;
}




1;