Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptSNPs.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptSNPs.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,444 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +TranscriptSNPs - A utility class used to obtain information about the +relationships between a transcript and SNPs + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::TranscriptSNPs; + + # get and type all snps in the region of the transcript + + %snps = %{ + Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs( $transcript, + $flanking ) }; + + # get all snps overlapping the transcript in cdna coordinates + + %snps = + %{ Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs( + $transcript) }; + + # get the peptide variations caused by a set of SNPs + + %variations = %{ + Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations( + $transcript, $snps ) }; + +=head1 DESCRIPTION + +This is a utility class which can be used to get snps associated with a +transcript, and to determine the amino acid changes caused by the SNPs + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::Utils::TranscriptSNPs; + +use strict; +use warnings; +no warnings 'uninitialized'; + + + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 get_all_peptide_variations + + Arg [1] : $transcript the transcript to obtain the peptide variations for + Arg [2] : $snps listref of coding snps in cdna coordinates + Example : $pep_hash = get_all_peptide_variations($transcript, \@snps); + Description: Takes a list of coding snps on this transcript in + which are in cdna coordinates and returns a hash with peptide + coordinate keys and listrefs of alternative amino acids as + values. The SNPs must additionally have a strand of 1 for the + sake of simplicity. Normally these could be generated using the + get_all_cdna_SNPs method. + + Note that the peptide encoded by the reference sequence is + also present in the results and that duplicate peptides + (e.g. resulting from synonomous mutations) are discarded. + It is possible to have greated than two peptides variations + at a given location given adjacent or overlapping snps. + Insertion/deletion variations are ignored by this method. + Example of a data structure that could be returned: + { 1 => ['I', 'M'], + 10 => ['I', 'T'], + 37 => ['N', 'D'], + 56 => ['G', 'E'], + 118 => ['R', 'K'], + 159 => ['D', 'E'], + 167 => ['Q', 'R'], + 173 => ['H', 'Q'] } + Returntype : hashref + Exceptions : none + Caller : general + +=cut + +sub get_all_peptide_variations { + my $transcript = shift; + my $snps = shift; + + if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { + throw('Bio::EnsEMBL::Transcript argument is required.'); + } + + if(!ref($snps) eq 'ARRAY') { + throw('Reference to a list of Bio::EnsEMBL::SNP objects is required'); + } + + my $codon_table = Bio::Tools::CodonTable->new; + my $codon_length = 3; + my $cdna = $transcript->spliced_seq; + + my $variant_alleles; + my $translation_start = $transcript->cdna_coding_start; + foreach my $snp (@$snps) { + #skip variations not on a single base + next if ($snp->start != $snp->end); + + my $start = $snp->start; + my $strand = $snp->strand; + + #calculate offset of the nucleotide from codon start (0|1|2) + my $codon_pos = ($start - $translation_start) % $codon_length; + + #calculate the peptide coordinate of the snp + my $peptide = ($start - $translation_start + + ($codon_length - $codon_pos)) / $codon_length; + + # skip this SNP if it falls in a partial codon + next if $start - $codon_pos + $codon_length > length($cdna); + + #retrieve the codon + my $codon = substr($cdna, $start - $codon_pos-1, $codon_length); + + #store each alternative allele by its location in the peptide + my @alleles = split(/\/|\|/, lc($snp->allele_string)); + #my @alleles = split(/\/|\|/, lc($snp->alleles)); + + foreach my $allele (@alleles) { + next if $allele eq '-'; #skip deletions + next if CORE::length($allele) != 1; #skip insertions + + #get_all_cdna_SNPs always gives strand of 1 now + #if($strand == -1) { + # #complement the allele if the snp is on the reverse strand + # $allele =~ + # tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; + #} + + #create a data structure of variant alleles sorted by both their + #peptide position and their position within the peptides codon + $variant_alleles ||= {}; + if(exists $variant_alleles->{$peptide}) { + my $alleles_arr = $variant_alleles->{$peptide}->[1]; + push @{$alleles_arr->[$codon_pos]}, $allele; + } else { + #create a list of 3 lists (one list for each codon position) + my $alleles_arr = [[],[],[]]; + push @{$alleles_arr->[$codon_pos]}, $allele; + $variant_alleles->{$peptide} = [$codon, $alleles_arr]; + } + } + } + + my %out; + #now generate all possible codons for each peptide and translate them + foreach my $peptide (keys %$variant_alleles) { + my ($codon, $alleles) = @{$variant_alleles->{$peptide}}; + + #need to push original nucleotides onto each position + #so that all possible combinations can be generated + push @{$alleles->[0]}, substr($codon,0,1); + push @{$alleles->[1]}, substr($codon,1,1); + push @{$alleles->[2]}, substr($codon,2,1); + + my %alt_amino_acids; + foreach my $a1 (@{$alleles->[0]}) { + substr($codon, 0, 1) = $a1; + foreach my $a2 (@{$alleles->[1]}) { + substr($codon, 1, 1) = $a2; + foreach my $a3 (@{$alleles->[2]}) { + substr($codon, 2, 1) = $a3; + my $aa = $codon_table->translate($codon); + #print "$codon translation is $aa\n"; + $alt_amino_acids{$aa} = 1; + } + } + } + + my @aas = keys %alt_amino_acids; + $out{$peptide} = \@aas; + } + + return \%out; +} + + + +=head2 get_all_SNPs + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + + Arg [2] : (optional) int $flanking + The number of basepairs of transcript flanking sequence to + retrieve snps from (default 0) + Arg [3] : $source type of database source (dbSNP, Glovar) + Example : $snp_hashref = get_all_transcript_SNPs($transcript) + Description: Retrieves all snps found within the region of the + provided transcript + The snps are returned in a hash with keys corresponding + to the region the snp was found in. Possible keys are: + 'three prime UTR', 'five prime UTR', 'coding', 'intronic', + 'three prime flanking', 'five prime flanking' + If no flanking argument is provided no flanking snps will be + obtained. + The listrefs which are the values of the returned hash + contain snps in coordinates of the transcript region + (i.e. first base = first base of the first exon on the + postive strand - flanking bases + 1) + + Multiple base variations and inserts/deletes are discarded + by this method and not used. + + Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for + values + Exceptions : none + Caller : general + +=cut + +sub get_all_SNPs { + my $transcript = shift; + my $flanking = shift || 0; + my $source = shift; + + if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { + throw('Bio::EnsEMBL::Transcript argument required.'); + } + + my $slice = $transcript->slice(); + + if(!$slice) { + warning("Cannot obtain SNPs for transcript without attached Slice."); + return {}; + } + + my $sa = $slice->adaptor(); + + if(!$sa) { + warning('Cannot obtain SNPs for transcript unless attached slice ' . + 'has attached adaptor'); + return {}; + } + + my %snp_hash; + + # retrieve slice in the region of the transcript + $slice = $sa->fetch_by_Feature($transcript, $flanking ); + + # copy transcript, to work in coord system we are interested in + $transcript = $transcript->transfer( $slice ); + + # get all snps in the transcript region + my $snps; + if ($source eq 'glovar') { + $snps = $slice->get_all_ExternalFeatures('GlovarSNP'); + } + elsif ($source eq 'variation') { + $snps = $slice->get_all_VariationFeatures; + } + else { + $snps = $slice->get_all_SNPs; # dont need once use new snp api (i think) + } + + my $trans_start = $flanking + 1; + my $trans_end = $slice->length - $flanking; + my $trans_strand = $transcript->get_all_Exons->[0]->strand; + + # classify each snp + foreach my $snp (@$snps) { + my $key; + + if(($trans_strand == 1 && $snp->end < $trans_start) || + ($trans_strand == -1 && $snp->start > $trans_end)) { + # this snp is upstream from the transcript + $key = 'five prime flanking'; + } + + elsif(($trans_strand == 1 && $snp->start > $trans_end) || + ($trans_strand == -1 && $snp->start < $trans_start)) { + # this snp is downstream from the transcript + $key = 'three prime flanking'; + } + + else { + #snp is inside transcript region check if it overlaps an exon + foreach my $e (@{$transcript->get_all_Exons}) { + if($snp->end >= $e->start && $snp->start <= $e->end) { + # this snp is in an exon + + if(($trans_strand == 1 && + $snp->end < $transcript->coding_region_start) || + ($trans_strand == -1 && + $snp->start > $transcript->coding_region_end)) { + # this snp is in the 5' UTR + $key = 'five prime UTR'; + } + + elsif(($trans_strand == 1 && + $snp->start > $transcript->coding_region_end)|| + ($trans_strand == -1 && + $snp->end < $transcript->coding_region_start)) { + # this snp is in the 3' UTR + $key = 'three prime UTR'; + } + + else { + # snp is coding + $key = 'coding'; + } + last; + } + } + unless($key) { + # snp was not in an exon and is therefore intronic + $key = 'intronic'; + } + } + + unless($key) { + #warning('SNP could not be mapped. In/Dels not supported yet...'); + next; + } + + if(exists $snp_hash{$key}) { + push @{$snp_hash{$key}}, $snp; + } + else { + $snp_hash{$key} = [$snp]; + } + } + + return \%snp_hash; +} + + + +=head2 get_all_cdna_SNPs + + Arg [1] : Bio::EnsEMBL::Transcript $transcript + Arg [2] : $source type of database source (dbSNP, Glovar) + Example : $cdna_snp_hasref = $transcript->get_all_cdna_SNPs; + Description: Retrieves all snps found within exons of the provided + transcript. + The snps are returned in a hash with three keys corresponding + to the region the snp was found in. Valid keys are: + 'three prime UTR', 'five prime UTR', 'coding' + The listrefs which are the values of the returned hash + contain snps in CDNA coordinates. + + Multiple base variations and insertions/deletions are not + used by this function and are discarded. + Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for + values + Exceptions : none + Caller : general + +=cut + +sub get_all_cdna_SNPs { + my ($transcript, $source) = @_; + + #retrieve all of the snps from this transcript + my $all_snps = get_all_SNPs($transcript, 0, $source); + my %snp_hash; + + my @cdna_types = ('three prime UTR', 'five prime UTR','coding'); + + my $slice = $transcript->slice(); + my $sa = $slice->adaptor(); + + $slice = $sa->fetch_by_Feature($transcript); + + # copy transcript in order to work in coord system of interest + $transcript = $transcript->transfer($slice); + + foreach my $type (@cdna_types) { + $snp_hash{$type} = []; + foreach my $snp (@{$all_snps->{$type}}) { + my @coords = $transcript->genomic2cdna($snp->start, $snp->end, + $snp->strand); + + #skip snps that don't map cleanly (possibly an indel...) + if(scalar(@coords) != 1) { + #warning("snp of type $type does not map cleanly\n"); + next; + } + + my ($coord) = @coords; + + unless($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + #warning("snp of type $type maps to gap\n"); + next; + } + + my $alleles; + my $ambicode; + + # get alleles and ambig_code (with fallback to old snp API) + $alleles = $snp->allele_string || $snp->{'alleles'}; + $ambicode = $snp->ambig_code || $snp->{'_ambiguity_code'}; + + #we arbitrarily put the SNP on the +ve strand because it is easier to + #work with in the webcode + if($coord->strand == -1) { + $alleles =~ + tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//; + $ambicode =~ + tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//; + } + #copy the snp and convert to cdna coords... + my $new_snp; + %$new_snp = %$snp; + bless $new_snp, ref $snp; + $new_snp->start($coord->start); + $new_snp->end($coord->end); + $new_snp->strand(1); + $new_snp->allele_string($alleles); + $new_snp->ambig_code($ambicode); + push @{$snp_hash{$type}}, $new_snp; + } + } + + return \%snp_hash; +} + + + + +1;