comparison variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptSNPs.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 TranscriptSNPs - A utility class used to obtain information about the
24 relationships between a transcript and SNPs
25
26 =head1 SYNOPSIS
27
28 use Bio::EnsEMBL::Utils::TranscriptSNPs;
29
30 # get and type all snps in the region of the transcript
31
32 %snps = %{
33 Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs( $transcript,
34 $flanking ) };
35
36 # get all snps overlapping the transcript in cdna coordinates
37
38 %snps =
39 %{ Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs(
40 $transcript) };
41
42 # get the peptide variations caused by a set of SNPs
43
44 %variations = %{
45 Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations(
46 $transcript, $snps ) };
47
48 =head1 DESCRIPTION
49
50 This is a utility class which can be used to get snps associated with a
51 transcript, and to determine the amino acid changes caused by the SNPs
52
53 =head1 METHODS
54
55 =cut
56
57 package Bio::EnsEMBL::Utils::TranscriptSNPs;
58
59 use strict;
60 use warnings;
61 no warnings 'uninitialized';
62
63
64
65 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
66
67
68 =head2 get_all_peptide_variations
69
70 Arg [1] : $transcript the transcript to obtain the peptide variations for
71 Arg [2] : $snps listref of coding snps in cdna coordinates
72 Example : $pep_hash = get_all_peptide_variations($transcript, \@snps);
73 Description: Takes a list of coding snps on this transcript in
74 which are in cdna coordinates and returns a hash with peptide
75 coordinate keys and listrefs of alternative amino acids as
76 values. The SNPs must additionally have a strand of 1 for the
77 sake of simplicity. Normally these could be generated using the
78 get_all_cdna_SNPs method.
79
80 Note that the peptide encoded by the reference sequence is
81 also present in the results and that duplicate peptides
82 (e.g. resulting from synonomous mutations) are discarded.
83 It is possible to have greated than two peptides variations
84 at a given location given adjacent or overlapping snps.
85 Insertion/deletion variations are ignored by this method.
86 Example of a data structure that could be returned:
87 { 1 => ['I', 'M'],
88 10 => ['I', 'T'],
89 37 => ['N', 'D'],
90 56 => ['G', 'E'],
91 118 => ['R', 'K'],
92 159 => ['D', 'E'],
93 167 => ['Q', 'R'],
94 173 => ['H', 'Q'] }
95 Returntype : hashref
96 Exceptions : none
97 Caller : general
98
99 =cut
100
101 sub get_all_peptide_variations {
102 my $transcript = shift;
103 my $snps = shift;
104
105 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
106 throw('Bio::EnsEMBL::Transcript argument is required.');
107 }
108
109 if(!ref($snps) eq 'ARRAY') {
110 throw('Reference to a list of Bio::EnsEMBL::SNP objects is required');
111 }
112
113 my $codon_table = Bio::Tools::CodonTable->new;
114 my $codon_length = 3;
115 my $cdna = $transcript->spliced_seq;
116
117 my $variant_alleles;
118 my $translation_start = $transcript->cdna_coding_start;
119 foreach my $snp (@$snps) {
120 #skip variations not on a single base
121 next if ($snp->start != $snp->end);
122
123 my $start = $snp->start;
124 my $strand = $snp->strand;
125
126 #calculate offset of the nucleotide from codon start (0|1|2)
127 my $codon_pos = ($start - $translation_start) % $codon_length;
128
129 #calculate the peptide coordinate of the snp
130 my $peptide = ($start - $translation_start +
131 ($codon_length - $codon_pos)) / $codon_length;
132
133 # skip this SNP if it falls in a partial codon
134 next if $start - $codon_pos + $codon_length > length($cdna);
135
136 #retrieve the codon
137 my $codon = substr($cdna, $start - $codon_pos-1, $codon_length);
138
139 #store each alternative allele by its location in the peptide
140 my @alleles = split(/\/|\|/, lc($snp->allele_string));
141 #my @alleles = split(/\/|\|/, lc($snp->alleles));
142
143 foreach my $allele (@alleles) {
144 next if $allele eq '-'; #skip deletions
145 next if CORE::length($allele) != 1; #skip insertions
146
147 #get_all_cdna_SNPs always gives strand of 1 now
148 #if($strand == -1) {
149 # #complement the allele if the snp is on the reverse strand
150 # $allele =~
151 # tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
152 #}
153
154 #create a data structure of variant alleles sorted by both their
155 #peptide position and their position within the peptides codon
156 $variant_alleles ||= {};
157 if(exists $variant_alleles->{$peptide}) {
158 my $alleles_arr = $variant_alleles->{$peptide}->[1];
159 push @{$alleles_arr->[$codon_pos]}, $allele;
160 } else {
161 #create a list of 3 lists (one list for each codon position)
162 my $alleles_arr = [[],[],[]];
163 push @{$alleles_arr->[$codon_pos]}, $allele;
164 $variant_alleles->{$peptide} = [$codon, $alleles_arr];
165 }
166 }
167 }
168
169 my %out;
170 #now generate all possible codons for each peptide and translate them
171 foreach my $peptide (keys %$variant_alleles) {
172 my ($codon, $alleles) = @{$variant_alleles->{$peptide}};
173
174 #need to push original nucleotides onto each position
175 #so that all possible combinations can be generated
176 push @{$alleles->[0]}, substr($codon,0,1);
177 push @{$alleles->[1]}, substr($codon,1,1);
178 push @{$alleles->[2]}, substr($codon,2,1);
179
180 my %alt_amino_acids;
181 foreach my $a1 (@{$alleles->[0]}) {
182 substr($codon, 0, 1) = $a1;
183 foreach my $a2 (@{$alleles->[1]}) {
184 substr($codon, 1, 1) = $a2;
185 foreach my $a3 (@{$alleles->[2]}) {
186 substr($codon, 2, 1) = $a3;
187 my $aa = $codon_table->translate($codon);
188 #print "$codon translation is $aa\n";
189 $alt_amino_acids{$aa} = 1;
190 }
191 }
192 }
193
194 my @aas = keys %alt_amino_acids;
195 $out{$peptide} = \@aas;
196 }
197
198 return \%out;
199 }
200
201
202
203 =head2 get_all_SNPs
204
205 Arg [1] : Bio::EnsEMBL::Transcript $transcript
206
207 Arg [2] : (optional) int $flanking
208 The number of basepairs of transcript flanking sequence to
209 retrieve snps from (default 0)
210 Arg [3] : $source type of database source (dbSNP, Glovar)
211 Example : $snp_hashref = get_all_transcript_SNPs($transcript)
212 Description: Retrieves all snps found within the region of the
213 provided transcript
214 The snps are returned in a hash with keys corresponding
215 to the region the snp was found in. Possible keys are:
216 'three prime UTR', 'five prime UTR', 'coding', 'intronic',
217 'three prime flanking', 'five prime flanking'
218 If no flanking argument is provided no flanking snps will be
219 obtained.
220 The listrefs which are the values of the returned hash
221 contain snps in coordinates of the transcript region
222 (i.e. first base = first base of the first exon on the
223 postive strand - flanking bases + 1)
224
225 Multiple base variations and inserts/deletes are discarded
226 by this method and not used.
227
228 Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
229 values
230 Exceptions : none
231 Caller : general
232
233 =cut
234
235 sub get_all_SNPs {
236 my $transcript = shift;
237 my $flanking = shift || 0;
238 my $source = shift;
239
240 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
241 throw('Bio::EnsEMBL::Transcript argument required.');
242 }
243
244 my $slice = $transcript->slice();
245
246 if(!$slice) {
247 warning("Cannot obtain SNPs for transcript without attached Slice.");
248 return {};
249 }
250
251 my $sa = $slice->adaptor();
252
253 if(!$sa) {
254 warning('Cannot obtain SNPs for transcript unless attached slice ' .
255 'has attached adaptor');
256 return {};
257 }
258
259 my %snp_hash;
260
261 # retrieve slice in the region of the transcript
262 $slice = $sa->fetch_by_Feature($transcript, $flanking );
263
264 # copy transcript, to work in coord system we are interested in
265 $transcript = $transcript->transfer( $slice );
266
267 # get all snps in the transcript region
268 my $snps;
269 if ($source eq 'glovar') {
270 $snps = $slice->get_all_ExternalFeatures('GlovarSNP');
271 }
272 elsif ($source eq 'variation') {
273 $snps = $slice->get_all_VariationFeatures;
274 }
275 else {
276 $snps = $slice->get_all_SNPs; # dont need once use new snp api (i think)
277 }
278
279 my $trans_start = $flanking + 1;
280 my $trans_end = $slice->length - $flanking;
281 my $trans_strand = $transcript->get_all_Exons->[0]->strand;
282
283 # classify each snp
284 foreach my $snp (@$snps) {
285 my $key;
286
287 if(($trans_strand == 1 && $snp->end < $trans_start) ||
288 ($trans_strand == -1 && $snp->start > $trans_end)) {
289 # this snp is upstream from the transcript
290 $key = 'five prime flanking';
291 }
292
293 elsif(($trans_strand == 1 && $snp->start > $trans_end) ||
294 ($trans_strand == -1 && $snp->start < $trans_start)) {
295 # this snp is downstream from the transcript
296 $key = 'three prime flanking';
297 }
298
299 else {
300 #snp is inside transcript region check if it overlaps an exon
301 foreach my $e (@{$transcript->get_all_Exons}) {
302 if($snp->end >= $e->start && $snp->start <= $e->end) {
303 # this snp is in an exon
304
305 if(($trans_strand == 1 &&
306 $snp->end < $transcript->coding_region_start) ||
307 ($trans_strand == -1 &&
308 $snp->start > $transcript->coding_region_end)) {
309 # this snp is in the 5' UTR
310 $key = 'five prime UTR';
311 }
312
313 elsif(($trans_strand == 1 &&
314 $snp->start > $transcript->coding_region_end)||
315 ($trans_strand == -1 &&
316 $snp->end < $transcript->coding_region_start)) {
317 # this snp is in the 3' UTR
318 $key = 'three prime UTR';
319 }
320
321 else {
322 # snp is coding
323 $key = 'coding';
324 }
325 last;
326 }
327 }
328 unless($key) {
329 # snp was not in an exon and is therefore intronic
330 $key = 'intronic';
331 }
332 }
333
334 unless($key) {
335 #warning('SNP could not be mapped. In/Dels not supported yet...');
336 next;
337 }
338
339 if(exists $snp_hash{$key}) {
340 push @{$snp_hash{$key}}, $snp;
341 }
342 else {
343 $snp_hash{$key} = [$snp];
344 }
345 }
346
347 return \%snp_hash;
348 }
349
350
351
352 =head2 get_all_cdna_SNPs
353
354 Arg [1] : Bio::EnsEMBL::Transcript $transcript
355 Arg [2] : $source type of database source (dbSNP, Glovar)
356 Example : $cdna_snp_hasref = $transcript->get_all_cdna_SNPs;
357 Description: Retrieves all snps found within exons of the provided
358 transcript.
359 The snps are returned in a hash with three keys corresponding
360 to the region the snp was found in. Valid keys are:
361 'three prime UTR', 'five prime UTR', 'coding'
362 The listrefs which are the values of the returned hash
363 contain snps in CDNA coordinates.
364
365 Multiple base variations and insertions/deletions are not
366 used by this function and are discarded.
367 Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
368 values
369 Exceptions : none
370 Caller : general
371
372 =cut
373
374 sub get_all_cdna_SNPs {
375 my ($transcript, $source) = @_;
376
377 #retrieve all of the snps from this transcript
378 my $all_snps = get_all_SNPs($transcript, 0, $source);
379 my %snp_hash;
380
381 my @cdna_types = ('three prime UTR', 'five prime UTR','coding');
382
383 my $slice = $transcript->slice();
384 my $sa = $slice->adaptor();
385
386 $slice = $sa->fetch_by_Feature($transcript);
387
388 # copy transcript in order to work in coord system of interest
389 $transcript = $transcript->transfer($slice);
390
391 foreach my $type (@cdna_types) {
392 $snp_hash{$type} = [];
393 foreach my $snp (@{$all_snps->{$type}}) {
394 my @coords = $transcript->genomic2cdna($snp->start, $snp->end,
395 $snp->strand);
396
397 #skip snps that don't map cleanly (possibly an indel...)
398 if(scalar(@coords) != 1) {
399 #warning("snp of type $type does not map cleanly\n");
400 next;
401 }
402
403 my ($coord) = @coords;
404
405 unless($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
406 #warning("snp of type $type maps to gap\n");
407 next;
408 }
409
410 my $alleles;
411 my $ambicode;
412
413 # get alleles and ambig_code (with fallback to old snp API)
414 $alleles = $snp->allele_string || $snp->{'alleles'};
415 $ambicode = $snp->ambig_code || $snp->{'_ambiguity_code'};
416
417 #we arbitrarily put the SNP on the +ve strand because it is easier to
418 #work with in the webcode
419 if($coord->strand == -1) {
420 $alleles =~
421 tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
422 $ambicode =~
423 tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
424 }
425 #copy the snp and convert to cdna coords...
426 my $new_snp;
427 %$new_snp = %$snp;
428 bless $new_snp, ref $snp;
429 $new_snp->start($coord->start);
430 $new_snp->end($coord->end);
431 $new_snp->strand(1);
432 $new_snp->allele_string($alleles);
433 $new_snp->ambig_code($ambicode);
434 push @{$snp_hash{$type}}, $new_snp;
435 }
436 }
437
438 return \%snp_hash;
439 }
440
441
442
443
444 1;