Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptSNPs.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 TranscriptSNPs - A utility class used to obtain information about the | |
24 relationships between a transcript and SNPs | |
25 | |
26 =head1 SYNOPSIS | |
27 | |
28 use Bio::EnsEMBL::Utils::TranscriptSNPs; | |
29 | |
30 # get and type all snps in the region of the transcript | |
31 | |
32 %snps = %{ | |
33 Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs( $transcript, | |
34 $flanking ) }; | |
35 | |
36 # get all snps overlapping the transcript in cdna coordinates | |
37 | |
38 %snps = | |
39 %{ Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs( | |
40 $transcript) }; | |
41 | |
42 # get the peptide variations caused by a set of SNPs | |
43 | |
44 %variations = %{ | |
45 Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations( | |
46 $transcript, $snps ) }; | |
47 | |
48 =head1 DESCRIPTION | |
49 | |
50 This is a utility class which can be used to get snps associated with a | |
51 transcript, and to determine the amino acid changes caused by the SNPs | |
52 | |
53 =head1 METHODS | |
54 | |
55 =cut | |
56 | |
57 package Bio::EnsEMBL::Utils::TranscriptSNPs; | |
58 | |
59 use strict; | |
60 use warnings; | |
61 no warnings 'uninitialized'; | |
62 | |
63 | |
64 | |
65 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
66 | |
67 | |
68 =head2 get_all_peptide_variations | |
69 | |
70 Arg [1] : $transcript the transcript to obtain the peptide variations for | |
71 Arg [2] : $snps listref of coding snps in cdna coordinates | |
72 Example : $pep_hash = get_all_peptide_variations($transcript, \@snps); | |
73 Description: Takes a list of coding snps on this transcript in | |
74 which are in cdna coordinates and returns a hash with peptide | |
75 coordinate keys and listrefs of alternative amino acids as | |
76 values. The SNPs must additionally have a strand of 1 for the | |
77 sake of simplicity. Normally these could be generated using the | |
78 get_all_cdna_SNPs method. | |
79 | |
80 Note that the peptide encoded by the reference sequence is | |
81 also present in the results and that duplicate peptides | |
82 (e.g. resulting from synonomous mutations) are discarded. | |
83 It is possible to have greated than two peptides variations | |
84 at a given location given adjacent or overlapping snps. | |
85 Insertion/deletion variations are ignored by this method. | |
86 Example of a data structure that could be returned: | |
87 { 1 => ['I', 'M'], | |
88 10 => ['I', 'T'], | |
89 37 => ['N', 'D'], | |
90 56 => ['G', 'E'], | |
91 118 => ['R', 'K'], | |
92 159 => ['D', 'E'], | |
93 167 => ['Q', 'R'], | |
94 173 => ['H', 'Q'] } | |
95 Returntype : hashref | |
96 Exceptions : none | |
97 Caller : general | |
98 | |
99 =cut | |
100 | |
101 sub get_all_peptide_variations { | |
102 my $transcript = shift; | |
103 my $snps = shift; | |
104 | |
105 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { | |
106 throw('Bio::EnsEMBL::Transcript argument is required.'); | |
107 } | |
108 | |
109 if(!ref($snps) eq 'ARRAY') { | |
110 throw('Reference to a list of Bio::EnsEMBL::SNP objects is required'); | |
111 } | |
112 | |
113 my $codon_table = Bio::Tools::CodonTable->new; | |
114 my $codon_length = 3; | |
115 my $cdna = $transcript->spliced_seq; | |
116 | |
117 my $variant_alleles; | |
118 my $translation_start = $transcript->cdna_coding_start; | |
119 foreach my $snp (@$snps) { | |
120 #skip variations not on a single base | |
121 next if ($snp->start != $snp->end); | |
122 | |
123 my $start = $snp->start; | |
124 my $strand = $snp->strand; | |
125 | |
126 #calculate offset of the nucleotide from codon start (0|1|2) | |
127 my $codon_pos = ($start - $translation_start) % $codon_length; | |
128 | |
129 #calculate the peptide coordinate of the snp | |
130 my $peptide = ($start - $translation_start + | |
131 ($codon_length - $codon_pos)) / $codon_length; | |
132 | |
133 # skip this SNP if it falls in a partial codon | |
134 next if $start - $codon_pos + $codon_length > length($cdna); | |
135 | |
136 #retrieve the codon | |
137 my $codon = substr($cdna, $start - $codon_pos-1, $codon_length); | |
138 | |
139 #store each alternative allele by its location in the peptide | |
140 my @alleles = split(/\/|\|/, lc($snp->allele_string)); | |
141 #my @alleles = split(/\/|\|/, lc($snp->alleles)); | |
142 | |
143 foreach my $allele (@alleles) { | |
144 next if $allele eq '-'; #skip deletions | |
145 next if CORE::length($allele) != 1; #skip insertions | |
146 | |
147 #get_all_cdna_SNPs always gives strand of 1 now | |
148 #if($strand == -1) { | |
149 # #complement the allele if the snp is on the reverse strand | |
150 # $allele =~ | |
151 # tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; | |
152 #} | |
153 | |
154 #create a data structure of variant alleles sorted by both their | |
155 #peptide position and their position within the peptides codon | |
156 $variant_alleles ||= {}; | |
157 if(exists $variant_alleles->{$peptide}) { | |
158 my $alleles_arr = $variant_alleles->{$peptide}->[1]; | |
159 push @{$alleles_arr->[$codon_pos]}, $allele; | |
160 } else { | |
161 #create a list of 3 lists (one list for each codon position) | |
162 my $alleles_arr = [[],[],[]]; | |
163 push @{$alleles_arr->[$codon_pos]}, $allele; | |
164 $variant_alleles->{$peptide} = [$codon, $alleles_arr]; | |
165 } | |
166 } | |
167 } | |
168 | |
169 my %out; | |
170 #now generate all possible codons for each peptide and translate them | |
171 foreach my $peptide (keys %$variant_alleles) { | |
172 my ($codon, $alleles) = @{$variant_alleles->{$peptide}}; | |
173 | |
174 #need to push original nucleotides onto each position | |
175 #so that all possible combinations can be generated | |
176 push @{$alleles->[0]}, substr($codon,0,1); | |
177 push @{$alleles->[1]}, substr($codon,1,1); | |
178 push @{$alleles->[2]}, substr($codon,2,1); | |
179 | |
180 my %alt_amino_acids; | |
181 foreach my $a1 (@{$alleles->[0]}) { | |
182 substr($codon, 0, 1) = $a1; | |
183 foreach my $a2 (@{$alleles->[1]}) { | |
184 substr($codon, 1, 1) = $a2; | |
185 foreach my $a3 (@{$alleles->[2]}) { | |
186 substr($codon, 2, 1) = $a3; | |
187 my $aa = $codon_table->translate($codon); | |
188 #print "$codon translation is $aa\n"; | |
189 $alt_amino_acids{$aa} = 1; | |
190 } | |
191 } | |
192 } | |
193 | |
194 my @aas = keys %alt_amino_acids; | |
195 $out{$peptide} = \@aas; | |
196 } | |
197 | |
198 return \%out; | |
199 } | |
200 | |
201 | |
202 | |
203 =head2 get_all_SNPs | |
204 | |
205 Arg [1] : Bio::EnsEMBL::Transcript $transcript | |
206 | |
207 Arg [2] : (optional) int $flanking | |
208 The number of basepairs of transcript flanking sequence to | |
209 retrieve snps from (default 0) | |
210 Arg [3] : $source type of database source (dbSNP, Glovar) | |
211 Example : $snp_hashref = get_all_transcript_SNPs($transcript) | |
212 Description: Retrieves all snps found within the region of the | |
213 provided transcript | |
214 The snps are returned in a hash with keys corresponding | |
215 to the region the snp was found in. Possible keys are: | |
216 'three prime UTR', 'five prime UTR', 'coding', 'intronic', | |
217 'three prime flanking', 'five prime flanking' | |
218 If no flanking argument is provided no flanking snps will be | |
219 obtained. | |
220 The listrefs which are the values of the returned hash | |
221 contain snps in coordinates of the transcript region | |
222 (i.e. first base = first base of the first exon on the | |
223 postive strand - flanking bases + 1) | |
224 | |
225 Multiple base variations and inserts/deletes are discarded | |
226 by this method and not used. | |
227 | |
228 Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for | |
229 values | |
230 Exceptions : none | |
231 Caller : general | |
232 | |
233 =cut | |
234 | |
235 sub get_all_SNPs { | |
236 my $transcript = shift; | |
237 my $flanking = shift || 0; | |
238 my $source = shift; | |
239 | |
240 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { | |
241 throw('Bio::EnsEMBL::Transcript argument required.'); | |
242 } | |
243 | |
244 my $slice = $transcript->slice(); | |
245 | |
246 if(!$slice) { | |
247 warning("Cannot obtain SNPs for transcript without attached Slice."); | |
248 return {}; | |
249 } | |
250 | |
251 my $sa = $slice->adaptor(); | |
252 | |
253 if(!$sa) { | |
254 warning('Cannot obtain SNPs for transcript unless attached slice ' . | |
255 'has attached adaptor'); | |
256 return {}; | |
257 } | |
258 | |
259 my %snp_hash; | |
260 | |
261 # retrieve slice in the region of the transcript | |
262 $slice = $sa->fetch_by_Feature($transcript, $flanking ); | |
263 | |
264 # copy transcript, to work in coord system we are interested in | |
265 $transcript = $transcript->transfer( $slice ); | |
266 | |
267 # get all snps in the transcript region | |
268 my $snps; | |
269 if ($source eq 'glovar') { | |
270 $snps = $slice->get_all_ExternalFeatures('GlovarSNP'); | |
271 } | |
272 elsif ($source eq 'variation') { | |
273 $snps = $slice->get_all_VariationFeatures; | |
274 } | |
275 else { | |
276 $snps = $slice->get_all_SNPs; # dont need once use new snp api (i think) | |
277 } | |
278 | |
279 my $trans_start = $flanking + 1; | |
280 my $trans_end = $slice->length - $flanking; | |
281 my $trans_strand = $transcript->get_all_Exons->[0]->strand; | |
282 | |
283 # classify each snp | |
284 foreach my $snp (@$snps) { | |
285 my $key; | |
286 | |
287 if(($trans_strand == 1 && $snp->end < $trans_start) || | |
288 ($trans_strand == -1 && $snp->start > $trans_end)) { | |
289 # this snp is upstream from the transcript | |
290 $key = 'five prime flanking'; | |
291 } | |
292 | |
293 elsif(($trans_strand == 1 && $snp->start > $trans_end) || | |
294 ($trans_strand == -1 && $snp->start < $trans_start)) { | |
295 # this snp is downstream from the transcript | |
296 $key = 'three prime flanking'; | |
297 } | |
298 | |
299 else { | |
300 #snp is inside transcript region check if it overlaps an exon | |
301 foreach my $e (@{$transcript->get_all_Exons}) { | |
302 if($snp->end >= $e->start && $snp->start <= $e->end) { | |
303 # this snp is in an exon | |
304 | |
305 if(($trans_strand == 1 && | |
306 $snp->end < $transcript->coding_region_start) || | |
307 ($trans_strand == -1 && | |
308 $snp->start > $transcript->coding_region_end)) { | |
309 # this snp is in the 5' UTR | |
310 $key = 'five prime UTR'; | |
311 } | |
312 | |
313 elsif(($trans_strand == 1 && | |
314 $snp->start > $transcript->coding_region_end)|| | |
315 ($trans_strand == -1 && | |
316 $snp->end < $transcript->coding_region_start)) { | |
317 # this snp is in the 3' UTR | |
318 $key = 'three prime UTR'; | |
319 } | |
320 | |
321 else { | |
322 # snp is coding | |
323 $key = 'coding'; | |
324 } | |
325 last; | |
326 } | |
327 } | |
328 unless($key) { | |
329 # snp was not in an exon and is therefore intronic | |
330 $key = 'intronic'; | |
331 } | |
332 } | |
333 | |
334 unless($key) { | |
335 #warning('SNP could not be mapped. In/Dels not supported yet...'); | |
336 next; | |
337 } | |
338 | |
339 if(exists $snp_hash{$key}) { | |
340 push @{$snp_hash{$key}}, $snp; | |
341 } | |
342 else { | |
343 $snp_hash{$key} = [$snp]; | |
344 } | |
345 } | |
346 | |
347 return \%snp_hash; | |
348 } | |
349 | |
350 | |
351 | |
352 =head2 get_all_cdna_SNPs | |
353 | |
354 Arg [1] : Bio::EnsEMBL::Transcript $transcript | |
355 Arg [2] : $source type of database source (dbSNP, Glovar) | |
356 Example : $cdna_snp_hasref = $transcript->get_all_cdna_SNPs; | |
357 Description: Retrieves all snps found within exons of the provided | |
358 transcript. | |
359 The snps are returned in a hash with three keys corresponding | |
360 to the region the snp was found in. Valid keys are: | |
361 'three prime UTR', 'five prime UTR', 'coding' | |
362 The listrefs which are the values of the returned hash | |
363 contain snps in CDNA coordinates. | |
364 | |
365 Multiple base variations and insertions/deletions are not | |
366 used by this function and are discarded. | |
367 Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for | |
368 values | |
369 Exceptions : none | |
370 Caller : general | |
371 | |
372 =cut | |
373 | |
374 sub get_all_cdna_SNPs { | |
375 my ($transcript, $source) = @_; | |
376 | |
377 #retrieve all of the snps from this transcript | |
378 my $all_snps = get_all_SNPs($transcript, 0, $source); | |
379 my %snp_hash; | |
380 | |
381 my @cdna_types = ('three prime UTR', 'five prime UTR','coding'); | |
382 | |
383 my $slice = $transcript->slice(); | |
384 my $sa = $slice->adaptor(); | |
385 | |
386 $slice = $sa->fetch_by_Feature($transcript); | |
387 | |
388 # copy transcript in order to work in coord system of interest | |
389 $transcript = $transcript->transfer($slice); | |
390 | |
391 foreach my $type (@cdna_types) { | |
392 $snp_hash{$type} = []; | |
393 foreach my $snp (@{$all_snps->{$type}}) { | |
394 my @coords = $transcript->genomic2cdna($snp->start, $snp->end, | |
395 $snp->strand); | |
396 | |
397 #skip snps that don't map cleanly (possibly an indel...) | |
398 if(scalar(@coords) != 1) { | |
399 #warning("snp of type $type does not map cleanly\n"); | |
400 next; | |
401 } | |
402 | |
403 my ($coord) = @coords; | |
404 | |
405 unless($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { | |
406 #warning("snp of type $type maps to gap\n"); | |
407 next; | |
408 } | |
409 | |
410 my $alleles; | |
411 my $ambicode; | |
412 | |
413 # get alleles and ambig_code (with fallback to old snp API) | |
414 $alleles = $snp->allele_string || $snp->{'alleles'}; | |
415 $ambicode = $snp->ambig_code || $snp->{'_ambiguity_code'}; | |
416 | |
417 #we arbitrarily put the SNP on the +ve strand because it is easier to | |
418 #work with in the webcode | |
419 if($coord->strand == -1) { | |
420 $alleles =~ | |
421 tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//; | |
422 $ambicode =~ | |
423 tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//; | |
424 } | |
425 #copy the snp and convert to cdna coords... | |
426 my $new_snp; | |
427 %$new_snp = %$snp; | |
428 bless $new_snp, ref $snp; | |
429 $new_snp->start($coord->start); | |
430 $new_snp->end($coord->end); | |
431 $new_snp->strand(1); | |
432 $new_snp->allele_string($alleles); | |
433 $new_snp->ambig_code($ambicode); | |
434 push @{$snp_hash{$type}}, $new_snp; | |
435 } | |
436 } | |
437 | |
438 return \%snp_hash; | |
439 } | |
440 | |
441 | |
442 | |
443 | |
444 1; |