0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 TranscriptSNPs - A utility class used to obtain information about the
|
|
24 relationships between a transcript and SNPs
|
|
25
|
|
26 =head1 SYNOPSIS
|
|
27
|
|
28 use Bio::EnsEMBL::Utils::TranscriptSNPs;
|
|
29
|
|
30 # get and type all snps in the region of the transcript
|
|
31
|
|
32 %snps = %{
|
|
33 Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs( $transcript,
|
|
34 $flanking ) };
|
|
35
|
|
36 # get all snps overlapping the transcript in cdna coordinates
|
|
37
|
|
38 %snps =
|
|
39 %{ Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs(
|
|
40 $transcript) };
|
|
41
|
|
42 # get the peptide variations caused by a set of SNPs
|
|
43
|
|
44 %variations = %{
|
|
45 Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations(
|
|
46 $transcript, $snps ) };
|
|
47
|
|
48 =head1 DESCRIPTION
|
|
49
|
|
50 This is a utility class which can be used to get snps associated with a
|
|
51 transcript, and to determine the amino acid changes caused by the SNPs
|
|
52
|
|
53 =head1 METHODS
|
|
54
|
|
55 =cut
|
|
56
|
|
57 package Bio::EnsEMBL::Utils::TranscriptSNPs;
|
|
58
|
|
59 use strict;
|
|
60 use warnings;
|
|
61 no warnings 'uninitialized';
|
|
62
|
|
63
|
|
64
|
|
65 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
66
|
|
67
|
|
68 =head2 get_all_peptide_variations
|
|
69
|
|
70 Arg [1] : $transcript the transcript to obtain the peptide variations for
|
|
71 Arg [2] : $snps listref of coding snps in cdna coordinates
|
|
72 Example : $pep_hash = get_all_peptide_variations($transcript, \@snps);
|
|
73 Description: Takes a list of coding snps on this transcript in
|
|
74 which are in cdna coordinates and returns a hash with peptide
|
|
75 coordinate keys and listrefs of alternative amino acids as
|
|
76 values. The SNPs must additionally have a strand of 1 for the
|
|
77 sake of simplicity. Normally these could be generated using the
|
|
78 get_all_cdna_SNPs method.
|
|
79
|
|
80 Note that the peptide encoded by the reference sequence is
|
|
81 also present in the results and that duplicate peptides
|
|
82 (e.g. resulting from synonomous mutations) are discarded.
|
|
83 It is possible to have greated than two peptides variations
|
|
84 at a given location given adjacent or overlapping snps.
|
|
85 Insertion/deletion variations are ignored by this method.
|
|
86 Example of a data structure that could be returned:
|
|
87 { 1 => ['I', 'M'],
|
|
88 10 => ['I', 'T'],
|
|
89 37 => ['N', 'D'],
|
|
90 56 => ['G', 'E'],
|
|
91 118 => ['R', 'K'],
|
|
92 159 => ['D', 'E'],
|
|
93 167 => ['Q', 'R'],
|
|
94 173 => ['H', 'Q'] }
|
|
95 Returntype : hashref
|
|
96 Exceptions : none
|
|
97 Caller : general
|
|
98
|
|
99 =cut
|
|
100
|
|
101 sub get_all_peptide_variations {
|
|
102 my $transcript = shift;
|
|
103 my $snps = shift;
|
|
104
|
|
105 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
|
|
106 throw('Bio::EnsEMBL::Transcript argument is required.');
|
|
107 }
|
|
108
|
|
109 if(!ref($snps) eq 'ARRAY') {
|
|
110 throw('Reference to a list of Bio::EnsEMBL::SNP objects is required');
|
|
111 }
|
|
112
|
|
113 my $codon_table = Bio::Tools::CodonTable->new;
|
|
114 my $codon_length = 3;
|
|
115 my $cdna = $transcript->spliced_seq;
|
|
116
|
|
117 my $variant_alleles;
|
|
118 my $translation_start = $transcript->cdna_coding_start;
|
|
119 foreach my $snp (@$snps) {
|
|
120 #skip variations not on a single base
|
|
121 next if ($snp->start != $snp->end);
|
|
122
|
|
123 my $start = $snp->start;
|
|
124 my $strand = $snp->strand;
|
|
125
|
|
126 #calculate offset of the nucleotide from codon start (0|1|2)
|
|
127 my $codon_pos = ($start - $translation_start) % $codon_length;
|
|
128
|
|
129 #calculate the peptide coordinate of the snp
|
|
130 my $peptide = ($start - $translation_start +
|
|
131 ($codon_length - $codon_pos)) / $codon_length;
|
|
132
|
|
133 # skip this SNP if it falls in a partial codon
|
|
134 next if $start - $codon_pos + $codon_length > length($cdna);
|
|
135
|
|
136 #retrieve the codon
|
|
137 my $codon = substr($cdna, $start - $codon_pos-1, $codon_length);
|
|
138
|
|
139 #store each alternative allele by its location in the peptide
|
|
140 my @alleles = split(/\/|\|/, lc($snp->allele_string));
|
|
141 #my @alleles = split(/\/|\|/, lc($snp->alleles));
|
|
142
|
|
143 foreach my $allele (@alleles) {
|
|
144 next if $allele eq '-'; #skip deletions
|
|
145 next if CORE::length($allele) != 1; #skip insertions
|
|
146
|
|
147 #get_all_cdna_SNPs always gives strand of 1 now
|
|
148 #if($strand == -1) {
|
|
149 # #complement the allele if the snp is on the reverse strand
|
|
150 # $allele =~
|
|
151 # tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
|
|
152 #}
|
|
153
|
|
154 #create a data structure of variant alleles sorted by both their
|
|
155 #peptide position and their position within the peptides codon
|
|
156 $variant_alleles ||= {};
|
|
157 if(exists $variant_alleles->{$peptide}) {
|
|
158 my $alleles_arr = $variant_alleles->{$peptide}->[1];
|
|
159 push @{$alleles_arr->[$codon_pos]}, $allele;
|
|
160 } else {
|
|
161 #create a list of 3 lists (one list for each codon position)
|
|
162 my $alleles_arr = [[],[],[]];
|
|
163 push @{$alleles_arr->[$codon_pos]}, $allele;
|
|
164 $variant_alleles->{$peptide} = [$codon, $alleles_arr];
|
|
165 }
|
|
166 }
|
|
167 }
|
|
168
|
|
169 my %out;
|
|
170 #now generate all possible codons for each peptide and translate them
|
|
171 foreach my $peptide (keys %$variant_alleles) {
|
|
172 my ($codon, $alleles) = @{$variant_alleles->{$peptide}};
|
|
173
|
|
174 #need to push original nucleotides onto each position
|
|
175 #so that all possible combinations can be generated
|
|
176 push @{$alleles->[0]}, substr($codon,0,1);
|
|
177 push @{$alleles->[1]}, substr($codon,1,1);
|
|
178 push @{$alleles->[2]}, substr($codon,2,1);
|
|
179
|
|
180 my %alt_amino_acids;
|
|
181 foreach my $a1 (@{$alleles->[0]}) {
|
|
182 substr($codon, 0, 1) = $a1;
|
|
183 foreach my $a2 (@{$alleles->[1]}) {
|
|
184 substr($codon, 1, 1) = $a2;
|
|
185 foreach my $a3 (@{$alleles->[2]}) {
|
|
186 substr($codon, 2, 1) = $a3;
|
|
187 my $aa = $codon_table->translate($codon);
|
|
188 #print "$codon translation is $aa\n";
|
|
189 $alt_amino_acids{$aa} = 1;
|
|
190 }
|
|
191 }
|
|
192 }
|
|
193
|
|
194 my @aas = keys %alt_amino_acids;
|
|
195 $out{$peptide} = \@aas;
|
|
196 }
|
|
197
|
|
198 return \%out;
|
|
199 }
|
|
200
|
|
201
|
|
202
|
|
203 =head2 get_all_SNPs
|
|
204
|
|
205 Arg [1] : Bio::EnsEMBL::Transcript $transcript
|
|
206
|
|
207 Arg [2] : (optional) int $flanking
|
|
208 The number of basepairs of transcript flanking sequence to
|
|
209 retrieve snps from (default 0)
|
|
210 Arg [3] : $source type of database source (dbSNP, Glovar)
|
|
211 Example : $snp_hashref = get_all_transcript_SNPs($transcript)
|
|
212 Description: Retrieves all snps found within the region of the
|
|
213 provided transcript
|
|
214 The snps are returned in a hash with keys corresponding
|
|
215 to the region the snp was found in. Possible keys are:
|
|
216 'three prime UTR', 'five prime UTR', 'coding', 'intronic',
|
|
217 'three prime flanking', 'five prime flanking'
|
|
218 If no flanking argument is provided no flanking snps will be
|
|
219 obtained.
|
|
220 The listrefs which are the values of the returned hash
|
|
221 contain snps in coordinates of the transcript region
|
|
222 (i.e. first base = first base of the first exon on the
|
|
223 postive strand - flanking bases + 1)
|
|
224
|
|
225 Multiple base variations and inserts/deletes are discarded
|
|
226 by this method and not used.
|
|
227
|
|
228 Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
|
|
229 values
|
|
230 Exceptions : none
|
|
231 Caller : general
|
|
232
|
|
233 =cut
|
|
234
|
|
235 sub get_all_SNPs {
|
|
236 my $transcript = shift;
|
|
237 my $flanking = shift || 0;
|
|
238 my $source = shift;
|
|
239
|
|
240 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
|
|
241 throw('Bio::EnsEMBL::Transcript argument required.');
|
|
242 }
|
|
243
|
|
244 my $slice = $transcript->slice();
|
|
245
|
|
246 if(!$slice) {
|
|
247 warning("Cannot obtain SNPs for transcript without attached Slice.");
|
|
248 return {};
|
|
249 }
|
|
250
|
|
251 my $sa = $slice->adaptor();
|
|
252
|
|
253 if(!$sa) {
|
|
254 warning('Cannot obtain SNPs for transcript unless attached slice ' .
|
|
255 'has attached adaptor');
|
|
256 return {};
|
|
257 }
|
|
258
|
|
259 my %snp_hash;
|
|
260
|
|
261 # retrieve slice in the region of the transcript
|
|
262 $slice = $sa->fetch_by_Feature($transcript, $flanking );
|
|
263
|
|
264 # copy transcript, to work in coord system we are interested in
|
|
265 $transcript = $transcript->transfer( $slice );
|
|
266
|
|
267 # get all snps in the transcript region
|
|
268 my $snps;
|
|
269 if ($source eq 'glovar') {
|
|
270 $snps = $slice->get_all_ExternalFeatures('GlovarSNP');
|
|
271 }
|
|
272 elsif ($source eq 'variation') {
|
|
273 $snps = $slice->get_all_VariationFeatures;
|
|
274 }
|
|
275 else {
|
|
276 $snps = $slice->get_all_SNPs; # dont need once use new snp api (i think)
|
|
277 }
|
|
278
|
|
279 my $trans_start = $flanking + 1;
|
|
280 my $trans_end = $slice->length - $flanking;
|
|
281 my $trans_strand = $transcript->get_all_Exons->[0]->strand;
|
|
282
|
|
283 # classify each snp
|
|
284 foreach my $snp (@$snps) {
|
|
285 my $key;
|
|
286
|
|
287 if(($trans_strand == 1 && $snp->end < $trans_start) ||
|
|
288 ($trans_strand == -1 && $snp->start > $trans_end)) {
|
|
289 # this snp is upstream from the transcript
|
|
290 $key = 'five prime flanking';
|
|
291 }
|
|
292
|
|
293 elsif(($trans_strand == 1 && $snp->start > $trans_end) ||
|
|
294 ($trans_strand == -1 && $snp->start < $trans_start)) {
|
|
295 # this snp is downstream from the transcript
|
|
296 $key = 'three prime flanking';
|
|
297 }
|
|
298
|
|
299 else {
|
|
300 #snp is inside transcript region check if it overlaps an exon
|
|
301 foreach my $e (@{$transcript->get_all_Exons}) {
|
|
302 if($snp->end >= $e->start && $snp->start <= $e->end) {
|
|
303 # this snp is in an exon
|
|
304
|
|
305 if(($trans_strand == 1 &&
|
|
306 $snp->end < $transcript->coding_region_start) ||
|
|
307 ($trans_strand == -1 &&
|
|
308 $snp->start > $transcript->coding_region_end)) {
|
|
309 # this snp is in the 5' UTR
|
|
310 $key = 'five prime UTR';
|
|
311 }
|
|
312
|
|
313 elsif(($trans_strand == 1 &&
|
|
314 $snp->start > $transcript->coding_region_end)||
|
|
315 ($trans_strand == -1 &&
|
|
316 $snp->end < $transcript->coding_region_start)) {
|
|
317 # this snp is in the 3' UTR
|
|
318 $key = 'three prime UTR';
|
|
319 }
|
|
320
|
|
321 else {
|
|
322 # snp is coding
|
|
323 $key = 'coding';
|
|
324 }
|
|
325 last;
|
|
326 }
|
|
327 }
|
|
328 unless($key) {
|
|
329 # snp was not in an exon and is therefore intronic
|
|
330 $key = 'intronic';
|
|
331 }
|
|
332 }
|
|
333
|
|
334 unless($key) {
|
|
335 #warning('SNP could not be mapped. In/Dels not supported yet...');
|
|
336 next;
|
|
337 }
|
|
338
|
|
339 if(exists $snp_hash{$key}) {
|
|
340 push @{$snp_hash{$key}}, $snp;
|
|
341 }
|
|
342 else {
|
|
343 $snp_hash{$key} = [$snp];
|
|
344 }
|
|
345 }
|
|
346
|
|
347 return \%snp_hash;
|
|
348 }
|
|
349
|
|
350
|
|
351
|
|
352 =head2 get_all_cdna_SNPs
|
|
353
|
|
354 Arg [1] : Bio::EnsEMBL::Transcript $transcript
|
|
355 Arg [2] : $source type of database source (dbSNP, Glovar)
|
|
356 Example : $cdna_snp_hasref = $transcript->get_all_cdna_SNPs;
|
|
357 Description: Retrieves all snps found within exons of the provided
|
|
358 transcript.
|
|
359 The snps are returned in a hash with three keys corresponding
|
|
360 to the region the snp was found in. Valid keys are:
|
|
361 'three prime UTR', 'five prime UTR', 'coding'
|
|
362 The listrefs which are the values of the returned hash
|
|
363 contain snps in CDNA coordinates.
|
|
364
|
|
365 Multiple base variations and insertions/deletions are not
|
|
366 used by this function and are discarded.
|
|
367 Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
|
|
368 values
|
|
369 Exceptions : none
|
|
370 Caller : general
|
|
371
|
|
372 =cut
|
|
373
|
|
374 sub get_all_cdna_SNPs {
|
|
375 my ($transcript, $source) = @_;
|
|
376
|
|
377 #retrieve all of the snps from this transcript
|
|
378 my $all_snps = get_all_SNPs($transcript, 0, $source);
|
|
379 my %snp_hash;
|
|
380
|
|
381 my @cdna_types = ('three prime UTR', 'five prime UTR','coding');
|
|
382
|
|
383 my $slice = $transcript->slice();
|
|
384 my $sa = $slice->adaptor();
|
|
385
|
|
386 $slice = $sa->fetch_by_Feature($transcript);
|
|
387
|
|
388 # copy transcript in order to work in coord system of interest
|
|
389 $transcript = $transcript->transfer($slice);
|
|
390
|
|
391 foreach my $type (@cdna_types) {
|
|
392 $snp_hash{$type} = [];
|
|
393 foreach my $snp (@{$all_snps->{$type}}) {
|
|
394 my @coords = $transcript->genomic2cdna($snp->start, $snp->end,
|
|
395 $snp->strand);
|
|
396
|
|
397 #skip snps that don't map cleanly (possibly an indel...)
|
|
398 if(scalar(@coords) != 1) {
|
|
399 #warning("snp of type $type does not map cleanly\n");
|
|
400 next;
|
|
401 }
|
|
402
|
|
403 my ($coord) = @coords;
|
|
404
|
|
405 unless($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
|
|
406 #warning("snp of type $type maps to gap\n");
|
|
407 next;
|
|
408 }
|
|
409
|
|
410 my $alleles;
|
|
411 my $ambicode;
|
|
412
|
|
413 # get alleles and ambig_code (with fallback to old snp API)
|
|
414 $alleles = $snp->allele_string || $snp->{'alleles'};
|
|
415 $ambicode = $snp->ambig_code || $snp->{'_ambiguity_code'};
|
|
416
|
|
417 #we arbitrarily put the SNP on the +ve strand because it is easier to
|
|
418 #work with in the webcode
|
|
419 if($coord->strand == -1) {
|
|
420 $alleles =~
|
|
421 tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
|
|
422 $ambicode =~
|
|
423 tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
|
|
424 }
|
|
425 #copy the snp and convert to cdna coords...
|
|
426 my $new_snp;
|
|
427 %$new_snp = %$snp;
|
|
428 bless $new_snp, ref $snp;
|
|
429 $new_snp->start($coord->start);
|
|
430 $new_snp->end($coord->end);
|
|
431 $new_snp->strand(1);
|
|
432 $new_snp->allele_string($alleles);
|
|
433 $new_snp->ambig_code($ambicode);
|
|
434 push @{$snp_hash{$type}}, $new_snp;
|
|
435 }
|
|
436 }
|
|
437
|
|
438 return \%snp_hash;
|
|
439 }
|
|
440
|
|
441
|
|
442
|
|
443
|
|
444 1;
|