annotate variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptSNPs.pm @ 1:d6778b5d8382 draft default tip

Deleted selected files
author willmclaren
date Fri, 03 Aug 2012 10:05:43 -0400
parents 21066c0abaf5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1 =head1 LICENSE
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 Genome Research Limited. All rights reserved.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 This software is distributed under a modified Apache license.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 For license details, please see
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 http://www.ensembl.org/info/about/code_licence.html
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 =head1 CONTACT
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 Please email comments or questions to the public Ensembl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14 developers list at <dev@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16 Questions may also be sent to the Ensembl help desk at
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17 <helpdesk@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 TranscriptSNPs - A utility class used to obtain information about the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24 relationships between a transcript and SNPs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26 =head1 SYNOPSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28 use Bio::EnsEMBL::Utils::TranscriptSNPs;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30 # get and type all snps in the region of the transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32 %snps = %{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33 Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs( $transcript,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34 $flanking ) };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 # get all snps overlapping the transcript in cdna coordinates
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38 %snps =
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 %{ Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40 $transcript) };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42 # get the peptide variations caused by a set of SNPs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44 %variations = %{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45 Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46 $transcript, $snps ) };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48 =head1 DESCRIPTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 This is a utility class which can be used to get snps associated with a
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51 transcript, and to determine the amino acid changes caused by the SNPs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53 =head1 METHODS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57 package Bio::EnsEMBL::Utils::TranscriptSNPs;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60 use warnings;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61 no warnings 'uninitialized';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68 =head2 get_all_peptide_variations
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70 Arg [1] : $transcript the transcript to obtain the peptide variations for
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71 Arg [2] : $snps listref of coding snps in cdna coordinates
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72 Example : $pep_hash = get_all_peptide_variations($transcript, \@snps);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73 Description: Takes a list of coding snps on this transcript in
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74 which are in cdna coordinates and returns a hash with peptide
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75 coordinate keys and listrefs of alternative amino acids as
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76 values. The SNPs must additionally have a strand of 1 for the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77 sake of simplicity. Normally these could be generated using the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78 get_all_cdna_SNPs method.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80 Note that the peptide encoded by the reference sequence is
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81 also present in the results and that duplicate peptides
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
82 (e.g. resulting from synonomous mutations) are discarded.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
83 It is possible to have greated than two peptides variations
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84 at a given location given adjacent or overlapping snps.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
85 Insertion/deletion variations are ignored by this method.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
86 Example of a data structure that could be returned:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
87 { 1 => ['I', 'M'],
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
88 10 => ['I', 'T'],
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
89 37 => ['N', 'D'],
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
90 56 => ['G', 'E'],
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
91 118 => ['R', 'K'],
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
92 159 => ['D', 'E'],
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
93 167 => ['Q', 'R'],
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94 173 => ['H', 'Q'] }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95 Returntype : hashref
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96 Exceptions : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97 Caller : general
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 sub get_all_peptide_variations {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102 my $transcript = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103 my $snps = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
105 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
106 throw('Bio::EnsEMBL::Transcript argument is required.');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
107 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
108
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
109 if(!ref($snps) eq 'ARRAY') {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
110 throw('Reference to a list of Bio::EnsEMBL::SNP objects is required');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
111 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
112
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
113 my $codon_table = Bio::Tools::CodonTable->new;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
114 my $codon_length = 3;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
115 my $cdna = $transcript->spliced_seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
116
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
117 my $variant_alleles;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
118 my $translation_start = $transcript->cdna_coding_start;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
119 foreach my $snp (@$snps) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
120 #skip variations not on a single base
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
121 next if ($snp->start != $snp->end);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
122
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
123 my $start = $snp->start;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
124 my $strand = $snp->strand;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
125
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
126 #calculate offset of the nucleotide from codon start (0|1|2)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
127 my $codon_pos = ($start - $translation_start) % $codon_length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
128
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
129 #calculate the peptide coordinate of the snp
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
130 my $peptide = ($start - $translation_start +
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
131 ($codon_length - $codon_pos)) / $codon_length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
132
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
133 # skip this SNP if it falls in a partial codon
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
134 next if $start - $codon_pos + $codon_length > length($cdna);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136 #retrieve the codon
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137 my $codon = substr($cdna, $start - $codon_pos-1, $codon_length);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139 #store each alternative allele by its location in the peptide
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140 my @alleles = split(/\/|\|/, lc($snp->allele_string));
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141 #my @alleles = split(/\/|\|/, lc($snp->alleles));
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143 foreach my $allele (@alleles) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144 next if $allele eq '-'; #skip deletions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 next if CORE::length($allele) != 1; #skip insertions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
147 #get_all_cdna_SNPs always gives strand of 1 now
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
148 #if($strand == -1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
149 # #complement the allele if the snp is on the reverse strand
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
150 # $allele =~
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
151 # tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
152 #}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
153
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
154 #create a data structure of variant alleles sorted by both their
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
155 #peptide position and their position within the peptides codon
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
156 $variant_alleles ||= {};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
157 if(exists $variant_alleles->{$peptide}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
158 my $alleles_arr = $variant_alleles->{$peptide}->[1];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
159 push @{$alleles_arr->[$codon_pos]}, $allele;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
160 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
161 #create a list of 3 lists (one list for each codon position)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
162 my $alleles_arr = [[],[],[]];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
163 push @{$alleles_arr->[$codon_pos]}, $allele;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
164 $variant_alleles->{$peptide} = [$codon, $alleles_arr];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
165 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
166 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
167 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
168
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
169 my %out;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
170 #now generate all possible codons for each peptide and translate them
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
171 foreach my $peptide (keys %$variant_alleles) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
172 my ($codon, $alleles) = @{$variant_alleles->{$peptide}};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
173
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
174 #need to push original nucleotides onto each position
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
175 #so that all possible combinations can be generated
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
176 push @{$alleles->[0]}, substr($codon,0,1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
177 push @{$alleles->[1]}, substr($codon,1,1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
178 push @{$alleles->[2]}, substr($codon,2,1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
179
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
180 my %alt_amino_acids;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
181 foreach my $a1 (@{$alleles->[0]}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
182 substr($codon, 0, 1) = $a1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
183 foreach my $a2 (@{$alleles->[1]}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
184 substr($codon, 1, 1) = $a2;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
185 foreach my $a3 (@{$alleles->[2]}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
186 substr($codon, 2, 1) = $a3;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
187 my $aa = $codon_table->translate($codon);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
188 #print "$codon translation is $aa\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
189 $alt_amino_acids{$aa} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
190 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
191 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
192 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
193
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
194 my @aas = keys %alt_amino_acids;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
195 $out{$peptide} = \@aas;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
196 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
197
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
198 return \%out;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
199 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
200
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
201
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
202
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
203 =head2 get_all_SNPs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
204
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
205 Arg [1] : Bio::EnsEMBL::Transcript $transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
206
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
207 Arg [2] : (optional) int $flanking
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
208 The number of basepairs of transcript flanking sequence to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
209 retrieve snps from (default 0)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
210 Arg [3] : $source type of database source (dbSNP, Glovar)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
211 Example : $snp_hashref = get_all_transcript_SNPs($transcript)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
212 Description: Retrieves all snps found within the region of the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
213 provided transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
214 The snps are returned in a hash with keys corresponding
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
215 to the region the snp was found in. Possible keys are:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
216 'three prime UTR', 'five prime UTR', 'coding', 'intronic',
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
217 'three prime flanking', 'five prime flanking'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
218 If no flanking argument is provided no flanking snps will be
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
219 obtained.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
220 The listrefs which are the values of the returned hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
221 contain snps in coordinates of the transcript region
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
222 (i.e. first base = first base of the first exon on the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
223 postive strand - flanking bases + 1)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
224
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
225 Multiple base variations and inserts/deletes are discarded
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
226 by this method and not used.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
227
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
228 Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
229 values
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
230 Exceptions : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
231 Caller : general
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
232
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
233 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
234
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
235 sub get_all_SNPs {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
236 my $transcript = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
237 my $flanking = shift || 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
238 my $source = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
239
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
240 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
241 throw('Bio::EnsEMBL::Transcript argument required.');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
242 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
243
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
244 my $slice = $transcript->slice();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
245
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
246 if(!$slice) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
247 warning("Cannot obtain SNPs for transcript without attached Slice.");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
248 return {};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
249 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
250
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
251 my $sa = $slice->adaptor();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
252
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
253 if(!$sa) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
254 warning('Cannot obtain SNPs for transcript unless attached slice ' .
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
255 'has attached adaptor');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
256 return {};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
257 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
258
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
259 my %snp_hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
260
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
261 # retrieve slice in the region of the transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
262 $slice = $sa->fetch_by_Feature($transcript, $flanking );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
263
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
264 # copy transcript, to work in coord system we are interested in
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
265 $transcript = $transcript->transfer( $slice );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
266
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
267 # get all snps in the transcript region
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
268 my $snps;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
269 if ($source eq 'glovar') {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
270 $snps = $slice->get_all_ExternalFeatures('GlovarSNP');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
271 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
272 elsif ($source eq 'variation') {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
273 $snps = $slice->get_all_VariationFeatures;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
274 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
275 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
276 $snps = $slice->get_all_SNPs; # dont need once use new snp api (i think)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
277 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
278
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
279 my $trans_start = $flanking + 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
280 my $trans_end = $slice->length - $flanking;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
281 my $trans_strand = $transcript->get_all_Exons->[0]->strand;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
282
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
283 # classify each snp
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
284 foreach my $snp (@$snps) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
285 my $key;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
286
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
287 if(($trans_strand == 1 && $snp->end < $trans_start) ||
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
288 ($trans_strand == -1 && $snp->start > $trans_end)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
289 # this snp is upstream from the transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
290 $key = 'five prime flanking';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
291 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
292
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
293 elsif(($trans_strand == 1 && $snp->start > $trans_end) ||
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
294 ($trans_strand == -1 && $snp->start < $trans_start)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
295 # this snp is downstream from the transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
296 $key = 'three prime flanking';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
297 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
298
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
299 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
300 #snp is inside transcript region check if it overlaps an exon
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
301 foreach my $e (@{$transcript->get_all_Exons}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
302 if($snp->end >= $e->start && $snp->start <= $e->end) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
303 # this snp is in an exon
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
304
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
305 if(($trans_strand == 1 &&
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
306 $snp->end < $transcript->coding_region_start) ||
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
307 ($trans_strand == -1 &&
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
308 $snp->start > $transcript->coding_region_end)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
309 # this snp is in the 5' UTR
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
310 $key = 'five prime UTR';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
311 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
312
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
313 elsif(($trans_strand == 1 &&
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
314 $snp->start > $transcript->coding_region_end)||
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
315 ($trans_strand == -1 &&
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
316 $snp->end < $transcript->coding_region_start)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
317 # this snp is in the 3' UTR
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
318 $key = 'three prime UTR';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
319 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
320
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
321 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
322 # snp is coding
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
323 $key = 'coding';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
324 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
325 last;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
326 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
327 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
328 unless($key) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
329 # snp was not in an exon and is therefore intronic
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
330 $key = 'intronic';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
331 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
332 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
333
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
334 unless($key) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
335 #warning('SNP could not be mapped. In/Dels not supported yet...');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
336 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
337 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
338
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
339 if(exists $snp_hash{$key}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
340 push @{$snp_hash{$key}}, $snp;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
341 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
342 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
343 $snp_hash{$key} = [$snp];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
344 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
345 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
346
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
347 return \%snp_hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
348 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
349
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
350
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
351
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
352 =head2 get_all_cdna_SNPs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
353
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
354 Arg [1] : Bio::EnsEMBL::Transcript $transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
355 Arg [2] : $source type of database source (dbSNP, Glovar)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
356 Example : $cdna_snp_hasref = $transcript->get_all_cdna_SNPs;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
357 Description: Retrieves all snps found within exons of the provided
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
358 transcript.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
359 The snps are returned in a hash with three keys corresponding
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
360 to the region the snp was found in. Valid keys are:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
361 'three prime UTR', 'five prime UTR', 'coding'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
362 The listrefs which are the values of the returned hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
363 contain snps in CDNA coordinates.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
364
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
365 Multiple base variations and insertions/deletions are not
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
366 used by this function and are discarded.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
367 Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
368 values
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
369 Exceptions : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
370 Caller : general
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
371
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
372 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
373
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
374 sub get_all_cdna_SNPs {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
375 my ($transcript, $source) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
376
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
377 #retrieve all of the snps from this transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
378 my $all_snps = get_all_SNPs($transcript, 0, $source);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
379 my %snp_hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
380
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
381 my @cdna_types = ('three prime UTR', 'five prime UTR','coding');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
382
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
383 my $slice = $transcript->slice();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
384 my $sa = $slice->adaptor();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
385
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
386 $slice = $sa->fetch_by_Feature($transcript);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
387
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
388 # copy transcript in order to work in coord system of interest
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
389 $transcript = $transcript->transfer($slice);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
390
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
391 foreach my $type (@cdna_types) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
392 $snp_hash{$type} = [];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
393 foreach my $snp (@{$all_snps->{$type}}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
394 my @coords = $transcript->genomic2cdna($snp->start, $snp->end,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
395 $snp->strand);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
396
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
397 #skip snps that don't map cleanly (possibly an indel...)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
398 if(scalar(@coords) != 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
399 #warning("snp of type $type does not map cleanly\n");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
400 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
401 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
402
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
403 my ($coord) = @coords;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
404
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
405 unless($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
406 #warning("snp of type $type maps to gap\n");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
407 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
408 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
409
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
410 my $alleles;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
411 my $ambicode;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
412
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
413 # get alleles and ambig_code (with fallback to old snp API)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
414 $alleles = $snp->allele_string || $snp->{'alleles'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
415 $ambicode = $snp->ambig_code || $snp->{'_ambiguity_code'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
416
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
417 #we arbitrarily put the SNP on the +ve strand because it is easier to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
418 #work with in the webcode
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
419 if($coord->strand == -1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
420 $alleles =~
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
421 tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
422 $ambicode =~
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
423 tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
424 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
425 #copy the snp and convert to cdna coords...
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
426 my $new_snp;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
427 %$new_snp = %$snp;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
428 bless $new_snp, ref $snp;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
429 $new_snp->start($coord->start);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
430 $new_snp->end($coord->end);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
431 $new_snp->strand(1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
432 $new_snp->allele_string($alleles);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
433 $new_snp->ambig_code($ambicode);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
434 push @{$snp_hash{$type}}, $new_snp;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
435 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
436 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
437
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
438 return \%snp_hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
439 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
440
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
441
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
442
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
443
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
444 1;