|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 ExAC
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv ExAC.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz
|
|
|
32 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC
|
|
|
33 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,,AN
|
|
|
34 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC,AN
|
|
|
35
|
|
|
36
|
|
|
37
|
|
|
38 =head1 DESCRIPTION
|
|
|
39
|
|
|
40 A VEP plugin that retrieves ExAC allele frequencies.
|
|
|
41
|
|
|
42 Visit ftp://ftp.broadinstitute.org/pub/ExAC_release/current to download the latest ExAC VCF.
|
|
|
43
|
|
|
44 Note that the currently available version of the ExAC data file (0.3) is only available
|
|
|
45 on the GRCh37 assembly; therefore it can only be used with this plugin when using the
|
|
|
46 VEP on GRCh37. See http://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#assembly
|
|
|
47
|
|
|
48 The tabix utility must be installed in your path to use this plugin.
|
|
|
49
|
|
|
50 The plugin takes up to 3 command line arguments; the second and third are optional. If AC is specified as the second
|
|
|
51 argument, allele counts per population will be included in the output. If AN is specified as the third argument,
|
|
|
52 allele-specific chromosome counts will be included in the output.
|
|
|
53
|
|
|
54
|
|
|
55 =cut
|
|
|
56
|
|
|
57 package ExAC;
|
|
|
58
|
|
|
59 use strict;
|
|
|
60 use warnings;
|
|
|
61
|
|
|
62 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
|
63 use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles);
|
|
|
64
|
|
|
65 use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line get_slice);
|
|
|
66
|
|
|
67 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
|
|
|
68
|
|
|
69 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
70
|
|
|
# Constructor for the ExAC plugin.
#
# Plugin parameters, read from $self->params:
#   [0] path or URL of the tabix-indexed ExAC VCF file (required)
#   [1] 'AC' to include allele counts in the output (optional)
#   [2] 'AN' to include allele numbers in the output (optional)
#
# Sets $self->{file}, $self->{display_ac} and $self->{display_an}.
#
# Dies if tabix cannot be found with `which`, if no file was given, or if a
# local file or its .tbi index does not exist.
sub new {
  my $class = shift;

  my $self = $class->SUPER::new(@_);

  # test tabix
  die "ERROR: tabix does not seem to be in your path\n" unless `which tabix 2>&1` =~ /tabix$/;

  # get ExAC file; fail early with a clear message instead of tripping over
  # an undefined value in the checks below
  my $file = $self->params->[0];
  die "ERROR: No ExAC file specified; usage: --plugin ExAC,/path/to/ExAC.vcf.gz[,AC][,AN]\n"
    unless defined($file) && length($file);

  # get AC,AN options
  # (defined() rather than exists(): exists on array elements is discouraged
  # and an existing-but-undef element would warn in the string comparison)
  my $ac_option = $self->params->[1];
  my $an_option = $self->params->[2];

  $self->{display_ac} = (defined($ac_option) && $ac_option eq 'AC') ? 1 : 0;
  $self->{display_an} = (defined($an_option) && $an_option eq 'AN') ? 1 : 0;

  # remote files?
  if($file =~ /tp\:\/\//) {
    # the output of the probe query is only reported on STDERR; a failure
    # here is deliberately not fatal (the fatal check is commented out)
    my $remote_test = `tabix -f $file 1:1-1 2>&1`;
    print STDERR "$remote_test\n";
    # if($remote_test && $remote_test !~ /get_local_version/) {
    #   die "$remote_test\nERROR: Could not find file or index file for remote annotation file $file\n";
    # }
  }

  # check files exist
  else {
    die "ERROR: ExAC file $file not found; you can download it from ftp://ftp.broadinstitute.org/pub/ExAC_release/current\n" unless -e $file;
    die "ERROR: Tabix index file $file\.tbi not found - perhaps you need to create it first?\n" unless -e $file.'.tbi';
  }

  $self->{file} = $file;

  return $self;
}
|
|
|
116
|
|
|
# Returns a reference to a new array holding the feature type names
# 'Feature' and 'Intergenic'.
# NOTE(review): presumably VEP uses this list to decide for which kinds of
# consequence the plugin's run() is invoked - confirm against BaseVepPlugin.
sub feature_types {
  my @types = qw(Feature Intergenic);
  return \@types;
}
|
|
|
120
|
|
|
# Builds (once) and returns the hashref of output field name => description.
#
# The VCF header is read by running `tabix -f -h <file> 1:1-1`. Every header
# line carrying an ID of the form AC or AC_<POP> yields an ExAC_AF<pop>
# field, plus ExAC_AC<pop> / ExAC_AN<pop> fields when the display_ac /
# display_an flags are set. The matching 'AC<pop>' keys are appended to
# $self->{headers}, which run() iterates over.
#
# The result is stored in $self->{header_info}; later calls return that
# stored hashref without running tabix again.
#
# Dies if tabix cannot be started or if no matching header line is found.
sub get_header_info {
  my $self = shift;

  if(!exists($self->{header_info})) {
    my %headers = ();

    # list-form pipe open: no shell is involved, so the file name reaches
    # tabix verbatim, and a failure to start tabix is reported right here
    open(my $tabix_fh, '-|', 'tabix', '-f', '-h', $self->{file}, '1:1-1')
      or die "ERROR: Could not run tabix on ExAC file ".$self->{file}.": $!\n";

    while(my $line = <$tabix_fh>) {
      next unless $line =~ /ID\=AC(\_[A-Zdj]+)?\,.*\"(.+)\"/;

      # $pop is the optional population suffix (including the leading
      # underscore), $desc the quoted description text
      my ($pop, $desc) = ($1, $2);

      $desc =~ s/Counts?/frequency/i;
      $pop ||= '';

      my $field_name = 'ExAC_AF'.$pop;
      $headers{$field_name} = 'ExAC '.$desc;

      if ($self->{display_ac}){
        $field_name = 'ExAC_AC'.$pop;
        $headers{$field_name} = 'ExAC'.$pop.' Allele count';
      }
      if ($self->{display_an}){
        $field_name = 'ExAC_AN'.$pop;
        $headers{$field_name} = 'ExAC'.$pop.' Allele number';
      }

      # store this header on self
      push @{$self->{headers}}, 'AC'.$pop;
    }

    # the exit status of tabix is not treated as fatal; an unusable file is
    # caught by the "no valid headers" check below
    close $tabix_fh;

    die "ERROR: No valid headers found in ExAC VCF file\n" unless scalar keys %headers;

    $self->{header_info} = \%headers;
  }

  return $self->{header_info};
}
|
|
|
163
|
|
|
# Looks up ExAC frequency data for the allele of one variation allele object.
#
# Arguments:
#   $self - the plugin instance; reads {file}, {headers}, {display_ac},
#           {display_an} and reads/writes {cache}
#   $tva  - object providing variation_feature() and variation_feature_seq()
#
# Returns a hashref of ExAC_AF* (and optionally ExAC_AC* / ExAC_AN*) values
# for the allele, or an empty hashref if nothing was found.
#
# The region start-1 .. end+1 around the variant is queried with tabix and
# every returned VCF line is parsed. Results for all alleles of the region
# are kept in a one-entry cache keyed on the region string, so consecutive
# calls for the same region do not run tabix again.
sub run {
  my ($self, $tva) = @_;

  # make sure headers have been loaded
  $self->get_header_info();

  my $vf = $tva->variation_feature;

  # get allele, reverse comp if needed
  my $allele = $tva->variation_feature_seq;
  reverse_comp(\$allele) if $vf->{strand} < 0;

  # adjust coords to account for VCF-like storage of indels
  my ($s, $e) = ($vf->{start} - 1, $vf->{end} + 1);

  my $vf_chr = $vf->{chr};
  $vf_chr =~ s/chr//;
  my $pos_string = sprintf("%s:%i-%i", $vf_chr, $s, $e);

  # clear cache if it looks like the coords are the same
  # but allele type is different
  delete $self->{cache} if
    defined($self->{cache}->{$pos_string}) &&
    scalar keys %{$self->{cache}->{$pos_string}} &&
    !defined($self->{cache}->{$pos_string}->{$allele});

  my $data = {};

  # cached?
  if(defined($self->{cache}) && defined($self->{cache}->{$pos_string})) {
    $data = $self->{cache}->{$pos_string};
  }

  # read from file
  # (list-form pipe open with a lexical handle: no shell is involved, and a
  # named line variable is used so the caller's $_ is left untouched)
  elsif(open(my $tabix_fh, '-|', 'tabix', '-f', $self->{file}, $pos_string)) {

    LINE:
    while(my $line = <$tabix_fh>) {
      chomp $line;
      $line =~ s/\r$//g;

      # parse VCF line into a VariationFeature object
      my ($vcf_vf) = @{parse_line({format => 'vcf', minimal => 1}, $line)};

      # check parsed OK
      next LINE unless $vcf_vf && $vcf_vf->isa('Bio::EnsEMBL::Variation::VariationFeature');

      my @vcf_alleles = split /\//, $vcf_vf->allele_string;
      my $ref_allele = shift @vcf_alleles;
      my $vcf_vf_start = $vcf_vf->{start};
      my $vcf_vf_end = $vcf_vf->{end};

      my @vf_alleles = split /\//, $vf->allele_string;
      my $vf_ref_allele = shift @vf_alleles;

      if ($vcf_vf_start != $vf->{start} || $vcf_vf_end != $vf->{end}) {
        my $matched_alleles = get_matched_variant_alleles(
          {ref => $vf_ref_allele, alts => [$allele], pos => $vf->{start}},
          {ref => $ref_allele, alts => \@vcf_alleles, pos => $vcf_vf_start}
        );

        next LINE unless (@$matched_alleles);

        # We only match one alt allele from the input VF against alleles from the VCF line.
        # b_allele is the matched allele from the VCF alt alleles.
        # Note that $allele stays replaced for the rest of this call.
        $allele = $matched_alleles->[0]->{b_allele};
      }

      # iterate over required headers
      HEADER:
      foreach my $h(@{$self->{headers} || []}) {
        my $total_ac = 0;

        next HEADER unless $line =~ /$h\=([0-9\,]+)/;

        # grab AC; there must be one count per alt allele
        my @ac = split /\,/, $1;
        next HEADER unless scalar @ac == scalar @vcf_alleles;

        # derive the AN and AF keys from the AC key
        my $anh = $h;
        $anh =~ s/AC/AN/;

        my $afh = $h;
        $afh =~ s/AC/AF/;

        my $ach = $h;

        next HEADER unless $line =~ /$anh\=([0-9\,]+)/;

        # grab AN
        my @an = split /\,/, $1;
        next HEADER unless @an;
        my $an;

        foreach my $alt(@vcf_alleles) {
          my $ac = shift @ac;

          # if fewer AN values than alleles are given, the last one is reused
          $an = shift @an if @an;

          $total_ac += $ac;

          if ($self->{display_ac}){
            $data->{$alt}->{'ExAC_'.$ach} = $ac;
          }
          if ($self->{display_an}){
            $data->{$alt}->{'ExAC_'.$anh} = $an;
          }

          $data->{$alt}->{'ExAC_'.$afh} = sprintf("%.3g", $ac / $an) if $an;
        }

        # use total to get ref allele freq
        # NOTE(review): the ref allele count is stored as the summed alt
        # count, while the ref frequency below is 1 - total/AN; that looks
        # inconsistent (AN - total would match) - confirm the intent before
        # changing the output.
        if ($self->{display_ac}){
          $data->{$ref_allele}->{'ExAC_'.$ach} = $total_ac;
        }
        if ($self->{display_an}){
          $data->{$ref_allele}->{'ExAC_'.$anh} = $an;
        }
        $data->{$ref_allele}->{'ExAC_'.$afh} = sprintf("%.3g", 1 - ($total_ac / $an)) if $an;
      }
    }

    close $tabix_fh;
  }

  # tabix could not be started: report it, then carry on with empty data
  else {
    warn "WARNING: Could not run tabix on ExAC file ".$self->{file}." for region $pos_string: $!\n";
  }

  # overwrite cache
  $self->{cache} = {$pos_string => $data};

  return defined($data->{$allele}) ? $data->{$allele} : {};
}
|
|
|
291
|
|
|
292 1;
|
|
|
293
|