comparison dir_plugins/ExAC.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
comparison
equal deleted inserted replaced
2:17c98d091710 3:49397129aec0
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 ExAC
27
28 =head1 SYNOPSIS
29
30 mv ExAC.pm ~/.vep/Plugins
31 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz
32 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC
33 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,,AN
34 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC,AN
35
36
37
38 =head1 DESCRIPTION
39
40 A VEP plugin that retrieves ExAC allele frequencies.
41
42 Visit ftp://ftp.broadinstitute.org/pub/ExAC_release/current to download the latest ExAC VCF.
43
44 Note that the currently available version of the ExAC data file (0.3) is only available
45 on the GRCh37 assembly; therefore it can only be used with this plugin when using the
46 VEP on GRCh37. See http://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#assembly
47
48 The tabix utility must be installed in your path to use this plugin.
49
50 The plugin takes 3 command line arguments. Second and third arguments are not mandatory. If AC specified as second
51 argument Allele counts per population will be included in output. If AN specified as third argument Allele specific
52 chromosome counts will be included in output.
53
54
55 =cut
56
57 package ExAC;
58
59 use strict;
60 use warnings;
61
62 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
63 use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles);
64
65 use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line get_slice);
66
67 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
68
69 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
70
71 sub new {
72 my $class = shift;
73
74 my $self = $class->SUPER::new(@_);
75
76 # test tabix
77 die "ERROR: tabix does not seem to be in your path\n" unless `which tabix 2>&1` =~ /tabix$/;
78
79 # get ExAC file
80 my $file = $self->params->[0];
81
82 # get AC,AN options
83 if (exists($self->params->[1]) && $self->params->[1] eq 'AC'){
84 $self->{display_ac} = 1;
85 }
86 else {
87 $self->{display_ac} = 0;
88 }
89
90 if (exists($self->params->[2]) && $self->params->[2] eq 'AN'){
91 $self->{display_an} = 1;
92 }
93 else {
94 $self->{display_an} = 0;
95 }
96
97 # remote files?
98 if($file =~ /tp\:\/\//) {
99 my $remote_test = `tabix -f $file 1:1-1 2>&1`;
100 print STDERR "$remote_test\n";
101 # if($remote_test && $remote_test !~ /get_local_version/) {
102 # die "$remote_test\nERROR: Could not find file or index file for remote annotation file $file\n";
103 # }
104 }
105
106 # check files exist
107 else {
108 die "ERROR: ExAC file $file not found; you can download it from ftp://ftp.broadinstitute.org/pub/ExAC_release/current\n" unless -e $file;
109 die "ERROR: Tabix index file $file\.tbi not found - perhaps you need to create it first?\n" unless -e $file.'.tbi';
110 }
111
112 $self->{file} = $file;
113
114 return $self;
115 }
116
117 sub feature_types {
118 return ['Feature','Intergenic'];
119 }
120
121 sub get_header_info {
122 my $self = shift;
123
124 if(!exists($self->{header_info})) {
125 open IN, "tabix -f -h ".$self->{file}." 1:1-1 |";
126
127 my %headers = ();
128 my @lines = <IN>;
129
130 while(my $line = shift @lines) {
131 if($line =~ /ID\=AC(\_[A-Zdj]+)?\,.*\"(.+)\"/) {
132 my ($pop, $desc) = ($1, $2);
133
134 $desc =~ s/Counts?/frequency/i;
135 $pop ||= '';
136
137 my $field_name = 'ExAC_AF'.$pop;
138 $headers{$field_name} = 'ExAC '.$desc;
139
140 if ($self->{display_ac}){
141 $field_name = 'ExAC_AC'.$pop;
142 $headers{$field_name} = 'ExAC'.$pop.' Allele count';
143 }
144 if ($self->{display_an}){
145 $field_name = 'ExAC_AN'.$pop;
146 $headers{$field_name} = 'ExAC'.$pop.' Allele number';
147 }
148
149 # store this header on self
150 push @{$self->{headers}}, 'AC'.$pop;
151 }
152 }
153
154 close IN;
155
156 die "ERROR: No valid headers found in ExAC VCF file\n" unless scalar keys %headers;
157
158 $self->{header_info} = \%headers;
159 }
160
161 return $self->{header_info};
162 }
163
164 sub run {
165 my ($self, $tva) = @_;
166 # make sure headers have been loaded
167 $self->get_header_info();
168
169 my $vf = $tva->variation_feature;
170 my $name = $vf->variation_name;
171
172 # get allele, reverse comp if needed
173 my $allele;
174
175 $allele = $tva->variation_feature_seq;
176 reverse_comp(\$allele) if $vf->{strand} < 0;
177
178 # adjust coords to account for VCF-like storage of indels
179 my ($s, $e) = ($vf->{start} - 1, $vf->{end} + 1);
180
181 my $vf_chr = $vf->{chr};
182 $vf_chr =~ s/chr//;
183 my $pos_string = sprintf("%s:%i-%i", $vf_chr, $s, $e);
184
185
186 # clear cache if it looks like the coords are the same
187 # but allele type is different
188 delete $self->{cache} if
189 defined($self->{cache}->{$pos_string}) &&
190 scalar keys %{$self->{cache}->{$pos_string}} &&
191 !defined($self->{cache}->{$pos_string}->{$allele});
192
193 my $data = {};
194
195 # cached?
196 if(defined($self->{cache}) && defined($self->{cache}->{$pos_string})) {
197 $data = $self->{cache}->{$pos_string};
198 }
199
200 # read from file
201 else {
202 open TABIX, sprintf("tabix -f %s %s |", $self->{file}, $pos_string);
203
204 while(<TABIX>) {
205 chomp;
206 s/\r$//g;
207 # parse VCF line into a VariationFeature object
208 my ($vcf_vf) = @{parse_line({format => 'vcf', minimal => 1}, $_)};
209
210 # check parsed OK
211 next unless $vcf_vf && $vcf_vf->isa('Bio::EnsEMBL::Variation::VariationFeature');
212
213 my @vcf_alleles = split /\//, $vcf_vf->allele_string;
214 my $ref_allele = shift @vcf_alleles;
215 my $vcf_vf_start = $vcf_vf->{start};
216 my $vcf_vf_end = $vcf_vf->{end};
217
218 my @vf_alleles = split /\//, $vf->allele_string;
219 my $vf_ref_allele = shift @vf_alleles;
220
221 if ($vcf_vf_start != $vf->{start} || $vcf_vf_end != $vf->{end}) {
222 my $matched_alleles = get_matched_variant_alleles({ref => $vf_ref_allele, alts => [$allele], pos => $vf->{start}}, {ref => $ref_allele, alts => \@vcf_alleles, pos => $vcf_vf_start});
223
224 next unless (@$matched_alleles);
225 # We only match one alt allele from the input VF against alleles from the VCF line. b_allele is the matched allele from the VCF alt alleles
226 $allele = $matched_alleles->[0]->{b_allele};
227 }
228 # iterate over required headers
229 HEADER:
230 foreach my $h(@{$self->{headers} || []}) {
231 my $total_ac = 0;
232
233 if(/$h\=([0-9\,]+)/) {
234
235 # grab AC
236 my @ac = split /\,/, $1;
237 next unless scalar @ac == scalar @vcf_alleles;
238
239 # now sed header to get AN
240 my $anh = $h;
241 $anh =~ s/AC/AN/;
242
243 my $afh = $h;
244 $afh =~ s/AC/AF/;
245
246 # get AC from header
247 my $ach = $h;
248
249 if(/$anh\=([0-9\,]+)/) {
250
251 # grab AN
252 my @an = split /\,/, $1;
253 next unless @an;
254 my $an;
255
256 foreach my $a(@vcf_alleles) {
257 my $ac = shift @ac;
258 $an = shift @an if @an;
259
260 $total_ac += $ac;
261 if ($self->{display_ac}){
262 $data->{$a}->{'ExAC_'.$ach} = $ac;
263 }
264 if ($self->{display_an}){
265 $data->{$a}->{'ExAC_'.$anh} = $an;
266 }
267
268 $data->{$a}->{'ExAC_'.$afh} = sprintf("%.3g", $ac / $an) if $an;
269 }
270
271 # use total to get ref allele freq
272 if ($self->{display_ac}){
273 $data->{$ref_allele}->{'ExAC_'.$ach} = $total_ac;
274 }
275 if ($self->{display_an}){
276 $data->{$ref_allele}->{'ExAC_'.$anh} = $an;
277 }
278 $data->{$ref_allele}->{'ExAC_'.$afh} = sprintf("%.3g", 1 - ($total_ac / $an)) if $an;
279 }
280 }
281 }
282 }
283
284 close TABIX;
285 }
286
287 # overwrite cache
288 $self->{cache} = {$pos_string => $data};
289 return defined($data->{$allele}) ? $data->{$allele} : {};
290 }
291
292 1;
293