|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 LD
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv LD.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8
|
|
|
32
|
|
|
33 =head1 DESCRIPTION
|
|
|
34
|
|
|
35 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
|
|
|
36 finds variants in linkage disequilibrium with any overlapping existing
|
|
|
37 variants from the Ensembl variation databases. You can configure the
|
|
|
38 population used to calculate the r2 value, and the r2 cutoff used by
|
|
|
39 passing arguments to the plugin via the VEP command line (separated
|
|
|
40 by commas). This plugin adds a single new entry to the Extra column
|
|
|
41 with a comma-separated list of linked variant IDs and the associated
|
|
|
42 r2 values, e.g.:
|
|
|
43
|
|
|
44 LinkedVariants=rs123:0.879,rs234:0.943
|
|
|
45
|
|
|
46 If no arguments are supplied, the default population used is the CEU
|
|
|
47 sample from the 1000 Genomes Project phase 3, and the default r2
|
|
|
48 cutoff used is 0.8.
|
|
|
49
|
|
|
50 WARNING: Calculating LD is a relatively slow procedure, so this will
|
|
|
51 slow VEP down considerably when running on large numbers of
|
|
|
52 variants. Consider running vep followed by filter_vep to get a smaller
|
|
|
53 input set:
|
|
|
54
|
|
|
55 ./vep -i input.vcf -cache -vcf -o input_vep.vcf
|
|
|
56 ./filter_vep -i input_vep.vcf -filter "Consequence is missense_variant" > input_vep_filtered.vcf
|
|
|
57 ./vep -i input_vep_filtered.vcf -cache -plugin LD
|
|
|
58
|
|
|
59 =cut
|
|
|
60
|
|
|
61 =head1 INSTALLATION
|
|
|
62
|
|
|
63 LD calculation requires additional installation steps.
|
|
|
64
|
|
|
65 The JSON perl library is required; see VEP's installation instructions
|
|
|
66 for guidance: http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#additional
|
|
|
67
|
|
|
68 A binary from the ensembl-variation git repository must be compiled and either
|
|
|
69 added to your PATH or specified on the command line. In the ensembl-vep
|
|
|
70 directory:
|
|
|
71
|
|
|
72 export HTSLIB_DIR=${PWD}/htslib
|
|
|
73 git clone https://github.com/Ensembl/ensembl-variation
|
|
|
74 cd ensembl-variation/C_code
|
|
|
75 make
|
|
|
76
|
|
|
77 You may EITHER add this path to your PATH environment variable (add this line
|
|
|
78 to your $HOME/.bashrc to make the change permanent):
|
|
|
79
|
|
|
80 export PATH=${PATH}:${PWD}
|
|
|
81
|
|
|
82 OR you may specify the full path to the ld_vcf binary on the vep command line:
|
|
|
83
|
|
|
84 ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8,$PWD/ensembl-variation/C_code/ld_vcf
|
|
|
85
|
|
|
86 =cut
|
|
|
87
|
|
|
88 =head1 DATA
|
|
|
89
|
|
|
90 By default genotype data to calculate LD is retrieved from tabix-indexed
|
|
|
91 VCF files hosted on Ensembl's FTP servers. It is possible to download this
|
|
|
92 data to your local machine and have the LD plugin read genotype data from
|
|
|
93 there instead, giving faster performance and reducing network traffic.
|
|
|
94
|
|
|
95 These commands show how to get the data files for GRCh38.
|
|
|
96
|
|
|
97 mkdir variation_genotype
|
|
|
98 cd variation_genotype
|
|
|
99 lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.GRCh38_dbSNP.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/
|
|
|
100 cd ..
|
|
|
101
|
|
|
102 For GRCh37 replace the lftp command with:
|
|
|
103
|
|
|
104 lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/
|
|
|
105
|
|
|
106 We must now modify the JSON configuration file used to find the data. Starting
|
|
|
107 in the ensembl-vep directory:
|
|
|
108
|
|
|
109 perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json
|
|
|
110
|
|
|
111 Or for GRCh37:
|
|
|
112
|
|
|
113 perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json
|
|
|
114
|
|
|
115 =cut
|
|
|
116
|
|
|
117 package LD;
|
|
|
118
|
|
|
119 use strict;
|
|
|
120 use warnings;
|
|
|
121
|
|
|
122 use Bio::EnsEMBL::Registry;
|
|
|
123
|
|
|
124 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
125
|
|
|
126 sub feature_types {
|
|
|
127 return ['Feature','Intergenic'];
|
|
|
128 }
|
|
|
129
|
|
|
130 sub get_header_info {
|
|
|
131
|
|
|
132 my $self = shift;
|
|
|
133
|
|
|
134 return {
|
|
|
135 LinkedVariants => "Variants in LD (r2 >= ".$self->{r2_cutoff}.
|
|
|
136 ") with overlapping existing variants from the ".
|
|
|
137 $self->{pop}->name." population",
|
|
|
138 };
|
|
|
139 }
|
|
|
140
|
|
|
141 sub new {
|
|
|
142 my $class = shift;
|
|
|
143
|
|
|
144 my $self = $class->SUPER::new(@_);
|
|
|
145
|
|
|
146 if ($self->config->{offline}) {
|
|
|
147 warn "Warning: a connection to the database is required to calculate LD\n";
|
|
|
148 }
|
|
|
149
|
|
|
150 my $reg = 'Bio::EnsEMBL::Registry';
|
|
|
151
|
|
|
152 # turn on the check for existing variants
|
|
|
153
|
|
|
154 $self->config->{check_existing} = 1;
|
|
|
155
|
|
|
156 # fetch our population
|
|
|
157
|
|
|
158 my ($pop_name, $r2_cutoff, $ld_binary) = @{ $self->params };
|
|
|
159
|
|
|
160 # set some defaults
|
|
|
161
|
|
|
162 $pop_name ||= '1000GENOMES:phase_3:CEU';
|
|
|
163
|
|
|
164 $r2_cutoff = 0.8 unless defined $r2_cutoff;
|
|
|
165
|
|
|
166 my $pop_adap = $reg->get_adaptor('human', 'variation', 'population')
|
|
|
167 || die "Failed to get population adaptor\n";
|
|
|
168
|
|
|
169 my $valid_pops = $pop_adap->fetch_all_LD_Populations();
|
|
|
170 my ($pop) = grep {$_->name eq $pop_name} @$valid_pops;
|
|
|
171 die "Invalid population '$pop_name'; valid populations are:\n".join(", ", map {$_->name} @$valid_pops)."\n" unless $pop;
|
|
|
172
|
|
|
173 $self->{pop} = $pop;
|
|
|
174 $self->{r2_cutoff} = $r2_cutoff;
|
|
|
175
|
|
|
176 # prefetch the necessary adaptors
|
|
|
177
|
|
|
178 my $ld_adap = $reg->get_adaptor('human', 'variation', 'ldfeaturecontainer')
|
|
|
179 || die "Failed to get LD adaptor\n";
|
|
|
180 $ld_adap->db->use_vcf(1);
|
|
|
181 my $var_adap = $reg->get_adaptor('human', 'variation', 'variation')
|
|
|
182 || die "Failed to get variation adaptor\n";
|
|
|
183
|
|
|
184 my $var_feat_adap = $reg->get_adaptor('human', 'variation', 'variationfeature')
|
|
|
185 || die "Failed to get variation feature adaptor\n";
|
|
|
186
|
|
|
187 if($ld_binary) {
|
|
|
188 die("Specified LD binary \"$ld_binary\" does not exist\n") unless -e $ld_binary;
|
|
|
189 $Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor::VCF_BINARY_FILE = $ld_binary;
|
|
|
190 }
|
|
|
191
|
|
|
192 $self->{ld_adap} = $ld_adap;
|
|
|
193 $self->{var_adap} = $var_adap;
|
|
|
194 $self->{var_feat_adap} = $var_feat_adap;
|
|
|
195
|
|
|
196 return $self;
|
|
|
197 }
|
|
|
198
|
|
|
199 sub run {
|
|
|
200 my ($self, $vfoa, $line_hash) = @_;
|
|
|
201
|
|
|
202 # fetch the existing variants from the line hash
|
|
|
203 return {} unless $line_hash->{Existing_variation};
|
|
|
204
|
|
|
205 my @vars = ref($line_hash->{Existing_variation}) eq 'ARRAY' ? @{$line_hash->{Existing_variation}} : split(',', $line_hash->{Existing_variation});
|
|
|
206
|
|
|
207 my @linked;
|
|
|
208
|
|
|
209
|
|
|
210 for my $var (@vars) {
|
|
|
211
|
|
|
212 # check cache
|
|
|
213 my $res;
|
|
|
214
|
|
|
215 if($self->{cache}) {
|
|
|
216 ($res) = grep {$_->{var} eq $var} @{$self->{cache}};
|
|
|
217 }
|
|
|
218
|
|
|
219 unless($res) {
|
|
|
220 my @this_linked;
|
|
|
221
|
|
|
222 # fetch a variation for each overlapping variant ID
|
|
|
223 if (my $v = $self->{var_adap}->fetch_by_name($var)) {
|
|
|
224
|
|
|
225 # and fetch the associated variation features
|
|
|
226
|
|
|
227 for my $vf (@{ $self->{var_feat_adap}->fetch_all_by_Variation($v) }) {
|
|
|
228
|
|
|
229 # we're only interested in variation features that overlap our variant
|
|
|
230
|
|
|
231 if ($vf->slice->name eq $vfoa->variation_feature->slice->name) {
|
|
|
232
|
|
|
233 # fetch an LD feature container for this variation feature and our preconfigured population
|
|
|
234 if (my $ldfc = $self->{ld_adap}->fetch_by_VariationFeature($vf, $self->{pop})) {
|
|
|
235
|
|
|
236 # loop over all the linked variants
|
|
|
237 # we pass 1 to get_all_ld_values() so that it doesn't lazy load
|
|
|
238 # VariationFeature objects - we only need the name here anyway
|
|
|
239 for my $result (@{ $ldfc->get_all_ld_values(1) }) {
|
|
|
240
|
|
|
241 # apply our r2 cutoff
|
|
|
242
|
|
|
243 if ($result->{r2} >= $self->{r2_cutoff}) {
|
|
|
244
|
|
|
245 my $v1 = $result->{variation_name1};
|
|
|
246 my $v2 = $result->{variation_name2};
|
|
|
247
|
|
|
248 # I'm not sure which of these are the query variant, so just check the names
|
|
|
249
|
|
|
250 my $linked = $v1 eq $var ? $v2 : $v1;
|
|
|
251
|
|
|
252 push @this_linked, sprintf("%s:%.3f", $linked, $result->{r2});
|
|
|
253 }
|
|
|
254 }
|
|
|
255 }
|
|
|
256 }
|
|
|
257 }
|
|
|
258 }
|
|
|
259
|
|
|
260 # cache it
|
|
|
261 $res = {
|
|
|
262 var => $var,
|
|
|
263 linked => \@this_linked
|
|
|
264 };
|
|
|
265
|
|
|
266 push @{$self->{cache}}, $res;
|
|
|
267 shift @{$self->{cache}} while scalar @{$self->{cache}} > 50;
|
|
|
268 }
|
|
|
269
|
|
|
270 push @linked, @{$res->{linked}};
|
|
|
271 }
|
|
|
272
|
|
|
273 return scalar @linked ? {LinkedVariants => join(',', @linked)} : {};
|
|
|
274 }
|
|
|
275
|
|
|
276 1;
|
|
|
277
|