comparison dir_plugins/LD.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
comparison
equal deleted inserted replaced
2:17c98d091710 3:49397129aec0
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 LD
27
28 =head1 SYNOPSIS
29
30 mv LD.pm ~/.vep/Plugins
31 ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8
32
33 =head1 DESCRIPTION
34
35 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
36 finds variants in linkage disequilibrium with any overlapping existing
37 variants from the Ensembl variation databases. You can configure the
38 population used to calculate the r2 value, and the r2 cutoff used by
39 passing arguments to the plugin via the VEP command line (separated
40 by commas). This plugin adds a single new entry to the Extra column
41 with a comma-separated list of linked variant IDs and the associated
42 r2 values, e.g.:
43
44 LinkedVariants=rs123:0.879,rs234:0.943
45
46 If no arguments are supplied, the default population used is the CEU
47 sample from the 1000 Genomes Project phase 3, and the default r2
48 cutoff used is 0.8.
49
50 WARNING: Calculating LD is a relatively slow procedure, so this will
51 slow VEP down considerably when running on large numbers of
52 variants. Consider running vep followed by filter_vep to get a smaller
53 input set:
54
55 ./vep -i input.vcf -cache -vcf -o input_vep.vcf
56 ./filter_vep -i input_vep.vcf -filter "Consequence is missense_variant" > input_vep_filtered.vcf
57 ./vep -i input_vep_filtered.vcf -cache -plugin LD
58
59 =cut
60
61 =head1 INSTALLATION
62
63 LD calculation requires additional installation steps.
64
65 The JSON perl library is required; see VEP's installation instructions
66 for guidance: http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#additional
67
68 A binary from the ensembl-variation git repository must be compiled and either
69 added to your PATH or specified on the command line. In the ensembl-vep
70 directory:
71
72 export HTSLIB_DIR=${PWD}/htslib
73 git clone https://github.com/Ensembl/ensembl-variation
74 cd ensembl-variation/C_code
75 make
76
77 You may EITHER add this path to your PATH environment variable (add this line
78 to your $HOME/.bashrc to make the change permanent):
79
80 export PATH=${PATH}:${PWD}
81
82 OR you may specify the full path to the ld_vcf binary on the vep command line:
83
84 ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8,$PWD/ensembl-variation/C_code/ld_vcf
85
86 =cut
87
88 =head1 DATA
89
90 By default genotype data to calculate LD is retrieved from tabix-indexed
91 VCF files hosted on Ensembl's FTP servers. It is possible to download this
92 data to your local machine and have the LD plugin read genotype data from
93 there instead, giving faster performance and reducing network traffic.
94
95 These commands show how to get the data files for GRCh38.
96
97 mkdir variation_genotype
98 cd variation_genotype
99 lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.GRCh38_dbSNP.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/
100 cd ..
101
102 For GRCh37 replace the lftp command with:
103
104 lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/
105
106 We must now modify the JSON configuration file used to find the data. Starting
107 in the ensembl-vep directory:
108
109 perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json
110
111 Or for GRCh37:
112
113 perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json
114
115 =cut
116
117 package LD;
118
119 use strict;
120 use warnings;
121
122 use Bio::EnsEMBL::Registry;
123
124 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
125
126 sub feature_types {
127 return ['Feature','Intergenic'];
128 }
129
130 sub get_header_info {
131
132 my $self = shift;
133
134 return {
135 LinkedVariants => "Variants in LD (r2 >= ".$self->{r2_cutoff}.
136 ") with overlapping existing variants from the ".
137 $self->{pop}->name." population",
138 };
139 }
140
141 sub new {
142 my $class = shift;
143
144 my $self = $class->SUPER::new(@_);
145
146 if ($self->config->{offline}) {
147 warn "Warning: a connection to the database is required to calculate LD\n";
148 }
149
150 my $reg = 'Bio::EnsEMBL::Registry';
151
152 # turn on the check for existing variants
153
154 $self->config->{check_existing} = 1;
155
156 # fetch our population
157
158 my ($pop_name, $r2_cutoff, $ld_binary) = @{ $self->params };
159
160 # set some defaults
161
162 $pop_name ||= '1000GENOMES:phase_3:CEU';
163
164 $r2_cutoff = 0.8 unless defined $r2_cutoff;
165
166 my $pop_adap = $reg->get_adaptor('human', 'variation', 'population')
167 || die "Failed to get population adaptor\n";
168
169 my $valid_pops = $pop_adap->fetch_all_LD_Populations();
170 my ($pop) = grep {$_->name eq $pop_name} @$valid_pops;
171 die "Invalid population '$pop_name'; valid populations are:\n".join(", ", map {$_->name} @$valid_pops)."\n" unless $pop;
172
173 $self->{pop} = $pop;
174 $self->{r2_cutoff} = $r2_cutoff;
175
176 # prefetch the necessary adaptors
177
178 my $ld_adap = $reg->get_adaptor('human', 'variation', 'ldfeaturecontainer')
179 || die "Failed to get LD adaptor\n";
180 $ld_adap->db->use_vcf(1);
181 my $var_adap = $reg->get_adaptor('human', 'variation', 'variation')
182 || die "Failed to get variation adaptor\n";
183
184 my $var_feat_adap = $reg->get_adaptor('human', 'variation', 'variationfeature')
185 || die "Failed to get variation feature adaptor\n";
186
187 if($ld_binary) {
188 die("Specified LD binary \"$ld_binary\" does not exist\n") unless -e $ld_binary;
189 $Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor::VCF_BINARY_FILE = $ld_binary;
190 }
191
192 $self->{ld_adap} = $ld_adap;
193 $self->{var_adap} = $var_adap;
194 $self->{var_feat_adap} = $var_feat_adap;
195
196 return $self;
197 }
198
199 sub run {
200 my ($self, $vfoa, $line_hash) = @_;
201
202 # fetch the existing variants from the line hash
203 return {} unless $line_hash->{Existing_variation};
204
205 my @vars = ref($line_hash->{Existing_variation}) eq 'ARRAY' ? @{$line_hash->{Existing_variation}} : split(',', $line_hash->{Existing_variation});
206
207 my @linked;
208
209
210 for my $var (@vars) {
211
212 # check cache
213 my $res;
214
215 if($self->{cache}) {
216 ($res) = grep {$_->{var} eq $var} @{$self->{cache}};
217 }
218
219 unless($res) {
220 my @this_linked;
221
222 # fetch a variation for each overlapping variant ID
223 if (my $v = $self->{var_adap}->fetch_by_name($var)) {
224
225 # and fetch the associated variation features
226
227 for my $vf (@{ $self->{var_feat_adap}->fetch_all_by_Variation($v) }) {
228
229 # we're only interested in variation features that overlap our variant
230
231 if ($vf->slice->name eq $vfoa->variation_feature->slice->name) {
232
233 # fetch an LD feature container for this variation feature and our preconfigured population
234 if (my $ldfc = $self->{ld_adap}->fetch_by_VariationFeature($vf, $self->{pop})) {
235
236 # loop over all the linked variants
237 # we pass 1 to get_all_ld_values() so that it doesn't lazy load
238 # VariationFeature objects - we only need the name here anyway
239 for my $result (@{ $ldfc->get_all_ld_values(1) }) {
240
241 # apply our r2 cutoff
242
243 if ($result->{r2} >= $self->{r2_cutoff}) {
244
245 my $v1 = $result->{variation_name1};
246 my $v2 = $result->{variation_name2};
247
248 # I'm not sure which of these are the query variant, so just check the names
249
250 my $linked = $v1 eq $var ? $v2 : $v1;
251
252 push @this_linked, sprintf("%s:%.3f", $linked, $result->{r2});
253 }
254 }
255 }
256 }
257 }
258 }
259
260 # cache it
261 $res = {
262 var => $var,
263 linked => \@this_linked
264 };
265
266 push @{$self->{cache}}, $res;
267 shift @{$self->{cache}} while scalar @{$self->{cache}} > 50;
268 }
269
270 push @linked, @{$res->{linked}};
271 }
272
273 return scalar @linked ? {LinkedVariants => join(',', @linked)} : {};
274 }
275
276 1;
277