|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 dbNSFP
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv dbNSFP.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variations.vcf --plugin dbNSFP,/path/to/dbNSFP.gz,col1,col2
|
|
|
32
|
|
|
33 =head1 DESCRIPTION
|
|
|
34
|
|
|
35 A VEP plugin that retrieves data for missense variants from a tabix-indexed
|
|
|
36 dbNSFP file.
|
|
|
37
|
|
|
38 Please cite the dbNSFP publication alongside the VEP if you use this resource:
|
|
|
39 http://www.ncbi.nlm.nih.gov/pubmed/21520341
|
|
|
40
|
|
|
41 You must have the Bio::DB::HTS module or the tabix utility must be installed
|
|
|
42 in your path to use this plugin. The dbNSFP data file can be downloaded from
|
|
|
43 https://sites.google.com/site/jpopgen/dbNSFP.
|
|
|
44
|
|
|
45 Release 3.5a of dbNSFP uses GRCh38/hg38 coordinates and GRCh37/hg19
|
|
|
46 coordinates.
|
|
|
47 To use plugin with GRCh37/hg19 data:
|
|
|
48 > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip
|
|
|
49 > unzip dbNSFPv3.5a.zip
|
|
|
50 > head -n1 dbNSFP3.5a_variant.chr1 > h
|
|
|
51 > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | awk '$8 != "."' | sort -k8,8 -k9,9n - | cat h - | bgzip -c > dbNSFP_hg19.gz
|
|
|
52 > tabix -s 8 -b 9 -e 9 dbNSFP_hg19.gz
|
|
|
53
|
|
|
54 To use plugin with GRCh38/hg38 data:
|
|
|
55 > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip
|
|
|
56 > unzip dbNSFPv3.5a.zip
|
|
|
57 > head -n1 dbNSFP3.5a_variant.chr1 > h
|
|
|
58 > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | sort -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP.gz
|
|
|
59 > tabix -s 1 -b 2 -e 2 dbNSFP.gz
|
|
|
60
|
|
|
61 When running the plugin you must list at least one column to retrieve from the
|
|
|
62 dbNSFP file, specified as parameters to the plugin e.g.
|
|
|
63
|
|
|
64 --plugin dbNSFP,/path/to/dbNSFP.gz,LRT_score,GERP++_RS
|
|
|
65
|
|
|
66 You may include all columns with ALL; this fetches a large amount of data per
|
|
|
67 variant!:
|
|
|
68
|
|
|
69 --plugin dbNSFP,/path/to/dbNSFP.gz,ALL
|
|
|
70
|
|
|
71 Tabix also allows the data file to be hosted on a remote server. This plugin is
|
|
|
72 fully compatible with such a setup - simply use the URL of the remote file:
|
|
|
73
|
|
|
74 --plugin dbNSFP,http://my.files.com/dbNSFP.gz,col1,col2
|
|
|
75
|
|
|
76 The plugin replaces occurrences of ';' with ',' and '|' with '&'. However, some
|
|
|
77 data field columns, e.g. Interpro_domain, use the replacement characters. We
|
|
|
78 added a file with replacement logic for customising the required replacement
|
|
|
79 of ';' and '|' in dbNSFP data columns. In addition to the default replacements
|
|
|
80 (; to , and | to &) users can add customised replacements. Users can either modify
|
|
|
81 the file dbNSFP_replacement_logic in the VEP_plugins directory or provide their own
|
|
|
82 file as second argument when calling the plugin:
|
|
|
83
|
|
|
84 --plugin dbNSFP,/path/to/dbNSFP.gz,/path/to/dbNSFP_replacement_logic,LRT_score,GERP++_RS
|
|
|
85
|
|
|
86 Note that transcript sequences referred to in dbNSFP may be out of sync with
|
|
|
87 those in the latest release of Ensembl; this may lead to discrepancies with
|
|
|
88 scores retrieved from other sources.
|
|
|
89
|
|
|
90 If the dbNSFP README file is found in the same directory as the data file,
|
|
|
91 column descriptions will be read from this and incorporated into the VEP output
|
|
|
92 file header.
|
|
|
93
|
|
|
94 =cut
|
|
|
95
|
|
|
96 package dbNSFP;
|
|
|
97
|
|
|
98 use strict;
|
|
|
99 use warnings;
|
|
|
100
|
|
|
101 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
|
102
|
|
|
103 use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
|
|
|
104
|
|
|
105 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
|
|
|
106
|
|
|
107 my %INCLUDE_SO = map {$_ => 1} qw(missense_variant stop_lost stop_gained start_lost);
|
|
|
108
|
|
|
109 sub new {
|
|
|
110 my $class = shift;
|
|
|
111
|
|
|
112 my $self = $class->SUPER::new(@_);
|
|
|
113
|
|
|
114 $self->expand_left(0);
|
|
|
115 $self->expand_right(0);
|
|
|
116
|
|
|
117 # get dbNSFP file
|
|
|
118 my $file = $self->params->[0];
|
|
|
119 $self->add_file($file);
|
|
|
120
|
|
|
121 # get headers
|
|
|
122 open HEAD, "tabix -fh $file 1:1-1 2>&1 | ";
|
|
|
123 while(<HEAD>) {
|
|
|
124 next unless /^\#/;
|
|
|
125 chomp;
|
|
|
126 $self->{headers} = [split];
|
|
|
127 }
|
|
|
128 close HEAD;
|
|
|
129
|
|
|
130 die "ERROR: Could not read headers from $file\n" unless defined($self->{headers}) && scalar @{$self->{headers}};
|
|
|
131
|
|
|
132 # check alt and Ensembl_transcriptid headers
|
|
|
133 foreach my $h(qw(alt Ensembl_transcriptid)) {
|
|
|
134 die "ERROR: Could not find required column $h in $file\n" unless grep {$_ eq $h} @{$self->{headers}};
|
|
|
135 }
|
|
|
136
|
|
|
137 my $i = 1;
|
|
|
138 # check if 2nd argument is a file that specifies replacement logic
|
|
|
139 # read replacement logic
|
|
|
140 my $replacement_file = $self->params->[$i];
|
|
|
141 if (defined $replacement_file && -e $replacement_file) {
|
|
|
142 $self->add_replacement_logic($replacement_file);
|
|
|
143 $i++;
|
|
|
144 } else {
|
|
|
145 $self->add_replacement_logic();
|
|
|
146 }
|
|
|
147
|
|
|
148 # get required columns
|
|
|
149 while(defined($self->params->[$i])) {
|
|
|
150 my $col = $self->params->[$i];
|
|
|
151 if($col eq 'ALL') {
|
|
|
152 $self->{cols} = {map {$_ => 1} @{$self->{headers}}};
|
|
|
153 last;
|
|
|
154 }
|
|
|
155 die "ERROR: Column $col not found in header for file $file. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless grep {$_ eq $col} @{$self->{headers}};
|
|
|
156
|
|
|
157 $self->{cols}->{$self->params->[$i]} = 1;
|
|
|
158 $i++;
|
|
|
159 }
|
|
|
160
|
|
|
161 die "ERROR: No columns selected to fetch. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless defined($self->{cols}) && scalar keys %{$self->{cols}};
|
|
|
162
|
|
|
163 return $self;
|
|
|
164 }
|
|
|
165
|
|
|
166 sub feature_types {
|
|
|
167 return ['Transcript'];
|
|
|
168 }
|
|
|
169
|
|
|
170 sub get_header_info {
|
|
|
171 my $self = shift;
|
|
|
172
|
|
|
173 if(!exists($self->{_header_info})) {
|
|
|
174
|
|
|
175 # look for readme
|
|
|
176 my $file_dir = $self->files->[0];
|
|
|
177
|
|
|
178 my %rm_descs;
|
|
|
179
|
|
|
180 # won't work for remote
|
|
|
181 if($file_dir !~ /tp\:\/\//) {
|
|
|
182
|
|
|
183 # get just dir
|
|
|
184 $file_dir =~ s/\/[^\/]+$/\//;
|
|
|
185
|
|
|
186 if(opendir DIR, $file_dir) {
|
|
|
187 my ($readme_file) = grep {/dbnsfp.*readme\.txt/i} readdir DIR;
|
|
|
188 closedir DIR;
|
|
|
189
|
|
|
190 if(open RM, $file_dir.$readme_file) {
|
|
|
191 my ($col, $reading);
|
|
|
192
|
|
|
193 # parse dbNSFP readme
|
|
|
194 # relevant lines look like:
|
|
|
195 #
|
|
|
196 # 1 column1_name: description blah blah
|
|
|
197 # blah blah blah
|
|
|
198 # 2 column2_name: description blah blah
|
|
|
199 # blah blah blah
|
|
|
200
|
|
|
201 while(<RM>) {
|
|
|
202 chomp;
|
|
|
203 s/\r$//g;
|
|
|
204
|
|
|
205 if(/^\d+\s/) {
|
|
|
206 $reading = 1;
|
|
|
207
|
|
|
208 m/^\d+\s+(.+?)\:\s+(.+)/;
|
|
|
209 $col = $1;
|
|
|
210
|
|
|
211 $rm_descs{$col} = '(from dbNSFP) '.$2 if $col && $2;
|
|
|
212 }
|
|
|
213 elsif($reading && /\w/) {
|
|
|
214 s/^\s+//;
|
|
|
215 $rm_descs{$col} .= ' '.$_;
|
|
|
216 }
|
|
|
217 else {
|
|
|
218 $reading = 0;
|
|
|
219 }
|
|
|
220 }
|
|
|
221
|
|
|
222 close RM;
|
|
|
223
|
|
|
224 # remove multiple spaces
|
|
|
225 $rm_descs{$_} =~ s/\s+/ /g for keys %rm_descs;
|
|
|
226 }
|
|
|
227 }
|
|
|
228 }
|
|
|
229
|
|
|
230 $self->{_header_info} = {map {$_ => $rm_descs{$_} || ($_.' from dbNSFP file')} keys %{$self->{cols}}};
|
|
|
231 }
|
|
|
232
|
|
|
233 return $self->{_header_info};
|
|
|
234 }
|
|
|
235
|
|
|
236 sub run {
|
|
|
237 my ($self, $tva) = @_;
|
|
|
238
|
|
|
239 # only for missense variants
|
|
|
240 return {} unless grep {$INCLUDE_SO{$_->SO_term}} @{$tva->get_all_OverlapConsequences};
|
|
|
241
|
|
|
242 my $vf = $tva->variation_feature;
|
|
|
243
|
|
|
244 return {} unless $vf->{start} eq $vf->{end};
|
|
|
245
|
|
|
246 # get allele, reverse comp if needed
|
|
|
247 my $allele = $tva->variation_feature_seq;
|
|
|
248 reverse_comp(\$allele) if $vf->{strand} < 0;
|
|
|
249
|
|
|
250 return {} unless $allele =~ /^[ACGT]$/;
|
|
|
251
|
|
|
252 # get transcript stable ID
|
|
|
253 my $tr_id = $tva->transcript->stable_id;
|
|
|
254
|
|
|
255 my $data;
|
|
|
256 my $pos;
|
|
|
257
|
|
|
258 my $assembly = $self->{config}->{assembly};
|
|
|
259 my $chr = ($vf->{chr} =~ /MT/i) ? 'M' : $vf->{chr};
|
|
|
260 foreach my $tmp_data(@{$self->get_data($chr, $vf->{start} - 1, $vf->{end})}) {
|
|
|
261 # compare allele and transcript
|
|
|
262 if ($assembly eq 'GRCh37') {
|
|
|
263 if (exists $tmp_data->{'pos(1-coor)'}) {
|
|
|
264 # for dbNSFP version 2.9.1
|
|
|
265 $pos = $tmp_data->{'pos(1-coor)'}
|
|
|
266 } elsif (exists $tmp_data->{'hg19_pos(1-based)'}) {
|
|
|
267 # for dbNSFP version 3.5c indexed for hg19/(=GRCh37)
|
|
|
268 $pos = $tmp_data->{'hg19_pos(1-based)'}
|
|
|
269 } else {
|
|
|
270 die "dbNSFP file does not contain required columns (pos(1-coor) for version 2.9.1 or hg19_pos(1-based) for version 3.5c) to use with GRCh37";
|
|
|
271 }
|
|
|
272 } else {
|
|
|
273 if (exists $tmp_data->{'pos(1-based)'}) {
|
|
|
274 $pos = $tmp_data->{'pos(1-based)'}
|
|
|
275 } else {
|
|
|
276 die "dbNSFP file does not contain required column pos(1-based) to use with GRCh38";
|
|
|
277 }
|
|
|
278 }
|
|
|
279
|
|
|
280 next unless
|
|
|
281 $pos == $vf->{start} &&
|
|
|
282 defined($tmp_data->{alt}) &&
|
|
|
283 $tmp_data->{alt} eq $allele;
|
|
|
284
|
|
|
285 # make a clean copy as we're going to edit it
|
|
|
286 %$data = %$tmp_data;
|
|
|
287
|
|
|
288 # convert data with multiple transcript values
|
|
|
289 # if($data->{Ensembl_transcriptid} =~ m/\;/) {
|
|
|
290
|
|
|
291 # # find the "index" of this transcript
|
|
|
292 # my @tr_ids = split(';', $data->{Ensembl_transcriptid});
|
|
|
293 # my $tr_index;
|
|
|
294
|
|
|
295 # for my $i(0..$#tr_ids) {
|
|
|
296 # $tr_index = $i;
|
|
|
297 # last if $tr_ids[$tr_index] =~ /^$tr_id(\.\d+)?$/;
|
|
|
298 # }
|
|
|
299
|
|
|
300 # next unless defined($tr_index);
|
|
|
301
|
|
|
302 # # now alter other fields
|
|
|
303 # foreach my $key(keys %$data) {
|
|
|
304 # if($data->{$key} =~ m/\;/) {
|
|
|
305 # my @split = split(';', $data->{$key});
|
|
|
306 # die("ERROR: Transcript index out of range") if $tr_index > $#split;
|
|
|
307 # $data->{$key} = $split[$tr_index];
|
|
|
308 # }
|
|
|
309 # }
|
|
|
310 # }
|
|
|
311 last;
|
|
|
312 }
|
|
|
313
|
|
|
314 return {} unless scalar keys %$data;
|
|
|
315
|
|
|
316 # get required data
|
|
|
317 my @from = @{$self->{replacement}->{default}->{from}};
|
|
|
318 my @to = @{$self->{replacement}->{default}->{to}};
|
|
|
319
|
|
|
320 my %return;
|
|
|
321 foreach my $colname (keys %$data) {
|
|
|
322 next if(!defined($self->{cols}->{$colname}));
|
|
|
323 next if($data->{$colname} eq '.');
|
|
|
324
|
|
|
325 my @from = @{$self->{replacement}->{default}->{from}};
|
|
|
326 my @to = @{$self->{replacement}->{default}->{to}};
|
|
|
327 @from = @{$self->{replacement}->{$colname}->{from}} if (defined $self->{replacement}->{$colname});
|
|
|
328 @to = @{$self->{replacement}->{$colname}->{to}} if (defined $self->{replacement}->{$colname});
|
|
|
329 for my $i (0 .. $#from) {
|
|
|
330 $data->{$colname} =~ s/\Q$from[$i]\E/$to[$i]/g;
|
|
|
331 }
|
|
|
332 $return{$colname} = $data->{$colname};
|
|
|
333 }
|
|
|
334
|
|
|
335 return \%return;
|
|
|
336 }
|
|
|
337
|
|
|
338 sub parse_data {
|
|
|
339 my ($self, $line) = @_;
|
|
|
340
|
|
|
341 $line =~ s/\r$//g;
|
|
|
342
|
|
|
343 my @split = split /\t/, $line;
|
|
|
344
|
|
|
345 # parse data into hash of col names and values
|
|
|
346 my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1));
|
|
|
347
|
|
|
348 return \%data;
|
|
|
349 }
|
|
|
350
|
|
|
351 sub get_start {
|
|
|
352 return $_[1]->{'pos(1-based)'};
|
|
|
353 }
|
|
|
354
|
|
|
355 sub get_end {
|
|
|
356 return $_[1]->{'pos(1-based)'};
|
|
|
357 }
|
|
|
358
|
|
|
359 sub add_replacement_logic {
|
|
|
360 my $self = shift;
|
|
|
361 my $file = shift;
|
|
|
362 $file ||= 'dbNSFP_replacement_logic';
|
|
|
363 if (! -e $file) {
|
|
|
364 $self->{replacement}->{default}->{from} = [';', '|'];
|
|
|
365 $self->{replacement}->{default}->{to} = [',', '&'];
|
|
|
366 } else {
|
|
|
367 open FILE, $file;
|
|
|
368 while(<FILE>) {
|
|
|
369 chomp;
|
|
|
370 next if /^colname/;
|
|
|
371 my ($colname, $from, $to) = split/\s+/;
|
|
|
372 die ("ERROR: 3 values separated by whitespace are required: colname from to.") if(!($colname && $from && $to));
|
|
|
373 push @{$self->{replacement}->{$colname}->{from}}, $from;
|
|
|
374 push @{$self->{replacement}->{$colname}->{to}}, $to;
|
|
|
375 }
|
|
|
376 close FILE;
|
|
|
377 die("ERROR: No default replacement logic has been specified.\n") if (!defined $self->{replacement}->{default});
|
|
|
378 }
|
|
|
379 }
|
|
|
380
|
|
|
381 1;
|
|
|
382
|
|
|
383
|