comparison dir_plugins/dbNSFP.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
comparison
equal deleted inserted replaced
2:17c98d091710 3:49397129aec0
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 dbNSFP
27
28 =head1 SYNOPSIS
29
30 mv dbNSFP.pm ~/.vep/Plugins
31 ./vep -i variations.vcf --plugin dbNSFP,/path/to/dbNSFP.gz,col1,col2
32
33 =head1 DESCRIPTION
34
35 A VEP plugin that retrieves data for missense variants from a tabix-indexed
36 dbNSFP file.
37
38 Please cite the dbNSFP publication alongside the VEP if you use this resource:
39 http://www.ncbi.nlm.nih.gov/pubmed/21520341
40
41 You must have the Bio::DB::HTS module or the tabix utility must be installed
42 in your path to use this plugin. The dbNSFP data file can be downloaded from
43 https://sites.google.com/site/jpopgen/dbNSFP.
44
45 Release 3.5a of dbNSFP uses GRCh38/hg38 coordinates and GRCh37/hg19
46 coordinates.
47 To use plugin with GRCh37/hg19 data:
48 > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip
49 > unzip dbNSFPv3.5a.zip
50 > head -n1 dbNSFP3.5a_variant.chr1 > h
51 > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | awk '$8 != "."' | sort -k8,8 -k9,9n - | cat h - | bgzip -c > dbNSFP_hg19.gz
52 > tabix -s 8 -b 9 -e 9 dbNSFP_hg19.gz
53
54 To use plugin with GRCh38/hg38 data:
55 > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip
56 > unzip dbNSFPv3.5a.zip
57 > head -n1 dbNSFP3.5a_variant.chr1 > h
58 > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | sort -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP.gz
59 > tabix -s 1 -b 2 -e 2 dbNSFP.gz
60
61 When running the plugin you must list at least one column to retrieve from the
62 dbNSFP file, specified as parameters to the plugin e.g.
63
64 --plugin dbNSFP,/path/to/dbNSFP.gz,LRT_score,GERP++_RS
65
66 You may include all columns with ALL; this fetches a large amount of data per
67 variant!:
68
69 --plugin dbNSFP,/path/to/dbNSFP.gz,ALL
70
71 Tabix also allows the data file to be hosted on a remote server. This plugin is
72 fully compatible with such a setup - simply use the URL of the remote file:
73
74 --plugin dbNSFP,http://my.files.com/dbNSFP.gz,col1,col2
75
76 The plugin replaces occurrences of ';' with ',' and '|' with '&'. However, some
77 data field columns, e.g. Interpro_domain, use the replacement characters. We
78 added a file with replacement logic for customising the required replacement
79 of ';' and '|' in dbNSFP data columns. In addition to the default replacements
80 (; to , and | to &) users can add customised replacements. Users can either modify
81 the file dbNSFP_replacement_logic in the VEP_plugins directory or provide their own
82 file as second argument when calling the plugin:
83
84 --plugin dbNSFP,/path/to/dbNSFP.gz,/path/to/dbNSFP_replacement_logic,LRT_score,GERP++_RS
85
86 Note that transcript sequences referred to in dbNSFP may be out of sync with
87 those in the latest release of Ensembl; this may lead to discrepancies with
88 scores retrieved from other sources.
89
90 If the dbNSFP README file is found in the same directory as the data file,
91 column descriptions will be read from this and incorporated into the VEP output
92 file header.
93
94 =cut
95
96 package dbNSFP;
97
98 use strict;
99 use warnings;
100
101 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
102
103 use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
104
105 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
106
107 my %INCLUDE_SO = map {$_ => 1} qw(missense_variant stop_lost stop_gained start_lost);
108
109 sub new {
110 my $class = shift;
111
112 my $self = $class->SUPER::new(@_);
113
114 $self->expand_left(0);
115 $self->expand_right(0);
116
117 # get dbNSFP file
118 my $file = $self->params->[0];
119 $self->add_file($file);
120
121 # get headers
122 open HEAD, "tabix -fh $file 1:1-1 2>&1 | ";
123 while(<HEAD>) {
124 next unless /^\#/;
125 chomp;
126 $self->{headers} = [split];
127 }
128 close HEAD;
129
130 die "ERROR: Could not read headers from $file\n" unless defined($self->{headers}) && scalar @{$self->{headers}};
131
132 # check alt and Ensembl_transcriptid headers
133 foreach my $h(qw(alt Ensembl_transcriptid)) {
134 die "ERROR: Could not find required column $h in $file\n" unless grep {$_ eq $h} @{$self->{headers}};
135 }
136
137 my $i = 1;
138 # check if 2nd argument is a file that specifies replacement logic
139 # read replacement logic
140 my $replacement_file = $self->params->[$i];
141 if (defined $replacement_file && -e $replacement_file) {
142 $self->add_replacement_logic($replacement_file);
143 $i++;
144 } else {
145 $self->add_replacement_logic();
146 }
147
148 # get required columns
149 while(defined($self->params->[$i])) {
150 my $col = $self->params->[$i];
151 if($col eq 'ALL') {
152 $self->{cols} = {map {$_ => 1} @{$self->{headers}}};
153 last;
154 }
155 die "ERROR: Column $col not found in header for file $file. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless grep {$_ eq $col} @{$self->{headers}};
156
157 $self->{cols}->{$self->params->[$i]} = 1;
158 $i++;
159 }
160
161 die "ERROR: No columns selected to fetch. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless defined($self->{cols}) && scalar keys %{$self->{cols}};
162
163 return $self;
164 }
165
166 sub feature_types {
167 return ['Transcript'];
168 }
169
170 sub get_header_info {
171 my $self = shift;
172
173 if(!exists($self->{_header_info})) {
174
175 # look for readme
176 my $file_dir = $self->files->[0];
177
178 my %rm_descs;
179
180 # won't work for remote
181 if($file_dir !~ /tp\:\/\//) {
182
183 # get just dir
184 $file_dir =~ s/\/[^\/]+$/\//;
185
186 if(opendir DIR, $file_dir) {
187 my ($readme_file) = grep {/dbnsfp.*readme\.txt/i} readdir DIR;
188 closedir DIR;
189
190 if(open RM, $file_dir.$readme_file) {
191 my ($col, $reading);
192
193 # parse dbNSFP readme
194 # relevant lines look like:
195 #
196 # 1 column1_name: description blah blah
197 # blah blah blah
198 # 2 column2_name: description blah blah
199 # blah blah blah
200
201 while(<RM>) {
202 chomp;
203 s/\r$//g;
204
205 if(/^\d+\s/) {
206 $reading = 1;
207
208 m/^\d+\s+(.+?)\:\s+(.+)/;
209 $col = $1;
210
211 $rm_descs{$col} = '(from dbNSFP) '.$2 if $col && $2;
212 }
213 elsif($reading && /\w/) {
214 s/^\s+//;
215 $rm_descs{$col} .= ' '.$_;
216 }
217 else {
218 $reading = 0;
219 }
220 }
221
222 close RM;
223
224 # remove multiple spaces
225 $rm_descs{$_} =~ s/\s+/ /g for keys %rm_descs;
226 }
227 }
228 }
229
230 $self->{_header_info} = {map {$_ => $rm_descs{$_} || ($_.' from dbNSFP file')} keys %{$self->{cols}}};
231 }
232
233 return $self->{_header_info};
234 }
235
236 sub run {
237 my ($self, $tva) = @_;
238
239 # only for missense variants
240 return {} unless grep {$INCLUDE_SO{$_->SO_term}} @{$tva->get_all_OverlapConsequences};
241
242 my $vf = $tva->variation_feature;
243
244 return {} unless $vf->{start} eq $vf->{end};
245
246 # get allele, reverse comp if needed
247 my $allele = $tva->variation_feature_seq;
248 reverse_comp(\$allele) if $vf->{strand} < 0;
249
250 return {} unless $allele =~ /^[ACGT]$/;
251
252 # get transcript stable ID
253 my $tr_id = $tva->transcript->stable_id;
254
255 my $data;
256 my $pos;
257
258 my $assembly = $self->{config}->{assembly};
259 my $chr = ($vf->{chr} =~ /MT/i) ? 'M' : $vf->{chr};
260 foreach my $tmp_data(@{$self->get_data($chr, $vf->{start} - 1, $vf->{end})}) {
261 # compare allele and transcript
262 if ($assembly eq 'GRCh37') {
263 if (exists $tmp_data->{'pos(1-coor)'}) {
264 # for dbNSFP version 2.9.1
265 $pos = $tmp_data->{'pos(1-coor)'}
266 } elsif (exists $tmp_data->{'hg19_pos(1-based)'}) {
267 # for dbNSFP version 3.5c indexed for hg19/(=GRCh37)
268 $pos = $tmp_data->{'hg19_pos(1-based)'}
269 } else {
270 die "dbNSFP file does not contain required columns (pos(1-coor) for version 2.9.1 or hg19_pos(1-based) for version 3.5c) to use with GRCh37";
271 }
272 } else {
273 if (exists $tmp_data->{'pos(1-based)'}) {
274 $pos = $tmp_data->{'pos(1-based)'}
275 } else {
276 die "dbNSFP file does not contain required column pos(1-based) to use with GRCh38";
277 }
278 }
279
280 next unless
281 $pos == $vf->{start} &&
282 defined($tmp_data->{alt}) &&
283 $tmp_data->{alt} eq $allele;
284
285 # make a clean copy as we're going to edit it
286 %$data = %$tmp_data;
287
288 # convert data with multiple transcript values
289 # if($data->{Ensembl_transcriptid} =~ m/\;/) {
290
291 # # find the "index" of this transcript
292 # my @tr_ids = split(';', $data->{Ensembl_transcriptid});
293 # my $tr_index;
294
295 # for my $i(0..$#tr_ids) {
296 # $tr_index = $i;
297 # last if $tr_ids[$tr_index] =~ /^$tr_id(\.\d+)?$/;
298 # }
299
300 # next unless defined($tr_index);
301
302 # # now alter other fields
303 # foreach my $key(keys %$data) {
304 # if($data->{$key} =~ m/\;/) {
305 # my @split = split(';', $data->{$key});
306 # die("ERROR: Transcript index out of range") if $tr_index > $#split;
307 # $data->{$key} = $split[$tr_index];
308 # }
309 # }
310 # }
311 last;
312 }
313
314 return {} unless scalar keys %$data;
315
316 # get required data
317 my @from = @{$self->{replacement}->{default}->{from}};
318 my @to = @{$self->{replacement}->{default}->{to}};
319
320 my %return;
321 foreach my $colname (keys %$data) {
322 next if(!defined($self->{cols}->{$colname}));
323 next if($data->{$colname} eq '.');
324
325 my @from = @{$self->{replacement}->{default}->{from}};
326 my @to = @{$self->{replacement}->{default}->{to}};
327 @from = @{$self->{replacement}->{$colname}->{from}} if (defined $self->{replacement}->{$colname});
328 @to = @{$self->{replacement}->{$colname}->{to}} if (defined $self->{replacement}->{$colname});
329 for my $i (0 .. $#from) {
330 $data->{$colname} =~ s/\Q$from[$i]\E/$to[$i]/g;
331 }
332 $return{$colname} = $data->{$colname};
333 }
334
335 return \%return;
336 }
337
338 sub parse_data {
339 my ($self, $line) = @_;
340
341 $line =~ s/\r$//g;
342
343 my @split = split /\t/, $line;
344
345 # parse data into hash of col names and values
346 my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1));
347
348 return \%data;
349 }
350
351 sub get_start {
352 return $_[1]->{'pos(1-based)'};
353 }
354
355 sub get_end {
356 return $_[1]->{'pos(1-based)'};
357 }
358
359 sub add_replacement_logic {
360 my $self = shift;
361 my $file = shift;
362 $file ||= 'dbNSFP_replacement_logic';
363 if (! -e $file) {
364 $self->{replacement}->{default}->{from} = [';', '|'];
365 $self->{replacement}->{default}->{to} = [',', '&'];
366 } else {
367 open FILE, $file;
368 while(<FILE>) {
369 chomp;
370 next if /^colname/;
371 my ($colname, $from, $to) = split/\s+/;
372 die ("ERROR: 3 values separated by whitespace are required: colname from to.") if(!($colname && $from && $to));
373 push @{$self->{replacement}->{$colname}->{from}}, $from;
374 push @{$self->{replacement}->{$colname}->{to}}, $to;
375 }
376 close FILE;
377 die("ERROR: No default replacement logic has been specified.\n") if (!defined $self->{replacement}->{default});
378 }
379 }
380
381 1;
382
383