|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 dbscSNV
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv dbscSNV.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variations.vcf --plugin dbscSNV,/path/to/dbscSNV1.1_GRCh38.txt.gz
|
|
|
32
|
|
|
33 =head1 DESCRIPTION
|
|
|
34
|
|
|
35 A VEP plugin that retrieves data for splicing variants from a tabix-indexed
|
|
|
36 dbscSNV file.
|
|
|
37
|
|
|
38 Please cite the dbscSNV publication alongside the VEP if you use this resource:
|
|
|
39 http://nar.oxfordjournals.org/content/42/22/13534
|
|
|
40
|
|
|
41 The Bio::DB::HTS perl library or tabix utility must be installed in your path
|
|
|
42 to use this plugin. The dbscSNV data file can be downloaded from
|
|
|
43 https://sites.google.com/site/jpopgen/dbNSFP.
|
|
|
44
|
|
|
45 The file must be processed and indexed by tabix before use by this plugin.
|
|
|
46 dbscSNV1.1 has coordinates for both GRCh38 and GRCh37; the file must be
|
|
|
47 processed differently according to the assembly you use.
|
|
|
48
|
|
|
49 > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip
|
|
|
50 > unzip dbscSNV1.1.zip
|
|
|
51 > head -n1 dbscSNV1.1.chr1 > h
|
|
|
52
|
|
|
53 # GRCh38
|
|
|
54 > cat dbscSNV1.1.chr* | grep -v ^chr | sort -k5,5 -k6,6n | cat h - | bgzip -c > dbscSNV1.1_GRCh38.txt.gz
|
|
|
55 > tabix -s 5 -b 6 -e 6 -c c dbscSNV1.1_GRCh38.txt.gz
|
|
|
56
|
|
|
57 # GRCh37
|
|
|
58 > cat dbscSNV1.1.chr* | grep -v ^chr | cat h - | bgzip -c > dbscSNV1.1_GRCh37.txt.gz
|
|
|
59 > tabix -s 1 -b 2 -e 2 -c c dbscSNV1.1_GRCh37.txt.gz
|
|
|
60
|
|
|
61 Note that in the last command we tell tabix that the header line starts with "c";
|
|
|
62 this may change to the default of "#" in future versions of dbscSNV.
|
|
|
63
|
|
|
64 Tabix also allows the data file to be hosted on a remote server. This plugin is
|
|
|
65 fully compatible with such a setup - simply use the URL of the remote file:
|
|
|
66
|
|
|
67 --plugin dbscSNV,http://my.files.com/dbscSNV.txt.gz
|
|
|
68
|
|
|
69 Note that transcript sequences referred to in dbscSNV may be out of sync with
|
|
|
70 those in the latest release of Ensembl; this may lead to discrepancies with
|
|
|
71 scores retrieved from other sources.
|
|
|
72
|
|
|
73 =cut
|
|
|
74
|
|
|
75 package dbscSNV;
|
|
|
76
|
|
|
77 use strict;
|
|
|
78 use warnings;
|
|
|
79
|
|
|
80 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
|
81
|
|
|
82 use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
|
|
|
83
|
|
|
84 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
|
|
|
85
|
|
|
86 sub new {
|
|
|
87 my $class = shift;
|
|
|
88
|
|
|
89 my $self = $class->SUPER::new(@_);
|
|
|
90
|
|
|
91 $self->expand_left(0);
|
|
|
92 $self->expand_right(0);
|
|
|
93
|
|
|
94 # get dbNSFP file
|
|
|
95 my $file = $self->params->[0];
|
|
|
96 $self->add_file($file);
|
|
|
97
|
|
|
98 if(my $assembly = $self->params->[1]) {
|
|
|
99 $self->{_param_assembly} = $assembly;
|
|
|
100 }
|
|
|
101
|
|
|
102 # get headers
|
|
|
103 open HEAD, "tabix -fh $file 1:1-1 2>&1 | ";
|
|
|
104 while(<HEAD>) {
|
|
|
105 chomp;
|
|
|
106 $self->{headers} = [split];
|
|
|
107 }
|
|
|
108 close HEAD;
|
|
|
109
|
|
|
110 # check alt and Ensembl_transcriptid headers
|
|
|
111 foreach my $h(qw(alt Ensembl_gene)) {
|
|
|
112 die "ERROR: Could not find required column $h in $file\n" unless grep {$_ eq $h} @{$self->{headers}};
|
|
|
113 }
|
|
|
114
|
|
|
115 # check we have hg38_pos col, only present in dbscSNV >= 1.1
|
|
|
116 if($self->pos_column eq 'hg38_pos') {
|
|
|
117 die("ERROR: Could not find hg38_pos column in $file\n") unless grep {$_ eq 'hg38_pos'} @{$self->{headers}};
|
|
|
118 }
|
|
|
119
|
|
|
120 $self->{cols} = {
|
|
|
121 'ada_score' => 1,
|
|
|
122 'rf_score' => 1
|
|
|
123 };
|
|
|
124
|
|
|
125 return $self;
|
|
|
126 }
|
|
|
127
|
|
|
128 sub feature_types {
|
|
|
129 return ['Transcript'];
|
|
|
130 }
|
|
|
131
|
|
|
132 sub variation_feature_types {
|
|
|
133 return ['VariationFeature'];
|
|
|
134 }
|
|
|
135
|
|
|
136 sub get_header_info {
|
|
|
137 return {
|
|
|
138 ada_score => 'dbscSNV ADA score',
|
|
|
139 rf_score => 'dbscSNV RF score'
|
|
|
140 }
|
|
|
141 }
|
|
|
142
|
|
|
143 sub run {
|
|
|
144 my ($self, $tva) = @_;
|
|
|
145
|
|
|
146 my $vf = $tva->variation_feature;
|
|
|
147
|
|
|
148 return {} unless $vf->{start} eq $vf->{end};
|
|
|
149 # return {} unless grep {$_->SO_term =~ /splic/} @{$tva->get_all_OverlapConsequences};
|
|
|
150
|
|
|
151 # get allele, reverse comp if needed
|
|
|
152 my $allele = $tva->variation_feature_seq;
|
|
|
153 reverse_comp(\$allele) if $vf->{strand} < 0;
|
|
|
154
|
|
|
155 return {} unless $allele =~ /^[ACGT]$/;
|
|
|
156
|
|
|
157 # get gene stable ID
|
|
|
158 my $g_id = $tva->transcript->{_gene_stable_id} || $tva->transcript->gene->stable_id;
|
|
|
159
|
|
|
160 my $data;
|
|
|
161 my $pos_column = $self->pos_column;
|
|
|
162
|
|
|
163 foreach my $tmp_data(@{$self->get_data($vf->{chr}, $vf->{start} - 1, $vf->{end})}) {
|
|
|
164 # compare allele and transcript
|
|
|
165 next unless
|
|
|
166 $tmp_data->{$pos_column} == $vf->{start} &&
|
|
|
167 defined($tmp_data->{alt}) &&
|
|
|
168 $tmp_data->{alt} eq $allele; # &&
|
|
|
169 # defined($tmp_data->{Ensembl_gene}) &&
|
|
|
170 # $tmp_data->{Ensembl_gene} =~ /$g_id($|;)/;
|
|
|
171
|
|
|
172 $data = $tmp_data;
|
|
|
173 last;
|
|
|
174 }
|
|
|
175
|
|
|
176 return {} unless scalar keys %$data;
|
|
|
177
|
|
|
178 # get required data
|
|
|
179 my %return =
|
|
|
180 map {$_ => $data->{$_}}
|
|
|
181 grep {$data->{$_} ne '.'} # ignore missing data
|
|
|
182 grep {defined($self->{cols}->{$_})} # only include selected cols
|
|
|
183 keys %$data;
|
|
|
184
|
|
|
185 return \%return;
|
|
|
186 }
|
|
|
187
|
|
|
188 sub parse_data {
|
|
|
189 my ($self, $line) = @_;
|
|
|
190
|
|
|
191 $line =~ s/\r$//g;
|
|
|
192
|
|
|
193 my @split = split /\t/, $line;
|
|
|
194
|
|
|
195 # parse data into hash of col names and values
|
|
|
196 my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1));
|
|
|
197
|
|
|
198 return \%data;
|
|
|
199 }
|
|
|
200
|
|
|
201 sub get_start {
|
|
|
202 return $_[1]->{$_[0]->pos_column};
|
|
|
203 }
|
|
|
204
|
|
|
205 sub get_end {
|
|
|
206 return $_[1]->{$_[0]->pos_column};
|
|
|
207 }
|
|
|
208
|
|
|
209 sub pos_column {
|
|
|
210 my $self = shift;
|
|
|
211
|
|
|
212 # work out which column to use
|
|
|
213 unless(exists($self->{pos_column})) {
|
|
|
214 if(my $assembly = $self->{_param_assembly} || $self->{config}->{assembly}) {
|
|
|
215 if($assembly eq 'GRCh37') {
|
|
|
216 $self->{pos_column} = 'pos';
|
|
|
217 }
|
|
|
218 elsif($assembly eq 'GRCh38') {
|
|
|
219 $self->{pos_column} = 'hg38_pos';
|
|
|
220 }
|
|
|
221 else {
|
|
|
222 die("ERROR: Assembly \"$assembly\" is not compatible with this plugin\n");
|
|
|
223 }
|
|
|
224 }
|
|
|
225 else {
|
|
|
226 die("ERROR: Could not establish which position column to use based on assembly; try setting assembly manually with --assembly\n");
|
|
|
227 }
|
|
|
228 }
|
|
|
229
|
|
|
230 return $self->{pos_column};
|
|
|
231 }
|
|
|
232
|
|
|
233 1;
|
|
|
234
|