comparison dir_plugins/dbscSNV.pm @ 0:e545d0a25ffe draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:17:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e545d0a25ffe
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 dbscSNV
27
28 =head1 SYNOPSIS
29
30 mv dbscSNV.pm ~/.vep/Plugins
31 ./vep -i variations.vcf --plugin dbscSNV,/path/to/dbscSNV1.1_GRCh38.txt.gz
32
33 =head1 DESCRIPTION
34
35 A VEP plugin that retrieves data for splicing variants from a tabix-indexed
36 dbscSNV file.
37
38 Please cite the dbscSNV publication alongside the VEP if you use this resource:
39 http://nar.oxfordjournals.org/content/42/22/13534
40
41 The Bio::DB::HTS perl library or tabix utility must be installed in your path
42 to use this plugin. The dbscSNV data file can be downloaded from
43 https://sites.google.com/site/jpopgen/dbNSFP.
44
45 The file must be processed and indexed by tabix before use by this plugin.
46 dbscSNV1.1 has coordinates for both GRCh38 and GRCh37; the file must be
47 processed differently according to the assembly you use.
48
49 > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip
50 > unzip dbscSNV1.1.zip
51 > head -n1 dbscSNV1.1.chr1 > h
52
53 # GRCh38
54 > cat dbscSNV1.1.chr* | grep -v ^chr | sort -k5,5 -k6,6n | cat h - | bgzip -c > dbscSNV1.1_GRCh38.txt.gz
55 > tabix -s 5 -b 6 -e 6 -c c dbscSNV1.1_GRCh38.txt.gz
56
57 # GRCh37
58 > cat dbscSNV1.1.chr* | grep -v ^chr | cat h - | bgzip -c > dbscSNV1.1_GRCh37.txt.gz
59 > tabix -s 1 -b 2 -e 2 -c c dbscSNV1.1_GRCh37.txt.gz
60
61 Note that in the last command we tell tabix that the header line starts with "c";
62 this may change to the default of "#" in future versions of dbscSNV.
63
64 Tabix also allows the data file to be hosted on a remote server. This plugin is
65 fully compatible with such a setup - simply use the URL of the remote file:
66
67 --plugin dbscSNV,http://my.files.com/dbscSNV.txt.gz
68
69 Note that transcript sequences referred to in dbscSNV may be out of sync with
70 those in the latest release of Ensembl; this may lead to discrepancies with
71 scores retrieved from other sources.
72
73 =cut
74
75 package dbscSNV;
76
77 use strict;
78 use warnings;
79
80 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
81
82 use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
83
84 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
85
86 sub new {
87 my $class = shift;
88
89 my $self = $class->SUPER::new(@_);
90
91 $self->expand_left(0);
92 $self->expand_right(0);
93
94 # get dbNSFP file
95 my $file = $self->params->[0];
96 $self->add_file($file);
97
98 if(my $assembly = $self->params->[1]) {
99 $self->{_param_assembly} = $assembly;
100 }
101
102 # get headers
103 open HEAD, "tabix -fh $file 1:1-1 2>&1 | ";
104 while(<HEAD>) {
105 chomp;
106 $self->{headers} = [split];
107 }
108 close HEAD;
109
110 # check alt and Ensembl_transcriptid headers
111 foreach my $h(qw(alt Ensembl_gene)) {
112 die "ERROR: Could not find required column $h in $file\n" unless grep {$_ eq $h} @{$self->{headers}};
113 }
114
115 # check we have hg38_pos col, only present in dbscSNV >= 1.1
116 if($self->pos_column eq 'hg38_pos') {
117 die("ERROR: Could not find hg38_pos column in $file\n") unless grep {$_ eq 'hg38_pos'} @{$self->{headers}};
118 }
119
120 $self->{cols} = {
121 'ada_score' => 1,
122 'rf_score' => 1
123 };
124
125 return $self;
126 }
127
128 sub feature_types {
129 return ['Transcript'];
130 }
131
132 sub variation_feature_types {
133 return ['VariationFeature'];
134 }
135
136 sub get_header_info {
137 return {
138 ada_score => 'dbscSNV ADA score',
139 rf_score => 'dbscSNV RF score'
140 }
141 }
142
143 sub run {
144 my ($self, $tva) = @_;
145
146 my $vf = $tva->variation_feature;
147
148 return {} unless $vf->{start} eq $vf->{end};
149 # return {} unless grep {$_->SO_term =~ /splic/} @{$tva->get_all_OverlapConsequences};
150
151 # get allele, reverse comp if needed
152 my $allele = $tva->variation_feature_seq;
153 reverse_comp(\$allele) if $vf->{strand} < 0;
154
155 return {} unless $allele =~ /^[ACGT]$/;
156
157 # get gene stable ID
158 my $g_id = $tva->transcript->{_gene_stable_id} || $tva->transcript->gene->stable_id;
159
160 my $data;
161 my $pos_column = $self->pos_column;
162
163 foreach my $tmp_data(@{$self->get_data($vf->{chr}, $vf->{start} - 1, $vf->{end})}) {
164 # compare allele and transcript
165 next unless
166 $tmp_data->{$pos_column} == $vf->{start} &&
167 defined($tmp_data->{alt}) &&
168 $tmp_data->{alt} eq $allele; # &&
169 # defined($tmp_data->{Ensembl_gene}) &&
170 # $tmp_data->{Ensembl_gene} =~ /$g_id($|;)/;
171
172 $data = $tmp_data;
173 last;
174 }
175
176 return {} unless scalar keys %$data;
177
178 # get required data
179 my %return =
180 map {$_ => $data->{$_}}
181 grep {$data->{$_} ne '.'} # ignore missing data
182 grep {defined($self->{cols}->{$_})} # only include selected cols
183 keys %$data;
184
185 return \%return;
186 }
187
188 sub parse_data {
189 my ($self, $line) = @_;
190
191 $line =~ s/\r$//g;
192
193 my @split = split /\t/, $line;
194
195 # parse data into hash of col names and values
196 my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1));
197
198 return \%data;
199 }
200
201 sub get_start {
202 return $_[1]->{$_[0]->pos_column};
203 }
204
205 sub get_end {
206 return $_[1]->{$_[0]->pos_column};
207 }
208
209 sub pos_column {
210 my $self = shift;
211
212 # work out which column to use
213 unless(exists($self->{pos_column})) {
214 if(my $assembly = $self->{_param_assembly} || $self->{config}->{assembly}) {
215 if($assembly eq 'GRCh37') {
216 $self->{pos_column} = 'pos';
217 }
218 elsif($assembly eq 'GRCh38') {
219 $self->{pos_column} = 'hg38_pos';
220 }
221 else {
222 die("ERROR: Assembly \"$assembly\" is not compatible with this plugin\n");
223 }
224 }
225 else {
226 die("ERROR: Could not establish which position column to use based on assembly; try setting assembly manually with --assembly\n");
227 }
228 }
229
230 return $self->{pos_column};
231 }
232
233 1;
234