|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 PolyPhen_SIFT
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv PolyPhen_SIFT.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variations.vcf -cache --plugin PolyPhen_SIFT
|
|
|
32
|
|
|
33 =head1 DESCRIPTION
|
|
|
34
|
|
|
35 A VEP plugin that retrieves PolyPhen and SIFT predictions from a
|
|
|
36 locally constructed sqlite database. It can be used when your main
|
|
|
37 source of VEP transcript annotation (e.g. a GFF file or GFF-based cache)
|
|
|
38 does not contain these predictions.
|
|
|
39
|
|
|
40 You must either download or create a sqlite database of the predictions.
|
|
|
41 You may point to the file by adding db=[file] as a parameter:
|
|
|
42
|
|
|
43 --plugin PolyPhen_SIFT,db=[file]
|
|
|
44
|
|
|
45 Human predictions (assembly-independent) are available here:
|
|
|
46
|
|
|
47 https://dl.dropboxusercontent.com/u/12936195/homo_sapiens.PolyPhen_SIFT.db
|
|
|
48
|
|
|
49 (Please note the download location of this file may change)
|
|
|
50
|
|
|
51 Place this file in $HOME/.vep to have the plugin find it automatically.
|
|
|
52 You may change this directory by adding dir=[dir] as a parameter:
|
|
|
53
|
|
|
54 --plugin PolyPhen_SIFT,dir=[dir]
|
|
|
55
|
|
|
56 To create the database, you must have an active database connection
|
|
|
57 (i.e. not using --offline) and add create_db=1 as a parameter:
|
|
|
58
|
|
|
59 --plugin PolyPhen_SIFT,create_db=1
|
|
|
60
|
|
|
61 *** NB: this will take some time!!! ***
|
|
|
62
|
|
|
63 By default the file is created as:
|
|
|
64
|
|
|
65 ${HOME}/.vep/[species].PolyPhen_SIFT.db
|
|
|
66
|
|
|
67 =cut
|
|
|
68
|
|
|
69 package PolyPhen_SIFT;
|
|
|
70
|
|
|
71 use strict;
|
|
|
72 use warnings;
|
|
|
73 use DBI;
|
|
|
74 use Digest::MD5 qw(md5_hex);
|
|
|
75 use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix;
|
|
|
76
|
|
|
77 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
|
|
|
78
|
|
|
79 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
80
|
|
|
sub new {
  my $class = shift;

  my $self = $class->SUPER::new(@_);

  my $param_hash = $self->params_to_hash();

  my $species = $self->config->{species} || 'homo_sapiens';
  my $dir = $param_hash->{dir} || $self->{config}->{dir};

  # Resolve the SQLite file: explicit db=[file] wins, otherwise build the
  # default path under the configured directory. Fail early with a clear
  # message rather than concatenating an undef $dir into a bogus path.
  my $db = $param_hash->{db};
  unless($db) {
    die("ERROR: no db=[file] given and no directory configured - see documentation in plugin file\n") unless defined($dir);
    $db = $dir.'/'.$species.'.PolyPhen_SIFT.db';
  }

  # create DB?
  if($param_hash->{create_db}) {
    die("ERROR: DB file $db already exists - remove and re-run to overwrite\n") if -e $db;

    # RaiseError so a failed write aborts the build instead of silently
    # leaving behind an incomplete prediction database
    $self->{dbh} = DBI->connect("dbi:SQLite:dbname=$db", "", "", { RaiseError => 1 });
    $self->{dbh}->do("CREATE TABLE predictions(md5, analysis, matrix)");

    my $sth = $self->{dbh}->prepare("INSERT INTO predictions VALUES(?, ?, ?)");

    # stream all prediction matrices from the Ensembl variation database;
    # mysql_use_result avoids buffering the whole (large) result set client-side
    my $mysql = Bio::EnsEMBL::Registry->get_adaptor($species, 'variation', 'variation')->db->dbc->prepare(qq{
      SELECT m.translation_md5, a.value, p.prediction_matrix
      FROM translation_md5 m, attrib a, protein_function_predictions p
      WHERE m.translation_md5_id = p.translation_md5_id
      AND p.analysis_attrib_id = a.attrib_id
    }, {mysql_use_result => 1});

    my ($md5, $attrib, $matrix);
    $mysql->execute();
    $mysql->bind_columns(\$md5, \$attrib, \$matrix);

    # copy all rows inside a single transaction - with SQLite's default
    # per-statement autocommit this bulk insert would be dramatically slower
    $self->{dbh}->begin_work();
    $sth->execute($md5, $attrib, $matrix) while $mysql->fetch();
    $self->{dbh}->commit();

    $sth->finish();
    $mysql->finish();

    # index after loading: cheaper than maintaining the index per-insert
    $self->{dbh}->do("CREATE INDEX md5_idx ON predictions(md5)");
  }

  die("ERROR: DB file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db;

  # remember the creating PID so forked workers know to reconnect (SQLite
  # handles must not be shared across forks); see run()
  $self->{initial_pid} = $$;
  $self->{db_file} = $db;

  $self->{dbh} ||= DBI->connect("dbi:SQLite:dbname=$db","","");
  $self->{get_sth} = $self->{dbh}->prepare("SELECT md5, analysis, matrix FROM predictions WHERE md5 = ?");

  return $self;
}
|
|
|
128
|
|
|
sub feature_types {
  # This plugin only operates on Transcript features.
  my @supported = ('Transcript');
  return \@supported;
}
|
|
|
132
|
|
|
sub get_header_info {
  # Build the six output-field descriptions from the tool/analysis names
  # rather than spelling each pair out literally. The label shown in the
  # description is the field prefix with its underscore turned into a space
  # (e.g. "PolyPhen_humdiv" -> "PolyPhen humdiv").
  my %header;

  for my $tool_string (qw(PolyPhen_humdiv PolyPhen_humvar SIFT)) {
    (my $label = $tool_string) =~ s/_/ /;
    $header{$tool_string.'_score'} = $label.' score from PolyPhen_SIFT plugin';
    $header{$tool_string.'_pred'}  = $label.' prediction from PolyPhen_SIFT plugin';
  }

  return \%header;
}
|
|
|
143
|
|
|
# Main per-allele entry point called by VEP for each TranscriptVariationAllele.
# Returns a hashref of output fields (SIFT/PolyPhen predictions and scores),
# or an empty hashref when the variant is not missense or no predictions exist.
sub run {
  my ($self, $tva) = @_;

  # only for missense variants
  return {} unless grep {$_->SO_term eq 'missense_variant'} @{$tva->get_all_OverlapConsequences};

  my $tr = $tva->transcript;
  # shared per-transcript cache used by the VEP variation-effect machinery;
  # autovivify it so we can store peptide/predictions for later alleles
  my $tr_vep_cache = $tr->{_variation_effect_feature_cache} ||= {};

  ## if predictions are not available for both tools in the cache, look in the SQLite database
  unless(exists($tr_vep_cache->{protein_function_predictions}) &&
    $tva->sift_prediction() && $tva->polyphen_prediction()
  ){

    # get peptide
    unless($tr_vep_cache->{peptide}) {
      my $translation = $tr->translate;
      # non-coding / untranslatable transcript: nothing to predict on
      return {} unless $translation;
      $tr_vep_cache->{peptide} = $translation->seq;
    }

    # get data, indexed on md5 of peptide sequence
    # (the SQLite table keys prediction matrices by translation MD5)
    my $md5 = md5_hex($tr_vep_cache->{peptide});

    # check the plugin's small in-memory cache first to avoid repeated
    # SQLite lookups for the same translation
    my $data = $self->fetch_from_cache($md5);

    unless($data) {

      # forked, reconnect to DB
      # (SQLite handles cannot be shared across forked VEP workers)
      if($$ != $self->{initial_pid}) {
        $self->{dbh} = DBI->connect("dbi:SQLite:dbname=".$self->{db_file},"","");
        $self->{get_sth} = $self->{dbh}->prepare("SELECT md5, analysis, matrix FROM predictions WHERE md5 = ?");

        # set this so only do once per fork
        $self->{initial_pid} = $$;
      }

      $self->{get_sth}->execute($md5);

      $data = {};

      # one row per analysis (e.g. "sift", "polyphen_humdiv", "polyphen_humvar");
      # rebuild a prediction matrix object from each serialized blob
      while(my $arrayref = $self->{get_sth}->fetchrow_arrayref) {
        my $analysis = $arrayref->[1];
        # analysis values look like "tool_subanalysis"; split gives e.g.
        # ("polyphen", "humdiv") - sub_analysis is undef when no underscore
        my ($super_analysis, $sub_analysis) = split('_', $arrayref->[1]);

        $data->{$analysis} = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
          -translation_md5 => $arrayref->[0],
          -analysis => $super_analysis,
          -sub_analysis => $sub_analysis,
          -matrix => $arrayref->[2]
        );
      }

      # remember the result (even an empty hash, so misses are not re-queried)
      $self->add_to_cache($md5, $data);
    }

    # expose the matrices where the TranscriptVariationAllele prediction
    # methods below expect to find them
    $tr_vep_cache->{protein_function_predictions} = $data;
  }

  my $return = {};

  # query each tool/analysis via the corresponding $tva accessor
  # (sift_prediction/polyphen_prediction and *_score)
  foreach my $tool_string(qw(SIFT PolyPhen_humdiv PolyPhen_humvar)) {
    my ($tool, $analysis) = split('_', $tool_string);
    my $lc_tool = lc($tool);

    my $pred_meth = $lc_tool.'_prediction';
    my $score_meth = $lc_tool.'_score';

    my $pred = $tva->$pred_meth($analysis);

    if($pred) {
      # normalise prediction text for output: spaces become underscores,
      # and "_-_" sequences collapse (e.g. "probably damaging" -> "probably_damaging")
      $pred =~ s/\s+/\_/g;
      $pred =~ s/\_\-\_/\_/g;
      $return->{$tool_string.'_pred'} = $pred;

      my $score = $tva->$score_meth($analysis);
      $return->{$tool_string.'_score'} = $score if defined($score);
    }
  }

  return $return;
}
|
|
|
226
|
|
|
# Look up a previously cached prediction set by translation MD5.
# Returns the cached data hashref, or undef on a cache miss.
sub fetch_from_cache {
  my ($self, $md5) = @_;

  my $cache = $self->{_cache} ||= [];

  # linear scan is fine: the cache is capped at 50 entries
  for my $entry (@$cache) {
    return $entry->{data} if $entry->{md5} eq $md5;
  }

  return undef;
}
|
|
|
236
|
|
|
# Append a (md5 => data) entry to the in-memory prediction cache,
# evicting the oldest entries so the cache never exceeds 50 items.
sub add_to_cache {
  my ($self, $md5, $data) = @_;

  my $cache = $self->{_cache} ||= [];
  push @$cache, { md5 => $md5, data => $data };

  # drop from the front (oldest first) once over capacity
  splice(@$cache, 0, scalar(@$cache) - 50) if scalar(@$cache) > 50;

  return;
}
|
|
|
247
|
|
|
248 1;
|
|
|
249
|