comparison dir_plugins/PolyPhen_SIFT.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
comparison
equal deleted inserted replaced
2:17c98d091710 3:49397129aec0
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 PolyPhen_SIFT
27
28 =head1 SYNOPSIS
29
30 mv PolyPhen_SIFT.pm ~/.vep/Plugins
31 ./vep -i variations.vcf -cache --plugin PolyPhen_SIFT
32
33 =head1 DESCRIPTION
34
35 A VEP plugin that retrieves PolyPhen and SIFT predictions from a
36 locally constructed sqlite database. It can be used when your main
37 source of VEP transcript annotation (e.g. a GFF file or GFF-based cache)
38 does not contain these predictions.
39
40 You must either download or create a sqlite database of the predictions.
41 You may point to the file by adding db=[file] as a parameter:
42
43 --plugin PolyPhen_SIFT,db=[file]
44
45 Human predictions (assembly-independent) are available here:
46
47 https://dl.dropboxusercontent.com/u/12936195/homo_sapiens.PolyPhen_SIFT.db
48
49 (Please note the download location of this file may change)
50
51 Place this file in $HOME/.vep to have the plugin find it automatically.
52 You may change this directory by adding dir=[dir] as a parameter:
53
54 --plugin PolyPhen_SIFT,dir=[dir]
55
56 To create the database, you must have an active database connection
57 (i.e. not using --offline) and add create_db=1 as a parameter:
58
59 --plugin PolyPhen_SIFT,create_db=1
60
61 *** NB: this will take some time!!! ***
62
63 By default the file is created as:
64
65 ${HOME}/.vep/[species].PolyPhen_SIFT.db
66
67 =cut
68
69 package PolyPhen_SIFT;
70
71 use strict;
72 use warnings;
73 use DBI;
74 use Digest::MD5 qw(md5_hex);
75 use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix;
76
77 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
78
79 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
80
81 sub new {
82 my $class = shift;
83
84 my $self = $class->SUPER::new(@_);
85
86 my $param_hash = $self->params_to_hash();
87
88 my $species = $self->config->{species} || 'homo_sapiens';
89 my $dir = $param_hash->{dir} || $self->{config}->{dir};
90 my $db = $param_hash->{db} || $dir.'/'.$species.'.PolyPhen_SIFT.db';
91
92 # create DB?
93 if($param_hash->{create_db}) {
94 die("ERROR: DB file $db already exists - remove and re-run to overwrite\n") if -e $db;
95
96 $self->{dbh} = DBI->connect("dbi:SQLite:dbname=$db","","");
97 $self->{dbh}->do("CREATE TABLE predictions(md5, analysis, matrix)");
98
99 my $sth = $self->{dbh}->prepare("INSERT INTO predictions VALUES(?, ?, ?)");
100
101 my $mysql = Bio::EnsEMBL::Registry->get_adaptor($species, 'variation', 'variation')->db->dbc->prepare(qq{
102 SELECT m.translation_md5, a.value, p.prediction_matrix
103 FROM translation_md5 m, attrib a, protein_function_predictions p
104 WHERE m.translation_md5_id = p.translation_md5_id
105 AND p.analysis_attrib_id = a.attrib_id
106 }, {mysql_use_result => 1});
107
108 my ($md5, $attrib, $matrix);
109 $mysql->execute();
110 $mysql->bind_columns(\$md5, \$attrib, \$matrix);
111 $sth->execute($md5, $attrib, $matrix) while $mysql->fetch();
112 $sth->finish();
113 $mysql->finish();
114
115 $self->{dbh}->do("CREATE INDEX md5_idx ON predictions(md5)");
116 }
117
118 die("ERROR: DB file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db;
119
120 $self->{initial_pid} = $$;
121 $self->{db_file} = $db;
122
123 $self->{dbh} ||= DBI->connect("dbi:SQLite:dbname=$db","","");
124 $self->{get_sth} = $self->{dbh}->prepare("SELECT md5, analysis, matrix FROM predictions WHERE md5 = ?");
125
126 return $self;
127 }
128
129 sub feature_types {
130 return ['Transcript'];
131 }
132
133 sub get_header_info {
134 return {
135 PolyPhen_humdiv_score => 'PolyPhen humdiv score from PolyPhen_SIFT plugin',
136 PolyPhen_humdiv_pred => 'PolyPhen humdiv prediction from PolyPhen_SIFT plugin',
137 PolyPhen_humvar_score => 'PolyPhen humvar score from PolyPhen_SIFT plugin',
138 PolyPhen_humvar_pred => 'PolyPhen humvar prediction from PolyPhen_SIFT plugin',
139 SIFT_score => 'SIFT score from PolyPhen_SIFT plugin',
140 SIFT_pred => 'SIFT prediction from PolyPhen_SIFT plugin',
141 };
142 }
143
144 sub run {
145 my ($self, $tva) = @_;
146
147 # only for missense variants
148 return {} unless grep {$_->SO_term eq 'missense_variant'} @{$tva->get_all_OverlapConsequences};
149
150 my $tr = $tva->transcript;
151 my $tr_vep_cache = $tr->{_variation_effect_feature_cache} ||= {};
152
153 ## if predictions are not available for both tools in the cache, look in the SQLite database
154 unless(exists($tr_vep_cache->{protein_function_predictions}) &&
155 $tva->sift_prediction() && $tva->polyphen_prediction()
156 ){
157
158 # get peptide
159 unless($tr_vep_cache->{peptide}) {
160 my $translation = $tr->translate;
161 return {} unless $translation;
162 $tr_vep_cache->{peptide} = $translation->seq;
163 }
164
165 # get data, indexed on md5 of peptide sequence
166 my $md5 = md5_hex($tr_vep_cache->{peptide});
167
168 my $data = $self->fetch_from_cache($md5);
169
170 unless($data) {
171
172 # forked, reconnect to DB
173 if($$ != $self->{initial_pid}) {
174 $self->{dbh} = DBI->connect("dbi:SQLite:dbname=".$self->{db_file},"","");
175 $self->{get_sth} = $self->{dbh}->prepare("SELECT md5, analysis, matrix FROM predictions WHERE md5 = ?");
176
177 # set this so only do once per fork
178 $self->{initial_pid} = $$;
179 }
180
181 $self->{get_sth}->execute($md5);
182
183 $data = {};
184
185 while(my $arrayref = $self->{get_sth}->fetchrow_arrayref) {
186 my $analysis = $arrayref->[1];
187 my ($super_analysis, $sub_analysis) = split('_', $arrayref->[1]);
188
189 $data->{$analysis} = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
190 -translation_md5 => $arrayref->[0],
191 -analysis => $super_analysis,
192 -sub_analysis => $sub_analysis,
193 -matrix => $arrayref->[2]
194 );
195 }
196
197 $self->add_to_cache($md5, $data);
198 }
199
200 $tr_vep_cache->{protein_function_predictions} = $data;
201 }
202
203 my $return = {};
204
205 foreach my $tool_string(qw(SIFT PolyPhen_humdiv PolyPhen_humvar)) {
206 my ($tool, $analysis) = split('_', $tool_string);
207 my $lc_tool = lc($tool);
208
209 my $pred_meth = $lc_tool.'_prediction';
210 my $score_meth = $lc_tool.'_score';
211
212 my $pred = $tva->$pred_meth($analysis);
213
214 if($pred) {
215 $pred =~ s/\s+/\_/g;
216 $pred =~ s/\_\-\_/\_/g;
217 $return->{$tool_string.'_pred'} = $pred;
218
219 my $score = $tva->$score_meth($analysis);
220 $return->{$tool_string.'_score'} = $score if defined($score);
221 }
222 }
223
224 return $return;
225 }
226
227 sub fetch_from_cache {
228 my $self = shift;
229 my $md5 = shift;
230
231 my $cache = $self->{_cache} ||= [];
232
233 my ($data) = map {$_->{data}} grep {$_->{md5} eq $md5} @$cache;
234 return $data;
235 }
236
237 sub add_to_cache {
238 my $self = shift;
239 my $md5 = shift;
240 my $data = shift;
241
242 my $cache = $self->{_cache} ||= [];
243 push @$cache, {md5 => $md5, data => $data};
244
245 shift @$cache while scalar @$cache > 50;
246 }
247
248 1;
249