Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/PolyPhen_SIFT.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
comparison
equal
deleted
inserted
replaced
| 2:17c98d091710 | 3:49397129aec0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | |
| 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | |
| 5 | |
| 6 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 you may not use this file except in compliance with the License. | |
| 8 You may obtain a copy of the License at | |
| 9 | |
| 10 http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 | |
| 12 Unless required by applicable law or agreed to in writing, software | |
| 13 distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 See the License for the specific language governing permissions and | |
| 16 limitations under the License. | |
| 17 | |
| 18 =head1 CONTACT | |
| 19 | |
| 20 Ensembl <http://www.ensembl.org/info/about/contact/index.html> | |
| 21 | |
| 22 =cut | |
| 23 | |
| 24 =head1 NAME | |
| 25 | |
| 26 PolyPhen_SIFT | |
| 27 | |
| 28 =head1 SYNOPSIS | |
| 29 | |
| 30 mv PolyPhen_SIFT.pm ~/.vep/Plugins | |
| 31 ./vep -i variations.vcf -cache --plugin PolyPhen_SIFT | |
| 32 | |
| 33 =head1 DESCRIPTION | |
| 34 | |
| 35 A VEP plugin that retrieves PolyPhen and SIFT predictions from a | |
| 36 locally constructed sqlite database. It can be used when your main | |
| 37 source of VEP transcript annotation (e.g. a GFF file or GFF-based cache) | |
| 38 does not contain these predictions. | |
| 39 | |
| 40 You must either download or create a sqlite database of the predictions. | |
| 41 You may point to the file by adding db=[file] as a parameter: | |
| 42 | |
| 43 --plugin PolyPhen_SIFT,db=[file] | |
| 44 | |
| 45 Human predictions (assembly-independent) are available here: | |
| 46 | |
| 47 https://dl.dropboxusercontent.com/u/12936195/homo_sapiens.PolyPhen_SIFT.db | |
| 48 | |
| 49 (Please note the download location of this file may change) | |
| 50 | |
| 51 Place this file in $HOME/.vep to have the plugin find it automatically. | |
| 52 You may change this directory by adding dir=[dir] as a parameter: | |
| 53 | |
| 54 --plugin PolyPhen_SIFT,dir=[dir] | |
| 55 | |
| 56 To create the database, you must have an active database connection | |
| 57 (i.e. not using --offline) and add create_db=1 as a parameter: | |
| 58 | |
| 59 --plugin PolyPhen_SIFT,create_db=1 | |
| 60 | |
| 61 *** NB: this will take some time!!! *** | |
| 62 | |
| 63 By default the file is created as: | |
| 64 | |
| 65 ${HOME}/.vep/[species].PolyPhen_SIFT.db | |
| 66 | |
| 67 =cut | |
| 68 | |
| 69 package PolyPhen_SIFT; | |
| 70 | |
| 71 use strict; | |
| 72 use warnings; | |
| 73 use DBI; | |
| 74 use Digest::MD5 qw(md5_hex); | |
| 75 use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix; | |
| 76 | |
| 77 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | |
| 78 | |
| 79 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | |
| 80 | |
| 81 sub new { | |
| 82 my $class = shift; | |
| 83 | |
| 84 my $self = $class->SUPER::new(@_); | |
| 85 | |
| 86 my $param_hash = $self->params_to_hash(); | |
| 87 | |
| 88 my $species = $self->config->{species} || 'homo_sapiens'; | |
| 89 my $dir = $param_hash->{dir} || $self->{config}->{dir}; | |
| 90 my $db = $param_hash->{db} || $dir.'/'.$species.'.PolyPhen_SIFT.db'; | |
| 91 | |
| 92 # create DB? | |
| 93 if($param_hash->{create_db}) { | |
| 94 die("ERROR: DB file $db already exists - remove and re-run to overwrite\n") if -e $db; | |
| 95 | |
| 96 $self->{dbh} = DBI->connect("dbi:SQLite:dbname=$db","",""); | |
| 97 $self->{dbh}->do("CREATE TABLE predictions(md5, analysis, matrix)"); | |
| 98 | |
| 99 my $sth = $self->{dbh}->prepare("INSERT INTO predictions VALUES(?, ?, ?)"); | |
| 100 | |
| 101 my $mysql = Bio::EnsEMBL::Registry->get_adaptor($species, 'variation', 'variation')->db->dbc->prepare(qq{ | |
| 102 SELECT m.translation_md5, a.value, p.prediction_matrix | |
| 103 FROM translation_md5 m, attrib a, protein_function_predictions p | |
| 104 WHERE m.translation_md5_id = p.translation_md5_id | |
| 105 AND p.analysis_attrib_id = a.attrib_id | |
| 106 }, {mysql_use_result => 1}); | |
| 107 | |
| 108 my ($md5, $attrib, $matrix); | |
| 109 $mysql->execute(); | |
| 110 $mysql->bind_columns(\$md5, \$attrib, \$matrix); | |
| 111 $sth->execute($md5, $attrib, $matrix) while $mysql->fetch(); | |
| 112 $sth->finish(); | |
| 113 $mysql->finish(); | |
| 114 | |
| 115 $self->{dbh}->do("CREATE INDEX md5_idx ON predictions(md5)"); | |
| 116 } | |
| 117 | |
| 118 die("ERROR: DB file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db; | |
| 119 | |
| 120 $self->{initial_pid} = $$; | |
| 121 $self->{db_file} = $db; | |
| 122 | |
| 123 $self->{dbh} ||= DBI->connect("dbi:SQLite:dbname=$db","",""); | |
| 124 $self->{get_sth} = $self->{dbh}->prepare("SELECT md5, analysis, matrix FROM predictions WHERE md5 = ?"); | |
| 125 | |
| 126 return $self; | |
| 127 } | |
| 128 | |
| 129 sub feature_types { | |
| 130 return ['Transcript']; | |
| 131 } | |
| 132 | |
| 133 sub get_header_info { | |
| 134 return { | |
| 135 PolyPhen_humdiv_score => 'PolyPhen humdiv score from PolyPhen_SIFT plugin', | |
| 136 PolyPhen_humdiv_pred => 'PolyPhen humdiv prediction from PolyPhen_SIFT plugin', | |
| 137 PolyPhen_humvar_score => 'PolyPhen humvar score from PolyPhen_SIFT plugin', | |
| 138 PolyPhen_humvar_pred => 'PolyPhen humvar prediction from PolyPhen_SIFT plugin', | |
| 139 SIFT_score => 'SIFT score from PolyPhen_SIFT plugin', | |
| 140 SIFT_pred => 'SIFT prediction from PolyPhen_SIFT plugin', | |
| 141 }; | |
| 142 } | |
| 143 | |
| 144 sub run { | |
| 145 my ($self, $tva) = @_; | |
| 146 | |
| 147 # only for missense variants | |
| 148 return {} unless grep {$_->SO_term eq 'missense_variant'} @{$tva->get_all_OverlapConsequences}; | |
| 149 | |
| 150 my $tr = $tva->transcript; | |
| 151 my $tr_vep_cache = $tr->{_variation_effect_feature_cache} ||= {}; | |
| 152 | |
| 153 ## if predictions are not available for both tools in the cache, look in the SQLite database | |
| 154 unless(exists($tr_vep_cache->{protein_function_predictions}) && | |
| 155 $tva->sift_prediction() && $tva->polyphen_prediction() | |
| 156 ){ | |
| 157 | |
| 158 # get peptide | |
| 159 unless($tr_vep_cache->{peptide}) { | |
| 160 my $translation = $tr->translate; | |
| 161 return {} unless $translation; | |
| 162 $tr_vep_cache->{peptide} = $translation->seq; | |
| 163 } | |
| 164 | |
| 165 # get data, indexed on md5 of peptide sequence | |
| 166 my $md5 = md5_hex($tr_vep_cache->{peptide}); | |
| 167 | |
| 168 my $data = $self->fetch_from_cache($md5); | |
| 169 | |
| 170 unless($data) { | |
| 171 | |
| 172 # forked, reconnect to DB | |
| 173 if($$ != $self->{initial_pid}) { | |
| 174 $self->{dbh} = DBI->connect("dbi:SQLite:dbname=".$self->{db_file},"",""); | |
| 175 $self->{get_sth} = $self->{dbh}->prepare("SELECT md5, analysis, matrix FROM predictions WHERE md5 = ?"); | |
| 176 | |
| 177 # set this so only do once per fork | |
| 178 $self->{initial_pid} = $$; | |
| 179 } | |
| 180 | |
| 181 $self->{get_sth}->execute($md5); | |
| 182 | |
| 183 $data = {}; | |
| 184 | |
| 185 while(my $arrayref = $self->{get_sth}->fetchrow_arrayref) { | |
| 186 my $analysis = $arrayref->[1]; | |
| 187 my ($super_analysis, $sub_analysis) = split('_', $arrayref->[1]); | |
| 188 | |
| 189 $data->{$analysis} = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new( | |
| 190 -translation_md5 => $arrayref->[0], | |
| 191 -analysis => $super_analysis, | |
| 192 -sub_analysis => $sub_analysis, | |
| 193 -matrix => $arrayref->[2] | |
| 194 ); | |
| 195 } | |
| 196 | |
| 197 $self->add_to_cache($md5, $data); | |
| 198 } | |
| 199 | |
| 200 $tr_vep_cache->{protein_function_predictions} = $data; | |
| 201 } | |
| 202 | |
| 203 my $return = {}; | |
| 204 | |
| 205 foreach my $tool_string(qw(SIFT PolyPhen_humdiv PolyPhen_humvar)) { | |
| 206 my ($tool, $analysis) = split('_', $tool_string); | |
| 207 my $lc_tool = lc($tool); | |
| 208 | |
| 209 my $pred_meth = $lc_tool.'_prediction'; | |
| 210 my $score_meth = $lc_tool.'_score'; | |
| 211 | |
| 212 my $pred = $tva->$pred_meth($analysis); | |
| 213 | |
| 214 if($pred) { | |
| 215 $pred =~ s/\s+/\_/g; | |
| 216 $pred =~ s/\_\-\_/\_/g; | |
| 217 $return->{$tool_string.'_pred'} = $pred; | |
| 218 | |
| 219 my $score = $tva->$score_meth($analysis); | |
| 220 $return->{$tool_string.'_score'} = $score if defined($score); | |
| 221 } | |
| 222 } | |
| 223 | |
| 224 return $return; | |
| 225 } | |
| 226 | |
| 227 sub fetch_from_cache { | |
| 228 my $self = shift; | |
| 229 my $md5 = shift; | |
| 230 | |
| 231 my $cache = $self->{_cache} ||= []; | |
| 232 | |
| 233 my ($data) = map {$_->{data}} grep {$_->{md5} eq $md5} @$cache; | |
| 234 return $data; | |
| 235 } | |
| 236 | |
| 237 sub add_to_cache { | |
| 238 my $self = shift; | |
| 239 my $md5 = shift; | |
| 240 my $data = shift; | |
| 241 | |
| 242 my $cache = $self->{_cache} ||= []; | |
| 243 push @$cache, {md5 => $md5, data => $data}; | |
| 244 | |
| 245 shift @$cache while scalar @$cache > 50; | |
| 246 } | |
| 247 | |
| 248 1; | |
| 249 |
