diff dir_plugins/PolyPhen_SIFT.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children | |
```diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dir_plugins/PolyPhen_SIFT.pm	Mon Jul 15 05:20:39 2019 -0400
@@ -0,0 +1,249 @@
+=head1 LICENSE
+
+Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
+Copyright [2016-2018] EMBL-European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+=head1 CONTACT
+
+ Ensembl <http://www.ensembl.org/info/about/contact/index.html>
+
+=cut
+
+=head1 NAME
+
+ PolyPhen_SIFT
+
+=head1 SYNOPSIS
+
+ mv PolyPhen_SIFT.pm ~/.vep/Plugins
+ ./vep -i variations.vcf -cache --plugin PolyPhen_SIFT
+
+=head1 DESCRIPTION
+
+ A VEP plugin that retrieves PolyPhen and SIFT predictions from a
+ locally constructed sqlite database. It can be used when your main
+ source of VEP transcript annotation (e.g. a GFF file or GFF-based cache)
+ does not contain these predictions.
+
+ You must either download or create a sqlite database of the predictions.
+ You may point to the file by adding db=[file] as a parameter:
+
+ --plugin PolyPhen_SIFT,db=[file]
+
+ Human predictions (assembly-independent) are available here:
+
+ https://dl.dropboxusercontent.com/u/12936195/homo_sapiens.PolyPhen_SIFT.db
+
+ (Please note the download location of this file may change)
+
+ Place this file in $HOME/.vep to have the plugin find it automatically.
+ You may change this directory by adding dir=[dir] as a parameter:
+
+ --plugin PolyPhen_SIFT,dir=[dir]
+
+ To create the database, you must have an active database connection
+ (i.e. not using --offline) and add create_db=1 as a parameter:
+
+ --plugin PolyPhen_SIFT,create_db=1
+
+ *** NB: this will take some time!!! ***
+
+ By default the file is created as:
+
+ ${HOME}/.vep/[species].PolyPhen_SIFT.db
+
+=cut
+
+package PolyPhen_SIFT;
+
+use strict;
+use warnings;
+use DBI;
+use Digest::MD5 qw(md5_hex);
+use Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix;
+
+use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
+
+use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
+
+sub new {
+  my $class = shift;
+
+  my $self = $class->SUPER::new(@_);
+
+  my $param_hash = $self->params_to_hash();
+
+  my $species = $self->config->{species} || 'homo_sapiens';
+  my $dir     = $param_hash->{dir} || $self->{config}->{dir};
+  my $db      = $param_hash->{db}  || $dir.'/'.$species.'.PolyPhen_SIFT.db';
+
+  # create DB?
+  if($param_hash->{create_db}) {
+    die("ERROR: DB file $db already exists - remove and re-run to overwrite\n") if -e $db;
+
+    $self->{dbh} = DBI->connect("dbi:SQLite:dbname=$db","","");
+    $self->{dbh}->do("CREATE TABLE predictions(md5, analysis, matrix)");
+
+    my $sth = $self->{dbh}->prepare("INSERT INTO predictions VALUES(?, ?, ?)");
+
+    my $mysql = Bio::EnsEMBL::Registry->get_adaptor($species, 'variation', 'variation')->db->dbc->prepare(qq{
+      SELECT m.translation_md5, a.value, p.prediction_matrix
+      FROM translation_md5 m, attrib a, protein_function_predictions p
+      WHERE m.translation_md5_id = p.translation_md5_id
+      AND p.analysis_attrib_id = a.attrib_id
+    }, {mysql_use_result => 1});
+
+    my ($md5, $attrib, $matrix);
+    $mysql->execute();
+    $mysql->bind_columns(\$md5, \$attrib, \$matrix);
+    $sth->execute($md5, $attrib, $matrix) while $mysql->fetch();
+    $sth->finish();
+    $mysql->finish();
+
+    $self->{dbh}->do("CREATE INDEX md5_idx ON predictions(md5)");
+  }
+
+  die("ERROR: DB file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db;
+
+  $self->{initial_pid} = $$;
+  $self->{db_file}     = $db;
+
+  $self->{dbh}     ||= DBI->connect("dbi:SQLite:dbname=$db","","");
+  $self->{get_sth}   = $self->{dbh}->prepare("SELECT md5, analysis, matrix FROM predictions WHERE md5 = ?");
+
+  return $self;
+}
+
+sub feature_types {
+  return ['Transcript'];
+}
+
+sub get_header_info {
+  return {
+    PolyPhen_humdiv_score => 'PolyPhen humdiv score from PolyPhen_SIFT plugin',
+    PolyPhen_humdiv_pred  => 'PolyPhen humdiv prediction from PolyPhen_SIFT plugin',
+    PolyPhen_humvar_score => 'PolyPhen humvar score from PolyPhen_SIFT plugin',
+    PolyPhen_humvar_pred  => 'PolyPhen humvar prediction from PolyPhen_SIFT plugin',
+    SIFT_score            => 'SIFT score from PolyPhen_SIFT plugin',
+    SIFT_pred             => 'SIFT prediction from PolyPhen_SIFT plugin',
+  };
+}
+
+sub run {
+  my ($self, $tva) = @_;
+
+  # only for missense variants
+  return {} unless grep {$_->SO_term eq 'missense_variant'} @{$tva->get_all_OverlapConsequences};
+
+  my $tr = $tva->transcript;
+  my $tr_vep_cache = $tr->{_variation_effect_feature_cache} ||= {};
+
+  ## if predictions are not available for both tools in the cache, look in the SQLite database
+  unless(exists($tr_vep_cache->{protein_function_predictions}) &&
+         $tva->sift_prediction() && $tva->polyphen_prediction()
+  ){
+
+    # get peptide
+    unless($tr_vep_cache->{peptide}) {
+      my $translation = $tr->translate;
+      return {} unless $translation;
+      $tr_vep_cache->{peptide} = $translation->seq;
+    }
+
+    # get data, indexed on md5 of peptide sequence
+    my $md5 = md5_hex($tr_vep_cache->{peptide});
+
+    my $data = $self->fetch_from_cache($md5);
+
+    unless($data) {
+
+      # forked, reconnect to DB
+      if($$ != $self->{initial_pid}) {
+        $self->{dbh}     = DBI->connect("dbi:SQLite:dbname=".$self->{db_file},"","");
+        $self->{get_sth} = $self->{dbh}->prepare("SELECT md5, analysis, matrix FROM predictions WHERE md5 = ?");
+
+        # set this so only do once per fork
+        $self->{initial_pid} = $$;
+      }
+
+      $self->{get_sth}->execute($md5);
+
+      $data = {};
+
+      while(my $arrayref = $self->{get_sth}->fetchrow_arrayref) {
+        my $analysis = $arrayref->[1];
+        my ($super_analysis, $sub_analysis) = split('_', $arrayref->[1]);
+
+        $data->{$analysis} = Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix->new(
+          -translation_md5 => $arrayref->[0],
+          -analysis        => $super_analysis,
+          -sub_analysis    => $sub_analysis,
+          -matrix          => $arrayref->[2]
+        );
+      }
+
+      $self->add_to_cache($md5, $data);
+    }
+
+    $tr_vep_cache->{protein_function_predictions} = $data;
+  }
+
+  my $return = {};
+
+  foreach my $tool_string(qw(SIFT PolyPhen_humdiv PolyPhen_humvar)) {
+    my ($tool, $analysis) = split('_', $tool_string);
+    my $lc_tool = lc($tool);
+
+    my $pred_meth  = $lc_tool.'_prediction';
+    my $score_meth = $lc_tool.'_score';
+
+    my $pred = $tva->$pred_meth($analysis);
+
+    if($pred) {
+      $pred =~ s/\s+/\_/g;
+      $pred =~ s/\_\-\_/\_/g;
+      $return->{$tool_string.'_pred'} = $pred;
+
+      my $score = $tva->$score_meth($analysis);
+      $return->{$tool_string.'_score'} = $score if defined($score);
+    }
+  }
+
+  return $return;
+}
+
+sub fetch_from_cache {
+  my $self = shift;
+  my $md5  = shift;
+
+  my $cache = $self->{_cache} ||= [];
+
+  my ($data) = map {$_->{data}} grep {$_->{md5} eq $md5} @$cache;
+  return $data;
+}
+
+sub add_to_cache {
+  my $self = shift;
+  my $md5  = shift;
+  my $data = shift;
+
+  my $cache = $self->{_cache} ||= [];
+  push @$cache, {md5 => $md5, data => $data};
+
+  shift @$cache while scalar @$cache > 50;
+}
+
+1;
```
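As the diff shows, the plugin's lookup table is simply `predictions(md5, analysis, matrix)`, keyed on the MD5 of the translated peptide sequence. The sketch below is not part of the upload; it is a minimal, hedged example of sanity-checking a downloaded or freshly built `[species].PolyPhen_SIFT.db` with plain DBI. The database path and peptide sequence are placeholder assumptions, and decoding the `matrix` column itself would additionally require `Bio::EnsEMBL::Variation::ProteinFunctionPredictionMatrix`, as in the plugin's `run()` method.

```perl
#!/usr/bin/env perl
# Sketch: verify a PolyPhen_SIFT SQLite database outside of VEP.
# Assumptions: default DB location under $HOME/.vep and a placeholder peptide.
use strict;
use warnings;
use DBI;
use Digest::MD5 qw(md5_hex);

my $db  = "$ENV{HOME}/.vep/homo_sapiens.PolyPhen_SIFT.db";   # assumed default path
my $dbh = DBI->connect("dbi:SQLite:dbname=$db", "", "", { RaiseError => 1 });

# How many peptides have stored prediction matrices?
my ($n) = $dbh->selectrow_array("SELECT COUNT(DISTINCT md5) FROM predictions");
print "$n distinct peptide md5s in $db\n";

# Look a peptide up the same way the plugin does: key on md5_hex of the
# translated protein sequence (replace with a real translation).
my $peptide = 'MEEPQSDPSV';                  # placeholder sequence
my $md5     = md5_hex($peptide);

my $sth = $dbh->prepare("SELECT analysis FROM predictions WHERE md5 = ?");
$sth->execute($md5);
while (my ($analysis) = $sth->fetchrow_array) {
  # Expect analysis values such as sift, polyphen_humdiv, polyphen_humvar
  print "found matrix for analysis: $analysis\n";
}
$sth->finish;
$dbh->disconnect;
```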
