Mercurial > repos > dvanzessen > vep_emc
diff dir_plugins/dbscSNV.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dir_plugins/dbscSNV.pm Mon Jul 15 05:20:39 2019 -0400 @@ -0,0 +1,234 @@ +=head1 LICENSE + +Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +Copyright [2016-2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=head1 CONTACT + + Ensembl <http://www.ensembl.org/info/about/contact/index.html> + +=cut + +=head1 NAME + + dbscSNV + +=head1 SYNOPSIS + + mv dbscSNV.pm ~/.vep/Plugins + ./vep -i variations.vcf --plugin dbscSNV,/path/to/dbscSNV1.1_GRCh38.txt.gz + +=head1 DESCRIPTION + + A VEP plugin that retrieves data for splicing variants from a tabix-indexed + dbscSNV file. + + Please cite the dbscSNV publication alongside the VEP if you use this resource: + http://nar.oxfordjournals.org/content/42/22/13534 + + The Bio::DB::HTS perl library or tabix utility must be installed in your path + to use this plugin. The dbscSNV data file can be downloaded from + https://sites.google.com/site/jpopgen/dbNSFP. + + The file must be processed and indexed by tabix before use by this plugin. + dbscSNV1.1 has coordinates for both GRCh38 and GRCh37; the file must be + processed differently according to the assembly you use. + + > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip + > unzip dbscSNV1.1.zip + > head -n1 dbscSNV1.1.chr1 > h + + # GRCh38 + > cat dbscSNV1.1.chr* | grep -v ^chr | sort -k5,5 -k6,6n | cat h - | bgzip -c > dbscSNV1.1_GRCh38.txt.gz + > tabix -s 5 -b 6 -e 6 -c c dbscSNV1.1_GRCh38.txt.gz + + # GRCh37 + > cat dbscSNV1.1.chr* | grep -v ^chr | cat h - | bgzip -c > dbscSNV1.1_GRCh37.txt.gz + > tabix -s 1 -b 2 -e 2 -c c dbscSNV1.1_GRCh37.txt.gz + + Note that in the last command we tell tabix that the header line starts with "c"; + this may change to the default of "#" in future versions of dbscSNV. + + Tabix also allows the data file to be hosted on a remote server. This plugin is + fully compatible with such a setup - simply use the URL of the remote file: + + --plugin dbscSNV,http://my.files.com/dbscSNV.txt.gz + + Note that transcript sequences referred to in dbscSNV may be out of sync with + those in the latest release of Ensembl; this may lead to discrepancies with + scores retrieved from other sources. + +=cut + +package dbscSNV; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); + +use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin; + +use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + $self->expand_left(0); + $self->expand_right(0); + + # get dbNSFP file + my $file = $self->params->[0]; + $self->add_file($file); + + if(my $assembly = $self->params->[1]) { + $self->{_param_assembly} = $assembly; + } + + # get headers + open HEAD, "tabix -fh $file 1:1-1 2>&1 | "; + while(<HEAD>) { + chomp; + $self->{headers} = [split]; + } + close HEAD; + + # check alt and Ensembl_transcriptid headers + foreach my $h(qw(alt Ensembl_gene)) { + die "ERROR: Could not find required column $h in $file\n" unless grep {$_ eq $h} @{$self->{headers}}; + } + + # check we have hg38_pos col, only present in dbscSNV >= 1.1 + if($self->pos_column eq 'hg38_pos') { + die("ERROR: Could not find hg38_pos column in $file\n") unless grep {$_ eq 'hg38_pos'} @{$self->{headers}}; + } + + $self->{cols} = { + 'ada_score' => 1, + 'rf_score' => 1 + }; + + return $self; +} + +sub feature_types { + return ['Transcript']; +} + +sub variation_feature_types { + return ['VariationFeature']; +} + +sub get_header_info { + return { + ada_score => 'dbscSNV ADA score', + rf_score => 'dbscSNV RF score' + } +} + +sub run { + my ($self, $tva) = @_; + + my $vf = $tva->variation_feature; + + return {} unless $vf->{start} eq $vf->{end}; + # return {} unless grep {$_->SO_term =~ /splic/} @{$tva->get_all_OverlapConsequences}; + + # get allele, reverse comp if needed + my $allele = $tva->variation_feature_seq; + reverse_comp(\$allele) if $vf->{strand} < 0; + + return {} unless $allele =~ /^[ACGT]$/; + + # get gene stable ID + my $g_id = $tva->transcript->{_gene_stable_id} || $tva->transcript->gene->stable_id; + + my $data; + my $pos_column = $self->pos_column; + + foreach my $tmp_data(@{$self->get_data($vf->{chr}, $vf->{start} - 1, $vf->{end})}) { + # compare allele and transcript + next unless + $tmp_data->{$pos_column} == $vf->{start} && + defined($tmp_data->{alt}) && + $tmp_data->{alt} eq $allele; # && + # defined($tmp_data->{Ensembl_gene}) && + # $tmp_data->{Ensembl_gene} =~ /$g_id($|;)/; + + $data = $tmp_data; + last; + } + + return {} unless scalar keys %$data; + + # get required data + my %return = + map {$_ => $data->{$_}} + grep {$data->{$_} ne '.'} # ignore missing data + grep {defined($self->{cols}->{$_})} # only include selected cols + keys %$data; + + return \%return; +} + +sub parse_data { + my ($self, $line) = @_; + + $line =~ s/\r$//g; + + my @split = split /\t/, $line; + + # parse data into hash of col names and values + my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1)); + + return \%data; +} + +sub get_start { + return $_[1]->{$_[0]->pos_column}; +} + +sub get_end { + return $_[1]->{$_[0]->pos_column}; +} + +sub pos_column { + my $self = shift; + + # work out which column to use + unless(exists($self->{pos_column})) { + if(my $assembly = $self->{_param_assembly} || $self->{config}->{assembly}) { + if($assembly eq 'GRCh37') { + $self->{pos_column} = 'pos'; + } + elsif($assembly eq 'GRCh38') { + $self->{pos_column} = 'hg38_pos'; + } + else { + die("ERROR: Assembly \"$assembly\" is not compatible with this plugin\n"); + } + } + else { + die("ERROR: Could not establish which position column to use based on assembly; try setting assembly manually with --assembly\n"); + } + } + + return $self->{pos_column}; +} + +1; +
