diff dir_plugins/dbNSFP.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dir_plugins/dbNSFP.pm	Mon Jul 15 05:20:39 2019 -0400
@@ -0,0 +1,383 @@
+=head1 LICENSE
+
+Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
+Copyright [2016-2018] EMBL-European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+=head1 CONTACT
+
+ Ensembl <http://www.ensembl.org/info/about/contact/index.html>
+    
+=cut
+
+=head1 NAME
+
+ dbNSFP
+
+=head1 SYNOPSIS
+
+ mv dbNSFP.pm ~/.vep/Plugins
+ ./vep -i variations.vcf --plugin dbNSFP,/path/to/dbNSFP.gz,col1,col2
+
+=head1 DESCRIPTION
+
+ A VEP plugin that retrieves data for missense variants from a tabix-indexed
+ dbNSFP file.
+ 
+ Please cite the dbNSFP publication alongside the VEP if you use this resource:
+ http://www.ncbi.nlm.nih.gov/pubmed/21520341
+ 
+ You must have the Bio::DB::HTS module or the tabix utility must be installed
+ in your path to use this plugin. The dbNSFP data file can be downloaded from
+ https://sites.google.com/site/jpopgen/dbNSFP.
+
+ Release 3.5a of dbNSFP uses GRCh38/hg38 coordinates and GRCh37/hg19
+ coordinates. 
+ To use plugin with GRCh37/hg19 data:
+ > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip
+ > unzip dbNSFPv3.5a.zip
+ > head -n1 dbNSFP3.5a_variant.chr1 > h
+ > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | awk '$8 != "."' | sort -k8,8 -k9,9n - | cat h - | bgzip -c > dbNSFP_hg19.gz
+ > tabix -s 8 -b 9 -e 9 dbNSFP_hg19.gz
+
+ To use plugin with GRCh38/hg38 data:
+ > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip
+ > unzip dbNSFPv3.5a.zip
+ > head -n1 dbNSFP3.5a_variant.chr1 > h
+ > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | sort -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP.gz
+ > tabix -s 1 -b 2 -e 2 dbNSFP.gz
+ 
+ When running the plugin you must list at least one column to retrieve from the
+ dbNSFP file, specified as parameters to the plugin e.g.
+ 
+ --plugin dbNSFP,/path/to/dbNSFP.gz,LRT_score,GERP++_RS
+
+ You may include all columns with ALL; this fetches a large amount of data per
+ variant!:
+ 
+ --plugin dbNSFP,/path/to/dbNSFP.gz,ALL
+ 
+ Tabix also allows the data file to be hosted on a remote server. This plugin is
+ fully compatible with such a setup - simply use the URL of the remote file:
+ 
+ --plugin dbNSFP,http://my.files.com/dbNSFP.gz,col1,col2
+
+ The plugin replaces occurrences of ';' with ',' and '|' with '&'. However, some
+ data field columns, e.g. Interpro_domain, use the replacement characters. We
+ added a file with replacement logic for customising the required replacement
+ of ';' and '|' in dbNSFP data columns. In addition to the default replacements
+ (; to , and | to &) users can add customised replacements. Users can either modify
+ the file dbNSFP_replacement_logic in the VEP_plugins directory or provide their own
+ file as second argument when calling the plugin:
+
+ --plugin dbNSFP,/path/to/dbNSFP.gz,/path/to/dbNSFP_replacement_logic,LRT_score,GERP++_RS
+ 
+ Note that transcript sequences referred to in dbNSFP may be out of sync with
+ those in the latest release of Ensembl; this may lead to discrepancies with
+ scores retrieved from other sources.
+
+ If the dbNSFP README file is found in the same directory as the data file,
+ column descriptions will be read from this and incorporated into the VEP output
+ file header.
+ 
+=cut
+
+package dbNSFP;
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
+
+use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
+
+use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
+
+my %INCLUDE_SO = map {$_ => 1} qw(missense_variant stop_lost stop_gained start_lost);
+
+sub new {
+  my $class = shift;
+  
+  my $self = $class->SUPER::new(@_);
+
+  $self->expand_left(0);
+  $self->expand_right(0);
+  
+  # get dbNSFP file
+  my $file = $self->params->[0];
+  $self->add_file($file);
+  
+  # get headers
+  open HEAD, "tabix -fh $file 1:1-1 2>&1 | ";
+  while(<HEAD>) {
+    next unless /^\#/;
+    chomp;
+    $self->{headers} = [split];
+  }
+  close HEAD;
+  
+  die "ERROR: Could not read headers from $file\n" unless defined($self->{headers}) && scalar @{$self->{headers}};
+  
+  # check alt and Ensembl_transcriptid headers
+  foreach my $h(qw(alt Ensembl_transcriptid)) {
+    die "ERROR: Could not find required column $h in $file\n" unless grep {$_ eq $h} @{$self->{headers}};
+  }
+
+  my $i = 1; 
+  # check if 2nd argument is a file that specifies replacement logic
+  # read replacement logic 
+  my $replacement_file = $self->params->[$i];
+  if (defined $replacement_file && -e $replacement_file) {
+    $self->add_replacement_logic($replacement_file);  
+    $i++;
+  } else {
+    $self->add_replacement_logic();  
+  } 
+ 
+  # get required columns
+  while(defined($self->params->[$i])) {
+    my $col = $self->params->[$i];
+    if($col eq 'ALL') {
+      $self->{cols} = {map {$_ => 1} @{$self->{headers}}};
+      last;
+    }
+    die "ERROR: Column $col not found in header for file $file. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless grep {$_ eq $col} @{$self->{headers}};
+    
+    $self->{cols}->{$self->params->[$i]} = 1;
+    $i++;
+  }
+  
+  die "ERROR: No columns selected to fetch. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless defined($self->{cols}) && scalar keys %{$self->{cols}};
+  
+  return $self;
+}
+
+sub feature_types {
+  return ['Transcript'];
+}
+
+sub get_header_info {
+  my $self = shift;
+
+  if(!exists($self->{_header_info})) {
+
+    # look for readme
+    my $file_dir = $self->files->[0];
+
+    my %rm_descs;
+
+    # won't work for remote
+    if($file_dir !~ /tp\:\/\//) {
+
+      # get just dir
+      $file_dir =~ s/\/[^\/]+$/\//;
+
+      if(opendir DIR, $file_dir) {
+        my ($readme_file) = grep {/dbnsfp.*readme\.txt/i} readdir DIR;
+        closedir DIR;
+
+        if(open RM, $file_dir.$readme_file) {
+          my ($col, $reading);
+
+          # parse dbNSFP readme
+          # relevant lines look like:
+          #
+          # 1   column1_name: description blah blah
+          #     blah blah blah
+          # 2   column2_name: description blah blah
+          #     blah blah blah
+
+          while(<RM>) {
+            chomp;
+            s/\r$//g;
+
+            if(/^\d+\s/) {
+              $reading = 1;
+
+              m/^\d+\s+(.+?)\:\s+(.+)/;
+              $col = $1;
+
+              $rm_descs{$col} = '(from dbNSFP) '.$2 if $col && $2;
+            }
+            elsif($reading && /\w/) {
+              s/^\s+//;
+              $rm_descs{$col} .= ' '.$_;
+            }
+            else {
+              $reading = 0;
+            }
+          }
+
+          close RM;
+
+          # remove multiple spaces
+          $rm_descs{$_} =~ s/\s+/ /g for keys %rm_descs;
+        }
+      }
+    }
+
+    $self->{_header_info} = {map {$_ => $rm_descs{$_} || ($_.' from dbNSFP file')} keys %{$self->{cols}}};
+  }
+  
+  return $self->{_header_info};
+}
+
+sub run {
+  my ($self, $tva) = @_;
+  
+  # only for missense variants
+  return {} unless grep {$INCLUDE_SO{$_->SO_term}} @{$tva->get_all_OverlapConsequences};
+  
+  my $vf = $tva->variation_feature;
+  
+  return {} unless $vf->{start} eq $vf->{end};
+  
+  # get allele, reverse comp if needed
+  my $allele = $tva->variation_feature_seq;
+  reverse_comp(\$allele) if $vf->{strand} < 0;
+  
+  return {} unless $allele =~ /^[ACGT]$/;
+  
+  # get transcript stable ID
+  my $tr_id = $tva->transcript->stable_id;
+
+  my $data;
+  my $pos;
+
+  my $assembly = $self->{config}->{assembly};
+  my $chr = ($vf->{chr} =~ /MT/i) ? 'M' : $vf->{chr};
+  foreach my $tmp_data(@{$self->get_data($chr, $vf->{start} - 1, $vf->{end})}) {
+    # compare allele and transcript
+    if ($assembly eq 'GRCh37') {
+      if (exists $tmp_data->{'pos(1-coor)'}) {
+        # for dbNSFP version 2.9.1
+        $pos = $tmp_data->{'pos(1-coor)'}
+      } elsif (exists $tmp_data->{'hg19_pos(1-based)'}) {
+        # for dbNSFP version 3.5c indexed for hg19/(=GRCh37)
+        $pos =  $tmp_data->{'hg19_pos(1-based)'}
+      } else {
+        die "dbNSFP file does not contain required columns (pos(1-coor) for version 2.9.1 or hg19_pos(1-based) for version 3.5c) to use with GRCh37";
+      }
+    } else {
+      if (exists $tmp_data->{'pos(1-based)'}) {
+        $pos = $tmp_data->{'pos(1-based)'}
+      } else {
+        die "dbNSFP file does not contain required column pos(1-based) to use with GRCh38";
+      }
+    }
+
+    next unless
+      $pos == $vf->{start} &&
+      defined($tmp_data->{alt}) &&
+      $tmp_data->{alt} eq $allele;
+    
+    # make a clean copy as we're going to edit it
+    %$data = %$tmp_data;
+
+    # convert data with multiple transcript values
+    # if($data->{Ensembl_transcriptid} =~ m/\;/) {
+
+    #   # find the "index" of this transcript
+    #   my @tr_ids = split(';', $data->{Ensembl_transcriptid});
+    #   my $tr_index;
+
+    #   for my $i(0..$#tr_ids) {
+    #     $tr_index = $i;
+    #     last if $tr_ids[$tr_index] =~ /^$tr_id(\.\d+)?$/;
+    #   }
+
+    #   next unless defined($tr_index);
+
+    #   # now alter other fields
+    #   foreach my $key(keys %$data) {
+    #     if($data->{$key} =~ m/\;/) {
+    #       my @split = split(';', $data->{$key});
+    #       die("ERROR: Transcript index out of range") if $tr_index > $#split;
+    #       $data->{$key} = $split[$tr_index];
+    #     } 
+    #   }
+    # }
+    last;
+  }
+  
+  return {} unless scalar keys %$data;
+  
+  # get required data
+  my @from = @{$self->{replacement}->{default}->{from}};
+  my @to = @{$self->{replacement}->{default}->{to}};
+
+  my %return;
+  foreach my $colname (keys %$data) {
+    next if(!defined($self->{cols}->{$colname}));
+    next if($data->{$colname} eq '.');
+
+    my @from = @{$self->{replacement}->{default}->{from}};
+    my @to   = @{$self->{replacement}->{default}->{to}};
+    @from    = @{$self->{replacement}->{$colname}->{from}} if (defined $self->{replacement}->{$colname});
+    @to      = @{$self->{replacement}->{$colname}->{to}} if (defined $self->{replacement}->{$colname});
+    for my $i (0 .. $#from) {
+      $data->{$colname} =~ s/\Q$from[$i]\E/$to[$i]/g;
+    }
+    $return{$colname} = $data->{$colname};
+  }
+  
+  return \%return;
+}
+
+sub parse_data {
+  my ($self, $line) = @_;
+
+  $line =~ s/\r$//g;
+
+  my @split = split /\t/, $line;
+  
+  # parse data into hash of col names and values
+  my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1));
+
+  return \%data;
+}
+
+sub get_start {  
+  return $_[1]->{'pos(1-based)'};
+}
+
+sub get_end {
+  return $_[1]->{'pos(1-based)'};
+}
+
+sub add_replacement_logic {
+  my $self = shift;
+  my $file = shift;
+  $file ||= 'dbNSFP_replacement_logic';
+  if (! -e $file) {
+    $self->{replacement}->{default}->{from} = [';', '|'];
+    $self->{replacement}->{default}->{to} = [',', '&'];
+  } else {
+    open FILE, $file;
+    while(<FILE>) {
+      chomp;
+      next if /^colname/;
+      my ($colname, $from, $to) = split/\s+/;
+      die ("ERROR: 3 values separated by whitespace are required: colname from to.") if(!($colname && $from && $to));
+      push @{$self->{replacement}->{$colname}->{from}}, $from;
+      push @{$self->{replacement}->{$colname}->{to}}, $to;
+    }
+    close FILE;
+    die("ERROR: No default replacement logic has been specified.\n") if (!defined $self->{replacement}->{default});
+  }
+}
+
+1;
+
+