view dir_plugins/LD.pm @ 10:f594c6bed58f draft default tip

Uploaded
author dvanzessen
date Tue, 21 Apr 2020 11:40:19 +0000
parents e545d0a25ffe
children
line wrap: on
line source

=head1 LICENSE

Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2018] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=head1 CONTACT

 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
  
=cut

=head1 NAME

 LD

=head1 SYNOPSIS

 mv LD.pm ~/.vep/Plugins
 ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8

=head1 DESCRIPTION

 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
 finds variants in linkage disequilibrium with any overlapping existing 
 variants from the Ensembl variation databases. You can configure the 
 population used to calculate the r2 value, and the r2 cutoff used by 
 passing arguments to the plugin via the VEP command line (separated 
 by commas). This plugin adds a single new entry to the Extra column 
 with a comma-separated list of linked variant IDs and the associated 
 r2 values, e.g.:

   LinkedVariants=rs123:0.879,rs234:0.943

 If no arguments are supplied, the default population used is the CEU
 sample from the 1000 Genomes Project phase 3, and the default r2
 cutoff used is 0.8.

 WARNING: Calculating LD is a relatively slow procedure, so this will 
 slow VEP down considerably when running on large numbers of
 variants. Consider running vep followed by filter_vep to get a smaller
 input set:

   ./vep -i input.vcf -cache -vcf -o input_vep.vcf
   ./filter_vep -i input_vep.vcf -filter "Consequence is missense_variant" > input_vep_filtered.vcf
   ./vep -i input_vep_filtered.vcf -cache -plugin LD

=cut

=head1 INSTALLATION

 LD calculation requires additional installation steps.

 The JSON perl library is required; see VEP's installation instructions
 for guidance: http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#additional

 A binary from the ensembl-variation git repository must be compiled and either
 added to your PATH or specified on the command line. In the ensembl-vep
 directory:

   export HTSLIB_DIR=${PWD}/htslib
   git clone https://github.com/Ensembl/ensembl-variation
   cd ensembl-variation/C_code
   make

 You may EITHER add this path to your PATH environment variable (add this line
 to your $HOME/.bashrc to make the change permanent):

   export PATH=${PATH}:${PWD}

 OR you may specify the full path to the ld_vcf binary on the vep command line:

   ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8,$PWD/ensembl-variation/C_code/ld_vcf

=cut

=head1 DATA

 By default genotype data to calculate LD is retrieved from tabix-indexed
 VCF files hosted on Ensembl's FTP servers. It is possible to download this
 data to your local machine and have the LD plugin read genotype data from
 there instead, giving faster performance and reducing network traffic.

 These commands show how to get the data files for GRCh38.

   mkdir variation_genotype
   cd variation_genotype
   lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.GRCh38_dbSNP.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/
   cd ..

 For GRCh37 replace the lftp command with:

   lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/

 We must now modify the JSON configuration file used to find the data. Starting
 in the ensembl-vep directory:

   perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json

 Or for GRCh37:

   perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json 

=cut

package LD;

use strict;
use warnings;

use Bio::EnsEMBL::Registry;

use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);

sub feature_types {
  return ['Feature','Intergenic'];
}

sub get_header_info {

  my $self = shift;
  
  return {
    LinkedVariants => "Variants in LD (r2 >= ".$self->{r2_cutoff}.
      ") with overlapping existing variants from the ".
      $self->{pop}->name." population",
  };
}

sub new {
  my $class = shift;

  my $self = $class->SUPER::new(@_);

  if ($self->config->{offline}) {
    warn "Warning: a connection to the database is required to calculate LD\n";
  }

  my $reg = 'Bio::EnsEMBL::Registry';

  # turn on the check for existing variants

  $self->config->{check_existing} = 1;

  # fetch our population

  my ($pop_name, $r2_cutoff, $ld_binary) = @{ $self->params };

  # set some defaults

  $pop_name ||= '1000GENOMES:phase_3:CEU';

  $r2_cutoff = 0.8 unless defined $r2_cutoff;

  my $pop_adap = $reg->get_adaptor('human', 'variation', 'population')
    || die "Failed to get population adaptor\n";

  my $valid_pops = $pop_adap->fetch_all_LD_Populations();
  my ($pop) = grep {$_->name eq $pop_name} @$valid_pops;
  die "Invalid population '$pop_name'; valid populations are:\n".join(", ", map {$_->name} @$valid_pops)."\n" unless $pop;

  $self->{pop} = $pop;
  $self->{r2_cutoff} = $r2_cutoff;
  
  # prefetch the necessary adaptors
  
  my $ld_adap = $reg->get_adaptor('human', 'variation', 'ldfeaturecontainer')
    || die "Failed to get LD adaptor\n";
  $ld_adap->db->use_vcf(1);
  my $var_adap = $reg->get_adaptor('human', 'variation', 'variation')
    || die "Failed to get variation adaptor\n";
    
  my $var_feat_adap = $reg->get_adaptor('human', 'variation', 'variationfeature')
    || die "Failed to get variation feature adaptor\n";

  if($ld_binary) {
    die("Specified LD binary \"$ld_binary\" does not exist\n") unless -e $ld_binary;
    $Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor::VCF_BINARY_FILE = $ld_binary;
  }
   
  $self->{ld_adap} = $ld_adap;
  $self->{var_adap} = $var_adap;
  $self->{var_feat_adap} = $var_feat_adap;

  return $self;
}

sub run {
  my ($self, $vfoa, $line_hash) = @_;

  # fetch the existing variants from the line hash
  return {} unless $line_hash->{Existing_variation};

  my @vars = ref($line_hash->{Existing_variation}) eq 'ARRAY' ? @{$line_hash->{Existing_variation}} : split(',', $line_hash->{Existing_variation});

  my @linked;


  for my $var (@vars) {

    # check cache
    my $res;

    if($self->{cache}) {
      ($res) = grep {$_->{var} eq $var} @{$self->{cache}};
    }
    
    unless($res) {
      my @this_linked;
    
      # fetch a variation for each overlapping variant ID
      if (my $v = $self->{var_adap}->fetch_by_name($var)) {

        # and fetch the associated variation features

        for my $vf (@{ $self->{var_feat_adap}->fetch_all_by_Variation($v) }) {

          # we're only interested in variation features that overlap our variant

          if ($vf->slice->name eq $vfoa->variation_feature->slice->name) {

            # fetch an LD feature container for this variation feature and our preconfigured population
            if (my $ldfc = $self->{ld_adap}->fetch_by_VariationFeature($vf, $self->{pop})) {
            
              # loop over all the linked variants
              # we pass 1 to get_all_ld_values() so that it doesn't lazy load
              # VariationFeature objects - we only need the name here anyway
              for my $result (@{ $ldfc->get_all_ld_values(1) }) {
              
                # apply our r2 cutoff

                if ($result->{r2} >= $self->{r2_cutoff}) {

                  my $v1 = $result->{variation_name1};
                  my $v2 = $result->{variation_name2};

                  # I'm not sure which of these are the query variant, so just check the names
                    
                  my $linked = $v1 eq $var ? $v2 : $v1;
                  
                  push @this_linked, sprintf("%s:%.3f", $linked, $result->{r2});
                }
              }
            }
          }
        }
      }

      # cache it
      $res = {
        var => $var,
        linked => \@this_linked
      };

      push @{$self->{cache}}, $res;
      shift @{$self->{cache}} while scalar @{$self->{cache}} > 50;
    }

    push @linked, @{$res->{linked}};
  }

  return scalar @linked ? {LinkedVariants => join(',', @linked)} : {};
}

1;