Mercurial > repos > dvanzessen > vep_emc
diff dir_plugins/LD.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dir_plugins/LD.pm Mon Jul 15 05:20:39 2019 -0400 @@ -0,0 +1,277 @@ +=head1 LICENSE + +Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +Copyright [2016-2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=head1 CONTACT + + Ensembl <http://www.ensembl.org/info/about/contact/index.html> + +=cut + +=head1 NAME + + LD + +=head1 SYNOPSIS + + mv LD.pm ~/.vep/Plugins + ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8 + +=head1 DESCRIPTION + + This is a plugin for the Ensembl Variant Effect Predictor (VEP) that + finds variants in linkage disequilibrium with any overlapping existing + variants from the Ensembl variation databases. You can configure the + population used to calculate the r2 value, and the r2 cutoff used by + passing arguments to the plugin via the VEP command line (separated + by commas). This plugin adds a single new entry to the Extra column + with a comma-separated list of linked variant IDs and the associated + r2 values, e.g.: + + LinkedVariants=rs123:0.879,rs234:0.943 + + If no arguments are supplied, the default population used is the CEU + sample from the 1000 Genomes Project phase 3, and the default r2 + cutoff used is 0.8. + + WARNING: Calculating LD is a relatively slow procedure, so this will + slow VEP down considerably when running on large numbers of + variants. Consider running vep followed by filter_vep to get a smaller + input set: + + ./vep -i input.vcf -cache -vcf -o input_vep.vcf + ./filter_vep -i input_vep.vcf -filter "Consequence is missense_variant" > input_vep_filtered.vcf + ./vep -i input_vep_filtered.vcf -cache -plugin LD + +=cut + +=head1 INSTALLATION + + LD calculation requires additional installation steps. + + The JSON perl library is required; see VEP's installation instructions + for guidance: http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#additional + + A binary from the ensembl-variation git repository must be compiled and either + added to your PATH or specified on the command line. In the ensembl-vep + directory: + + export HTSLIB_DIR=${PWD}/htslib + git clone https://github.com/Ensembl/ensembl-variation + cd ensembl-variation/C_code + make + + You may EITHER add this path to your PATH environment variable (add this line + to your $HOME/.bashrc to make the change permanent): + + export PATH=${PATH}:${PWD} + + OR you may specify the full path to the ld_vcf binary on the vep command line: + + ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8,$PWD/ensembl-variation/C_code/ld_vcf + +=cut + +=head1 DATA + + By default genotype data to calculate LD is retrieved from tabix-indexed + VCF files hosted on Ensembl's FTP servers. It is possible to download this + data to your local machine and have the LD plugin read genotype data from + there instead, giving faster performance and reducing network traffic. + + These commands show how to get the data files for GRCh38. + + mkdir variation_genotype + cd variation_genotype + lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.GRCh38_dbSNP.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/ + cd .. + + For GRCh37 replace the lftp command with: + + lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/ + + We must now modify the JSON configuration file used to find the data. Starting + in the ensembl-vep directory: + + perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json + + Or for GRCh37: + + perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json + +=cut + +package LD; + +use strict; +use warnings; + +use Bio::EnsEMBL::Registry; + +use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); + +sub feature_types { + return ['Feature','Intergenic']; +} + +sub get_header_info { + + my $self = shift; + + return { + LinkedVariants => "Variants in LD (r2 >= ".$self->{r2_cutoff}. + ") with overlapping existing variants from the ". + $self->{pop}->name." population", + }; +} + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + if ($self->config->{offline}) { + warn "Warning: a connection to the database is required to calculate LD\n"; + } + + my $reg = 'Bio::EnsEMBL::Registry'; + + # turn on the check for existing variants + + $self->config->{check_existing} = 1; + + # fetch our population + + my ($pop_name, $r2_cutoff, $ld_binary) = @{ $self->params }; + + # set some defaults + + $pop_name ||= '1000GENOMES:phase_3:CEU'; + + $r2_cutoff = 0.8 unless defined $r2_cutoff; + + my $pop_adap = $reg->get_adaptor('human', 'variation', 'population') + || die "Failed to get population adaptor\n"; + + my $valid_pops = $pop_adap->fetch_all_LD_Populations(); + my ($pop) = grep {$_->name eq $pop_name} @$valid_pops; + die "Invalid population '$pop_name'; valid populations are:\n".join(", ", map {$_->name} @$valid_pops)."\n" unless $pop; + + $self->{pop} = $pop; + $self->{r2_cutoff} = $r2_cutoff; + + # prefetch the necessary adaptors + + my $ld_adap = $reg->get_adaptor('human', 'variation', 'ldfeaturecontainer') + || die "Failed to get LD adaptor\n"; + $ld_adap->db->use_vcf(1); + my $var_adap = $reg->get_adaptor('human', 'variation', 'variation') + || die "Failed to get variation adaptor\n"; + + my $var_feat_adap = $reg->get_adaptor('human', 'variation', 'variationfeature') + || die "Failed to get variation feature adaptor\n"; + + if($ld_binary) { + die("Specified LD binary \"$ld_binary\" does not exist\n") unless -e $ld_binary; + $Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor::VCF_BINARY_FILE = $ld_binary; + } + + $self->{ld_adap} = $ld_adap; + $self->{var_adap} = $var_adap; + $self->{var_feat_adap} = $var_feat_adap; + + return $self; +} + +sub run { + my ($self, $vfoa, $line_hash) = @_; + + # fetch the existing variants from the line hash + return {} unless $line_hash->{Existing_variation}; + + my @vars = ref($line_hash->{Existing_variation}) eq 'ARRAY' ? @{$line_hash->{Existing_variation}} : split(',', $line_hash->{Existing_variation}); + + my @linked; + + + for my $var (@vars) { + + # check cache + my $res; + + if($self->{cache}) { + ($res) = grep {$_->{var} eq $var} @{$self->{cache}}; + } + + unless($res) { + my @this_linked; + + # fetch a variation for each overlapping variant ID + if (my $v = $self->{var_adap}->fetch_by_name($var)) { + + # and fetch the associated variation features + + for my $vf (@{ $self->{var_feat_adap}->fetch_all_by_Variation($v) }) { + + # we're only interested in variation features that overlap our variant + + if ($vf->slice->name eq $vfoa->variation_feature->slice->name) { + + # fetch an LD feature container for this variation feature and our preconfigured population + if (my $ldfc = $self->{ld_adap}->fetch_by_VariationFeature($vf, $self->{pop})) { + + # loop over all the linked variants + # we pass 1 to get_all_ld_values() so that it doesn't lazy load + # VariationFeature objects - we only need the name here anyway + for my $result (@{ $ldfc->get_all_ld_values(1) }) { + + # apply our r2 cutoff + + if ($result->{r2} >= $self->{r2_cutoff}) { + + my $v1 = $result->{variation_name1}; + my $v2 = $result->{variation_name2}; + + # I'm not sure which of these are the query variant, so just check the names + + my $linked = $v1 eq $var ? $v2 : $v1; + + push @this_linked, sprintf("%s:%.3f", $linked, $result->{r2}); + } + } + } + } + } + } + + # cache it + $res = { + var => $var, + linked => \@this_linked + }; + + push @{$self->{cache}}, $res; + shift @{$self->{cache}} while scalar @{$self->{cache}} > 50; + } + + push @linked, @{$res->{linked}}; + } + + return scalar @linked ? {LinkedVariants => join(',', @linked)} : {}; +} + +1; +
