| 0 | 1 =head1 LICENSE | 
|  | 2 | 
|  | 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | 
|  | 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | 
|  | 5 | 
|  | 6 Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 7 you may not use this file except in compliance with the License. | 
|  | 8 You may obtain a copy of the License at | 
|  | 9 | 
|  | 10    http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 11 | 
|  | 12 Unless required by applicable law or agreed to in writing, software | 
|  | 13 distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 15 See the License for the specific language governing permissions and | 
|  | 16 limitations under the License. | 
|  | 17 | 
|  | 18 =head1 CONTACT | 
|  | 19 | 
|  | 20  Ensembl <http://www.ensembl.org/info/about/contact/index.html> | 
|  | 21 | 
|  | 22 =cut | 
|  | 23 | 
|  | 24 =head1 NAME | 
|  | 25 | 
|  | 26  LD | 
|  | 27 | 
|  | 28 =head1 SYNOPSIS | 
|  | 29 | 
|  | 30  mv LD.pm ~/.vep/Plugins | 
|  | 31  ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8 | 
|  | 32 | 
|  | 33 =head1 DESCRIPTION | 
|  | 34 | 
|  | 35  This is a plugin for the Ensembl Variant Effect Predictor (VEP) that | 
|  | 36  finds variants in linkage disequilibrium with any overlapping existing | 
|  | 37  variants from the Ensembl variation databases. You can configure the | 
|  | 38  population used to calculate the r2 value, and the r2 cutoff used by | 
|  | 39  passing arguments to the plugin via the VEP command line (separated | 
|  | 40  by commas). This plugin adds a single new entry to the Extra column | 
|  | 41  with a comma-separated list of linked variant IDs and the associated | 
|  | 42  r2 values, e.g.: | 
|  | 43 | 
|  | 44    LinkedVariants=rs123:0.879,rs234:0.943 | 
|  | 45 | 
|  | 46  If no arguments are supplied, the default population used is the CEU | 
|  | 47  sample from the 1000 Genomes Project phase 3, and the default r2 | 
|  | 48  cutoff used is 0.8. | 
|  | 49 | 
|  | 50  WARNING: Calculating LD is a relatively slow procedure, so this will | 
|  | 51  slow VEP down considerably when running on large numbers of | 
|  | 52  variants. Consider running vep followed by filter_vep to get a smaller | 
|  | 53  input set: | 
|  | 54 | 
|  | 55    ./vep -i input.vcf -cache -vcf -o input_vep.vcf | 
|  | 56    ./filter_vep -i input_vep.vcf -filter "Consequence is missense_variant" > input_vep_filtered.vcf | 
|  | 57    ./vep -i input_vep_filtered.vcf -cache -plugin LD | 
|  | 58 | 
|  | 59 =cut | 
|  | 60 | 
|  | 61 =head1 INSTALLATION | 
|  | 62 | 
|  | 63  LD calculation requires additional installation steps. | 
|  | 64 | 
|  | 65  The JSON perl library is required; see VEP's installation instructions | 
|  | 66  for guidance: http://www.ensembl.org/info/docs/tools/vep/script/vep_download.html#additional | 
|  | 67 | 
|  | 68  A binary from the ensembl-variation git repository must be compiled and either | 
|  | 69  added to your PATH or specified on the command line. In the ensembl-vep | 
|  | 70  directory: | 
|  | 71 | 
|  | 72    export HTSLIB_DIR=${PWD}/htslib | 
|  | 73    git clone https://github.com/Ensembl/ensembl-variation | 
|  | 74    cd ensembl-variation/C_code | 
|  | 75    make | 
|  | 76 | 
|  | 77  You may EITHER add this path to your PATH environment variable (add this line | 
|  | 78  to your $HOME/.bashrc to make the change permanent): | 
|  | 79 | 
|  | 80    export PATH=${PATH}:${PWD} | 
|  | 81 | 
|  | 82  OR you may specify the full path to the ld_vcf binary on the vep command line: | 
|  | 83 | 
|  | 84    ./vep -i variations.vcf --plugin LD,1000GENOMES:phase_3:CEU,0.8,$PWD/ensembl-variation/C_code/ld_vcf | 
|  | 85 | 
|  | 86 =cut | 
|  | 87 | 
|  | 88 =head1 DATA | 
|  | 89 | 
|  | 90  By default genotype data to calculate LD is retrieved from tabix-indexed | 
|  | 91  VCF files hosted on Ensembl's FTP servers. It is possible to download this | 
|  | 92  data to your local machine and have the LD plugin read genotype data from | 
|  | 93  there instead, giving faster performance and reducing network traffic. | 
|  | 94 | 
|  | 95  These commands show how to get the data files for GRCh38. | 
|  | 96 | 
|  | 97    mkdir variation_genotype | 
|  | 98    cd variation_genotype | 
|  | 99    lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.GRCh38_dbSNP.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/ | 
|  | 100    cd .. | 
|  | 101 | 
|  | 102  For GRCh37 replace the lftp command with: | 
|  | 103 | 
|  | 104    lftp -e "mget ALL.chr*.phase3_shapeit2_mvncall_integrated_v3plus_nounphased.rsID.genotypes.vcf.gz*" ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37/variation_genotype/ | 
|  | 105 | 
|  | 106  We must now modify the JSON configuration file used to find the data. Starting | 
|  | 107  in the ensembl-vep directory: | 
|  | 108 | 
|  | 109    perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json | 
|  | 110 | 
|  | 111  Or for GRCh37: | 
|  | 112 | 
|  | 113    perl -pi -e "s|ftp://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh37|$PWD|" Bio/EnsEMBL/Variation/DBSQL/vcf_config.json | 
|  | 114 | 
|  | 115 =cut | 
|  | 116 | 
|  | 117 package LD; | 
|  | 118 | 
|  | 119 use strict; | 
|  | 120 use warnings; | 
|  | 121 | 
|  | 122 use Bio::EnsEMBL::Registry; | 
|  | 123 | 
|  | 124 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | 
|  | 125 | 
|  | 126 sub feature_types { | 
|  | 127   return ['Feature','Intergenic']; | 
|  | 128 } | 
|  | 129 | 
|  | 130 sub get_header_info { | 
|  | 131 | 
|  | 132   my $self = shift; | 
|  | 133 | 
|  | 134   return { | 
|  | 135     LinkedVariants => "Variants in LD (r2 >= ".$self->{r2_cutoff}. | 
|  | 136       ") with overlapping existing variants from the ". | 
|  | 137       $self->{pop}->name." population", | 
|  | 138   }; | 
|  | 139 } | 
|  | 140 | 
|  | 141 sub new { | 
|  | 142   my $class = shift; | 
|  | 143 | 
|  | 144   my $self = $class->SUPER::new(@_); | 
|  | 145 | 
|  | 146   if ($self->config->{offline}) { | 
|  | 147     warn "Warning: a connection to the database is required to calculate LD\n"; | 
|  | 148   } | 
|  | 149 | 
|  | 150   my $reg = 'Bio::EnsEMBL::Registry'; | 
|  | 151 | 
|  | 152   # turn on the check for existing variants | 
|  | 153 | 
|  | 154   $self->config->{check_existing} = 1; | 
|  | 155 | 
|  | 156   # fetch our population | 
|  | 157 | 
|  | 158   my ($pop_name, $r2_cutoff, $ld_binary) = @{ $self->params }; | 
|  | 159 | 
|  | 160   # set some defaults | 
|  | 161 | 
|  | 162   $pop_name ||= '1000GENOMES:phase_3:CEU'; | 
|  | 163 | 
|  | 164   $r2_cutoff = 0.8 unless defined $r2_cutoff; | 
|  | 165 | 
|  | 166   my $pop_adap = $reg->get_adaptor('human', 'variation', 'population') | 
|  | 167     || die "Failed to get population adaptor\n"; | 
|  | 168 | 
|  | 169   my $valid_pops = $pop_adap->fetch_all_LD_Populations(); | 
|  | 170   my ($pop) = grep {$_->name eq $pop_name} @$valid_pops; | 
|  | 171   die "Invalid population '$pop_name'; valid populations are:\n".join(", ", map {$_->name} @$valid_pops)."\n" unless $pop; | 
|  | 172 | 
|  | 173   $self->{pop} = $pop; | 
|  | 174   $self->{r2_cutoff} = $r2_cutoff; | 
|  | 175 | 
|  | 176   # prefetch the necessary adaptors | 
|  | 177 | 
|  | 178   my $ld_adap = $reg->get_adaptor('human', 'variation', 'ldfeaturecontainer') | 
|  | 179     || die "Failed to get LD adaptor\n"; | 
|  | 180   $ld_adap->db->use_vcf(1); | 
|  | 181   my $var_adap = $reg->get_adaptor('human', 'variation', 'variation') | 
|  | 182     || die "Failed to get variation adaptor\n"; | 
|  | 183 | 
|  | 184   my $var_feat_adap = $reg->get_adaptor('human', 'variation', 'variationfeature') | 
|  | 185     || die "Failed to get variation feature adaptor\n"; | 
|  | 186 | 
|  | 187   if($ld_binary) { | 
|  | 188     die("Specified LD binary \"$ld_binary\" does not exist\n") unless -e $ld_binary; | 
|  | 189     $Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor::VCF_BINARY_FILE = $ld_binary; | 
|  | 190   } | 
|  | 191 | 
|  | 192   $self->{ld_adap} = $ld_adap; | 
|  | 193   $self->{var_adap} = $var_adap; | 
|  | 194   $self->{var_feat_adap} = $var_feat_adap; | 
|  | 195 | 
|  | 196   return $self; | 
|  | 197 } | 
|  | 198 | 
|  | 199 sub run { | 
|  | 200   my ($self, $vfoa, $line_hash) = @_; | 
|  | 201 | 
|  | 202   # fetch the existing variants from the line hash | 
|  | 203   return {} unless $line_hash->{Existing_variation}; | 
|  | 204 | 
|  | 205   my @vars = ref($line_hash->{Existing_variation}) eq 'ARRAY' ? @{$line_hash->{Existing_variation}} : split(',', $line_hash->{Existing_variation}); | 
|  | 206 | 
|  | 207   my @linked; | 
|  | 208 | 
|  | 209 | 
|  | 210   for my $var (@vars) { | 
|  | 211 | 
|  | 212     # check cache | 
|  | 213     my $res; | 
|  | 214 | 
|  | 215     if($self->{cache}) { | 
|  | 216       ($res) = grep {$_->{var} eq $var} @{$self->{cache}}; | 
|  | 217     } | 
|  | 218 | 
|  | 219     unless($res) { | 
|  | 220       my @this_linked; | 
|  | 221 | 
|  | 222       # fetch a variation for each overlapping variant ID | 
|  | 223       if (my $v = $self->{var_adap}->fetch_by_name($var)) { | 
|  | 224 | 
|  | 225         # and fetch the associated variation features | 
|  | 226 | 
|  | 227         for my $vf (@{ $self->{var_feat_adap}->fetch_all_by_Variation($v) }) { | 
|  | 228 | 
|  | 229           # we're only interested in variation features that overlap our variant | 
|  | 230 | 
|  | 231           if ($vf->slice->name eq $vfoa->variation_feature->slice->name) { | 
|  | 232 | 
|  | 233             # fetch an LD feature container for this variation feature and our preconfigured population | 
|  | 234             if (my $ldfc = $self->{ld_adap}->fetch_by_VariationFeature($vf, $self->{pop})) { | 
|  | 235 | 
|  | 236               # loop over all the linked variants | 
|  | 237               # we pass 1 to get_all_ld_values() so that it doesn't lazy load | 
|  | 238               # VariationFeature objects - we only need the name here anyway | 
|  | 239               for my $result (@{ $ldfc->get_all_ld_values(1) }) { | 
|  | 240 | 
|  | 241                 # apply our r2 cutoff | 
|  | 242 | 
|  | 243                 if ($result->{r2} >= $self->{r2_cutoff}) { | 
|  | 244 | 
|  | 245                   my $v1 = $result->{variation_name1}; | 
|  | 246                   my $v2 = $result->{variation_name2}; | 
|  | 247 | 
|  | 248                   # I'm not sure which of these are the query variant, so just check the names | 
|  | 249 | 
|  | 250                   my $linked = $v1 eq $var ? $v2 : $v1; | 
|  | 251 | 
|  | 252                   push @this_linked, sprintf("%s:%.3f", $linked, $result->{r2}); | 
|  | 253                 } | 
|  | 254               } | 
|  | 255             } | 
|  | 256           } | 
|  | 257         } | 
|  | 258       } | 
|  | 259 | 
|  | 260       # cache it | 
|  | 261       $res = { | 
|  | 262         var => $var, | 
|  | 263         linked => \@this_linked | 
|  | 264       }; | 
|  | 265 | 
|  | 266       push @{$self->{cache}}, $res; | 
|  | 267       shift @{$self->{cache}} while scalar @{$self->{cache}} > 50; | 
|  | 268     } | 
|  | 269 | 
|  | 270     push @linked, @{$res->{linked}}; | 
|  | 271   } | 
|  | 272 | 
|  | 273   return scalar @linked ? {LinkedVariants => join(',', @linked)} : {}; | 
|  | 274 } | 
|  | 275 | 
|  | 276 1; | 
|  | 277 |