| 0 | 1 =head1 LICENSE | 
|  | 2 | 
|  | 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | 
|  | 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | 
|  | 5 | 
|  | 6 Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 7 you may not use this file except in compliance with the License. | 
|  | 8 You may obtain a copy of the License at | 
|  | 9 | 
|  | 10      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 11 | 
|  | 12 Unless required by applicable law or agreed to in writing, software | 
|  | 13 distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 15 See the License for the specific language governing permissions and | 
|  | 16 limitations under the License. | 
|  | 17 | 
|  | 18 =head1 CONTACT | 
|  | 19 | 
|  | 20  Ensembl <http://www.ensembl.org/info/about/contact/index.html> | 
|  | 21 | 
|  | 22 =cut | 
|  | 23 | 
|  | 24 =head1 NAME | 
|  | 25 | 
|  | 26  ExAC | 
|  | 27 | 
|  | 28 =head1 SYNOPSIS | 
|  | 29 | 
|  | 30  mv ExAC.pm ~/.vep/Plugins | 
|  | 31  ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz | 
|  | 32  ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC | 
|  | 33  ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,,AN | 
|  | 34  ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC,AN | 
|  | 35 | 
|  | 36 | 
|  | 37 | 
|  | 38 =head1 DESCRIPTION | 
|  | 39 | 
|  | 40  A VEP plugin that retrieves ExAC allele frequencies. | 
|  | 41 | 
|  | 42  Visit ftp://ftp.broadinstitute.org/pub/ExAC_release/current to download the latest ExAC VCF. | 
|  | 43 | 
|  | 44  Note that the currently available version of the ExAC data file (0.3) is only available | 
|  | 45  on the GRCh37 assembly; therefore it can only be used with this plugin when using the | 
|  | 46  VEP on GRCh37. See http://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#assembly | 
|  | 47 | 
|  | 48  The tabix utility must be installed in your path to use this plugin. | 
|  | 49 | 
 The plugin takes up to 3 command line arguments; the second and third arguments are optional. If AC is specified as
 the second argument, allele counts per population will be included in the output. If AN is specified as the third
 argument, allele-specific chromosome counts will be included in the output.
|  | 53 | 
|  | 54 | 
|  | 55 =cut | 
|  | 56 | 
|  | 57 package ExAC; | 
|  | 58 | 
|  | 59 use strict; | 
|  | 60 use warnings; | 
|  | 61 | 
|  | 62 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | 
|  | 63 use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles); | 
|  | 64 | 
|  | 65 use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line get_slice); | 
|  | 66 | 
|  | 67 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | 
|  | 68 | 
|  | 69 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | 
|  | 70 | 
# Constructor. Builds the plugin via the parent class, checks that tabix is
# available, and records the ExAC VCF location and the optional AC/AN output
# switches on the object.
#
# Plugin parameters (read from $self->params):
#   [0] path or URL of the tabix-indexed ExAC VCF (required)
#   [1] 'AC' to add allele counts to the output (optional)
#   [2] 'AN' to add allele numbers to the output (optional)
#
# Sets $self->{file}, $self->{display_ac} and $self->{display_an}.
#
# Dies if tabix cannot be found in the PATH, if no file was given, or if a
# local file or its .tbi index does not exist.
sub new {
  my $class = shift;

  my $self = $class->SUPER::new(@_);

  # test tabix
  die "ERROR: tabix does not seem to be in your path\n" unless `which tabix 2>&1` =~ /tabix$/;

  # get ExAC file; fail early with a clear message rather than emitting
  # "uninitialized value" warnings in the checks below
  my $file = $self->params->[0];
  die "ERROR: No ExAC file specified; usage: --plugin ExAC,/path/to/ExAC.vcf.gz[,AC][,AN]\n"
    unless defined($file) && length($file);

  # get AC,AN options; an absent or empty parameter leaves the switch off
  my ($ac_opt, $an_opt) = @{$self->params}[1, 2];
  $self->{display_ac} = (defined($ac_opt) && $ac_opt eq 'AC') ? 1 : 0;
  $self->{display_an} = (defined($an_opt) && $an_opt eq 'AN') ? 1 : 0;

  # remote files? (ftp://, http:// and also https://, ftps://)
  if($file =~ m{tps?://}) {

    # the location goes through a shell here, so single-quote it and escape
    # any embedded single quotes
    (my $quoted_file = $file) =~ s/'/'\\''/g;

    my $remote_test = `tabix -f '$quoted_file' 1:1-1 2>&1`;

    # the probe is informational only: report whatever tabix said, but do
    # not treat it as fatal
    print STDERR "$remote_test\n" if defined($remote_test) && length($remote_test);
  }

  # check files exist
  else {
    die "ERROR: ExAC file $file not found; you can download it from ftp://ftp.broadinstitute.org/pub/ExAC_release/current\n" unless -e $file;
    die "ERROR: Tabix index file $file\.tbi not found - perhaps you need to create it first?\n" unless -e $file.'.tbi';
  }

  $self->{file} = $file;

  return $self;
}
|  | 116 | 
# Lists the feature types this plugin declares: 'Feature' and 'Intergenic'.
# Returns a reference to a new array on each call.
sub feature_types {
  my @types = qw(Feature Intergenic);
  return \@types;
}
|  | 120 | 
# Builds (once) and returns the hashref of output field names => descriptions
# contributed by this plugin, derived from the AC* INFO definitions in the
# header of the ExAC VCF.
#
# For every AC INFO field found (AC, AC_AFR, AC_Adj, ...) an ExAC_AF* entry is
# added; ExAC_AC* and ExAC_AN* entries are added when display_ac / display_an
# are set. The raw AC field names are stored in $self->{headers} for run().
#
# The result is cached in $self->{header_info}; later calls return the cached
# hashref without touching the file.
#
# Dies if tabix cannot be started or if no AC INFO definitions are found.
sub get_header_info {
  my $self = shift;

  return $self->{header_info} if exists($self->{header_info});

  # list-form pipe open: no shell is involved, so the file location needs no
  # quoting, and a failure to start tabix is reported instead of ignored
  open(my $tabix_fh, '-|', 'tabix', '-f', '-h', $self->{file}, '1:1-1')
    or die "ERROR: Could not run tabix on ".$self->{file}.": $!\n";

  my %headers;
  my @ac_fields;

  while(my $line = <$tabix_fh>) {

    # INFO definitions for AC and the per-population AC_XXX fields;
    # $1 is the optional "_POP" suffix, $2 the quoted description
    next unless $line =~ /ID\=AC(\_[A-Zdj]+)?\,.*\"(.+)\"/;
    my ($pop, $desc) = ($1, $2);

    # the AF description is derived from the AC one
    $desc =~ s/Counts?/frequency/i;
    $pop ||= '';

    $headers{'ExAC_AF'.$pop} = 'ExAC '.$desc;
    $headers{'ExAC_AC'.$pop} = 'ExAC'.$pop.' Allele count'  if $self->{display_ac};
    $headers{'ExAC_AN'.$pop} = 'ExAC'.$pop.' Allele number' if $self->{display_an};

    # remember the raw INFO key for run()
    push @ac_fields, 'AC'.$pop;
  }

  close $tabix_fh;

  die "ERROR: No valid headers found in ExAC VCF file\n" unless scalar keys %headers;

  # store the raw INFO keys and the header descriptions on self
  push @{$self->{headers}}, @ac_fields;
  $self->{header_info} = \%headers;

  return $self->{header_info};
}
|  | 163 | 
# Looks up ExAC data for one allele of one variant.
#
# Arguments:
#   $tva - object providing variation_feature() and variation_feature_seq()
#
# Returns a hashref of output field name => value for the allele (ExAC_AF*,
# plus ExAC_AC* / ExAC_AN* when enabled), or an empty hashref when the allele
# has no data in the file.
#
# Records overlapping the variant are fetched with tabix and parsed into a
# per-allele hash. That hash is cached on $self for the most recent region
# only, so consecutive calls for other alleles at the same position do not
# re-read the file.
#
# Dies if tabix cannot be started.
sub run {
  my ($self, $tva) = @_;

  # make sure headers have been loaded
  $self->get_header_info();

  my $vf = $tva->variation_feature;

  # get allele, reverse comp if needed
  my $allele = $tva->variation_feature_seq;
  reverse_comp(\$allele) if $vf->{strand} < 0;

  # key used to pick the result out of the parsed data; replaced by the VCF
  # representation when alleles have to be matched across coordinates.
  # $allele itself is left untouched so that every VCF line is matched
  # against the original input allele.
  my $lookup_allele = $allele;

  # adjust coords to account for VCF-like storage of indels
  my ($s, $e) = ($vf->{start} - 1, $vf->{end} + 1);

  # strip a leading "chr" prefix only; the pattern is anchored so that a
  # "chr" elsewhere in the name is kept
  (my $vf_chr = $vf->{chr}) =~ s/^chr//;
  my $pos_string = sprintf("%s:%i-%i", $vf_chr, $s, $e);

  # clear cache if it looks like the coords are the same
  # but allele type is different
  delete $self->{cache} if
    defined($self->{cache}->{$pos_string}) &&
    scalar keys %{$self->{cache}->{$pos_string}} &&
    !defined($self->{cache}->{$pos_string}->{$allele});

  my $data = {};

  # cached?
  if(defined($self->{cache}) && defined($self->{cache}->{$pos_string})) {
    $data = $self->{cache}->{$pos_string};
  }

  # read from file
  else {

    # list-form pipe open: no shell is involved, so neither the file location
    # nor the region string needs quoting, and a failure to start tabix is
    # reported instead of silently yielding no data
    open(my $tabix_fh, '-|', 'tabix', '-f', $self->{file}, $pos_string)
      or die "ERROR: Could not run tabix on ".$self->{file}.": $!\n";

    # input alleles do not change from one VCF line to the next
    my @vf_alleles = split /\//, $vf->allele_string;
    my $vf_ref_allele = shift @vf_alleles;

    LINE:
    while(my $line = <$tabix_fh>) {
      chomp $line;
      $line =~ s/\r$//;

      # parse VCF line into a VariationFeature object
      my ($vcf_vf) = @{parse_line({format => 'vcf', minimal => 1}, $line)};

      # check parsed OK
      next LINE unless $vcf_vf && $vcf_vf->isa('Bio::EnsEMBL::Variation::VariationFeature');

      my @vcf_alleles = split /\//, $vcf_vf->allele_string;
      my $ref_allele  = shift @vcf_alleles;
      my $vcf_vf_start = $vcf_vf->{start};
      my $vcf_vf_end = $vcf_vf->{end};

      # coordinates differ between input and VCF record: ask the matching
      # helper which VCF allele (if any) corresponds to the input allele
      if ($vcf_vf_start != $vf->{start} || $vcf_vf_end != $vf->{end}) {
        my $matched_alleles = get_matched_variant_alleles(
          {ref => $vf_ref_allele, alts => [$allele],      pos => $vf->{start}},
          {ref => $ref_allele,    alts => \@vcf_alleles,  pos => $vcf_vf_start}
        );

        next LINE unless (@$matched_alleles);

        # We only match one alt allele from the input VF against alleles from
        # the VCF line. b_allele is the matched allele from the VCF alt alleles
        $lookup_allele = $matched_alleles->[0]->{b_allele};
      }

      # iterate over required headers
      HEADER:
      foreach my $ach (@{$self->{headers} || []}) {

        # INFO keys are anchored on the preceding tab or semicolon so that
        # e.g. "AC" cannot match inside another key ending in "AC"
        next HEADER unless $line =~ /(?:^|[\t;])\Q$ach\E=([0-9,]+)/;

        # grab AC; there must be one count per ALT allele
        my @ac = split /,/, $1;
        next HEADER unless scalar(@ac) == scalar(@vcf_alleles);

        # derive the matching AN and AF field names from the AC one
        (my $anh = $ach) =~ s/AC/AN/;
        (my $afh = $ach) =~ s/AC/AF/;

        next HEADER unless $line =~ /(?:^|[\t;])\Q$anh\E=([0-9,]+)/;

        # grab AN
        my @an = split /,/, $1;
        next HEADER unless @an;

        my $total_ac = 0;
        my $an;

        foreach my $alt (@vcf_alleles) {
          my $ac = shift @ac;

          # AN may be listed once for the site; keep the last value seen
          # when there are fewer AN entries than ALT alleles
          $an = shift @an if @an;

          $total_ac += $ac;

          $data->{$alt}->{'ExAC_'.$ach} = $ac if $self->{display_ac};
          $data->{$alt}->{'ExAC_'.$anh} = $an if $self->{display_an};

          # frequency is only defined for a non-zero allele number
          $data->{$alt}->{'ExAC_'.$afh} = sprintf("%.3g", $ac / $an) if $an;
        }

        # use total to get ref allele freq
        # NOTE(review): the reference AC below is the summed ALT count, not
        # AN minus that sum - looks unintended; confirm before changing output
        $data->{$ref_allele}->{'ExAC_'.$ach} = $total_ac if $self->{display_ac};
        $data->{$ref_allele}->{'ExAC_'.$anh} = $an       if $self->{display_an};
        $data->{$ref_allele}->{'ExAC_'.$afh} = sprintf("%.3g", 1 - ($total_ac / $an)) if $an;
      }
    }

    close $tabix_fh;
  }

  # overwrite cache
  $self->{cache} = {$pos_string => $data};

  return defined($data->{$lookup_allele}) ? $data->{$lookup_allele} : {};
}
|  | 291 | 
|  | 292 1; | 
|  | 293 |