Mercurial > repos > dvanzessen > vep_emc
diff dir_plugins/ExAC.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dir_plugins/ExAC.pm Mon Jul 15 05:20:39 2019 -0400 @@ -0,0 +1,293 @@ +=head1 LICENSE + +Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +Copyright [2016-2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=head1 CONTACT + + Ensembl <http://www.ensembl.org/info/about/contact/index.html> + +=cut + +=head1 NAME + + ExAC + +=head1 SYNOPSIS + + mv ExAC.pm ~/.vep/Plugins + ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz + ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC + ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,,AN + ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC,AN + + + +=head1 DESCRIPTION + + A VEP plugin that retrieves ExAC allele frequencies. + + Visit ftp://ftp.broadinstitute.org/pub/ExAC_release/current to download the latest ExAC VCF. + + Note that the currently available version of the ExAC data file (0.3) is only available + on the GRCh37 assembly; therefore it can only be used with this plugin when using the + VEP on GRCh37. See http://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#assembly + + The tabix utility must be installed in your path to use this plugin. + + The plugin takes 3 command line arguments. Second and third arguments are not mandatory. If AC specified as second + argument Allele counts per population will be included in output. If AN specified as third argument Allele specific + chromosome counts will be included in output. + + +=cut + +package ExAC; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles); + +use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line get_slice); + +use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; + +use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + # test tabix + die "ERROR: tabix does not seem to be in your path\n" unless `which tabix 2>&1` =~ /tabix$/; + + # get ExAC file + my $file = $self->params->[0]; + + # get AC,AN options + if (exists($self->params->[1]) && $self->params->[1] eq 'AC'){ + $self->{display_ac} = 1; + } + else { + $self->{display_ac} = 0; + } + + if (exists($self->params->[2]) && $self->params->[2] eq 'AN'){ + $self->{display_an} = 1; + } + else { + $self->{display_an} = 0; + } + + # remote files? + if($file =~ /tp\:\/\//) { + my $remote_test = `tabix -f $file 1:1-1 2>&1`; + print STDERR "$remote_test\n"; + # if($remote_test && $remote_test !~ /get_local_version/) { + # die "$remote_test\nERROR: Could not find file or index file for remote annotation file $file\n"; + # } + } + + # check files exist + else { + die "ERROR: ExAC file $file not found; you can download it from ftp://ftp.broadinstitute.org/pub/ExAC_release/current\n" unless -e $file; + die "ERROR: Tabix index file $file\.tbi not found - perhaps you need to create it first?\n" unless -e $file.'.tbi'; + } + + $self->{file} = $file; + + return $self; +} + +sub feature_types { + return ['Feature','Intergenic']; +} + +sub get_header_info { + my $self = shift; + + if(!exists($self->{header_info})) { + open IN, "tabix -f -h ".$self->{file}." 1:1-1 |"; + + my %headers = (); + my @lines = <IN>; + + while(my $line = shift @lines) { + if($line =~ /ID\=AC(\_[A-Zdj]+)?\,.*\"(.+)\"/) { + my ($pop, $desc) = ($1, $2); + + $desc =~ s/Counts?/frequency/i; + $pop ||= ''; + + my $field_name = 'ExAC_AF'.$pop; + $headers{$field_name} = 'ExAC '.$desc; + + if ($self->{display_ac}){ + $field_name = 'ExAC_AC'.$pop; + $headers{$field_name} = 'ExAC'.$pop.' Allele count'; + } + if ($self->{display_an}){ + $field_name = 'ExAC_AN'.$pop; + $headers{$field_name} = 'ExAC'.$pop.' Allele number'; + } + + # store this header on self + push @{$self->{headers}}, 'AC'.$pop; + } + } + + close IN; + + die "ERROR: No valid headers found in ExAC VCF file\n" unless scalar keys %headers; + + $self->{header_info} = \%headers; + } + + return $self->{header_info}; +} + +sub run { + my ($self, $tva) = @_; + # make sure headers have been loaded + $self->get_header_info(); + + my $vf = $tva->variation_feature; + my $name = $vf->variation_name; + + # get allele, reverse comp if needed + my $allele; + + $allele = $tva->variation_feature_seq; + reverse_comp(\$allele) if $vf->{strand} < 0; + + # adjust coords to account for VCF-like storage of indels + my ($s, $e) = ($vf->{start} - 1, $vf->{end} + 1); + + my $vf_chr = $vf->{chr}; + $vf_chr =~ s/chr//; + my $pos_string = sprintf("%s:%i-%i", $vf_chr, $s, $e); + + + # clear cache if it looks like the coords are the same + # but allele type is different + delete $self->{cache} if + defined($self->{cache}->{$pos_string}) && + scalar keys %{$self->{cache}->{$pos_string}} && + !defined($self->{cache}->{$pos_string}->{$allele}); + + my $data = {}; + + # cached? + if(defined($self->{cache}) && defined($self->{cache}->{$pos_string})) { + $data = $self->{cache}->{$pos_string}; + } + + # read from file + else { + open TABIX, sprintf("tabix -f %s %s |", $self->{file}, $pos_string); + + while(<TABIX>) { + chomp; + s/\r$//g; + # parse VCF line into a VariationFeature object + my ($vcf_vf) = @{parse_line({format => 'vcf', minimal => 1}, $_)}; + + # check parsed OK + next unless $vcf_vf && $vcf_vf->isa('Bio::EnsEMBL::Variation::VariationFeature'); + + my @vcf_alleles = split /\//, $vcf_vf->allele_string; + my $ref_allele = shift @vcf_alleles; + my $vcf_vf_start = $vcf_vf->{start}; + my $vcf_vf_end = $vcf_vf->{end}; + + my @vf_alleles = split /\//, $vf->allele_string; + my $vf_ref_allele = shift @vf_alleles; + + if ($vcf_vf_start != $vf->{start} || $vcf_vf_end != $vf->{end}) { + my $matched_alleles = get_matched_variant_alleles({ref => $vf_ref_allele, alts => [$allele], pos => $vf->{start}}, {ref => $ref_allele, alts => \@vcf_alleles, pos => $vcf_vf_start}); + + next unless (@$matched_alleles); + # We only match one alt allele from the input VF against alleles from the VCF line. b_allele is the matched allele from the VCF alt alleles + $allele = $matched_alleles->[0]->{b_allele}; + } + # iterate over required headers + HEADER: + foreach my $h(@{$self->{headers} || []}) { + my $total_ac = 0; + + if(/$h\=([0-9\,]+)/) { + + # grab AC + my @ac = split /\,/, $1; + next unless scalar @ac == scalar @vcf_alleles; + + # now sed header to get AN + my $anh = $h; + $anh =~ s/AC/AN/; + + my $afh = $h; + $afh =~ s/AC/AF/; + + # get AC from header + my $ach = $h; + + if(/$anh\=([0-9\,]+)/) { + + # grab AN + my @an = split /\,/, $1; + next unless @an; + my $an; + + foreach my $a(@vcf_alleles) { + my $ac = shift @ac; + $an = shift @an if @an; + + $total_ac += $ac; + if ($self->{display_ac}){ + $data->{$a}->{'ExAC_'.$ach} = $ac; + } + if ($self->{display_an}){ + $data->{$a}->{'ExAC_'.$anh} = $an; + } + + $data->{$a}->{'ExAC_'.$afh} = sprintf("%.3g", $ac / $an) if $an; + } + + # use total to get ref allele freq + if ($self->{display_ac}){ + $data->{$ref_allele}->{'ExAC_'.$ach} = $total_ac; + } + if ($self->{display_an}){ + $data->{$ref_allele}->{'ExAC_'.$anh} = $an; + } + $data->{$ref_allele}->{'ExAC_'.$afh} = sprintf("%.3g", 1 - ($total_ac / $an)) if $an; + } + } + } + } + + close TABIX; + } + + # overwrite cache + $self->{cache} = {$pos_string => $data}; + return defined($data->{$allele}) ? $data->{$allele} : {}; +} + +1; +
