Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/ExAC.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen | 
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 | 
| parents | e545d0a25ffe | 
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 2:17c98d091710 | 3:49397129aec0 | 
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | |
| 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | |
| 5 | |
| 6 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 you may not use this file except in compliance with the License. | |
| 8 You may obtain a copy of the License at | |
| 9 | |
| 10 http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 | |
| 12 Unless required by applicable law or agreed to in writing, software | |
| 13 distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 See the License for the specific language governing permissions and | |
| 16 limitations under the License. | |
| 17 | |
| 18 =head1 CONTACT | |
| 19 | |
| 20 Ensembl <http://www.ensembl.org/info/about/contact/index.html> | |
| 21 | |
| 22 =cut | |
| 23 | |
| 24 =head1 NAME | |
| 25 | |
| 26 ExAC | |
| 27 | |
| 28 =head1 SYNOPSIS | |
| 29 | |
| 30 mv ExAC.pm ~/.vep/Plugins | |
| 31 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz | |
| 32 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC | |
| 33 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,,AN | |
| 34 ./vep -i variations.vcf --plugin ExAC,/path/to/ExAC/ExAC.r0.3.sites.vep.vcf.gz,AC,AN | |
| 35 | |
| 36 | |
| 37 | |
| 38 =head1 DESCRIPTION | |
| 39 | |
| 40 A VEP plugin that retrieves ExAC allele frequencies. | |
| 41 | |
| 42 Visit ftp://ftp.broadinstitute.org/pub/ExAC_release/current to download the latest ExAC VCF. | |
| 43 | |
| 44 Note that the currently available version of the ExAC data file (0.3) is only available | |
| 45 on the GRCh37 assembly; therefore it can only be used with this plugin when using the | |
| 46 VEP on GRCh37. See http://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#assembly | |
| 47 | |
| 48 The tabix utility must be installed in your path to use this plugin. | |
| 49 | |
| 50 The plugin takes 3 command line arguments. Second and third arguments are not mandatory. If AC specified as second | |
| 51 argument Allele counts per population will be included in output. If AN specified as third argument Allele specific | |
| 52 chromosome counts will be included in output. | |
| 53 | |
| 54 | |
| 55 =cut | |
| 56 | |
| 57 package ExAC; | |
| 58 | |
| 59 use strict; | |
| 60 use warnings; | |
| 61 | |
| 62 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
| 63 use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles); | |
| 64 | |
| 65 use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line get_slice); | |
| 66 | |
| 67 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | |
| 68 | |
| 69 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | |
| 70 | |
| 71 sub new { | |
| 72 my $class = shift; | |
| 73 | |
| 74 my $self = $class->SUPER::new(@_); | |
| 75 | |
| 76 # test tabix | |
| 77 die "ERROR: tabix does not seem to be in your path\n" unless `which tabix 2>&1` =~ /tabix$/; | |
| 78 | |
| 79 # get ExAC file | |
| 80 my $file = $self->params->[0]; | |
| 81 | |
| 82 # get AC,AN options | |
| 83 if (exists($self->params->[1]) && $self->params->[1] eq 'AC'){ | |
| 84 $self->{display_ac} = 1; | |
| 85 } | |
| 86 else { | |
| 87 $self->{display_ac} = 0; | |
| 88 } | |
| 89 | |
| 90 if (exists($self->params->[2]) && $self->params->[2] eq 'AN'){ | |
| 91 $self->{display_an} = 1; | |
| 92 } | |
| 93 else { | |
| 94 $self->{display_an} = 0; | |
| 95 } | |
| 96 | |
| 97 # remote files? | |
| 98 if($file =~ /tp\:\/\//) { | |
| 99 my $remote_test = `tabix -f $file 1:1-1 2>&1`; | |
| 100 print STDERR "$remote_test\n"; | |
| 101 # if($remote_test && $remote_test !~ /get_local_version/) { | |
| 102 # die "$remote_test\nERROR: Could not find file or index file for remote annotation file $file\n"; | |
| 103 # } | |
| 104 } | |
| 105 | |
| 106 # check files exist | |
| 107 else { | |
| 108 die "ERROR: ExAC file $file not found; you can download it from ftp://ftp.broadinstitute.org/pub/ExAC_release/current\n" unless -e $file; | |
| 109 die "ERROR: Tabix index file $file\.tbi not found - perhaps you need to create it first?\n" unless -e $file.'.tbi'; | |
| 110 } | |
| 111 | |
| 112 $self->{file} = $file; | |
| 113 | |
| 114 return $self; | |
| 115 } | |
| 116 | |
| 117 sub feature_types { | |
| 118 return ['Feature','Intergenic']; | |
| 119 } | |
| 120 | |
| 121 sub get_header_info { | |
| 122 my $self = shift; | |
| 123 | |
| 124 if(!exists($self->{header_info})) { | |
| 125 open IN, "tabix -f -h ".$self->{file}." 1:1-1 |"; | |
| 126 | |
| 127 my %headers = (); | |
| 128 my @lines = <IN>; | |
| 129 | |
| 130 while(my $line = shift @lines) { | |
| 131 if($line =~ /ID\=AC(\_[A-Zdj]+)?\,.*\"(.+)\"/) { | |
| 132 my ($pop, $desc) = ($1, $2); | |
| 133 | |
| 134 $desc =~ s/Counts?/frequency/i; | |
| 135 $pop ||= ''; | |
| 136 | |
| 137 my $field_name = 'ExAC_AF'.$pop; | |
| 138 $headers{$field_name} = 'ExAC '.$desc; | |
| 139 | |
| 140 if ($self->{display_ac}){ | |
| 141 $field_name = 'ExAC_AC'.$pop; | |
| 142 $headers{$field_name} = 'ExAC'.$pop.' Allele count'; | |
| 143 } | |
| 144 if ($self->{display_an}){ | |
| 145 $field_name = 'ExAC_AN'.$pop; | |
| 146 $headers{$field_name} = 'ExAC'.$pop.' Allele number'; | |
| 147 } | |
| 148 | |
| 149 # store this header on self | |
| 150 push @{$self->{headers}}, 'AC'.$pop; | |
| 151 } | |
| 152 } | |
| 153 | |
| 154 close IN; | |
| 155 | |
| 156 die "ERROR: No valid headers found in ExAC VCF file\n" unless scalar keys %headers; | |
| 157 | |
| 158 $self->{header_info} = \%headers; | |
| 159 } | |
| 160 | |
| 161 return $self->{header_info}; | |
| 162 } | |
| 163 | |
| 164 sub run { | |
| 165 my ($self, $tva) = @_; | |
| 166 # make sure headers have been loaded | |
| 167 $self->get_header_info(); | |
| 168 | |
| 169 my $vf = $tva->variation_feature; | |
| 170 my $name = $vf->variation_name; | |
| 171 | |
| 172 # get allele, reverse comp if needed | |
| 173 my $allele; | |
| 174 | |
| 175 $allele = $tva->variation_feature_seq; | |
| 176 reverse_comp(\$allele) if $vf->{strand} < 0; | |
| 177 | |
| 178 # adjust coords to account for VCF-like storage of indels | |
| 179 my ($s, $e) = ($vf->{start} - 1, $vf->{end} + 1); | |
| 180 | |
| 181 my $vf_chr = $vf->{chr}; | |
| 182 $vf_chr =~ s/chr//; | |
| 183 my $pos_string = sprintf("%s:%i-%i", $vf_chr, $s, $e); | |
| 184 | |
| 185 | |
| 186 # clear cache if it looks like the coords are the same | |
| 187 # but allele type is different | |
| 188 delete $self->{cache} if | |
| 189 defined($self->{cache}->{$pos_string}) && | |
| 190 scalar keys %{$self->{cache}->{$pos_string}} && | |
| 191 !defined($self->{cache}->{$pos_string}->{$allele}); | |
| 192 | |
| 193 my $data = {}; | |
| 194 | |
| 195 # cached? | |
| 196 if(defined($self->{cache}) && defined($self->{cache}->{$pos_string})) { | |
| 197 $data = $self->{cache}->{$pos_string}; | |
| 198 } | |
| 199 | |
| 200 # read from file | |
| 201 else { | |
| 202 open TABIX, sprintf("tabix -f %s %s |", $self->{file}, $pos_string); | |
| 203 | |
| 204 while(<TABIX>) { | |
| 205 chomp; | |
| 206 s/\r$//g; | |
| 207 # parse VCF line into a VariationFeature object | |
| 208 my ($vcf_vf) = @{parse_line({format => 'vcf', minimal => 1}, $_)}; | |
| 209 | |
| 210 # check parsed OK | |
| 211 next unless $vcf_vf && $vcf_vf->isa('Bio::EnsEMBL::Variation::VariationFeature'); | |
| 212 | |
| 213 my @vcf_alleles = split /\//, $vcf_vf->allele_string; | |
| 214 my $ref_allele = shift @vcf_alleles; | |
| 215 my $vcf_vf_start = $vcf_vf->{start}; | |
| 216 my $vcf_vf_end = $vcf_vf->{end}; | |
| 217 | |
| 218 my @vf_alleles = split /\//, $vf->allele_string; | |
| 219 my $vf_ref_allele = shift @vf_alleles; | |
| 220 | |
| 221 if ($vcf_vf_start != $vf->{start} || $vcf_vf_end != $vf->{end}) { | |
| 222 my $matched_alleles = get_matched_variant_alleles({ref => $vf_ref_allele, alts => [$allele], pos => $vf->{start}}, {ref => $ref_allele, alts => \@vcf_alleles, pos => $vcf_vf_start}); | |
| 223 | |
| 224 next unless (@$matched_alleles); | |
| 225 # We only match one alt allele from the input VF against alleles from the VCF line. b_allele is the matched allele from the VCF alt alleles | |
| 226 $allele = $matched_alleles->[0]->{b_allele}; | |
| 227 } | |
| 228 # iterate over required headers | |
| 229 HEADER: | |
| 230 foreach my $h(@{$self->{headers} || []}) { | |
| 231 my $total_ac = 0; | |
| 232 | |
| 233 if(/$h\=([0-9\,]+)/) { | |
| 234 | |
| 235 # grab AC | |
| 236 my @ac = split /\,/, $1; | |
| 237 next unless scalar @ac == scalar @vcf_alleles; | |
| 238 | |
| 239 # now sed header to get AN | |
| 240 my $anh = $h; | |
| 241 $anh =~ s/AC/AN/; | |
| 242 | |
| 243 my $afh = $h; | |
| 244 $afh =~ s/AC/AF/; | |
| 245 | |
| 246 # get AC from header | |
| 247 my $ach = $h; | |
| 248 | |
| 249 if(/$anh\=([0-9\,]+)/) { | |
| 250 | |
| 251 # grab AN | |
| 252 my @an = split /\,/, $1; | |
| 253 next unless @an; | |
| 254 my $an; | |
| 255 | |
| 256 foreach my $a(@vcf_alleles) { | |
| 257 my $ac = shift @ac; | |
| 258 $an = shift @an if @an; | |
| 259 | |
| 260 $total_ac += $ac; | |
| 261 if ($self->{display_ac}){ | |
| 262 $data->{$a}->{'ExAC_'.$ach} = $ac; | |
| 263 } | |
| 264 if ($self->{display_an}){ | |
| 265 $data->{$a}->{'ExAC_'.$anh} = $an; | |
| 266 } | |
| 267 | |
| 268 $data->{$a}->{'ExAC_'.$afh} = sprintf("%.3g", $ac / $an) if $an; | |
| 269 } | |
| 270 | |
| 271 # use total to get ref allele freq | |
| 272 if ($self->{display_ac}){ | |
| 273 $data->{$ref_allele}->{'ExAC_'.$ach} = $total_ac; | |
| 274 } | |
| 275 if ($self->{display_an}){ | |
| 276 $data->{$ref_allele}->{'ExAC_'.$anh} = $an; | |
| 277 } | |
| 278 $data->{$ref_allele}->{'ExAC_'.$afh} = sprintf("%.3g", 1 - ($total_ac / $an)) if $an; | |
| 279 } | |
| 280 } | |
| 281 } | |
| 282 } | |
| 283 | |
| 284 close TABIX; | |
| 285 } | |
| 286 | |
| 287 # overwrite cache | |
| 288 $self->{cache} = {$pos_string => $data}; | |
| 289 return defined($data->{$allele}) ? $data->{$allele} : {}; | |
| 290 } | |
| 291 | |
| 292 1; | |
| 293 | 
