diff dir_plugins/AncestralAllele.pm @ 0:e545d0a25ffe draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:17:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dir_plugins/AncestralAllele.pm	Mon Jul 15 05:17:17 2019 -0400
@@ -0,0 +1,281 @@
+=head1 LICENSE
+
+Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
+Copyright [2016-2018] EMBL-European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+=head1 CONTACT
+
+ Ensembl <http://www.ensembl.org/info/about/contact/index.html>
+
+=cut
+
+=head1 NAME
+
+ AncestralAllele
+
+=head1 SYNOPSIS
+
+ mv AncestralAllele.pm ~/.vep/Plugins
+ ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz
+
+=head1 DESCRIPTION
+
+ A VEP plugin that retrieves ancestral allele sequences from a FASTA file.
+
+ Ensembl produces FASTA file dumps of the ancestral sequences of key species.
+ They are available from ftp://ftp.ensembl.org/pub/current_fasta/ancestral_alleles/
+
+ For optimal retrieval speed, you should pre-process the FASTA files into a single
+ bgzipped file that can be accessed via Bio::DB::HTS::Faidx (installed by VEP's
+ INSTALL.pl):
+
+ wget ftp://ftp.ensembl.org/pub/release-90/fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38_e86.tar.gz
+ tar xfz homo_sapiens_ancestor_GRCh38_e86.tar.gz
+ cat homo_sapiens_ancestor_GRCh38_e86/*.fa | bgzip -c > homo_sapiens_ancestor_GRCh38_e86.fa.gz
+ rm -rf homo_sapiens_ancestor_GRCh38_e86/ homo_sapiens_ancestor_GRCh38_e86.tar.gz
+ ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz
+
+ The plugin is also compatible with Bio::DB::Fasta and an uncompressed FASTA file.
+
+ Note the first time you run the plugin with a newly generated FASTA file it will
+ spend some time indexing the file. DO NOT INTERRUPT THIS PROCESS, particularly
+ if you do not have Bio::DB::HTS installed.
+
+ Special cases:
+   "-" represents an insertion
+   "?" indicates the chromosome could not be looked up in the FASTA
+
+=cut
+
+package AncestralAllele;
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
+
+use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
+
+use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
+
+our $CAN_USE_FAIDX;
+our $DEBUG = 0;
+
+BEGIN {
+  if (eval { require Bio::DB::HTS::Faidx; 1 }) {
+    $CAN_USE_FAIDX = 1;
+  }
+}
+
+sub new {
+  my $class = shift;
+  
+  my $self = $class->SUPER::new(@_);
+
+  # get file
+  my $fasta_file = $self->params->[0] || "";
+  die("ERROR: Ancestral FASTA file $fasta_file not provided or found\n") unless $fasta_file && -e $fasta_file;
+  $self->fasta_file($fasta_file);
+
+  # get type of index we're creating/using
+  my $type = $fasta_file =~ /\.gz$/ ? 'Bio::DB::HTS::Faidx' : 'Bio::DB::Fasta';
+  die("ERROR: Bio::DB::HTS required to access compressed FASTA file $fasta_file\n") if $type eq 'Bio::DB::HTS::Faidx' && !$CAN_USE_FAIDX;
+  $self->index_type($type);
+
+  # init DB here so indexing doesn't happen at some point in the middle of VEP run
+  $self->fasta_db;
+
+  return $self;
+}
+
+sub feature_types {
+  return ['Feature','Intergenic'];
+}
+
+sub get_header_info {
+  return { AA => 'Ancestral allele from '.$_[0]->fasta_file }
+}
+
+sub run {
+  my ($self, $tva) = @_;
+  
+  my $vf = $tva->variation_feature;
+
+  my ($start, $end) = ($vf->{start}, $vf->{end});
+
+  # cache AA on VF, prevents repeat lookups for multiple TVAs
+  if(!exists($vf->{_vep_plugin_AncestralAllele})) {
+
+    # insertion
+    return { AA => $vf->{_vep_plugin_AncestralAllele} = '-' } if $start > $end;
+
+    my $sr_name = $self->get_sr_name($vf->{chr});
+
+    # chr not in FASTA
+    return { AA => $vf->{_vep_plugin_AncestralAllele} = '?' } unless $sr_name;
+
+    my $fasta_db = $self->fasta_db;
+    my $type = $self->index_type;
+    my ($seq, $length);
+
+    if($type eq 'Bio::DB::HTS::Faidx') {
+      my $location_string = $sr_name.":".$start."-".$end ;
+      ($seq, $length) = $fasta_db->get_sequence($location_string);
+    }
+    elsif($type eq 'Bio::DB::Fasta') {
+      $seq = $fasta_db->seq($sr_name, $start => $end);
+    }
+    else {
+      throw("ERROR: Don't know how to fetch sequence from a ".ref($fasta_db)."\n");
+    }
+
+    reverse_comp(\$seq) if $vf->{strand} < 0;
+
+    $vf->{_vep_plugin_AncestralAllele} = $seq;
+  }
+
+  return { AA => $vf->{_vep_plugin_AncestralAllele} };
+}
+
+# getter/setter for fasta file path
+sub fasta_file {
+  my $self = shift;
+  $self->{_fasta_file} = shift if @_;
+  return $self->{_fasta_file};
+}
+
+# getter/setter for fasta index type
+sub index_type {
+  my $self = shift;
+  $self->{_index_type} = shift if @_;
+  return $self->{_index_type};
+}
+
+## CRIBBED FROM Bio::EnsEMBL::Variation::Utils::FastaSequence
+## MOVE THIS CODE TO SHARED UTILS SPACE?
+# creates the FASTA DB object
+# returns either a Bio::DB::Fasta or a Bio::DB::HTS::Faidx
+# uses a lock file to avoid partial indexes getting used
+sub fasta_db {
+  my $self = shift;
+
+  # if we've forked we need to re-connect
+  # but make sure we only do this once per fork
+  my $prev_pid = $self->{_prev_pid} ||= $$;
+  if($prev_pid ne $$) {
+    delete $self->{_fasta_db};
+    $self->{_prev_pid} = $$;
+  }
+
+  if(!exists($self->{_fasta_db})) {
+    my $fasta = $self->fasta_file;
+    my $type = $self->index_type;
+
+    # check lock file
+    my $lock_file = $fasta;
+    $lock_file .= -d $fasta ? '/.vep.lock' : '.vep.lock';
+
+    # lock file exists, indexing failed
+    if(-e $lock_file) {
+      print STDERR "Lock file found, removing to allow re-indexing\n" if $DEBUG;
+      for(qw(.fai .index .gzi /directory.index /directory.fai .vep.lock)) {
+        unlink($fasta.$_) if -e $fasta.$_;
+      }
+    }
+
+    my $index_exists = 0;
+
+    for my $fn(map {$fasta.$_} qw(.fai .index .gzi /directory.index /directory.fai)) {
+      if(-e $fn) {
+        $index_exists = 1;
+        last;
+      }
+    }
+
+    # create lock file
+    unless($index_exists) {
+      open LOCK, ">$lock_file" or throw("ERROR: Could not write to FASTA lock file $lock_file\n");
+      print LOCK "1\n";
+      close LOCK;
+      print STDERR "Indexing $fasta\n" if $DEBUG;
+    }
+
+    # run indexing
+    my $fasta_db;
+    if($type eq 'Bio::DB::HTS::Faidx' && $CAN_USE_FAIDX) {
+      $fasta_db = Bio::DB::HTS::Faidx->new($fasta);
+    }
+    elsif(!$type || $type eq 'Bio::DB::Fasta') {
+      $fasta_db = Bio::DB::Fasta->new($fasta);
+    }
+    else {
+      throw("ERROR: Don't know how to index with type $type\n");
+    }
+    print STDERR "Finished indexing\n" if $DEBUG;
+
+    throw("ERROR: Unable to create FASTA DB\n") unless $fasta_db;
+
+    # remove lock file
+    unlink($lock_file) unless $index_exists;
+
+    $self->{_fasta_db} = $fasta_db;
+  }
+
+  return $self->{_fasta_db};
+}
+
+# gets seq region name as it appears in the FASTA
+# the file contains names like "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1"
+# so want to map from "10" or "chr10" to "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1"
+sub get_sr_name {
+  my $self = shift;
+  my $sr_name = shift;
+
+  my $map = $self->fasta_name_map;
+  my $fasta_db = $self->fasta_db;
+
+  unless($map->{$sr_name}) {
+    foreach my $alt(
+      $sr_name =~ /^chr/i ? (substr($sr_name, 3)) : ('chr'.$sr_name, 'CHR'.$sr_name)
+    ) {
+      if($map->{$alt}) {
+        print STDERR "USING SYNONYM $alt FOR $sr_name\n" if $DEBUG;
+        $sr_name = $alt;
+        last;
+      }
+    }
+  }
+
+  return $map->{$sr_name} || undef;
+}
+
+sub fasta_name_map {
+  my ($self, $chr) = @_;
+
+  if(!exists($self->{_name_map})) {
+    my $fasta_db = $self->fasta_db;
+
+    my %map = map {(split(':', $_))[2] => $_}
+      $fasta_db->isa('Bio::DB::Fasta') ?
+      (grep {$_ !~ /^\_\_/} $fasta_db->get_all_primary_ids) :
+      ($fasta_db->get_all_sequence_ids);
+
+    $self->{_name_map} = \%map;
+  }
+
+  return $self->{_name_map};
+}
+
+1;