comparison dir_plugins/AncestralAllele.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
comparison
equal deleted inserted replaced
2:17c98d091710 3:49397129aec0
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 AncestralAllele
27
28 =head1 SYNOPSIS
29
30 mv AncestralAllele.pm ~/.vep/Plugins
31 ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz
32
33 =head1 DESCRIPTION
34
35 A VEP plugin that retrieves ancestral allele sequences from a FASTA file.
36
37 Ensembl produces FASTA file dumps of the ancestral sequences of key species.
38 They are available from ftp://ftp.ensembl.org/pub/current_fasta/ancestral_alleles/
39
40 For optimal retrieval speed, you should pre-process the FASTA files into a single
41 bgzipped file that can be accessed via Bio::DB::HTS::Faidx (installed by VEP's
42 INSTALL.pl):
43
44 wget ftp://ftp.ensembl.org/pub/release-90/fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38_e86.tar.gz
45 tar xfz homo_sapiens_ancestor_GRCh38_e86.tar.gz
46 cat homo_sapiens_ancestor_GRCh38_e86/*.fa | bgzip -c > homo_sapiens_ancestor_GRCh38_e86.fa.gz
47 rm -rf homo_sapiens_ancestor_GRCh38_e86/ homo_sapiens_ancestor_GRCh38_e86.tar.gz
48 ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz
49
50 The plugin is also compatible with Bio::DB::Fasta and an uncompressed FASTA file.
51
52 Note the first time you run the plugin with a newly generated FASTA file it will
53 spend some time indexing the file. DO NOT INTERRUPT THIS PROCESS, particularly
54 if you do not have Bio::DB::HTS installed.
55
56 Special cases:
57 "-" represents an insertion
58 "?" indicates the chromosome could not be looked up in the FASTA
59
60 =cut
61
62 package AncestralAllele;
63
64 use strict;
65 use warnings;
66
67 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
68
69 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
70
71 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
72
73 our $CAN_USE_FAIDX;
74 our $DEBUG = 0;
75
76 BEGIN {
77 if (eval { require Bio::DB::HTS::Faidx; 1 }) {
78 $CAN_USE_FAIDX = 1;
79 }
80 }
81
82 sub new {
83 my $class = shift;
84
85 my $self = $class->SUPER::new(@_);
86
87 # get file
88 my $fasta_file = $self->params->[0] || "";
89 die("ERROR: Ancestral FASTA file $fasta_file not provided or found\n") unless $fasta_file && -e $fasta_file;
90 $self->fasta_file($fasta_file);
91
92 # get type of index we're creating/using
93 my $type = $fasta_file =~ /\.gz$/ ? 'Bio::DB::HTS::Faidx' : 'Bio::DB::Fasta';
94 die("ERROR: Bio::DB::HTS required to access compressed FASTA file $fasta_file\n") if $type eq 'Bio::DB::HTS::Faidx' && !$CAN_USE_FAIDX;
95 $self->index_type($type);
96
97 # init DB here so indexing doesn't happen at some point in the middle of VEP run
98 $self->fasta_db;
99
100 return $self;
101 }
102
103 sub feature_types {
104 return ['Feature','Intergenic'];
105 }
106
107 sub get_header_info {
108 return { AA => 'Ancestral allele from '.$_[0]->fasta_file }
109 }
110
111 sub run {
112 my ($self, $tva) = @_;
113
114 my $vf = $tva->variation_feature;
115
116 my ($start, $end) = ($vf->{start}, $vf->{end});
117
118 # cache AA on VF, prevents repeat lookups for multiple TVAs
119 if(!exists($vf->{_vep_plugin_AncestralAllele})) {
120
121 # insertion
122 return { AA => $vf->{_vep_plugin_AncestralAllele} = '-' } if $start > $end;
123
124 my $sr_name = $self->get_sr_name($vf->{chr});
125
126 # chr not in FASTA
127 return { AA => $vf->{_vep_plugin_AncestralAllele} = '?' } unless $sr_name;
128
129 my $fasta_db = $self->fasta_db;
130 my $type = $self->index_type;
131 my ($seq, $length);
132
133 if($type eq 'Bio::DB::HTS::Faidx') {
134 my $location_string = $sr_name.":".$start."-".$end ;
135 ($seq, $length) = $fasta_db->get_sequence($location_string);
136 }
137 elsif($type eq 'Bio::DB::Fasta') {
138 $seq = $fasta_db->seq($sr_name, $start => $end);
139 }
140 else {
141 throw("ERROR: Don't know how to fetch sequence from a ".ref($fasta_db)."\n");
142 }
143
144 reverse_comp(\$seq) if $vf->{strand} < 0;
145
146 $vf->{_vep_plugin_AncestralAllele} = $seq;
147 }
148
149 return { AA => $vf->{_vep_plugin_AncestralAllele} };
150 }
151
152 # getter/setter for fasta file path
153 sub fasta_file {
154 my $self = shift;
155 $self->{_fasta_file} = shift if @_;
156 return $self->{_fasta_file};
157 }
158
159 # getter/setter for fasta index type
160 sub index_type {
161 my $self = shift;
162 $self->{_index_type} = shift if @_;
163 return $self->{_index_type};
164 }
165
166 ## CRIBBED FROM Bio::EnsEMBL::Variation::Utils::FastaSequence
167 ## MOVE THIS CODE TO SHARED UTILS SPACE?
168 # creates the FASTA DB object
169 # returns either a Bio::DB::Fasta or a Bio::DB::HTS::Faidx
170 # uses a lock file to avoid partial indexes getting used
171 sub fasta_db {
172 my $self = shift;
173
174 # if we've forked we need to re-connect
175 # but make sure we only do this once per fork
176 my $prev_pid = $self->{_prev_pid} ||= $$;
177 if($prev_pid ne $$) {
178 delete $self->{_fasta_db};
179 $self->{_prev_pid} = $$;
180 }
181
182 if(!exists($self->{_fasta_db})) {
183 my $fasta = $self->fasta_file;
184 my $type = $self->index_type;
185
186 # check lock file
187 my $lock_file = $fasta;
188 $lock_file .= -d $fasta ? '/.vep.lock' : '.vep.lock';
189
190 # lock file exists, indexing failed
191 if(-e $lock_file) {
192 print STDERR "Lock file found, removing to allow re-indexing\n" if $DEBUG;
193 for(qw(.fai .index .gzi /directory.index /directory.fai .vep.lock)) {
194 unlink($fasta.$_) if -e $fasta.$_;
195 }
196 }
197
198 my $index_exists = 0;
199
200 for my $fn(map {$fasta.$_} qw(.fai .index .gzi /directory.index /directory.fai)) {
201 if(-e $fn) {
202 $index_exists = 1;
203 last;
204 }
205 }
206
207 # create lock file
208 unless($index_exists) {
209 open LOCK, ">$lock_file" or throw("ERROR: Could not write to FASTA lock file $lock_file\n");
210 print LOCK "1\n";
211 close LOCK;
212 print STDERR "Indexing $fasta\n" if $DEBUG;
213 }
214
215 # run indexing
216 my $fasta_db;
217 if($type eq 'Bio::DB::HTS::Faidx' && $CAN_USE_FAIDX) {
218 $fasta_db = Bio::DB::HTS::Faidx->new($fasta);
219 }
220 elsif(!$type || $type eq 'Bio::DB::Fasta') {
221 $fasta_db = Bio::DB::Fasta->new($fasta);
222 }
223 else {
224 throw("ERROR: Don't know how to index with type $type\n");
225 }
226 print STDERR "Finished indexing\n" if $DEBUG;
227
228 throw("ERROR: Unable to create FASTA DB\n") unless $fasta_db;
229
230 # remove lock file
231 unlink($lock_file) unless $index_exists;
232
233 $self->{_fasta_db} = $fasta_db;
234 }
235
236 return $self->{_fasta_db};
237 }
238
239 # gets seq region name as it appears in the FASTA
240 # the file contains names like "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1"
241 # so want to map from "10" or "chr10" to "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1"
242 sub get_sr_name {
243 my $self = shift;
244 my $sr_name = shift;
245
246 my $map = $self->fasta_name_map;
247 my $fasta_db = $self->fasta_db;
248
249 unless($map->{$sr_name}) {
250 foreach my $alt(
251 $sr_name =~ /^chr/i ? (substr($sr_name, 3)) : ('chr'.$sr_name, 'CHR'.$sr_name)
252 ) {
253 if($map->{$alt}) {
254 print STDERR "USING SYNONYM $alt FOR $sr_name\n" if $DEBUG;
255 $sr_name = $alt;
256 last;
257 }
258 }
259 }
260
261 return $map->{$sr_name} || undef;
262 }
263
264 sub fasta_name_map {
265 my ($self, $chr) = @_;
266
267 if(!exists($self->{_name_map})) {
268 my $fasta_db = $self->fasta_db;
269
270 my %map = map {(split(':', $_))[2] => $_}
271 $fasta_db->isa('Bio::DB::Fasta') ?
272 (grep {$_ !~ /^\_\_/} $fasta_db->get_all_primary_ids) :
273 ($fasta_db->get_all_sequence_ids);
274
275 $self->{_name_map} = \%map;
276 }
277
278 return $self->{_name_map};
279 }
280
281 1;