|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 AncestralAllele
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv AncestralAllele.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz
|
|
|
32
|
|
|
33 =head1 DESCRIPTION
|
|
|
34
|
|
|
35 A VEP plugin that retrieves ancestral allele sequences from a FASTA file.
|
|
|
36
|
|
|
37 Ensembl produces FASTA file dumps of the ancestral sequences of key species.
|
|
|
38 They are available from ftp://ftp.ensembl.org/pub/current_fasta/ancestral_alleles/
|
|
|
39
|
|
|
40 For optimal retrieval speed, you should pre-process the FASTA files into a single
|
|
|
41 bgzipped file that can be accessed via Bio::DB::HTS::Faidx (installed by VEP's
|
|
|
42 INSTALL.pl):
|
|
|
43
|
|
|
44 wget ftp://ftp.ensembl.org/pub/release-90/fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38_e86.tar.gz
|
|
|
45 tar xfz homo_sapiens_ancestor_GRCh38_e86.tar.gz
|
|
|
46 cat homo_sapiens_ancestor_GRCh38_e86/*.fa | bgzip -c > homo_sapiens_ancestor_GRCh38_e86.fa.gz
|
|
|
47 rm -rf homo_sapiens_ancestor_GRCh38_e86/ homo_sapiens_ancestor_GRCh38_e86.tar.gz
|
|
|
48 ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz
|
|
|
49
|
|
|
50 The plugin is also compatible with Bio::DB::Fasta and an uncompressed FASTA file.
|
|
|
51
|
|
|
52 Note the first time you run the plugin with a newly generated FASTA file it will
|
|
|
53 spend some time indexing the file. DO NOT INTERRUPT THIS PROCESS, particularly
|
|
|
54 if you do not have Bio::DB::HTS installed.
|
|
|
55
|
|
|
56 Special cases:
|
|
|
57 "-" represents an insertion
|
|
|
58 "?" indicates the chromosome could not be looked up in the FASTA
|
|
|
59
|
|
|
60 =cut
|
|
|
61
|
|
|
62 package AncestralAllele;
|
|
|
63
|
|
|
64 use strict;
|
|
|
65 use warnings;
|
|
|
66
|
|
|
67 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
|
68
|
|
|
69 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
|
|
|
70
|
|
|
71 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
72
|
|
|
73 our $CAN_USE_FAIDX;
|
|
|
74 our $DEBUG = 0;
|
|
|
75
|
|
|
76 BEGIN {
|
|
|
77 if (eval { require Bio::DB::HTS::Faidx; 1 }) {
|
|
|
78 $CAN_USE_FAIDX = 1;
|
|
|
79 }
|
|
|
80 }
|
|
|
81
|
|
|
82 sub new {
|
|
|
83 my $class = shift;
|
|
|
84
|
|
|
85 my $self = $class->SUPER::new(@_);
|
|
|
86
|
|
|
87 # get file
|
|
|
88 my $fasta_file = $self->params->[0] || "";
|
|
|
89 die("ERROR: Ancestral FASTA file $fasta_file not provided or found\n") unless $fasta_file && -e $fasta_file;
|
|
|
90 $self->fasta_file($fasta_file);
|
|
|
91
|
|
|
92 # get type of index we're creating/using
|
|
|
93 my $type = $fasta_file =~ /\.gz$/ ? 'Bio::DB::HTS::Faidx' : 'Bio::DB::Fasta';
|
|
|
94 die("ERROR: Bio::DB::HTS required to access compressed FASTA file $fasta_file\n") if $type eq 'Bio::DB::HTS::Faidx' && !$CAN_USE_FAIDX;
|
|
|
95 $self->index_type($type);
|
|
|
96
|
|
|
97 # init DB here so indexing doesn't happen at some point in the middle of VEP run
|
|
|
98 $self->fasta_db;
|
|
|
99
|
|
|
100 return $self;
|
|
|
101 }
|
|
|
102
|
|
|
103 sub feature_types {
|
|
|
104 return ['Feature','Intergenic'];
|
|
|
105 }
|
|
|
106
|
|
|
107 sub get_header_info {
|
|
|
108 return { AA => 'Ancestral allele from '.$_[0]->fasta_file }
|
|
|
109 }
|
|
|
110
|
|
|
111 sub run {
|
|
|
112 my ($self, $tva) = @_;
|
|
|
113
|
|
|
114 my $vf = $tva->variation_feature;
|
|
|
115
|
|
|
116 my ($start, $end) = ($vf->{start}, $vf->{end});
|
|
|
117
|
|
|
118 # cache AA on VF, prevents repeat lookups for multiple TVAs
|
|
|
119 if(!exists($vf->{_vep_plugin_AncestralAllele})) {
|
|
|
120
|
|
|
121 # insertion
|
|
|
122 return { AA => $vf->{_vep_plugin_AncestralAllele} = '-' } if $start > $end;
|
|
|
123
|
|
|
124 my $sr_name = $self->get_sr_name($vf->{chr});
|
|
|
125
|
|
|
126 # chr not in FASTA
|
|
|
127 return { AA => $vf->{_vep_plugin_AncestralAllele} = '?' } unless $sr_name;
|
|
|
128
|
|
|
129 my $fasta_db = $self->fasta_db;
|
|
|
130 my $type = $self->index_type;
|
|
|
131 my ($seq, $length);
|
|
|
132
|
|
|
133 if($type eq 'Bio::DB::HTS::Faidx') {
|
|
|
134 my $location_string = $sr_name.":".$start."-".$end ;
|
|
|
135 ($seq, $length) = $fasta_db->get_sequence($location_string);
|
|
|
136 }
|
|
|
137 elsif($type eq 'Bio::DB::Fasta') {
|
|
|
138 $seq = $fasta_db->seq($sr_name, $start => $end);
|
|
|
139 }
|
|
|
140 else {
|
|
|
141 throw("ERROR: Don't know how to fetch sequence from a ".ref($fasta_db)."\n");
|
|
|
142 }
|
|
|
143
|
|
|
144 reverse_comp(\$seq) if $vf->{strand} < 0;
|
|
|
145
|
|
|
146 $vf->{_vep_plugin_AncestralAllele} = $seq;
|
|
|
147 }
|
|
|
148
|
|
|
149 return { AA => $vf->{_vep_plugin_AncestralAllele} };
|
|
|
150 }
|
|
|
151
|
|
|
152 # getter/setter for fasta file path
|
|
|
153 sub fasta_file {
|
|
|
154 my $self = shift;
|
|
|
155 $self->{_fasta_file} = shift if @_;
|
|
|
156 return $self->{_fasta_file};
|
|
|
157 }
|
|
|
158
|
|
|
159 # getter/setter for fasta index type
|
|
|
160 sub index_type {
|
|
|
161 my $self = shift;
|
|
|
162 $self->{_index_type} = shift if @_;
|
|
|
163 return $self->{_index_type};
|
|
|
164 }
|
|
|
165
|
|
|
166 ## CRIBBED FROM Bio::EnsEMBL::Variation::Utils::FastaSequence
|
|
|
167 ## MOVE THIS CODE TO SHARED UTILS SPACE?
|
|
|
168 # creates the FASTA DB object
|
|
|
169 # returns either a Bio::DB::Fasta or a Bio::DB::HTS::Faidx
|
|
|
170 # uses a lock file to avoid partial indexes getting used
|
|
|
171 sub fasta_db {
|
|
|
172 my $self = shift;
|
|
|
173
|
|
|
174 # if we've forked we need to re-connect
|
|
|
175 # but make sure we only do this once per fork
|
|
|
176 my $prev_pid = $self->{_prev_pid} ||= $$;
|
|
|
177 if($prev_pid ne $$) {
|
|
|
178 delete $self->{_fasta_db};
|
|
|
179 $self->{_prev_pid} = $$;
|
|
|
180 }
|
|
|
181
|
|
|
182 if(!exists($self->{_fasta_db})) {
|
|
|
183 my $fasta = $self->fasta_file;
|
|
|
184 my $type = $self->index_type;
|
|
|
185
|
|
|
186 # check lock file
|
|
|
187 my $lock_file = $fasta;
|
|
|
188 $lock_file .= -d $fasta ? '/.vep.lock' : '.vep.lock';
|
|
|
189
|
|
|
190 # lock file exists, indexing failed
|
|
|
191 if(-e $lock_file) {
|
|
|
192 print STDERR "Lock file found, removing to allow re-indexing\n" if $DEBUG;
|
|
|
193 for(qw(.fai .index .gzi /directory.index /directory.fai .vep.lock)) {
|
|
|
194 unlink($fasta.$_) if -e $fasta.$_;
|
|
|
195 }
|
|
|
196 }
|
|
|
197
|
|
|
198 my $index_exists = 0;
|
|
|
199
|
|
|
200 for my $fn(map {$fasta.$_} qw(.fai .index .gzi /directory.index /directory.fai)) {
|
|
|
201 if(-e $fn) {
|
|
|
202 $index_exists = 1;
|
|
|
203 last;
|
|
|
204 }
|
|
|
205 }
|
|
|
206
|
|
|
207 # create lock file
|
|
|
208 unless($index_exists) {
|
|
|
209 open LOCK, ">$lock_file" or throw("ERROR: Could not write to FASTA lock file $lock_file\n");
|
|
|
210 print LOCK "1\n";
|
|
|
211 close LOCK;
|
|
|
212 print STDERR "Indexing $fasta\n" if $DEBUG;
|
|
|
213 }
|
|
|
214
|
|
|
215 # run indexing
|
|
|
216 my $fasta_db;
|
|
|
217 if($type eq 'Bio::DB::HTS::Faidx' && $CAN_USE_FAIDX) {
|
|
|
218 $fasta_db = Bio::DB::HTS::Faidx->new($fasta);
|
|
|
219 }
|
|
|
220 elsif(!$type || $type eq 'Bio::DB::Fasta') {
|
|
|
221 $fasta_db = Bio::DB::Fasta->new($fasta);
|
|
|
222 }
|
|
|
223 else {
|
|
|
224 throw("ERROR: Don't know how to index with type $type\n");
|
|
|
225 }
|
|
|
226 print STDERR "Finished indexing\n" if $DEBUG;
|
|
|
227
|
|
|
228 throw("ERROR: Unable to create FASTA DB\n") unless $fasta_db;
|
|
|
229
|
|
|
230 # remove lock file
|
|
|
231 unlink($lock_file) unless $index_exists;
|
|
|
232
|
|
|
233 $self->{_fasta_db} = $fasta_db;
|
|
|
234 }
|
|
|
235
|
|
|
236 return $self->{_fasta_db};
|
|
|
237 }
|
|
|
238
|
|
|
239 # gets seq region name as it appears in the FASTA
|
|
|
240 # the file contains names like "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1"
|
|
|
241 # so want to map from "10" or "chr10" to "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1"
|
|
|
242 sub get_sr_name {
|
|
|
243 my $self = shift;
|
|
|
244 my $sr_name = shift;
|
|
|
245
|
|
|
246 my $map = $self->fasta_name_map;
|
|
|
247 my $fasta_db = $self->fasta_db;
|
|
|
248
|
|
|
249 unless($map->{$sr_name}) {
|
|
|
250 foreach my $alt(
|
|
|
251 $sr_name =~ /^chr/i ? (substr($sr_name, 3)) : ('chr'.$sr_name, 'CHR'.$sr_name)
|
|
|
252 ) {
|
|
|
253 if($map->{$alt}) {
|
|
|
254 print STDERR "USING SYNONYM $alt FOR $sr_name\n" if $DEBUG;
|
|
|
255 $sr_name = $alt;
|
|
|
256 last;
|
|
|
257 }
|
|
|
258 }
|
|
|
259 }
|
|
|
260
|
|
|
261 return $map->{$sr_name} || undef;
|
|
|
262 }
|
|
|
263
|
|
|
264 sub fasta_name_map {
|
|
|
265 my ($self, $chr) = @_;
|
|
|
266
|
|
|
267 if(!exists($self->{_name_map})) {
|
|
|
268 my $fasta_db = $self->fasta_db;
|
|
|
269
|
|
|
270 my %map = map {(split(':', $_))[2] => $_}
|
|
|
271 $fasta_db->isa('Bio::DB::Fasta') ?
|
|
|
272 (grep {$_ !~ /^\_\_/} $fasta_db->get_all_primary_ids) :
|
|
|
273 ($fasta_db->get_all_sequence_ids);
|
|
|
274
|
|
|
275 $self->{_name_map} = \%map;
|
|
|
276 }
|
|
|
277
|
|
|
278 return $self->{_name_map};
|
|
|
279 }
|
|
|
280
|
|
|
281 1;
|