|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 LocalID
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv LocalID.pm ~/.vep/Plugins
|
|
|
31
|
|
|
32 ## first run create database
|
|
|
33
|
|
|
34 # EITHER create from Ensembl variation database
|
|
|
35 # VERY slow but includes variant synonyms, if not required see next command
|
|
|
36 ./vep -i variant_ids.txt --plugin LocalID,create_db=1 -safe
|
|
|
37
|
|
|
38 # OR create from cache directory
|
|
|
39 # faster but does not include synonyms
|
|
|
40 # parameter passed to from_cache may be full path to cache e.g. $HOME/.vep/homo_sapiens/88_GRCh38
|
|
|
41 # cache may be tabix converted or in default state (http://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#convert)
|
|
|
42 ./vep -i variant_ids.txt --plugin LocalID,create_db=1,from_cache=1 -safe
|
|
|
43
|
|
|
44 # subsequent runs
|
|
|
45 ./vep -i variant_ids.txt --plugin LocalID
|
|
|
46
|
|
|
47 # db file can be specified with db=[file]
|
|
|
48 # default file name is $HOME/.vep/[species]_[version]_[assembly].variant_ids.sqlite3
|
|
|
49 ./vep -i variant_ids.txt --plugin LocalID,db=my_db_file.txt
|
|
|
50
|
|
|
51 =head1 DESCRIPTION
|
|
|
52
|
|
|
53 The LocalID plugin allows you to use variant IDs as input without making a database connection.
|
|
|
54
|
|
|
55 Requires sqlite3.
|
|
|
56
|
|
|
57 A local sqlite3 database is used to look up variant IDs; this is generated either from Ensembl's
|
|
|
58 public database (very slow, but includes synonyms), or from a VEP cache file (faster, excludes
|
|
|
59 synonyms).
|
|
|
60
|
|
|
61 NB this plugin is NOT compatible with the ensembl-tools variant_effect_predictor.pl version of VEP.
|
|
|
62
|
|
|
63 =cut
|
|
|
64
|
|
|
65 package LocalID;
|
|
|
66
|
|
|
67 use strict;
|
|
|
68 use warnings;
|
|
|
69
|
|
|
70 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
|
|
|
71 use Bio::EnsEMBL::VEP::Parser::ID;
|
|
|
72 use Bio::EnsEMBL::VEP::Constants;
|
|
|
73 use Bio::EnsEMBL::VEP::Utils qw(get_compressed_filehandle);
|
|
|
74
|
|
|
75
|
|
|
76 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
77
|
|
|
78 sub new {
|
|
|
79 my $class = shift;
|
|
|
80
|
|
|
81 my $self = $class->SUPER::new(@_);
|
|
|
82
|
|
|
83 my $param_hash = $self->params_to_hash();
|
|
|
84 my $config = $self->{config};
|
|
|
85 my $species = $config->{species} || 'homo_sapiens';
|
|
|
86 my $db;
|
|
|
87
|
|
|
88 unless($db = $param_hash->{db}) {
|
|
|
89 my $version =
|
|
|
90 $config->{db_version} ||
|
|
|
91 $config->{cache_version} ||
|
|
|
92 $Bio::EnsEMBL::VEP::Constants::VEP_VERSION ||
|
|
|
93 'Bio::EnsEMBL::Registry'->software_version ||
|
|
|
94 undef;
|
|
|
95 my $assembly = $config->{assembly};
|
|
|
96 my $dir = $param_hash->{dir} || $config->{dir};
|
|
|
97
|
|
|
98 die("ERROR: Unable to determine software version - if using --offline, add --cache_version [version] or add the ID database name to your --plugin string as \"db=[file]\"\n") unless $version;
|
|
|
99 die("ERROR: Unable to determine assembly version - if using --offline, add --assembly [version] or add the ID database name to your --plugin string as \"db=[file]\"\n") unless $assembly;
|
|
|
100 $db = sprintf("%s/%s_%i_%s.variant_ids.sqlite3", $dir, $species, $version, $assembly);
|
|
|
101 }
|
|
|
102
|
|
|
103 # create DB?
|
|
|
104 $self->create_db($db, $species, $param_hash) if $param_hash->{create_db};
|
|
|
105
|
|
|
106 die("ERROR: DB file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db;
|
|
|
107
|
|
|
108 $self->config->{_localid_db_file} = $db;
|
|
|
109
|
|
|
110 return $self;
|
|
|
111 }
|
|
|
112
|
|
|
113 sub create_db {
|
|
|
114 my ($self, $db, $species, $param_hash) = @_;
|
|
|
115
|
|
|
116 # requites sqlite3 command line tool
|
|
|
117 die("ERROR: sqlite3 command not found in path\n") unless `which sqlite3` =~ /\/sqlite3/;
|
|
|
118
|
|
|
119 my $config = $self->{config};
|
|
|
120
|
|
|
121 die("ERROR: DB file $db already exists - remove and re-run to overwrite\n") if -e $db;
|
|
|
122
|
|
|
123 print STDERR "## LocalID plugin\n # Creating database of variant IDs - this may take some time\n" unless $config->{quiet};
|
|
|
124
|
|
|
125 my $tmpfile = "$db.tmp$$";
|
|
|
126 open my $tmp_handle, ">$tmpfile" or die "ERROR: Unable to write to $tmpfile\n";
|
|
|
127
|
|
|
128 if(my $cache_dir = $param_hash->{from_cache}) {
|
|
|
129
|
|
|
130 # attempt to interpret cache dir from command line opts
|
|
|
131 if($cache_dir eq '1') {
|
|
|
132 my $version =
|
|
|
133 $config->{cache_version} ||
|
|
|
134 $config->{db_version} ||
|
|
|
135 $Bio::EnsEMBL::VEP::Constants::VEP_VERSION ||
|
|
|
136 ($config->{reg} ? $config->{reg}->software_version : undef);
|
|
|
137 my $assembly = $config->{assembly};
|
|
|
138 my $dir = $config->{dir_cache} || $config->{dir};
|
|
|
139
|
|
|
140 $cache_dir = "$dir\/$species\/$version\_$assembly";
|
|
|
141 }
|
|
|
142
|
|
|
143 print STDERR " # attempting to create from $cache_dir\n" unless $config->{quiet};
|
|
|
144 $self->_tmp_file_from_cache($cache_dir, $tmp_handle);
|
|
|
145 }
|
|
|
146 else {
|
|
|
147 print STDERR " # attempting to create from variation database for $species\n" unless $config->{quiet};
|
|
|
148 $self->_tmp_file_from_var_db($species, $tmp_handle);
|
|
|
149 }
|
|
|
150
|
|
|
151 close $tmp_handle;
|
|
|
152
|
|
|
153 # create database
|
|
|
154 my $dbh = DBI->connect("dbi:SQLite:dbname=$db","","");
|
|
|
155 $dbh->do("CREATE TABLE ids(id, chr, start, end, alleles, strand)");
|
|
|
156
|
|
|
157 # load tmp file into table
|
|
|
158 print STDERR " # loading database\n" unless $config->{quiet};
|
|
|
159 my $cmd = qq{sqlite3 $db '.import $tmpfile ids'};
|
|
|
160 `$cmd 2>&1` and die("ERROR: Failed to import $tmpfile to $db\n");
|
|
|
161 unlink($tmpfile);
|
|
|
162
|
|
|
163 # index
|
|
|
164 print STDERR " # indexing database\n" unless $config->{quiet};
|
|
|
165 $dbh->do("CREATE INDEX id_idx ON ids(id)");
|
|
|
166
|
|
|
167 print STDERR " # successfully built database $db\n" unless $config->{quiet};
|
|
|
168 }
|
|
|
169
|
|
|
170 sub _tmp_file_from_cache {
|
|
|
171 my ($self, $cache_dir, $tmp_handle) = @_;
|
|
|
172 my $config = $self->{config};
|
|
|
173
|
|
|
174 die("ERROR: Cache dir $cache_dir not found or not a directory\n") unless -d $cache_dir;
|
|
|
175
|
|
|
176 # read info
|
|
|
177 open INFO, $cache_dir.'/info.txt' or die("ERROR: No info.txt file found in $cache_dir\n");
|
|
|
178
|
|
|
179 my %cols;
|
|
|
180 while(<INFO>) {
|
|
|
181 next unless /^variation_cols/;
|
|
|
182 chomp;
|
|
|
183 my @tmp_cols = split(',', (split("\t", $_))[1]);
|
|
|
184 $cols{$tmp_cols[$_]} = $_ for 0..$#tmp_cols;
|
|
|
185 last;
|
|
|
186 }
|
|
|
187 close INFO;
|
|
|
188
|
|
|
189 # get all chromosome dirs
|
|
|
190 opendir DIR, $cache_dir or die("ERROR: Could not read dir $cache_dir\n");
|
|
|
191 my @chrs = grep {-d $cache_dir.'/'.$_ && !/^\./} readdir DIR;
|
|
|
192 closedir DIR;
|
|
|
193
|
|
|
194 foreach my $chr(@chrs) {
|
|
|
195 opendir CHR, $cache_dir.'/'.$chr;
|
|
|
196 my @all_files = grep {/var/ && !/\.(tb|cs)i$/} readdir CHR;
|
|
|
197 closedir CHR;
|
|
|
198
|
|
|
199 my @files = grep {/all_vars/} @all_files;
|
|
|
200 @files = @all_files unless @files;
|
|
|
201
|
|
|
202 foreach my $file(@files) {
|
|
|
203 my $fh = get_compressed_filehandle($cache_dir.'/'.$chr.'/'.$file, 1);
|
|
|
204
|
|
|
205 my $delim;
|
|
|
206
|
|
|
207 while(<$fh>) {
|
|
|
208 unless($delim) {
|
|
|
209 $delim = /\t/ ? "\t" : " ";
|
|
|
210 }
|
|
|
211
|
|
|
212 chomp;
|
|
|
213 my @split = map {($_ || '') eq '.' ? undef : $_} split($delim);
|
|
|
214
|
|
|
215 # id, chr, start, end, alleles, strand
|
|
|
216 print $tmp_handle join("|",
|
|
|
217 $split[$cols{variation_name}],
|
|
|
218 $chr,
|
|
|
219 $split[$cols{start}],
|
|
|
220 $split[$cols{end}] || $split[$cols{start}],
|
|
|
221 $split[$cols{allele_string}] || '',
|
|
|
222 $split[$cols{strand}] || 1,
|
|
|
223 )."\n";
|
|
|
224 }
|
|
|
225
|
|
|
226 close $fh;
|
|
|
227 }
|
|
|
228 }
|
|
|
229 }
|
|
|
230
|
|
|
231 sub _tmp_file_from_var_db {
|
|
|
232 my ($self, $species, $tmp_handle) = @_;
|
|
|
233 my $config = $self->{config};
|
|
|
234
|
|
|
235 my $var_dbc = Bio::EnsEMBL::Registry->get_adaptor($species, 'variation', 'variation')->db->dbc;
|
|
|
236
|
|
|
237 my $mysql = $var_dbc->prepare(qq{
|
|
|
238 SELECT v.name, s.name, vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand
|
|
|
239 FROM variation v, variation_feature vf, seq_region s
|
|
|
240 WHERE v.variation_id = vf.variation_id
|
|
|
241 AND vf.seq_region_id = s.seq_region_id
|
|
|
242 }, {mysql_use_result => 1});
|
|
|
243
|
|
|
244 my ($i, $c, $s, $e, $a, $d);
|
|
|
245 $mysql->execute();
|
|
|
246 $mysql->bind_columns(\$i, \$c, \$s, \$e, \$a, \$d);
|
|
|
247 print $tmp_handle join("|", ($i, $c, $s, $e, $a, $d))."\n" while $mysql->fetch();
|
|
|
248 $mysql->finish();
|
|
|
249
|
|
|
250 # do synonyms
|
|
|
251 print STDERR "Processing synonyms\n" unless $config->{quiet};
|
|
|
252 $mysql = $var_dbc->prepare(qq{
|
|
|
253 SELECT v.name, s.name, vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand
|
|
|
254 FROM variation_synonym v, variation_feature vf, seq_region s
|
|
|
255 WHERE v.variation_id = vf.variation_id
|
|
|
256 AND vf.seq_region_id = s.seq_region_id
|
|
|
257 }, {mysql_use_result => 1});
|
|
|
258
|
|
|
259 $mysql->execute();
|
|
|
260 $mysql->bind_columns(\$i, \$c, \$s, \$e, \$a, \$d);
|
|
|
261 print $tmp_handle join("|", ($i, $c, $s, $e, $a, $d))."\n" while $mysql->fetch();
|
|
|
262 $mysql->finish();
|
|
|
263 }
|
|
|
264
|
|
|
265 sub run {
|
|
|
266 return {};
|
|
|
267 }
|
|
|
268
|
|
|
269 1;
|
|
|
270
|
|
|
271
|
|
|
272
|
|
|
273
|
|
|
274 ###########################################
|
|
|
275 ### Redefine methods in existing module ###
|
|
|
276 ###########################################
|
|
|
277
|
|
|
278 package Bio::EnsEMBL::VEP::Parser::ID;
|
|
|
279
|
|
|
280 no warnings qw(redefine);
|
|
|
281 sub new {
|
|
|
282 my $caller = shift;
|
|
|
283 my $class = ref($caller) || $caller;
|
|
|
284
|
|
|
285 my $self = $class->SUPER::new(@_);
|
|
|
286
|
|
|
287 return $self;
|
|
|
288 }
|
|
|
289
|
|
|
290 sub create_VariationFeatures {
|
|
|
291 my $self = shift;
|
|
|
292
|
|
|
293 my $parser = $self->parser;
|
|
|
294 $parser->next();
|
|
|
295
|
|
|
296 $self->skip_empty_lines();
|
|
|
297
|
|
|
298 return [] unless $parser->{record};
|
|
|
299
|
|
|
300 $self->line_number($self->line_number + 1);
|
|
|
301
|
|
|
302 my $id = $parser->get_value;
|
|
|
303
|
|
|
304 # remove whitespace
|
|
|
305 $id =~ s/\s+//g;
|
|
|
306
|
|
|
307 my $db = $self->id_db;
|
|
|
308 my $sth = $self->{_id_sth} ||= $db->prepare("SELECT chr, start, end, alleles, strand FROM ids WHERE id = ?");
|
|
|
309 my $ad = $self->{_var_ad} ||= $self->get_adaptor('variation', 'VariationFeature');
|
|
|
310
|
|
|
311 my @vfs;
|
|
|
312 my ($c, $s, $e, $a, $d);
|
|
|
313 $sth->execute($id);
|
|
|
314 $sth->bind_columns(\$c, \$s, \$e, \$a, \$d);
|
|
|
315
|
|
|
316 push @vfs, Bio::EnsEMBL::Variation::VariationFeature->new_fast({
|
|
|
317 start => $s,
|
|
|
318 end => $e,
|
|
|
319 allele_string => $a,
|
|
|
320 strand => $d,
|
|
|
321 map_weight => 1,
|
|
|
322 adaptor => $ad,
|
|
|
323 variation_name => $id,
|
|
|
324 chr => $c,
|
|
|
325 }) while $sth->fetch;
|
|
|
326
|
|
|
327 $sth->finish();
|
|
|
328
|
|
|
329 return $self->post_process_vfs(\@vfs);
|
|
|
330 }
|
|
|
331
|
|
|
332 sub id_db {
|
|
|
333 my $self = shift;
|
|
|
334
|
|
|
335 unless(exists($self->{_id_db})) {
|
|
|
336 throw("ERROR: ID database not defined or detected - possible plugin compile failure\n") unless my $db = $self->config->{_params}->{_localid_db_file};
|
|
|
337 throw("ERROR: ID database file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db;
|
|
|
338 $self->{_id_db} = DBI->connect("dbi:SQLite:dbname=$db","","");
|
|
|
339 }
|
|
|
340
|
|
|
341 return $self->{_id_db};
|
|
|
342 }
|
|
|
343
|
|
|
344 1;
|