comparison dir_plugins/LocalID.pm @ 0:e545d0a25ffe draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:17:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e545d0a25ffe
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 LocalID
27
28 =head1 SYNOPSIS
29
30 mv LocalID.pm ~/.vep/Plugins
31
32 ## first run create database
33
34 # EITHER create from Ensembl variation database
35 # VERY slow but includes variant synonyms, if not required see next command
36 ./vep -i variant_ids.txt --plugin LocalID,create_db=1 -safe
37
38 # OR create from cache directory
39 # faster but does not include synonyms
40 # parameter passed to from_cache may be full path to cache e.g. $HOME/.vep/homo_sapiens/88_GRCh38
41 # cache may be tabix converted or in default state (http://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#convert)
42 ./vep -i variant_ids.txt --plugin LocalID,create_db=1,from_cache=1 -safe
43
44 # subsequent runs
45 ./vep -i variant_ids.txt --plugin LocalID
46
47 # db file can be specified with db=[file]
48 # default file name is $HOME/.vep/[species]_[version]_[assembly].variant_ids.sqlite3
49 ./vep -i variant_ids.txt --plugin LocalID,db=my_db_file.txt
50
51 =head1 DESCRIPTION
52
53 The LocalID plugin allows you to use variant IDs as input without making a database connection.
54
55 Requires sqlite3.
56
57 A local sqlite3 database is used to look up variant IDs; this is generated either from Ensembl's
58 public database (very slow, but includes synonyms), or from a VEP cache file (faster, excludes
59 synonyms).
60
61 NB this plugin is NOT compatible with the ensembl-tools variant_effect_predictor.pl version of VEP.
62
63 =cut
64
65 package LocalID;
66
67 use strict;
68 use warnings;
69
70 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
71 use Bio::EnsEMBL::VEP::Parser::ID;
72 use Bio::EnsEMBL::VEP::Constants;
73 use Bio::EnsEMBL::VEP::Utils qw(get_compressed_filehandle);
74
75
76 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
77
78 sub new {
79 my $class = shift;
80
81 my $self = $class->SUPER::new(@_);
82
83 my $param_hash = $self->params_to_hash();
84 my $config = $self->{config};
85 my $species = $config->{species} || 'homo_sapiens';
86 my $db;
87
88 unless($db = $param_hash->{db}) {
89 my $version =
90 $config->{db_version} ||
91 $config->{cache_version} ||
92 $Bio::EnsEMBL::VEP::Constants::VEP_VERSION ||
93 'Bio::EnsEMBL::Registry'->software_version ||
94 undef;
95 my $assembly = $config->{assembly};
96 my $dir = $param_hash->{dir} || $config->{dir};
97
98 die("ERROR: Unable to determine software version - if using --offline, add --cache_version [version] or add the ID database name to your --plugin string as \"db=[file]\"\n") unless $version;
99 die("ERROR: Unable to determine assembly version - if using --offline, add --assembly [version] or add the ID database name to your --plugin string as \"db=[file]\"\n") unless $assembly;
100 $db = sprintf("%s/%s_%i_%s.variant_ids.sqlite3", $dir, $species, $version, $assembly);
101 }
102
103 # create DB?
104 $self->create_db($db, $species, $param_hash) if $param_hash->{create_db};
105
106 die("ERROR: DB file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db;
107
108 $self->config->{_localid_db_file} = $db;
109
110 return $self;
111 }
112
113 sub create_db {
114 my ($self, $db, $species, $param_hash) = @_;
115
116 # requites sqlite3 command line tool
117 die("ERROR: sqlite3 command not found in path\n") unless `which sqlite3` =~ /\/sqlite3/;
118
119 my $config = $self->{config};
120
121 die("ERROR: DB file $db already exists - remove and re-run to overwrite\n") if -e $db;
122
123 print STDERR "## LocalID plugin\n # Creating database of variant IDs - this may take some time\n" unless $config->{quiet};
124
125 my $tmpfile = "$db.tmp$$";
126 open my $tmp_handle, ">$tmpfile" or die "ERROR: Unable to write to $tmpfile\n";
127
128 if(my $cache_dir = $param_hash->{from_cache}) {
129
130 # attempt to interpret cache dir from command line opts
131 if($cache_dir eq '1') {
132 my $version =
133 $config->{cache_version} ||
134 $config->{db_version} ||
135 $Bio::EnsEMBL::VEP::Constants::VEP_VERSION ||
136 ($config->{reg} ? $config->{reg}->software_version : undef);
137 my $assembly = $config->{assembly};
138 my $dir = $config->{dir_cache} || $config->{dir};
139
140 $cache_dir = "$dir\/$species\/$version\_$assembly";
141 }
142
143 print STDERR " # attempting to create from $cache_dir\n" unless $config->{quiet};
144 $self->_tmp_file_from_cache($cache_dir, $tmp_handle);
145 }
146 else {
147 print STDERR " # attempting to create from variation database for $species\n" unless $config->{quiet};
148 $self->_tmp_file_from_var_db($species, $tmp_handle);
149 }
150
151 close $tmp_handle;
152
153 # create database
154 my $dbh = DBI->connect("dbi:SQLite:dbname=$db","","");
155 $dbh->do("CREATE TABLE ids(id, chr, start, end, alleles, strand)");
156
157 # load tmp file into table
158 print STDERR " # loading database\n" unless $config->{quiet};
159 my $cmd = qq{sqlite3 $db '.import $tmpfile ids'};
160 `$cmd 2>&1` and die("ERROR: Failed to import $tmpfile to $db\n");
161 unlink($tmpfile);
162
163 # index
164 print STDERR " # indexing database\n" unless $config->{quiet};
165 $dbh->do("CREATE INDEX id_idx ON ids(id)");
166
167 print STDERR " # successfully built database $db\n" unless $config->{quiet};
168 }
169
170 sub _tmp_file_from_cache {
171 my ($self, $cache_dir, $tmp_handle) = @_;
172 my $config = $self->{config};
173
174 die("ERROR: Cache dir $cache_dir not found or not a directory\n") unless -d $cache_dir;
175
176 # read info
177 open INFO, $cache_dir.'/info.txt' or die("ERROR: No info.txt file found in $cache_dir\n");
178
179 my %cols;
180 while(<INFO>) {
181 next unless /^variation_cols/;
182 chomp;
183 my @tmp_cols = split(',', (split("\t", $_))[1]);
184 $cols{$tmp_cols[$_]} = $_ for 0..$#tmp_cols;
185 last;
186 }
187 close INFO;
188
189 # get all chromosome dirs
190 opendir DIR, $cache_dir or die("ERROR: Could not read dir $cache_dir\n");
191 my @chrs = grep {-d $cache_dir.'/'.$_ && !/^\./} readdir DIR;
192 closedir DIR;
193
194 foreach my $chr(@chrs) {
195 opendir CHR, $cache_dir.'/'.$chr;
196 my @all_files = grep {/var/ && !/\.(tb|cs)i$/} readdir CHR;
197 closedir CHR;
198
199 my @files = grep {/all_vars/} @all_files;
200 @files = @all_files unless @files;
201
202 foreach my $file(@files) {
203 my $fh = get_compressed_filehandle($cache_dir.'/'.$chr.'/'.$file, 1);
204
205 my $delim;
206
207 while(<$fh>) {
208 unless($delim) {
209 $delim = /\t/ ? "\t" : " ";
210 }
211
212 chomp;
213 my @split = map {($_ || '') eq '.' ? undef : $_} split($delim);
214
215 # id, chr, start, end, alleles, strand
216 print $tmp_handle join("|",
217 $split[$cols{variation_name}],
218 $chr,
219 $split[$cols{start}],
220 $split[$cols{end}] || $split[$cols{start}],
221 $split[$cols{allele_string}] || '',
222 $split[$cols{strand}] || 1,
223 )."\n";
224 }
225
226 close $fh;
227 }
228 }
229 }
230
231 sub _tmp_file_from_var_db {
232 my ($self, $species, $tmp_handle) = @_;
233 my $config = $self->{config};
234
235 my $var_dbc = Bio::EnsEMBL::Registry->get_adaptor($species, 'variation', 'variation')->db->dbc;
236
237 my $mysql = $var_dbc->prepare(qq{
238 SELECT v.name, s.name, vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand
239 FROM variation v, variation_feature vf, seq_region s
240 WHERE v.variation_id = vf.variation_id
241 AND vf.seq_region_id = s.seq_region_id
242 }, {mysql_use_result => 1});
243
244 my ($i, $c, $s, $e, $a, $d);
245 $mysql->execute();
246 $mysql->bind_columns(\$i, \$c, \$s, \$e, \$a, \$d);
247 print $tmp_handle join("|", ($i, $c, $s, $e, $a, $d))."\n" while $mysql->fetch();
248 $mysql->finish();
249
250 # do synonyms
251 print STDERR "Processing synonyms\n" unless $config->{quiet};
252 $mysql = $var_dbc->prepare(qq{
253 SELECT v.name, s.name, vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand
254 FROM variation_synonym v, variation_feature vf, seq_region s
255 WHERE v.variation_id = vf.variation_id
256 AND vf.seq_region_id = s.seq_region_id
257 }, {mysql_use_result => 1});
258
259 $mysql->execute();
260 $mysql->bind_columns(\$i, \$c, \$s, \$e, \$a, \$d);
261 print $tmp_handle join("|", ($i, $c, $s, $e, $a, $d))."\n" while $mysql->fetch();
262 $mysql->finish();
263 }
264
265 sub run {
266 return {};
267 }
268
269 1;
270
271
272
273
274 ###########################################
275 ### Redefine methods in existing module ###
276 ###########################################
277
278 package Bio::EnsEMBL::VEP::Parser::ID;
279
280 no warnings qw(redefine);
281 sub new {
282 my $caller = shift;
283 my $class = ref($caller) || $caller;
284
285 my $self = $class->SUPER::new(@_);
286
287 return $self;
288 }
289
290 sub create_VariationFeatures {
291 my $self = shift;
292
293 my $parser = $self->parser;
294 $parser->next();
295
296 $self->skip_empty_lines();
297
298 return [] unless $parser->{record};
299
300 $self->line_number($self->line_number + 1);
301
302 my $id = $parser->get_value;
303
304 # remove whitespace
305 $id =~ s/\s+//g;
306
307 my $db = $self->id_db;
308 my $sth = $self->{_id_sth} ||= $db->prepare("SELECT chr, start, end, alleles, strand FROM ids WHERE id = ?");
309 my $ad = $self->{_var_ad} ||= $self->get_adaptor('variation', 'VariationFeature');
310
311 my @vfs;
312 my ($c, $s, $e, $a, $d);
313 $sth->execute($id);
314 $sth->bind_columns(\$c, \$s, \$e, \$a, \$d);
315
316 push @vfs, Bio::EnsEMBL::Variation::VariationFeature->new_fast({
317 start => $s,
318 end => $e,
319 allele_string => $a,
320 strand => $d,
321 map_weight => 1,
322 adaptor => $ad,
323 variation_name => $id,
324 chr => $c,
325 }) while $sth->fetch;
326
327 $sth->finish();
328
329 return $self->post_process_vfs(\@vfs);
330 }
331
332 sub id_db {
333 my $self = shift;
334
335 unless(exists($self->{_id_db})) {
336 throw("ERROR: ID database not defined or detected - possible plugin compile failure\n") unless my $db = $self->config->{_params}->{_localid_db_file};
337 throw("ERROR: ID database file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db;
338 $self->{_id_db} = DBI->connect("dbi:SQLite:dbname=$db","","");
339 }
340
341 return $self->{_id_db};
342 }
343
344 1;