Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/LocalID.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
comparison
equal
deleted
inserted
replaced
| 2:17c98d091710 | 3:49397129aec0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | |
| 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | |
| 5 | |
| 6 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 you may not use this file except in compliance with the License. | |
| 8 You may obtain a copy of the License at | |
| 9 | |
| 10 http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 | |
| 12 Unless required by applicable law or agreed to in writing, software | |
| 13 distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 See the License for the specific language governing permissions and | |
| 16 limitations under the License. | |
| 17 | |
| 18 =head1 CONTACT | |
| 19 | |
| 20 Ensembl <http://www.ensembl.org/info/about/contact/index.html> | |
| 21 | |
| 22 =cut | |
| 23 | |
| 24 =head1 NAME | |
| 25 | |
| 26 LocalID | |
| 27 | |
| 28 =head1 SYNOPSIS | |
| 29 | |
| 30 mv LocalID.pm ~/.vep/Plugins | |
| 31 | |
| 32 ## first run create database | |
| 33 | |
| 34 # EITHER create from Ensembl variation database | |
| 35 # VERY slow but includes variant synonyms, if not required see next command | |
| 36 ./vep -i variant_ids.txt --plugin LocalID,create_db=1 -safe | |
| 37 | |
| 38 # OR create from cache directory | |
| 39 # faster but does not include synonyms | |
| 40 # parameter passed to from_cache may be full path to cache e.g. $HOME/.vep/homo_sapiens/88_GRCh38 | |
| 41 # cache may be tabix converted or in default state (http://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#convert) | |
| 42 ./vep -i variant_ids.txt --plugin LocalID,create_db=1,from_cache=1 -safe | |
| 43 | |
| 44 # subsequent runs | |
| 45 ./vep -i variant_ids.txt --plugin LocalID | |
| 46 | |
| 47 # db file can be specified with db=[file] | |
| 48 # default file name is $HOME/.vep/[species]_[version]_[assembly].variant_ids.sqlite3 | |
| 49 ./vep -i variant_ids.txt --plugin LocalID,db=my_db_file.txt | |
| 50 | |
| 51 =head1 DESCRIPTION | |
| 52 | |
| 53 The LocalID plugin allows you to use variant IDs as input without making a database connection. | |
| 54 | |
| 55 Requires sqlite3. | |
| 56 | |
| 57 A local sqlite3 database is used to look up variant IDs; this is generated either from Ensembl's | |
| 58 public database (very slow, but includes synonyms), or from a VEP cache file (faster, excludes | |
| 59 synonyms). | |
| 60 | |
| 61 NB this plugin is NOT compatible with the ensembl-tools variant_effect_predictor.pl version of VEP. | |
| 62 | |
| 63 =cut | |
| 64 | |
| 65 package LocalID; | |
| 66 | |
| 67 use strict; | |
| 68 use warnings; | |
| 69 | |
| 70 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | |
| 71 use Bio::EnsEMBL::VEP::Parser::ID; | |
| 72 use Bio::EnsEMBL::VEP::Constants; | |
| 73 use Bio::EnsEMBL::VEP::Utils qw(get_compressed_filehandle); | |
| 74 | |
| 75 | |
| 76 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | |
| 77 | |
| 78 sub new { | |
| 79 my $class = shift; | |
| 80 | |
| 81 my $self = $class->SUPER::new(@_); | |
| 82 | |
| 83 my $param_hash = $self->params_to_hash(); | |
| 84 my $config = $self->{config}; | |
| 85 my $species = $config->{species} || 'homo_sapiens'; | |
| 86 my $db; | |
| 87 | |
| 88 unless($db = $param_hash->{db}) { | |
| 89 my $version = | |
| 90 $config->{db_version} || | |
| 91 $config->{cache_version} || | |
| 92 $Bio::EnsEMBL::VEP::Constants::VEP_VERSION || | |
| 93 'Bio::EnsEMBL::Registry'->software_version || | |
| 94 undef; | |
| 95 my $assembly = $config->{assembly}; | |
| 96 my $dir = $param_hash->{dir} || $config->{dir}; | |
| 97 | |
| 98 die("ERROR: Unable to determine software version - if using --offline, add --cache_version [version] or add the ID database name to your --plugin string as \"db=[file]\"\n") unless $version; | |
| 99 die("ERROR: Unable to determine assembly version - if using --offline, add --assembly [version] or add the ID database name to your --plugin string as \"db=[file]\"\n") unless $assembly; | |
| 100 $db = sprintf("%s/%s_%i_%s.variant_ids.sqlite3", $dir, $species, $version, $assembly); | |
| 101 } | |
| 102 | |
| 103 # create DB? | |
| 104 $self->create_db($db, $species, $param_hash) if $param_hash->{create_db}; | |
| 105 | |
| 106 die("ERROR: DB file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db; | |
| 107 | |
| 108 $self->config->{_localid_db_file} = $db; | |
| 109 | |
| 110 return $self; | |
| 111 } | |
| 112 | |
| 113 sub create_db { | |
| 114 my ($self, $db, $species, $param_hash) = @_; | |
| 115 | |
| 116 # requites sqlite3 command line tool | |
| 117 die("ERROR: sqlite3 command not found in path\n") unless `which sqlite3` =~ /\/sqlite3/; | |
| 118 | |
| 119 my $config = $self->{config}; | |
| 120 | |
| 121 die("ERROR: DB file $db already exists - remove and re-run to overwrite\n") if -e $db; | |
| 122 | |
| 123 print STDERR "## LocalID plugin\n # Creating database of variant IDs - this may take some time\n" unless $config->{quiet}; | |
| 124 | |
| 125 my $tmpfile = "$db.tmp$$"; | |
| 126 open my $tmp_handle, ">$tmpfile" or die "ERROR: Unable to write to $tmpfile\n"; | |
| 127 | |
| 128 if(my $cache_dir = $param_hash->{from_cache}) { | |
| 129 | |
| 130 # attempt to interpret cache dir from command line opts | |
| 131 if($cache_dir eq '1') { | |
| 132 my $version = | |
| 133 $config->{cache_version} || | |
| 134 $config->{db_version} || | |
| 135 $Bio::EnsEMBL::VEP::Constants::VEP_VERSION || | |
| 136 ($config->{reg} ? $config->{reg}->software_version : undef); | |
| 137 my $assembly = $config->{assembly}; | |
| 138 my $dir = $config->{dir_cache} || $config->{dir}; | |
| 139 | |
| 140 $cache_dir = "$dir\/$species\/$version\_$assembly"; | |
| 141 } | |
| 142 | |
| 143 print STDERR " # attempting to create from $cache_dir\n" unless $config->{quiet}; | |
| 144 $self->_tmp_file_from_cache($cache_dir, $tmp_handle); | |
| 145 } | |
| 146 else { | |
| 147 print STDERR " # attempting to create from variation database for $species\n" unless $config->{quiet}; | |
| 148 $self->_tmp_file_from_var_db($species, $tmp_handle); | |
| 149 } | |
| 150 | |
| 151 close $tmp_handle; | |
| 152 | |
| 153 # create database | |
| 154 my $dbh = DBI->connect("dbi:SQLite:dbname=$db","",""); | |
| 155 $dbh->do("CREATE TABLE ids(id, chr, start, end, alleles, strand)"); | |
| 156 | |
| 157 # load tmp file into table | |
| 158 print STDERR " # loading database\n" unless $config->{quiet}; | |
| 159 my $cmd = qq{sqlite3 $db '.import $tmpfile ids'}; | |
| 160 `$cmd 2>&1` and die("ERROR: Failed to import $tmpfile to $db\n"); | |
| 161 unlink($tmpfile); | |
| 162 | |
| 163 # index | |
| 164 print STDERR " # indexing database\n" unless $config->{quiet}; | |
| 165 $dbh->do("CREATE INDEX id_idx ON ids(id)"); | |
| 166 | |
| 167 print STDERR " # successfully built database $db\n" unless $config->{quiet}; | |
| 168 } | |
| 169 | |
| 170 sub _tmp_file_from_cache { | |
| 171 my ($self, $cache_dir, $tmp_handle) = @_; | |
| 172 my $config = $self->{config}; | |
| 173 | |
| 174 die("ERROR: Cache dir $cache_dir not found or not a directory\n") unless -d $cache_dir; | |
| 175 | |
| 176 # read info | |
| 177 open INFO, $cache_dir.'/info.txt' or die("ERROR: No info.txt file found in $cache_dir\n"); | |
| 178 | |
| 179 my %cols; | |
| 180 while(<INFO>) { | |
| 181 next unless /^variation_cols/; | |
| 182 chomp; | |
| 183 my @tmp_cols = split(',', (split("\t", $_))[1]); | |
| 184 $cols{$tmp_cols[$_]} = $_ for 0..$#tmp_cols; | |
| 185 last; | |
| 186 } | |
| 187 close INFO; | |
| 188 | |
| 189 # get all chromosome dirs | |
| 190 opendir DIR, $cache_dir or die("ERROR: Could not read dir $cache_dir\n"); | |
| 191 my @chrs = grep {-d $cache_dir.'/'.$_ && !/^\./} readdir DIR; | |
| 192 closedir DIR; | |
| 193 | |
| 194 foreach my $chr(@chrs) { | |
| 195 opendir CHR, $cache_dir.'/'.$chr; | |
| 196 my @all_files = grep {/var/ && !/\.(tb|cs)i$/} readdir CHR; | |
| 197 closedir CHR; | |
| 198 | |
| 199 my @files = grep {/all_vars/} @all_files; | |
| 200 @files = @all_files unless @files; | |
| 201 | |
| 202 foreach my $file(@files) { | |
| 203 my $fh = get_compressed_filehandle($cache_dir.'/'.$chr.'/'.$file, 1); | |
| 204 | |
| 205 my $delim; | |
| 206 | |
| 207 while(<$fh>) { | |
| 208 unless($delim) { | |
| 209 $delim = /\t/ ? "\t" : " "; | |
| 210 } | |
| 211 | |
| 212 chomp; | |
| 213 my @split = map {($_ || '') eq '.' ? undef : $_} split($delim); | |
| 214 | |
| 215 # id, chr, start, end, alleles, strand | |
| 216 print $tmp_handle join("|", | |
| 217 $split[$cols{variation_name}], | |
| 218 $chr, | |
| 219 $split[$cols{start}], | |
| 220 $split[$cols{end}] || $split[$cols{start}], | |
| 221 $split[$cols{allele_string}] || '', | |
| 222 $split[$cols{strand}] || 1, | |
| 223 )."\n"; | |
| 224 } | |
| 225 | |
| 226 close $fh; | |
| 227 } | |
| 228 } | |
| 229 } | |
| 230 | |
| 231 sub _tmp_file_from_var_db { | |
| 232 my ($self, $species, $tmp_handle) = @_; | |
| 233 my $config = $self->{config}; | |
| 234 | |
| 235 my $var_dbc = Bio::EnsEMBL::Registry->get_adaptor($species, 'variation', 'variation')->db->dbc; | |
| 236 | |
| 237 my $mysql = $var_dbc->prepare(qq{ | |
| 238 SELECT v.name, s.name, vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand | |
| 239 FROM variation v, variation_feature vf, seq_region s | |
| 240 WHERE v.variation_id = vf.variation_id | |
| 241 AND vf.seq_region_id = s.seq_region_id | |
| 242 }, {mysql_use_result => 1}); | |
| 243 | |
| 244 my ($i, $c, $s, $e, $a, $d); | |
| 245 $mysql->execute(); | |
| 246 $mysql->bind_columns(\$i, \$c, \$s, \$e, \$a, \$d); | |
| 247 print $tmp_handle join("|", ($i, $c, $s, $e, $a, $d))."\n" while $mysql->fetch(); | |
| 248 $mysql->finish(); | |
| 249 | |
| 250 # do synonyms | |
| 251 print STDERR "Processing synonyms\n" unless $config->{quiet}; | |
| 252 $mysql = $var_dbc->prepare(qq{ | |
| 253 SELECT v.name, s.name, vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand | |
| 254 FROM variation_synonym v, variation_feature vf, seq_region s | |
| 255 WHERE v.variation_id = vf.variation_id | |
| 256 AND vf.seq_region_id = s.seq_region_id | |
| 257 }, {mysql_use_result => 1}); | |
| 258 | |
| 259 $mysql->execute(); | |
| 260 $mysql->bind_columns(\$i, \$c, \$s, \$e, \$a, \$d); | |
| 261 print $tmp_handle join("|", ($i, $c, $s, $e, $a, $d))."\n" while $mysql->fetch(); | |
| 262 $mysql->finish(); | |
| 263 } | |
| 264 | |
| 265 sub run { | |
| 266 return {}; | |
| 267 } | |
| 268 | |
| 269 1; | |
| 270 | |
| 271 | |
| 272 | |
| 273 | |
| 274 ########################################### | |
| 275 ### Redefine methods in existing module ### | |
| 276 ########################################### | |
| 277 | |
| 278 package Bio::EnsEMBL::VEP::Parser::ID; | |
| 279 | |
| 280 no warnings qw(redefine); | |
| 281 sub new { | |
| 282 my $caller = shift; | |
| 283 my $class = ref($caller) || $caller; | |
| 284 | |
| 285 my $self = $class->SUPER::new(@_); | |
| 286 | |
| 287 return $self; | |
| 288 } | |
| 289 | |
| 290 sub create_VariationFeatures { | |
| 291 my $self = shift; | |
| 292 | |
| 293 my $parser = $self->parser; | |
| 294 $parser->next(); | |
| 295 | |
| 296 $self->skip_empty_lines(); | |
| 297 | |
| 298 return [] unless $parser->{record}; | |
| 299 | |
| 300 $self->line_number($self->line_number + 1); | |
| 301 | |
| 302 my $id = $parser->get_value; | |
| 303 | |
| 304 # remove whitespace | |
| 305 $id =~ s/\s+//g; | |
| 306 | |
| 307 my $db = $self->id_db; | |
| 308 my $sth = $self->{_id_sth} ||= $db->prepare("SELECT chr, start, end, alleles, strand FROM ids WHERE id = ?"); | |
| 309 my $ad = $self->{_var_ad} ||= $self->get_adaptor('variation', 'VariationFeature'); | |
| 310 | |
| 311 my @vfs; | |
| 312 my ($c, $s, $e, $a, $d); | |
| 313 $sth->execute($id); | |
| 314 $sth->bind_columns(\$c, \$s, \$e, \$a, \$d); | |
| 315 | |
| 316 push @vfs, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ | |
| 317 start => $s, | |
| 318 end => $e, | |
| 319 allele_string => $a, | |
| 320 strand => $d, | |
| 321 map_weight => 1, | |
| 322 adaptor => $ad, | |
| 323 variation_name => $id, | |
| 324 chr => $c, | |
| 325 }) while $sth->fetch; | |
| 326 | |
| 327 $sth->finish(); | |
| 328 | |
| 329 return $self->post_process_vfs(\@vfs); | |
| 330 } | |
| 331 | |
| 332 sub id_db { | |
| 333 my $self = shift; | |
| 334 | |
| 335 unless(exists($self->{_id_db})) { | |
| 336 throw("ERROR: ID database not defined or detected - possible plugin compile failure\n") unless my $db = $self->config->{_params}->{_localid_db_file}; | |
| 337 throw("ERROR: ID database file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db; | |
| 338 $self->{_id_db} = DBI->connect("dbi:SQLite:dbname=$db","",""); | |
| 339 } | |
| 340 | |
| 341 return $self->{_id_db}; | |
| 342 } | |
| 343 | |
| 344 1; |
