| 0 | 1 =head1 LICENSE | 
|  | 2 | 
|  | 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | 
|  | 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | 
|  | 5 | 
|  | 6 Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 7 you may not use this file except in compliance with the License. | 
|  | 8 You may obtain a copy of the License at | 
|  | 9 | 
|  | 10      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 11 | 
|  | 12 Unless required by applicable law or agreed to in writing, software | 
|  | 13 distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 15 See the License for the specific language governing permissions and | 
|  | 16 limitations under the License. | 
|  | 17 | 
|  | 18 =head1 CONTACT | 
|  | 19 | 
|  | 20  Ensembl <http://www.ensembl.org/info/about/contact/index.html> | 
|  | 21 | 
|  | 22 =cut | 
|  | 23 | 
|  | 24 =head1 NAME | 
|  | 25 | 
|  | 26  LocalID | 
|  | 27 | 
|  | 28 =head1 SYNOPSIS | 
|  | 29 | 
|  | 30  mv LocalID.pm ~/.vep/Plugins | 
|  | 31 | 
|  | 32  ## first run create database | 
|  | 33 | 
|  | 34  # EITHER create from Ensembl variation database | 
|  | 35  # VERY slow but includes variant synonyms, if not required see next command | 
|  | 36  ./vep -i variant_ids.txt --plugin LocalID,create_db=1 -safe | 
|  | 37 | 
|  | 38  # OR create from cache directory | 
|  | 39  # faster but does not include synonyms | 
|  | 40  # parameter passed to from_cache may be full path to cache e.g. $HOME/.vep/homo_sapiens/88_GRCh38 | 
|  | 41  # cache may be tabix converted or in default state (http://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#convert) | 
|  | 42  ./vep -i variant_ids.txt --plugin LocalID,create_db=1,from_cache=1 -safe | 
|  | 43 | 
|  | 44  # subsequent runs | 
|  | 45  ./vep -i variant_ids.txt --plugin LocalID | 
|  | 46 | 
|  | 47  # db file can be specified with db=[file] | 
|  | 48  # default file name is $HOME/.vep/[species]_[version]_[assembly].variant_ids.sqlite3 | 
|  | 49  ./vep -i variant_ids.txt --plugin LocalID,db=my_db_file.txt | 
|  | 50 | 
|  | 51 =head1 DESCRIPTION | 
|  | 52 | 
|  | 53  The LocalID plugin allows you to use variant IDs as input without making a database connection. | 
|  | 54 | 
|  | 55  Requires sqlite3. | 
|  | 56 | 
|  | 57  A local sqlite3 database is used to look up variant IDs; this is generated either from Ensembl's | 
|  | 58  public database (very slow, but includes synonyms), or from a VEP cache file (faster, excludes | 
|  | 59  synonyms). | 
|  | 60 | 
|  | 61  NB this plugin is NOT compatible with the ensembl-tools variant_effect_predictor.pl version of VEP. | 
|  | 62 | 
|  | 63 =cut | 
|  | 64 | 
|  | 65 package LocalID; | 
|  | 66 | 
|  | 67 use strict; | 
|  | 68 use warnings; | 
|  | 69 | 
|  | 70 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | 
|  | 71 use Bio::EnsEMBL::VEP::Parser::ID; | 
|  | 72 use Bio::EnsEMBL::VEP::Constants; | 
|  | 73 use Bio::EnsEMBL::VEP::Utils qw(get_compressed_filehandle); | 
|  | 74 | 
|  | 75 | 
|  | 76 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | 
|  | 77 | 
|  | 78 sub new { | 
|  | 79   my $class = shift; | 
|  | 80 | 
|  | 81   my $self = $class->SUPER::new(@_); | 
|  | 82 | 
|  | 83   my $param_hash = $self->params_to_hash(); | 
|  | 84   my $config     = $self->{config}; | 
|  | 85   my $species    = $config->{species} || 'homo_sapiens'; | 
|  | 86   my $db; | 
|  | 87 | 
|  | 88   unless($db = $param_hash->{db}) { | 
|  | 89     my $version = | 
|  | 90       $config->{db_version} || | 
|  | 91       $config->{cache_version} || | 
|  | 92       $Bio::EnsEMBL::VEP::Constants::VEP_VERSION || | 
|  | 93       'Bio::EnsEMBL::Registry'->software_version || | 
|  | 94       undef; | 
|  | 95     my $assembly = $config->{assembly}; | 
|  | 96     my $dir = $param_hash->{dir} || $config->{dir}; | 
|  | 97 | 
|  | 98     die("ERROR: Unable to determine software version - if using --offline, add --cache_version [version] or add the ID database name to your --plugin string as \"db=[file]\"\n") unless $version; | 
|  | 99     die("ERROR: Unable to determine assembly version - if using --offline, add --assembly [version] or add the ID database name to your --plugin string as \"db=[file]\"\n") unless $assembly; | 
|  | 100     $db = sprintf("%s/%s_%i_%s.variant_ids.sqlite3", $dir, $species, $version, $assembly); | 
|  | 101   } | 
|  | 102 | 
|  | 103   # create DB? | 
|  | 104   $self->create_db($db, $species, $param_hash) if $param_hash->{create_db}; | 
|  | 105 | 
|  | 106   die("ERROR: DB file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db; | 
|  | 107 | 
|  | 108   $self->config->{_localid_db_file} = $db; | 
|  | 109 | 
|  | 110   return $self; | 
|  | 111 } | 
|  | 112 | 
|  | 113 sub create_db { | 
|  | 114   my ($self, $db, $species, $param_hash) = @_; | 
|  | 115 | 
|  | 116   # requites sqlite3 command line tool | 
|  | 117   die("ERROR: sqlite3 command not found in path\n") unless `which sqlite3` =~ /\/sqlite3/; | 
|  | 118 | 
|  | 119   my $config = $self->{config}; | 
|  | 120 | 
|  | 121   die("ERROR: DB file $db already exists - remove and re-run to overwrite\n") if -e $db; | 
|  | 122 | 
|  | 123   print STDERR "## LocalID plugin\n # Creating database of variant IDs - this may take some time\n" unless $config->{quiet}; | 
|  | 124 | 
|  | 125   my $tmpfile = "$db.tmp$$"; | 
|  | 126   open my $tmp_handle, ">$tmpfile" or die "ERROR: Unable to write to $tmpfile\n"; | 
|  | 127 | 
|  | 128   if(my $cache_dir = $param_hash->{from_cache}) { | 
|  | 129 | 
|  | 130     # attempt to interpret cache dir from command line opts | 
|  | 131     if($cache_dir eq '1') { | 
|  | 132       my $version = | 
|  | 133         $config->{cache_version} || | 
|  | 134         $config->{db_version} || | 
|  | 135         $Bio::EnsEMBL::VEP::Constants::VEP_VERSION || | 
|  | 136         ($config->{reg} ? $config->{reg}->software_version : undef); | 
|  | 137       my $assembly = $config->{assembly}; | 
|  | 138       my $dir = $config->{dir_cache} || $config->{dir}; | 
|  | 139 | 
|  | 140       $cache_dir = "$dir\/$species\/$version\_$assembly"; | 
|  | 141     } | 
|  | 142 | 
|  | 143     print STDERR " # attempting to create from $cache_dir\n" unless $config->{quiet}; | 
|  | 144     $self->_tmp_file_from_cache($cache_dir, $tmp_handle); | 
|  | 145   } | 
|  | 146   else { | 
|  | 147     print STDERR " # attempting to create from variation database for $species\n" unless $config->{quiet}; | 
|  | 148     $self->_tmp_file_from_var_db($species, $tmp_handle); | 
|  | 149   } | 
|  | 150 | 
|  | 151   close $tmp_handle; | 
|  | 152 | 
|  | 153   # create database | 
|  | 154   my $dbh = DBI->connect("dbi:SQLite:dbname=$db","",""); | 
|  | 155   $dbh->do("CREATE TABLE ids(id, chr, start, end, alleles, strand)"); | 
|  | 156 | 
|  | 157   # load tmp file into table | 
|  | 158   print STDERR " # loading database\n" unless $config->{quiet}; | 
|  | 159   my $cmd = qq{sqlite3 $db '.import $tmpfile ids'}; | 
|  | 160   `$cmd 2>&1` and die("ERROR: Failed to import $tmpfile to $db\n"); | 
|  | 161   unlink($tmpfile); | 
|  | 162 | 
|  | 163   # index | 
|  | 164   print STDERR " # indexing database\n" unless $config->{quiet}; | 
|  | 165   $dbh->do("CREATE INDEX id_idx ON ids(id)"); | 
|  | 166 | 
|  | 167   print STDERR " # successfully built database $db\n" unless $config->{quiet}; | 
|  | 168 } | 
|  | 169 | 
|  | 170 sub _tmp_file_from_cache { | 
|  | 171   my ($self, $cache_dir, $tmp_handle) = @_; | 
|  | 172   my $config = $self->{config}; | 
|  | 173 | 
|  | 174   die("ERROR: Cache dir $cache_dir not found or not a directory\n") unless -d $cache_dir; | 
|  | 175 | 
|  | 176   # read info | 
|  | 177   open INFO, $cache_dir.'/info.txt' or die("ERROR: No info.txt file found in $cache_dir\n"); | 
|  | 178 | 
|  | 179   my %cols; | 
|  | 180   while(<INFO>) { | 
|  | 181     next unless /^variation_cols/; | 
|  | 182     chomp; | 
|  | 183     my @tmp_cols = split(',', (split("\t", $_))[1]); | 
|  | 184     $cols{$tmp_cols[$_]} = $_ for 0..$#tmp_cols; | 
|  | 185     last; | 
|  | 186   } | 
|  | 187   close INFO; | 
|  | 188 | 
|  | 189   # get all chromosome dirs | 
|  | 190   opendir DIR, $cache_dir or die("ERROR: Could not read dir $cache_dir\n"); | 
|  | 191   my @chrs = grep {-d $cache_dir.'/'.$_ && !/^\./} readdir DIR; | 
|  | 192   closedir DIR; | 
|  | 193 | 
|  | 194   foreach my $chr(@chrs) { | 
|  | 195     opendir CHR, $cache_dir.'/'.$chr; | 
|  | 196     my @all_files = grep {/var/ && !/\.(tb|cs)i$/} readdir CHR; | 
|  | 197     closedir CHR; | 
|  | 198 | 
|  | 199     my @files = grep {/all_vars/} @all_files; | 
|  | 200     @files = @all_files unless @files; | 
|  | 201 | 
|  | 202     foreach my $file(@files) { | 
|  | 203       my $fh = get_compressed_filehandle($cache_dir.'/'.$chr.'/'.$file, 1); | 
|  | 204 | 
|  | 205       my $delim; | 
|  | 206 | 
|  | 207       while(<$fh>) { | 
|  | 208         unless($delim) { | 
|  | 209           $delim = /\t/ ? "\t" : " "; | 
|  | 210         } | 
|  | 211 | 
|  | 212         chomp; | 
|  | 213         my @split = map {($_ || '') eq '.' ? undef : $_} split($delim); | 
|  | 214 | 
|  | 215         # id, chr, start, end, alleles, strand | 
|  | 216         print $tmp_handle join("|", | 
|  | 217           $split[$cols{variation_name}], | 
|  | 218           $chr, | 
|  | 219           $split[$cols{start}], | 
|  | 220           $split[$cols{end}] || $split[$cols{start}], | 
|  | 221           $split[$cols{allele_string}] || '', | 
|  | 222           $split[$cols{strand}] || 1, | 
|  | 223         )."\n"; | 
|  | 224       } | 
|  | 225 | 
|  | 226       close $fh; | 
|  | 227     } | 
|  | 228   } | 
|  | 229 } | 
|  | 230 | 
|  | 231 sub _tmp_file_from_var_db { | 
|  | 232   my ($self, $species, $tmp_handle) = @_; | 
|  | 233   my $config = $self->{config}; | 
|  | 234 | 
|  | 235   my $var_dbc = Bio::EnsEMBL::Registry->get_adaptor($species, 'variation', 'variation')->db->dbc; | 
|  | 236 | 
|  | 237   my $mysql = $var_dbc->prepare(qq{ | 
|  | 238     SELECT v.name, s.name, vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand | 
|  | 239     FROM variation v, variation_feature vf, seq_region s | 
|  | 240     WHERE v.variation_id = vf.variation_id | 
|  | 241     AND vf.seq_region_id = s.seq_region_id | 
|  | 242   }, {mysql_use_result => 1}); | 
|  | 243 | 
|  | 244   my ($i, $c, $s, $e, $a, $d); | 
|  | 245   $mysql->execute(); | 
|  | 246   $mysql->bind_columns(\$i, \$c, \$s, \$e, \$a, \$d); | 
|  | 247   print $tmp_handle join("|", ($i, $c, $s, $e, $a, $d))."\n" while $mysql->fetch(); | 
|  | 248   $mysql->finish(); | 
|  | 249 | 
|  | 250   # do synonyms | 
|  | 251   print STDERR "Processing synonyms\n" unless $config->{quiet}; | 
|  | 252   $mysql = $var_dbc->prepare(qq{ | 
|  | 253     SELECT v.name, s.name, vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand | 
|  | 254     FROM variation_synonym v, variation_feature vf, seq_region s | 
|  | 255     WHERE v.variation_id = vf.variation_id | 
|  | 256     AND vf.seq_region_id = s.seq_region_id | 
|  | 257   }, {mysql_use_result => 1}); | 
|  | 258 | 
|  | 259   $mysql->execute(); | 
|  | 260   $mysql->bind_columns(\$i, \$c, \$s, \$e, \$a, \$d); | 
|  | 261   print $tmp_handle join("|", ($i, $c, $s, $e, $a, $d))."\n" while $mysql->fetch(); | 
|  | 262   $mysql->finish(); | 
|  | 263 } | 
|  | 264 | 
|  | 265 sub run { | 
|  | 266   return {}; | 
|  | 267 } | 
|  | 268 | 
|  | 269 1; | 
|  | 270 | 
|  | 271 | 
|  | 272 | 
|  | 273 | 
|  | 274 ########################################### | 
|  | 275 ### Redefine methods in existing module ### | 
|  | 276 ########################################### | 
|  | 277 | 
|  | 278 package Bio::EnsEMBL::VEP::Parser::ID; | 
|  | 279 | 
|  | 280 no warnings qw(redefine); | 
|  | 281 sub new { | 
|  | 282   my $caller = shift; | 
|  | 283   my $class = ref($caller) || $caller; | 
|  | 284 | 
|  | 285   my $self = $class->SUPER::new(@_); | 
|  | 286 | 
|  | 287   return $self; | 
|  | 288 } | 
|  | 289 | 
|  | 290 sub create_VariationFeatures { | 
|  | 291   my $self = shift; | 
|  | 292 | 
|  | 293   my $parser = $self->parser; | 
|  | 294   $parser->next(); | 
|  | 295 | 
|  | 296   $self->skip_empty_lines(); | 
|  | 297 | 
|  | 298   return [] unless $parser->{record}; | 
|  | 299 | 
|  | 300   $self->line_number($self->line_number + 1); | 
|  | 301 | 
|  | 302   my $id = $parser->get_value; | 
|  | 303 | 
|  | 304   # remove whitespace | 
|  | 305   $id =~ s/\s+//g; | 
|  | 306 | 
|  | 307   my $db  = $self->id_db; | 
|  | 308   my $sth = $self->{_id_sth} ||= $db->prepare("SELECT chr, start, end, alleles, strand FROM ids WHERE id = ?"); | 
|  | 309   my $ad  = $self->{_var_ad} ||= $self->get_adaptor('variation', 'VariationFeature'); | 
|  | 310 | 
|  | 311   my @vfs; | 
|  | 312   my ($c, $s, $e, $a, $d); | 
|  | 313   $sth->execute($id); | 
|  | 314   $sth->bind_columns(\$c, \$s, \$e, \$a, \$d); | 
|  | 315 | 
|  | 316   push @vfs, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ | 
|  | 317     start          => $s, | 
|  | 318     end            => $e, | 
|  | 319     allele_string  => $a, | 
|  | 320     strand         => $d, | 
|  | 321     map_weight     => 1, | 
|  | 322     adaptor        => $ad, | 
|  | 323     variation_name => $id, | 
|  | 324     chr            => $c, | 
|  | 325   }) while $sth->fetch; | 
|  | 326 | 
|  | 327   $sth->finish(); | 
|  | 328 | 
|  | 329   return $self->post_process_vfs(\@vfs); | 
|  | 330 } | 
|  | 331 | 
|  | 332 sub id_db { | 
|  | 333   my $self = shift; | 
|  | 334 | 
|  | 335   unless(exists($self->{_id_db})) { | 
|  | 336     throw("ERROR: ID database not defined or detected - possible plugin compile failure\n") unless my $db = $self->config->{_params}->{_localid_db_file}; | 
|  | 337     throw("ERROR: ID database file $db not found - you need to download or create it first, see documentation in plugin file\n") unless -e $db; | 
|  | 338     $self->{_id_db} = DBI->connect("dbi:SQLite:dbname=$db","",""); | 
|  | 339   } | 
|  | 340 | 
|  | 341   return $self->{_id_db}; | 
|  | 342 } | 
|  | 343 | 
|  | 344 1; |