Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/variant_effect_predictor.pl @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 =head1 LICENSE | |
| 4 | |
| 5 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 6 Genome Research Limited. All rights reserved. | |
| 7 | |
| 8 This software is distributed under a modified Apache license. | |
| 9 For license details, please see | |
| 10 | |
| 11 http://www.ensembl.org/info/about/code_licence.html | |
| 12 | |
| 13 =head1 CONTACT | |
| 14 | |
| 15 Please email comments or questions to the public Ensembl | |
| 16 developers list at <dev@ensembl.org>. | |
| 17 | |
| 18 Questions may also be sent to the Ensembl help desk at | |
| 19 <helpdesk@ensembl.org>. | |
| 20 | |
| 21 =cut | |
| 22 | |
| 23 =head1 NAME | |
| 24 | |
| 25 Variant Effect Predictor - a script to predict the consequences of genomic variants | |
| 26 | |
| 27 http://www.ensembl.org/info/docs/variation/vep/vep_script.html | |
| 28 | |
| 29 Version 2.6 | |
| 30 | |
| 31 by Will McLaren (wm2@ebi.ac.uk) | |
| 32 =cut | |
| 33 | |
| 34 use strict; | |
| 35 use Getopt::Long; | |
| 36 use FileHandle; | |
| 37 use FindBin qw($Bin); | |
| 38 use lib $Bin; | |
| 39 | |
| 40 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); | |
| 41 use Bio::EnsEMBL::Variation::Utils::VEP qw( | |
| 42 parse_line | |
| 43 vf_to_consequences | |
| 44 validate_vf | |
| 45 convert_to_vcf | |
| 46 load_dumped_adaptor_cache | |
| 47 dump_adaptor_cache | |
| 48 get_all_consequences | |
| 49 get_slice | |
| 50 build_full_cache | |
| 51 read_cache_info | |
| 52 get_time | |
| 53 debug | |
| 54 @OUTPUT_COLS | |
| 55 @REG_FEAT_TYPES | |
| 56 %FILTER_SHORTCUTS | |
| 57 ); | |
| 58 | |
| 59 # global vars | |
| 60 my $VERSION = '2.6'; | |
| 61 | |
| 62 | |
| 63 # define headers that would normally go in the extra field | |
| 64 # keyed on the config parameter used to turn it on | |
| 65 my %extra_headers = ( | |
| 66 protein => ['ENSP'], | |
| 67 canonical => ['CANONICAL'], | |
| 68 ccds => ['CCDS'], | |
| 69 hgvs => ['HGVSc','HGVSp'], | |
| 70 hgnc => ['HGNC'], | |
| 71 sift => ['SIFT'], | |
| 72 polyphen => ['PolyPhen'], | |
| 73 numbers => ['EXON','INTRON'], | |
| 74 domains => ['DOMAINS'], | |
| 75 regulatory => ['MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE'], | |
| 76 cell_type => ['CELL_TYPE'], | |
| 77 individual => ['IND'], | |
| 78 xref_refseq => ['RefSeq'], | |
| 79 check_svs => ['SV'], | |
| 80 check_frequency => ['FREQS'], | |
| 81 gmaf => ['GMAF'], | |
| 82 user => ['DISTANCE'], | |
| 83 ); | |
| 84 | |
| 85 my %extra_descs = ( | |
| 86 'CANONICAL' => 'Indicates if transcript is canonical for this gene', | |
| 87 'CCDS' => 'Indicates if transcript is a CCDS transcript', | |
| 88 'HGNC' => 'HGNC gene identifier', | |
| 89 'ENSP' => 'Ensembl protein identifer', | |
| 90 'HGVSc' => 'HGVS coding sequence name', | |
| 91 'HGVSp' => 'HGVS protein sequence name', | |
| 92 'SIFT' => 'SIFT prediction', | |
| 93 'PolyPhen' => 'PolyPhen prediction', | |
| 94 'EXON' => 'Exon number(s) / total', | |
| 95 'INTRON' => 'Intron number(s) / total', | |
| 96 'DOMAINS' => 'The source and identifer of any overlapping protein domains', | |
| 97 'MOTIF_NAME' => 'The source and identifier of a transcription factor binding profile (TFBP) aligned at this position', | |
| 98 'MOTIF_POS' => 'The relative position of the variation in the aligned TFBP', | |
| 99 'HIGH_INF_POS' => 'A flag indicating if the variant falls in a high information position of the TFBP', | |
| 100 'MOTIF_SCORE_CHANGE' => 'The difference in motif score of the reference and variant sequences for the TFBP', | |
| 101 'CELL_TYPE' => 'List of cell types and classifications for regulatory feature', | |
| 102 'IND' => 'Individual name', | |
| 103 'SV' => 'IDs of overlapping structural variants', | |
| 104 'FREQS' => 'Frequencies of overlapping variants used in filtering', | |
| 105 'GMAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1', | |
| 106 'DISTANCE' => 'Shortest distance from variant to transcript', | |
| 107 ); | |
| 108 | |
| 109 # set output autoflush for progress bars | |
| 110 $| = 1; | |
| 111 | |
| 112 # configure from command line opts | |
| 113 my $config = &configure(scalar @ARGV); | |
| 114 | |
| 115 # run the main sub routine | |
| 116 &main($config); | |
| 117 | |
| 118 # this is the main sub-routine - it needs the configured $config hash | |
| 119 sub main { | |
| 120 my $config = shift; | |
| 121 | |
| 122 debug("Starting...") unless defined $config->{quiet}; | |
| 123 | |
| 124 $config->{start_time} = time(); | |
| 125 $config->{last_time} = time(); | |
| 126 | |
| 127 my $tr_cache = {}; | |
| 128 my $rf_cache = {}; | |
| 129 | |
| 130 # create a hash to hold slices so we don't get the same one twice | |
| 131 my %slice_cache = (); | |
| 132 | |
| 133 my @vfs; | |
| 134 my ($vf_count, $total_vf_count); | |
| 135 my $in_file_handle = $config->{in_file_handle}; | |
| 136 | |
| 137 # initialize line number in config | |
| 138 $config->{line_number} = 0; | |
| 139 | |
| 140 # read the file | |
| 141 while(<$in_file_handle>) { | |
| 142 chomp; | |
| 143 | |
| 144 $config->{line_number}++; | |
| 145 | |
| 146 # header line? | |
| 147 if(/^\#/) { | |
| 148 | |
| 149 # retain header lines if we are outputting VCF | |
| 150 if(defined($config->{vcf})) { | |
| 151 push @{$config->{headers}}, $_; | |
| 152 } | |
| 153 | |
| 154 # line with sample labels in VCF | |
| 155 if(defined($config->{individual}) && /^#CHROM/) { | |
| 156 my @split = split /\s+/; | |
| 157 | |
| 158 # no individuals | |
| 159 die("ERROR: No individual data found in VCF\n") if scalar @split <= 9; | |
| 160 | |
| 161 # get individual column indices | |
| 162 my %ind_cols = map {$split[$_] => $_} (9..$#split); | |
| 163 | |
| 164 # all? | |
| 165 if(scalar @{$config->{individual}} == 1 && $config->{individual}->[0] =~ /^all$/i) { | |
| 166 $config->{ind_cols} = \%ind_cols; | |
| 167 } | |
| 168 else { | |
| 169 my %new_ind_cols; | |
| 170 | |
| 171 # check we have specified individual(s) | |
| 172 foreach my $ind(@{$config->{individual}}) { | |
| 173 die("ERROR: Individual named \"$ind\" not found in VCF\n") unless defined $ind_cols{$ind}; | |
| 174 $new_ind_cols{$ind} = $ind_cols{$ind}; | |
| 175 } | |
| 176 | |
| 177 $config->{ind_cols} = \%new_ind_cols; | |
| 178 } | |
| 179 } | |
| 180 | |
| 181 next; | |
| 182 } | |
| 183 | |
| 184 # configure output file | |
| 185 $config->{out_file_handle} ||= &get_out_file_handle($config); | |
| 186 | |
| 187 # some lines (pileup) may actually parse out into more than one variant | |
| 188 foreach my $vf(@{&parse_line($config, $_)}) { | |
| 189 | |
| 190 $vf->{_line} = $_ ;#if defined($config->{vcf}) || defined($config->{original}); | |
| 191 | |
| 192 # now get the slice | |
| 193 if(!defined($vf->{slice})) { | |
| 194 my $slice; | |
| 195 | |
| 196 # don't get slices if we're using cache | |
| 197 # we can steal them from transcript objects later | |
| 198 if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) { | |
| 199 | |
| 200 # check if we have fetched this slice already | |
| 201 if(defined $slice_cache{$vf->{chr}}) { | |
| 202 $slice = $slice_cache{$vf->{chr}}; | |
| 203 } | |
| 204 | |
| 205 # if not create a new one | |
| 206 else { | |
| 207 | |
| 208 $slice = &get_slice($config, $vf->{chr}); | |
| 209 | |
| 210 # if failed, warn and skip this line | |
| 211 if(!defined($slice)) { | |
| 212 warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet}; | |
| 213 next; | |
| 214 } | |
| 215 | |
| 216 # store the hash | |
| 217 $slice_cache{$vf->{chr}} = $slice; | |
| 218 } | |
| 219 } | |
| 220 | |
| 221 $vf->{slice} = $slice; | |
| 222 } | |
| 223 | |
| 224 # validate the VF | |
| 225 next unless validate_vf($config, $vf); | |
| 226 | |
| 227 # make a name if one doesn't exist | |
| 228 $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.($vf->{allele_string} || $vf->{class_SO_term}); | |
| 229 | |
| 230 # jump out to convert here | |
| 231 if(defined($config->{convert})) { | |
| 232 &convert_vf($config, $vf); | |
| 233 next; | |
| 234 } | |
| 235 | |
| 236 if(defined $config->{whole_genome}) { | |
| 237 push @vfs, $vf; | |
| 238 $vf_count++; | |
| 239 $total_vf_count++; | |
| 240 | |
| 241 if($vf_count == $config->{buffer_size}) { | |
| 242 debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); | |
| 243 | |
| 244 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; | |
| 245 | |
| 246 # calculate stats | |
| 247 my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); | |
| 248 my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); | |
| 249 $config->{last_time} = time(); | |
| 250 | |
| 251 debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); | |
| 252 | |
| 253 @vfs = (); | |
| 254 $vf_count = 0; | |
| 255 } | |
| 256 } | |
| 257 else { | |
| 258 print_line($config, $_) foreach @{vf_to_consequences($config, $vf)}; | |
| 259 $vf_count++; | |
| 260 $total_vf_count++; | |
| 261 debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose}); | |
| 262 } | |
| 263 } | |
| 264 } | |
| 265 | |
| 266 # if in whole-genome mode, finish off the rest of the buffer | |
| 267 if(defined $config->{whole_genome} && scalar @vfs) { | |
| 268 debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); | |
| 269 | |
| 270 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; | |
| 271 | |
| 272 # calculate stats | |
| 273 my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); | |
| 274 my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); | |
| 275 $config->{last_time} = time(); | |
| 276 | |
| 277 debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); | |
| 278 | |
| 279 debug($config->{filter_count}, "/$total_vf_count variants remain after filtering") if defined($config->{filter}) && !defined($config->{quiet}); | |
| 280 } | |
| 281 | |
| 282 debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet}); | |
| 283 | |
| 284 debug("Finished!") unless defined $config->{quiet}; | |
| 285 } | |
| 286 | |
| 287 # sets up configuration hash that is used throughout the script | |
| 288 sub configure { | |
| 289 my $args = shift; | |
| 290 | |
| 291 my $config = {}; | |
| 292 | |
| 293 GetOptions( | |
| 294 $config, | |
| 295 'help', # displays help message | |
| 296 | |
| 297 # input options, | |
| 298 'config=s', # config file name | |
| 299 'input_file|i=s', # input file name | |
| 300 'format=s', # input file format | |
| 301 | |
| 302 # DB options | |
| 303 'species=s', # species e.g. human, homo_sapiens | |
| 304 'registry=s', # registry file | |
| 305 'host=s', # database host | |
| 306 'port=s', # database port | |
| 307 'user=s', # database user name | |
| 308 'password=s', # database password | |
| 309 'db_version=i', # Ensembl database version to use e.g. 62 | |
| 310 'genomes', # automatically sets DB params for e!Genomes | |
| 311 'refseq', # use otherfeatures RefSeq DB instead of Ensembl | |
| 312 #'no_disconnect', # disables disconnect_when_inactive | |
| 313 | |
| 314 # runtime options | |
| 315 'most_severe', # only return most severe consequence | |
| 316 'summary', # only return one line per variation with all consquence types | |
| 317 'per_gene', # only return most severe per gene | |
| 318 'buffer_size=i', # number of variations to read in before analysis | |
| 319 'chunk_size=s', # size in bases of "chunks" used in internal hash structure | |
| 320 'failed=i', # include failed variations when finding existing | |
| 321 'no_whole_genome', # disables now default whole-genome mode | |
| 322 'whole_genome', # proxy for whole genome mode - now just warns user | |
| 323 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG) | |
| 324 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT | |
| 325 'check_ref', # check supplied reference allele against DB | |
| 326 'check_existing', # find existing co-located variations | |
| 327 'check_svs', # find overlapping structural variations | |
| 328 'check_alleles', # only attribute co-located if alleles are the same | |
| 329 'check_frequency', # enable frequency checking | |
| 330 'gmaf', # add global MAF of existing var | |
| 331 'freq_filter=s', # exclude or include | |
| 332 'freq_freq=f', # frequency to filter on | |
| 333 'freq_gt_lt=s', # gt or lt (greater than or less than) | |
| 334 'freq_pop=s', # population to filter on | |
| 335 'allow_non_variant', # allow non-variant VCF lines through | |
| 336 'individual=s', # give results by genotype for individuals | |
| 337 'phased', # force VCF genotypes to be interpreted as phased | |
| 338 'fork=i', # fork into N processes | |
| 339 | |
| 340 # verbosity options | |
| 341 'verbose|v', # print out a bit more info while running | |
| 342 'quiet', # print nothing to STDOUT (unless using -o stdout) | |
| 343 'no_progress', # don't display progress bars | |
| 344 | |
| 345 # output options | |
| 346 'everything|e', # switch on EVERYTHING :-) | |
| 347 'output_file|o=s', # output file name | |
| 348 'force_overwrite', # force overwrite of output file if already exists | |
| 349 'terms|t=s', # consequence terms to use e.g. NCBI, SO | |
| 350 'coding_only', # only return results for consequences in coding regions | |
| 351 'canonical', # indicates if transcript is canonical | |
| 352 'ccds', # output CCDS identifer | |
| 353 'xref_refseq', # output refseq mrna xref | |
| 354 'protein', # add e! protein ID to extra column | |
| 355 'hgnc', # add HGNC gene ID to extra column | |
| 356 'hgvs', # add HGVS names to extra column | |
| 357 'sift=s', # SIFT predictions | |
| 358 'polyphen=s', # PolyPhen predictions | |
| 359 'condel=s', # Condel predictions | |
| 360 'regulatory', # enable regulatory stuff | |
| 361 'cell_type=s', # filter cell types for regfeats | |
| 362 'convert=s', # convert input to another format (doesn't run VEP) | |
| 363 'filter=s', # run in filtering mode | |
| 364 'no_intergenic', # don't print out INTERGENIC consequences | |
| 365 'gvf', # produce gvf output | |
| 366 'vcf', # produce vcf output | |
| 367 'original', # produce output in input format | |
| 368 'no_consequences', # don't calculate consequences | |
| 369 'lrg', # enable LRG-based features | |
| 370 'fields=s', # define your own output fields | |
| 371 'domains', # output overlapping protein features | |
| 372 'numbers', # include exon and intron numbers | |
| 373 | |
| 374 # cache stuff | |
| 375 'cache', # use cache | |
| 376 'write_cache', # enables writing to the cache | |
| 377 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs | |
| 378 'no_adaptor_cache', # don't write adaptor cache | |
| 379 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript | |
| 380 'strip', # strips adaptors etc from objects before caching them | |
| 381 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more | |
| 382 'dir=s', # dir where cache is found (defaults to $HOME/.vep/) | |
| 383 'cache_region_size=i', # size of region in bases for each cache file | |
| 384 'no_slice_cache', # tell API not to cache features on slice | |
| 385 'standalone', # standalone renamed offline | |
| 386 'offline', # offline mode uses minimal set of modules installed in same dir, no DB connection | |
| 387 'skip_db_check', # don't compare DB parameters with cached | |
| 388 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc" | |
| 389 'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped file with annotation | |
| 390 'tmpdir=s', # tmp dir used for BigWig retrieval | |
| 391 'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory | |
| 392 | |
| 393 # debug | |
| 394 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL | |
| 395 'count_queries', # counts SQL queries executed | |
| 396 'admin', # allows me to build off public hosts | |
| 397 'debug', # print out debug info | |
| 398 'tabix', # experimental use tabix cache files | |
| 399 ) or die "ERROR: Failed to parse command-line flags\n"; | |
| 400 | |
| 401 # print usage message if requested or no args supplied | |
| 402 if(defined($config->{help}) || !$args) { | |
| 403 &usage; | |
| 404 exit(0); | |
| 405 } | |
| 406 | |
| 407 # dir is where the cache and plugins live | |
| 408 $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep'); | |
| 409 | |
| 410 # dir gets set to the specific cache directory later on, so take a copy to use | |
| 411 # when configuring plugins | |
| 412 | |
| 413 $config->{toplevel_dir} = $config->{dir}; | |
| 414 | |
| 415 # ini file? | |
| 416 my $ini_file = $config->{dir}.'/vep.ini'; | |
| 417 | |
| 418 if(-e $ini_file) { | |
| 419 read_config_from_file($config, $ini_file); | |
| 420 } | |
| 421 | |
| 422 # config file? | |
| 423 if(defined $config->{config}) { | |
| 424 read_config_from_file($config, $config->{config}); | |
| 425 } | |
| 426 | |
| 427 # can't be both quiet and verbose | |
| 428 die "ERROR: Can't be both quiet and verbose!\n" if defined($config->{quiet}) && defined($config->{verbose}); | |
| 429 | |
| 430 # check forking | |
| 431 if(defined($config->{fork})) { | |
| 432 die "ERROR: Fork number must be greater than 1\n" if $config->{fork} <= 1; | |
| 433 | |
| 434 # check we can use MIME::Base64 | |
| 435 eval q{ use MIME::Base64; }; | |
| 436 | |
| 437 if($@) { | |
| 438 debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet}); | |
| 439 delete $config->{fork}; | |
| 440 } | |
| 441 else { | |
| 442 | |
| 443 # try a practice fork | |
| 444 my $pid = fork; | |
| 445 | |
| 446 if(!defined($pid)) { | |
| 447 debug("WARNING: Fork test failed, forking disabled") unless defined($config->{quiet}); | |
| 448 delete $config->{fork}; | |
| 449 } | |
| 450 elsif($pid) { | |
| 451 waitpid($pid, 0); | |
| 452 } | |
| 453 elsif($pid == 0) { | |
| 454 exit(0); | |
| 455 } | |
| 456 } | |
| 457 } | |
| 458 | |
| 459 # check file format | |
| 460 if(defined $config->{format}) { | |
| 461 die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /^(pileup|vcf|guess|hgvs|ensembl|id|vep)$/i; | |
| 462 } | |
| 463 | |
| 464 # check convert format | |
| 465 if(defined $config->{convert}) { | |
| 466 die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup|hgvs/i; | |
| 467 } | |
| 468 | |
| 469 # check if user still using --standalone | |
| 470 if(defined $config->{standalone}) { | |
| 471 die "ERROR: --standalone replaced by --offline\n"; | |
| 472 } | |
| 473 | |
| 474 # connection settings for Ensembl Genomes | |
| 475 if($config->{genomes}) { | |
| 476 $config->{host} ||= 'mysql.ebi.ac.uk'; | |
| 477 $config->{port} ||= 4157; | |
| 478 } | |
| 479 | |
| 480 # connection settings for main Ensembl | |
| 481 else { | |
| 482 $config->{species} ||= "homo_sapiens"; | |
| 483 $config->{host} ||= 'ensembldb.ensembl.org'; | |
| 484 $config->{port} ||= 5306; | |
| 485 } | |
| 486 | |
| 487 # refseq or core? | |
| 488 if(defined($config->{refseq})) { | |
| 489 $config->{core_type} = 'otherfeatures'; | |
| 490 } | |
| 491 else { | |
| 492 $config->{core_type} = 'core'; | |
| 493 } | |
| 494 | |
| 495 # output term | |
| 496 if(defined $config->{terms}) { | |
| 497 die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i; | |
| 498 if($config->{terms} =~ /ensembl|display/i) { | |
| 499 $config->{terms} = 'display'; | |
| 500 } | |
| 501 else { | |
| 502 $config->{terms} = uc($config->{terms}); | |
| 503 } | |
| 504 } | |
| 505 | |
| 506 # everything? | |
| 507 if(defined($config->{everything})) { | |
| 508 my %everything = ( | |
| 509 sift => 'b', | |
| 510 polyphen => 'b', | |
| 511 ccds => 1, | |
| 512 hgvs => 1, | |
| 513 hgnc => 1, | |
| 514 numbers => 1, | |
| 515 domains => 1, | |
| 516 regulatory => 1, | |
| 517 canonical => 1, | |
| 518 protein => 1, | |
| 519 gmaf => 1, | |
| 520 ); | |
| 521 | |
| 522 $config->{$_} = $everything{$_} for keys %everything; | |
| 523 | |
| 524 # these ones won't work with offline | |
| 525 delete $config->{hgvs} if defined($config->{offline}); | |
| 526 } | |
| 527 | |
| 528 # check nsSNP tools | |
| 529 foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) { | |
| 530 die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/; | |
| 531 | |
| 532 die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i; | |
| 533 | |
| 534 die "ERROR: $tool functionality is now available as a VEP Plugin - see http://www.ensembl.org/info/docs/variation/vep/vep_script.html#plugins\n" if $tool eq 'Condel'; | |
| 535 } | |
| 536 | |
| 537 # force quiet if outputting to STDOUT | |
| 538 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { | |
| 539 delete $config->{verbose} if defined($config->{verbose}); | |
| 540 $config->{quiet} = 1; | |
| 541 } | |
| 542 | |
| 543 # individual(s) specified? | |
| 544 if(defined($config->{individual})) { | |
| 545 $config->{individual} = [split /\,/, $config->{individual}]; | |
| 546 | |
| 547 # force allow_non_variant | |
| 548 $config->{allow_non_variant} = 1; | |
| 549 } | |
| 550 | |
| 551 # summarise options if verbose | |
| 552 if(defined $config->{verbose}) { | |
| 553 my $header =<<INTRO; | |
| 554 #----------------------------------# | |
| 555 # ENSEMBL VARIANT EFFECT PREDICTOR # | |
| 556 #----------------------------------# | |
| 557 | |
| 558 version $VERSION | |
| 559 | |
| 560 By Will McLaren (wm2\@ebi.ac.uk) | |
| 561 | |
| 562 Configuration options: | |
| 563 | |
| 564 INTRO | |
| 565 print $header; | |
| 566 | |
| 567 my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1]; | |
| 568 | |
| 569 foreach my $key(sort keys %$config) { | |
| 570 next if ref($config->{$key}) eq 'ARRAY' && scalar @{$config->{$key}} == 0; | |
| 571 print $key.(' ' x (($max_length - length($key)) + 4)).(ref($config->{$key}) eq 'ARRAY' ? join "\t", @{$config->{$key}} : $config->{$key})."\n"; | |
| 572 } | |
| 573 | |
| 574 print "\n".("-" x 20)."\n\n"; | |
| 575 } | |
| 576 | |
| 577 # check custom annotations | |
| 578 for my $i(0..$#{$config->{custom}}) { | |
| 579 my $custom = $config->{custom}->[$i]; | |
| 580 | |
| 581 my ($filepath, $shortname, $format, $type, $coords) = split /\,/, $custom; | |
| 582 $type ||= 'exact'; | |
| 583 $format ||= 'bed'; | |
| 584 $coords ||= 0; | |
| 585 | |
| 586 # check type | |
| 587 die "ERROR: Type $type for custom annotation file $filepath is not allowed (must be one of \"exact\", \"overlap\")\n" unless $type =~ /exact|overlap/; | |
| 588 | |
| 589 # check format | |
| 590 die "ERROR: Format $format for custom annotation file $filepath is not allowed (must be one of \"bed\", \"vcf\", \"gtf\", \"gff\", \"bigwig\")\n" unless $format =~ /bed|vcf|gff|gtf|bigwig/; | |
| 591 | |
| 592 # bigwig format | |
| 593 if($format eq 'bigwig') { | |
| 594 # check for bigWigToWig | |
| 595 die "ERROR: bigWigToWig does not seem to be in your path - this is required to use bigwig format custom annotations\n" unless `which bigWigToWig 2>&1` =~ /bigWigToWig$/; | |
| 596 } | |
| 597 | |
| 598 else { | |
| 599 # check for tabix | |
| 600 die "ERROR: tabix does not seem to be in your path - this is required to use custom annotations\n" unless `which tabix 2>&1` =~ /tabix$/; | |
| 601 | |
| 602 # remote files? | |
| 603 if($filepath =~ /tp\:\/\//) { | |
| 604 my $remote_test = `tabix $filepath 1:1-1 2>&1`; | |
| 605 if($remote_test =~ /fail/) { | |
| 606 die "$remote_test\nERROR: Could not find file or index file for remote annotation file $filepath\n"; | |
| 607 } | |
| 608 elsif($remote_test =~ /get_local_version/) { | |
| 609 debug("Downloaded tabix index file for remote annotation file $filepath") unless defined($config->{quiet}); | |
| 610 } | |
| 611 } | |
| 612 | |
| 613 # check files exist | |
| 614 else { | |
| 615 die "ERROR: Custom annotation file $filepath not found\n" unless -e $filepath; | |
| 616 die "ERROR: Tabix index file $filepath\.tbi not found - perhaps you need to create it first?\n" unless -e $filepath.'.tbi'; | |
| 617 } | |
| 618 } | |
| 619 | |
| 620 $config->{custom}->[$i] = { | |
| 621 'file' => $filepath, | |
| 622 'name' => $shortname || 'CUSTOM'.($i + 1), | |
| 623 'type' => $type, | |
| 624 'format' => $format, | |
| 625 'coords' => $coords, | |
| 626 }; | |
| 627 } | |
| 628 | |
| 629 # check if using filter and original | |
| 630 die "ERROR: You must also provide output filters using --filter to use --original\n" if defined($config->{original}) && !defined($config->{filter}); | |
| 631 | |
| 632 # filter by consequence? | |
| 633 if(defined($config->{filter})) { | |
| 634 | |
| 635 my %filters = map {$_ => 1} split /\,/, $config->{filter}; | |
| 636 | |
| 637 # add in shortcuts | |
| 638 foreach my $filter(keys %filters) { | |
| 639 my $value = 1; | |
| 640 if($filter =~ /^no_/) { | |
| 641 delete $filters{$filter}; | |
| 642 $filter =~ s/^no_//g; | |
| 643 $value = 0; | |
| 644 $filters{$filter} = $value; | |
| 645 } | |
| 646 | |
| 647 if(defined($FILTER_SHORTCUTS{$filter})) { | |
| 648 delete $filters{$filter}; | |
| 649 $filters{$_} = $value for keys %{$FILTER_SHORTCUTS{$filter}}; | |
| 650 } | |
| 651 } | |
| 652 | |
| 653 $config->{filter} = \%filters; | |
| 654 | |
| 655 $config->{filter_count} = 0; | |
| 656 } | |
| 657 | |
| 658 # set defaults | |
| 659 $config->{user} ||= 'anonymous'; | |
| 660 $config->{buffer_size} ||= 5000; | |
| 661 $config->{chunk_size} ||= '50kb'; | |
| 662 $config->{output_file} ||= "variant_effect_output.txt"; | |
| 663 $config->{tmpdir} ||= '/tmp'; | |
| 664 $config->{format} ||= 'guess'; | |
| 665 $config->{terms} ||= 'SO'; | |
| 666 $config->{cache_region_size} ||= 1000000; | |
| 667 $config->{compress} ||= 'zcat'; | |
| 668 | |
| 669 # regulatory has to be on for cell_type | |
| 670 if(defined($config->{cell_type})) { | |
| 671 $config->{regulatory} = 1; | |
| 672 $config->{cell_type} = [split /\,/, $config->{cell_type}] if defined($config->{cell_type}); | |
| 673 } | |
| 674 | |
| 675 # can't use a whole bunch of options with most_severe | |
| 676 if(defined($config->{most_severe})) { | |
| 677 foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains summary)) { | |
| 678 die "ERROR: --most_severe is not compatible with --$flag\n" if defined($config->{$flag}); | |
| 679 } | |
| 680 } | |
| 681 | |
| 682 # can't use a whole bunch of options with summary | |
| 683 if(defined($config->{summary})) { | |
| 684 foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains most_severe)) { | |
| 685 die "ERROR: --summary is not compatible with --$flag\n" if defined($config->{$flag}); | |
| 686 } | |
| 687 } | |
| 688 | |
| 689 # frequency filtering | |
| 690 if(defined($config->{check_frequency})) { | |
| 691 foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) { | |
| 692 die "ERROR: To use --check_frequency you must also specify flag --$flag\n" unless defined $config->{$flag}; | |
| 693 } | |
| 694 | |
| 695 # need to set check_existing | |
| 696 $config->{check_existing} = 1; | |
| 697 } | |
| 698 | |
| 699 $config->{check_existing} = 1 if defined $config->{check_alleles} || defined $config->{gmaf}; | |
| 700 | |
| 701 # warn users still using whole_genome flag | |
| 702 if(defined($config->{whole_genome})) { | |
| 703 debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet}); | |
| 704 } | |
| 705 | |
| 706 $config->{whole_genome} = 1 unless defined $config->{no_whole_genome}; | |
| 707 $config->{failed} = 0 unless defined $config->{failed}; | |
| 708 $config->{chunk_size} =~ s/mb?/000000/i; | |
| 709 $config->{chunk_size} =~ s/kb?/000/i; | |
| 710 $config->{cache_region_size} =~ s/mb?/000000/i; | |
| 711 $config->{cache_region_size} =~ s/kb?/000/i; | |
| 712 | |
| 713 # cluck and display executed SQL? | |
| 714 $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck}); | |
| 715 | |
| 716 # offline needs cache, can't use HGVS | |
| 717 if(defined($config->{offline})) { | |
| 718 $config->{cache} = 1; | |
| 719 | |
| 720 #die("ERROR: Cannot generate HGVS coordinates in offline mode\n") if defined($config->{hgvs}); | |
| 721 die("ERROR: Cannot use HGVS as input in offline mode\n") if $config->{format} eq 'hgvs'; | |
| 722 die("ERROR: Cannot use variant identifiers as input in offline mode\n") if $config->{format} eq 'id'; | |
| 723 die("ERROR: Cannot do frequency filtering in offline mode\n") if defined($config->{check_frequency}); | |
| 724 die("ERROR: Cannot retrieve overlapping structural variants in offline mode\n") if defined($config->{check_sv}); | |
| 725 } | |
| 726 | |
| 727 # write_cache needs cache | |
| 728 $config->{cache} = 1 if defined $config->{write_cache}; | |
| 729 | |
| 730 # no_slice_cache, prefetch and whole_genome have to be on to use cache | |
| 731 if(defined($config->{cache})) { | |
| 732 $config->{prefetch} = 1; | |
| 733 $config->{no_slice_cache} = 1; | |
| 734 $config->{whole_genome} = 1; | |
| 735 $config->{strip} = 1; | |
| 736 } | |
| 737 | |
| 738 $config->{build} = $config->{rebuild} if defined($config->{rebuild}); | |
| 739 | |
| 740 # force options for full build | |
| 741 if(defined($config->{build})) { | |
| 742 $config->{prefetch} = 1; | |
| 743 $config->{hgnc} = 1; | |
| 744 $config->{no_slice_cache} = 1; | |
| 745 $config->{cache} = 1; | |
| 746 $config->{strip} = 1; | |
| 747 $config->{write_cache} = 1; | |
| 748 $config->{cell_type} = 1 if defined($config->{regulatory}); | |
| 749 } | |
| 750 | |
| 751 # connect to databases | |
| 752 $config->{reg} = &connect_to_dbs($config); | |
| 753 | |
| 754 # complete dir with species name and db_version | |
| 755 $config->{dir} .= '/'.( | |
| 756 join '/', ( | |
| 757 defined($config->{offline}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}), | |
| 758 $config->{db_version} || $config->{reg}->software_version | |
| 759 ) | |
| 760 ); | |
| 761 | |
| 762 # warn user cache directory doesn't exist | |
| 763 if(!-e $config->{dir}) { | |
| 764 | |
| 765 # if using write_cache | |
| 766 if(defined($config->{write_cache})) { | |
| 767 debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet}); | |
| 768 } | |
| 769 | |
| 770 # want to read cache, not found | |
| 771 elsif(defined($config->{cache})) { | |
| 772 die("ERROR: Cache directory ", $config->{dir}, " not found"); | |
| 773 } | |
| 774 } | |
| 775 | |
| 776 if(defined($config->{cache})) { | |
| 777 # read cache info | |
| 778 if(read_cache_info($config)) { | |
| 779 debug("Read existing cache info") unless defined $config->{quiet}; | |
| 780 } | |
| 781 } | |
| 782 | |
| 783 # we configure plugins here because they can sometimes switch on the | |
| 784 # regulatory config option | |
| 785 configure_plugins($config); | |
| 786 | |
| 787 # include regulatory modules if requested | |
| 788 if(defined($config->{regulatory})) { | |
| 789 # do the use statements here so that users don't have to have the | |
| 790 # funcgen API installed to use the rest of the script | |
| 791 eval q{ | |
| 792 use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor; | |
| 793 use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor; | |
| 794 use Bio::EnsEMBL::Funcgen::MotifFeature; | |
| 795 use Bio::EnsEMBL::Funcgen::RegulatoryFeature; | |
| 796 use Bio::EnsEMBL::Funcgen::BindingMatrix; | |
| 797 }; | |
| 798 | |
| 799 if($@) { | |
| 800 die("ERROR: Ensembl Funcgen API must be installed to use --regulatory or plugins that deal with regulatory features\n"); | |
| 801 } | |
| 802 } | |
| 803 | |
| 804 # user defined custom output fields | |
| 805 if(defined($config->{fields})) { | |
| 806 $config->{fields} = [split ',', $config->{fields}]; | |
| 807 debug("Output fields redefined (".scalar @{$config->{fields}}." defined)") unless defined($config->{quiet}); | |
| 808 $config->{fields_redefined} = 1; | |
| 809 } | |
| 810 $config->{fields} ||= \@OUTPUT_COLS; | |
| 811 | |
| 812 # suppress warnings that the FeatureAdpators spit if using no_slice_cache | |
| 813 Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache}); | |
| 814 | |
| 815 # get adaptors (don't get them in offline mode) | |
| 816 unless(defined($config->{offline})) { | |
| 817 | |
| 818 if(defined($config->{cache}) && !defined($config->{write_cache})) { | |
| 819 | |
| 820 # try and load adaptors from cache | |
| 821 if(!&load_dumped_adaptor_cache($config)) { | |
| 822 &get_adaptors($config); | |
| 823 &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); | |
| 824 } | |
| 825 | |
| 826 # check cached adaptors match DB params | |
| 827 else { | |
| 828 my $dbc = $config->{sa}->{dbc}; | |
| 829 | |
| 830 my $ok = 1; | |
| 831 | |
| 832 if($dbc->{_host} ne $config->{host}) { | |
| 833 | |
| 834 # ens-livemirror, useastdb and ensembldb should all have identical DBs | |
| 835 unless( | |
| 836 ( | |
| 837 $dbc->{_host} eq 'ens-livemirror' | |
| 838 || $dbc->{_host} eq 'ensembldb.ensembl.org' | |
| 839 || $dbc->{_host} eq 'useastdb.ensembl.org' | |
| 840 ) && ( | |
| 841 $config->{host} eq 'ens-livemirror' | |
| 842 || $config->{host} eq 'ensembldb.ensembl.org' | |
| 843 || $config->{host} eq 'useastdb.ensembl.org' | |
| 844 ) | |
| 845 ) { | |
| 846 $ok = 0; | |
| 847 } | |
| 848 | |
| 849 unless(defined($config->{skip_db_check})) { | |
| 850 # but we still need to reconnect | |
| 851 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet}); | |
| 852 | |
| 853 &get_adaptors($config); | |
| 854 } | |
| 855 } | |
| 856 | |
| 857 if(!$ok) { | |
| 858 if(defined($config->{skip_db_check})) { | |
| 859 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet}); | |
| 860 } | |
| 861 else { | |
| 862 die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set"; | |
| 863 } | |
| 864 } | |
| 865 } | |
| 866 } | |
| 867 else { | |
| 868 &get_adaptors($config); | |
| 869 &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); | |
| 870 } | |
| 871 | |
| 872 # reg adaptors (only fetches if not retrieved from cache already) | |
| 873 &get_reg_adaptors($config) if defined($config->{regulatory}); | |
| 874 } | |
| 875 | |
| 876 # check cell types | |
| 877 if(defined($config->{cell_type}) && !defined($config->{build})) { | |
| 878 my $cls = ''; | |
| 879 | |
| 880 if(defined($config->{cache})) { | |
| 881 $cls = $config->{cache_cell_types}; | |
| 882 } | |
| 883 else { | |
| 884 my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor(); | |
| 885 $cls = join ",", map {$_->name} @{$cta->fetch_all}; | |
| 886 } | |
| 887 | |
| 888 foreach my $cl(@{$config->{cell_type}}) { | |
| 889 die "ERROR: cell type $cl not recognised; available cell types are:\n$cls\n" unless $cls =~ /(^|,)$cl(,|$)/; | |
| 890 } | |
| 891 } | |
| 892 | |
| 893 # get terminal width for progress bars | |
| 894 unless(defined($config->{quiet})) { | |
| 895 my $width; | |
| 896 | |
| 897 # module may not be installed | |
| 898 eval q{ | |
| 899 use Term::ReadKey; | |
| 900 }; | |
| 901 | |
| 902 if(!$@) { | |
| 903 my ($w, $h); | |
| 904 | |
| 905 # module may be installed, but e.g. | |
| 906 eval { | |
| 907 #($w, $h) = GetTerminalSize(); | |
| 908 $w = 167; | |
| 909 $h = 30; | |
| 910 }; | |
| 911 | |
| 912 $width = $w if defined $w; | |
| 913 } | |
| 914 | |
| 915 $width ||= 60; | |
| 916 $width -= 12; | |
| 917 $config->{terminal_width} = $width; | |
| 918 } | |
| 919 | |
| 920 # jump out to build cache if requested | |
| 921 if(defined($config->{build})) { | |
| 922 | |
| 923 if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) { | |
| 924 die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n"); | |
| 925 } | |
| 926 | |
| 927 # build the cache | |
| 928 debug("Building cache for ".$config->{species}) unless defined($config->{quiet}); | |
| 929 build_full_cache($config); | |
| 930 | |
| 931 # exit script | |
| 932 debug("Finished building cache") unless defined($config->{quiet}); | |
| 933 exit(0); | |
| 934 } | |
| 935 | |
| 936 | |
| 937 # warn user DB will be used for SIFT/PolyPhen/HGVS/frequency/LRG | |
| 938 if(defined($config->{cache})) { | |
| 939 | |
| 940 # these two def depend on DB | |
| 941 foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency lrg check_sv)) { | |
| 942 debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet}); | |
| 943 } | |
| 944 | |
| 945 # as does using HGVS or IDs as input | |
| 946 debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet}); | |
| 947 | |
| 948 # the rest may be in the cache | |
| 949 foreach my $param(grep {defined $config->{$_}} qw(sift polyphen regulatory)) { | |
| 950 next if defined($config->{'cache_'.$param}); | |
| 951 debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet}); | |
| 952 } | |
| 953 } | |
| 954 | |
| 955 # get list of chrs if supplied | |
| 956 if(defined($config->{chr})) { | |
| 957 my %chrs; | |
| 958 | |
| 959 foreach my $val(split /\,/, $config->{chr}) { | |
| 960 my @nnn = split /\-/, $val; | |
| 961 | |
| 962 foreach my $chr($nnn[0]..$nnn[-1]) { | |
| 963 $chrs{$chr} = 1; | |
| 964 } | |
| 965 } | |
| 966 | |
| 967 $config->{chr} = \%chrs; | |
| 968 } | |
| 969 | |
| 970 # get input file handle | |
| 971 $config->{in_file_handle} = &get_in_file_handle($config); | |
| 972 | |
| 973 return $config; | |
| 974 } | |
| 975 | |
| 976 # reads config from a file | |
| 977 sub read_config_from_file { | |
| 978 my $config = shift; | |
| 979 my $file = shift; | |
| 980 | |
| 981 open CONFIG, $file or die "ERROR: Could not open config file \"$file\"\n"; | |
| 982 | |
| 983 while(<CONFIG>) { | |
| 984 next if /^\#/; | |
| 985 my @split = split /\s+|\=/; | |
| 986 my $key = shift @split; | |
| 987 $key =~ s/^\-//g; | |
| 988 | |
| 989 if(defined($config->{$key}) && ref($config->{$key}) eq 'ARRAY') { | |
| 990 push @{$config->{$key}}, @split; | |
| 991 } | |
| 992 else { | |
| 993 $config->{$key} ||= $split[0]; | |
| 994 } | |
| 995 } | |
| 996 | |
| 997 close CONFIG; | |
| 998 | |
| 999 # force quiet if outputting to STDOUT | |
| 1000 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { | |
| 1001 delete $config->{verbose} if defined($config->{verbose}); | |
| 1002 $config->{quiet} = 1; | |
| 1003 } | |
| 1004 | |
| 1005 debug("Read configuration from $file") unless defined($config->{quiet}); | |
| 1006 } | |
| 1007 | |
| 1008 # configures custom VEP plugins | |
| 1009 sub configure_plugins { | |
| 1010 | |
| 1011 my $config = shift; | |
| 1012 | |
| 1013 $config->{plugins} = []; | |
| 1014 | |
| 1015 if (my @plugins = @{ $config->{plugin} }) { | |
| 1016 | |
| 1017 # add the Plugins directory onto @INC | |
| 1018 | |
| 1019 unshift @INC, $config->{toplevel_dir}."/Plugins"; | |
| 1020 | |
| 1021 for my $plugin (@plugins) { | |
| 1022 | |
| 1023 # parse out the module name and parameters | |
| 1024 | |
| 1025 my ($module, @params) = split /,/, $plugin; | |
| 1026 | |
| 1027 # check we can use the module | |
| 1028 | |
| 1029 eval qq{ | |
| 1030 use $module; | |
| 1031 }; | |
| 1032 if ($@) { | |
| 1033 debug("Failed to compile plugin $module: $@") unless defined($config->{quiet}); | |
| 1034 next; | |
| 1035 } | |
| 1036 | |
| 1037 # now check we can instantiate it, passing any parameters to the constructor | |
| 1038 | |
| 1039 my $instance; | |
| 1040 | |
| 1041 eval { | |
| 1042 $instance = $module->new($config, @params); | |
| 1043 }; | |
| 1044 if ($@) { | |
| 1045 debug("Failed to instantiate plugin $module: $@") unless defined($config->{quiet}); | |
| 1046 next; | |
| 1047 } | |
| 1048 | |
| 1049 # check that the versions match | |
| 1050 | |
| 1051 my $plugin_version; | |
| 1052 | |
| 1053 if ($instance->can('version')) { | |
| 1054 $plugin_version = $instance->version; | |
| 1055 } | |
| 1056 | |
| 1057 my $version_ok = 1; | |
| 1058 | |
| 1059 if ($plugin_version) { | |
| 1060 my ($plugin_major, $plugin_minor, $plugin_maintenance) = split /\./, $plugin_version; | |
| 1061 my ($major, $minor, $maintenance) = split /\./, $VERSION; | |
| 1062 | |
| 1063 if ($plugin_major != $major) { | |
| 1064 debug("Warning: plugin $plugin version ($plugin_version) does not match the current VEP version ($VERSION)") unless defined($config->{quiet}); | |
| 1065 $version_ok = 0; | |
| 1066 } | |
| 1067 } | |
| 1068 else { | |
| 1069 debug("Warning: plugin $plugin does not define a version number") unless defined($config->{quiet}); | |
| 1070 $version_ok = 0; | |
| 1071 } | |
| 1072 | |
| 1073 debug("You may experience unexpected behaviour with this plugin") unless defined($config->{quiet}) || $version_ok; | |
| 1074 | |
| 1075 # check that it implements all necessary methods | |
| 1076 | |
| 1077 for my $required(qw(run get_header_info check_feature_type check_variant_feature_type)) { | |
| 1078 unless ($instance->can($required)) { | |
| 1079 debug("Plugin $module doesn't implement a required method '$required', does it inherit from BaseVepPlugin?") unless defined($config->{quiet}); | |
| 1080 next; | |
| 1081 } | |
| 1082 } | |
| 1083 | |
| 1084 # all's good, so save the instance in our list of plugins | |
| 1085 | |
| 1086 push @{ $config->{plugins} }, $instance; | |
| 1087 | |
| 1088 debug("Loaded plugin: $module") unless defined($config->{quiet}); | |
| 1089 | |
| 1090 # for convenience, check if the plugin wants regulatory stuff and turn on the config option if so | |
| 1091 | |
| 1092 if (grep { $_ =~ /motif|regulatory/i } @{ $instance->feature_types }) { | |
| 1093 debug("Fetching regulatory features for plugin: $module") unless defined($config->{quiet}); | |
| 1094 $config->{regulatory} = 1; | |
| 1095 } | |
| 1096 } | |
| 1097 } | |
| 1098 } | |
| 1099 | |
| 1100 # connects to DBs (not done in offline mode) | |
| 1101 sub connect_to_dbs { | |
| 1102 my $config = shift; | |
| 1103 | |
| 1104 # get registry | |
| 1105 my $reg = 'Bio::EnsEMBL::Registry'; | |
| 1106 | |
| 1107 unless(defined($config->{offline})) { | |
| 1108 # load DB options from registry file if given | |
| 1109 if(defined($config->{registry})) { | |
| 1110 debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet}); | |
| 1111 $reg->load_all( | |
| 1112 $config->{registry}, | |
| 1113 $config->{verbose}, | |
| 1114 undef, | |
| 1115 $config->{no_slice_cache} | |
| 1116 ); | |
| 1117 } | |
| 1118 | |
| 1119 # otherwise manually connect to DB server | |
| 1120 else { | |
| 1121 $reg->load_registry_from_db( | |
| 1122 -host => $config->{host}, | |
| 1123 -user => $config->{user}, | |
| 1124 -pass => $config->{password}, | |
| 1125 -port => $config->{port}, | |
| 1126 -db_version => $config->{db_version}, | |
| 1127 -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef, | |
| 1128 -verbose => $config->{verbose}, | |
| 1129 -no_cache => $config->{no_slice_cache}, | |
| 1130 ); | |
| 1131 } | |
| 1132 | |
| 1133 eval { $reg->set_reconnect_when_lost() }; | |
| 1134 | |
| 1135 if(defined($config->{verbose})) { | |
| 1136 # get a meta container adaptors to check version | |
| 1137 my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer'); | |
| 1138 my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer'); | |
| 1139 | |
| 1140 if($core_mca && $var_mca) { | |
| 1141 debug( | |
| 1142 "Connected to core version ", $core_mca->get_schema_version, " database ", | |
| 1143 "and variation version ", $var_mca->get_schema_version, " database" | |
| 1144 ); | |
| 1145 } | |
| 1146 } | |
| 1147 } | |
| 1148 | |
| 1149 return $reg; | |
| 1150 } | |
| 1151 | |
| 1152 # get adaptors from DB | |
| 1153 sub get_adaptors { | |
| 1154 my $config = shift; | |
| 1155 | |
| 1156 die "ERROR: No registry" unless defined $config->{reg}; | |
| 1157 | |
| 1158 $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature'); | |
| 1159 $config->{svfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'structuralvariationfeature'); | |
| 1160 $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation'); | |
| 1161 $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix'); | |
| 1162 $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation'); | |
| 1163 | |
| 1164 # get fake ones for species with no var DB | |
| 1165 if(!defined($config->{vfa})) { | |
| 1166 $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species}); | |
| 1167 $config->{svfa} = Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor->new_fake($config->{species}); | |
| 1168 $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species}); | |
| 1169 } | |
| 1170 | |
| 1171 $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice'); | |
| 1172 $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene'); | |
| 1173 $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript'); | |
| 1174 $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer'); | |
| 1175 $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem'); | |
| 1176 | |
| 1177 # cache schema version | |
| 1178 $config->{mca}->get_schema_version if defined $config->{mca}; | |
| 1179 | |
| 1180 # check we got slice adaptor - can't continue without a core DB | |
| 1181 die("ERROR: Could not connect to core database\n") unless defined $config->{sa}; | |
| 1182 } | |
| 1183 | |
| 1184 # gets regulatory adaptors | |
| 1185 sub get_reg_adaptors { | |
| 1186 my $config = shift; | |
| 1187 | |
| 1188 foreach my $type(@REG_FEAT_TYPES) { | |
| 1189 next if defined($config->{$type.'_adaptor'}); | |
| 1190 | |
| 1191 my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type); | |
| 1192 if(defined($adaptor)) { | |
| 1193 $config->{$type.'_adaptor'} = $adaptor; | |
| 1194 } | |
| 1195 else { | |
| 1196 delete $config->{regulatory}; | |
| 1197 last; | |
| 1198 } | |
| 1199 } | |
| 1200 } | |
| 1201 | |
| 1202 # gets file handle for input | |
| 1203 sub get_in_file_handle { | |
| 1204 my $config = shift; | |
| 1205 | |
| 1206 # define the filehandle to read input from | |
| 1207 my $in_file_handle = new FileHandle; | |
| 1208 | |
| 1209 if(defined($config->{input_file})) { | |
| 1210 | |
| 1211 # check defined input file exists | |
| 1212 die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file}; | |
| 1213 | |
| 1214 if($config->{input_file} =~ /\.gz$/){ | |
| 1215 $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); | |
| 1216 } | |
| 1217 else { | |
| 1218 $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); | |
| 1219 } | |
| 1220 } | |
| 1221 | |
| 1222 # no file specified - try to read data off command line | |
| 1223 else { | |
| 1224 $in_file_handle = 'STDIN'; | |
| 1225 debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet}; | |
| 1226 } | |
| 1227 | |
| 1228 return $in_file_handle; | |
| 1229 } | |
| 1230 | |
| 1231 # gets file handle for output and adds header | |
| 1232 sub get_out_file_handle { | |
| 1233 my $config = shift; | |
| 1234 | |
| 1235 # define filehandle to write to | |
| 1236 my $out_file_handle = new FileHandle; | |
| 1237 | |
| 1238 # check if file exists | |
| 1239 if(-e $config->{output_file} && !defined($config->{force_overwrite})) { | |
| 1240 # die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with -- force_overwrite\n"); | |
| 1241 } | |
| 1242 | |
| 1243 if($config->{output_file} =~ /stdout/i) { | |
| 1244 $out_file_handle = *STDOUT; | |
| 1245 } | |
| 1246 else { | |
| 1247 $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n"); | |
| 1248 } | |
| 1249 | |
| 1250 # define headers for a VCF file | |
| 1251 my @vcf_headers = ( | |
| 1252 '#CHROM', | |
| 1253 'POS', | |
| 1254 'ID', | |
| 1255 'REF', | |
| 1256 'ALT', | |
| 1257 'QUAL', | |
| 1258 'FILTER', | |
| 1259 'INFO' | |
| 1260 ); | |
| 1261 | |
| 1262 # file conversion, don't want to add normal headers | |
| 1263 if(defined($config->{convert})) { | |
| 1264 # header for VCF | |
| 1265 if($config->{convert} =~ /vcf/i) { | |
| 1266 print $out_file_handle "##fileformat=VCFv4.0\n"; | |
| 1267 print $out_file_handle join "\t", @vcf_headers; | |
| 1268 print $out_file_handle "\n"; | |
| 1269 } | |
| 1270 | |
| 1271 return $out_file_handle; | |
| 1272 } | |
| 1273 | |
| 1274 # GVF output, no header | |
| 1275 elsif(defined($config->{gvf}) || defined($config->{original})) { | |
| 1276 print $out_file_handle join "\n", @{$config->{headers}} if defined($config->{headers}) && defined($config->{original}); | |
| 1277 return $out_file_handle; | |
| 1278 } | |
| 1279 | |
| 1280 elsif(defined($config->{vcf})) { | |
| 1281 | |
| 1282 # create an info string for the VCF header | |
| 1283 my @new_headers; | |
| 1284 | |
| 1285 # if the user has defined the fields themselves, we don't need to worry | |
| 1286 if(defined $config->{fields_redefined}) { | |
| 1287 @new_headers = @{$config->{fields}}; | |
| 1288 } | |
| 1289 else { | |
| 1290 @new_headers = ( | |
| 1291 | |
| 1292 # get default headers, minus variation name and location (already encoded in VCF) | |
| 1293 grep { | |
| 1294 $_ ne 'Uploaded_variation' and | |
| 1295 $_ ne 'Location' and | |
| 1296 $_ ne 'Extra' | |
| 1297 } @{$config->{fields}}, | |
| 1298 | |
| 1299 # get extra headers | |
| 1300 map {@{$extra_headers{$_}}} | |
| 1301 grep {defined $config->{$_}} | |
| 1302 keys %extra_headers | |
| 1303 ); | |
| 1304 | |
| 1305 # plugin headers | |
| 1306 foreach my $plugin_header(split /\n/, get_plugin_headers($config)) { | |
| 1307 $plugin_header =~ /\#\# (.+?)\t\:.+/; | |
| 1308 push @new_headers, $1; | |
| 1309 } | |
| 1310 | |
| 1311 # redefine the main headers list in config | |
| 1312 $config->{fields} = \@new_headers; | |
| 1313 } | |
| 1314 | |
| 1315 # add the newly defined headers as a header to the VCF | |
| 1316 my $string = join '|', @{$config->{fields}}; | |
| 1317 my @vcf_info_strings = ('##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: '.$string.'">'); | |
| 1318 | |
| 1319 # add custom headers | |
| 1320 foreach my $custom(@{$config->{custom}}) { | |
| 1321 push @vcf_info_strings, '##INFO=<ID='.$custom->{name}.',Number=.,Type=String,Description="'.$custom->{file}.' ('.$custom->{type}.')">'; | |
| 1322 } | |
| 1323 | |
| 1324 # if this is already a VCF file, we need to add our new headers in the right place | |
| 1325 if(defined($config->{headers})) { | |
| 1326 | |
| 1327 for my $i(0..$#{$config->{headers}}) { | |
| 1328 if($config->{headers}->[$i] =~ /^\#CHROM\s+POS\s+ID/) { | |
| 1329 splice(@{$config->{headers}}, $i, 0, @vcf_info_strings); | |
| 1330 } | |
| 1331 } | |
| 1332 | |
| 1333 print $out_file_handle join "\n", @{$config->{headers}}; | |
| 1334 print $out_file_handle "\n"; | |
| 1335 } | |
| 1336 | |
| 1337 else { | |
| 1338 print $out_file_handle "##fileformat=VCFv4.0\n"; | |
| 1339 print $out_file_handle join "\n", @vcf_info_strings; | |
| 1340 print $out_file_handle "\n"; | |
| 1341 print $out_file_handle join "\t", @vcf_headers; | |
| 1342 print $out_file_handle "\n"; | |
| 1343 } | |
| 1344 | |
| 1345 return $out_file_handle; | |
| 1346 } | |
| 1347 | |
| 1348 # make header | |
| 1349 my $time = &get_time; | |
| 1350 my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca}; | |
| 1351 $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache}); | |
| 1352 my $version_string = | |
| 1353 "Using API version ".$config->{reg}->software_version. | |
| 1354 ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?'); | |
| 1355 | |
| 1356 # add key for extra column headers based on config | |
| 1357 my $extra_column_keys = join "\n", | |
| 1358 map {'## '.$_.' : '.$extra_descs{$_}} | |
| 1359 sort map {@{$extra_headers{$_}}} | |
| 1360 grep {defined $config->{$_}} | |
| 1361 keys %extra_headers; | |
| 1362 | |
| 1363 my $header =<<HEAD; | |
| 1364 ## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION | |
| 1365 ## Output produced at $time | |
| 1366 ## Connected to $db_string | |
| 1367 ## $version_string | |
| 1368 ## Extra column keys: | |
| 1369 $extra_column_keys | |
| 1370 HEAD | |
| 1371 | |
| 1372 $header .= get_plugin_headers($config); | |
| 1373 | |
| 1374 # add headers | |
| 1375 print $out_file_handle $header; | |
| 1376 | |
| 1377 # add custom data defs | |
| 1378 if(defined($config->{custom})) { | |
| 1379 foreach my $custom(@{$config->{custom}}) { | |
| 1380 print $out_file_handle '## '.$custom->{name}."\t: ".$custom->{file}.' ('.$custom->{type}.")\n"; | |
| 1381 } | |
| 1382 } | |
| 1383 | |
| 1384 # add column headers | |
| 1385 print $out_file_handle '#', (join "\t", @{$config->{fields}}); | |
| 1386 print $out_file_handle "\n"; | |
| 1387 | |
| 1388 return $out_file_handle; | |
| 1389 } | |
| 1390 | |
| 1391 sub get_plugin_headers { | |
| 1392 | |
| 1393 my $config = shift; | |
| 1394 | |
| 1395 my $header = ""; | |
| 1396 | |
| 1397 for my $plugin (@{ $config->{plugins} }) { | |
| 1398 if (my $hdr = $plugin->get_header_info) { | |
| 1399 for my $key (keys %$hdr) { | |
| 1400 my $val = $hdr->{$key}; | |
| 1401 | |
| 1402 $header .= "## $key\t: $val\n"; | |
| 1403 } | |
| 1404 } | |
| 1405 } | |
| 1406 | |
| 1407 return $header; | |
| 1408 } | |
| 1409 | |
| 1410 # convert a variation feature to a line of output | |
| 1411 sub convert_vf { | |
| 1412 my $config = shift; | |
| 1413 my $vf = shift; | |
| 1414 | |
| 1415 my $convert_method = 'convert_to_'.lc($config->{convert}); | |
| 1416 my $method_ref = \&$convert_method; | |
| 1417 | |
| 1418 my $line = &$method_ref($config, $vf); | |
| 1419 my $handle = $config->{out_file_handle}; | |
| 1420 | |
| 1421 if(scalar @$line) { | |
| 1422 print $handle join "\t", @$line; | |
| 1423 print $handle "\n"; | |
| 1424 } | |
| 1425 } | |
| 1426 | |
| 1427 # converts to Ensembl format | |
| 1428 sub convert_to_ensembl { | |
| 1429 my $config = shift; | |
| 1430 my $vf = shift; | |
| 1431 | |
| 1432 return [ | |
| 1433 $vf->{chr} || $vf->seq_region_name, | |
| 1434 $vf->start, | |
| 1435 $vf->end, | |
| 1436 $vf->allele_string, | |
| 1437 $vf->strand, | |
| 1438 $vf->variation_name | |
| 1439 ]; | |
| 1440 } | |
| 1441 | |
| 1442 | |
| 1443 # converts to pileup format | |
| 1444 sub convert_to_pileup { | |
| 1445 my $config = shift; | |
| 1446 my $vf = shift; | |
| 1447 | |
| 1448 # look for imbalance in the allele string | |
| 1449 my %allele_lengths; | |
| 1450 my @alleles = split /\//, $vf->allele_string; | |
| 1451 | |
| 1452 foreach my $allele(@alleles) { | |
| 1453 $allele =~ s/\-//g; | |
| 1454 $allele_lengths{length($allele)} = 1; | |
| 1455 } | |
| 1456 | |
| 1457 # in/del | |
| 1458 if(scalar keys %allele_lengths > 1) { | |
| 1459 | |
| 1460 if($vf->allele_string =~ /\-/) { | |
| 1461 | |
| 1462 # insertion? | |
| 1463 if($alleles[0] eq '-') { | |
| 1464 shift @alleles; | |
| 1465 | |
| 1466 for my $i(0..$#alleles) { | |
| 1467 $alleles[$i] =~ s/\-//g; | |
| 1468 $alleles[$i] = '+'.$alleles[$i]; | |
| 1469 } | |
| 1470 } | |
| 1471 | |
| 1472 else { | |
| 1473 @alleles = grep {$_ ne '-'} @alleles; | |
| 1474 | |
| 1475 for my $i(0..$#alleles) { | |
| 1476 $alleles[$i] =~ s/\-//g; | |
| 1477 $alleles[$i] = '-'.$alleles[$i]; | |
| 1478 } | |
| 1479 } | |
| 1480 | |
| 1481 @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles; | |
| 1482 | |
| 1483 return [ | |
| 1484 $vf->{chr} || $vf->seq_region_name, | |
| 1485 $vf->start - 1, | |
| 1486 '*', | |
| 1487 (join "/", @alleles), | |
| 1488 ]; | |
| 1489 } | |
| 1490 | |
| 1491 else { | |
| 1492 warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet}); | |
| 1493 return []; | |
| 1494 } | |
| 1495 | |
| 1496 } | |
| 1497 | |
| 1498 # balanced sub | |
| 1499 else { | |
| 1500 return [ | |
| 1501 $vf->{chr} || $vf->seq_region_name, | |
| 1502 $vf->start, | |
| 1503 shift @alleles, | |
| 1504 (join "/", @alleles), | |
| 1505 ]; | |
| 1506 } | |
| 1507 } | |
| 1508 | |
| 1509 # converts to HGVS (hackily returns many lines) | |
| 1510 sub convert_to_hgvs { | |
| 1511 my $config = shift; | |
| 1512 my $vf = shift; | |
| 1513 | |
| 1514 # ensure we have a slice | |
| 1515 $vf->{slice} ||= get_slice($config, $vf->{chr}); | |
| 1516 | |
| 1517 my $tvs = $vf->get_all_TranscriptVariations; | |
| 1518 | |
| 1519 my @return = values %{$vf->get_all_hgvs_notations()}; | |
| 1520 | |
| 1521 if(defined($tvs)) { | |
| 1522 push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'c')}} @$tvs; | |
| 1523 push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'p')}} @$tvs; | |
| 1524 } | |
| 1525 | |
| 1526 return [join "\n", @return]; | |
| 1527 } | |
| 1528 | |
| 1529 # prints a line of output from the hash | |
| 1530 sub print_line { | |
| 1531 my $config = shift; | |
| 1532 my $line = shift; | |
| 1533 return unless defined($line); | |
| 1534 | |
| 1535 my $output; | |
| 1536 | |
| 1537 # normal | |
| 1538 if(ref($line) eq 'HASH') { | |
| 1539 my %extra = %{$line->{Extra}}; | |
| 1540 | |
| 1541 $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} }; | |
| 1542 | |
| 1543 # if the fields have been redefined we need to search through in case | |
| 1544 # any of the defined fields are actually part of the Extra hash | |
| 1545 $output = join "\t", map { | |
| 1546 (defined $line->{$_} ? $line->{$_} : (defined $extra{$_} ? $extra{$_} : '-')) | |
| 1547 } @{$config->{fields}}; | |
| 1548 } | |
| 1549 | |
| 1550 # gvf/vcf | |
| 1551 else { | |
| 1552 $output = $$line; | |
| 1553 } | |
| 1554 | |
| 1555 my $fh = $config->{out_file_handle}; | |
| 1556 print $fh "$output\n"; | |
| 1557 } | |
| 1558 | |
| 1559 # outputs usage message | |
| 1560 sub usage { | |
| 1561 my $usage =<<END; | |
| 1562 #----------------------------------# | |
| 1563 # ENSEMBL VARIANT EFFECT PREDICTOR # | |
| 1564 #----------------------------------# | |
| 1565 | |
| 1566 version $VERSION | |
| 1567 | |
| 1568 By Will McLaren (wm2\@ebi.ac.uk) | |
| 1569 | |
| 1570 http://www.ensembl.org/info/docs/variation/vep/vep_script.html | |
| 1571 | |
| 1572 Usage: | |
| 1573 perl variant_effect_predictor.pl [arguments] | |
| 1574 | |
| 1575 Options | |
| 1576 ======= | |
| 1577 | |
| 1578 --help Display this message and quit | |
| 1579 --verbose Display verbose output as the script runs [default: off] | |
| 1580 --quiet Suppress status and warning messages [default: off] | |
| 1581 --no_progress Suppress progress bars [default: off] | |
| 1582 | |
| 1583 --config Load configuration from file. Any command line options | |
| 1584 specified overwrite those in the file [default: off] | |
| 1585 | |
| 1586 --everything Shortcut switch to turn on commonly used options. See web | |
| 1587 documentation for details [default: off] | |
| 1588 | |
| 1589 --fork [num_forks] Use forking to improve script runtime [default: off] | |
| 1590 | |
| 1591 -i | --input_file Input file - if not specified, reads from STDIN. Files | |
| 1592 may be gzip compressed. | |
| 1593 --format Specify input file format - one of "ensembl", "pileup", | |
| 1594 "vcf", "hgvs", "id" or "guess" to try and work out format. | |
| 1595 -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this | |
| 1596 will force --quiet [default: "variant_effect_output.txt"] | |
| 1597 --force_overwrite Force overwriting of output file [default: quit if file | |
| 1598 exists] | |
| 1599 --original Writes output as it was in input - must be used with --filter | |
| 1600 since no consequence data is added [default: off] | |
| 1601 --vcf Write output as VCF [default: off] | |
| 1602 --gvf Write output as GVF [default: off] | |
| 1603 --fields [field list] Define a custom output format by specifying a comma-separated | |
| 1604 list of field names. Field names normally present in the | |
| 1605 "Extra" field may also be specified, including those added by | |
| 1606 plugin modules. Can also be used to configure VCF output | |
| 1607 columns [default: off] | |
| 1608 | |
| 1609 --species [species] Species to use [default: "human"] | |
| 1610 | |
| 1611 -t | --terms Type of consequence terms to output - one of "SO", "ensembl" | |
| 1612 [default: SO] | |
| 1613 | |
| 1614 --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off] | |
| 1615 --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off] | |
| 1616 | |
| 1617 NB: SIFT and PolyPhen predictions are currently available for human only | |
| 1618 NB: Condel support has been moved to a VEP plugin module - see documentation | |
| 1619 | |
| 1620 --regulatory Look for overlaps with regulatory regions. The script can | |
| 1621 also call if a variant falls in a high information position | |
| 1622 within a transcription factor binding site. Output lines have | |
| 1623 a Feature type of RegulatoryFeature or MotifFeature | |
| 1624 [default: off] | |
| 1625 --cell_type [types] Report only regulatory regions that are found in the given cell | |
| 1626 type(s). Can be a single cell type or a comma-separated list. | |
| 1627 The functional type in each cell type is reported under | |
| 1628 CELL_TYPE in the output. To retrieve a list of cell types, use | |
| 1629 "--cell_type list" [default: off] | |
| 1630 | |
| 1631 NB: Regulatory consequences are currently available for human and mouse only | |
| 1632 | |
| 1633 --custom [file list] Add custom annotations from tabix-indexed files. See | |
| 1634 documentation for full details [default: off] | |
| 1635 --plugin [plugin_name] Use named plugin module [default: off] | |
| 1636 --hgnc Add HGNC gene identifiers to output [default: off] | |
| 1637 --hgvs Output HGVS identifiers (coding and protein). Requires database | |
| 1638 connection [default: off] | |
| 1639 --ccds Output CCDS transcript identifiers [default: off] | |
| 1640 --xref_refseq Output aligned RefSeq mRNA identifier for transcript. NB: the | |
| 1641 RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND | |
| 1642 FREQUENTLY WILL NOT, match exactly in sequence, exon structure | |
| 1643 and protein product [default: off] | |
| 1644 --protein Output Ensembl protein identifer [default: off] | |
| 1645 --canonical Indicate if the transcript for this consequence is the canonical | |
| 1646 transcript for this gene [default: off] | |
| 1647 --domains Include details of any overlapping protein domains [default: off] | |
| 1648 --numbers Include exon & intron numbers [default: off] | |
| 1649 | |
| 1650 --no_intergenic Excludes intergenic consequences from the output [default: off] | |
| 1651 --coding_only Only return consequences that fall in the coding region of | |
| 1652 transcripts [default: off] | |
| 1653 --most_severe Ouptut only the most severe consequence per variation. | |
| 1654 Transcript-specific columns will be left blank. [default: off] | |
| 1655 --summary Output only a comma-separated list of all consequences per | |
| 1656 variation. Transcript-specific columns will be left blank. | |
| 1657 [default: off] | |
| 1658 --per_gene Output only the most severe consequence per gene. Where more | |
| 1659 than one transcript has the same consequence, the transcript | |
| 1660 chosen is arbitrary. [default: off] | |
| 1661 | |
| 1662 | |
| 1663 --check_ref If specified, checks supplied reference allele against stored | |
| 1664 entry in Ensembl Core database [default: off] | |
| 1665 --check_existing If specified, checks for existing co-located variations in the | |
| 1666 Ensembl Variation database [default: off] | |
| 1667 --failed [0|1] Include (1) or exclude (0) variants that have been flagged as | |
| 1668 failed by Ensembl when checking for existing variants. | |
| 1669 [default: exclude] | |
| 1670 --check_alleles If specified, the alleles of existing co-located variations | |
| 1671 are compared to the input; an existing variation will only | |
| 1672 be reported if no novel allele is in the input (strand is | |
| 1673 accounted for) [default: off] | |
| 1674 --check_svs Report overlapping structural variants [default: off] | |
| 1675 | |
| 1676 --filter [filters] Filter output by consequence type. Use this to output only | |
| 1677 variants that have at least one consequence type matching the | |
| 1678 filter. Multiple filters can be used separated by ",". By | |
| 1679 combining this with --original it is possible to run the VEP | |
| 1680 iteratively to progressively filter a set of variants. See | |
| 1681 documentation for full details [default: off] | |
| 1682 | |
| 1683 --check_frequency Turns on frequency filtering. Use this to include or exclude | |
| 1684 variants based on the frequency of co-located existing | |
| 1685 variants in the Ensembl Variation database. You must also | |
| 1686 specify all of the following --freq flags [default: off] | |
| 1687 --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap, | |
| 1688 1kg_yri for YRI 1000 genomes. See documentation for more | |
| 1689 details | |
| 1690 --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5 | |
| 1691 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or | |
| 1692 less than (lt) --freq_freq | |
| 1693 --freq_filter Specify whether variants that pass the above should be included | |
| 1694 [exclude|include] or excluded from analysis | |
| 1695 --gmaf Include global MAF of existing variant from 1000 Genomes | |
| 1696 Phase 1 in output | |
| 1697 | |
| 1698 --individual [id] Consider only alternate alleles present in the genotypes of the | |
| 1699 specified individual(s). May be a single individual, a comma- | |
| 1700 separated list or "all" to assess all individuals separately. | |
| 1701 Each individual and variant combination is given on a separate | |
| 1702 line of output. Only works with VCF files containing individual | |
| 1703 genotype data; individual IDs are taken from column headers. | |
| 1704 --allow_non_variant Prints out non-variant lines when using VCF input | |
| 1705 --phased Force VCF individual genotypes to be interpreted as phased. | |
| 1706 For use with plugins that depend on phased state. | |
| 1707 | |
| 1708 --chr [list] Select a subset of chromosomes to analyse from your file. Any | |
| 1709 data not on this chromosome in the input will be skipped. The | |
| 1710 list can be comma separated, with "-" characters representing | |
| 1711 a range e.g. 1-5,8,15,X [default: off] | |
| 1712 --gp If specified, tries to read GRCh37 position from GP field in the | |
| 1713 INFO column of a VCF file. Only applies when VCF is the input | |
| 1714 format and human is the species [default: off] | |
| 1715 | |
| 1716 --convert Convert the input file to the output format specified. | |
| 1717 [ensembl|vcf|pileup] Converted output is written to the file specified in | |
| 1718 --output_file. No consequence calculation is carried out when | |
| 1719 doing file conversion. [default: off] | |
| 1720 | |
| 1721 --refseq Use the otherfeatures database to retrieve transcripts - this | |
| 1722 database contains RefSeq transcripts (as well as CCDS and | |
| 1723 Ensembl EST alignments) [default: off] | |
| 1724 --host Manually define database host [default: "ensembldb.ensembl.org"] | |
| 1725 -u | --user Database username [default: "anonymous"] | |
| 1726 --port Database port [default: 5306] | |
| 1727 --password Database password [default: no password] | |
| 1728 --genomes Sets DB connection params for Ensembl Genomes [default: off] | |
| 1729 --registry Registry file to use defines DB connections [default: off] | |
| 1730 Defining a registry file overrides above connection settings. | |
| 1731 --db_version=[number] Force script to load DBs from a specific Ensembl version. Not | |
| 1732 advised due to likely incompatibilities between API and DB | |
| 1733 | |
| 1734 --no_whole_genome Run in old-style, non-whole genome mode [default: off] | |
| 1735 --buffer_size Sets the number of variants sent in each batch [default: 5000] | |
| 1736 Increasing buffer size can retrieve results more quickly | |
| 1737 but requires more memory. Only applies to whole genome mode. | |
| 1738 | |
| 1739 --cache Enables read-only use of cache [default: off] | |
| 1740 --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"] | |
| 1741 --write_cache Enable writing to cache [default: off] | |
| 1742 --build [all|list] Build a complete cache for the selected species. Build for all | |
| 1743 chromosomes with --build all, or a list of chromosomes (see | |
| 1744 --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS | |
| 1745 VIOLATES OUR FAIR USAGE POLICY [default: off] | |
| 1746 | |
| 1747 --compress Specify utility to decompress cache files - may be "gzcat" or | |
| 1748 "gzip -dc" Only use if default does not work [default: zcat] | |
| 1749 | |
| 1750 --skip_db_check ADVANCED! Force the script to use a cache built from a different | |
| 1751 database than specified with --host. Only use this if you are | |
| 1752 sure the hosts are compatible (e.g. ensembldb.ensembl.org and | |
| 1753 useastdb.ensembl.org) [default: off] | |
| 1754 --cache_region_size ADVANCED! The size in base-pairs of the region covered by one | |
| 1755 file in the cache. [default: 1MB] | |
| 1756 END | |
| 1757 | |
| 1758 print $usage; | |
| 1759 } |
