comparison variant_effect_predictor/variant_effect_predictor.pl @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 #!/usr/bin/perl
2
3 =head1 LICENSE
4
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
6 Genome Research Limited. All rights reserved.
7
8 This software is distributed under a modified Apache license.
9 For license details, please see
10
11 http://www.ensembl.org/info/about/code_licence.html
12
13 =head1 CONTACT
14
15 Please email comments or questions to the public Ensembl
16 developers list at <dev@ensembl.org>.
17
18 Questions may also be sent to the Ensembl help desk at
19 <helpdesk@ensembl.org>.
20
21 =cut
22
23 =head1 NAME
24
25 Variant Effect Predictor - a script to predict the consequences of genomic variants
26
27 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
28
29 Version 2.6
30
31 by Will McLaren (wm2@ebi.ac.uk)
32 =cut
33
34 use strict;
35 use Getopt::Long;
36 use FileHandle;
37 use FindBin qw($Bin);
38 use lib $Bin;
39
40 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
41 use Bio::EnsEMBL::Variation::Utils::VEP qw(
42 parse_line
43 vf_to_consequences
44 validate_vf
45 convert_to_vcf
46 load_dumped_adaptor_cache
47 dump_adaptor_cache
48 get_all_consequences
49 get_slice
50 build_full_cache
51 read_cache_info
52 get_time
53 debug
54 @OUTPUT_COLS
55 @REG_FEAT_TYPES
56 %FILTER_SHORTCUTS
57 );
58
59 # global vars
60 my $VERSION = '2.6';
61
62
63 # define headers that would normally go in the extra field
64 # keyed on the config parameter used to turn it on
65 my %extra_headers = (
66 protein => ['ENSP'],
67 canonical => ['CANONICAL'],
68 ccds => ['CCDS'],
69 hgvs => ['HGVSc','HGVSp'],
70 hgnc => ['HGNC'],
71 sift => ['SIFT'],
72 polyphen => ['PolyPhen'],
73 numbers => ['EXON','INTRON'],
74 domains => ['DOMAINS'],
75 regulatory => ['MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE'],
76 cell_type => ['CELL_TYPE'],
77 individual => ['IND'],
78 xref_refseq => ['RefSeq'],
79 check_svs => ['SV'],
80 check_frequency => ['FREQS'],
81 gmaf => ['GMAF'],
82 user => ['DISTANCE'],
83 );
84
85 my %extra_descs = (
86 'CANONICAL' => 'Indicates if transcript is canonical for this gene',
87 'CCDS' => 'Indicates if transcript is a CCDS transcript',
88 'HGNC' => 'HGNC gene identifier',
89 'ENSP' => 'Ensembl protein identifer',
90 'HGVSc' => 'HGVS coding sequence name',
91 'HGVSp' => 'HGVS protein sequence name',
92 'SIFT' => 'SIFT prediction',
93 'PolyPhen' => 'PolyPhen prediction',
94 'EXON' => 'Exon number(s) / total',
95 'INTRON' => 'Intron number(s) / total',
96 'DOMAINS' => 'The source and identifer of any overlapping protein domains',
97 'MOTIF_NAME' => 'The source and identifier of a transcription factor binding profile (TFBP) aligned at this position',
98 'MOTIF_POS' => 'The relative position of the variation in the aligned TFBP',
99 'HIGH_INF_POS' => 'A flag indicating if the variant falls in a high information position of the TFBP',
100 'MOTIF_SCORE_CHANGE' => 'The difference in motif score of the reference and variant sequences for the TFBP',
101 'CELL_TYPE' => 'List of cell types and classifications for regulatory feature',
102 'IND' => 'Individual name',
103 'SV' => 'IDs of overlapping structural variants',
104 'FREQS' => 'Frequencies of overlapping variants used in filtering',
105 'GMAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1',
106 'DISTANCE' => 'Shortest distance from variant to transcript',
107 );
108
109 # set output autoflush for progress bars
110 $| = 1;
111
112 # configure from command line opts
113 my $config = &configure(scalar @ARGV);
114
115 # run the main sub routine
116 &main($config);
117
118 # this is the main sub-routine - it needs the configured $config hash
119 sub main {
120 my $config = shift;
121
122 debug("Starting...") unless defined $config->{quiet};
123
124 $config->{start_time} = time();
125 $config->{last_time} = time();
126
127 my $tr_cache = {};
128 my $rf_cache = {};
129
130 # create a hash to hold slices so we don't get the same one twice
131 my %slice_cache = ();
132
133 my @vfs;
134 my ($vf_count, $total_vf_count);
135 my $in_file_handle = $config->{in_file_handle};
136
137 # initialize line number in config
138 $config->{line_number} = 0;
139
140 # read the file
141 while(<$in_file_handle>) {
142 chomp;
143
144 $config->{line_number}++;
145
146 # header line?
147 if(/^\#/) {
148
149 # retain header lines if we are outputting VCF
150 if(defined($config->{vcf})) {
151 push @{$config->{headers}}, $_;
152 }
153
154 # line with sample labels in VCF
155 if(defined($config->{individual}) && /^#CHROM/) {
156 my @split = split /\s+/;
157
158 # no individuals
159 die("ERROR: No individual data found in VCF\n") if scalar @split <= 9;
160
161 # get individual column indices
162 my %ind_cols = map {$split[$_] => $_} (9..$#split);
163
164 # all?
165 if(scalar @{$config->{individual}} == 1 && $config->{individual}->[0] =~ /^all$/i) {
166 $config->{ind_cols} = \%ind_cols;
167 }
168 else {
169 my %new_ind_cols;
170
171 # check we have specified individual(s)
172 foreach my $ind(@{$config->{individual}}) {
173 die("ERROR: Individual named \"$ind\" not found in VCF\n") unless defined $ind_cols{$ind};
174 $new_ind_cols{$ind} = $ind_cols{$ind};
175 }
176
177 $config->{ind_cols} = \%new_ind_cols;
178 }
179 }
180
181 next;
182 }
183
184 # configure output file
185 $config->{out_file_handle} ||= &get_out_file_handle($config);
186
187 # some lines (pileup) may actually parse out into more than one variant
188 foreach my $vf(@{&parse_line($config, $_)}) {
189
190 $vf->{_line} = $_ ;#if defined($config->{vcf}) || defined($config->{original});
191
192 # now get the slice
193 if(!defined($vf->{slice})) {
194 my $slice;
195
196 # don't get slices if we're using cache
197 # we can steal them from transcript objects later
198 if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) {
199
200 # check if we have fetched this slice already
201 if(defined $slice_cache{$vf->{chr}}) {
202 $slice = $slice_cache{$vf->{chr}};
203 }
204
205 # if not create a new one
206 else {
207
208 $slice = &get_slice($config, $vf->{chr});
209
210 # if failed, warn and skip this line
211 if(!defined($slice)) {
212 warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet};
213 next;
214 }
215
216 # store the hash
217 $slice_cache{$vf->{chr}} = $slice;
218 }
219 }
220
221 $vf->{slice} = $slice;
222 }
223
224 # validate the VF
225 next unless validate_vf($config, $vf);
226
227 # make a name if one doesn't exist
228 $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.($vf->{allele_string} || $vf->{class_SO_term});
229
230 # jump out to convert here
231 if(defined($config->{convert})) {
232 &convert_vf($config, $vf);
233 next;
234 }
235
236 if(defined $config->{whole_genome}) {
237 push @vfs, $vf;
238 $vf_count++;
239 $total_vf_count++;
240
241 if($vf_count == $config->{buffer_size}) {
242 debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
243
244 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)};
245
246 # calculate stats
247 my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1));
248 my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1));
249 $config->{last_time} = time();
250
251 debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet});
252
253 @vfs = ();
254 $vf_count = 0;
255 }
256 }
257 else {
258 print_line($config, $_) foreach @{vf_to_consequences($config, $vf)};
259 $vf_count++;
260 $total_vf_count++;
261 debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose});
262 }
263 }
264 }
265
266 # if in whole-genome mode, finish off the rest of the buffer
267 if(defined $config->{whole_genome} && scalar @vfs) {
268 debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
269
270 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)};
271
272 # calculate stats
273 my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1));
274 my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1));
275 $config->{last_time} = time();
276
277 debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet});
278
279 debug($config->{filter_count}, "/$total_vf_count variants remain after filtering") if defined($config->{filter}) && !defined($config->{quiet});
280 }
281
282 debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet});
283
284 debug("Finished!") unless defined $config->{quiet};
285 }
286
287 # sets up configuration hash that is used throughout the script
288 sub configure {
289 my $args = shift;
290
291 my $config = {};
292
293 GetOptions(
294 $config,
295 'help', # displays help message
296
297 # input options,
298 'config=s', # config file name
299 'input_file|i=s', # input file name
300 'format=s', # input file format
301
302 # DB options
303 'species=s', # species e.g. human, homo_sapiens
304 'registry=s', # registry file
305 'host=s', # database host
306 'port=s', # database port
307 'user=s', # database user name
308 'password=s', # database password
309 'db_version=i', # Ensembl database version to use e.g. 62
310 'genomes', # automatically sets DB params for e!Genomes
311 'refseq', # use otherfeatures RefSeq DB instead of Ensembl
312 #'no_disconnect', # disables disconnect_when_inactive
313
314 # runtime options
315 'most_severe', # only return most severe consequence
316 'summary', # only return one line per variation with all consquence types
317 'per_gene', # only return most severe per gene
318 'buffer_size=i', # number of variations to read in before analysis
319 'chunk_size=s', # size in bases of "chunks" used in internal hash structure
320 'failed=i', # include failed variations when finding existing
321 'no_whole_genome', # disables now default whole-genome mode
322 'whole_genome', # proxy for whole genome mode - now just warns user
323 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG)
324 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT
325 'check_ref', # check supplied reference allele against DB
326 'check_existing', # find existing co-located variations
327 'check_svs', # find overlapping structural variations
328 'check_alleles', # only attribute co-located if alleles are the same
329 'check_frequency', # enable frequency checking
330 'gmaf', # add global MAF of existing var
331 'freq_filter=s', # exclude or include
332 'freq_freq=f', # frequency to filter on
333 'freq_gt_lt=s', # gt or lt (greater than or less than)
334 'freq_pop=s', # population to filter on
335 'allow_non_variant', # allow non-variant VCF lines through
336 'individual=s', # give results by genotype for individuals
337 'phased', # force VCF genotypes to be interpreted as phased
338 'fork=i', # fork into N processes
339
340 # verbosity options
341 'verbose|v', # print out a bit more info while running
342 'quiet', # print nothing to STDOUT (unless using -o stdout)
343 'no_progress', # don't display progress bars
344
345 # output options
346 'everything|e', # switch on EVERYTHING :-)
347 'output_file|o=s', # output file name
348 'force_overwrite', # force overwrite of output file if already exists
349 'terms|t=s', # consequence terms to use e.g. NCBI, SO
350 'coding_only', # only return results for consequences in coding regions
351 'canonical', # indicates if transcript is canonical
352 'ccds', # output CCDS identifer
353 'xref_refseq', # output refseq mrna xref
354 'protein', # add e! protein ID to extra column
355 'hgnc', # add HGNC gene ID to extra column
356 'hgvs', # add HGVS names to extra column
357 'sift=s', # SIFT predictions
358 'polyphen=s', # PolyPhen predictions
359 'condel=s', # Condel predictions
360 'regulatory', # enable regulatory stuff
361 'cell_type=s', # filter cell types for regfeats
362 'convert=s', # convert input to another format (doesn't run VEP)
363 'filter=s', # run in filtering mode
364 'no_intergenic', # don't print out INTERGENIC consequences
365 'gvf', # produce gvf output
366 'vcf', # produce vcf output
367 'original', # produce output in input format
368 'no_consequences', # don't calculate consequences
369 'lrg', # enable LRG-based features
370 'fields=s', # define your own output fields
371 'domains', # output overlapping protein features
372 'numbers', # include exon and intron numbers
373
374 # cache stuff
375 'cache', # use cache
376 'write_cache', # enables writing to the cache
377 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs
378 'no_adaptor_cache', # don't write adaptor cache
379 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript
380 'strip', # strips adaptors etc from objects before caching them
381 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more
382 'dir=s', # dir where cache is found (defaults to $HOME/.vep/)
383 'cache_region_size=i', # size of region in bases for each cache file
384 'no_slice_cache', # tell API not to cache features on slice
385 'standalone', # standalone renamed offline
386 'offline', # offline mode uses minimal set of modules installed in same dir, no DB connection
387 'skip_db_check', # don't compare DB parameters with cached
388 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc"
389 'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped file with annotation
390 'tmpdir=s', # tmp dir used for BigWig retrieval
391 'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory
392
393 # debug
394 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL
395 'count_queries', # counts SQL queries executed
396 'admin', # allows me to build off public hosts
397 'debug', # print out debug info
398 'tabix', # experimental use tabix cache files
399 ) or die "ERROR: Failed to parse command-line flags\n";
400
401 # print usage message if requested or no args supplied
402 if(defined($config->{help}) || !$args) {
403 &usage;
404 exit(0);
405 }
406
407 # dir is where the cache and plugins live
408 $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep');
409
410 # dir gets set to the specific cache directory later on, so take a copy to use
411 # when configuring plugins
412
413 $config->{toplevel_dir} = $config->{dir};
414
415 # ini file?
416 my $ini_file = $config->{dir}.'/vep.ini';
417
418 if(-e $ini_file) {
419 read_config_from_file($config, $ini_file);
420 }
421
422 # config file?
423 if(defined $config->{config}) {
424 read_config_from_file($config, $config->{config});
425 }
426
427 # can't be both quiet and verbose
428 die "ERROR: Can't be both quiet and verbose!\n" if defined($config->{quiet}) && defined($config->{verbose});
429
430 # check forking
431 if(defined($config->{fork})) {
432 die "ERROR: Fork number must be greater than 1\n" if $config->{fork} <= 1;
433
434 # check we can use MIME::Base64
435 eval q{ use MIME::Base64; };
436
437 if($@) {
438 debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet});
439 delete $config->{fork};
440 }
441 else {
442
443 # try a practice fork
444 my $pid = fork;
445
446 if(!defined($pid)) {
447 debug("WARNING: Fork test failed, forking disabled") unless defined($config->{quiet});
448 delete $config->{fork};
449 }
450 elsif($pid) {
451 waitpid($pid, 0);
452 }
453 elsif($pid == 0) {
454 exit(0);
455 }
456 }
457 }
458
459 # check file format
460 if(defined $config->{format}) {
461 die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /^(pileup|vcf|guess|hgvs|ensembl|id|vep)$/i;
462 }
463
464 # check convert format
465 if(defined $config->{convert}) {
466 die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup|hgvs/i;
467 }
468
469 # check if user still using --standalone
470 if(defined $config->{standalone}) {
471 die "ERROR: --standalone replaced by --offline\n";
472 }
473
474 # connection settings for Ensembl Genomes
475 if($config->{genomes}) {
476 $config->{host} ||= 'mysql.ebi.ac.uk';
477 $config->{port} ||= 4157;
478 }
479
480 # connection settings for main Ensembl
481 else {
482 $config->{species} ||= "homo_sapiens";
483 $config->{host} ||= 'ensembldb.ensembl.org';
484 $config->{port} ||= 5306;
485 }
486
487 # refseq or core?
488 if(defined($config->{refseq})) {
489 $config->{core_type} = 'otherfeatures';
490 }
491 else {
492 $config->{core_type} = 'core';
493 }
494
495 # output term
496 if(defined $config->{terms}) {
497 die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i;
498 if($config->{terms} =~ /ensembl|display/i) {
499 $config->{terms} = 'display';
500 }
501 else {
502 $config->{terms} = uc($config->{terms});
503 }
504 }
505
506 # everything?
507 if(defined($config->{everything})) {
508 my %everything = (
509 sift => 'b',
510 polyphen => 'b',
511 ccds => 1,
512 hgvs => 1,
513 hgnc => 1,
514 numbers => 1,
515 domains => 1,
516 regulatory => 1,
517 canonical => 1,
518 protein => 1,
519 gmaf => 1,
520 );
521
522 $config->{$_} = $everything{$_} for keys %everything;
523
524 # these ones won't work with offline
525 delete $config->{hgvs} if defined($config->{offline});
526 }
527
528 # check nsSNP tools
529 foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) {
530 die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/;
531
532 die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i;
533
534 die "ERROR: $tool functionality is now available as a VEP Plugin - see http://www.ensembl.org/info/docs/variation/vep/vep_script.html#plugins\n" if $tool eq 'Condel';
535 }
536
537 # force quiet if outputting to STDOUT
538 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
539 delete $config->{verbose} if defined($config->{verbose});
540 $config->{quiet} = 1;
541 }
542
543 # individual(s) specified?
544 if(defined($config->{individual})) {
545 $config->{individual} = [split /\,/, $config->{individual}];
546
547 # force allow_non_variant
548 $config->{allow_non_variant} = 1;
549 }
550
551 # summarise options if verbose
552 if(defined $config->{verbose}) {
553 my $header =<<INTRO;
554 #----------------------------------#
555 # ENSEMBL VARIANT EFFECT PREDICTOR #
556 #----------------------------------#
557
558 version $VERSION
559
560 By Will McLaren (wm2\@ebi.ac.uk)
561
562 Configuration options:
563
564 INTRO
565 print $header;
566
567 my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1];
568
569 foreach my $key(sort keys %$config) {
570 next if ref($config->{$key}) eq 'ARRAY' && scalar @{$config->{$key}} == 0;
571 print $key.(' ' x (($max_length - length($key)) + 4)).(ref($config->{$key}) eq 'ARRAY' ? join "\t", @{$config->{$key}} : $config->{$key})."\n";
572 }
573
574 print "\n".("-" x 20)."\n\n";
575 }
576
577 # check custom annotations
578 for my $i(0..$#{$config->{custom}}) {
579 my $custom = $config->{custom}->[$i];
580
581 my ($filepath, $shortname, $format, $type, $coords) = split /\,/, $custom;
582 $type ||= 'exact';
583 $format ||= 'bed';
584 $coords ||= 0;
585
586 # check type
587 die "ERROR: Type $type for custom annotation file $filepath is not allowed (must be one of \"exact\", \"overlap\")\n" unless $type =~ /exact|overlap/;
588
589 # check format
590 die "ERROR: Format $format for custom annotation file $filepath is not allowed (must be one of \"bed\", \"vcf\", \"gtf\", \"gff\", \"bigwig\")\n" unless $format =~ /bed|vcf|gff|gtf|bigwig/;
591
592 # bigwig format
593 if($format eq 'bigwig') {
594 # check for bigWigToWig
595 die "ERROR: bigWigToWig does not seem to be in your path - this is required to use bigwig format custom annotations\n" unless `which bigWigToWig 2>&1` =~ /bigWigToWig$/;
596 }
597
598 else {
599 # check for tabix
600 die "ERROR: tabix does not seem to be in your path - this is required to use custom annotations\n" unless `which tabix 2>&1` =~ /tabix$/;
601
602 # remote files?
603 if($filepath =~ /tp\:\/\//) {
604 my $remote_test = `tabix $filepath 1:1-1 2>&1`;
605 if($remote_test =~ /fail/) {
606 die "$remote_test\nERROR: Could not find file or index file for remote annotation file $filepath\n";
607 }
608 elsif($remote_test =~ /get_local_version/) {
609 debug("Downloaded tabix index file for remote annotation file $filepath") unless defined($config->{quiet});
610 }
611 }
612
613 # check files exist
614 else {
615 die "ERROR: Custom annotation file $filepath not found\n" unless -e $filepath;
616 die "ERROR: Tabix index file $filepath\.tbi not found - perhaps you need to create it first?\n" unless -e $filepath.'.tbi';
617 }
618 }
619
620 $config->{custom}->[$i] = {
621 'file' => $filepath,
622 'name' => $shortname || 'CUSTOM'.($i + 1),
623 'type' => $type,
624 'format' => $format,
625 'coords' => $coords,
626 };
627 }
628
629 # check if using filter and original
630 die "ERROR: You must also provide output filters using --filter to use --original\n" if defined($config->{original}) && !defined($config->{filter});
631
632 # filter by consequence?
633 if(defined($config->{filter})) {
634
635 my %filters = map {$_ => 1} split /\,/, $config->{filter};
636
637 # add in shortcuts
638 foreach my $filter(keys %filters) {
639 my $value = 1;
640 if($filter =~ /^no_/) {
641 delete $filters{$filter};
642 $filter =~ s/^no_//g;
643 $value = 0;
644 $filters{$filter} = $value;
645 }
646
647 if(defined($FILTER_SHORTCUTS{$filter})) {
648 delete $filters{$filter};
649 $filters{$_} = $value for keys %{$FILTER_SHORTCUTS{$filter}};
650 }
651 }
652
653 $config->{filter} = \%filters;
654
655 $config->{filter_count} = 0;
656 }
657
658 # set defaults
659 $config->{user} ||= 'anonymous';
660 $config->{buffer_size} ||= 5000;
661 $config->{chunk_size} ||= '50kb';
662 $config->{output_file} ||= "variant_effect_output.txt";
663 $config->{tmpdir} ||= '/tmp';
664 $config->{format} ||= 'guess';
665 $config->{terms} ||= 'SO';
666 $config->{cache_region_size} ||= 1000000;
667 $config->{compress} ||= 'zcat';
668
669 # regulatory has to be on for cell_type
670 if(defined($config->{cell_type})) {
671 $config->{regulatory} = 1;
672 $config->{cell_type} = [split /\,/, $config->{cell_type}] if defined($config->{cell_type});
673 }
674
675 # can't use a whole bunch of options with most_severe
676 if(defined($config->{most_severe})) {
677 foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains summary)) {
678 die "ERROR: --most_severe is not compatible with --$flag\n" if defined($config->{$flag});
679 }
680 }
681
682 # can't use a whole bunch of options with summary
683 if(defined($config->{summary})) {
684 foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains most_severe)) {
685 die "ERROR: --summary is not compatible with --$flag\n" if defined($config->{$flag});
686 }
687 }
688
689 # frequency filtering
690 if(defined($config->{check_frequency})) {
691 foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) {
692 die "ERROR: To use --check_frequency you must also specify flag --$flag\n" unless defined $config->{$flag};
693 }
694
695 # need to set check_existing
696 $config->{check_existing} = 1;
697 }
698
699 $config->{check_existing} = 1 if defined $config->{check_alleles} || defined $config->{gmaf};
700
701 # warn users still using whole_genome flag
702 if(defined($config->{whole_genome})) {
703 debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet});
704 }
705
706 $config->{whole_genome} = 1 unless defined $config->{no_whole_genome};
707 $config->{failed} = 0 unless defined $config->{failed};
708 $config->{chunk_size} =~ s/mb?/000000/i;
709 $config->{chunk_size} =~ s/kb?/000/i;
710 $config->{cache_region_size} =~ s/mb?/000000/i;
711 $config->{cache_region_size} =~ s/kb?/000/i;
712
713 # cluck and display executed SQL?
714 $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck});
715
716 # offline needs cache, can't use HGVS
717 if(defined($config->{offline})) {
718 $config->{cache} = 1;
719
720 #die("ERROR: Cannot generate HGVS coordinates in offline mode\n") if defined($config->{hgvs});
721 die("ERROR: Cannot use HGVS as input in offline mode\n") if $config->{format} eq 'hgvs';
722 die("ERROR: Cannot use variant identifiers as input in offline mode\n") if $config->{format} eq 'id';
723 die("ERROR: Cannot do frequency filtering in offline mode\n") if defined($config->{check_frequency});
724 die("ERROR: Cannot retrieve overlapping structural variants in offline mode\n") if defined($config->{check_sv});
725 }
726
727 # write_cache needs cache
728 $config->{cache} = 1 if defined $config->{write_cache};
729
730 # no_slice_cache, prefetch and whole_genome have to be on to use cache
731 if(defined($config->{cache})) {
732 $config->{prefetch} = 1;
733 $config->{no_slice_cache} = 1;
734 $config->{whole_genome} = 1;
735 $config->{strip} = 1;
736 }
737
738 $config->{build} = $config->{rebuild} if defined($config->{rebuild});
739
740 # force options for full build
741 if(defined($config->{build})) {
742 $config->{prefetch} = 1;
743 $config->{hgnc} = 1;
744 $config->{no_slice_cache} = 1;
745 $config->{cache} = 1;
746 $config->{strip} = 1;
747 $config->{write_cache} = 1;
748 $config->{cell_type} = 1 if defined($config->{regulatory});
749 }
750
751 # connect to databases
752 $config->{reg} = &connect_to_dbs($config);
753
754 # complete dir with species name and db_version
755 $config->{dir} .= '/'.(
756 join '/', (
757 defined($config->{offline}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}),
758 $config->{db_version} || $config->{reg}->software_version
759 )
760 );
761
762 # warn user cache directory doesn't exist
763 if(!-e $config->{dir}) {
764
765 # if using write_cache
766 if(defined($config->{write_cache})) {
767 debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet});
768 }
769
770 # want to read cache, not found
771 elsif(defined($config->{cache})) {
772 die("ERROR: Cache directory ", $config->{dir}, " not found");
773 }
774 }
775
776 if(defined($config->{cache})) {
777 # read cache info
778 if(read_cache_info($config)) {
779 debug("Read existing cache info") unless defined $config->{quiet};
780 }
781 }
782
783 # we configure plugins here because they can sometimes switch on the
784 # regulatory config option
785 configure_plugins($config);
786
787 # include regulatory modules if requested
788 if(defined($config->{regulatory})) {
789 # do the use statements here so that users don't have to have the
790 # funcgen API installed to use the rest of the script
791 eval q{
792 use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor;
793 use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor;
794 use Bio::EnsEMBL::Funcgen::MotifFeature;
795 use Bio::EnsEMBL::Funcgen::RegulatoryFeature;
796 use Bio::EnsEMBL::Funcgen::BindingMatrix;
797 };
798
799 if($@) {
800 die("ERROR: Ensembl Funcgen API must be installed to use --regulatory or plugins that deal with regulatory features\n");
801 }
802 }
803
804 # user defined custom output fields
805 if(defined($config->{fields})) {
806 $config->{fields} = [split ',', $config->{fields}];
807 debug("Output fields redefined (".scalar @{$config->{fields}}." defined)") unless defined($config->{quiet});
808 $config->{fields_redefined} = 1;
809 }
810 $config->{fields} ||= \@OUTPUT_COLS;
811
812 # suppress warnings that the FeatureAdpators spit if using no_slice_cache
813 Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache});
814
815 # get adaptors (don't get them in offline mode)
816 unless(defined($config->{offline})) {
817
818 if(defined($config->{cache}) && !defined($config->{write_cache})) {
819
820 # try and load adaptors from cache
821 if(!&load_dumped_adaptor_cache($config)) {
822 &get_adaptors($config);
823 &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache});
824 }
825
826 # check cached adaptors match DB params
827 else {
828 my $dbc = $config->{sa}->{dbc};
829
830 my $ok = 1;
831
832 if($dbc->{_host} ne $config->{host}) {
833
834 # ens-livemirror, useastdb and ensembldb should all have identical DBs
835 unless(
836 (
837 $dbc->{_host} eq 'ens-livemirror'
838 || $dbc->{_host} eq 'ensembldb.ensembl.org'
839 || $dbc->{_host} eq 'useastdb.ensembl.org'
840 ) && (
841 $config->{host} eq 'ens-livemirror'
842 || $config->{host} eq 'ensembldb.ensembl.org'
843 || $config->{host} eq 'useastdb.ensembl.org'
844 )
845 ) {
846 $ok = 0;
847 }
848
849 unless(defined($config->{skip_db_check})) {
850 # but we still need to reconnect
851 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet});
852
853 &get_adaptors($config);
854 }
855 }
856
857 if(!$ok) {
858 if(defined($config->{skip_db_check})) {
859 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet});
860 }
861 else {
862 die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set";
863 }
864 }
865 }
866 }
867 else {
868 &get_adaptors($config);
869 &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache});
870 }
871
872 # reg adaptors (only fetches if not retrieved from cache already)
873 &get_reg_adaptors($config) if defined($config->{regulatory});
874 }
875
876 # check cell types
877 if(defined($config->{cell_type}) && !defined($config->{build})) {
878 my $cls = '';
879
880 if(defined($config->{cache})) {
881 $cls = $config->{cache_cell_types};
882 }
883 else {
884 my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor();
885 $cls = join ",", map {$_->name} @{$cta->fetch_all};
886 }
887
888 foreach my $cl(@{$config->{cell_type}}) {
889 die "ERROR: cell type $cl not recognised; available cell types are:\n$cls\n" unless $cls =~ /(^|,)$cl(,|$)/;
890 }
891 }
892
893 # get terminal width for progress bars
894 unless(defined($config->{quiet})) {
895 my $width;
896
897 # module may not be installed
898 eval q{
899 use Term::ReadKey;
900 };
901
902 if(!$@) {
903 my ($w, $h);
904
905 # module may be installed, but e.g.
906 eval {
907 #($w, $h) = GetTerminalSize();
908 $w = 167;
909 $h = 30;
910 };
911
912 $width = $w if defined $w;
913 }
914
915 $width ||= 60;
916 $width -= 12;
917 $config->{terminal_width} = $width;
918 }
919
920 # jump out to build cache if requested
921 if(defined($config->{build})) {
922
923 if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) {
924 die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n");
925 }
926
927 # build the cache
928 debug("Building cache for ".$config->{species}) unless defined($config->{quiet});
929 build_full_cache($config);
930
931 # exit script
932 debug("Finished building cache") unless defined($config->{quiet});
933 exit(0);
934 }
935
936
937 # warn user DB will be used for SIFT/PolyPhen/HGVS/frequency/LRG
938 if(defined($config->{cache})) {
939
940 # these two def depend on DB
941 foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency lrg check_sv)) {
942 debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet});
943 }
944
945 # as does using HGVS or IDs as input
946 debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet});
947
948 # the rest may be in the cache
949 foreach my $param(grep {defined $config->{$_}} qw(sift polyphen regulatory)) {
950 next if defined($config->{'cache_'.$param});
951 debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet});
952 }
953 }
954
955 # get list of chrs if supplied
956 if(defined($config->{chr})) {
957 my %chrs;
958
959 foreach my $val(split /\,/, $config->{chr}) {
960 my @nnn = split /\-/, $val;
961
962 foreach my $chr($nnn[0]..$nnn[-1]) {
963 $chrs{$chr} = 1;
964 }
965 }
966
967 $config->{chr} = \%chrs;
968 }
969
970 # get input file handle
971 $config->{in_file_handle} = &get_in_file_handle($config);
972
973 return $config;
974 }
975
976 # reads config from a file
977 sub read_config_from_file {
978 my $config = shift;
979 my $file = shift;
980
981 open CONFIG, $file or die "ERROR: Could not open config file \"$file\"\n";
982
983 while(<CONFIG>) {
984 next if /^\#/;
985 my @split = split /\s+|\=/;
986 my $key = shift @split;
987 $key =~ s/^\-//g;
988
989 if(defined($config->{$key}) && ref($config->{$key}) eq 'ARRAY') {
990 push @{$config->{$key}}, @split;
991 }
992 else {
993 $config->{$key} ||= $split[0];
994 }
995 }
996
997 close CONFIG;
998
999 # force quiet if outputting to STDOUT
1000 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
1001 delete $config->{verbose} if defined($config->{verbose});
1002 $config->{quiet} = 1;
1003 }
1004
1005 debug("Read configuration from $file") unless defined($config->{quiet});
1006 }
1007
1008 # configures custom VEP plugins
1009 sub configure_plugins {
1010
1011 my $config = shift;
1012
1013 $config->{plugins} = [];
1014
1015 if (my @plugins = @{ $config->{plugin} }) {
1016
1017 # add the Plugins directory onto @INC
1018
1019 unshift @INC, $config->{toplevel_dir}."/Plugins";
1020
1021 for my $plugin (@plugins) {
1022
1023 # parse out the module name and parameters
1024
1025 my ($module, @params) = split /,/, $plugin;
1026
1027 # check we can use the module
1028
1029 eval qq{
1030 use $module;
1031 };
1032 if ($@) {
1033 debug("Failed to compile plugin $module: $@") unless defined($config->{quiet});
1034 next;
1035 }
1036
1037 # now check we can instantiate it, passing any parameters to the constructor
1038
1039 my $instance;
1040
1041 eval {
1042 $instance = $module->new($config, @params);
1043 };
1044 if ($@) {
1045 debug("Failed to instantiate plugin $module: $@") unless defined($config->{quiet});
1046 next;
1047 }
1048
1049 # check that the versions match
1050
1051 my $plugin_version;
1052
1053 if ($instance->can('version')) {
1054 $plugin_version = $instance->version;
1055 }
1056
1057 my $version_ok = 1;
1058
1059 if ($plugin_version) {
1060 my ($plugin_major, $plugin_minor, $plugin_maintenance) = split /\./, $plugin_version;
1061 my ($major, $minor, $maintenance) = split /\./, $VERSION;
1062
1063 if ($plugin_major != $major) {
1064 debug("Warning: plugin $plugin version ($plugin_version) does not match the current VEP version ($VERSION)") unless defined($config->{quiet});
1065 $version_ok = 0;
1066 }
1067 }
1068 else {
1069 debug("Warning: plugin $plugin does not define a version number") unless defined($config->{quiet});
1070 $version_ok = 0;
1071 }
1072
1073 debug("You may experience unexpected behaviour with this plugin") unless defined($config->{quiet}) || $version_ok;
1074
1075 # check that it implements all necessary methods
1076
1077 for my $required(qw(run get_header_info check_feature_type check_variant_feature_type)) {
1078 unless ($instance->can($required)) {
1079 debug("Plugin $module doesn't implement a required method '$required', does it inherit from BaseVepPlugin?") unless defined($config->{quiet});
1080 next;
1081 }
1082 }
1083
1084 # all's good, so save the instance in our list of plugins
1085
1086 push @{ $config->{plugins} }, $instance;
1087
1088 debug("Loaded plugin: $module") unless defined($config->{quiet});
1089
1090 # for convenience, check if the plugin wants regulatory stuff and turn on the config option if so
1091
1092 if (grep { $_ =~ /motif|regulatory/i } @{ $instance->feature_types }) {
1093 debug("Fetching regulatory features for plugin: $module") unless defined($config->{quiet});
1094 $config->{regulatory} = 1;
1095 }
1096 }
1097 }
1098 }
1099
1100 # connects to DBs (not done in offline mode)
1101 sub connect_to_dbs {
1102 my $config = shift;
1103
1104 # get registry
1105 my $reg = 'Bio::EnsEMBL::Registry';
1106
1107 unless(defined($config->{offline})) {
1108 # load DB options from registry file if given
1109 if(defined($config->{registry})) {
1110 debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet});
1111 $reg->load_all(
1112 $config->{registry},
1113 $config->{verbose},
1114 undef,
1115 $config->{no_slice_cache}
1116 );
1117 }
1118
1119 # otherwise manually connect to DB server
1120 else {
1121 $reg->load_registry_from_db(
1122 -host => $config->{host},
1123 -user => $config->{user},
1124 -pass => $config->{password},
1125 -port => $config->{port},
1126 -db_version => $config->{db_version},
1127 -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef,
1128 -verbose => $config->{verbose},
1129 -no_cache => $config->{no_slice_cache},
1130 );
1131 }
1132
1133 eval { $reg->set_reconnect_when_lost() };
1134
1135 if(defined($config->{verbose})) {
1136 # get a meta container adaptors to check version
1137 my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer');
1138 my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer');
1139
1140 if($core_mca && $var_mca) {
1141 debug(
1142 "Connected to core version ", $core_mca->get_schema_version, " database ",
1143 "and variation version ", $var_mca->get_schema_version, " database"
1144 );
1145 }
1146 }
1147 }
1148
1149 return $reg;
1150 }
1151
1152 # get adaptors from DB
1153 sub get_adaptors {
1154 my $config = shift;
1155
1156 die "ERROR: No registry" unless defined $config->{reg};
1157
1158 $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature');
1159 $config->{svfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'structuralvariationfeature');
1160 $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation');
1161 $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix');
1162 $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation');
1163
1164 # get fake ones for species with no var DB
1165 if(!defined($config->{vfa})) {
1166 $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species});
1167 $config->{svfa} = Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor->new_fake($config->{species});
1168 $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species});
1169 }
1170
1171 $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice');
1172 $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene');
1173 $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript');
1174 $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer');
1175 $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem');
1176
1177 # cache schema version
1178 $config->{mca}->get_schema_version if defined $config->{mca};
1179
1180 # check we got slice adaptor - can't continue without a core DB
1181 die("ERROR: Could not connect to core database\n") unless defined $config->{sa};
1182 }
1183
1184 # gets regulatory adaptors
1185 sub get_reg_adaptors {
1186 my $config = shift;
1187
1188 foreach my $type(@REG_FEAT_TYPES) {
1189 next if defined($config->{$type.'_adaptor'});
1190
1191 my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type);
1192 if(defined($adaptor)) {
1193 $config->{$type.'_adaptor'} = $adaptor;
1194 }
1195 else {
1196 delete $config->{regulatory};
1197 last;
1198 }
1199 }
1200 }
1201
1202 # gets file handle for input
1203 sub get_in_file_handle {
1204 my $config = shift;
1205
1206 # define the filehandle to read input from
1207 my $in_file_handle = new FileHandle;
1208
1209 if(defined($config->{input_file})) {
1210
1211 # check defined input file exists
1212 die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file};
1213
1214 if($config->{input_file} =~ /\.gz$/){
1215 $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
1216 }
1217 else {
1218 $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
1219 }
1220 }
1221
1222 # no file specified - try to read data off command line
1223 else {
1224 $in_file_handle = 'STDIN';
1225 debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet};
1226 }
1227
1228 return $in_file_handle;
1229 }
1230
1231 # gets file handle for output and adds header
1232 sub get_out_file_handle {
1233 my $config = shift;
1234
1235 # define filehandle to write to
1236 my $out_file_handle = new FileHandle;
1237
1238 # check if file exists
1239 if(-e $config->{output_file} && !defined($config->{force_overwrite})) {
1240 # die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with -- force_overwrite\n");
1241 }
1242
1243 if($config->{output_file} =~ /stdout/i) {
1244 $out_file_handle = *STDOUT;
1245 }
1246 else {
1247 $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n");
1248 }
1249
1250 # define headers for a VCF file
1251 my @vcf_headers = (
1252 '#CHROM',
1253 'POS',
1254 'ID',
1255 'REF',
1256 'ALT',
1257 'QUAL',
1258 'FILTER',
1259 'INFO'
1260 );
1261
1262 # file conversion, don't want to add normal headers
1263 if(defined($config->{convert})) {
1264 # header for VCF
1265 if($config->{convert} =~ /vcf/i) {
1266 print $out_file_handle "##fileformat=VCFv4.0\n";
1267 print $out_file_handle join "\t", @vcf_headers;
1268 print $out_file_handle "\n";
1269 }
1270
1271 return $out_file_handle;
1272 }
1273
1274 # GVF output, no header
1275 elsif(defined($config->{gvf}) || defined($config->{original})) {
1276 print $out_file_handle join "\n", @{$config->{headers}} if defined($config->{headers}) && defined($config->{original});
1277 return $out_file_handle;
1278 }
1279
1280 elsif(defined($config->{vcf})) {
1281
1282 # create an info string for the VCF header
1283 my @new_headers;
1284
1285 # if the user has defined the fields themselves, we don't need to worry
1286 if(defined $config->{fields_redefined}) {
1287 @new_headers = @{$config->{fields}};
1288 }
1289 else {
1290 @new_headers = (
1291
1292 # get default headers, minus variation name and location (already encoded in VCF)
1293 grep {
1294 $_ ne 'Uploaded_variation' and
1295 $_ ne 'Location' and
1296 $_ ne 'Extra'
1297 } @{$config->{fields}},
1298
1299 # get extra headers
1300 map {@{$extra_headers{$_}}}
1301 grep {defined $config->{$_}}
1302 keys %extra_headers
1303 );
1304
1305 # plugin headers
1306 foreach my $plugin_header(split /\n/, get_plugin_headers($config)) {
1307 $plugin_header =~ /\#\# (.+?)\t\:.+/;
1308 push @new_headers, $1;
1309 }
1310
1311 # redefine the main headers list in config
1312 $config->{fields} = \@new_headers;
1313 }
1314
1315 # add the newly defined headers as a header to the VCF
1316 my $string = join '|', @{$config->{fields}};
1317 my @vcf_info_strings = ('##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: '.$string.'">');
1318
1319 # add custom headers
1320 foreach my $custom(@{$config->{custom}}) {
1321 push @vcf_info_strings, '##INFO=<ID='.$custom->{name}.',Number=.,Type=String,Description="'.$custom->{file}.' ('.$custom->{type}.')">';
1322 }
1323
1324 # if this is already a VCF file, we need to add our new headers in the right place
1325 if(defined($config->{headers})) {
1326
1327 for my $i(0..$#{$config->{headers}}) {
1328 if($config->{headers}->[$i] =~ /^\#CHROM\s+POS\s+ID/) {
1329 splice(@{$config->{headers}}, $i, 0, @vcf_info_strings);
1330 }
1331 }
1332
1333 print $out_file_handle join "\n", @{$config->{headers}};
1334 print $out_file_handle "\n";
1335 }
1336
1337 else {
1338 print $out_file_handle "##fileformat=VCFv4.0\n";
1339 print $out_file_handle join "\n", @vcf_info_strings;
1340 print $out_file_handle "\n";
1341 print $out_file_handle join "\t", @vcf_headers;
1342 print $out_file_handle "\n";
1343 }
1344
1345 return $out_file_handle;
1346 }
1347
1348 # make header
1349 my $time = &get_time;
1350 my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca};
1351 $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache});
1352 my $version_string =
1353 "Using API version ".$config->{reg}->software_version.
1354 ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?');
1355
1356 # add key for extra column headers based on config
1357 my $extra_column_keys = join "\n",
1358 map {'## '.$_.' : '.$extra_descs{$_}}
1359 sort map {@{$extra_headers{$_}}}
1360 grep {defined $config->{$_}}
1361 keys %extra_headers;
1362
1363 my $header =<<HEAD;
1364 ## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION
1365 ## Output produced at $time
1366 ## Connected to $db_string
1367 ## $version_string
1368 ## Extra column keys:
1369 $extra_column_keys
1370 HEAD
1371
1372 $header .= get_plugin_headers($config);
1373
1374 # add headers
1375 print $out_file_handle $header;
1376
1377 # add custom data defs
1378 if(defined($config->{custom})) {
1379 foreach my $custom(@{$config->{custom}}) {
1380 print $out_file_handle '## '.$custom->{name}."\t: ".$custom->{file}.' ('.$custom->{type}.")\n";
1381 }
1382 }
1383
1384 # add column headers
1385 print $out_file_handle '#', (join "\t", @{$config->{fields}});
1386 print $out_file_handle "\n";
1387
1388 return $out_file_handle;
1389 }
1390
1391 sub get_plugin_headers {
1392
1393 my $config = shift;
1394
1395 my $header = "";
1396
1397 for my $plugin (@{ $config->{plugins} }) {
1398 if (my $hdr = $plugin->get_header_info) {
1399 for my $key (keys %$hdr) {
1400 my $val = $hdr->{$key};
1401
1402 $header .= "## $key\t: $val\n";
1403 }
1404 }
1405 }
1406
1407 return $header;
1408 }
1409
1410 # convert a variation feature to a line of output
1411 sub convert_vf {
1412 my $config = shift;
1413 my $vf = shift;
1414
1415 my $convert_method = 'convert_to_'.lc($config->{convert});
1416 my $method_ref = \&$convert_method;
1417
1418 my $line = &$method_ref($config, $vf);
1419 my $handle = $config->{out_file_handle};
1420
1421 if(scalar @$line) {
1422 print $handle join "\t", @$line;
1423 print $handle "\n";
1424 }
1425 }
1426
1427 # converts to Ensembl format
1428 sub convert_to_ensembl {
1429 my $config = shift;
1430 my $vf = shift;
1431
1432 return [
1433 $vf->{chr} || $vf->seq_region_name,
1434 $vf->start,
1435 $vf->end,
1436 $vf->allele_string,
1437 $vf->strand,
1438 $vf->variation_name
1439 ];
1440 }
1441
1442
1443 # converts to pileup format
1444 sub convert_to_pileup {
1445 my $config = shift;
1446 my $vf = shift;
1447
1448 # look for imbalance in the allele string
1449 my %allele_lengths;
1450 my @alleles = split /\//, $vf->allele_string;
1451
1452 foreach my $allele(@alleles) {
1453 $allele =~ s/\-//g;
1454 $allele_lengths{length($allele)} = 1;
1455 }
1456
1457 # in/del
1458 if(scalar keys %allele_lengths > 1) {
1459
1460 if($vf->allele_string =~ /\-/) {
1461
1462 # insertion?
1463 if($alleles[0] eq '-') {
1464 shift @alleles;
1465
1466 for my $i(0..$#alleles) {
1467 $alleles[$i] =~ s/\-//g;
1468 $alleles[$i] = '+'.$alleles[$i];
1469 }
1470 }
1471
1472 else {
1473 @alleles = grep {$_ ne '-'} @alleles;
1474
1475 for my $i(0..$#alleles) {
1476 $alleles[$i] =~ s/\-//g;
1477 $alleles[$i] = '-'.$alleles[$i];
1478 }
1479 }
1480
1481 @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles;
1482
1483 return [
1484 $vf->{chr} || $vf->seq_region_name,
1485 $vf->start - 1,
1486 '*',
1487 (join "/", @alleles),
1488 ];
1489 }
1490
1491 else {
1492 warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet});
1493 return [];
1494 }
1495
1496 }
1497
1498 # balanced sub
1499 else {
1500 return [
1501 $vf->{chr} || $vf->seq_region_name,
1502 $vf->start,
1503 shift @alleles,
1504 (join "/", @alleles),
1505 ];
1506 }
1507 }
1508
1509 # converts to HGVS (hackily returns many lines)
1510 sub convert_to_hgvs {
1511 my $config = shift;
1512 my $vf = shift;
1513
1514 # ensure we have a slice
1515 $vf->{slice} ||= get_slice($config, $vf->{chr});
1516
1517 my $tvs = $vf->get_all_TranscriptVariations;
1518
1519 my @return = values %{$vf->get_all_hgvs_notations()};
1520
1521 if(defined($tvs)) {
1522 push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'c')}} @$tvs;
1523 push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'p')}} @$tvs;
1524 }
1525
1526 return [join "\n", @return];
1527 }
1528
1529 # prints a line of output from the hash
1530 sub print_line {
1531 my $config = shift;
1532 my $line = shift;
1533 return unless defined($line);
1534
1535 my $output;
1536
1537 # normal
1538 if(ref($line) eq 'HASH') {
1539 my %extra = %{$line->{Extra}};
1540
1541 $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} };
1542
1543 # if the fields have been redefined we need to search through in case
1544 # any of the defined fields are actually part of the Extra hash
1545 $output = join "\t", map {
1546 (defined $line->{$_} ? $line->{$_} : (defined $extra{$_} ? $extra{$_} : '-'))
1547 } @{$config->{fields}};
1548 }
1549
1550 # gvf/vcf
1551 else {
1552 $output = $$line;
1553 }
1554
1555 my $fh = $config->{out_file_handle};
1556 print $fh "$output\n";
1557 }
1558
1559 # outputs usage message
1560 sub usage {
1561 my $usage =<<END;
1562 #----------------------------------#
1563 # ENSEMBL VARIANT EFFECT PREDICTOR #
1564 #----------------------------------#
1565
1566 version $VERSION
1567
1568 By Will McLaren (wm2\@ebi.ac.uk)
1569
1570 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
1571
1572 Usage:
1573 perl variant_effect_predictor.pl [arguments]
1574
1575 Options
1576 =======
1577
1578 --help Display this message and quit
1579 --verbose Display verbose output as the script runs [default: off]
1580 --quiet Suppress status and warning messages [default: off]
1581 --no_progress Suppress progress bars [default: off]
1582
1583 --config Load configuration from file. Any command line options
1584 specified overwrite those in the file [default: off]
1585
1586 --everything Shortcut switch to turn on commonly used options. See web
1587 documentation for details [default: off]
1588
1589 --fork [num_forks] Use forking to improve script runtime [default: off]
1590
1591 -i | --input_file Input file - if not specified, reads from STDIN. Files
1592 may be gzip compressed.
1593 --format Specify input file format - one of "ensembl", "pileup",
1594 "vcf", "hgvs", "id" or "guess" to try and work out format.
1595 -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this
1596 will force --quiet [default: "variant_effect_output.txt"]
1597 --force_overwrite Force overwriting of output file [default: quit if file
1598 exists]
1599 --original Writes output as it was in input - must be used with --filter
1600 since no consequence data is added [default: off]
1601 --vcf Write output as VCF [default: off]
1602 --gvf Write output as GVF [default: off]
1603 --fields [field list] Define a custom output format by specifying a comma-separated
1604 list of field names. Field names normally present in the
1605 "Extra" field may also be specified, including those added by
1606 plugin modules. Can also be used to configure VCF output
1607 columns [default: off]
1608
1609 --species [species] Species to use [default: "human"]
1610
1611 -t | --terms Type of consequence terms to output - one of "SO", "ensembl"
1612 [default: SO]
1613
1614 --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off]
1615 --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off]
1616
1617 NB: SIFT and PolyPhen predictions are currently available for human only
1618 NB: Condel support has been moved to a VEP plugin module - see documentation
1619
1620 --regulatory Look for overlaps with regulatory regions. The script can
1621 also call if a variant falls in a high information position
1622 within a transcription factor binding site. Output lines have
1623 a Feature type of RegulatoryFeature or MotifFeature
1624 [default: off]
1625 --cell_type [types] Report only regulatory regions that are found in the given cell
1626 type(s). Can be a single cell type or a comma-separated list.
1627 The functional type in each cell type is reported under
1628 CELL_TYPE in the output. To retrieve a list of cell types, use
1629 "--cell_type list" [default: off]
1630
1631 NB: Regulatory consequences are currently available for human and mouse only
1632
1633 --custom [file list] Add custom annotations from tabix-indexed files. See
1634 documentation for full details [default: off]
1635 --plugin [plugin_name] Use named plugin module [default: off]
1636 --hgnc Add HGNC gene identifiers to output [default: off]
1637 --hgvs Output HGVS identifiers (coding and protein). Requires database
1638 connection [default: off]
1639 --ccds Output CCDS transcript identifiers [default: off]
1640 --xref_refseq Output aligned RefSeq mRNA identifier for transcript. NB: the
1641 RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND
1642 FREQUENTLY WILL NOT, match exactly in sequence, exon structure
1643 and protein product [default: off]
1644 --protein Output Ensembl protein identifer [default: off]
1645 --canonical Indicate if the transcript for this consequence is the canonical
1646 transcript for this gene [default: off]
1647 --domains Include details of any overlapping protein domains [default: off]
1648 --numbers Include exon & intron numbers [default: off]
1649
1650 --no_intergenic Excludes intergenic consequences from the output [default: off]
1651 --coding_only Only return consequences that fall in the coding region of
1652 transcripts [default: off]
1653 --most_severe Ouptut only the most severe consequence per variation.
1654 Transcript-specific columns will be left blank. [default: off]
1655 --summary Output only a comma-separated list of all consequences per
1656 variation. Transcript-specific columns will be left blank.
1657 [default: off]
1658 --per_gene Output only the most severe consequence per gene. Where more
1659 than one transcript has the same consequence, the transcript
1660 chosen is arbitrary. [default: off]
1661
1662
1663 --check_ref If specified, checks supplied reference allele against stored
1664 entry in Ensembl Core database [default: off]
1665 --check_existing If specified, checks for existing co-located variations in the
1666 Ensembl Variation database [default: off]
1667 --failed [0|1] Include (1) or exclude (0) variants that have been flagged as
1668 failed by Ensembl when checking for existing variants.
1669 [default: exclude]
1670 --check_alleles If specified, the alleles of existing co-located variations
1671 are compared to the input; an existing variation will only
1672 be reported if no novel allele is in the input (strand is
1673 accounted for) [default: off]
1674 --check_svs Report overlapping structural variants [default: off]
1675
1676 --filter [filters] Filter output by consequence type. Use this to output only
1677 variants that have at least one consequence type matching the
1678 filter. Multiple filters can be used separated by ",". By
1679 combining this with --original it is possible to run the VEP
1680 iteratively to progressively filter a set of variants. See
1681 documentation for full details [default: off]
1682
1683 --check_frequency Turns on frequency filtering. Use this to include or exclude
1684 variants based on the frequency of co-located existing
1685 variants in the Ensembl Variation database. You must also
1686 specify all of the following --freq flags [default: off]
1687 --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap,
1688 1kg_yri for YRI 1000 genomes. See documentation for more
1689 details
1690 --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5
1691 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or
1692 less than (lt) --freq_freq
1693 --freq_filter Specify whether variants that pass the above should be included
1694 [exclude|include] or excluded from analysis
1695 --gmaf Include global MAF of existing variant from 1000 Genomes
1696 Phase 1 in output
1697
1698 --individual [id] Consider only alternate alleles present in the genotypes of the
1699 specified individual(s). May be a single individual, a comma-
1700 separated list or "all" to assess all individuals separately.
1701 Each individual and variant combination is given on a separate
1702 line of output. Only works with VCF files containing individual
1703 genotype data; individual IDs are taken from column headers.
1704 --allow_non_variant Prints out non-variant lines when using VCF input
1705 --phased Force VCF individual genotypes to be interpreted as phased.
1706 For use with plugins that depend on phased state.
1707
1708 --chr [list] Select a subset of chromosomes to analyse from your file. Any
1709 data not on this chromosome in the input will be skipped. The
1710 list can be comma separated, with "-" characters representing
1711 a range e.g. 1-5,8,15,X [default: off]
1712 --gp If specified, tries to read GRCh37 position from GP field in the
1713 INFO column of a VCF file. Only applies when VCF is the input
1714 format and human is the species [default: off]
1715
1716 --convert Convert the input file to the output format specified.
1717 [ensembl|vcf|pileup] Converted output is written to the file specified in
1718 --output_file. No consequence calculation is carried out when
1719 doing file conversion. [default: off]
1720
1721 --refseq Use the otherfeatures database to retrieve transcripts - this
1722 database contains RefSeq transcripts (as well as CCDS and
1723 Ensembl EST alignments) [default: off]
1724 --host Manually define database host [default: "ensembldb.ensembl.org"]
1725 -u | --user Database username [default: "anonymous"]
1726 --port Database port [default: 5306]
1727 --password Database password [default: no password]
1728 --genomes Sets DB connection params for Ensembl Genomes [default: off]
1729 --registry Registry file to use defines DB connections [default: off]
1730 Defining a registry file overrides above connection settings.
1731 --db_version=[number] Force script to load DBs from a specific Ensembl version. Not
1732 advised due to likely incompatibilities between API and DB
1733
1734 --no_whole_genome Run in old-style, non-whole genome mode [default: off]
1735 --buffer_size Sets the number of variants sent in each batch [default: 5000]
1736 Increasing buffer size can retrieve results more quickly
1737 but requires more memory. Only applies to whole genome mode.
1738
1739 --cache Enables read-only use of cache [default: off]
1740 --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"]
1741 --write_cache Enable writing to cache [default: off]
1742 --build [all|list] Build a complete cache for the selected species. Build for all
1743 chromosomes with --build all, or a list of chromosomes (see
1744 --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS
1745 VIOLATES OUR FAIR USAGE POLICY [default: off]
1746
1747 --compress Specify utility to decompress cache files - may be "gzcat" or
1748 "gzip -dc" Only use if default does not work [default: zcat]
1749
1750 --skip_db_check ADVANCED! Force the script to use a cache built from a different
1751 database than specified with --host. Only use this if you are
1752 sure the hosts are compatible (e.g. ensembldb.ensembl.org and
1753 useastdb.ensembl.org) [default: off]
1754 --cache_region_size ADVANCED! The size in base-pairs of the region covered by one
1755 file in the cache. [default: 1MB]
1756 END
1757
1758 print $usage;
1759 }