0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
6 Genome Research Limited. All rights reserved.
|
|
7
|
|
8 This software is distributed under a modified Apache license.
|
|
9 For license details, please see
|
|
10
|
|
11 http://www.ensembl.org/info/about/code_licence.html
|
|
12
|
|
13 =head1 CONTACT
|
|
14
|
|
15 Please email comments or questions to the public Ensembl
|
|
16 developers list at <dev@ensembl.org>.
|
|
17
|
|
18 Questions may also be sent to the Ensembl help desk at
|
|
19 <helpdesk@ensembl.org>.
|
|
20
|
|
21 =cut
|
|
22
|
|
23 =head1 NAME
|
|
24
|
|
25 Variant Effect Predictor - a script to predict the consequences of genomic variants
|
|
26
|
|
27 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
|
|
28
|
|
29 Version 2.6
|
|
30
|
|
31 by Will McLaren (wm2@ebi.ac.uk)
|
|
32 =cut
|
|
33
|
|
34 use strict;
|
|
35 use Getopt::Long;
|
|
36 use FileHandle;
|
|
37 use FindBin qw($Bin);
|
|
38 use lib $Bin;
|
|
39
|
|
40 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
|
|
41 use Bio::EnsEMBL::Variation::Utils::VEP qw(
|
|
42 parse_line
|
|
43 vf_to_consequences
|
|
44 validate_vf
|
|
45 convert_to_vcf
|
|
46 load_dumped_adaptor_cache
|
|
47 dump_adaptor_cache
|
|
48 get_all_consequences
|
|
49 get_slice
|
|
50 build_full_cache
|
|
51 read_cache_info
|
|
52 get_time
|
|
53 debug
|
|
54 @OUTPUT_COLS
|
|
55 @REG_FEAT_TYPES
|
|
56 %FILTER_SHORTCUTS
|
|
57 );
|
|
58
|
|
59 # global vars
|
|
60 my $VERSION = '2.6';
|
|
61
|
|
62
|
|
63 # define headers that would normally go in the extra field
|
|
64 # keyed on the config parameter used to turn it on
|
|
65 my %extra_headers = (
|
|
66 protein => ['ENSP'],
|
|
67 canonical => ['CANONICAL'],
|
|
68 ccds => ['CCDS'],
|
|
69 hgvs => ['HGVSc','HGVSp'],
|
|
70 hgnc => ['HGNC'],
|
|
71 sift => ['SIFT'],
|
|
72 polyphen => ['PolyPhen'],
|
|
73 numbers => ['EXON','INTRON'],
|
|
74 domains => ['DOMAINS'],
|
|
75 regulatory => ['MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE'],
|
|
76 cell_type => ['CELL_TYPE'],
|
|
77 individual => ['IND'],
|
|
78 xref_refseq => ['RefSeq'],
|
|
79 check_svs => ['SV'],
|
|
80 check_frequency => ['FREQS'],
|
|
81 gmaf => ['GMAF'],
|
|
82 user => ['DISTANCE'],
|
|
83 );
|
|
84
|
|
85 my %extra_descs = (
|
|
86 'CANONICAL' => 'Indicates if transcript is canonical for this gene',
|
|
87 'CCDS' => 'Indicates if transcript is a CCDS transcript',
|
|
88 'HGNC' => 'HGNC gene identifier',
|
|
89 'ENSP' => 'Ensembl protein identifer',
|
|
90 'HGVSc' => 'HGVS coding sequence name',
|
|
91 'HGVSp' => 'HGVS protein sequence name',
|
|
92 'SIFT' => 'SIFT prediction',
|
|
93 'PolyPhen' => 'PolyPhen prediction',
|
|
94 'EXON' => 'Exon number(s) / total',
|
|
95 'INTRON' => 'Intron number(s) / total',
|
|
96 'DOMAINS' => 'The source and identifer of any overlapping protein domains',
|
|
97 'MOTIF_NAME' => 'The source and identifier of a transcription factor binding profile (TFBP) aligned at this position',
|
|
98 'MOTIF_POS' => 'The relative position of the variation in the aligned TFBP',
|
|
99 'HIGH_INF_POS' => 'A flag indicating if the variant falls in a high information position of the TFBP',
|
|
100 'MOTIF_SCORE_CHANGE' => 'The difference in motif score of the reference and variant sequences for the TFBP',
|
|
101 'CELL_TYPE' => 'List of cell types and classifications for regulatory feature',
|
|
102 'IND' => 'Individual name',
|
|
103 'SV' => 'IDs of overlapping structural variants',
|
|
104 'FREQS' => 'Frequencies of overlapping variants used in filtering',
|
|
105 'GMAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1',
|
|
106 'DISTANCE' => 'Shortest distance from variant to transcript',
|
|
107 );
|
|
108
|
|
109 # set output autoflush for progress bars
|
|
110 $| = 1;
|
|
111
|
|
112 # configure from command line opts
|
|
113 my $config = &configure(scalar @ARGV);
|
|
114
|
|
115 # run the main sub routine
|
|
116 &main($config);
|
|
117
|
|
118 # this is the main sub-routine - it needs the configured $config hash
|
|
119 sub main {
|
|
120 my $config = shift;
|
|
121
|
|
122 debug("Starting...") unless defined $config->{quiet};
|
|
123
|
|
124 $config->{start_time} = time();
|
|
125 $config->{last_time} = time();
|
|
126
|
|
127 my $tr_cache = {};
|
|
128 my $rf_cache = {};
|
|
129
|
|
130 # create a hash to hold slices so we don't get the same one twice
|
|
131 my %slice_cache = ();
|
|
132
|
|
133 my @vfs;
|
|
134 my ($vf_count, $total_vf_count);
|
|
135 my $in_file_handle = $config->{in_file_handle};
|
|
136
|
|
137 # initialize line number in config
|
|
138 $config->{line_number} = 0;
|
|
139
|
|
140 # read the file
|
|
141 while(<$in_file_handle>) {
|
|
142 chomp;
|
|
143
|
|
144 $config->{line_number}++;
|
|
145
|
|
146 # header line?
|
|
147 if(/^\#/) {
|
|
148
|
|
149 # retain header lines if we are outputting VCF
|
|
150 if(defined($config->{vcf})) {
|
|
151 push @{$config->{headers}}, $_;
|
|
152 }
|
|
153
|
|
154 # line with sample labels in VCF
|
|
155 if(defined($config->{individual}) && /^#CHROM/) {
|
|
156 my @split = split /\s+/;
|
|
157
|
|
158 # no individuals
|
|
159 die("ERROR: No individual data found in VCF\n") if scalar @split <= 9;
|
|
160
|
|
161 # get individual column indices
|
|
162 my %ind_cols = map {$split[$_] => $_} (9..$#split);
|
|
163
|
|
164 # all?
|
|
165 if(scalar @{$config->{individual}} == 1 && $config->{individual}->[0] =~ /^all$/i) {
|
|
166 $config->{ind_cols} = \%ind_cols;
|
|
167 }
|
|
168 else {
|
|
169 my %new_ind_cols;
|
|
170
|
|
171 # check we have specified individual(s)
|
|
172 foreach my $ind(@{$config->{individual}}) {
|
|
173 die("ERROR: Individual named \"$ind\" not found in VCF\n") unless defined $ind_cols{$ind};
|
|
174 $new_ind_cols{$ind} = $ind_cols{$ind};
|
|
175 }
|
|
176
|
|
177 $config->{ind_cols} = \%new_ind_cols;
|
|
178 }
|
|
179 }
|
|
180
|
|
181 next;
|
|
182 }
|
|
183
|
|
184 # configure output file
|
|
185 $config->{out_file_handle} ||= &get_out_file_handle($config);
|
|
186
|
|
187 # some lines (pileup) may actually parse out into more than one variant
|
|
188 foreach my $vf(@{&parse_line($config, $_)}) {
|
|
189
|
|
190 $vf->{_line} = $_ ;#if defined($config->{vcf}) || defined($config->{original});
|
|
191
|
|
192 # now get the slice
|
|
193 if(!defined($vf->{slice})) {
|
|
194 my $slice;
|
|
195
|
|
196 # don't get slices if we're using cache
|
|
197 # we can steal them from transcript objects later
|
|
198 if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) {
|
|
199
|
|
200 # check if we have fetched this slice already
|
|
201 if(defined $slice_cache{$vf->{chr}}) {
|
|
202 $slice = $slice_cache{$vf->{chr}};
|
|
203 }
|
|
204
|
|
205 # if not create a new one
|
|
206 else {
|
|
207
|
|
208 $slice = &get_slice($config, $vf->{chr});
|
|
209
|
|
210 # if failed, warn and skip this line
|
|
211 if(!defined($slice)) {
|
|
212 warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet};
|
|
213 next;
|
|
214 }
|
|
215
|
|
216 # store the hash
|
|
217 $slice_cache{$vf->{chr}} = $slice;
|
|
218 }
|
|
219 }
|
|
220
|
|
221 $vf->{slice} = $slice;
|
|
222 }
|
|
223
|
|
224 # validate the VF
|
|
225 next unless validate_vf($config, $vf);
|
|
226
|
|
227 # make a name if one doesn't exist
|
|
228 $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.($vf->{allele_string} || $vf->{class_SO_term});
|
|
229
|
|
230 # jump out to convert here
|
|
231 if(defined($config->{convert})) {
|
|
232 &convert_vf($config, $vf);
|
|
233 next;
|
|
234 }
|
|
235
|
|
236 if(defined $config->{whole_genome}) {
|
|
237 push @vfs, $vf;
|
|
238 $vf_count++;
|
|
239 $total_vf_count++;
|
|
240
|
|
241 if($vf_count == $config->{buffer_size}) {
|
|
242 debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
|
|
243
|
|
244 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)};
|
|
245
|
|
246 # calculate stats
|
|
247 my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1));
|
|
248 my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1));
|
|
249 $config->{last_time} = time();
|
|
250
|
|
251 debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet});
|
|
252
|
|
253 @vfs = ();
|
|
254 $vf_count = 0;
|
|
255 }
|
|
256 }
|
|
257 else {
|
|
258 print_line($config, $_) foreach @{vf_to_consequences($config, $vf)};
|
|
259 $vf_count++;
|
|
260 $total_vf_count++;
|
|
261 debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose});
|
|
262 }
|
|
263 }
|
|
264 }
|
|
265
|
|
266 # if in whole-genome mode, finish off the rest of the buffer
|
|
267 if(defined $config->{whole_genome} && scalar @vfs) {
|
|
268 debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
|
|
269
|
|
270 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)};
|
|
271
|
|
272 # calculate stats
|
|
273 my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1));
|
|
274 my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1));
|
|
275 $config->{last_time} = time();
|
|
276
|
|
277 debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet});
|
|
278
|
|
279 debug($config->{filter_count}, "/$total_vf_count variants remain after filtering") if defined($config->{filter}) && !defined($config->{quiet});
|
|
280 }
|
|
281
|
|
282 debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet});
|
|
283
|
|
284 debug("Finished!") unless defined $config->{quiet};
|
|
285 }
|
|
286
|
|
287 # sets up configuration hash that is used throughout the script
|
|
288 sub configure {
|
|
289 my $args = shift;
|
|
290
|
|
291 my $config = {};
|
|
292
|
|
293 GetOptions(
|
|
294 $config,
|
|
295 'help', # displays help message
|
|
296
|
|
297 # input options,
|
|
298 'config=s', # config file name
|
|
299 'input_file|i=s', # input file name
|
|
300 'format=s', # input file format
|
|
301
|
|
302 # DB options
|
|
303 'species=s', # species e.g. human, homo_sapiens
|
|
304 'registry=s', # registry file
|
|
305 'host=s', # database host
|
|
306 'port=s', # database port
|
|
307 'user=s', # database user name
|
|
308 'password=s', # database password
|
|
309 'db_version=i', # Ensembl database version to use e.g. 62
|
|
310 'genomes', # automatically sets DB params for e!Genomes
|
|
311 'refseq', # use otherfeatures RefSeq DB instead of Ensembl
|
|
312 #'no_disconnect', # disables disconnect_when_inactive
|
|
313
|
|
314 # runtime options
|
|
315 'most_severe', # only return most severe consequence
|
|
316 'summary', # only return one line per variation with all consquence types
|
|
317 'per_gene', # only return most severe per gene
|
|
318 'buffer_size=i', # number of variations to read in before analysis
|
|
319 'chunk_size=s', # size in bases of "chunks" used in internal hash structure
|
|
320 'failed=i', # include failed variations when finding existing
|
|
321 'no_whole_genome', # disables now default whole-genome mode
|
|
322 'whole_genome', # proxy for whole genome mode - now just warns user
|
|
323 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG)
|
|
324 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT
|
|
325 'check_ref', # check supplied reference allele against DB
|
|
326 'check_existing', # find existing co-located variations
|
|
327 'check_svs', # find overlapping structural variations
|
|
328 'check_alleles', # only attribute co-located if alleles are the same
|
|
329 'check_frequency', # enable frequency checking
|
|
330 'gmaf', # add global MAF of existing var
|
|
331 'freq_filter=s', # exclude or include
|
|
332 'freq_freq=f', # frequency to filter on
|
|
333 'freq_gt_lt=s', # gt or lt (greater than or less than)
|
|
334 'freq_pop=s', # population to filter on
|
|
335 'allow_non_variant', # allow non-variant VCF lines through
|
|
336 'individual=s', # give results by genotype for individuals
|
|
337 'phased', # force VCF genotypes to be interpreted as phased
|
|
338 'fork=i', # fork into N processes
|
|
339
|
|
340 # verbosity options
|
|
341 'verbose|v', # print out a bit more info while running
|
|
342 'quiet', # print nothing to STDOUT (unless using -o stdout)
|
|
343 'no_progress', # don't display progress bars
|
|
344
|
|
345 # output options
|
|
346 'everything|e', # switch on EVERYTHING :-)
|
|
347 'output_file|o=s', # output file name
|
|
348 'force_overwrite', # force overwrite of output file if already exists
|
|
349 'terms|t=s', # consequence terms to use e.g. NCBI, SO
|
|
350 'coding_only', # only return results for consequences in coding regions
|
|
351 'canonical', # indicates if transcript is canonical
|
|
352 'ccds', # output CCDS identifer
|
|
353 'xref_refseq', # output refseq mrna xref
|
|
354 'protein', # add e! protein ID to extra column
|
|
355 'hgnc', # add HGNC gene ID to extra column
|
|
356 'hgvs', # add HGVS names to extra column
|
|
357 'sift=s', # SIFT predictions
|
|
358 'polyphen=s', # PolyPhen predictions
|
|
359 'condel=s', # Condel predictions
|
|
360 'regulatory', # enable regulatory stuff
|
|
361 'cell_type=s', # filter cell types for regfeats
|
|
362 'convert=s', # convert input to another format (doesn't run VEP)
|
|
363 'filter=s', # run in filtering mode
|
|
364 'no_intergenic', # don't print out INTERGENIC consequences
|
|
365 'gvf', # produce gvf output
|
|
366 'vcf', # produce vcf output
|
|
367 'original', # produce output in input format
|
|
368 'no_consequences', # don't calculate consequences
|
|
369 'lrg', # enable LRG-based features
|
|
370 'fields=s', # define your own output fields
|
|
371 'domains', # output overlapping protein features
|
|
372 'numbers', # include exon and intron numbers
|
|
373
|
|
374 # cache stuff
|
|
375 'cache', # use cache
|
|
376 'write_cache', # enables writing to the cache
|
|
377 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs
|
|
378 'no_adaptor_cache', # don't write adaptor cache
|
|
379 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript
|
|
380 'strip', # strips adaptors etc from objects before caching them
|
|
381 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more
|
|
382 'dir=s', # dir where cache is found (defaults to $HOME/.vep/)
|
|
383 'cache_region_size=i', # size of region in bases for each cache file
|
|
384 'no_slice_cache', # tell API not to cache features on slice
|
|
385 'standalone', # standalone renamed offline
|
|
386 'offline', # offline mode uses minimal set of modules installed in same dir, no DB connection
|
|
387 'skip_db_check', # don't compare DB parameters with cached
|
|
388 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc"
|
|
389 'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped file with annotation
|
|
390 'tmpdir=s', # tmp dir used for BigWig retrieval
|
|
391 'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory
|
|
392
|
|
393 # debug
|
|
394 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL
|
|
395 'count_queries', # counts SQL queries executed
|
|
396 'admin', # allows me to build off public hosts
|
|
397 'debug', # print out debug info
|
|
398 'tabix', # experimental use tabix cache files
|
|
399 ) or die "ERROR: Failed to parse command-line flags\n";
|
|
400
|
|
401 # print usage message if requested or no args supplied
|
|
402 if(defined($config->{help}) || !$args) {
|
|
403 &usage;
|
|
404 exit(0);
|
|
405 }
|
|
406
|
|
407 # dir is where the cache and plugins live
|
|
408 $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep');
|
|
409
|
|
410 # dir gets set to the specific cache directory later on, so take a copy to use
|
|
411 # when configuring plugins
|
|
412
|
|
413 $config->{toplevel_dir} = $config->{dir};
|
|
414
|
|
415 # ini file?
|
|
416 my $ini_file = $config->{dir}.'/vep.ini';
|
|
417
|
|
418 if(-e $ini_file) {
|
|
419 read_config_from_file($config, $ini_file);
|
|
420 }
|
|
421
|
|
422 # config file?
|
|
423 if(defined $config->{config}) {
|
|
424 read_config_from_file($config, $config->{config});
|
|
425 }
|
|
426
|
|
427 # can't be both quiet and verbose
|
|
428 die "ERROR: Can't be both quiet and verbose!\n" if defined($config->{quiet}) && defined($config->{verbose});
|
|
429
|
|
430 # check forking
|
|
431 if(defined($config->{fork})) {
|
|
432 die "ERROR: Fork number must be greater than 1\n" if $config->{fork} <= 1;
|
|
433
|
|
434 # check we can use MIME::Base64
|
|
435 eval q{ use MIME::Base64; };
|
|
436
|
|
437 if($@) {
|
|
438 debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet});
|
|
439 delete $config->{fork};
|
|
440 }
|
|
441 else {
|
|
442
|
|
443 # try a practice fork
|
|
444 my $pid = fork;
|
|
445
|
|
446 if(!defined($pid)) {
|
|
447 debug("WARNING: Fork test failed, forking disabled") unless defined($config->{quiet});
|
|
448 delete $config->{fork};
|
|
449 }
|
|
450 elsif($pid) {
|
|
451 waitpid($pid, 0);
|
|
452 }
|
|
453 elsif($pid == 0) {
|
|
454 exit(0);
|
|
455 }
|
|
456 }
|
|
457 }
|
|
458
|
|
459 # check file format
|
|
460 if(defined $config->{format}) {
|
|
461 die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /^(pileup|vcf|guess|hgvs|ensembl|id|vep)$/i;
|
|
462 }
|
|
463
|
|
464 # check convert format
|
|
465 if(defined $config->{convert}) {
|
|
466 die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup|hgvs/i;
|
|
467 }
|
|
468
|
|
469 # check if user still using --standalone
|
|
470 if(defined $config->{standalone}) {
|
|
471 die "ERROR: --standalone replaced by --offline\n";
|
|
472 }
|
|
473
|
|
474 # connection settings for Ensembl Genomes
|
|
475 if($config->{genomes}) {
|
|
476 $config->{host} ||= 'mysql.ebi.ac.uk';
|
|
477 $config->{port} ||= 4157;
|
|
478 }
|
|
479
|
|
480 # connection settings for main Ensembl
|
|
481 else {
|
|
482 $config->{species} ||= "homo_sapiens";
|
|
483 $config->{host} ||= 'ensembldb.ensembl.org';
|
|
484 $config->{port} ||= 5306;
|
|
485 }
|
|
486
|
|
487 # refseq or core?
|
|
488 if(defined($config->{refseq})) {
|
|
489 $config->{core_type} = 'otherfeatures';
|
|
490 }
|
|
491 else {
|
|
492 $config->{core_type} = 'core';
|
|
493 }
|
|
494
|
|
495 # output term
|
|
496 if(defined $config->{terms}) {
|
|
497 die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i;
|
|
498 if($config->{terms} =~ /ensembl|display/i) {
|
|
499 $config->{terms} = 'display';
|
|
500 }
|
|
501 else {
|
|
502 $config->{terms} = uc($config->{terms});
|
|
503 }
|
|
504 }
|
|
505
|
|
506 # everything?
|
|
507 if(defined($config->{everything})) {
|
|
508 my %everything = (
|
|
509 sift => 'b',
|
|
510 polyphen => 'b',
|
|
511 ccds => 1,
|
|
512 hgvs => 1,
|
|
513 hgnc => 1,
|
|
514 numbers => 1,
|
|
515 domains => 1,
|
|
516 regulatory => 1,
|
|
517 canonical => 1,
|
|
518 protein => 1,
|
|
519 gmaf => 1,
|
|
520 );
|
|
521
|
|
522 $config->{$_} = $everything{$_} for keys %everything;
|
|
523
|
|
524 # these ones won't work with offline
|
|
525 delete $config->{hgvs} if defined($config->{offline});
|
|
526 }
|
|
527
|
|
528 # check nsSNP tools
|
|
529 foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) {
|
|
530 die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/;
|
|
531
|
|
532 die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i;
|
|
533
|
|
534 die "ERROR: $tool functionality is now available as a VEP Plugin - see http://www.ensembl.org/info/docs/variation/vep/vep_script.html#plugins\n" if $tool eq 'Condel';
|
|
535 }
|
|
536
|
|
537 # force quiet if outputting to STDOUT
|
|
538 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
|
|
539 delete $config->{verbose} if defined($config->{verbose});
|
|
540 $config->{quiet} = 1;
|
|
541 }
|
|
542
|
|
543 # individual(s) specified?
|
|
544 if(defined($config->{individual})) {
|
|
545 $config->{individual} = [split /\,/, $config->{individual}];
|
|
546
|
|
547 # force allow_non_variant
|
|
548 $config->{allow_non_variant} = 1;
|
|
549 }
|
|
550
|
|
551 # summarise options if verbose
|
|
552 if(defined $config->{verbose}) {
|
|
553 my $header =<<INTRO;
|
|
554 #----------------------------------#
|
|
555 # ENSEMBL VARIANT EFFECT PREDICTOR #
|
|
556 #----------------------------------#
|
|
557
|
|
558 version $VERSION
|
|
559
|
|
560 By Will McLaren (wm2\@ebi.ac.uk)
|
|
561
|
|
562 Configuration options:
|
|
563
|
|
564 INTRO
|
|
565 print $header;
|
|
566
|
|
567 my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1];
|
|
568
|
|
569 foreach my $key(sort keys %$config) {
|
|
570 next if ref($config->{$key}) eq 'ARRAY' && scalar @{$config->{$key}} == 0;
|
|
571 print $key.(' ' x (($max_length - length($key)) + 4)).(ref($config->{$key}) eq 'ARRAY' ? join "\t", @{$config->{$key}} : $config->{$key})."\n";
|
|
572 }
|
|
573
|
|
574 print "\n".("-" x 20)."\n\n";
|
|
575 }
|
|
576
|
|
577 # check custom annotations
|
|
578 for my $i(0..$#{$config->{custom}}) {
|
|
579 my $custom = $config->{custom}->[$i];
|
|
580
|
|
581 my ($filepath, $shortname, $format, $type, $coords) = split /\,/, $custom;
|
|
582 $type ||= 'exact';
|
|
583 $format ||= 'bed';
|
|
584 $coords ||= 0;
|
|
585
|
|
586 # check type
|
|
587 die "ERROR: Type $type for custom annotation file $filepath is not allowed (must be one of \"exact\", \"overlap\")\n" unless $type =~ /exact|overlap/;
|
|
588
|
|
589 # check format
|
|
590 die "ERROR: Format $format for custom annotation file $filepath is not allowed (must be one of \"bed\", \"vcf\", \"gtf\", \"gff\", \"bigwig\")\n" unless $format =~ /bed|vcf|gff|gtf|bigwig/;
|
|
591
|
|
592 # bigwig format
|
|
593 if($format eq 'bigwig') {
|
|
594 # check for bigWigToWig
|
|
595 die "ERROR: bigWigToWig does not seem to be in your path - this is required to use bigwig format custom annotations\n" unless `which bigWigToWig 2>&1` =~ /bigWigToWig$/;
|
|
596 }
|
|
597
|
|
598 else {
|
|
599 # check for tabix
|
|
600 die "ERROR: tabix does not seem to be in your path - this is required to use custom annotations\n" unless `which tabix 2>&1` =~ /tabix$/;
|
|
601
|
|
602 # remote files?
|
|
603 if($filepath =~ /tp\:\/\//) {
|
|
604 my $remote_test = `tabix $filepath 1:1-1 2>&1`;
|
|
605 if($remote_test =~ /fail/) {
|
|
606 die "$remote_test\nERROR: Could not find file or index file for remote annotation file $filepath\n";
|
|
607 }
|
|
608 elsif($remote_test =~ /get_local_version/) {
|
|
609 debug("Downloaded tabix index file for remote annotation file $filepath") unless defined($config->{quiet});
|
|
610 }
|
|
611 }
|
|
612
|
|
613 # check files exist
|
|
614 else {
|
|
615 die "ERROR: Custom annotation file $filepath not found\n" unless -e $filepath;
|
|
616 die "ERROR: Tabix index file $filepath\.tbi not found - perhaps you need to create it first?\n" unless -e $filepath.'.tbi';
|
|
617 }
|
|
618 }
|
|
619
|
|
620 $config->{custom}->[$i] = {
|
|
621 'file' => $filepath,
|
|
622 'name' => $shortname || 'CUSTOM'.($i + 1),
|
|
623 'type' => $type,
|
|
624 'format' => $format,
|
|
625 'coords' => $coords,
|
|
626 };
|
|
627 }
|
|
628
|
|
629 # check if using filter and original
|
|
630 die "ERROR: You must also provide output filters using --filter to use --original\n" if defined($config->{original}) && !defined($config->{filter});
|
|
631
|
|
632 # filter by consequence?
|
|
633 if(defined($config->{filter})) {
|
|
634
|
|
635 my %filters = map {$_ => 1} split /\,/, $config->{filter};
|
|
636
|
|
637 # add in shortcuts
|
|
638 foreach my $filter(keys %filters) {
|
|
639 my $value = 1;
|
|
640 if($filter =~ /^no_/) {
|
|
641 delete $filters{$filter};
|
|
642 $filter =~ s/^no_//g;
|
|
643 $value = 0;
|
|
644 $filters{$filter} = $value;
|
|
645 }
|
|
646
|
|
647 if(defined($FILTER_SHORTCUTS{$filter})) {
|
|
648 delete $filters{$filter};
|
|
649 $filters{$_} = $value for keys %{$FILTER_SHORTCUTS{$filter}};
|
|
650 }
|
|
651 }
|
|
652
|
|
653 $config->{filter} = \%filters;
|
|
654
|
|
655 $config->{filter_count} = 0;
|
|
656 }
|
|
657
|
|
658 # set defaults
|
|
659 $config->{user} ||= 'anonymous';
|
|
660 $config->{buffer_size} ||= 5000;
|
|
661 $config->{chunk_size} ||= '50kb';
|
|
662 $config->{output_file} ||= "variant_effect_output.txt";
|
|
663 $config->{tmpdir} ||= '/tmp';
|
|
664 $config->{format} ||= 'guess';
|
|
665 $config->{terms} ||= 'SO';
|
|
666 $config->{cache_region_size} ||= 1000000;
|
|
667 $config->{compress} ||= 'zcat';
|
|
668
|
|
669 # regulatory has to be on for cell_type
|
|
670 if(defined($config->{cell_type})) {
|
|
671 $config->{regulatory} = 1;
|
|
672 $config->{cell_type} = [split /\,/, $config->{cell_type}] if defined($config->{cell_type});
|
|
673 }
|
|
674
|
|
675 # can't use a whole bunch of options with most_severe
|
|
676 if(defined($config->{most_severe})) {
|
|
677 foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains summary)) {
|
|
678 die "ERROR: --most_severe is not compatible with --$flag\n" if defined($config->{$flag});
|
|
679 }
|
|
680 }
|
|
681
|
|
682 # can't use a whole bunch of options with summary
|
|
683 if(defined($config->{summary})) {
|
|
684 foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains most_severe)) {
|
|
685 die "ERROR: --summary is not compatible with --$flag\n" if defined($config->{$flag});
|
|
686 }
|
|
687 }
|
|
688
|
|
689 # frequency filtering
|
|
690 if(defined($config->{check_frequency})) {
|
|
691 foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) {
|
|
692 die "ERROR: To use --check_frequency you must also specify flag --$flag\n" unless defined $config->{$flag};
|
|
693 }
|
|
694
|
|
695 # need to set check_existing
|
|
696 $config->{check_existing} = 1;
|
|
697 }
|
|
698
|
|
699 $config->{check_existing} = 1 if defined $config->{check_alleles} || defined $config->{gmaf};
|
|
700
|
|
701 # warn users still using whole_genome flag
|
|
702 if(defined($config->{whole_genome})) {
|
|
703 debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet});
|
|
704 }
|
|
705
|
|
706 $config->{whole_genome} = 1 unless defined $config->{no_whole_genome};
|
|
707 $config->{failed} = 0 unless defined $config->{failed};
|
|
708 $config->{chunk_size} =~ s/mb?/000000/i;
|
|
709 $config->{chunk_size} =~ s/kb?/000/i;
|
|
710 $config->{cache_region_size} =~ s/mb?/000000/i;
|
|
711 $config->{cache_region_size} =~ s/kb?/000/i;
|
|
712
|
|
713 # cluck and display executed SQL?
|
|
714 $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck});
|
|
715
|
|
716 # offline needs cache, can't use HGVS
|
|
717 if(defined($config->{offline})) {
|
|
718 $config->{cache} = 1;
|
|
719
|
|
720 #die("ERROR: Cannot generate HGVS coordinates in offline mode\n") if defined($config->{hgvs});
|
|
721 die("ERROR: Cannot use HGVS as input in offline mode\n") if $config->{format} eq 'hgvs';
|
|
722 die("ERROR: Cannot use variant identifiers as input in offline mode\n") if $config->{format} eq 'id';
|
|
723 die("ERROR: Cannot do frequency filtering in offline mode\n") if defined($config->{check_frequency});
|
|
724 die("ERROR: Cannot retrieve overlapping structural variants in offline mode\n") if defined($config->{check_sv});
|
|
725 }
|
|
726
|
|
727 # write_cache needs cache
|
|
728 $config->{cache} = 1 if defined $config->{write_cache};
|
|
729
|
|
730 # no_slice_cache, prefetch and whole_genome have to be on to use cache
|
|
731 if(defined($config->{cache})) {
|
|
732 $config->{prefetch} = 1;
|
|
733 $config->{no_slice_cache} = 1;
|
|
734 $config->{whole_genome} = 1;
|
|
735 $config->{strip} = 1;
|
|
736 }
|
|
737
|
|
738 $config->{build} = $config->{rebuild} if defined($config->{rebuild});
|
|
739
|
|
740 # force options for full build
|
|
741 if(defined($config->{build})) {
|
|
742 $config->{prefetch} = 1;
|
|
743 $config->{hgnc} = 1;
|
|
744 $config->{no_slice_cache} = 1;
|
|
745 $config->{cache} = 1;
|
|
746 $config->{strip} = 1;
|
|
747 $config->{write_cache} = 1;
|
|
748 $config->{cell_type} = 1 if defined($config->{regulatory});
|
|
749 }
|
|
750
|
|
751 # connect to databases
|
|
752 $config->{reg} = &connect_to_dbs($config);
|
|
753
|
|
754 # complete dir with species name and db_version
|
|
755 $config->{dir} .= '/'.(
|
|
756 join '/', (
|
|
757 defined($config->{offline}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}),
|
|
758 $config->{db_version} || $config->{reg}->software_version
|
|
759 )
|
|
760 );
|
|
761
|
|
762 # warn user cache directory doesn't exist
|
|
763 if(!-e $config->{dir}) {
|
|
764
|
|
765 # if using write_cache
|
|
766 if(defined($config->{write_cache})) {
|
|
767 debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet});
|
|
768 }
|
|
769
|
|
770 # want to read cache, not found
|
|
771 elsif(defined($config->{cache})) {
|
|
772 die("ERROR: Cache directory ", $config->{dir}, " not found");
|
|
773 }
|
|
774 }
|
|
775
|
|
776 if(defined($config->{cache})) {
|
|
777 # read cache info
|
|
778 if(read_cache_info($config)) {
|
|
779 debug("Read existing cache info") unless defined $config->{quiet};
|
|
780 }
|
|
781 }
|
|
782
|
|
783 # we configure plugins here because they can sometimes switch on the
|
|
784 # regulatory config option
|
|
785 configure_plugins($config);
|
|
786
|
|
787 # include regulatory modules if requested
|
|
788 if(defined($config->{regulatory})) {
|
|
789 # do the use statements here so that users don't have to have the
|
|
790 # funcgen API installed to use the rest of the script
|
|
791 eval q{
|
|
792 use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor;
|
|
793 use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor;
|
|
794 use Bio::EnsEMBL::Funcgen::MotifFeature;
|
|
795 use Bio::EnsEMBL::Funcgen::RegulatoryFeature;
|
|
796 use Bio::EnsEMBL::Funcgen::BindingMatrix;
|
|
797 };
|
|
798
|
|
799 if($@) {
|
|
800 die("ERROR: Ensembl Funcgen API must be installed to use --regulatory or plugins that deal with regulatory features\n");
|
|
801 }
|
|
802 }
|
|
803
|
|
804 # user defined custom output fields
|
|
805 if(defined($config->{fields})) {
|
|
806 $config->{fields} = [split ',', $config->{fields}];
|
|
807 debug("Output fields redefined (".scalar @{$config->{fields}}." defined)") unless defined($config->{quiet});
|
|
808 $config->{fields_redefined} = 1;
|
|
809 }
|
|
810 $config->{fields} ||= \@OUTPUT_COLS;
|
|
811
|
|
812 # suppress warnings that the FeatureAdpators spit if using no_slice_cache
|
|
813 Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache});
|
|
814
|
|
815 # get adaptors (don't get them in offline mode)
|
|
816 unless(defined($config->{offline})) {
|
|
817
|
|
818 if(defined($config->{cache}) && !defined($config->{write_cache})) {
|
|
819
|
|
820 # try and load adaptors from cache
|
|
821 if(!&load_dumped_adaptor_cache($config)) {
|
|
822 &get_adaptors($config);
|
|
823 &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache});
|
|
824 }
|
|
825
|
|
826 # check cached adaptors match DB params
|
|
827 else {
|
|
828 my $dbc = $config->{sa}->{dbc};
|
|
829
|
|
830 my $ok = 1;
|
|
831
|
|
832 if($dbc->{_host} ne $config->{host}) {
|
|
833
|
|
834 # ens-livemirror, useastdb and ensembldb should all have identical DBs
|
|
835 unless(
|
|
836 (
|
|
837 $dbc->{_host} eq 'ens-livemirror'
|
|
838 || $dbc->{_host} eq 'ensembldb.ensembl.org'
|
|
839 || $dbc->{_host} eq 'useastdb.ensembl.org'
|
|
840 ) && (
|
|
841 $config->{host} eq 'ens-livemirror'
|
|
842 || $config->{host} eq 'ensembldb.ensembl.org'
|
|
843 || $config->{host} eq 'useastdb.ensembl.org'
|
|
844 )
|
|
845 ) {
|
|
846 $ok = 0;
|
|
847 }
|
|
848
|
|
849 unless(defined($config->{skip_db_check})) {
|
|
850 # but we still need to reconnect
|
|
851 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet});
|
|
852
|
|
853 &get_adaptors($config);
|
|
854 }
|
|
855 }
|
|
856
|
|
857 if(!$ok) {
|
|
858 if(defined($config->{skip_db_check})) {
|
|
859 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet});
|
|
860 }
|
|
861 else {
|
|
862 die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set";
|
|
863 }
|
|
864 }
|
|
865 }
|
|
866 }
|
|
867 else {
|
|
868 &get_adaptors($config);
|
|
869 &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache});
|
|
870 }
|
|
871
|
|
872 # reg adaptors (only fetches if not retrieved from cache already)
|
|
873 &get_reg_adaptors($config) if defined($config->{regulatory});
|
|
874 }
|
|
875
|
|
876 # check cell types
|
|
877 if(defined($config->{cell_type}) && !defined($config->{build})) {
|
|
878 my $cls = '';
|
|
879
|
|
880 if(defined($config->{cache})) {
|
|
881 $cls = $config->{cache_cell_types};
|
|
882 }
|
|
883 else {
|
|
884 my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor();
|
|
885 $cls = join ",", map {$_->name} @{$cta->fetch_all};
|
|
886 }
|
|
887
|
|
888 foreach my $cl(@{$config->{cell_type}}) {
|
|
889 die "ERROR: cell type $cl not recognised; available cell types are:\n$cls\n" unless $cls =~ /(^|,)$cl(,|$)/;
|
|
890 }
|
|
891 }
|
|
892
|
|
893 # get terminal width for progress bars
|
|
894 unless(defined($config->{quiet})) {
|
|
895 my $width;
|
|
896
|
|
897 # module may not be installed
|
|
898 eval q{
|
|
899 use Term::ReadKey;
|
|
900 };
|
|
901
|
|
902 if(!$@) {
|
|
903 my ($w, $h);
|
|
904
|
|
905 # module may be installed, but e.g.
|
|
906 eval {
|
|
907 #($w, $h) = GetTerminalSize();
|
|
908 $w = 167;
|
|
909 $h = 30;
|
|
910 };
|
|
911
|
|
912 $width = $w if defined $w;
|
|
913 }
|
|
914
|
|
915 $width ||= 60;
|
|
916 $width -= 12;
|
|
917 $config->{terminal_width} = $width;
|
|
918 }
|
|
919
|
|
920 # jump out to build cache if requested
|
|
921 if(defined($config->{build})) {
|
|
922
|
|
923 if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) {
|
|
924 die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n");
|
|
925 }
|
|
926
|
|
927 # build the cache
|
|
928 debug("Building cache for ".$config->{species}) unless defined($config->{quiet});
|
|
929 build_full_cache($config);
|
|
930
|
|
931 # exit script
|
|
932 debug("Finished building cache") unless defined($config->{quiet});
|
|
933 exit(0);
|
|
934 }
|
|
935
|
|
936
|
|
937 # warn user DB will be used for SIFT/PolyPhen/HGVS/frequency/LRG
|
|
938 if(defined($config->{cache})) {
|
|
939
|
|
940 # these two def depend on DB
|
|
941 foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency lrg check_sv)) {
|
|
942 debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet});
|
|
943 }
|
|
944
|
|
945 # as does using HGVS or IDs as input
|
|
946 debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet});
|
|
947
|
|
948 # the rest may be in the cache
|
|
949 foreach my $param(grep {defined $config->{$_}} qw(sift polyphen regulatory)) {
|
|
950 next if defined($config->{'cache_'.$param});
|
|
951 debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet});
|
|
952 }
|
|
953 }
|
|
954
|
|
955 # get list of chrs if supplied
|
|
956 if(defined($config->{chr})) {
|
|
957 my %chrs;
|
|
958
|
|
959 foreach my $val(split /\,/, $config->{chr}) {
|
|
960 my @nnn = split /\-/, $val;
|
|
961
|
|
962 foreach my $chr($nnn[0]..$nnn[-1]) {
|
|
963 $chrs{$chr} = 1;
|
|
964 }
|
|
965 }
|
|
966
|
|
967 $config->{chr} = \%chrs;
|
|
968 }
|
|
969
|
|
970 # get input file handle
|
|
971 $config->{in_file_handle} = &get_in_file_handle($config);
|
|
972
|
|
973 return $config;
|
|
974 }
|
|
975
|
|
976 # reads config from a file
|
|
977 sub read_config_from_file {
|
|
978 my $config = shift;
|
|
979 my $file = shift;
|
|
980
|
|
981 open CONFIG, $file or die "ERROR: Could not open config file \"$file\"\n";
|
|
982
|
|
983 while(<CONFIG>) {
|
|
984 next if /^\#/;
|
|
985 my @split = split /\s+|\=/;
|
|
986 my $key = shift @split;
|
|
987 $key =~ s/^\-//g;
|
|
988
|
|
989 if(defined($config->{$key}) && ref($config->{$key}) eq 'ARRAY') {
|
|
990 push @{$config->{$key}}, @split;
|
|
991 }
|
|
992 else {
|
|
993 $config->{$key} ||= $split[0];
|
|
994 }
|
|
995 }
|
|
996
|
|
997 close CONFIG;
|
|
998
|
|
999 # force quiet if outputting to STDOUT
|
|
1000 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
|
|
1001 delete $config->{verbose} if defined($config->{verbose});
|
|
1002 $config->{quiet} = 1;
|
|
1003 }
|
|
1004
|
|
1005 debug("Read configuration from $file") unless defined($config->{quiet});
|
|
1006 }
|
|
1007
|
|
1008 # configures custom VEP plugins
|
|
1009 sub configure_plugins {
|
|
1010
|
|
1011 my $config = shift;
|
|
1012
|
|
1013 $config->{plugins} = [];
|
|
1014
|
|
1015 if (my @plugins = @{ $config->{plugin} }) {
|
|
1016
|
|
1017 # add the Plugins directory onto @INC
|
|
1018
|
|
1019 unshift @INC, $config->{toplevel_dir}."/Plugins";
|
|
1020
|
|
1021 for my $plugin (@plugins) {
|
|
1022
|
|
1023 # parse out the module name and parameters
|
|
1024
|
|
1025 my ($module, @params) = split /,/, $plugin;
|
|
1026
|
|
1027 # check we can use the module
|
|
1028
|
|
1029 eval qq{
|
|
1030 use $module;
|
|
1031 };
|
|
1032 if ($@) {
|
|
1033 debug("Failed to compile plugin $module: $@") unless defined($config->{quiet});
|
|
1034 next;
|
|
1035 }
|
|
1036
|
|
1037 # now check we can instantiate it, passing any parameters to the constructor
|
|
1038
|
|
1039 my $instance;
|
|
1040
|
|
1041 eval {
|
|
1042 $instance = $module->new($config, @params);
|
|
1043 };
|
|
1044 if ($@) {
|
|
1045 debug("Failed to instantiate plugin $module: $@") unless defined($config->{quiet});
|
|
1046 next;
|
|
1047 }
|
|
1048
|
|
1049 # check that the versions match
|
|
1050
|
|
1051 my $plugin_version;
|
|
1052
|
|
1053 if ($instance->can('version')) {
|
|
1054 $plugin_version = $instance->version;
|
|
1055 }
|
|
1056
|
|
1057 my $version_ok = 1;
|
|
1058
|
|
1059 if ($plugin_version) {
|
|
1060 my ($plugin_major, $plugin_minor, $plugin_maintenance) = split /\./, $plugin_version;
|
|
1061 my ($major, $minor, $maintenance) = split /\./, $VERSION;
|
|
1062
|
|
1063 if ($plugin_major != $major) {
|
|
1064 debug("Warning: plugin $plugin version ($plugin_version) does not match the current VEP version ($VERSION)") unless defined($config->{quiet});
|
|
1065 $version_ok = 0;
|
|
1066 }
|
|
1067 }
|
|
1068 else {
|
|
1069 debug("Warning: plugin $plugin does not define a version number") unless defined($config->{quiet});
|
|
1070 $version_ok = 0;
|
|
1071 }
|
|
1072
|
|
1073 debug("You may experience unexpected behaviour with this plugin") unless defined($config->{quiet}) || $version_ok;
|
|
1074
|
|
1075 # check that it implements all necessary methods
|
|
1076
|
|
1077 for my $required(qw(run get_header_info check_feature_type check_variant_feature_type)) {
|
|
1078 unless ($instance->can($required)) {
|
|
1079 debug("Plugin $module doesn't implement a required method '$required', does it inherit from BaseVepPlugin?") unless defined($config->{quiet});
|
|
1080 next;
|
|
1081 }
|
|
1082 }
|
|
1083
|
|
1084 # all's good, so save the instance in our list of plugins
|
|
1085
|
|
1086 push @{ $config->{plugins} }, $instance;
|
|
1087
|
|
1088 debug("Loaded plugin: $module") unless defined($config->{quiet});
|
|
1089
|
|
1090 # for convenience, check if the plugin wants regulatory stuff and turn on the config option if so
|
|
1091
|
|
1092 if (grep { $_ =~ /motif|regulatory/i } @{ $instance->feature_types }) {
|
|
1093 debug("Fetching regulatory features for plugin: $module") unless defined($config->{quiet});
|
|
1094 $config->{regulatory} = 1;
|
|
1095 }
|
|
1096 }
|
|
1097 }
|
|
1098 }
|
|
1099
|
|
1100 # connects to DBs (not done in offline mode)
|
|
1101 sub connect_to_dbs {
|
|
1102 my $config = shift;
|
|
1103
|
|
1104 # get registry
|
|
1105 my $reg = 'Bio::EnsEMBL::Registry';
|
|
1106
|
|
1107 unless(defined($config->{offline})) {
|
|
1108 # load DB options from registry file if given
|
|
1109 if(defined($config->{registry})) {
|
|
1110 debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet});
|
|
1111 $reg->load_all(
|
|
1112 $config->{registry},
|
|
1113 $config->{verbose},
|
|
1114 undef,
|
|
1115 $config->{no_slice_cache}
|
|
1116 );
|
|
1117 }
|
|
1118
|
|
1119 # otherwise manually connect to DB server
|
|
1120 else {
|
|
1121 $reg->load_registry_from_db(
|
|
1122 -host => $config->{host},
|
|
1123 -user => $config->{user},
|
|
1124 -pass => $config->{password},
|
|
1125 -port => $config->{port},
|
|
1126 -db_version => $config->{db_version},
|
|
1127 -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef,
|
|
1128 -verbose => $config->{verbose},
|
|
1129 -no_cache => $config->{no_slice_cache},
|
|
1130 );
|
|
1131 }
|
|
1132
|
|
1133 eval { $reg->set_reconnect_when_lost() };
|
|
1134
|
|
1135 if(defined($config->{verbose})) {
|
|
1136 # get a meta container adaptors to check version
|
|
1137 my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer');
|
|
1138 my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer');
|
|
1139
|
|
1140 if($core_mca && $var_mca) {
|
|
1141 debug(
|
|
1142 "Connected to core version ", $core_mca->get_schema_version, " database ",
|
|
1143 "and variation version ", $var_mca->get_schema_version, " database"
|
|
1144 );
|
|
1145 }
|
|
1146 }
|
|
1147 }
|
|
1148
|
|
1149 return $reg;
|
|
1150 }
|
|
1151
|
|
1152 # get adaptors from DB
|
|
1153 sub get_adaptors {
|
|
1154 my $config = shift;
|
|
1155
|
|
1156 die "ERROR: No registry" unless defined $config->{reg};
|
|
1157
|
|
1158 $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature');
|
|
1159 $config->{svfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'structuralvariationfeature');
|
|
1160 $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation');
|
|
1161 $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix');
|
|
1162 $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation');
|
|
1163
|
|
1164 # get fake ones for species with no var DB
|
|
1165 if(!defined($config->{vfa})) {
|
|
1166 $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species});
|
|
1167 $config->{svfa} = Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor->new_fake($config->{species});
|
|
1168 $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species});
|
|
1169 }
|
|
1170
|
|
1171 $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice');
|
|
1172 $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene');
|
|
1173 $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript');
|
|
1174 $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer');
|
|
1175 $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem');
|
|
1176
|
|
1177 # cache schema version
|
|
1178 $config->{mca}->get_schema_version if defined $config->{mca};
|
|
1179
|
|
1180 # check we got slice adaptor - can't continue without a core DB
|
|
1181 die("ERROR: Could not connect to core database\n") unless defined $config->{sa};
|
|
1182 }
|
|
1183
|
|
1184 # gets regulatory adaptors
|
|
1185 sub get_reg_adaptors {
|
|
1186 my $config = shift;
|
|
1187
|
|
1188 foreach my $type(@REG_FEAT_TYPES) {
|
|
1189 next if defined($config->{$type.'_adaptor'});
|
|
1190
|
|
1191 my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type);
|
|
1192 if(defined($adaptor)) {
|
|
1193 $config->{$type.'_adaptor'} = $adaptor;
|
|
1194 }
|
|
1195 else {
|
|
1196 delete $config->{regulatory};
|
|
1197 last;
|
|
1198 }
|
|
1199 }
|
|
1200 }
|
|
1201
|
|
1202 # gets file handle for input
|
|
1203 sub get_in_file_handle {
|
|
1204 my $config = shift;
|
|
1205
|
|
1206 # define the filehandle to read input from
|
|
1207 my $in_file_handle = new FileHandle;
|
|
1208
|
|
1209 if(defined($config->{input_file})) {
|
|
1210
|
|
1211 # check defined input file exists
|
|
1212 die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file};
|
|
1213
|
|
1214 if($config->{input_file} =~ /\.gz$/){
|
|
1215 $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
|
|
1216 }
|
|
1217 else {
|
|
1218 $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
|
|
1219 }
|
|
1220 }
|
|
1221
|
|
1222 # no file specified - try to read data off command line
|
|
1223 else {
|
|
1224 $in_file_handle = 'STDIN';
|
|
1225 debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet};
|
|
1226 }
|
|
1227
|
|
1228 return $in_file_handle;
|
|
1229 }
|
|
1230
|
|
1231 # gets file handle for output and adds header
|
|
1232 sub get_out_file_handle {
|
|
1233 my $config = shift;
|
|
1234
|
|
1235 # define filehandle to write to
|
|
1236 my $out_file_handle = new FileHandle;
|
|
1237
|
|
1238 # check if file exists
|
|
1239 if(-e $config->{output_file} && !defined($config->{force_overwrite})) {
|
|
1240 # die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with -- force_overwrite\n");
|
|
1241 }
|
|
1242
|
|
1243 if($config->{output_file} =~ /stdout/i) {
|
|
1244 $out_file_handle = *STDOUT;
|
|
1245 }
|
|
1246 else {
|
|
1247 $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n");
|
|
1248 }
|
|
1249
|
|
1250 # define headers for a VCF file
|
|
1251 my @vcf_headers = (
|
|
1252 '#CHROM',
|
|
1253 'POS',
|
|
1254 'ID',
|
|
1255 'REF',
|
|
1256 'ALT',
|
|
1257 'QUAL',
|
|
1258 'FILTER',
|
|
1259 'INFO'
|
|
1260 );
|
|
1261
|
|
1262 # file conversion, don't want to add normal headers
|
|
1263 if(defined($config->{convert})) {
|
|
1264 # header for VCF
|
|
1265 if($config->{convert} =~ /vcf/i) {
|
|
1266 print $out_file_handle "##fileformat=VCFv4.0\n";
|
|
1267 print $out_file_handle join "\t", @vcf_headers;
|
|
1268 print $out_file_handle "\n";
|
|
1269 }
|
|
1270
|
|
1271 return $out_file_handle;
|
|
1272 }
|
|
1273
|
|
1274 # GVF output, no header
|
|
1275 elsif(defined($config->{gvf}) || defined($config->{original})) {
|
|
1276 print $out_file_handle join "\n", @{$config->{headers}} if defined($config->{headers}) && defined($config->{original});
|
|
1277 return $out_file_handle;
|
|
1278 }
|
|
1279
|
|
1280 elsif(defined($config->{vcf})) {
|
|
1281
|
|
1282 # create an info string for the VCF header
|
|
1283 my @new_headers;
|
|
1284
|
|
1285 # if the user has defined the fields themselves, we don't need to worry
|
|
1286 if(defined $config->{fields_redefined}) {
|
|
1287 @new_headers = @{$config->{fields}};
|
|
1288 }
|
|
1289 else {
|
|
1290 @new_headers = (
|
|
1291
|
|
1292 # get default headers, minus variation name and location (already encoded in VCF)
|
|
1293 grep {
|
|
1294 $_ ne 'Uploaded_variation' and
|
|
1295 $_ ne 'Location' and
|
|
1296 $_ ne 'Extra'
|
|
1297 } @{$config->{fields}},
|
|
1298
|
|
1299 # get extra headers
|
|
1300 map {@{$extra_headers{$_}}}
|
|
1301 grep {defined $config->{$_}}
|
|
1302 keys %extra_headers
|
|
1303 );
|
|
1304
|
|
1305 # plugin headers
|
|
1306 foreach my $plugin_header(split /\n/, get_plugin_headers($config)) {
|
|
1307 $plugin_header =~ /\#\# (.+?)\t\:.+/;
|
|
1308 push @new_headers, $1;
|
|
1309 }
|
|
1310
|
|
1311 # redefine the main headers list in config
|
|
1312 $config->{fields} = \@new_headers;
|
|
1313 }
|
|
1314
|
|
1315 # add the newly defined headers as a header to the VCF
|
|
1316 my $string = join '|', @{$config->{fields}};
|
|
1317 my @vcf_info_strings = ('##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: '.$string.'">');
|
|
1318
|
|
1319 # add custom headers
|
|
1320 foreach my $custom(@{$config->{custom}}) {
|
|
1321 push @vcf_info_strings, '##INFO=<ID='.$custom->{name}.',Number=.,Type=String,Description="'.$custom->{file}.' ('.$custom->{type}.')">';
|
|
1322 }
|
|
1323
|
|
1324 # if this is already a VCF file, we need to add our new headers in the right place
|
|
1325 if(defined($config->{headers})) {
|
|
1326
|
|
1327 for my $i(0..$#{$config->{headers}}) {
|
|
1328 if($config->{headers}->[$i] =~ /^\#CHROM\s+POS\s+ID/) {
|
|
1329 splice(@{$config->{headers}}, $i, 0, @vcf_info_strings);
|
|
1330 }
|
|
1331 }
|
|
1332
|
|
1333 print $out_file_handle join "\n", @{$config->{headers}};
|
|
1334 print $out_file_handle "\n";
|
|
1335 }
|
|
1336
|
|
1337 else {
|
|
1338 print $out_file_handle "##fileformat=VCFv4.0\n";
|
|
1339 print $out_file_handle join "\n", @vcf_info_strings;
|
|
1340 print $out_file_handle "\n";
|
|
1341 print $out_file_handle join "\t", @vcf_headers;
|
|
1342 print $out_file_handle "\n";
|
|
1343 }
|
|
1344
|
|
1345 return $out_file_handle;
|
|
1346 }
|
|
1347
|
|
1348 # make header
|
|
1349 my $time = &get_time;
|
|
1350 my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca};
|
|
1351 $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache});
|
|
1352 my $version_string =
|
|
1353 "Using API version ".$config->{reg}->software_version.
|
|
1354 ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?');
|
|
1355
|
|
1356 # add key for extra column headers based on config
|
|
1357 my $extra_column_keys = join "\n",
|
|
1358 map {'## '.$_.' : '.$extra_descs{$_}}
|
|
1359 sort map {@{$extra_headers{$_}}}
|
|
1360 grep {defined $config->{$_}}
|
|
1361 keys %extra_headers;
|
|
1362
|
|
1363 my $header =<<HEAD;
|
|
1364 ## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION
|
|
1365 ## Output produced at $time
|
|
1366 ## Connected to $db_string
|
|
1367 ## $version_string
|
|
1368 ## Extra column keys:
|
|
1369 $extra_column_keys
|
|
1370 HEAD
|
|
1371
|
|
1372 $header .= get_plugin_headers($config);
|
|
1373
|
|
1374 # add headers
|
|
1375 print $out_file_handle $header;
|
|
1376
|
|
1377 # add custom data defs
|
|
1378 if(defined($config->{custom})) {
|
|
1379 foreach my $custom(@{$config->{custom}}) {
|
|
1380 print $out_file_handle '## '.$custom->{name}."\t: ".$custom->{file}.' ('.$custom->{type}.")\n";
|
|
1381 }
|
|
1382 }
|
|
1383
|
|
1384 # add column headers
|
|
1385 print $out_file_handle '#', (join "\t", @{$config->{fields}});
|
|
1386 print $out_file_handle "\n";
|
|
1387
|
|
1388 return $out_file_handle;
|
|
1389 }
|
|
1390
|
|
1391 sub get_plugin_headers {
|
|
1392
|
|
1393 my $config = shift;
|
|
1394
|
|
1395 my $header = "";
|
|
1396
|
|
1397 for my $plugin (@{ $config->{plugins} }) {
|
|
1398 if (my $hdr = $plugin->get_header_info) {
|
|
1399 for my $key (keys %$hdr) {
|
|
1400 my $val = $hdr->{$key};
|
|
1401
|
|
1402 $header .= "## $key\t: $val\n";
|
|
1403 }
|
|
1404 }
|
|
1405 }
|
|
1406
|
|
1407 return $header;
|
|
1408 }
|
|
1409
|
|
1410 # convert a variation feature to a line of output
|
|
1411 sub convert_vf {
|
|
1412 my $config = shift;
|
|
1413 my $vf = shift;
|
|
1414
|
|
1415 my $convert_method = 'convert_to_'.lc($config->{convert});
|
|
1416 my $method_ref = \&$convert_method;
|
|
1417
|
|
1418 my $line = &$method_ref($config, $vf);
|
|
1419 my $handle = $config->{out_file_handle};
|
|
1420
|
|
1421 if(scalar @$line) {
|
|
1422 print $handle join "\t", @$line;
|
|
1423 print $handle "\n";
|
|
1424 }
|
|
1425 }
|
|
1426
|
|
1427 # converts to Ensembl format
|
|
1428 sub convert_to_ensembl {
|
|
1429 my $config = shift;
|
|
1430 my $vf = shift;
|
|
1431
|
|
1432 return [
|
|
1433 $vf->{chr} || $vf->seq_region_name,
|
|
1434 $vf->start,
|
|
1435 $vf->end,
|
|
1436 $vf->allele_string,
|
|
1437 $vf->strand,
|
|
1438 $vf->variation_name
|
|
1439 ];
|
|
1440 }
|
|
1441
|
|
1442
|
|
1443 # converts to pileup format
|
|
1444 sub convert_to_pileup {
|
|
1445 my $config = shift;
|
|
1446 my $vf = shift;
|
|
1447
|
|
1448 # look for imbalance in the allele string
|
|
1449 my %allele_lengths;
|
|
1450 my @alleles = split /\//, $vf->allele_string;
|
|
1451
|
|
1452 foreach my $allele(@alleles) {
|
|
1453 $allele =~ s/\-//g;
|
|
1454 $allele_lengths{length($allele)} = 1;
|
|
1455 }
|
|
1456
|
|
1457 # in/del
|
|
1458 if(scalar keys %allele_lengths > 1) {
|
|
1459
|
|
1460 if($vf->allele_string =~ /\-/) {
|
|
1461
|
|
1462 # insertion?
|
|
1463 if($alleles[0] eq '-') {
|
|
1464 shift @alleles;
|
|
1465
|
|
1466 for my $i(0..$#alleles) {
|
|
1467 $alleles[$i] =~ s/\-//g;
|
|
1468 $alleles[$i] = '+'.$alleles[$i];
|
|
1469 }
|
|
1470 }
|
|
1471
|
|
1472 else {
|
|
1473 @alleles = grep {$_ ne '-'} @alleles;
|
|
1474
|
|
1475 for my $i(0..$#alleles) {
|
|
1476 $alleles[$i] =~ s/\-//g;
|
|
1477 $alleles[$i] = '-'.$alleles[$i];
|
|
1478 }
|
|
1479 }
|
|
1480
|
|
1481 @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles;
|
|
1482
|
|
1483 return [
|
|
1484 $vf->{chr} || $vf->seq_region_name,
|
|
1485 $vf->start - 1,
|
|
1486 '*',
|
|
1487 (join "/", @alleles),
|
|
1488 ];
|
|
1489 }
|
|
1490
|
|
1491 else {
|
|
1492 warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet});
|
|
1493 return [];
|
|
1494 }
|
|
1495
|
|
1496 }
|
|
1497
|
|
1498 # balanced sub
|
|
1499 else {
|
|
1500 return [
|
|
1501 $vf->{chr} || $vf->seq_region_name,
|
|
1502 $vf->start,
|
|
1503 shift @alleles,
|
|
1504 (join "/", @alleles),
|
|
1505 ];
|
|
1506 }
|
|
1507 }
|
|
1508
|
|
1509 # converts to HGVS (hackily returns many lines)
|
|
1510 sub convert_to_hgvs {
|
|
1511 my $config = shift;
|
|
1512 my $vf = shift;
|
|
1513
|
|
1514 # ensure we have a slice
|
|
1515 $vf->{slice} ||= get_slice($config, $vf->{chr});
|
|
1516
|
|
1517 my $tvs = $vf->get_all_TranscriptVariations;
|
|
1518
|
|
1519 my @return = values %{$vf->get_all_hgvs_notations()};
|
|
1520
|
|
1521 if(defined($tvs)) {
|
|
1522 push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'c')}} @$tvs;
|
|
1523 push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'p')}} @$tvs;
|
|
1524 }
|
|
1525
|
|
1526 return [join "\n", @return];
|
|
1527 }
|
|
1528
|
|
1529 # prints a line of output from the hash
|
|
1530 sub print_line {
|
|
1531 my $config = shift;
|
|
1532 my $line = shift;
|
|
1533 return unless defined($line);
|
|
1534
|
|
1535 my $output;
|
|
1536
|
|
1537 # normal
|
|
1538 if(ref($line) eq 'HASH') {
|
|
1539 my %extra = %{$line->{Extra}};
|
|
1540
|
|
1541 $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} };
|
|
1542
|
|
1543 # if the fields have been redefined we need to search through in case
|
|
1544 # any of the defined fields are actually part of the Extra hash
|
|
1545 $output = join "\t", map {
|
|
1546 (defined $line->{$_} ? $line->{$_} : (defined $extra{$_} ? $extra{$_} : '-'))
|
|
1547 } @{$config->{fields}};
|
|
1548 }
|
|
1549
|
|
1550 # gvf/vcf
|
|
1551 else {
|
|
1552 $output = $$line;
|
|
1553 }
|
|
1554
|
|
1555 my $fh = $config->{out_file_handle};
|
|
1556 print $fh "$output\n";
|
|
1557 }
|
|
1558
|
|
1559 # outputs usage message
|
|
1560 sub usage {
|
|
1561 my $usage =<<END;
|
|
1562 #----------------------------------#
|
|
1563 # ENSEMBL VARIANT EFFECT PREDICTOR #
|
|
1564 #----------------------------------#
|
|
1565
|
|
1566 version $VERSION
|
|
1567
|
|
1568 By Will McLaren (wm2\@ebi.ac.uk)
|
|
1569
|
|
1570 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
|
|
1571
|
|
1572 Usage:
|
|
1573 perl variant_effect_predictor.pl [arguments]
|
|
1574
|
|
1575 Options
|
|
1576 =======
|
|
1577
|
|
1578 --help Display this message and quit
|
|
1579 --verbose Display verbose output as the script runs [default: off]
|
|
1580 --quiet Suppress status and warning messages [default: off]
|
|
1581 --no_progress Suppress progress bars [default: off]
|
|
1582
|
|
1583 --config Load configuration from file. Any command line options
|
|
1584 specified overwrite those in the file [default: off]
|
|
1585
|
|
1586 --everything Shortcut switch to turn on commonly used options. See web
|
|
1587 documentation for details [default: off]
|
|
1588
|
|
1589 --fork [num_forks] Use forking to improve script runtime [default: off]
|
|
1590
|
|
1591 -i | --input_file Input file - if not specified, reads from STDIN. Files
|
|
1592 may be gzip compressed.
|
|
1593 --format Specify input file format - one of "ensembl", "pileup",
|
|
1594 "vcf", "hgvs", "id" or "guess" to try and work out format.
|
|
1595 -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this
|
|
1596 will force --quiet [default: "variant_effect_output.txt"]
|
|
1597 --force_overwrite Force overwriting of output file [default: quit if file
|
|
1598 exists]
|
|
1599 --original Writes output as it was in input - must be used with --filter
|
|
1600 since no consequence data is added [default: off]
|
|
1601 --vcf Write output as VCF [default: off]
|
|
1602 --gvf Write output as GVF [default: off]
|
|
1603 --fields [field list] Define a custom output format by specifying a comma-separated
|
|
1604 list of field names. Field names normally present in the
|
|
1605 "Extra" field may also be specified, including those added by
|
|
1606 plugin modules. Can also be used to configure VCF output
|
|
1607 columns [default: off]
|
|
1608
|
|
1609 --species [species] Species to use [default: "human"]
|
|
1610
|
|
1611 -t | --terms Type of consequence terms to output - one of "SO", "ensembl"
|
|
1612 [default: SO]
|
|
1613
|
|
1614 --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off]
|
|
1615 --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off]
|
|
1616
|
|
1617 NB: SIFT and PolyPhen predictions are currently available for human only
|
|
1618 NB: Condel support has been moved to a VEP plugin module - see documentation
|
|
1619
|
|
1620 --regulatory Look for overlaps with regulatory regions. The script can
|
|
1621 also call if a variant falls in a high information position
|
|
1622 within a transcription factor binding site. Output lines have
|
|
1623 a Feature type of RegulatoryFeature or MotifFeature
|
|
1624 [default: off]
|
|
1625 --cell_type [types] Report only regulatory regions that are found in the given cell
|
|
1626 type(s). Can be a single cell type or a comma-separated list.
|
|
1627 The functional type in each cell type is reported under
|
|
1628 CELL_TYPE in the output. To retrieve a list of cell types, use
|
|
1629 "--cell_type list" [default: off]
|
|
1630
|
|
1631 NB: Regulatory consequences are currently available for human and mouse only
|
|
1632
|
|
1633 --custom [file list] Add custom annotations from tabix-indexed files. See
|
|
1634 documentation for full details [default: off]
|
|
1635 --plugin [plugin_name] Use named plugin module [default: off]
|
|
1636 --hgnc Add HGNC gene identifiers to output [default: off]
|
|
1637 --hgvs Output HGVS identifiers (coding and protein). Requires database
|
|
1638 connection [default: off]
|
|
1639 --ccds Output CCDS transcript identifiers [default: off]
|
|
1640 --xref_refseq Output aligned RefSeq mRNA identifier for transcript. NB: the
|
|
1641 RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND
|
|
1642 FREQUENTLY WILL NOT, match exactly in sequence, exon structure
|
|
1643 and protein product [default: off]
|
|
1644 --protein Output Ensembl protein identifer [default: off]
|
|
1645 --canonical Indicate if the transcript for this consequence is the canonical
|
|
1646 transcript for this gene [default: off]
|
|
1647 --domains Include details of any overlapping protein domains [default: off]
|
|
1648 --numbers Include exon & intron numbers [default: off]
|
|
1649
|
|
1650 --no_intergenic Excludes intergenic consequences from the output [default: off]
|
|
1651 --coding_only Only return consequences that fall in the coding region of
|
|
1652 transcripts [default: off]
|
|
1653 --most_severe Ouptut only the most severe consequence per variation.
|
|
1654 Transcript-specific columns will be left blank. [default: off]
|
|
1655 --summary Output only a comma-separated list of all consequences per
|
|
1656 variation. Transcript-specific columns will be left blank.
|
|
1657 [default: off]
|
|
1658 --per_gene Output only the most severe consequence per gene. Where more
|
|
1659 than one transcript has the same consequence, the transcript
|
|
1660 chosen is arbitrary. [default: off]
|
|
1661
|
|
1662
|
|
1663 --check_ref If specified, checks supplied reference allele against stored
|
|
1664 entry in Ensembl Core database [default: off]
|
|
1665 --check_existing If specified, checks for existing co-located variations in the
|
|
1666 Ensembl Variation database [default: off]
|
|
1667 --failed [0|1] Include (1) or exclude (0) variants that have been flagged as
|
|
1668 failed by Ensembl when checking for existing variants.
|
|
1669 [default: exclude]
|
|
1670 --check_alleles If specified, the alleles of existing co-located variations
|
|
1671 are compared to the input; an existing variation will only
|
|
1672 be reported if no novel allele is in the input (strand is
|
|
1673 accounted for) [default: off]
|
|
1674 --check_svs Report overlapping structural variants [default: off]
|
|
1675
|
|
1676 --filter [filters] Filter output by consequence type. Use this to output only
|
|
1677 variants that have at least one consequence type matching the
|
|
1678 filter. Multiple filters can be used separated by ",". By
|
|
1679 combining this with --original it is possible to run the VEP
|
|
1680 iteratively to progressively filter a set of variants. See
|
|
1681 documentation for full details [default: off]
|
|
1682
|
|
1683 --check_frequency Turns on frequency filtering. Use this to include or exclude
|
|
1684 variants based on the frequency of co-located existing
|
|
1685 variants in the Ensembl Variation database. You must also
|
|
1686 specify all of the following --freq flags [default: off]
|
|
1687 --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap,
|
|
1688 1kg_yri for YRI 1000 genomes. See documentation for more
|
|
1689 details
|
|
1690 --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5
|
|
1691 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or
|
|
1692 less than (lt) --freq_freq
|
|
1693 --freq_filter Specify whether variants that pass the above should be included
|
|
1694 [exclude|include] or excluded from analysis
|
|
1695 --gmaf Include global MAF of existing variant from 1000 Genomes
|
|
1696 Phase 1 in output
|
|
1697
|
|
1698 --individual [id] Consider only alternate alleles present in the genotypes of the
|
|
1699 specified individual(s). May be a single individual, a comma-
|
|
1700 separated list or "all" to assess all individuals separately.
|
|
1701 Each individual and variant combination is given on a separate
|
|
1702 line of output. Only works with VCF files containing individual
|
|
1703 genotype data; individual IDs are taken from column headers.
|
|
1704 --allow_non_variant Prints out non-variant lines when using VCF input
|
|
1705 --phased Force VCF individual genotypes to be interpreted as phased.
|
|
1706 For use with plugins that depend on phased state.
|
|
1707
|
|
1708 --chr [list] Select a subset of chromosomes to analyse from your file. Any
|
|
1709 data not on this chromosome in the input will be skipped. The
|
|
1710 list can be comma separated, with "-" characters representing
|
|
1711 a range e.g. 1-5,8,15,X [default: off]
|
|
1712 --gp If specified, tries to read GRCh37 position from GP field in the
|
|
1713 INFO column of a VCF file. Only applies when VCF is the input
|
|
1714 format and human is the species [default: off]
|
|
1715
|
|
1716 --convert Convert the input file to the output format specified.
|
|
1717 [ensembl|vcf|pileup] Converted output is written to the file specified in
|
|
1718 --output_file. No consequence calculation is carried out when
|
|
1719 doing file conversion. [default: off]
|
|
1720
|
|
1721 --refseq Use the otherfeatures database to retrieve transcripts - this
|
|
1722 database contains RefSeq transcripts (as well as CCDS and
|
|
1723 Ensembl EST alignments) [default: off]
|
|
1724 --host Manually define database host [default: "ensembldb.ensembl.org"]
|
|
1725 -u | --user Database username [default: "anonymous"]
|
|
1726 --port Database port [default: 5306]
|
|
1727 --password Database password [default: no password]
|
|
1728 --genomes Sets DB connection params for Ensembl Genomes [default: off]
|
|
1729 --registry Registry file to use defines DB connections [default: off]
|
|
1730 Defining a registry file overrides above connection settings.
|
|
1731 --db_version=[number] Force script to load DBs from a specific Ensembl version. Not
|
|
1732 advised due to likely incompatibilities between API and DB
|
|
1733
|
|
1734 --no_whole_genome Run in old-style, non-whole genome mode [default: off]
|
|
1735 --buffer_size Sets the number of variants sent in each batch [default: 5000]
|
|
1736 Increasing buffer size can retrieve results more quickly
|
|
1737 but requires more memory. Only applies to whole genome mode.
|
|
1738
|
|
1739 --cache Enables read-only use of cache [default: off]
|
|
1740 --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"]
|
|
1741 --write_cache Enable writing to cache [default: off]
|
|
1742 --build [all|list] Build a complete cache for the selected species. Build for all
|
|
1743 chromosomes with --build all, or a list of chromosomes (see
|
|
1744 --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS
|
|
1745 VIOLATES OUR FAIR USAGE POLICY [default: off]
|
|
1746
|
|
1747 --compress Specify utility to decompress cache files - may be "gzcat" or
|
|
1748 "gzip -dc" Only use if default does not work [default: zcat]
|
|
1749
|
|
1750 --skip_db_check ADVANCED! Force the script to use a cache built from a different
|
|
1751 database than specified with --host. Only use this if you are
|
|
1752 sure the hosts are compatible (e.g. ensembldb.ensembl.org and
|
|
1753 useastdb.ensembl.org) [default: off]
|
|
1754 --cache_region_size ADVANCED! The size in base-pairs of the region covered by one
|
|
1755 file in the cache. [default: 1MB]
|
|
1756 END
|
|
1757
|
|
1758 print $usage;
|
|
1759 }
|