0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
6 Genome Research Limited. All rights reserved.
|
|
7
|
|
8 This software is distributed under a modified Apache license.
|
|
9 For license details, please see
|
|
10
|
|
11 http://www.ensembl.org/info/about/code_licence.html
|
|
12
|
|
13 =head1 CONTACT
|
|
14
|
|
15 Please email comments or questions to the public Ensembl
|
|
16 developers list at <dev@ensembl.org>.
|
|
17
|
|
18 Questions may also be sent to the Ensembl help desk at
|
|
19 <helpdesk@ensembl.org>.
|
|
20
|
|
21 =cut
|
|
22
|
|
23 =head1 NAME
|
|
24
|
|
25 Variant Effect Predictor - a script to predict the consequences of genomic variants
|
|
26
|
|
27 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
|
|
28
|
|
29 Version 2.2
|
|
30
|
|
31 by Will McLaren (wm2@ebi.ac.uk)
|
|
32 =cut
|
|
33
|
|
34 use strict;
|
|
35 use Getopt::Long;
|
|
36 use FileHandle;
|
|
37 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
|
|
38 use Bio::EnsEMBL::Variation::Utils::VEP qw(
|
|
39 parse_line
|
|
40 vf_to_consequences
|
|
41 validate_vf
|
|
42 load_dumped_adaptor_cache
|
|
43 dump_adaptor_cache
|
|
44 get_all_consequences
|
|
45 get_slice
|
|
46 build_full_cache
|
|
47 read_cache_info
|
|
48 get_time
|
|
49 debug
|
|
50 @OUTPUT_COLS
|
|
51 @REG_FEAT_TYPES
|
|
52 );
|
|
53
|
|
54 # global vars
|
|
55 my $VERSION = '2.2';
|
|
56
|
|
57 # set output autoflush for progress bars
|
|
58 $| = 1;
|
|
59
|
|
60 # configure from command line opts
|
|
61 my $config = &configure(scalar @ARGV);
|
|
62
|
|
63 # run the main sub routine
|
|
64 &main($config);
|
|
65
|
|
66 # this is the main sub-routine - it needs the configured $config hash
|
|
67 sub main {
|
|
68 my $config = shift;
|
|
69
|
|
70 debug("Starting...") unless defined $config->{quiet};
|
|
71
|
|
72 my $tr_cache = {};
|
|
73 my $rf_cache = {};
|
|
74
|
|
75 # create a hash to hold slices so we don't get the same one twice
|
|
76 my %slice_cache = ();
|
|
77
|
|
78 my @vfs;
|
|
79 my ($vf_count, $total_vf_count);
|
|
80 my $in_file_handle = $config->{in_file_handle};
|
|
81
|
|
82 # initialize line number in config
|
|
83 $config->{line_number} = 0;
|
|
84
|
|
85 # read the file
|
|
86 while(<$in_file_handle>) {
|
|
87 chomp;
|
|
88
|
|
89 $config->{line_number}++;
|
|
90
|
|
91 # header line?
|
|
92 next if /^\#/;
|
|
93
|
|
94 # some lines (pileup) may actually parse out into more than one variant
|
|
95 foreach my $vf(@{&parse_line($config, $_)}) {
|
|
96
|
|
97 # validate the VF
|
|
98 next unless validate_vf($config, $vf);
|
|
99
|
|
100 # now get the slice
|
|
101 if(!defined($vf->{slice})) {
|
|
102 my $slice;
|
|
103
|
|
104 # don't get slices if we're using cache
|
|
105 # we can steal them from transcript objects later
|
|
106 if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) {
|
|
107
|
|
108 # check if we have fetched this slice already
|
|
109 if(defined $slice_cache{$vf->{chr}}) {
|
|
110 $slice = $slice_cache{$vf->{chr}};
|
|
111 }
|
|
112
|
|
113 # if not create a new one
|
|
114 else {
|
|
115
|
|
116 $slice = &get_slice($config, $vf->{chr});
|
|
117
|
|
118 # if failed, warn and skip this line
|
|
119 if(!defined($slice)) {
|
|
120 warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet};
|
|
121 next;
|
|
122 }
|
|
123
|
|
124 # store the hash
|
|
125 $slice_cache{$vf->{chr}} = $slice;
|
|
126 }
|
|
127 }
|
|
128
|
|
129 $vf->{slice} = $slice;
|
|
130 }
|
|
131
|
|
132 # make a name if one doesn't exist
|
|
133 $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.$vf->{allele_string};
|
|
134
|
|
135 # jump out to convert here
|
|
136 if(defined($config->{convert})) {
|
|
137 &convert_vf($config, $vf);
|
|
138 next;
|
|
139 }
|
|
140
|
|
141 if(defined $config->{whole_genome}) {
|
|
142 push @vfs, $vf;
|
|
143 $vf_count++;
|
|
144 $total_vf_count++;
|
|
145
|
|
146 if($vf_count == $config->{buffer_size}) {
|
|
147 debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
|
|
148
|
|
149 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)};
|
|
150
|
|
151 debug("Processed $total_vf_count total variants") unless defined($config->{quiet});
|
|
152
|
|
153 @vfs = ();
|
|
154 $vf_count = 0;
|
|
155 }
|
|
156 }
|
|
157 else {
|
|
158 print_line($config, $_) foreach @{vf_to_consequences($config, $vf)};
|
|
159 $vf_count++;
|
|
160 $total_vf_count++;
|
|
161 debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose});
|
|
162 }
|
|
163 }
|
|
164 }
|
|
165
|
|
166 # if in whole-genome mode, finish off the rest of the buffer
|
|
167 if(defined $config->{whole_genome} && scalar @vfs) {
|
|
168 debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
|
|
169
|
|
170 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)};
|
|
171
|
|
172 debug("Processed $total_vf_count total variants") unless defined($config->{quiet});
|
|
173 }
|
|
174
|
|
175 debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet});
|
|
176
|
|
177 debug("Finished!") unless defined $config->{quiet};
|
|
178 }
|
|
179
|
|
180 # sets up configuration hash that is used throughout the script
|
|
181 sub configure {
|
|
182 my $args = shift;
|
|
183
|
|
184 my $config = {};
|
|
185
|
|
186 GetOptions(
|
|
187 $config,
|
|
188 'help', # displays help message
|
|
189
|
|
190 # input options,
|
|
191 'config=s', # config file name
|
|
192 'input_file=s', # input file name
|
|
193 'format=s', # input file format
|
|
194
|
|
195 # DB options
|
|
196 'species=s', # species e.g. human, homo_sapiens
|
|
197 'registry=s', # registry file
|
|
198 'host=s', # database host
|
|
199 'port=s', # database port
|
|
200 'user=s', # database user name
|
|
201 'password=s', # database password
|
|
202 'db_version=i', # Ensembl database version to use e.g. 62
|
|
203 'genomes', # automatically sets DB params for e!Genomes
|
|
204 'refseq', # use otherfeatures RefSeq DB instead of Ensembl
|
|
205 #'no_disconnect', # disables disconnect_when_inactive
|
|
206
|
|
207 # runtime options
|
|
208 'most_severe', # only return most severe consequence
|
|
209 'summary', # only return one line per variation with all consquence types
|
|
210 'per_gene', # only return most severe per gene
|
|
211 'buffer_size=i', # number of variations to read in before analysis
|
|
212 'chunk_size=s', # size in bases of "chunks" used in internal hash structure
|
|
213 'failed=i', # include failed variations when finding existing
|
|
214 'no_whole_genome', # disables now default whole-genome mode
|
|
215 'whole_genome', # proxy for whole genome mode - now just warns user
|
|
216 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG)
|
|
217 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT
|
|
218 'check_ref', # check supplied reference allele against DB
|
|
219 'check_existing', # find existing co-located variations
|
|
220 'check_alleles', # only attribute co-located if alleles are the same
|
|
221 'check_frequency', # enable frequency checking
|
|
222 'freq_filter=s', # exclude or include
|
|
223 'freq_freq=f', # frequency to filter on
|
|
224 'freq_gt_lt=s', # gt or lt (greater than or less than)
|
|
225 'freq_pop=s', # population to filter on
|
|
226
|
|
227 # verbosity options
|
|
228 'verbose', # print out a bit more info while running
|
|
229 'quiet', # print nothing to STDOUT (unless using -o stdout)
|
|
230 'no_progress', # don't display progress bars
|
|
231
|
|
232 # output options
|
|
233 'output_file=s', # output file name
|
|
234 'force_overwrite', # force overwrite of output file if already exists
|
|
235 'terms=s', # consequence terms to use e.g. NCBI, SO
|
|
236 'coding_only', # only return results for consequences in coding regions
|
|
237 'canonical', # indicates if transcript is canonical
|
|
238 'protein', # add e! protein ID to extra column
|
|
239 'hgnc', # add HGNC gene ID to extra column
|
|
240 'hgvs', # add HGVS names to extra column
|
|
241 'sift=s', # SIFT predictions
|
|
242 'polyphen=s', # PolyPhen predictions
|
|
243 'condel=s', # Condel predictions
|
|
244 'gene', # force gene column to be populated (disabled by default, enabled when using cache)
|
|
245 'regulatory', # enable regulatory stuff
|
|
246 'convert=s', # convert input to another format (doesn't run VEP)
|
|
247 'no_intergenic', # don't print out INTERGENIC consequences
|
|
248 'gvf', # produce gvf output
|
|
249
|
|
250 # cache stuff
|
|
251 'cache', # use cache
|
|
252 'write_cache', # enables writing to the cache
|
|
253 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs
|
|
254 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript
|
|
255 'strip', # strips adaptors etc from objects before caching them
|
|
256 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more
|
|
257 'dir=s', # dir where cache is found (defaults to $HOME/.vep/)
|
|
258 'cache_region_size=i', # size of region in bases for each cache file
|
|
259 'no_slice_cache', # tell API not to cache features on slice
|
|
260 'standalone', # standalone mode uses minimal set of modules installed in same dir, no DB connection
|
|
261 'skip_db_check', # don't compare DB parameters with cached
|
|
262 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc"
|
|
263
|
|
264 # debug
|
|
265 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL
|
|
266 'count_queries', # counts SQL queries executed
|
|
267 'admin', # allows me to build off public hosts
|
|
268 'debug', # print out debug info
|
|
269 );
|
|
270
|
|
271 # print usage message if requested or no args supplied
|
|
272 if(defined($config->{help}) || !$args) {
|
|
273 &usage;
|
|
274 exit(0);
|
|
275 }
|
|
276
|
|
277 # config file?
|
|
278 if(defined $config->{config}) {
|
|
279
|
|
280 open CONFIG, $config->{config} or die "ERROR: Could not open config file \"".$config->{config}."\"\n";
|
|
281
|
|
282 while(<CONFIG>) {
|
|
283 next if /^\#/;
|
|
284 my ($key, $value) = split /\s+|\=/;
|
|
285 $key =~ s/^\-//g;
|
|
286 $config->{$key} = $value unless defined $config->{$key};
|
|
287 }
|
|
288
|
|
289 close CONFIG;
|
|
290 }
|
|
291
|
|
292 # can't be both quiet and verbose
|
|
293 die "ERROR: Can't be both quiet and verbose!" if defined($config->{quiet}) && defined($config->{verbose});
|
|
294
|
|
295 # check file format
|
|
296 if(defined $config->{format}) {
|
|
297 die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /pileup|vcf|guess|hgvs|ensembl|id/i;
|
|
298 }
|
|
299
|
|
300 # check convert format
|
|
301 if(defined $config->{convert}) {
|
|
302 die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup/i;
|
|
303 }
|
|
304
|
|
305 # connection settings for Ensembl Genomes
|
|
306 if($config->{genomes}) {
|
|
307 $config->{host} ||= 'mysql.ebi.ac.uk';
|
|
308 $config->{port} ||= 4157;
|
|
309 }
|
|
310
|
|
311 # connection settings for main Ensembl
|
|
312 else {
|
|
313 $config->{species} ||= "homo_sapiens";
|
|
314 $config->{host} ||= 'ensembldb.ensembl.org';
|
|
315 $config->{port} ||= 5306;
|
|
316 }
|
|
317
|
|
318 # refseq or core?
|
|
319 if(defined($config->{refseq})) {
|
|
320 die "ERROR: SIFT, PolyPhen and Condel predictions not available fore RefSeq transcripts" if defined $config->{sift} || defined $config->{polyphen} || defined $config->{condel};
|
|
321
|
|
322 $config->{core_type} = 'otherfeatures';
|
|
323 }
|
|
324 else {
|
|
325 $config->{core_type} = 'core';
|
|
326 }
|
|
327
|
|
328 # output term
|
|
329 if(defined $config->{terms}) {
|
|
330 die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i;
|
|
331 if($config->{terms} =~ /ensembl|display/i) {
|
|
332 $config->{terms} = 'display';
|
|
333 }
|
|
334 else {
|
|
335 $config->{terms} = uc($config->{terms});
|
|
336 }
|
|
337 }
|
|
338
|
|
339 # check nsSNP tools
|
|
340 foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) {
|
|
341 die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/;
|
|
342
|
|
343 die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i;
|
|
344
|
|
345 die "ERROR: $tool not available in standalone mode\n" if defined($config->{standalone});
|
|
346
|
|
347 # use V2 of the Condel algorithm, possibly gives fewer false positives
|
|
348 if($tool eq 'Condel' && $config->{lc($tool)} =~ /1$/) {
|
|
349 $Bio::EnsEMBL::Variation::Utils::Condel::USE_V2 = 0;
|
|
350 }
|
|
351 }
|
|
352
|
|
353 # force quiet if outputting to STDOUT
|
|
354 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
|
|
355 delete $config->{verbose} if defined($config->{verbose});
|
|
356 $config->{quiet} = 1;
|
|
357 }
|
|
358
|
|
359 # summarise options if verbose
|
|
360 if(defined $config->{verbose}) {
|
|
361 my $header =<<INTRO;
|
|
362 #----------------------------------#
|
|
363 # ENSEMBL VARIANT EFFECT PREDICTOR #
|
|
364 #----------------------------------#
|
|
365
|
|
366 version $VERSION
|
|
367
|
|
368 By Will McLaren (wm2\@ebi.ac.uk)
|
|
369
|
|
370 Configuration options:
|
|
371
|
|
372 INTRO
|
|
373 print $header;
|
|
374
|
|
375 my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1];
|
|
376
|
|
377 foreach my $key(sort keys %$config) {
|
|
378 print $key.(' ' x (($max_length - length($key)) + 4)).$config->{$key}."\n";
|
|
379 }
|
|
380
|
|
381 print "\n".("-" x 20)."\n\n";
|
|
382 }
|
|
383
|
|
384 # set defaults
|
|
385 $config->{user} ||= 'anonymous';
|
|
386 $config->{buffer_size} ||= 5000;
|
|
387 $config->{chunk_size} ||= '50kb';
|
|
388 $config->{output_file} ||= "variant_effect_output.txt";
|
|
389 $config->{tmpdir} ||= '/tmp';
|
|
390 $config->{format} ||= 'guess';
|
|
391 $config->{terms} ||= 'display';
|
|
392 $config->{gene} ||= 1 unless defined($config->{whole_genome});
|
|
393 $config->{cache_region_size} ||= 1000000;
|
|
394 $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep');
|
|
395 $config->{compress} ||= 'zcat';
|
|
396
|
|
397 # frequency filtering
|
|
398 if(defined($config->{check_frequency})) {
|
|
399 foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) {
|
|
400 die "ERROR: To use --check_frequency you must also specify flag --$flag" unless defined $config->{$flag};
|
|
401 }
|
|
402
|
|
403 # need to set check_existing
|
|
404 $config->{check_existing} = 1;
|
|
405 }
|
|
406
|
|
407 $config->{check_existing} = 1 if defined $config->{check_alleles};
|
|
408
|
|
409 # warn users still using whole_genome flag
|
|
410 if(defined($config->{whole_genome})) {
|
|
411 debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet});
|
|
412 }
|
|
413
|
|
414 $config->{whole_genome} = 1 unless defined $config->{no_whole_genome};
|
|
415 $config->{include_failed} = 1 unless defined $config->{include_failed};
|
|
416 $config->{chunk_size} =~ s/mb?/000000/i;
|
|
417 $config->{chunk_size} =~ s/kb?/000/i;
|
|
418 $config->{cache_region_size} =~ s/mb?/000000/i;
|
|
419 $config->{cache_region_size} =~ s/kb?/000/i;
|
|
420
|
|
421 # cluck and display executed SQL?
|
|
422 $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck});
|
|
423
|
|
424 # standalone needs cache, can't use HGVS
|
|
425 if(defined($config->{standalone})) {
|
|
426 $config->{cache} = 1;
|
|
427
|
|
428 die("ERROR: Cannot generate HGVS coordinates in standalone mode") if defined($config->{hgvs});
|
|
429 die("ERROR: Cannot use HGVS as input in standalone mode") if $config->{format} eq 'hgvs';
|
|
430 die("ERROR: Cannot use variant identifiers as input in standalone mode") if $config->{format} eq 'id';
|
|
431 die("ERROR: Cannot do frequency filtering in standalone mode") if defined($config->{check_frequency});
|
|
432 }
|
|
433
|
|
434 # write_cache needs cache
|
|
435 $config->{cache} = 1 if defined $config->{write_cache};
|
|
436
|
|
437 # no_slice_cache, prefetch and whole_genome have to be on to use cache
|
|
438 if(defined($config->{cache})) {
|
|
439 $config->{prefetch} = 1;
|
|
440 $config->{no_slice_cache} = 1;
|
|
441 $config->{whole_genome} = 1;
|
|
442 $config->{strip} = 1;
|
|
443 }
|
|
444
|
|
445 $config->{build} = $config->{rebuild} if defined($config->{rebuild});
|
|
446
|
|
447 # force options for full build
|
|
448 if(defined($config->{build})) {
|
|
449 $config->{prefetch} = 1;
|
|
450 $config->{gene} = 1;
|
|
451 $config->{hgnc} = 1;
|
|
452 $config->{no_slice_cache} = 1;
|
|
453 $config->{cache} = 1;
|
|
454 $config->{strip} = 1;
|
|
455 $config->{write_cache} = 1;
|
|
456 }
|
|
457
|
|
458 # connect to databases
|
|
459 $config->{reg} = &connect_to_dbs($config);
|
|
460
|
|
461 # complete dir with species name and db_version
|
|
462 $config->{dir} .= '/'.(
|
|
463 join '/', (
|
|
464 defined($config->{standalone}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}),
|
|
465 $config->{db_version} || $config->{reg}->software_version
|
|
466 )
|
|
467 );
|
|
468
|
|
469 if(defined($config->{cache})) {
|
|
470 # read cache info
|
|
471 if(read_cache_info($config)) {
|
|
472 debug("Read existing cache info") unless defined $config->{quiet};
|
|
473 }
|
|
474 }
|
|
475
|
|
476 # include regulatory modules if requested
|
|
477 if(defined($config->{regulatory})) {
|
|
478 # do the use statements here so that users don't have to have the
|
|
479 # funcgen API install to use the rest of the script
|
|
480 eval q{
|
|
481 use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor;
|
|
482 use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor;
|
|
483 use Bio::EnsEMBL::Funcgen::MotifFeature;
|
|
484 use Bio::EnsEMBL::Funcgen::RegulatoryFeature;
|
|
485 use Bio::EnsEMBL::Funcgen::BindingMatrix;
|
|
486 };
|
|
487
|
|
488 if($@) {
|
|
489 die("ERROR: Ensembl Funcgen API must be installed to use --regulatory\n");
|
|
490 }
|
|
491 }
|
|
492
|
|
493 # warn user cache directory doesn't exist
|
|
494 if(!-e $config->{dir}) {
|
|
495
|
|
496 # if using write_cache
|
|
497 if(defined($config->{write_cache})) {
|
|
498 debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet});
|
|
499 }
|
|
500
|
|
501 # want to read cache, not found
|
|
502 elsif(defined($config->{cache})) {
|
|
503 die("ERROR: Cache directory ", $config->{dir}, " not found");
|
|
504 }
|
|
505 }
|
|
506
|
|
507 # suppress warnings that the FeatureAdpators spit if using no_slice_cache
|
|
508 Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache});
|
|
509
|
|
510 # get adaptors
|
|
511 if(defined($config->{cache}) && !defined($config->{write_cache})) {
|
|
512
|
|
513 # try and load adaptors from cache
|
|
514 if(!&load_dumped_adaptor_cache($config)) {
|
|
515 &get_adaptors($config);
|
|
516 &dump_adaptor_cache($config) if defined($config->{write_cache});
|
|
517 }
|
|
518
|
|
519 # check cached adaptors match DB params
|
|
520 else {
|
|
521 my $dbc = $config->{sa}->{dbc};
|
|
522
|
|
523 my $ok = 1;
|
|
524
|
|
525 if($dbc->{_host} ne $config->{host}) {
|
|
526
|
|
527 # ens-livemirror, useastdb and ensembldb should all have identical DBs
|
|
528 unless(
|
|
529 (
|
|
530 $dbc->{_host} eq 'ens-livemirror'
|
|
531 || $dbc->{_host} eq 'ensembldb.ensembl.org'
|
|
532 || $dbc->{_host} eq 'useastdb.ensembl.org'
|
|
533 ) && (
|
|
534 $config->{host} eq 'ens-livemirror'
|
|
535 || $config->{host} eq 'ensembldb.ensembl.org'
|
|
536 || $config->{host} eq 'useastdb.ensembl.org'
|
|
537 )
|
|
538 ) {
|
|
539 $ok = 0;
|
|
540 }
|
|
541
|
|
542 # but we still need to reconnect
|
|
543 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet});
|
|
544
|
|
545 &get_adaptors($config);
|
|
546 }
|
|
547
|
|
548 if(!$ok) {
|
|
549 if(defined($config->{skip_db_check})) {
|
|
550 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet});
|
|
551 }
|
|
552 else {
|
|
553 die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set";
|
|
554 }
|
|
555 }
|
|
556 }
|
|
557 }
|
|
558 else {
|
|
559 &get_adaptors($config);
|
|
560 &dump_adaptor_cache($config) if defined($config->{write_cache})
|
|
561 }
|
|
562
|
|
563 # reg adaptors (only fetches if not retrieved from cache already)
|
|
564 &get_reg_adaptors($config) if defined($config->{regulatory});
|
|
565
|
|
566 # get terminal width for progress bars
|
|
567 unless(defined($config->{quiet})) {
|
|
568 my $width;
|
|
569
|
|
570 # module may not be installed
|
|
571 eval q{
|
|
572 use Term::ReadKey;
|
|
573 };
|
|
574
|
|
575 if(!$@) {
|
|
576 my ($w, $h);
|
|
577
|
|
578 # module may be installed, but e.g.
|
|
579 eval {
|
|
580 ($w, $h) = GetTerminalSize();
|
|
581 };
|
|
582
|
|
583 $width = $w if defined $w;
|
|
584 }
|
|
585
|
|
586 $width ||= 60;
|
|
587 $width -= 12;
|
|
588 $config->{terminal_width} = $width;
|
|
589 }
|
|
590
|
|
591 # jump out to build cache if requested
|
|
592 if(defined($config->{build})) {
|
|
593
|
|
594 if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) {
|
|
595 die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n");
|
|
596 }
|
|
597
|
|
598 # build the cache
|
|
599 debug("Building cache for ".$config->{species}) unless defined($config->{quiet});
|
|
600 build_full_cache($config);
|
|
601
|
|
602 # exit script
|
|
603 debug("Finished building cache") unless defined($config->{quiet});
|
|
604 exit(0);
|
|
605 }
|
|
606
|
|
607 # warn user DB will be used for SIFT/PolyPhen/Condel/HGVS/frequency
|
|
608 if(defined($config->{cache})) {
|
|
609
|
|
610 # these two def depend on DB
|
|
611 foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency)) {
|
|
612 debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet});
|
|
613 }
|
|
614
|
|
615 # as does using HGVS or IDs as input
|
|
616 debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet});
|
|
617
|
|
618 # the rest may be in the cache
|
|
619 foreach my $param(grep {defined $config->{$_}} qw(sift polyphen condel regulatory)) {
|
|
620 next if defined($config->{'cache_'.$param});
|
|
621 debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet});
|
|
622 }
|
|
623 }
|
|
624
|
|
625 # get list of chrs if supplied
|
|
626 if(defined($config->{chr})) {
|
|
627 my %chrs;
|
|
628
|
|
629 foreach my $val(split /\,/, $config->{chr}) {
|
|
630 my @nnn = split /\-/, $val;
|
|
631
|
|
632 foreach my $chr($nnn[0]..$nnn[-1]) {
|
|
633 $chrs{$chr} = 1;
|
|
634 }
|
|
635 }
|
|
636
|
|
637 $config->{chr} = \%chrs;
|
|
638 }
|
|
639
|
|
640 # get input file handle
|
|
641 $config->{in_file_handle} = &get_in_file_handle($config);
|
|
642
|
|
643 # configure output file
|
|
644 $config->{out_file_handle} = &get_out_file_handle($config);
|
|
645
|
|
646 return $config;
|
|
647 }
|
|
648
|
|
649 # connects to DBs; in standalone mode this just loads registry module
|
|
650 sub connect_to_dbs {
|
|
651 my $config = shift;
|
|
652
|
|
653 # get registry
|
|
654 my $reg = 'Bio::EnsEMBL::Registry';
|
|
655
|
|
656 unless(defined($config->{standalone})) {
|
|
657 # load DB options from registry file if given
|
|
658 if(defined($config->{registry})) {
|
|
659 debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet});
|
|
660 $reg->load_all(
|
|
661 $config->{registry},
|
|
662 $config->{verbose},
|
|
663 undef,
|
|
664 $config->{no_slice_cache}
|
|
665 );
|
|
666 }
|
|
667
|
|
668 # otherwise manually connect to DB server
|
|
669 else {
|
|
670 $reg->load_registry_from_db(
|
|
671 -host => $config->{host},
|
|
672 -user => $config->{user},
|
|
673 -pass => $config->{password},
|
|
674 -port => $config->{port},
|
|
675 -db_version => $config->{db_version},
|
|
676 -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef,
|
|
677 -verbose => $config->{verbose},
|
|
678 -no_cache => $config->{no_slice_cache},
|
|
679 );
|
|
680 }
|
|
681
|
|
682 eval { $reg->set_reconnect_when_lost() };
|
|
683
|
|
684 if(defined($config->{verbose})) {
|
|
685 # get a meta container adaptors to check version
|
|
686 my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer');
|
|
687 my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer');
|
|
688
|
|
689 if($core_mca && $var_mca) {
|
|
690 debug(
|
|
691 "Connected to core version ", $core_mca->get_schema_version, " database ",
|
|
692 "and variation version ", $var_mca->get_schema_version, " database"
|
|
693 );
|
|
694 }
|
|
695 }
|
|
696 }
|
|
697
|
|
698 return $reg;
|
|
699 }
|
|
700
|
|
701 # get adaptors from DB
|
|
702 sub get_adaptors {
|
|
703 my $config = shift;
|
|
704
|
|
705 die "ERROR: No registry" unless defined $config->{reg};
|
|
706
|
|
707 $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature');
|
|
708 $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation');
|
|
709 $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix');
|
|
710 $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation');
|
|
711
|
|
712 # get fake ones for species with no var DB
|
|
713 if(!defined($config->{vfa})) {
|
|
714 $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species});
|
|
715 $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species});
|
|
716 }
|
|
717 else {
|
|
718 $config->{vfa}->db->include_failed_variations($config->{include_failed}) if defined($config->{vfa}->db) && $config->{vfa}->db->can('include_failed_variations');
|
|
719 }
|
|
720
|
|
721 $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice');
|
|
722 $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene');
|
|
723 $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript');
|
|
724 $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer');
|
|
725 $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem');
|
|
726
|
|
727 # cache schema version
|
|
728 $config->{mca}->get_schema_version if defined $config->{mca};
|
|
729
|
|
730 # check we got slice adaptor - can't continue without a core DB
|
|
731 die("ERROR: Could not connect to core database\n") unless defined $config->{sa};
|
|
732 }
|
|
733
|
|
734 # gets regulatory adaptors
|
|
735 sub get_reg_adaptors {
|
|
736 my $config = shift;
|
|
737
|
|
738 foreach my $type(@REG_FEAT_TYPES) {
|
|
739 next if defined($config->{$type.'_adaptor'});
|
|
740
|
|
741 my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type);
|
|
742 if(defined($adaptor)) {
|
|
743 $config->{$type.'_adaptor'} = $adaptor;
|
|
744 }
|
|
745 else {
|
|
746 delete $config->{regulatory};
|
|
747 last;
|
|
748 }
|
|
749 }
|
|
750 }
|
|
751
|
|
752 # gets file handle for input
|
|
753 sub get_in_file_handle {
|
|
754 my $config = shift;
|
|
755
|
|
756 # define the filehandle to read input from
|
|
757 my $in_file_handle = new FileHandle;
|
|
758
|
|
759 if(defined($config->{input_file})) {
|
|
760
|
|
761 # check defined input file exists
|
|
762 die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file};
|
|
763
|
|
764 if($config->{input_file} =~ /\.gz$/){
|
|
765 $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
|
|
766 }
|
|
767 else {
|
|
768 $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
|
|
769 }
|
|
770 }
|
|
771
|
|
772 # no file specified - try to read data off command line
|
|
773 else {
|
|
774 $in_file_handle = 'STDIN';
|
|
775 debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet};
|
|
776 }
|
|
777
|
|
778 return $in_file_handle;
|
|
779 }
|
|
780
|
|
781 # gets file handle for output and adds header
|
|
782 sub get_out_file_handle {
|
|
783 my $config = shift;
|
|
784
|
|
785 # define filehandle to write to
|
|
786 my $out_file_handle = new FileHandle;
|
|
787
|
|
788 # check if file exists
|
|
789 if(-e $config->{output_file} && !defined($config->{force_overwrite})) {
|
|
790 die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with --force_overwrite\n");
|
|
791 }
|
|
792
|
|
793 if($config->{output_file} =~ /stdout/i) {
|
|
794 $out_file_handle = *STDOUT;
|
|
795 }
|
|
796 else {
|
|
797 $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n");
|
|
798 }
|
|
799
|
|
800 # file conversion, don't want to add normal headers
|
|
801 if(defined($config->{convert})) {
|
|
802 # header for VCF
|
|
803 if($config->{convert} =~ /vcf/i) {
|
|
804 print $out_file_handle join "\t", (
|
|
805 '#CHROM',
|
|
806 'POS',
|
|
807 'ID',
|
|
808 'REF',
|
|
809 'ALT',
|
|
810 'QUAL',
|
|
811 'FILTER',
|
|
812 'INFO'
|
|
813 );
|
|
814 print $out_file_handle "\n";
|
|
815 }
|
|
816
|
|
817 return $out_file_handle;
|
|
818 }
|
|
819
|
|
820 # GVF output, no header
|
|
821 elsif(defined($config->{gvf})) {
|
|
822 return $out_file_handle;
|
|
823 }
|
|
824
|
|
825 # make header
|
|
826 my $time = &get_time;
|
|
827 my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca};
|
|
828 $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache});
|
|
829 my $version_string =
|
|
830 "Using API version ".$config->{reg}->software_version.
|
|
831 ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?');
|
|
832
|
|
833 my $header =<<HEAD;
|
|
834 ## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION
|
|
835 ## Output produced at $time
|
|
836 ## Connected to $db_string
|
|
837 ## $version_string
|
|
838 ## Extra column keys:
|
|
839 ## CANONICAL : Indicates if transcript is canonical for this gene
|
|
840 ## HGNC : HGNC gene identifier
|
|
841 ## ENSP : Ensembl protein identifer
|
|
842 ## HGVSc : HGVS coding sequence name
|
|
843 ## HGVSp : HGVS protein sequence name
|
|
844 ## SIFT : SIFT prediction
|
|
845 ## PolyPhen : PolyPhen prediction
|
|
846 ## Condel : Condel SIFT/PolyPhen consensus prediction
|
|
847 ## MATRIX : The source and identifier of a transcription factor binding profile aligned at this position
|
|
848 ## HIGH_INF_POS : A flag indicating if the variant falls in a high information position of a transcription factor binding profile
|
|
849 HEAD
|
|
850
|
|
851 # add headers
|
|
852 print $out_file_handle $header;
|
|
853
|
|
854 # add column headers
|
|
855 print $out_file_handle '#', (join "\t", @OUTPUT_COLS);
|
|
856 print $out_file_handle "\n";
|
|
857
|
|
858 return $out_file_handle;
|
|
859 }
|
|
860
|
|
861 # convert a variation feature to a line of output
|
|
862 sub convert_vf {
|
|
863 my $config = shift;
|
|
864 my $vf = shift;
|
|
865
|
|
866 my $convert_method = 'convert_to_'.lc($config->{convert});
|
|
867 my $method_ref = \&$convert_method;
|
|
868
|
|
869 my $line = &$method_ref($config, $vf);
|
|
870 my $handle = $config->{out_file_handle};
|
|
871
|
|
872 if(scalar @$line) {
|
|
873 print $handle join "\t", @$line;
|
|
874 print $handle "\n";
|
|
875 }
|
|
876 }
|
|
877
|
|
878 # converts to Ensembl format
|
|
879 sub convert_to_ensembl {
|
|
880 my $config = shift;
|
|
881 my $vf = shift;
|
|
882
|
|
883 return [
|
|
884 $vf->{chr} || $vf->seq_region_name,
|
|
885 $vf->start,
|
|
886 $vf->end,
|
|
887 $vf->allele_string,
|
|
888 $vf->strand,
|
|
889 $vf->variation_name
|
|
890 ];
|
|
891 }
|
|
892
|
|
893 # converts to VCF format
|
|
894 sub convert_to_vcf {
|
|
895 my $config = shift;
|
|
896 my $vf = shift;
|
|
897
|
|
898 # look for imbalance in the allele string
|
|
899 my %allele_lengths;
|
|
900 my @alleles = split /\//, $vf->allele_string;
|
|
901
|
|
902 foreach my $allele(@alleles) {
|
|
903 $allele =~ s/\-//g;
|
|
904 $allele_lengths{length($allele)} = 1;
|
|
905 }
|
|
906
|
|
907 # in/del/unbalanced
|
|
908 if(scalar keys %allele_lengths > 1) {
|
|
909
|
|
910 # we need the ref base before the variation
|
|
911 # default to N in case we can't get it
|
|
912 my $prev_base = 'N';
|
|
913
|
|
914 unless(defined($config->{cache})) {
|
|
915 my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1);
|
|
916 $prev_base = $slice->seq if defined($slice);
|
|
917 }
|
|
918
|
|
919 for my $i(0..$#alleles) {
|
|
920 $alleles[$i] =~ s/\-//g;
|
|
921 $alleles[$i] = $prev_base.$alleles[$i];
|
|
922 }
|
|
923
|
|
924 return [
|
|
925 $vf->{chr} || $vf->seq_region_name,
|
|
926 $vf->start - 1,
|
|
927 $vf->variation_name,
|
|
928 shift @alleles,
|
|
929 (join ",", @alleles),
|
|
930 '.', '.', '.'
|
|
931 ];
|
|
932
|
|
933 }
|
|
934
|
|
935 # balanced sub
|
|
936 else {
|
|
937 return [
|
|
938 $vf->{chr} || $vf->seq_region_name,
|
|
939 $vf->start,
|
|
940 $vf->variation_name,
|
|
941 shift @alleles,
|
|
942 (join ",", @alleles),
|
|
943 '.', '.', '.'
|
|
944 ];
|
|
945 }
|
|
946 }
|
|
947
|
|
948
|
|
949 # converts to pileup format
|
|
950 sub convert_to_pileup {
|
|
951 my $config = shift;
|
|
952 my $vf = shift;
|
|
953
|
|
954 # look for imbalance in the allele string
|
|
955 my %allele_lengths;
|
|
956 my @alleles = split /\//, $vf->allele_string;
|
|
957
|
|
958 foreach my $allele(@alleles) {
|
|
959 $allele =~ s/\-//g;
|
|
960 $allele_lengths{length($allele)} = 1;
|
|
961 }
|
|
962
|
|
963 # in/del
|
|
964 if(scalar keys %allele_lengths > 1) {
|
|
965
|
|
966 if($vf->allele_string =~ /\-/) {
|
|
967
|
|
968 # insertion?
|
|
969 if($alleles[0] eq '-') {
|
|
970 shift @alleles;
|
|
971
|
|
972 for my $i(0..$#alleles) {
|
|
973 $alleles[$i] =~ s/\-//g;
|
|
974 $alleles[$i] = '+'.$alleles[$i];
|
|
975 }
|
|
976 }
|
|
977
|
|
978 else {
|
|
979 @alleles = grep {$_ ne '-'} @alleles;
|
|
980
|
|
981 for my $i(0..$#alleles) {
|
|
982 $alleles[$i] =~ s/\-//g;
|
|
983 $alleles[$i] = '-'.$alleles[$i];
|
|
984 }
|
|
985 }
|
|
986
|
|
987 @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles;
|
|
988
|
|
989 return [
|
|
990 $vf->{chr} || $vf->seq_region_name,
|
|
991 $vf->start - 1,
|
|
992 '*',
|
|
993 (join "/", @alleles),
|
|
994 ];
|
|
995 }
|
|
996
|
|
997 else {
|
|
998 warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet});
|
|
999 return [];
|
|
1000 }
|
|
1001
|
|
1002 }
|
|
1003
|
|
1004 # balanced sub
|
|
1005 else {
|
|
1006 return [
|
|
1007 $vf->{chr} || $vf->seq_region_name,
|
|
1008 $vf->start,
|
|
1009 shift @alleles,
|
|
1010 (join "/", @alleles),
|
|
1011 ];
|
|
1012 }
|
|
1013 }
|
|
1014
|
|
1015 # prints a line of output from the hash
|
|
1016 sub print_line {
|
|
1017 my $config = shift;
|
|
1018 my $line = shift;
|
|
1019 return unless defined($line);
|
|
1020
|
|
1021 my $output;
|
|
1022
|
|
1023 # normal
|
|
1024 if(ref($line) eq 'HASH') {
|
|
1025 $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} };
|
|
1026 $output = join "\t", map { $line->{$_} || '-' } @OUTPUT_COLS;
|
|
1027 }
|
|
1028
|
|
1029 # gvf
|
|
1030 else {
|
|
1031 $output = $line;
|
|
1032 }
|
|
1033
|
|
1034 my $fh = $config->{out_file_handle};
|
|
1035 print $fh "$output\n";
|
|
1036 }
|
|
1037
|
|
1038 # outputs usage message
|
|
1039 sub usage {
|
|
1040 my $usage =<<END;
|
|
1041 #----------------------------------#
|
|
1042 # ENSEMBL VARIANT EFFECT PREDICTOR #
|
|
1043 #----------------------------------#
|
|
1044
|
|
1045 version $VERSION
|
|
1046
|
|
1047 By Will McLaren (wm2\@ebi.ac.uk)
|
|
1048
|
|
1049 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
|
|
1050
|
|
1051 Usage:
|
|
1052 perl variant_effect_predictor.pl [arguments]
|
|
1053
|
|
1054 Options
|
|
1055 =======
|
|
1056
|
|
1057 --help Display this message and quit
|
|
1058 --verbose Display verbose output as the script runs [default: off]
|
|
1059 --quiet Suppress status and warning messages [default: off]
|
|
1060 --no_progress Suppress progress bars [default: off]
|
|
1061
|
|
1062 --config Load configuration from file. Any command line options
|
|
1063 specified overwrite those in the file [default: off]
|
|
1064
|
|
1065 -i | --input_file Input file - if not specified, reads from STDIN. Files
|
|
1066 may be gzip compressed.
|
|
1067 --format Specify input file format - one of "ensembl", "pileup",
|
|
1068 "vcf", "hgvs", "id" or "guess" to try and work out format.
|
|
1069 -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this
|
|
1070 will force --quiet [default: "variant_effect_output.txt"]
|
|
1071 --force_overwrite Force overwriting of output file [default: quit if file
|
|
1072 exists]
|
|
1073
|
|
1074 --species [species] Species to use [default: "human"]
|
|
1075
|
|
1076 -t | --terms Type of consequence terms to output - one of "ensembl", "SO",
|
|
1077 "NCBI" [default: ensembl]
|
|
1078
|
|
1079 --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off]
|
|
1080 --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off]
|
|
1081 --condel=[p|s|b] Add Condel SIFT/PolyPhen consensus [p]rediction, [s]core or
|
|
1082 [b]oth. Add 1 (e.g. b1) to option to use old Condel algorithm
|
|
1083 [default: off]
|
|
1084
|
|
1085 NB: SIFT, PolyPhen and Condel predictions are currently available for human only
|
|
1086
|
|
1087 --regulatory Look for overlaps with regulatory regions. The script can
|
|
1088 also call if a variant falls in a high information position
|
|
1089 within a transcription factor binding site. Output lines have
|
|
1090 a Feature type of RegulatoryFeature or MotifFeature
|
|
1091 [default: off]
|
|
1092
|
|
1093 NB: Regulatory consequences are currently available for human and mouse only
|
|
1094
|
|
1095 --hgnc If specified, HGNC gene identifiers are output alongside the
|
|
1096 Ensembl Gene identifier [default: off]
|
|
1097 --hgvs Output HGVS identifiers (coding and protein). Requires database
|
|
1098 connection [default: off]
|
|
1099 --protein Output Ensembl protein identifer [default: off]
|
|
1100 --gene Force output of Ensembl gene identifer - disabled by default
|
|
1101 unless using --cache or --no_whole_genome [default: off]
|
|
1102 --canonical Indicate if the transcript for this consequence is the canonical
|
|
1103 transcript for this gene [default: off]
|
|
1104
|
|
1105 --coding_only Only return consequences that fall in the coding region of
|
|
1106 transcripts [default: off]
|
|
1107 --most_severe Ouptut only the most severe consequence per variation.
|
|
1108 Transcript-specific columns will be left blank. [default: off]
|
|
1109 --summary Output only a comma-separated list of all consequences per
|
|
1110 variation. Transcript-specific columns will be left blank.
|
|
1111 [default: off]
|
|
1112 --per_gene Output only the most severe consequence per gene. Where more
|
|
1113 than one transcript has the same consequence, the transcript
|
|
1114 chosen is arbitrary. [default: off]
|
|
1115
|
|
1116
|
|
1117 --check_ref If specified, checks supplied reference allele against stored
|
|
1118 entry in Ensembl Core database [default: off]
|
|
1119 --check_existing If specified, checks for existing co-located variations in the
|
|
1120 Ensembl Variation database [default: off]
|
|
1121 --check_alleles If specified, the alleles of existing co-located variations
|
|
1122 are compared to the input; an existing variation will only
|
|
1123 be reported if no novel allele is in the input (strand is
|
|
1124 accounted for) [default: off]
|
|
1125
|
|
1126 --no_intergenic Excludes intergenic consequences from the output [default: off]
|
|
1127
|
|
1128 --check_frequency Turns on frequency filtering. Use this to include or exclude
|
|
1129 variants based on the frequency of co-located existing
|
|
1130 variants in the Ensembl Variation database. You must also
|
|
1131 specify all of the following --freq flags [default: off]
|
|
1132 --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap,
|
|
1133 1kg_yri for YRI 1000 genomes. See documentation for more
|
|
1134 details
|
|
1135 --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5
|
|
1136 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or
|
|
1137 less than (lt) --freq_freq
|
|
1138 --freq_filter Specify whether variants that pass the above should be included
|
|
1139 [exclude|include] or excluded from analysis
|
|
1140
|
|
1141 --chr [list] Select a subset of chromosomes to analyse from your file. Any
|
|
1142 data not on this chromosome in the input will be skipped. The
|
|
1143 list can be comma separated, with "-" characters representing
|
|
1144 a range e.g. 1-5,8,15,X [default: off]
|
|
1145 --gp If specified, tries to read GRCh37 position from GP field in the
|
|
1146 INFO column of a VCF file. Only applies when VCF is the input
|
|
1147 format and human is the species [default: off]
|
|
1148
|
|
1149 --convert Convert the input file to the output format specified.
|
|
1150 [ensembl|vcf|pileup] Converted output is written to the file specified in
|
|
1151 --output_file. No consequence calculation is carried out when
|
|
1152 doing file conversion. [default: off]
|
|
1153
|
|
1154 --refseq Use the otherfeatures database to retrieve transcripts - this
|
|
1155 database contains RefSeq transcripts (as well as CCDS and
|
|
1156 Ensembl EST alignments) [default: off]
|
|
1157 --host Manually define database host [default: "ensembldb.ensembl.org"]
|
|
1158 -u | --user Database username [default: "anonymous"]
|
|
1159 --port Database port [default: 5306]
|
|
1160 --password Database password [default: no password]
|
|
1161 --genomes Sets DB connection params for Ensembl Genomes [default: off]
|
|
1162 --registry Registry file to use defines DB connections [default: off]
|
|
1163 Defining a registry file overrides above connection settings.
|
|
1164 --db_version=[number] Force script to load DBs from a specific Ensembl version. Not
|
|
1165 advised due to likely incompatibilities between API and DB
|
|
1166
|
|
1167 --no_whole_genome Run in old-style, non-whole genome mode [default: off]
|
|
1168 --buffer_size Sets the number of variants sent in each batch [default: 5000]
|
|
1169 Increasing buffer size can retrieve results more quickly
|
|
1170 but requires more memory. Only applies to whole genome mode.
|
|
1171
|
|
1172 --cache Enables read-only use of cache [default: off]
|
|
1173 --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"]
|
|
1174 --write_cache Enable writing to cache [default: off]
|
|
1175 --build [all|list] Build a complete cache for the selected species. Build for all
|
|
1176 chromosomes with --build all, or a list of chromosomes (see
|
|
1177 --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS
|
|
1178 VIOLATES OUR FAIR USAGE POLICY [default: off]
|
|
1179
|
|
1180 --compress Specify utility to decompress cache files - may be "gzcat" or
|
|
1181 "gzip -dc" Only use if default does not work [default: zcat]
|
|
1182
|
|
1183 --skip_db_check ADVANCED! Force the script to use a cache built from a different
|
|
1184 database than specified with --host. Only use this if you are
|
|
1185 sure the hosts are compatible (e.g. ensembldb.ensembl.org and
|
|
1186 useastdb.ensembl.org) [default: off]
|
|
1187 --cache_region_size ADVANCED! The size in base-pairs of the region covered by one
|
|
1188 file in the cache. [default: 1MB]
|
|
1189 END
|
|
1190
|
|
1191 print $usage;
|
|
1192 }
|