Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/variant_effect_predictor.pl @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 =head1 LICENSE | |
4 | |
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
6 Genome Research Limited. All rights reserved. | |
7 | |
8 This software is distributed under a modified Apache license. | |
9 For license details, please see | |
10 | |
11 http://www.ensembl.org/info/about/code_licence.html | |
12 | |
13 =head1 CONTACT | |
14 | |
15 Please email comments or questions to the public Ensembl | |
16 developers list at <dev@ensembl.org>. | |
17 | |
18 Questions may also be sent to the Ensembl help desk at | |
19 <helpdesk@ensembl.org>. | |
20 | |
21 =cut | |
22 | |
23 =head1 NAME | |
24 | |
25 Variant Effect Predictor - a script to predict the consequences of genomic variants | |
26 | |
27 http://www.ensembl.org/info/docs/variation/vep/vep_script.html | |
28 | |
29 Version 2.6 | |
30 | |
31 by Will McLaren (wm2@ebi.ac.uk) | |
32 =cut | |
33 | |
34 use strict; | |
35 use Getopt::Long; | |
36 use FileHandle; | |
37 use FindBin qw($Bin); | |
38 use lib $Bin; | |
39 | |
40 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); | |
41 use Bio::EnsEMBL::Variation::Utils::VEP qw( | |
42 parse_line | |
43 vf_to_consequences | |
44 validate_vf | |
45 convert_to_vcf | |
46 load_dumped_adaptor_cache | |
47 dump_adaptor_cache | |
48 get_all_consequences | |
49 get_slice | |
50 build_full_cache | |
51 read_cache_info | |
52 get_time | |
53 debug | |
54 @OUTPUT_COLS | |
55 @REG_FEAT_TYPES | |
56 %FILTER_SHORTCUTS | |
57 ); | |
58 | |
59 # global vars | |
60 my $VERSION = '2.6'; | |
61 | |
62 | |
63 # define headers that would normally go in the extra field | |
64 # keyed on the config parameter used to turn it on | |
65 my %extra_headers = ( | |
66 protein => ['ENSP'], | |
67 canonical => ['CANONICAL'], | |
68 ccds => ['CCDS'], | |
69 hgvs => ['HGVSc','HGVSp'], | |
70 hgnc => ['HGNC'], | |
71 sift => ['SIFT'], | |
72 polyphen => ['PolyPhen'], | |
73 numbers => ['EXON','INTRON'], | |
74 domains => ['DOMAINS'], | |
75 regulatory => ['MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE'], | |
76 cell_type => ['CELL_TYPE'], | |
77 individual => ['IND'], | |
78 xref_refseq => ['RefSeq'], | |
79 check_svs => ['SV'], | |
80 check_frequency => ['FREQS'], | |
81 gmaf => ['GMAF'], | |
82 user => ['DISTANCE'], | |
83 ); | |
84 | |
85 my %extra_descs = ( | |
86 'CANONICAL' => 'Indicates if transcript is canonical for this gene', | |
87 'CCDS' => 'Indicates if transcript is a CCDS transcript', | |
88 'HGNC' => 'HGNC gene identifier', | |
89 'ENSP' => 'Ensembl protein identifer', | |
90 'HGVSc' => 'HGVS coding sequence name', | |
91 'HGVSp' => 'HGVS protein sequence name', | |
92 'SIFT' => 'SIFT prediction', | |
93 'PolyPhen' => 'PolyPhen prediction', | |
94 'EXON' => 'Exon number(s) / total', | |
95 'INTRON' => 'Intron number(s) / total', | |
96 'DOMAINS' => 'The source and identifer of any overlapping protein domains', | |
97 'MOTIF_NAME' => 'The source and identifier of a transcription factor binding profile (TFBP) aligned at this position', | |
98 'MOTIF_POS' => 'The relative position of the variation in the aligned TFBP', | |
99 'HIGH_INF_POS' => 'A flag indicating if the variant falls in a high information position of the TFBP', | |
100 'MOTIF_SCORE_CHANGE' => 'The difference in motif score of the reference and variant sequences for the TFBP', | |
101 'CELL_TYPE' => 'List of cell types and classifications for regulatory feature', | |
102 'IND' => 'Individual name', | |
103 'SV' => 'IDs of overlapping structural variants', | |
104 'FREQS' => 'Frequencies of overlapping variants used in filtering', | |
105 'GMAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1', | |
106 'DISTANCE' => 'Shortest distance from variant to transcript', | |
107 ); | |
108 | |
109 # set output autoflush for progress bars | |
110 $| = 1; | |
111 | |
112 # configure from command line opts | |
113 my $config = &configure(scalar @ARGV); | |
114 | |
115 # run the main sub routine | |
116 &main($config); | |
117 | |
118 # this is the main sub-routine - it needs the configured $config hash | |
119 sub main { | |
120 my $config = shift; | |
121 | |
122 debug("Starting...") unless defined $config->{quiet}; | |
123 | |
124 $config->{start_time} = time(); | |
125 $config->{last_time} = time(); | |
126 | |
127 my $tr_cache = {}; | |
128 my $rf_cache = {}; | |
129 | |
130 # create a hash to hold slices so we don't get the same one twice | |
131 my %slice_cache = (); | |
132 | |
133 my @vfs; | |
134 my ($vf_count, $total_vf_count); | |
135 my $in_file_handle = $config->{in_file_handle}; | |
136 | |
137 # initialize line number in config | |
138 $config->{line_number} = 0; | |
139 | |
140 # read the file | |
141 while(<$in_file_handle>) { | |
142 chomp; | |
143 | |
144 $config->{line_number}++; | |
145 | |
146 # header line? | |
147 if(/^\#/) { | |
148 | |
149 # retain header lines if we are outputting VCF | |
150 if(defined($config->{vcf})) { | |
151 push @{$config->{headers}}, $_; | |
152 } | |
153 | |
154 # line with sample labels in VCF | |
155 if(defined($config->{individual}) && /^#CHROM/) { | |
156 my @split = split /\s+/; | |
157 | |
158 # no individuals | |
159 die("ERROR: No individual data found in VCF\n") if scalar @split <= 9; | |
160 | |
161 # get individual column indices | |
162 my %ind_cols = map {$split[$_] => $_} (9..$#split); | |
163 | |
164 # all? | |
165 if(scalar @{$config->{individual}} == 1 && $config->{individual}->[0] =~ /^all$/i) { | |
166 $config->{ind_cols} = \%ind_cols; | |
167 } | |
168 else { | |
169 my %new_ind_cols; | |
170 | |
171 # check we have specified individual(s) | |
172 foreach my $ind(@{$config->{individual}}) { | |
173 die("ERROR: Individual named \"$ind\" not found in VCF\n") unless defined $ind_cols{$ind}; | |
174 $new_ind_cols{$ind} = $ind_cols{$ind}; | |
175 } | |
176 | |
177 $config->{ind_cols} = \%new_ind_cols; | |
178 } | |
179 } | |
180 | |
181 next; | |
182 } | |
183 | |
184 # configure output file | |
185 $config->{out_file_handle} ||= &get_out_file_handle($config); | |
186 | |
187 # some lines (pileup) may actually parse out into more than one variant | |
188 foreach my $vf(@{&parse_line($config, $_)}) { | |
189 | |
190 $vf->{_line} = $_ ;#if defined($config->{vcf}) || defined($config->{original}); | |
191 | |
192 # now get the slice | |
193 if(!defined($vf->{slice})) { | |
194 my $slice; | |
195 | |
196 # don't get slices if we're using cache | |
197 # we can steal them from transcript objects later | |
198 if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) { | |
199 | |
200 # check if we have fetched this slice already | |
201 if(defined $slice_cache{$vf->{chr}}) { | |
202 $slice = $slice_cache{$vf->{chr}}; | |
203 } | |
204 | |
205 # if not create a new one | |
206 else { | |
207 | |
208 $slice = &get_slice($config, $vf->{chr}); | |
209 | |
210 # if failed, warn and skip this line | |
211 if(!defined($slice)) { | |
212 warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet}; | |
213 next; | |
214 } | |
215 | |
216 # store the hash | |
217 $slice_cache{$vf->{chr}} = $slice; | |
218 } | |
219 } | |
220 | |
221 $vf->{slice} = $slice; | |
222 } | |
223 | |
224 # validate the VF | |
225 next unless validate_vf($config, $vf); | |
226 | |
227 # make a name if one doesn't exist | |
228 $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.($vf->{allele_string} || $vf->{class_SO_term}); | |
229 | |
230 # jump out to convert here | |
231 if(defined($config->{convert})) { | |
232 &convert_vf($config, $vf); | |
233 next; | |
234 } | |
235 | |
236 if(defined $config->{whole_genome}) { | |
237 push @vfs, $vf; | |
238 $vf_count++; | |
239 $total_vf_count++; | |
240 | |
241 if($vf_count == $config->{buffer_size}) { | |
242 debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); | |
243 | |
244 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; | |
245 | |
246 # calculate stats | |
247 my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); | |
248 my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); | |
249 $config->{last_time} = time(); | |
250 | |
251 debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); | |
252 | |
253 @vfs = (); | |
254 $vf_count = 0; | |
255 } | |
256 } | |
257 else { | |
258 print_line($config, $_) foreach @{vf_to_consequences($config, $vf)}; | |
259 $vf_count++; | |
260 $total_vf_count++; | |
261 debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose}); | |
262 } | |
263 } | |
264 } | |
265 | |
266 # if in whole-genome mode, finish off the rest of the buffer | |
267 if(defined $config->{whole_genome} && scalar @vfs) { | |
268 debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); | |
269 | |
270 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; | |
271 | |
272 # calculate stats | |
273 my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); | |
274 my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); | |
275 $config->{last_time} = time(); | |
276 | |
277 debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); | |
278 | |
279 debug($config->{filter_count}, "/$total_vf_count variants remain after filtering") if defined($config->{filter}) && !defined($config->{quiet}); | |
280 } | |
281 | |
282 debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet}); | |
283 | |
284 debug("Finished!") unless defined $config->{quiet}; | |
285 } | |
286 | |
287 # sets up configuration hash that is used throughout the script | |
288 sub configure { | |
289 my $args = shift; | |
290 | |
291 my $config = {}; | |
292 | |
293 GetOptions( | |
294 $config, | |
295 'help', # displays help message | |
296 | |
297 # input options, | |
298 'config=s', # config file name | |
299 'input_file|i=s', # input file name | |
300 'format=s', # input file format | |
301 | |
302 # DB options | |
303 'species=s', # species e.g. human, homo_sapiens | |
304 'registry=s', # registry file | |
305 'host=s', # database host | |
306 'port=s', # database port | |
307 'user=s', # database user name | |
308 'password=s', # database password | |
309 'db_version=i', # Ensembl database version to use e.g. 62 | |
310 'genomes', # automatically sets DB params for e!Genomes | |
311 'refseq', # use otherfeatures RefSeq DB instead of Ensembl | |
312 #'no_disconnect', # disables disconnect_when_inactive | |
313 | |
314 # runtime options | |
315 'most_severe', # only return most severe consequence | |
316 'summary', # only return one line per variation with all consquence types | |
317 'per_gene', # only return most severe per gene | |
318 'buffer_size=i', # number of variations to read in before analysis | |
319 'chunk_size=s', # size in bases of "chunks" used in internal hash structure | |
320 'failed=i', # include failed variations when finding existing | |
321 'no_whole_genome', # disables now default whole-genome mode | |
322 'whole_genome', # proxy for whole genome mode - now just warns user | |
323 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG) | |
324 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT | |
325 'check_ref', # check supplied reference allele against DB | |
326 'check_existing', # find existing co-located variations | |
327 'check_svs', # find overlapping structural variations | |
328 'check_alleles', # only attribute co-located if alleles are the same | |
329 'check_frequency', # enable frequency checking | |
330 'gmaf', # add global MAF of existing var | |
331 'freq_filter=s', # exclude or include | |
332 'freq_freq=f', # frequency to filter on | |
333 'freq_gt_lt=s', # gt or lt (greater than or less than) | |
334 'freq_pop=s', # population to filter on | |
335 'allow_non_variant', # allow non-variant VCF lines through | |
336 'individual=s', # give results by genotype for individuals | |
337 'phased', # force VCF genotypes to be interpreted as phased | |
338 'fork=i', # fork into N processes | |
339 | |
340 # verbosity options | |
341 'verbose|v', # print out a bit more info while running | |
342 'quiet', # print nothing to STDOUT (unless using -o stdout) | |
343 'no_progress', # don't display progress bars | |
344 | |
345 # output options | |
346 'everything|e', # switch on EVERYTHING :-) | |
347 'output_file|o=s', # output file name | |
348 'force_overwrite', # force overwrite of output file if already exists | |
349 'terms|t=s', # consequence terms to use e.g. NCBI, SO | |
350 'coding_only', # only return results for consequences in coding regions | |
351 'canonical', # indicates if transcript is canonical | |
352 'ccds', # output CCDS identifer | |
353 'xref_refseq', # output refseq mrna xref | |
354 'protein', # add e! protein ID to extra column | |
355 'hgnc', # add HGNC gene ID to extra column | |
356 'hgvs', # add HGVS names to extra column | |
357 'sift=s', # SIFT predictions | |
358 'polyphen=s', # PolyPhen predictions | |
359 'condel=s', # Condel predictions | |
360 'regulatory', # enable regulatory stuff | |
361 'cell_type=s', # filter cell types for regfeats | |
362 'convert=s', # convert input to another format (doesn't run VEP) | |
363 'filter=s', # run in filtering mode | |
364 'no_intergenic', # don't print out INTERGENIC consequences | |
365 'gvf', # produce gvf output | |
366 'vcf', # produce vcf output | |
367 'original', # produce output in input format | |
368 'no_consequences', # don't calculate consequences | |
369 'lrg', # enable LRG-based features | |
370 'fields=s', # define your own output fields | |
371 'domains', # output overlapping protein features | |
372 'numbers', # include exon and intron numbers | |
373 | |
374 # cache stuff | |
375 'cache', # use cache | |
376 'write_cache', # enables writing to the cache | |
377 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs | |
378 'no_adaptor_cache', # don't write adaptor cache | |
379 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript | |
380 'strip', # strips adaptors etc from objects before caching them | |
381 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more | |
382 'dir=s', # dir where cache is found (defaults to $HOME/.vep/) | |
383 'cache_region_size=i', # size of region in bases for each cache file | |
384 'no_slice_cache', # tell API not to cache features on slice | |
385 'standalone', # standalone renamed offline | |
386 'offline', # offline mode uses minimal set of modules installed in same dir, no DB connection | |
387 'skip_db_check', # don't compare DB parameters with cached | |
388 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc" | |
389 'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped file with annotation | |
390 'tmpdir=s', # tmp dir used for BigWig retrieval | |
391 'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory | |
392 | |
393 # debug | |
394 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL | |
395 'count_queries', # counts SQL queries executed | |
396 'admin', # allows me to build off public hosts | |
397 'debug', # print out debug info | |
398 'tabix', # experimental use tabix cache files | |
399 ) or die "ERROR: Failed to parse command-line flags\n"; | |
400 | |
401 # print usage message if requested or no args supplied | |
402 if(defined($config->{help}) || !$args) { | |
403 &usage; | |
404 exit(0); | |
405 } | |
406 | |
407 # dir is where the cache and plugins live | |
408 $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep'); | |
409 | |
410 # dir gets set to the specific cache directory later on, so take a copy to use | |
411 # when configuring plugins | |
412 | |
413 $config->{toplevel_dir} = $config->{dir}; | |
414 | |
415 # ini file? | |
416 my $ini_file = $config->{dir}.'/vep.ini'; | |
417 | |
418 if(-e $ini_file) { | |
419 read_config_from_file($config, $ini_file); | |
420 } | |
421 | |
422 # config file? | |
423 if(defined $config->{config}) { | |
424 read_config_from_file($config, $config->{config}); | |
425 } | |
426 | |
427 # can't be both quiet and verbose | |
428 die "ERROR: Can't be both quiet and verbose!\n" if defined($config->{quiet}) && defined($config->{verbose}); | |
429 | |
430 # check forking | |
431 if(defined($config->{fork})) { | |
432 die "ERROR: Fork number must be greater than 1\n" if $config->{fork} <= 1; | |
433 | |
434 # check we can use MIME::Base64 | |
435 eval q{ use MIME::Base64; }; | |
436 | |
437 if($@) { | |
438 debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet}); | |
439 delete $config->{fork}; | |
440 } | |
441 else { | |
442 | |
443 # try a practice fork | |
444 my $pid = fork; | |
445 | |
446 if(!defined($pid)) { | |
447 debug("WARNING: Fork test failed, forking disabled") unless defined($config->{quiet}); | |
448 delete $config->{fork}; | |
449 } | |
450 elsif($pid) { | |
451 waitpid($pid, 0); | |
452 } | |
453 elsif($pid == 0) { | |
454 exit(0); | |
455 } | |
456 } | |
457 } | |
458 | |
459 # check file format | |
460 if(defined $config->{format}) { | |
461 die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /^(pileup|vcf|guess|hgvs|ensembl|id|vep)$/i; | |
462 } | |
463 | |
464 # check convert format | |
465 if(defined $config->{convert}) { | |
466 die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup|hgvs/i; | |
467 } | |
468 | |
469 # check if user still using --standalone | |
470 if(defined $config->{standalone}) { | |
471 die "ERROR: --standalone replaced by --offline\n"; | |
472 } | |
473 | |
474 # connection settings for Ensembl Genomes | |
475 if($config->{genomes}) { | |
476 $config->{host} ||= 'mysql.ebi.ac.uk'; | |
477 $config->{port} ||= 4157; | |
478 } | |
479 | |
480 # connection settings for main Ensembl | |
481 else { | |
482 $config->{species} ||= "homo_sapiens"; | |
483 $config->{host} ||= 'ensembldb.ensembl.org'; | |
484 $config->{port} ||= 5306; | |
485 } | |
486 | |
487 # refseq or core? | |
488 if(defined($config->{refseq})) { | |
489 $config->{core_type} = 'otherfeatures'; | |
490 } | |
491 else { | |
492 $config->{core_type} = 'core'; | |
493 } | |
494 | |
495 # output term | |
496 if(defined $config->{terms}) { | |
497 die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i; | |
498 if($config->{terms} =~ /ensembl|display/i) { | |
499 $config->{terms} = 'display'; | |
500 } | |
501 else { | |
502 $config->{terms} = uc($config->{terms}); | |
503 } | |
504 } | |
505 | |
506 # everything? | |
507 if(defined($config->{everything})) { | |
508 my %everything = ( | |
509 sift => 'b', | |
510 polyphen => 'b', | |
511 ccds => 1, | |
512 hgvs => 1, | |
513 hgnc => 1, | |
514 numbers => 1, | |
515 domains => 1, | |
516 regulatory => 1, | |
517 canonical => 1, | |
518 protein => 1, | |
519 gmaf => 1, | |
520 ); | |
521 | |
522 $config->{$_} = $everything{$_} for keys %everything; | |
523 | |
524 # these ones won't work with offline | |
525 delete $config->{hgvs} if defined($config->{offline}); | |
526 } | |
527 | |
528 # check nsSNP tools | |
529 foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) { | |
530 die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/; | |
531 | |
532 die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i; | |
533 | |
534 die "ERROR: $tool functionality is now available as a VEP Plugin - see http://www.ensembl.org/info/docs/variation/vep/vep_script.html#plugins\n" if $tool eq 'Condel'; | |
535 } | |
536 | |
537 # force quiet if outputting to STDOUT | |
538 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { | |
539 delete $config->{verbose} if defined($config->{verbose}); | |
540 $config->{quiet} = 1; | |
541 } | |
542 | |
543 # individual(s) specified? | |
544 if(defined($config->{individual})) { | |
545 $config->{individual} = [split /\,/, $config->{individual}]; | |
546 | |
547 # force allow_non_variant | |
548 $config->{allow_non_variant} = 1; | |
549 } | |
550 | |
551 # summarise options if verbose | |
552 if(defined $config->{verbose}) { | |
553 my $header =<<INTRO; | |
554 #----------------------------------# | |
555 # ENSEMBL VARIANT EFFECT PREDICTOR # | |
556 #----------------------------------# | |
557 | |
558 version $VERSION | |
559 | |
560 By Will McLaren (wm2\@ebi.ac.uk) | |
561 | |
562 Configuration options: | |
563 | |
564 INTRO | |
565 print $header; | |
566 | |
567 my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1]; | |
568 | |
569 foreach my $key(sort keys %$config) { | |
570 next if ref($config->{$key}) eq 'ARRAY' && scalar @{$config->{$key}} == 0; | |
571 print $key.(' ' x (($max_length - length($key)) + 4)).(ref($config->{$key}) eq 'ARRAY' ? join "\t", @{$config->{$key}} : $config->{$key})."\n"; | |
572 } | |
573 | |
574 print "\n".("-" x 20)."\n\n"; | |
575 } | |
576 | |
577 # check custom annotations | |
578 for my $i(0..$#{$config->{custom}}) { | |
579 my $custom = $config->{custom}->[$i]; | |
580 | |
581 my ($filepath, $shortname, $format, $type, $coords) = split /\,/, $custom; | |
582 $type ||= 'exact'; | |
583 $format ||= 'bed'; | |
584 $coords ||= 0; | |
585 | |
586 # check type | |
587 die "ERROR: Type $type for custom annotation file $filepath is not allowed (must be one of \"exact\", \"overlap\")\n" unless $type =~ /exact|overlap/; | |
588 | |
589 # check format | |
590 die "ERROR: Format $format for custom annotation file $filepath is not allowed (must be one of \"bed\", \"vcf\", \"gtf\", \"gff\", \"bigwig\")\n" unless $format =~ /bed|vcf|gff|gtf|bigwig/; | |
591 | |
592 # bigwig format | |
593 if($format eq 'bigwig') { | |
594 # check for bigWigToWig | |
595 die "ERROR: bigWigToWig does not seem to be in your path - this is required to use bigwig format custom annotations\n" unless `which bigWigToWig 2>&1` =~ /bigWigToWig$/; | |
596 } | |
597 | |
598 else { | |
599 # check for tabix | |
600 die "ERROR: tabix does not seem to be in your path - this is required to use custom annotations\n" unless `which tabix 2>&1` =~ /tabix$/; | |
601 | |
602 # remote files? | |
603 if($filepath =~ /tp\:\/\//) { | |
604 my $remote_test = `tabix $filepath 1:1-1 2>&1`; | |
605 if($remote_test =~ /fail/) { | |
606 die "$remote_test\nERROR: Could not find file or index file for remote annotation file $filepath\n"; | |
607 } | |
608 elsif($remote_test =~ /get_local_version/) { | |
609 debug("Downloaded tabix index file for remote annotation file $filepath") unless defined($config->{quiet}); | |
610 } | |
611 } | |
612 | |
613 # check files exist | |
614 else { | |
615 die "ERROR: Custom annotation file $filepath not found\n" unless -e $filepath; | |
616 die "ERROR: Tabix index file $filepath\.tbi not found - perhaps you need to create it first?\n" unless -e $filepath.'.tbi'; | |
617 } | |
618 } | |
619 | |
620 $config->{custom}->[$i] = { | |
621 'file' => $filepath, | |
622 'name' => $shortname || 'CUSTOM'.($i + 1), | |
623 'type' => $type, | |
624 'format' => $format, | |
625 'coords' => $coords, | |
626 }; | |
627 } | |
628 | |
629 # check if using filter and original | |
630 die "ERROR: You must also provide output filters using --filter to use --original\n" if defined($config->{original}) && !defined($config->{filter}); | |
631 | |
632 # filter by consequence? | |
633 if(defined($config->{filter})) { | |
634 | |
635 my %filters = map {$_ => 1} split /\,/, $config->{filter}; | |
636 | |
637 # add in shortcuts | |
638 foreach my $filter(keys %filters) { | |
639 my $value = 1; | |
640 if($filter =~ /^no_/) { | |
641 delete $filters{$filter}; | |
642 $filter =~ s/^no_//g; | |
643 $value = 0; | |
644 $filters{$filter} = $value; | |
645 } | |
646 | |
647 if(defined($FILTER_SHORTCUTS{$filter})) { | |
648 delete $filters{$filter}; | |
649 $filters{$_} = $value for keys %{$FILTER_SHORTCUTS{$filter}}; | |
650 } | |
651 } | |
652 | |
653 $config->{filter} = \%filters; | |
654 | |
655 $config->{filter_count} = 0; | |
656 } | |
657 | |
658 # set defaults | |
659 $config->{user} ||= 'anonymous'; | |
660 $config->{buffer_size} ||= 5000; | |
661 $config->{chunk_size} ||= '50kb'; | |
662 $config->{output_file} ||= "variant_effect_output.txt"; | |
663 $config->{tmpdir} ||= '/tmp'; | |
664 $config->{format} ||= 'guess'; | |
665 $config->{terms} ||= 'SO'; | |
666 $config->{cache_region_size} ||= 1000000; | |
667 $config->{compress} ||= 'zcat'; | |
668 | |
669 # regulatory has to be on for cell_type | |
670 if(defined($config->{cell_type})) { | |
671 $config->{regulatory} = 1; | |
672 $config->{cell_type} = [split /\,/, $config->{cell_type}] if defined($config->{cell_type}); | |
673 } | |
674 | |
675 # can't use a whole bunch of options with most_severe | |
676 if(defined($config->{most_severe})) { | |
677 foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains summary)) { | |
678 die "ERROR: --most_severe is not compatible with --$flag\n" if defined($config->{$flag}); | |
679 } | |
680 } | |
681 | |
682 # can't use a whole bunch of options with summary | |
683 if(defined($config->{summary})) { | |
684 foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains most_severe)) { | |
685 die "ERROR: --summary is not compatible with --$flag\n" if defined($config->{$flag}); | |
686 } | |
687 } | |
688 | |
689 # frequency filtering | |
690 if(defined($config->{check_frequency})) { | |
691 foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) { | |
692 die "ERROR: To use --check_frequency you must also specify flag --$flag\n" unless defined $config->{$flag}; | |
693 } | |
694 | |
695 # need to set check_existing | |
696 $config->{check_existing} = 1; | |
697 } | |
698 | |
699 $config->{check_existing} = 1 if defined $config->{check_alleles} || defined $config->{gmaf}; | |
700 | |
701 # warn users still using whole_genome flag | |
702 if(defined($config->{whole_genome})) { | |
703 debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet}); | |
704 } | |
705 | |
706 $config->{whole_genome} = 1 unless defined $config->{no_whole_genome}; | |
707 $config->{failed} = 0 unless defined $config->{failed}; | |
708 $config->{chunk_size} =~ s/mb?/000000/i; | |
709 $config->{chunk_size} =~ s/kb?/000/i; | |
710 $config->{cache_region_size} =~ s/mb?/000000/i; | |
711 $config->{cache_region_size} =~ s/kb?/000/i; | |
712 | |
713 # cluck and display executed SQL? | |
714 $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck}); | |
715 | |
716 # offline needs cache, can't use HGVS | |
717 if(defined($config->{offline})) { | |
718 $config->{cache} = 1; | |
719 | |
720 #die("ERROR: Cannot generate HGVS coordinates in offline mode\n") if defined($config->{hgvs}); | |
721 die("ERROR: Cannot use HGVS as input in offline mode\n") if $config->{format} eq 'hgvs'; | |
722 die("ERROR: Cannot use variant identifiers as input in offline mode\n") if $config->{format} eq 'id'; | |
723 die("ERROR: Cannot do frequency filtering in offline mode\n") if defined($config->{check_frequency}); | |
724 die("ERROR: Cannot retrieve overlapping structural variants in offline mode\n") if defined($config->{check_sv}); | |
725 } | |
726 | |
727 # write_cache needs cache | |
728 $config->{cache} = 1 if defined $config->{write_cache}; | |
729 | |
730 # no_slice_cache, prefetch and whole_genome have to be on to use cache | |
731 if(defined($config->{cache})) { | |
732 $config->{prefetch} = 1; | |
733 $config->{no_slice_cache} = 1; | |
734 $config->{whole_genome} = 1; | |
735 $config->{strip} = 1; | |
736 } | |
737 | |
738 $config->{build} = $config->{rebuild} if defined($config->{rebuild}); | |
739 | |
740 # force options for full build | |
741 if(defined($config->{build})) { | |
742 $config->{prefetch} = 1; | |
743 $config->{hgnc} = 1; | |
744 $config->{no_slice_cache} = 1; | |
745 $config->{cache} = 1; | |
746 $config->{strip} = 1; | |
747 $config->{write_cache} = 1; | |
748 $config->{cell_type} = 1 if defined($config->{regulatory}); | |
749 } | |
750 | |
751 # connect to databases | |
752 $config->{reg} = &connect_to_dbs($config); | |
753 | |
754 # complete dir with species name and db_version | |
755 $config->{dir} .= '/'.( | |
756 join '/', ( | |
757 defined($config->{offline}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}), | |
758 $config->{db_version} || $config->{reg}->software_version | |
759 ) | |
760 ); | |
761 | |
762 # warn user cache directory doesn't exist | |
763 if(!-e $config->{dir}) { | |
764 | |
765 # if using write_cache | |
766 if(defined($config->{write_cache})) { | |
767 debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet}); | |
768 } | |
769 | |
770 # want to read cache, not found | |
771 elsif(defined($config->{cache})) { | |
772 die("ERROR: Cache directory ", $config->{dir}, " not found"); | |
773 } | |
774 } | |
775 | |
776 if(defined($config->{cache})) { | |
777 # read cache info | |
778 if(read_cache_info($config)) { | |
779 debug("Read existing cache info") unless defined $config->{quiet}; | |
780 } | |
781 } | |
782 | |
783 # we configure plugins here because they can sometimes switch on the | |
784 # regulatory config option | |
785 configure_plugins($config); | |
786 | |
787 # include regulatory modules if requested | |
788 if(defined($config->{regulatory})) { | |
789 # do the use statements here so that users don't have to have the | |
790 # funcgen API installed to use the rest of the script | |
791 eval q{ | |
792 use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor; | |
793 use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor; | |
794 use Bio::EnsEMBL::Funcgen::MotifFeature; | |
795 use Bio::EnsEMBL::Funcgen::RegulatoryFeature; | |
796 use Bio::EnsEMBL::Funcgen::BindingMatrix; | |
797 }; | |
798 | |
799 if($@) { | |
800 die("ERROR: Ensembl Funcgen API must be installed to use --regulatory or plugins that deal with regulatory features\n"); | |
801 } | |
802 } | |
803 | |
804 # user defined custom output fields | |
805 if(defined($config->{fields})) { | |
806 $config->{fields} = [split ',', $config->{fields}]; | |
807 debug("Output fields redefined (".scalar @{$config->{fields}}." defined)") unless defined($config->{quiet}); | |
808 $config->{fields_redefined} = 1; | |
809 } | |
810 $config->{fields} ||= \@OUTPUT_COLS; | |
811 | |
812 # suppress warnings that the FeatureAdpators spit if using no_slice_cache | |
813 Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache}); | |
814 | |
815 # get adaptors (don't get them in offline mode) | |
816 unless(defined($config->{offline})) { | |
817 | |
818 if(defined($config->{cache}) && !defined($config->{write_cache})) { | |
819 | |
820 # try and load adaptors from cache | |
821 if(!&load_dumped_adaptor_cache($config)) { | |
822 &get_adaptors($config); | |
823 &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); | |
824 } | |
825 | |
826 # check cached adaptors match DB params | |
827 else { | |
828 my $dbc = $config->{sa}->{dbc}; | |
829 | |
830 my $ok = 1; | |
831 | |
832 if($dbc->{_host} ne $config->{host}) { | |
833 | |
834 # ens-livemirror, useastdb and ensembldb should all have identical DBs | |
835 unless( | |
836 ( | |
837 $dbc->{_host} eq 'ens-livemirror' | |
838 || $dbc->{_host} eq 'ensembldb.ensembl.org' | |
839 || $dbc->{_host} eq 'useastdb.ensembl.org' | |
840 ) && ( | |
841 $config->{host} eq 'ens-livemirror' | |
842 || $config->{host} eq 'ensembldb.ensembl.org' | |
843 || $config->{host} eq 'useastdb.ensembl.org' | |
844 ) | |
845 ) { | |
846 $ok = 0; | |
847 } | |
848 | |
849 unless(defined($config->{skip_db_check})) { | |
850 # but we still need to reconnect | |
851 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet}); | |
852 | |
853 &get_adaptors($config); | |
854 } | |
855 } | |
856 | |
857 if(!$ok) { | |
858 if(defined($config->{skip_db_check})) { | |
859 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet}); | |
860 } | |
861 else { | |
862 die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set"; | |
863 } | |
864 } | |
865 } | |
866 } | |
867 else { | |
868 &get_adaptors($config); | |
869 &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); | |
870 } | |
871 | |
872 # reg adaptors (only fetches if not retrieved from cache already) | |
873 &get_reg_adaptors($config) if defined($config->{regulatory}); | |
874 } | |
875 | |
876 # check cell types | |
877 if(defined($config->{cell_type}) && !defined($config->{build})) { | |
878 my $cls = ''; | |
879 | |
880 if(defined($config->{cache})) { | |
881 $cls = $config->{cache_cell_types}; | |
882 } | |
883 else { | |
884 my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor(); | |
885 $cls = join ",", map {$_->name} @{$cta->fetch_all}; | |
886 } | |
887 | |
888 foreach my $cl(@{$config->{cell_type}}) { | |
889 die "ERROR: cell type $cl not recognised; available cell types are:\n$cls\n" unless $cls =~ /(^|,)$cl(,|$)/; | |
890 } | |
891 } | |
892 | |
893 # get terminal width for progress bars | |
894 unless(defined($config->{quiet})) { | |
895 my $width; | |
896 | |
897 # module may not be installed | |
898 eval q{ | |
899 use Term::ReadKey; | |
900 }; | |
901 | |
902 if(!$@) { | |
903 my ($w, $h); | |
904 | |
905 # module may be installed, but e.g. | |
906 eval { | |
907 #($w, $h) = GetTerminalSize(); | |
908 $w = 167; | |
909 $h = 30; | |
910 }; | |
911 | |
912 $width = $w if defined $w; | |
913 } | |
914 | |
915 $width ||= 60; | |
916 $width -= 12; | |
917 $config->{terminal_width} = $width; | |
918 } | |
919 | |
920 # jump out to build cache if requested | |
921 if(defined($config->{build})) { | |
922 | |
923 if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) { | |
924 die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n"); | |
925 } | |
926 | |
927 # build the cache | |
928 debug("Building cache for ".$config->{species}) unless defined($config->{quiet}); | |
929 build_full_cache($config); | |
930 | |
931 # exit script | |
932 debug("Finished building cache") unless defined($config->{quiet}); | |
933 exit(0); | |
934 } | |
935 | |
936 | |
937 # warn user DB will be used for SIFT/PolyPhen/HGVS/frequency/LRG | |
938 if(defined($config->{cache})) { | |
939 | |
940 # these two def depend on DB | |
941 foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency lrg check_sv)) { | |
942 debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet}); | |
943 } | |
944 | |
945 # as does using HGVS or IDs as input | |
946 debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet}); | |
947 | |
948 # the rest may be in the cache | |
949 foreach my $param(grep {defined $config->{$_}} qw(sift polyphen regulatory)) { | |
950 next if defined($config->{'cache_'.$param}); | |
951 debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet}); | |
952 } | |
953 } | |
954 | |
955 # get list of chrs if supplied | |
956 if(defined($config->{chr})) { | |
957 my %chrs; | |
958 | |
959 foreach my $val(split /\,/, $config->{chr}) { | |
960 my @nnn = split /\-/, $val; | |
961 | |
962 foreach my $chr($nnn[0]..$nnn[-1]) { | |
963 $chrs{$chr} = 1; | |
964 } | |
965 } | |
966 | |
967 $config->{chr} = \%chrs; | |
968 } | |
969 | |
970 # get input file handle | |
971 $config->{in_file_handle} = &get_in_file_handle($config); | |
972 | |
973 return $config; | |
974 } | |
975 | |
976 # reads config from a file | |
977 sub read_config_from_file { | |
978 my $config = shift; | |
979 my $file = shift; | |
980 | |
981 open CONFIG, $file or die "ERROR: Could not open config file \"$file\"\n"; | |
982 | |
983 while(<CONFIG>) { | |
984 next if /^\#/; | |
985 my @split = split /\s+|\=/; | |
986 my $key = shift @split; | |
987 $key =~ s/^\-//g; | |
988 | |
989 if(defined($config->{$key}) && ref($config->{$key}) eq 'ARRAY') { | |
990 push @{$config->{$key}}, @split; | |
991 } | |
992 else { | |
993 $config->{$key} ||= $split[0]; | |
994 } | |
995 } | |
996 | |
997 close CONFIG; | |
998 | |
999 # force quiet if outputting to STDOUT | |
1000 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { | |
1001 delete $config->{verbose} if defined($config->{verbose}); | |
1002 $config->{quiet} = 1; | |
1003 } | |
1004 | |
1005 debug("Read configuration from $file") unless defined($config->{quiet}); | |
1006 } | |
1007 | |
1008 # configures custom VEP plugins | |
1009 sub configure_plugins { | |
1010 | |
1011 my $config = shift; | |
1012 | |
1013 $config->{plugins} = []; | |
1014 | |
1015 if (my @plugins = @{ $config->{plugin} }) { | |
1016 | |
1017 # add the Plugins directory onto @INC | |
1018 | |
1019 unshift @INC, $config->{toplevel_dir}."/Plugins"; | |
1020 | |
1021 for my $plugin (@plugins) { | |
1022 | |
1023 # parse out the module name and parameters | |
1024 | |
1025 my ($module, @params) = split /,/, $plugin; | |
1026 | |
1027 # check we can use the module | |
1028 | |
1029 eval qq{ | |
1030 use $module; | |
1031 }; | |
1032 if ($@) { | |
1033 debug("Failed to compile plugin $module: $@") unless defined($config->{quiet}); | |
1034 next; | |
1035 } | |
1036 | |
1037 # now check we can instantiate it, passing any parameters to the constructor | |
1038 | |
1039 my $instance; | |
1040 | |
1041 eval { | |
1042 $instance = $module->new($config, @params); | |
1043 }; | |
1044 if ($@) { | |
1045 debug("Failed to instantiate plugin $module: $@") unless defined($config->{quiet}); | |
1046 next; | |
1047 } | |
1048 | |
1049 # check that the versions match | |
1050 | |
1051 my $plugin_version; | |
1052 | |
1053 if ($instance->can('version')) { | |
1054 $plugin_version = $instance->version; | |
1055 } | |
1056 | |
1057 my $version_ok = 1; | |
1058 | |
1059 if ($plugin_version) { | |
1060 my ($plugin_major, $plugin_minor, $plugin_maintenance) = split /\./, $plugin_version; | |
1061 my ($major, $minor, $maintenance) = split /\./, $VERSION; | |
1062 | |
1063 if ($plugin_major != $major) { | |
1064 debug("Warning: plugin $plugin version ($plugin_version) does not match the current VEP version ($VERSION)") unless defined($config->{quiet}); | |
1065 $version_ok = 0; | |
1066 } | |
1067 } | |
1068 else { | |
1069 debug("Warning: plugin $plugin does not define a version number") unless defined($config->{quiet}); | |
1070 $version_ok = 0; | |
1071 } | |
1072 | |
1073 debug("You may experience unexpected behaviour with this plugin") unless defined($config->{quiet}) || $version_ok; | |
1074 | |
1075 # check that it implements all necessary methods | |
1076 | |
1077 for my $required(qw(run get_header_info check_feature_type check_variant_feature_type)) { | |
1078 unless ($instance->can($required)) { | |
1079 debug("Plugin $module doesn't implement a required method '$required', does it inherit from BaseVepPlugin?") unless defined($config->{quiet}); | |
1080 next; | |
1081 } | |
1082 } | |
1083 | |
1084 # all's good, so save the instance in our list of plugins | |
1085 | |
1086 push @{ $config->{plugins} }, $instance; | |
1087 | |
1088 debug("Loaded plugin: $module") unless defined($config->{quiet}); | |
1089 | |
1090 # for convenience, check if the plugin wants regulatory stuff and turn on the config option if so | |
1091 | |
1092 if (grep { $_ =~ /motif|regulatory/i } @{ $instance->feature_types }) { | |
1093 debug("Fetching regulatory features for plugin: $module") unless defined($config->{quiet}); | |
1094 $config->{regulatory} = 1; | |
1095 } | |
1096 } | |
1097 } | |
1098 } | |
1099 | |
1100 # connects to DBs (not done in offline mode) | |
1101 sub connect_to_dbs { | |
1102 my $config = shift; | |
1103 | |
1104 # get registry | |
1105 my $reg = 'Bio::EnsEMBL::Registry'; | |
1106 | |
1107 unless(defined($config->{offline})) { | |
1108 # load DB options from registry file if given | |
1109 if(defined($config->{registry})) { | |
1110 debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet}); | |
1111 $reg->load_all( | |
1112 $config->{registry}, | |
1113 $config->{verbose}, | |
1114 undef, | |
1115 $config->{no_slice_cache} | |
1116 ); | |
1117 } | |
1118 | |
1119 # otherwise manually connect to DB server | |
1120 else { | |
1121 $reg->load_registry_from_db( | |
1122 -host => $config->{host}, | |
1123 -user => $config->{user}, | |
1124 -pass => $config->{password}, | |
1125 -port => $config->{port}, | |
1126 -db_version => $config->{db_version}, | |
1127 -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef, | |
1128 -verbose => $config->{verbose}, | |
1129 -no_cache => $config->{no_slice_cache}, | |
1130 ); | |
1131 } | |
1132 | |
1133 eval { $reg->set_reconnect_when_lost() }; | |
1134 | |
1135 if(defined($config->{verbose})) { | |
1136 # get a meta container adaptors to check version | |
1137 my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer'); | |
1138 my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer'); | |
1139 | |
1140 if($core_mca && $var_mca) { | |
1141 debug( | |
1142 "Connected to core version ", $core_mca->get_schema_version, " database ", | |
1143 "and variation version ", $var_mca->get_schema_version, " database" | |
1144 ); | |
1145 } | |
1146 } | |
1147 } | |
1148 | |
1149 return $reg; | |
1150 } | |
1151 | |
1152 # get adaptors from DB | |
1153 sub get_adaptors { | |
1154 my $config = shift; | |
1155 | |
1156 die "ERROR: No registry" unless defined $config->{reg}; | |
1157 | |
1158 $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature'); | |
1159 $config->{svfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'structuralvariationfeature'); | |
1160 $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation'); | |
1161 $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix'); | |
1162 $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation'); | |
1163 | |
1164 # get fake ones for species with no var DB | |
1165 if(!defined($config->{vfa})) { | |
1166 $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species}); | |
1167 $config->{svfa} = Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor->new_fake($config->{species}); | |
1168 $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species}); | |
1169 } | |
1170 | |
1171 $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice'); | |
1172 $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene'); | |
1173 $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript'); | |
1174 $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer'); | |
1175 $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem'); | |
1176 | |
1177 # cache schema version | |
1178 $config->{mca}->get_schema_version if defined $config->{mca}; | |
1179 | |
1180 # check we got slice adaptor - can't continue without a core DB | |
1181 die("ERROR: Could not connect to core database\n") unless defined $config->{sa}; | |
1182 } | |
1183 | |
1184 # gets regulatory adaptors | |
1185 sub get_reg_adaptors { | |
1186 my $config = shift; | |
1187 | |
1188 foreach my $type(@REG_FEAT_TYPES) { | |
1189 next if defined($config->{$type.'_adaptor'}); | |
1190 | |
1191 my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type); | |
1192 if(defined($adaptor)) { | |
1193 $config->{$type.'_adaptor'} = $adaptor; | |
1194 } | |
1195 else { | |
1196 delete $config->{regulatory}; | |
1197 last; | |
1198 } | |
1199 } | |
1200 } | |
1201 | |
1202 # gets file handle for input | |
1203 sub get_in_file_handle { | |
1204 my $config = shift; | |
1205 | |
1206 # define the filehandle to read input from | |
1207 my $in_file_handle = new FileHandle; | |
1208 | |
1209 if(defined($config->{input_file})) { | |
1210 | |
1211 # check defined input file exists | |
1212 die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file}; | |
1213 | |
1214 if($config->{input_file} =~ /\.gz$/){ | |
1215 $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); | |
1216 } | |
1217 else { | |
1218 $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); | |
1219 } | |
1220 } | |
1221 | |
1222 # no file specified - try to read data off command line | |
1223 else { | |
1224 $in_file_handle = 'STDIN'; | |
1225 debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet}; | |
1226 } | |
1227 | |
1228 return $in_file_handle; | |
1229 } | |
1230 | |
1231 # gets file handle for output and adds header | |
1232 sub get_out_file_handle { | |
1233 my $config = shift; | |
1234 | |
1235 # define filehandle to write to | |
1236 my $out_file_handle = new FileHandle; | |
1237 | |
1238 # check if file exists | |
1239 if(-e $config->{output_file} && !defined($config->{force_overwrite})) { | |
1240 # die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with -- force_overwrite\n"); | |
1241 } | |
1242 | |
1243 if($config->{output_file} =~ /stdout/i) { | |
1244 $out_file_handle = *STDOUT; | |
1245 } | |
1246 else { | |
1247 $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n"); | |
1248 } | |
1249 | |
1250 # define headers for a VCF file | |
1251 my @vcf_headers = ( | |
1252 '#CHROM', | |
1253 'POS', | |
1254 'ID', | |
1255 'REF', | |
1256 'ALT', | |
1257 'QUAL', | |
1258 'FILTER', | |
1259 'INFO' | |
1260 ); | |
1261 | |
1262 # file conversion, don't want to add normal headers | |
1263 if(defined($config->{convert})) { | |
1264 # header for VCF | |
1265 if($config->{convert} =~ /vcf/i) { | |
1266 print $out_file_handle "##fileformat=VCFv4.0\n"; | |
1267 print $out_file_handle join "\t", @vcf_headers; | |
1268 print $out_file_handle "\n"; | |
1269 } | |
1270 | |
1271 return $out_file_handle; | |
1272 } | |
1273 | |
1274 # GVF output, no header | |
1275 elsif(defined($config->{gvf}) || defined($config->{original})) { | |
1276 print $out_file_handle join "\n", @{$config->{headers}} if defined($config->{headers}) && defined($config->{original}); | |
1277 return $out_file_handle; | |
1278 } | |
1279 | |
1280 elsif(defined($config->{vcf})) { | |
1281 | |
1282 # create an info string for the VCF header | |
1283 my @new_headers; | |
1284 | |
1285 # if the user has defined the fields themselves, we don't need to worry | |
1286 if(defined $config->{fields_redefined}) { | |
1287 @new_headers = @{$config->{fields}}; | |
1288 } | |
1289 else { | |
1290 @new_headers = ( | |
1291 | |
1292 # get default headers, minus variation name and location (already encoded in VCF) | |
1293 grep { | |
1294 $_ ne 'Uploaded_variation' and | |
1295 $_ ne 'Location' and | |
1296 $_ ne 'Extra' | |
1297 } @{$config->{fields}}, | |
1298 | |
1299 # get extra headers | |
1300 map {@{$extra_headers{$_}}} | |
1301 grep {defined $config->{$_}} | |
1302 keys %extra_headers | |
1303 ); | |
1304 | |
1305 # plugin headers | |
1306 foreach my $plugin_header(split /\n/, get_plugin_headers($config)) { | |
1307 $plugin_header =~ /\#\# (.+?)\t\:.+/; | |
1308 push @new_headers, $1; | |
1309 } | |
1310 | |
1311 # redefine the main headers list in config | |
1312 $config->{fields} = \@new_headers; | |
1313 } | |
1314 | |
1315 # add the newly defined headers as a header to the VCF | |
1316 my $string = join '|', @{$config->{fields}}; | |
1317 my @vcf_info_strings = ('##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: '.$string.'">'); | |
1318 | |
1319 # add custom headers | |
1320 foreach my $custom(@{$config->{custom}}) { | |
1321 push @vcf_info_strings, '##INFO=<ID='.$custom->{name}.',Number=.,Type=String,Description="'.$custom->{file}.' ('.$custom->{type}.')">'; | |
1322 } | |
1323 | |
1324 # if this is already a VCF file, we need to add our new headers in the right place | |
1325 if(defined($config->{headers})) { | |
1326 | |
1327 for my $i(0..$#{$config->{headers}}) { | |
1328 if($config->{headers}->[$i] =~ /^\#CHROM\s+POS\s+ID/) { | |
1329 splice(@{$config->{headers}}, $i, 0, @vcf_info_strings); | |
1330 } | |
1331 } | |
1332 | |
1333 print $out_file_handle join "\n", @{$config->{headers}}; | |
1334 print $out_file_handle "\n"; | |
1335 } | |
1336 | |
1337 else { | |
1338 print $out_file_handle "##fileformat=VCFv4.0\n"; | |
1339 print $out_file_handle join "\n", @vcf_info_strings; | |
1340 print $out_file_handle "\n"; | |
1341 print $out_file_handle join "\t", @vcf_headers; | |
1342 print $out_file_handle "\n"; | |
1343 } | |
1344 | |
1345 return $out_file_handle; | |
1346 } | |
1347 | |
1348 # make header | |
1349 my $time = &get_time; | |
1350 my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca}; | |
1351 $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache}); | |
1352 my $version_string = | |
1353 "Using API version ".$config->{reg}->software_version. | |
1354 ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?'); | |
1355 | |
1356 # add key for extra column headers based on config | |
1357 my $extra_column_keys = join "\n", | |
1358 map {'## '.$_.' : '.$extra_descs{$_}} | |
1359 sort map {@{$extra_headers{$_}}} | |
1360 grep {defined $config->{$_}} | |
1361 keys %extra_headers; | |
1362 | |
1363 my $header =<<HEAD; | |
1364 ## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION | |
1365 ## Output produced at $time | |
1366 ## Connected to $db_string | |
1367 ## $version_string | |
1368 ## Extra column keys: | |
1369 $extra_column_keys | |
1370 HEAD | |
1371 | |
1372 $header .= get_plugin_headers($config); | |
1373 | |
1374 # add headers | |
1375 print $out_file_handle $header; | |
1376 | |
1377 # add custom data defs | |
1378 if(defined($config->{custom})) { | |
1379 foreach my $custom(@{$config->{custom}}) { | |
1380 print $out_file_handle '## '.$custom->{name}."\t: ".$custom->{file}.' ('.$custom->{type}.")\n"; | |
1381 } | |
1382 } | |
1383 | |
1384 # add column headers | |
1385 print $out_file_handle '#', (join "\t", @{$config->{fields}}); | |
1386 print $out_file_handle "\n"; | |
1387 | |
1388 return $out_file_handle; | |
1389 } | |
1390 | |
1391 sub get_plugin_headers { | |
1392 | |
1393 my $config = shift; | |
1394 | |
1395 my $header = ""; | |
1396 | |
1397 for my $plugin (@{ $config->{plugins} }) { | |
1398 if (my $hdr = $plugin->get_header_info) { | |
1399 for my $key (keys %$hdr) { | |
1400 my $val = $hdr->{$key}; | |
1401 | |
1402 $header .= "## $key\t: $val\n"; | |
1403 } | |
1404 } | |
1405 } | |
1406 | |
1407 return $header; | |
1408 } | |
1409 | |
1410 # convert a variation feature to a line of output | |
1411 sub convert_vf { | |
1412 my $config = shift; | |
1413 my $vf = shift; | |
1414 | |
1415 my $convert_method = 'convert_to_'.lc($config->{convert}); | |
1416 my $method_ref = \&$convert_method; | |
1417 | |
1418 my $line = &$method_ref($config, $vf); | |
1419 my $handle = $config->{out_file_handle}; | |
1420 | |
1421 if(scalar @$line) { | |
1422 print $handle join "\t", @$line; | |
1423 print $handle "\n"; | |
1424 } | |
1425 } | |
1426 | |
1427 # converts to Ensembl format | |
1428 sub convert_to_ensembl { | |
1429 my $config = shift; | |
1430 my $vf = shift; | |
1431 | |
1432 return [ | |
1433 $vf->{chr} || $vf->seq_region_name, | |
1434 $vf->start, | |
1435 $vf->end, | |
1436 $vf->allele_string, | |
1437 $vf->strand, | |
1438 $vf->variation_name | |
1439 ]; | |
1440 } | |
1441 | |
1442 | |
1443 # converts to pileup format | |
1444 sub convert_to_pileup { | |
1445 my $config = shift; | |
1446 my $vf = shift; | |
1447 | |
1448 # look for imbalance in the allele string | |
1449 my %allele_lengths; | |
1450 my @alleles = split /\//, $vf->allele_string; | |
1451 | |
1452 foreach my $allele(@alleles) { | |
1453 $allele =~ s/\-//g; | |
1454 $allele_lengths{length($allele)} = 1; | |
1455 } | |
1456 | |
1457 # in/del | |
1458 if(scalar keys %allele_lengths > 1) { | |
1459 | |
1460 if($vf->allele_string =~ /\-/) { | |
1461 | |
1462 # insertion? | |
1463 if($alleles[0] eq '-') { | |
1464 shift @alleles; | |
1465 | |
1466 for my $i(0..$#alleles) { | |
1467 $alleles[$i] =~ s/\-//g; | |
1468 $alleles[$i] = '+'.$alleles[$i]; | |
1469 } | |
1470 } | |
1471 | |
1472 else { | |
1473 @alleles = grep {$_ ne '-'} @alleles; | |
1474 | |
1475 for my $i(0..$#alleles) { | |
1476 $alleles[$i] =~ s/\-//g; | |
1477 $alleles[$i] = '-'.$alleles[$i]; | |
1478 } | |
1479 } | |
1480 | |
1481 @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles; | |
1482 | |
1483 return [ | |
1484 $vf->{chr} || $vf->seq_region_name, | |
1485 $vf->start - 1, | |
1486 '*', | |
1487 (join "/", @alleles), | |
1488 ]; | |
1489 } | |
1490 | |
1491 else { | |
1492 warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet}); | |
1493 return []; | |
1494 } | |
1495 | |
1496 } | |
1497 | |
1498 # balanced sub | |
1499 else { | |
1500 return [ | |
1501 $vf->{chr} || $vf->seq_region_name, | |
1502 $vf->start, | |
1503 shift @alleles, | |
1504 (join "/", @alleles), | |
1505 ]; | |
1506 } | |
1507 } | |
1508 | |
1509 # converts to HGVS (hackily returns many lines) | |
1510 sub convert_to_hgvs { | |
1511 my $config = shift; | |
1512 my $vf = shift; | |
1513 | |
1514 # ensure we have a slice | |
1515 $vf->{slice} ||= get_slice($config, $vf->{chr}); | |
1516 | |
1517 my $tvs = $vf->get_all_TranscriptVariations; | |
1518 | |
1519 my @return = values %{$vf->get_all_hgvs_notations()}; | |
1520 | |
1521 if(defined($tvs)) { | |
1522 push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'c')}} @$tvs; | |
1523 push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'p')}} @$tvs; | |
1524 } | |
1525 | |
1526 return [join "\n", @return]; | |
1527 } | |
1528 | |
1529 # prints a line of output from the hash | |
1530 sub print_line { | |
1531 my $config = shift; | |
1532 my $line = shift; | |
1533 return unless defined($line); | |
1534 | |
1535 my $output; | |
1536 | |
1537 # normal | |
1538 if(ref($line) eq 'HASH') { | |
1539 my %extra = %{$line->{Extra}}; | |
1540 | |
1541 $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} }; | |
1542 | |
1543 # if the fields have been redefined we need to search through in case | |
1544 # any of the defined fields are actually part of the Extra hash | |
1545 $output = join "\t", map { | |
1546 (defined $line->{$_} ? $line->{$_} : (defined $extra{$_} ? $extra{$_} : '-')) | |
1547 } @{$config->{fields}}; | |
1548 } | |
1549 | |
1550 # gvf/vcf | |
1551 else { | |
1552 $output = $$line; | |
1553 } | |
1554 | |
1555 my $fh = $config->{out_file_handle}; | |
1556 print $fh "$output\n"; | |
1557 } | |
1558 | |
1559 # outputs usage message | |
1560 sub usage { | |
1561 my $usage =<<END; | |
1562 #----------------------------------# | |
1563 # ENSEMBL VARIANT EFFECT PREDICTOR # | |
1564 #----------------------------------# | |
1565 | |
1566 version $VERSION | |
1567 | |
1568 By Will McLaren (wm2\@ebi.ac.uk) | |
1569 | |
1570 http://www.ensembl.org/info/docs/variation/vep/vep_script.html | |
1571 | |
1572 Usage: | |
1573 perl variant_effect_predictor.pl [arguments] | |
1574 | |
1575 Options | |
1576 ======= | |
1577 | |
1578 --help Display this message and quit | |
1579 --verbose Display verbose output as the script runs [default: off] | |
1580 --quiet Suppress status and warning messages [default: off] | |
1581 --no_progress Suppress progress bars [default: off] | |
1582 | |
1583 --config Load configuration from file. Any command line options | |
1584 specified overwrite those in the file [default: off] | |
1585 | |
1586 --everything Shortcut switch to turn on commonly used options. See web | |
1587 documentation for details [default: off] | |
1588 | |
1589 --fork [num_forks] Use forking to improve script runtime [default: off] | |
1590 | |
1591 -i | --input_file Input file - if not specified, reads from STDIN. Files | |
1592 may be gzip compressed. | |
1593 --format Specify input file format - one of "ensembl", "pileup", | |
1594 "vcf", "hgvs", "id" or "guess" to try and work out format. | |
1595 -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this | |
1596 will force --quiet [default: "variant_effect_output.txt"] | |
1597 --force_overwrite Force overwriting of output file [default: quit if file | |
1598 exists] | |
1599 --original Writes output as it was in input - must be used with --filter | |
1600 since no consequence data is added [default: off] | |
1601 --vcf Write output as VCF [default: off] | |
1602 --gvf Write output as GVF [default: off] | |
1603 --fields [field list] Define a custom output format by specifying a comma-separated | |
1604 list of field names. Field names normally present in the | |
1605 "Extra" field may also be specified, including those added by | |
1606 plugin modules. Can also be used to configure VCF output | |
1607 columns [default: off] | |
1608 | |
1609 --species [species] Species to use [default: "human"] | |
1610 | |
1611 -t | --terms Type of consequence terms to output - one of "SO", "ensembl" | |
1612 [default: SO] | |
1613 | |
1614 --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off] | |
1615 --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off] | |
1616 | |
1617 NB: SIFT and PolyPhen predictions are currently available for human only | |
1618 NB: Condel support has been moved to a VEP plugin module - see documentation | |
1619 | |
1620 --regulatory Look for overlaps with regulatory regions. The script can | |
1621 also call if a variant falls in a high information position | |
1622 within a transcription factor binding site. Output lines have | |
1623 a Feature type of RegulatoryFeature or MotifFeature | |
1624 [default: off] | |
1625 --cell_type [types] Report only regulatory regions that are found in the given cell | |
1626 type(s). Can be a single cell type or a comma-separated list. | |
1627 The functional type in each cell type is reported under | |
1628 CELL_TYPE in the output. To retrieve a list of cell types, use | |
1629 "--cell_type list" [default: off] | |
1630 | |
1631 NB: Regulatory consequences are currently available for human and mouse only | |
1632 | |
1633 --custom [file list] Add custom annotations from tabix-indexed files. See | |
1634 documentation for full details [default: off] | |
1635 --plugin [plugin_name] Use named plugin module [default: off] | |
1636 --hgnc Add HGNC gene identifiers to output [default: off] | |
1637 --hgvs Output HGVS identifiers (coding and protein). Requires database | |
1638 connection [default: off] | |
1639 --ccds Output CCDS transcript identifiers [default: off] | |
1640 --xref_refseq Output aligned RefSeq mRNA identifier for transcript. NB: the | |
1641 RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND | |
1642 FREQUENTLY WILL NOT, match exactly in sequence, exon structure | |
1643 and protein product [default: off] | |
1644 --protein Output Ensembl protein identifer [default: off] | |
1645 --canonical Indicate if the transcript for this consequence is the canonical | |
1646 transcript for this gene [default: off] | |
1647 --domains Include details of any overlapping protein domains [default: off] | |
1648 --numbers Include exon & intron numbers [default: off] | |
1649 | |
1650 --no_intergenic Excludes intergenic consequences from the output [default: off] | |
1651 --coding_only Only return consequences that fall in the coding region of | |
1652 transcripts [default: off] | |
1653 --most_severe Ouptut only the most severe consequence per variation. | |
1654 Transcript-specific columns will be left blank. [default: off] | |
1655 --summary Output only a comma-separated list of all consequences per | |
1656 variation. Transcript-specific columns will be left blank. | |
1657 [default: off] | |
1658 --per_gene Output only the most severe consequence per gene. Where more | |
1659 than one transcript has the same consequence, the transcript | |
1660 chosen is arbitrary. [default: off] | |
1661 | |
1662 | |
1663 --check_ref If specified, checks supplied reference allele against stored | |
1664 entry in Ensembl Core database [default: off] | |
1665 --check_existing If specified, checks for existing co-located variations in the | |
1666 Ensembl Variation database [default: off] | |
1667 --failed [0|1] Include (1) or exclude (0) variants that have been flagged as | |
1668 failed by Ensembl when checking for existing variants. | |
1669 [default: exclude] | |
1670 --check_alleles If specified, the alleles of existing co-located variations | |
1671 are compared to the input; an existing variation will only | |
1672 be reported if no novel allele is in the input (strand is | |
1673 accounted for) [default: off] | |
1674 --check_svs Report overlapping structural variants [default: off] | |
1675 | |
1676 --filter [filters] Filter output by consequence type. Use this to output only | |
1677 variants that have at least one consequence type matching the | |
1678 filter. Multiple filters can be used separated by ",". By | |
1679 combining this with --original it is possible to run the VEP | |
1680 iteratively to progressively filter a set of variants. See | |
1681 documentation for full details [default: off] | |
1682 | |
1683 --check_frequency Turns on frequency filtering. Use this to include or exclude | |
1684 variants based on the frequency of co-located existing | |
1685 variants in the Ensembl Variation database. You must also | |
1686 specify all of the following --freq flags [default: off] | |
1687 --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap, | |
1688 1kg_yri for YRI 1000 genomes. See documentation for more | |
1689 details | |
1690 --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5 | |
1691 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or | |
1692 less than (lt) --freq_freq | |
1693 --freq_filter Specify whether variants that pass the above should be included | |
1694 [exclude|include] or excluded from analysis | |
1695 --gmaf Include global MAF of existing variant from 1000 Genomes | |
1696 Phase 1 in output | |
1697 | |
1698 --individual [id] Consider only alternate alleles present in the genotypes of the | |
1699 specified individual(s). May be a single individual, a comma- | |
1700 separated list or "all" to assess all individuals separately. | |
1701 Each individual and variant combination is given on a separate | |
1702 line of output. Only works with VCF files containing individual | |
1703 genotype data; individual IDs are taken from column headers. | |
1704 --allow_non_variant Prints out non-variant lines when using VCF input | |
1705 --phased Force VCF individual genotypes to be interpreted as phased. | |
1706 For use with plugins that depend on phased state. | |
1707 | |
1708 --chr [list] Select a subset of chromosomes to analyse from your file. Any | |
1709 data not on this chromosome in the input will be skipped. The | |
1710 list can be comma separated, with "-" characters representing | |
1711 a range e.g. 1-5,8,15,X [default: off] | |
1712 --gp If specified, tries to read GRCh37 position from GP field in the | |
1713 INFO column of a VCF file. Only applies when VCF is the input | |
1714 format and human is the species [default: off] | |
1715 | |
1716 --convert Convert the input file to the output format specified. | |
1717 [ensembl|vcf|pileup] Converted output is written to the file specified in | |
1718 --output_file. No consequence calculation is carried out when | |
1719 doing file conversion. [default: off] | |
1720 | |
1721 --refseq Use the otherfeatures database to retrieve transcripts - this | |
1722 database contains RefSeq transcripts (as well as CCDS and | |
1723 Ensembl EST alignments) [default: off] | |
1724 --host Manually define database host [default: "ensembldb.ensembl.org"] | |
1725 -u | --user Database username [default: "anonymous"] | |
1726 --port Database port [default: 5306] | |
1727 --password Database password [default: no password] | |
1728 --genomes Sets DB connection params for Ensembl Genomes [default: off] | |
1729 --registry Registry file to use defines DB connections [default: off] | |
1730 Defining a registry file overrides above connection settings. | |
1731 --db_version=[number] Force script to load DBs from a specific Ensembl version. Not | |
1732 advised due to likely incompatibilities between API and DB | |
1733 | |
1734 --no_whole_genome Run in old-style, non-whole genome mode [default: off] | |
1735 --buffer_size Sets the number of variants sent in each batch [default: 5000] | |
1736 Increasing buffer size can retrieve results more quickly | |
1737 but requires more memory. Only applies to whole genome mode. | |
1738 | |
1739 --cache Enables read-only use of cache [default: off] | |
1740 --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"] | |
1741 --write_cache Enable writing to cache [default: off] | |
1742 --build [all|list] Build a complete cache for the selected species. Build for all | |
1743 chromosomes with --build all, or a list of chromosomes (see | |
1744 --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS | |
1745 VIOLATES OUR FAIR USAGE POLICY [default: off] | |
1746 | |
1747 --compress Specify utility to decompress cache files - may be "gzcat" or | |
1748 "gzip -dc" Only use if default does not work [default: zcat] | |
1749 | |
1750 --skip_db_check ADVANCED! Force the script to use a cache built from a different | |
1751 database than specified with --host. Only use this if you are | |
1752 sure the hosts are compatible (e.g. ensembldb.ensembl.org and | |
1753 useastdb.ensembl.org) [default: off] | |
1754 --cache_region_size ADVANCED! The size in base-pairs of the region covered by one | |
1755 file in the cache. [default: 1MB] | |
1756 END | |
1757 | |
1758 print $usage; | |
1759 } |