Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/.#variant_effect_predictor.pl.1.3 @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 =head1 LICENSE | |
4 | |
5 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
6 Genome Research Limited. All rights reserved. | |
7 | |
8 This software is distributed under a modified Apache license. | |
9 For license details, please see | |
10 | |
11 http://www.ensembl.org/info/about/code_licence.html | |
12 | |
13 =head1 CONTACT | |
14 | |
15 Please email comments or questions to the public Ensembl | |
16 developers list at <dev@ensembl.org>. | |
17 | |
18 Questions may also be sent to the Ensembl help desk at | |
19 <helpdesk@ensembl.org>. | |
20 | |
21 =cut | |
22 | |
23 =head1 NAME | |
24 | |
25 Variant Effect Predictor - a script to predict the consequences of genomic variants | |
26 | |
27 http://www.ensembl.org/info/docs/variation/vep/vep_script.html | |
28 | |
29 Version 2.2 | |
30 | |
31 by Will McLaren (wm2@ebi.ac.uk) | |
32 =cut | |
33 | |
34 use strict; | |
35 use Getopt::Long; | |
36 use FileHandle; | |
37 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); | |
38 use Bio::EnsEMBL::Variation::Utils::VEP qw( | |
39 parse_line | |
40 vf_to_consequences | |
41 validate_vf | |
42 load_dumped_adaptor_cache | |
43 dump_adaptor_cache | |
44 get_all_consequences | |
45 get_slice | |
46 build_full_cache | |
47 read_cache_info | |
48 get_time | |
49 debug | |
50 @OUTPUT_COLS | |
51 @REG_FEAT_TYPES | |
52 ); | |
53 | |
54 # global vars | |
55 my $VERSION = '2.2'; | |
56 | |
57 # set output autoflush for progress bars | |
58 $| = 1; | |
59 | |
60 # configure from command line opts | |
61 my $config = &configure(scalar @ARGV); | |
62 | |
63 # run the main sub routine | |
64 &main($config); | |
65 | |
66 # this is the main sub-routine - it needs the configured $config hash | |
67 sub main { | |
68 my $config = shift; | |
69 | |
70 debug("Starting...") unless defined $config->{quiet}; | |
71 | |
72 my $tr_cache = {}; | |
73 my $rf_cache = {}; | |
74 | |
75 # create a hash to hold slices so we don't get the same one twice | |
76 my %slice_cache = (); | |
77 | |
78 my @vfs; | |
79 my ($vf_count, $total_vf_count); | |
80 my $in_file_handle = $config->{in_file_handle}; | |
81 | |
82 # initialize line number in config | |
83 $config->{line_number} = 0; | |
84 | |
85 # read the file | |
86 while(<$in_file_handle>) { | |
87 chomp; | |
88 | |
89 $config->{line_number}++; | |
90 | |
91 # header line? | |
92 next if /^\#/; | |
93 | |
94 # some lines (pileup) may actually parse out into more than one variant | |
95 foreach my $vf(@{&parse_line($config, $_)}) { | |
96 | |
97 # validate the VF | |
98 next unless validate_vf($config, $vf); | |
99 | |
100 # now get the slice | |
101 if(!defined($vf->{slice})) { | |
102 my $slice; | |
103 | |
104 # don't get slices if we're using cache | |
105 # we can steal them from transcript objects later | |
106 if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) { | |
107 | |
108 # check if we have fetched this slice already | |
109 if(defined $slice_cache{$vf->{chr}}) { | |
110 $slice = $slice_cache{$vf->{chr}}; | |
111 } | |
112 | |
113 # if not create a new one | |
114 else { | |
115 | |
116 $slice = &get_slice($config, $vf->{chr}); | |
117 | |
118 # if failed, warn and skip this line | |
119 if(!defined($slice)) { | |
120 warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet}; | |
121 next; | |
122 } | |
123 | |
124 # store the hash | |
125 $slice_cache{$vf->{chr}} = $slice; | |
126 } | |
127 } | |
128 | |
129 $vf->{slice} = $slice; | |
130 } | |
131 | |
132 # make a name if one doesn't exist | |
133 $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.$vf->{allele_string}; | |
134 | |
135 # jump out to convert here | |
136 if(defined($config->{convert})) { | |
137 &convert_vf($config, $vf); | |
138 next; | |
139 } | |
140 | |
141 if(defined $config->{whole_genome}) { | |
142 push @vfs, $vf; | |
143 $vf_count++; | |
144 $total_vf_count++; | |
145 | |
146 if($vf_count == $config->{buffer_size}) { | |
147 debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); | |
148 | |
149 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)}; | |
150 | |
151 debug("Processed $total_vf_count total variants") unless defined($config->{quiet}); | |
152 | |
153 @vfs = (); | |
154 $vf_count = 0; | |
155 } | |
156 } | |
157 else { | |
158 print_line($config, $_) foreach @{vf_to_consequences($config, $vf)}; | |
159 $vf_count++; | |
160 $total_vf_count++; | |
161 debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose}); | |
162 } | |
163 } | |
164 } | |
165 | |
166 # if in whole-genome mode, finish off the rest of the buffer | |
167 if(defined $config->{whole_genome} && scalar @vfs) { | |
168 debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); | |
169 | |
170 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)}; | |
171 | |
172 debug("Processed $total_vf_count total variants") unless defined($config->{quiet}); | |
173 } | |
174 | |
175 debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet}); | |
176 | |
177 debug("Finished!") unless defined $config->{quiet}; | |
178 } | |
179 | |
180 # sets up configuration hash that is used throughout the script | |
181 sub configure { | |
182 my $args = shift; | |
183 | |
184 my $config = {}; | |
185 | |
186 GetOptions( | |
187 $config, | |
188 'help', # displays help message | |
189 | |
190 # input options, | |
191 'config=s', # config file name | |
192 'input_file=s', # input file name | |
193 'format=s', # input file format | |
194 | |
195 # DB options | |
196 'species=s', # species e.g. human, homo_sapiens | |
197 'registry=s', # registry file | |
198 'host=s', # database host | |
199 'port=s', # database port | |
200 'user=s', # database user name | |
201 'password=s', # database password | |
202 'db_version=i', # Ensembl database version to use e.g. 62 | |
203 'genomes', # automatically sets DB params for e!Genomes | |
204 'refseq', # use otherfeatures RefSeq DB instead of Ensembl | |
205 #'no_disconnect', # disables disconnect_when_inactive | |
206 | |
207 # runtime options | |
208 'most_severe', # only return most severe consequence | |
209 'summary', # only return one line per variation with all consquence types | |
210 'per_gene', # only return most severe per gene | |
211 'buffer_size=i', # number of variations to read in before analysis | |
212 'chunk_size=s', # size in bases of "chunks" used in internal hash structure | |
213 'failed=i', # include failed variations when finding existing | |
214 'no_whole_genome', # disables now default whole-genome mode | |
215 'whole_genome', # proxy for whole genome mode - now just warns user | |
216 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG) | |
217 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT | |
218 'check_ref', # check supplied reference allele against DB | |
219 'check_existing', # find existing co-located variations | |
220 'check_alleles', # only attribute co-located if alleles are the same | |
221 'check_frequency', # enable frequency checking | |
222 'freq_filter=s', # exclude or include | |
223 'freq_freq=f', # frequency to filter on | |
224 'freq_gt_lt=s', # gt or lt (greater than or less than) | |
225 'freq_pop=s', # population to filter on | |
226 | |
227 # verbosity options | |
228 'verbose', # print out a bit more info while running | |
229 'quiet', # print nothing to STDOUT (unless using -o stdout) | |
230 'no_progress', # don't display progress bars | |
231 | |
232 # output options | |
233 'output_file=s', # output file name | |
234 'force_overwrite', # force overwrite of output file if already exists | |
235 'terms=s', # consequence terms to use e.g. NCBI, SO | |
236 'coding_only', # only return results for consequences in coding regions | |
237 'canonical', # indicates if transcript is canonical | |
238 'protein', # add e! protein ID to extra column | |
239 'hgnc', # add HGNC gene ID to extra column | |
240 'hgvs', # add HGVS names to extra column | |
241 'sift=s', # SIFT predictions | |
242 'polyphen=s', # PolyPhen predictions | |
243 'condel=s', # Condel predictions | |
244 'gene', # force gene column to be populated (disabled by default, enabled when using cache) | |
245 'regulatory', # enable regulatory stuff | |
246 'convert=s', # convert input to another format (doesn't run VEP) | |
247 'no_intergenic', # don't print out INTERGENIC consequences | |
248 'gvf', # produce gvf output | |
249 | |
250 # cache stuff | |
251 'cache', # use cache | |
252 'write_cache', # enables writing to the cache | |
253 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs | |
254 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript | |
255 'strip', # strips adaptors etc from objects before caching them | |
256 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more | |
257 'dir=s', # dir where cache is found (defaults to $HOME/.vep/) | |
258 'cache_region_size=i', # size of region in bases for each cache file | |
259 'no_slice_cache', # tell API not to cache features on slice | |
260 'standalone', # standalone mode uses minimal set of modules installed in same dir, no DB connection | |
261 'skip_db_check', # don't compare DB parameters with cached | |
262 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc" | |
263 | |
264 # debug | |
265 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL | |
266 'count_queries', # counts SQL queries executed | |
267 'admin', # allows me to build off public hosts | |
268 'debug', # print out debug info | |
269 ); | |
270 | |
271 # print usage message if requested or no args supplied | |
272 if(defined($config->{help}) || !$args) { | |
273 &usage; | |
274 exit(0); | |
275 } | |
276 | |
277 # config file? | |
278 if(defined $config->{config}) { | |
279 | |
280 open CONFIG, $config->{config} or die "ERROR: Could not open config file \"".$config->{config}."\"\n"; | |
281 | |
282 while(<CONFIG>) { | |
283 next if /^\#/; | |
284 my ($key, $value) = split /\s+|\=/; | |
285 $key =~ s/^\-//g; | |
286 $config->{$key} = $value unless defined $config->{$key}; | |
287 } | |
288 | |
289 close CONFIG; | |
290 } | |
291 | |
292 # can't be both quiet and verbose | |
293 die "ERROR: Can't be both quiet and verbose!" if defined($config->{quiet}) && defined($config->{verbose}); | |
294 | |
295 # check file format | |
296 if(defined $config->{format}) { | |
297 die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /pileup|vcf|guess|hgvs|ensembl|id/i; | |
298 } | |
299 | |
300 # check convert format | |
301 if(defined $config->{convert}) { | |
302 die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup/i; | |
303 } | |
304 | |
305 # connection settings for Ensembl Genomes | |
306 if($config->{genomes}) { | |
307 $config->{host} ||= 'mysql.ebi.ac.uk'; | |
308 $config->{port} ||= 4157; | |
309 } | |
310 | |
311 # connection settings for main Ensembl | |
312 else { | |
313 $config->{species} ||= "homo_sapiens"; | |
314 $config->{host} ||= 'ensembldb.ensembl.org'; | |
315 $config->{port} ||= 5306; | |
316 } | |
317 | |
318 # refseq or core? | |
319 if(defined($config->{refseq})) { | |
320 die "ERROR: SIFT, PolyPhen and Condel predictions not available fore RefSeq transcripts" if defined $config->{sift} || defined $config->{polyphen} || defined $config->{condel}; | |
321 | |
322 $config->{core_type} = 'otherfeatures'; | |
323 } | |
324 else { | |
325 $config->{core_type} = 'core'; | |
326 } | |
327 | |
328 # output term | |
329 if(defined $config->{terms}) { | |
330 die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i; | |
331 if($config->{terms} =~ /ensembl|display/i) { | |
332 $config->{terms} = 'display'; | |
333 } | |
334 else { | |
335 $config->{terms} = uc($config->{terms}); | |
336 } | |
337 } | |
338 | |
339 # check nsSNP tools | |
340 foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) { | |
341 die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/; | |
342 | |
343 die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i; | |
344 | |
345 die "ERROR: $tool not available in standalone mode\n" if defined($config->{standalone}); | |
346 | |
347 # use V2 of the Condel algorithm, possibly gives fewer false positives | |
348 if($tool eq 'Condel' && $config->{lc($tool)} =~ /1$/) { | |
349 $Bio::EnsEMBL::Variation::Utils::Condel::USE_V2 = 0; | |
350 } | |
351 } | |
352 | |
353 # force quiet if outputting to STDOUT | |
354 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { | |
355 delete $config->{verbose} if defined($config->{verbose}); | |
356 $config->{quiet} = 1; | |
357 } | |
358 | |
359 # summarise options if verbose | |
360 if(defined $config->{verbose}) { | |
361 my $header =<<INTRO; | |
362 #----------------------------------# | |
363 # ENSEMBL VARIANT EFFECT PREDICTOR # | |
364 #----------------------------------# | |
365 | |
366 version $VERSION | |
367 | |
368 By Will McLaren (wm2\@ebi.ac.uk) | |
369 | |
370 Configuration options: | |
371 | |
372 INTRO | |
373 print $header; | |
374 | |
375 my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1]; | |
376 | |
377 foreach my $key(sort keys %$config) { | |
378 print $key.(' ' x (($max_length - length($key)) + 4)).$config->{$key}."\n"; | |
379 } | |
380 | |
381 print "\n".("-" x 20)."\n\n"; | |
382 } | |
383 | |
384 # set defaults | |
385 $config->{user} ||= 'anonymous'; | |
386 $config->{buffer_size} ||= 5000; | |
387 $config->{chunk_size} ||= '50kb'; | |
388 $config->{output_file} ||= "variant_effect_output.txt"; | |
389 $config->{tmpdir} ||= '/tmp'; | |
390 $config->{format} ||= 'guess'; | |
391 $config->{terms} ||= 'display'; | |
392 $config->{gene} ||= 1 unless defined($config->{whole_genome}); | |
393 $config->{cache_region_size} ||= 1000000; | |
394 $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep'); | |
395 $config->{compress} ||= 'zcat'; | |
396 | |
397 # frequency filtering | |
398 if(defined($config->{check_frequency})) { | |
399 foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) { | |
400 die "ERROR: To use --check_frequency you must also specify flag --$flag" unless defined $config->{$flag}; | |
401 } | |
402 | |
403 # need to set check_existing | |
404 $config->{check_existing} = 1; | |
405 } | |
406 | |
407 $config->{check_existing} = 1 if defined $config->{check_alleles}; | |
408 | |
409 # warn users still using whole_genome flag | |
410 if(defined($config->{whole_genome})) { | |
411 debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet}); | |
412 } | |
413 | |
414 $config->{whole_genome} = 1 unless defined $config->{no_whole_genome}; | |
415 $config->{include_failed} = 1 unless defined $config->{include_failed}; | |
416 $config->{chunk_size} =~ s/mb?/000000/i; | |
417 $config->{chunk_size} =~ s/kb?/000/i; | |
418 $config->{cache_region_size} =~ s/mb?/000000/i; | |
419 $config->{cache_region_size} =~ s/kb?/000/i; | |
420 | |
421 # cluck and display executed SQL? | |
422 $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck}); | |
423 | |
424 # standalone needs cache, can't use HGVS | |
425 if(defined($config->{standalone})) { | |
426 $config->{cache} = 1; | |
427 | |
428 die("ERROR: Cannot generate HGVS coordinates in standalone mode") if defined($config->{hgvs}); | |
429 die("ERROR: Cannot use HGVS as input in standalone mode") if $config->{format} eq 'hgvs'; | |
430 die("ERROR: Cannot use variant identifiers as input in standalone mode") if $config->{format} eq 'id'; | |
431 die("ERROR: Cannot do frequency filtering in standalone mode") if defined($config->{check_frequency}); | |
432 } | |
433 | |
434 # write_cache needs cache | |
435 $config->{cache} = 1 if defined $config->{write_cache}; | |
436 | |
437 # no_slice_cache, prefetch and whole_genome have to be on to use cache | |
438 if(defined($config->{cache})) { | |
439 $config->{prefetch} = 1; | |
440 $config->{no_slice_cache} = 1; | |
441 $config->{whole_genome} = 1; | |
442 $config->{strip} = 1; | |
443 } | |
444 | |
445 $config->{build} = $config->{rebuild} if defined($config->{rebuild}); | |
446 | |
447 # force options for full build | |
448 if(defined($config->{build})) { | |
449 $config->{prefetch} = 1; | |
450 $config->{gene} = 1; | |
451 $config->{hgnc} = 1; | |
452 $config->{no_slice_cache} = 1; | |
453 $config->{cache} = 1; | |
454 $config->{strip} = 1; | |
455 $config->{write_cache} = 1; | |
456 } | |
457 | |
458 # connect to databases | |
459 $config->{reg} = &connect_to_dbs($config); | |
460 | |
461 # complete dir with species name and db_version | |
462 $config->{dir} .= '/'.( | |
463 join '/', ( | |
464 defined($config->{standalone}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}), | |
465 $config->{db_version} || $config->{reg}->software_version | |
466 ) | |
467 ); | |
468 | |
469 if(defined($config->{cache})) { | |
470 # read cache info | |
471 if(read_cache_info($config)) { | |
472 debug("Read existing cache info") unless defined $config->{quiet}; | |
473 } | |
474 } | |
475 | |
476 # include regulatory modules if requested | |
477 if(defined($config->{regulatory})) { | |
478 # do the use statements here so that users don't have to have the | |
479 # funcgen API install to use the rest of the script | |
480 eval q{ | |
481 use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor; | |
482 use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor; | |
483 use Bio::EnsEMBL::Funcgen::MotifFeature; | |
484 use Bio::EnsEMBL::Funcgen::RegulatoryFeature; | |
485 use Bio::EnsEMBL::Funcgen::BindingMatrix; | |
486 }; | |
487 | |
488 if($@) { | |
489 die("ERROR: Ensembl Funcgen API must be installed to use --regulatory\n"); | |
490 } | |
491 } | |
492 | |
493 # warn user cache directory doesn't exist | |
494 if(!-e $config->{dir}) { | |
495 | |
496 # if using write_cache | |
497 if(defined($config->{write_cache})) { | |
498 debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet}); | |
499 } | |
500 | |
501 # want to read cache, not found | |
502 elsif(defined($config->{cache})) { | |
503 die("ERROR: Cache directory ", $config->{dir}, " not found"); | |
504 } | |
505 } | |
506 | |
507 # suppress warnings that the FeatureAdpators spit if using no_slice_cache | |
508 Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache}); | |
509 | |
510 # get adaptors | |
511 if(defined($config->{cache}) && !defined($config->{write_cache})) { | |
512 | |
513 # try and load adaptors from cache | |
514 if(!&load_dumped_adaptor_cache($config)) { | |
515 &get_adaptors($config); | |
516 &dump_adaptor_cache($config) if defined($config->{write_cache}); | |
517 } | |
518 | |
519 # check cached adaptors match DB params | |
520 else { | |
521 my $dbc = $config->{sa}->{dbc}; | |
522 | |
523 my $ok = 1; | |
524 | |
525 if($dbc->{_host} ne $config->{host}) { | |
526 | |
527 # ens-livemirror, useastdb and ensembldb should all have identical DBs | |
528 unless( | |
529 ( | |
530 $dbc->{_host} eq 'ens-livemirror' | |
531 || $dbc->{_host} eq 'ensembldb.ensembl.org' | |
532 || $dbc->{_host} eq 'useastdb.ensembl.org' | |
533 ) && ( | |
534 $config->{host} eq 'ens-livemirror' | |
535 || $config->{host} eq 'ensembldb.ensembl.org' | |
536 || $config->{host} eq 'useastdb.ensembl.org' | |
537 ) | |
538 ) { | |
539 $ok = 0; | |
540 } | |
541 | |
542 # but we still need to reconnect | |
543 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet}); | |
544 | |
545 &get_adaptors($config); | |
546 } | |
547 | |
548 if(!$ok) { | |
549 if(defined($config->{skip_db_check})) { | |
550 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet}); | |
551 } | |
552 else { | |
553 die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set"; | |
554 } | |
555 } | |
556 } | |
557 } | |
558 else { | |
559 &get_adaptors($config); | |
560 &dump_adaptor_cache($config) if defined($config->{write_cache}) | |
561 } | |
562 | |
563 # reg adaptors (only fetches if not retrieved from cache already) | |
564 &get_reg_adaptors($config) if defined($config->{regulatory}); | |
565 | |
566 # get terminal width for progress bars | |
567 unless(defined($config->{quiet})) { | |
568 my $width; | |
569 | |
570 # module may not be installed | |
571 eval q{ | |
572 use Term::ReadKey; | |
573 }; | |
574 | |
575 if(!$@) { | |
576 my ($w, $h); | |
577 | |
578 # module may be installed, but e.g. | |
579 eval { | |
580 ($w, $h) = GetTerminalSize(); | |
581 }; | |
582 | |
583 $width = $w if defined $w; | |
584 } | |
585 | |
586 $width ||= 60; | |
587 $width -= 12; | |
588 $config->{terminal_width} = $width; | |
589 } | |
590 | |
591 # jump out to build cache if requested | |
592 if(defined($config->{build})) { | |
593 | |
594 if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) { | |
595 die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n"); | |
596 } | |
597 | |
598 # build the cache | |
599 debug("Building cache for ".$config->{species}) unless defined($config->{quiet}); | |
600 build_full_cache($config); | |
601 | |
602 # exit script | |
603 debug("Finished building cache") unless defined($config->{quiet}); | |
604 exit(0); | |
605 } | |
606 | |
607 # warn user DB will be used for SIFT/PolyPhen/Condel/HGVS/frequency | |
608 if(defined($config->{cache})) { | |
609 | |
610 # these two def depend on DB | |
611 foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency)) { | |
612 debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet}); | |
613 } | |
614 | |
615 # as does using HGVS or IDs as input | |
616 debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet}); | |
617 | |
618 # the rest may be in the cache | |
619 foreach my $param(grep {defined $config->{$_}} qw(sift polyphen condel regulatory)) { | |
620 next if defined($config->{'cache_'.$param}); | |
621 debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet}); | |
622 } | |
623 } | |
624 | |
625 # get list of chrs if supplied | |
626 if(defined($config->{chr})) { | |
627 my %chrs; | |
628 | |
629 foreach my $val(split /\,/, $config->{chr}) { | |
630 my @nnn = split /\-/, $val; | |
631 | |
632 foreach my $chr($nnn[0]..$nnn[-1]) { | |
633 $chrs{$chr} = 1; | |
634 } | |
635 } | |
636 | |
637 $config->{chr} = \%chrs; | |
638 } | |
639 | |
640 # get input file handle | |
641 $config->{in_file_handle} = &get_in_file_handle($config); | |
642 | |
643 # configure output file | |
644 $config->{out_file_handle} = &get_out_file_handle($config); | |
645 | |
646 return $config; | |
647 } | |
648 | |
649 # connects to DBs; in standalone mode this just loads registry module | |
650 sub connect_to_dbs { | |
651 my $config = shift; | |
652 | |
653 # get registry | |
654 my $reg = 'Bio::EnsEMBL::Registry'; | |
655 | |
656 unless(defined($config->{standalone})) { | |
657 # load DB options from registry file if given | |
658 if(defined($config->{registry})) { | |
659 debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet}); | |
660 $reg->load_all( | |
661 $config->{registry}, | |
662 $config->{verbose}, | |
663 undef, | |
664 $config->{no_slice_cache} | |
665 ); | |
666 } | |
667 | |
668 # otherwise manually connect to DB server | |
669 else { | |
670 $reg->load_registry_from_db( | |
671 -host => $config->{host}, | |
672 -user => $config->{user}, | |
673 -pass => $config->{password}, | |
674 -port => $config->{port}, | |
675 -db_version => $config->{db_version}, | |
676 -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef, | |
677 -verbose => $config->{verbose}, | |
678 -no_cache => $config->{no_slice_cache}, | |
679 ); | |
680 } | |
681 | |
682 eval { $reg->set_reconnect_when_lost() }; | |
683 | |
684 if(defined($config->{verbose})) { | |
685 # get a meta container adaptors to check version | |
686 my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer'); | |
687 my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer'); | |
688 | |
689 if($core_mca && $var_mca) { | |
690 debug( | |
691 "Connected to core version ", $core_mca->get_schema_version, " database ", | |
692 "and variation version ", $var_mca->get_schema_version, " database" | |
693 ); | |
694 } | |
695 } | |
696 } | |
697 | |
698 return $reg; | |
699 } | |
700 | |
701 # get adaptors from DB | |
702 sub get_adaptors { | |
703 my $config = shift; | |
704 | |
705 die "ERROR: No registry" unless defined $config->{reg}; | |
706 | |
707 $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature'); | |
708 $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation'); | |
709 $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix'); | |
710 $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation'); | |
711 | |
712 # get fake ones for species with no var DB | |
713 if(!defined($config->{vfa})) { | |
714 $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species}); | |
715 $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species}); | |
716 } | |
717 else { | |
718 $config->{vfa}->db->include_failed_variations($config->{include_failed}) if defined($config->{vfa}->db) && $config->{vfa}->db->can('include_failed_variations'); | |
719 } | |
720 | |
721 $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice'); | |
722 $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene'); | |
723 $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript'); | |
724 $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer'); | |
725 $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem'); | |
726 | |
727 # cache schema version | |
728 $config->{mca}->get_schema_version if defined $config->{mca}; | |
729 | |
730 # check we got slice adaptor - can't continue without a core DB | |
731 die("ERROR: Could not connect to core database\n") unless defined $config->{sa}; | |
732 } | |
733 | |
734 # gets regulatory adaptors | |
735 sub get_reg_adaptors { | |
736 my $config = shift; | |
737 | |
738 foreach my $type(@REG_FEAT_TYPES) { | |
739 next if defined($config->{$type.'_adaptor'}); | |
740 | |
741 my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type); | |
742 if(defined($adaptor)) { | |
743 $config->{$type.'_adaptor'} = $adaptor; | |
744 } | |
745 else { | |
746 delete $config->{regulatory}; | |
747 last; | |
748 } | |
749 } | |
750 } | |
751 | |
752 # gets file handle for input | |
753 sub get_in_file_handle { | |
754 my $config = shift; | |
755 | |
756 # define the filehandle to read input from | |
757 my $in_file_handle = new FileHandle; | |
758 | |
759 if(defined($config->{input_file})) { | |
760 | |
761 # check defined input file exists | |
762 die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file}; | |
763 | |
764 if($config->{input_file} =~ /\.gz$/){ | |
765 $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); | |
766 } | |
767 else { | |
768 $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); | |
769 } | |
770 } | |
771 | |
772 # no file specified - try to read data off command line | |
773 else { | |
774 $in_file_handle = 'STDIN'; | |
775 debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet}; | |
776 } | |
777 | |
778 return $in_file_handle; | |
779 } | |
780 | |
781 # gets file handle for output and adds header | |
782 sub get_out_file_handle { | |
783 my $config = shift; | |
784 | |
785 # define filehandle to write to | |
786 my $out_file_handle = new FileHandle; | |
787 | |
788 # check if file exists | |
789 if(-e $config->{output_file} && !defined($config->{force_overwrite})) { | |
790 die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with --force_overwrite\n"); | |
791 } | |
792 | |
793 if($config->{output_file} =~ /stdout/i) { | |
794 $out_file_handle = *STDOUT; | |
795 } | |
796 else { | |
797 $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n"); | |
798 } | |
799 | |
800 # file conversion, don't want to add normal headers | |
801 if(defined($config->{convert})) { | |
802 # header for VCF | |
803 if($config->{convert} =~ /vcf/i) { | |
804 print $out_file_handle join "\t", ( | |
805 '#CHROM', | |
806 'POS', | |
807 'ID', | |
808 'REF', | |
809 'ALT', | |
810 'QUAL', | |
811 'FILTER', | |
812 'INFO' | |
813 ); | |
814 print $out_file_handle "\n"; | |
815 } | |
816 | |
817 return $out_file_handle; | |
818 } | |
819 | |
820 # GVF output, no header | |
821 elsif(defined($config->{gvf})) { | |
822 return $out_file_handle; | |
823 } | |
824 | |
825 # make header | |
826 my $time = &get_time; | |
827 my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca}; | |
828 $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache}); | |
829 my $version_string = | |
830 "Using API version ".$config->{reg}->software_version. | |
831 ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?'); | |
832 | |
833 my $header =<<HEAD; | |
834 ## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION | |
835 ## Output produced at $time | |
836 ## Connected to $db_string | |
837 ## $version_string | |
838 ## Extra column keys: | |
839 ## CANONICAL : Indicates if transcript is canonical for this gene | |
840 ## HGNC : HGNC gene identifier | |
841 ## ENSP : Ensembl protein identifer | |
842 ## HGVSc : HGVS coding sequence name | |
843 ## HGVSp : HGVS protein sequence name | |
844 ## SIFT : SIFT prediction | |
845 ## PolyPhen : PolyPhen prediction | |
846 ## Condel : Condel SIFT/PolyPhen consensus prediction | |
847 ## MATRIX : The source and identifier of a transcription factor binding profile aligned at this position | |
848 ## HIGH_INF_POS : A flag indicating if the variant falls in a high information position of a transcription factor binding profile | |
849 HEAD | |
850 | |
851 # add headers | |
852 print $out_file_handle $header; | |
853 | |
854 # add column headers | |
855 print $out_file_handle '#', (join "\t", @OUTPUT_COLS); | |
856 print $out_file_handle "\n"; | |
857 | |
858 return $out_file_handle; | |
859 } | |
860 | |
861 # convert a variation feature to a line of output | |
862 sub convert_vf { | |
863 my $config = shift; | |
864 my $vf = shift; | |
865 | |
866 my $convert_method = 'convert_to_'.lc($config->{convert}); | |
867 my $method_ref = \&$convert_method; | |
868 | |
869 my $line = &$method_ref($config, $vf); | |
870 my $handle = $config->{out_file_handle}; | |
871 | |
872 if(scalar @$line) { | |
873 print $handle join "\t", @$line; | |
874 print $handle "\n"; | |
875 } | |
876 } | |
877 | |
878 # converts to Ensembl format | |
879 sub convert_to_ensembl { | |
880 my $config = shift; | |
881 my $vf = shift; | |
882 | |
883 return [ | |
884 $vf->{chr} || $vf->seq_region_name, | |
885 $vf->start, | |
886 $vf->end, | |
887 $vf->allele_string, | |
888 $vf->strand, | |
889 $vf->variation_name | |
890 ]; | |
891 } | |
892 | |
893 # converts to VCF format | |
894 sub convert_to_vcf { | |
895 my $config = shift; | |
896 my $vf = shift; | |
897 | |
898 # look for imbalance in the allele string | |
899 my %allele_lengths; | |
900 my @alleles = split /\//, $vf->allele_string; | |
901 | |
902 foreach my $allele(@alleles) { | |
903 $allele =~ s/\-//g; | |
904 $allele_lengths{length($allele)} = 1; | |
905 } | |
906 | |
907 # in/del/unbalanced | |
908 if(scalar keys %allele_lengths > 1) { | |
909 | |
910 # we need the ref base before the variation | |
911 # default to N in case we can't get it | |
912 my $prev_base = 'N'; | |
913 | |
914 unless(defined($config->{cache})) { | |
915 my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1); | |
916 $prev_base = $slice->seq if defined($slice); | |
917 } | |
918 | |
919 for my $i(0..$#alleles) { | |
920 $alleles[$i] =~ s/\-//g; | |
921 $alleles[$i] = $prev_base.$alleles[$i]; | |
922 } | |
923 | |
924 return [ | |
925 $vf->{chr} || $vf->seq_region_name, | |
926 $vf->start - 1, | |
927 $vf->variation_name, | |
928 shift @alleles, | |
929 (join ",", @alleles), | |
930 '.', '.', '.' | |
931 ]; | |
932 | |
933 } | |
934 | |
935 # balanced sub | |
936 else { | |
937 return [ | |
938 $vf->{chr} || $vf->seq_region_name, | |
939 $vf->start, | |
940 $vf->variation_name, | |
941 shift @alleles, | |
942 (join ",", @alleles), | |
943 '.', '.', '.' | |
944 ]; | |
945 } | |
946 } | |
947 | |
948 | |
949 # converts to pileup format | |
950 sub convert_to_pileup { | |
951 my $config = shift; | |
952 my $vf = shift; | |
953 | |
954 # look for imbalance in the allele string | |
955 my %allele_lengths; | |
956 my @alleles = split /\//, $vf->allele_string; | |
957 | |
958 foreach my $allele(@alleles) { | |
959 $allele =~ s/\-//g; | |
960 $allele_lengths{length($allele)} = 1; | |
961 } | |
962 | |
963 # in/del | |
964 if(scalar keys %allele_lengths > 1) { | |
965 | |
966 if($vf->allele_string =~ /\-/) { | |
967 | |
968 # insertion? | |
969 if($alleles[0] eq '-') { | |
970 shift @alleles; | |
971 | |
972 for my $i(0..$#alleles) { | |
973 $alleles[$i] =~ s/\-//g; | |
974 $alleles[$i] = '+'.$alleles[$i]; | |
975 } | |
976 } | |
977 | |
978 else { | |
979 @alleles = grep {$_ ne '-'} @alleles; | |
980 | |
981 for my $i(0..$#alleles) { | |
982 $alleles[$i] =~ s/\-//g; | |
983 $alleles[$i] = '-'.$alleles[$i]; | |
984 } | |
985 } | |
986 | |
987 @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles; | |
988 | |
989 return [ | |
990 $vf->{chr} || $vf->seq_region_name, | |
991 $vf->start - 1, | |
992 '*', | |
993 (join "/", @alleles), | |
994 ]; | |
995 } | |
996 | |
997 else { | |
998 warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet}); | |
999 return []; | |
1000 } | |
1001 | |
1002 } | |
1003 | |
1004 # balanced sub | |
1005 else { | |
1006 return [ | |
1007 $vf->{chr} || $vf->seq_region_name, | |
1008 $vf->start, | |
1009 shift @alleles, | |
1010 (join "/", @alleles), | |
1011 ]; | |
1012 } | |
1013 } | |
1014 | |
1015 # prints a line of output from the hash | |
1016 sub print_line { | |
1017 my $config = shift; | |
1018 my $line = shift; | |
1019 return unless defined($line); | |
1020 | |
1021 my $output; | |
1022 | |
1023 # normal | |
1024 if(ref($line) eq 'HASH') { | |
1025 $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} }; | |
1026 $output = join "\t", map { $line->{$_} || '-' } @OUTPUT_COLS; | |
1027 } | |
1028 | |
1029 # gvf | |
1030 else { | |
1031 $output = $line; | |
1032 } | |
1033 | |
1034 my $fh = $config->{out_file_handle}; | |
1035 print $fh "$output\n"; | |
1036 } | |
1037 | |
1038 # outputs usage message | |
1039 sub usage { | |
1040 my $usage =<<END; | |
1041 #----------------------------------# | |
1042 # ENSEMBL VARIANT EFFECT PREDICTOR # | |
1043 #----------------------------------# | |
1044 | |
1045 version $VERSION | |
1046 | |
1047 By Will McLaren (wm2\@ebi.ac.uk) | |
1048 | |
1049 http://www.ensembl.org/info/docs/variation/vep/vep_script.html | |
1050 | |
1051 Usage: | |
1052 perl variant_effect_predictor.pl [arguments] | |
1053 | |
1054 Options | |
1055 ======= | |
1056 | |
1057 --help Display this message and quit | |
1058 --verbose Display verbose output as the script runs [default: off] | |
1059 --quiet Suppress status and warning messages [default: off] | |
1060 --no_progress Suppress progress bars [default: off] | |
1061 | |
1062 --config Load configuration from file. Any command line options | |
1063 specified overwrite those in the file [default: off] | |
1064 | |
1065 -i | --input_file Input file - if not specified, reads from STDIN. Files | |
1066 may be gzip compressed. | |
1067 --format Specify input file format - one of "ensembl", "pileup", | |
1068 "vcf", "hgvs", "id" or "guess" to try and work out format. | |
1069 -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this | |
1070 will force --quiet [default: "variant_effect_output.txt"] | |
1071 --force_overwrite Force overwriting of output file [default: quit if file | |
1072 exists] | |
1073 | |
1074 --species [species] Species to use [default: "human"] | |
1075 | |
1076 -t | --terms Type of consequence terms to output - one of "ensembl", "SO", | |
1077 "NCBI" [default: ensembl] | |
1078 | |
1079 --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off] | |
1080 --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off] | |
1081 --condel=[p|s|b] Add Condel SIFT/PolyPhen consensus [p]rediction, [s]core or | |
1082 [b]oth. Add 1 (e.g. b1) to option to use old Condel algorithm | |
1083 [default: off] | |
1084 | |
1085 NB: SIFT, PolyPhen and Condel predictions are currently available for human only | |
1086 | |
1087 --regulatory Look for overlaps with regulatory regions. The script can | |
1088 also call if a variant falls in a high information position | |
1089 within a transcription factor binding site. Output lines have | |
1090 a Feature type of RegulatoryFeature or MotifFeature | |
1091 [default: off] | |
1092 | |
1093 NB: Regulatory consequences are currently available for human and mouse only | |
1094 | |
1095 --hgnc If specified, HGNC gene identifiers are output alongside the | |
1096 Ensembl Gene identifier [default: off] | |
1097 --hgvs Output HGVS identifiers (coding and protein). Requires database | |
1098 connection [default: off] | |
1099 --protein Output Ensembl protein identifer [default: off] | |
1100 --gene Force output of Ensembl gene identifer - disabled by default | |
1101 unless using --cache or --no_whole_genome [default: off] | |
1102 --canonical Indicate if the transcript for this consequence is the canonical | |
1103 transcript for this gene [default: off] | |
1104 | |
1105 --coding_only Only return consequences that fall in the coding region of | |
1106 transcripts [default: off] | |
1107 --most_severe Ouptut only the most severe consequence per variation. | |
1108 Transcript-specific columns will be left blank. [default: off] | |
1109 --summary Output only a comma-separated list of all consequences per | |
1110 variation. Transcript-specific columns will be left blank. | |
1111 [default: off] | |
1112 --per_gene Output only the most severe consequence per gene. Where more | |
1113 than one transcript has the same consequence, the transcript | |
1114 chosen is arbitrary. [default: off] | |
1115 | |
1116 | |
1117 --check_ref If specified, checks supplied reference allele against stored | |
1118 entry in Ensembl Core database [default: off] | |
1119 --check_existing If specified, checks for existing co-located variations in the | |
1120 Ensembl Variation database [default: off] | |
1121 --check_alleles If specified, the alleles of existing co-located variations | |
1122 are compared to the input; an existing variation will only | |
1123 be reported if no novel allele is in the input (strand is | |
1124 accounted for) [default: off] | |
1125 | |
1126 --no_intergenic Excludes intergenic consequences from the output [default: off] | |
1127 | |
1128 --check_frequency Turns on frequency filtering. Use this to include or exclude | |
1129 variants based on the frequency of co-located existing | |
1130 variants in the Ensembl Variation database. You must also | |
1131 specify all of the following --freq flags [default: off] | |
1132 --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap, | |
1133 1kg_yri for YRI 1000 genomes. See documentation for more | |
1134 details | |
1135 --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5 | |
1136 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or | |
1137 less than (lt) --freq_freq | |
1138 --freq_filter Specify whether variants that pass the above should be included | |
1139 [exclude|include] or excluded from analysis | |
1140 | |
1141 --chr [list] Select a subset of chromosomes to analyse from your file. Any | |
1142 data not on this chromosome in the input will be skipped. The | |
1143 list can be comma separated, with "-" characters representing | |
1144 a range e.g. 1-5,8,15,X [default: off] | |
1145 --gp If specified, tries to read GRCh37 position from GP field in the | |
1146 INFO column of a VCF file. Only applies when VCF is the input | |
1147 format and human is the species [default: off] | |
1148 | |
1149 --convert Convert the input file to the output format specified. | |
1150 [ensembl|vcf|pileup] Converted output is written to the file specified in | |
1151 --output_file. No consequence calculation is carried out when | |
1152 doing file conversion. [default: off] | |
1153 | |
1154 --refseq Use the otherfeatures database to retrieve transcripts - this | |
1155 database contains RefSeq transcripts (as well as CCDS and | |
1156 Ensembl EST alignments) [default: off] | |
1157 --host Manually define database host [default: "ensembldb.ensembl.org"] | |
1158 -u | --user Database username [default: "anonymous"] | |
1159 --port Database port [default: 5306] | |
1160 --password Database password [default: no password] | |
1161 --genomes Sets DB connection params for Ensembl Genomes [default: off] | |
1162 --registry Registry file to use defines DB connections [default: off] | |
1163 Defining a registry file overrides above connection settings. | |
1164 --db_version=[number] Force script to load DBs from a specific Ensembl version. Not | |
1165 advised due to likely incompatibilities between API and DB | |
1166 | |
1167 --no_whole_genome Run in old-style, non-whole genome mode [default: off] | |
1168 --buffer_size Sets the number of variants sent in each batch [default: 5000] | |
1169 Increasing buffer size can retrieve results more quickly | |
1170 but requires more memory. Only applies to whole genome mode. | |
1171 | |
1172 --cache Enables read-only use of cache [default: off] | |
1173 --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"] | |
1174 --write_cache Enable writing to cache [default: off] | |
1175 --build [all|list] Build a complete cache for the selected species. Build for all | |
1176 chromosomes with --build all, or a list of chromosomes (see | |
1177 --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS | |
1178 VIOLATES OUR FAIR USAGE POLICY [default: off] | |
1179 | |
1180 --compress Specify utility to decompress cache files - may be "gzcat" or | |
1181 "gzip -dc" Only use if default does not work [default: zcat] | |
1182 | |
1183 --skip_db_check ADVANCED! Force the script to use a cache built from a different | |
1184 database than specified with --host. Only use this if you are | |
1185 sure the hosts are compatible (e.g. ensembldb.ensembl.org and | |
1186 useastdb.ensembl.org) [default: off] | |
1187 --cache_region_size ADVANCED! The size in base-pairs of the region covered by one | |
1188 file in the cache. [default: 1MB] | |
1189 END | |
1190 | |
1191 print $usage; | |
1192 } |