comparison variant_effect_predictor/.#variant_effect_predictor.pl.1.3 @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 #!/usr/bin/perl
2
3 =head1 LICENSE
4
5 Copyright (c) 1999-2011 The European Bioinformatics Institute and
6 Genome Research Limited. All rights reserved.
7
8 This software is distributed under a modified Apache license.
9 For license details, please see
10
11 http://www.ensembl.org/info/about/code_licence.html
12
13 =head1 CONTACT
14
15 Please email comments or questions to the public Ensembl
16 developers list at <dev@ensembl.org>.
17
18 Questions may also be sent to the Ensembl help desk at
19 <helpdesk@ensembl.org>.
20
21 =cut
22
23 =head1 NAME
24
25 Variant Effect Predictor - a script to predict the consequences of genomic variants
26
27 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
28
29 Version 2.2
30
31 by Will McLaren (wm2@ebi.ac.uk)
32 =cut
33
34 use strict;
35 use Getopt::Long;
36 use FileHandle;
37 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
38 use Bio::EnsEMBL::Variation::Utils::VEP qw(
39 parse_line
40 vf_to_consequences
41 validate_vf
42 load_dumped_adaptor_cache
43 dump_adaptor_cache
44 get_all_consequences
45 get_slice
46 build_full_cache
47 read_cache_info
48 get_time
49 debug
50 @OUTPUT_COLS
51 @REG_FEAT_TYPES
52 );
53
54 # global vars
55 my $VERSION = '2.2';
56
57 # set output autoflush for progress bars
58 $| = 1;
59
60 # configure from command line opts
61 my $config = &configure(scalar @ARGV);
62
63 # run the main sub routine
64 &main($config);
65
66 # this is the main sub-routine - it needs the configured $config hash
67 sub main {
68 my $config = shift;
69
70 debug("Starting...") unless defined $config->{quiet};
71
72 my $tr_cache = {};
73 my $rf_cache = {};
74
75 # create a hash to hold slices so we don't get the same one twice
76 my %slice_cache = ();
77
78 my @vfs;
79 my ($vf_count, $total_vf_count);
80 my $in_file_handle = $config->{in_file_handle};
81
82 # initialize line number in config
83 $config->{line_number} = 0;
84
85 # read the file
86 while(<$in_file_handle>) {
87 chomp;
88
89 $config->{line_number}++;
90
91 # header line?
92 next if /^\#/;
93
94 # some lines (pileup) may actually parse out into more than one variant
95 foreach my $vf(@{&parse_line($config, $_)}) {
96
97 # validate the VF
98 next unless validate_vf($config, $vf);
99
100 # now get the slice
101 if(!defined($vf->{slice})) {
102 my $slice;
103
104 # don't get slices if we're using cache
105 # we can steal them from transcript objects later
106 if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) {
107
108 # check if we have fetched this slice already
109 if(defined $slice_cache{$vf->{chr}}) {
110 $slice = $slice_cache{$vf->{chr}};
111 }
112
113 # if not create a new one
114 else {
115
116 $slice = &get_slice($config, $vf->{chr});
117
118 # if failed, warn and skip this line
119 if(!defined($slice)) {
120 warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet};
121 next;
122 }
123
124 # store the hash
125 $slice_cache{$vf->{chr}} = $slice;
126 }
127 }
128
129 $vf->{slice} = $slice;
130 }
131
132 # make a name if one doesn't exist
133 $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.$vf->{allele_string};
134
135 # jump out to convert here
136 if(defined($config->{convert})) {
137 &convert_vf($config, $vf);
138 next;
139 }
140
141 if(defined $config->{whole_genome}) {
142 push @vfs, $vf;
143 $vf_count++;
144 $total_vf_count++;
145
146 if($vf_count == $config->{buffer_size}) {
147 debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
148
149 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)};
150
151 debug("Processed $total_vf_count total variants") unless defined($config->{quiet});
152
153 @vfs = ();
154 $vf_count = 0;
155 }
156 }
157 else {
158 print_line($config, $_) foreach @{vf_to_consequences($config, $vf)};
159 $vf_count++;
160 $total_vf_count++;
161 debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose});
162 }
163 }
164 }
165
166 # if in whole-genome mode, finish off the rest of the buffer
167 if(defined $config->{whole_genome} && scalar @vfs) {
168 debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
169
170 print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)};
171
172 debug("Processed $total_vf_count total variants") unless defined($config->{quiet});
173 }
174
175 debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet});
176
177 debug("Finished!") unless defined $config->{quiet};
178 }
179
180 # sets up configuration hash that is used throughout the script
181 sub configure {
182 my $args = shift;
183
184 my $config = {};
185
186 GetOptions(
187 $config,
188 'help', # displays help message
189
190 # input options,
191 'config=s', # config file name
192 'input_file=s', # input file name
193 'format=s', # input file format
194
195 # DB options
196 'species=s', # species e.g. human, homo_sapiens
197 'registry=s', # registry file
198 'host=s', # database host
199 'port=s', # database port
200 'user=s', # database user name
201 'password=s', # database password
202 'db_version=i', # Ensembl database version to use e.g. 62
203 'genomes', # automatically sets DB params for e!Genomes
204 'refseq', # use otherfeatures RefSeq DB instead of Ensembl
205 #'no_disconnect', # disables disconnect_when_inactive
206
207 # runtime options
208 'most_severe', # only return most severe consequence
209 'summary', # only return one line per variation with all consquence types
210 'per_gene', # only return most severe per gene
211 'buffer_size=i', # number of variations to read in before analysis
212 'chunk_size=s', # size in bases of "chunks" used in internal hash structure
213 'failed=i', # include failed variations when finding existing
214 'no_whole_genome', # disables now default whole-genome mode
215 'whole_genome', # proxy for whole genome mode - now just warns user
216 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG)
217 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT
218 'check_ref', # check supplied reference allele against DB
219 'check_existing', # find existing co-located variations
220 'check_alleles', # only attribute co-located if alleles are the same
221 'check_frequency', # enable frequency checking
222 'freq_filter=s', # exclude or include
223 'freq_freq=f', # frequency to filter on
224 'freq_gt_lt=s', # gt or lt (greater than or less than)
225 'freq_pop=s', # population to filter on
226
227 # verbosity options
228 'verbose', # print out a bit more info while running
229 'quiet', # print nothing to STDOUT (unless using -o stdout)
230 'no_progress', # don't display progress bars
231
232 # output options
233 'output_file=s', # output file name
234 'force_overwrite', # force overwrite of output file if already exists
235 'terms=s', # consequence terms to use e.g. NCBI, SO
236 'coding_only', # only return results for consequences in coding regions
237 'canonical', # indicates if transcript is canonical
238 'protein', # add e! protein ID to extra column
239 'hgnc', # add HGNC gene ID to extra column
240 'hgvs', # add HGVS names to extra column
241 'sift=s', # SIFT predictions
242 'polyphen=s', # PolyPhen predictions
243 'condel=s', # Condel predictions
244 'gene', # force gene column to be populated (disabled by default, enabled when using cache)
245 'regulatory', # enable regulatory stuff
246 'convert=s', # convert input to another format (doesn't run VEP)
247 'no_intergenic', # don't print out INTERGENIC consequences
248 'gvf', # produce gvf output
249
250 # cache stuff
251 'cache', # use cache
252 'write_cache', # enables writing to the cache
253 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs
254 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript
255 'strip', # strips adaptors etc from objects before caching them
256 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more
257 'dir=s', # dir where cache is found (defaults to $HOME/.vep/)
258 'cache_region_size=i', # size of region in bases for each cache file
259 'no_slice_cache', # tell API not to cache features on slice
260 'standalone', # standalone mode uses minimal set of modules installed in same dir, no DB connection
261 'skip_db_check', # don't compare DB parameters with cached
262 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc"
263
264 # debug
265 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL
266 'count_queries', # counts SQL queries executed
267 'admin', # allows me to build off public hosts
268 'debug', # print out debug info
269 );
270
271 # print usage message if requested or no args supplied
272 if(defined($config->{help}) || !$args) {
273 &usage;
274 exit(0);
275 }
276
277 # config file?
278 if(defined $config->{config}) {
279
280 open CONFIG, $config->{config} or die "ERROR: Could not open config file \"".$config->{config}."\"\n";
281
282 while(<CONFIG>) {
283 next if /^\#/;
284 my ($key, $value) = split /\s+|\=/;
285 $key =~ s/^\-//g;
286 $config->{$key} = $value unless defined $config->{$key};
287 }
288
289 close CONFIG;
290 }
291
292 # can't be both quiet and verbose
293 die "ERROR: Can't be both quiet and verbose!" if defined($config->{quiet}) && defined($config->{verbose});
294
295 # check file format
296 if(defined $config->{format}) {
297 die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /pileup|vcf|guess|hgvs|ensembl|id/i;
298 }
299
300 # check convert format
301 if(defined $config->{convert}) {
302 die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup/i;
303 }
304
305 # connection settings for Ensembl Genomes
306 if($config->{genomes}) {
307 $config->{host} ||= 'mysql.ebi.ac.uk';
308 $config->{port} ||= 4157;
309 }
310
311 # connection settings for main Ensembl
312 else {
313 $config->{species} ||= "homo_sapiens";
314 $config->{host} ||= 'ensembldb.ensembl.org';
315 $config->{port} ||= 5306;
316 }
317
318 # refseq or core?
319 if(defined($config->{refseq})) {
320 die "ERROR: SIFT, PolyPhen and Condel predictions not available fore RefSeq transcripts" if defined $config->{sift} || defined $config->{polyphen} || defined $config->{condel};
321
322 $config->{core_type} = 'otherfeatures';
323 }
324 else {
325 $config->{core_type} = 'core';
326 }
327
328 # output term
329 if(defined $config->{terms}) {
330 die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i;
331 if($config->{terms} =~ /ensembl|display/i) {
332 $config->{terms} = 'display';
333 }
334 else {
335 $config->{terms} = uc($config->{terms});
336 }
337 }
338
339 # check nsSNP tools
340 foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) {
341 die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/;
342
343 die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i;
344
345 die "ERROR: $tool not available in standalone mode\n" if defined($config->{standalone});
346
347 # use V2 of the Condel algorithm, possibly gives fewer false positives
348 if($tool eq 'Condel' && $config->{lc($tool)} =~ /1$/) {
349 $Bio::EnsEMBL::Variation::Utils::Condel::USE_V2 = 0;
350 }
351 }
352
353 # force quiet if outputting to STDOUT
354 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
355 delete $config->{verbose} if defined($config->{verbose});
356 $config->{quiet} = 1;
357 }
358
359 # summarise options if verbose
360 if(defined $config->{verbose}) {
361 my $header =<<INTRO;
362 #----------------------------------#
363 # ENSEMBL VARIANT EFFECT PREDICTOR #
364 #----------------------------------#
365
366 version $VERSION
367
368 By Will McLaren (wm2\@ebi.ac.uk)
369
370 Configuration options:
371
372 INTRO
373 print $header;
374
375 my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1];
376
377 foreach my $key(sort keys %$config) {
378 print $key.(' ' x (($max_length - length($key)) + 4)).$config->{$key}."\n";
379 }
380
381 print "\n".("-" x 20)."\n\n";
382 }
383
384 # set defaults
385 $config->{user} ||= 'anonymous';
386 $config->{buffer_size} ||= 5000;
387 $config->{chunk_size} ||= '50kb';
388 $config->{output_file} ||= "variant_effect_output.txt";
389 $config->{tmpdir} ||= '/tmp';
390 $config->{format} ||= 'guess';
391 $config->{terms} ||= 'display';
392 $config->{gene} ||= 1 unless defined($config->{whole_genome});
393 $config->{cache_region_size} ||= 1000000;
394 $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep');
395 $config->{compress} ||= 'zcat';
396
397 # frequency filtering
398 if(defined($config->{check_frequency})) {
399 foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) {
400 die "ERROR: To use --check_frequency you must also specify flag --$flag" unless defined $config->{$flag};
401 }
402
403 # need to set check_existing
404 $config->{check_existing} = 1;
405 }
406
407 $config->{check_existing} = 1 if defined $config->{check_alleles};
408
409 # warn users still using whole_genome flag
410 if(defined($config->{whole_genome})) {
411 debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet});
412 }
413
414 $config->{whole_genome} = 1 unless defined $config->{no_whole_genome};
415 $config->{include_failed} = 1 unless defined $config->{include_failed};
416 $config->{chunk_size} =~ s/mb?/000000/i;
417 $config->{chunk_size} =~ s/kb?/000/i;
418 $config->{cache_region_size} =~ s/mb?/000000/i;
419 $config->{cache_region_size} =~ s/kb?/000/i;
420
421 # cluck and display executed SQL?
422 $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck});
423
424 # standalone needs cache, can't use HGVS
425 if(defined($config->{standalone})) {
426 $config->{cache} = 1;
427
428 die("ERROR: Cannot generate HGVS coordinates in standalone mode") if defined($config->{hgvs});
429 die("ERROR: Cannot use HGVS as input in standalone mode") if $config->{format} eq 'hgvs';
430 die("ERROR: Cannot use variant identifiers as input in standalone mode") if $config->{format} eq 'id';
431 die("ERROR: Cannot do frequency filtering in standalone mode") if defined($config->{check_frequency});
432 }
433
434 # write_cache needs cache
435 $config->{cache} = 1 if defined $config->{write_cache};
436
437 # no_slice_cache, prefetch and whole_genome have to be on to use cache
438 if(defined($config->{cache})) {
439 $config->{prefetch} = 1;
440 $config->{no_slice_cache} = 1;
441 $config->{whole_genome} = 1;
442 $config->{strip} = 1;
443 }
444
445 $config->{build} = $config->{rebuild} if defined($config->{rebuild});
446
447 # force options for full build
448 if(defined($config->{build})) {
449 $config->{prefetch} = 1;
450 $config->{gene} = 1;
451 $config->{hgnc} = 1;
452 $config->{no_slice_cache} = 1;
453 $config->{cache} = 1;
454 $config->{strip} = 1;
455 $config->{write_cache} = 1;
456 }
457
458 # connect to databases
459 $config->{reg} = &connect_to_dbs($config);
460
461 # complete dir with species name and db_version
462 $config->{dir} .= '/'.(
463 join '/', (
464 defined($config->{standalone}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}),
465 $config->{db_version} || $config->{reg}->software_version
466 )
467 );
468
469 if(defined($config->{cache})) {
470 # read cache info
471 if(read_cache_info($config)) {
472 debug("Read existing cache info") unless defined $config->{quiet};
473 }
474 }
475
476 # include regulatory modules if requested
477 if(defined($config->{regulatory})) {
478 # do the use statements here so that users don't have to have the
479 # funcgen API install to use the rest of the script
480 eval q{
481 use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor;
482 use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor;
483 use Bio::EnsEMBL::Funcgen::MotifFeature;
484 use Bio::EnsEMBL::Funcgen::RegulatoryFeature;
485 use Bio::EnsEMBL::Funcgen::BindingMatrix;
486 };
487
488 if($@) {
489 die("ERROR: Ensembl Funcgen API must be installed to use --regulatory\n");
490 }
491 }
492
493 # warn user cache directory doesn't exist
494 if(!-e $config->{dir}) {
495
496 # if using write_cache
497 if(defined($config->{write_cache})) {
498 debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet});
499 }
500
501 # want to read cache, not found
502 elsif(defined($config->{cache})) {
503 die("ERROR: Cache directory ", $config->{dir}, " not found");
504 }
505 }
506
507 # suppress warnings that the FeatureAdpators spit if using no_slice_cache
508 Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache});
509
510 # get adaptors
511 if(defined($config->{cache}) && !defined($config->{write_cache})) {
512
513 # try and load adaptors from cache
514 if(!&load_dumped_adaptor_cache($config)) {
515 &get_adaptors($config);
516 &dump_adaptor_cache($config) if defined($config->{write_cache});
517 }
518
519 # check cached adaptors match DB params
520 else {
521 my $dbc = $config->{sa}->{dbc};
522
523 my $ok = 1;
524
525 if($dbc->{_host} ne $config->{host}) {
526
527 # ens-livemirror, useastdb and ensembldb should all have identical DBs
528 unless(
529 (
530 $dbc->{_host} eq 'ens-livemirror'
531 || $dbc->{_host} eq 'ensembldb.ensembl.org'
532 || $dbc->{_host} eq 'useastdb.ensembl.org'
533 ) && (
534 $config->{host} eq 'ens-livemirror'
535 || $config->{host} eq 'ensembldb.ensembl.org'
536 || $config->{host} eq 'useastdb.ensembl.org'
537 )
538 ) {
539 $ok = 0;
540 }
541
542 # but we still need to reconnect
543 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet});
544
545 &get_adaptors($config);
546 }
547
548 if(!$ok) {
549 if(defined($config->{skip_db_check})) {
550 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet});
551 }
552 else {
553 die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set";
554 }
555 }
556 }
557 }
558 else {
559 &get_adaptors($config);
560 &dump_adaptor_cache($config) if defined($config->{write_cache})
561 }
562
563 # reg adaptors (only fetches if not retrieved from cache already)
564 &get_reg_adaptors($config) if defined($config->{regulatory});
565
566 # get terminal width for progress bars
567 unless(defined($config->{quiet})) {
568 my $width;
569
570 # module may not be installed
571 eval q{
572 use Term::ReadKey;
573 };
574
575 if(!$@) {
576 my ($w, $h);
577
578 # module may be installed, but e.g.
579 eval {
580 ($w, $h) = GetTerminalSize();
581 };
582
583 $width = $w if defined $w;
584 }
585
586 $width ||= 60;
587 $width -= 12;
588 $config->{terminal_width} = $width;
589 }
590
591 # jump out to build cache if requested
592 if(defined($config->{build})) {
593
594 if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) {
595 die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n");
596 }
597
598 # build the cache
599 debug("Building cache for ".$config->{species}) unless defined($config->{quiet});
600 build_full_cache($config);
601
602 # exit script
603 debug("Finished building cache") unless defined($config->{quiet});
604 exit(0);
605 }
606
607 # warn user DB will be used for SIFT/PolyPhen/Condel/HGVS/frequency
608 if(defined($config->{cache})) {
609
610 # these two def depend on DB
611 foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency)) {
612 debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet});
613 }
614
615 # as does using HGVS or IDs as input
616 debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet});
617
618 # the rest may be in the cache
619 foreach my $param(grep {defined $config->{$_}} qw(sift polyphen condel regulatory)) {
620 next if defined($config->{'cache_'.$param});
621 debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet});
622 }
623 }
624
625 # get list of chrs if supplied
626 if(defined($config->{chr})) {
627 my %chrs;
628
629 foreach my $val(split /\,/, $config->{chr}) {
630 my @nnn = split /\-/, $val;
631
632 foreach my $chr($nnn[0]..$nnn[-1]) {
633 $chrs{$chr} = 1;
634 }
635 }
636
637 $config->{chr} = \%chrs;
638 }
639
640 # get input file handle
641 $config->{in_file_handle} = &get_in_file_handle($config);
642
643 # configure output file
644 $config->{out_file_handle} = &get_out_file_handle($config);
645
646 return $config;
647 }
648
649 # connects to DBs; in standalone mode this just loads registry module
650 sub connect_to_dbs {
651 my $config = shift;
652
653 # get registry
654 my $reg = 'Bio::EnsEMBL::Registry';
655
656 unless(defined($config->{standalone})) {
657 # load DB options from registry file if given
658 if(defined($config->{registry})) {
659 debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet});
660 $reg->load_all(
661 $config->{registry},
662 $config->{verbose},
663 undef,
664 $config->{no_slice_cache}
665 );
666 }
667
668 # otherwise manually connect to DB server
669 else {
670 $reg->load_registry_from_db(
671 -host => $config->{host},
672 -user => $config->{user},
673 -pass => $config->{password},
674 -port => $config->{port},
675 -db_version => $config->{db_version},
676 -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef,
677 -verbose => $config->{verbose},
678 -no_cache => $config->{no_slice_cache},
679 );
680 }
681
682 eval { $reg->set_reconnect_when_lost() };
683
684 if(defined($config->{verbose})) {
685 # get a meta container adaptors to check version
686 my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer');
687 my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer');
688
689 if($core_mca && $var_mca) {
690 debug(
691 "Connected to core version ", $core_mca->get_schema_version, " database ",
692 "and variation version ", $var_mca->get_schema_version, " database"
693 );
694 }
695 }
696 }
697
698 return $reg;
699 }
700
701 # get adaptors from DB
702 sub get_adaptors {
703 my $config = shift;
704
705 die "ERROR: No registry" unless defined $config->{reg};
706
707 $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature');
708 $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation');
709 $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix');
710 $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation');
711
712 # get fake ones for species with no var DB
713 if(!defined($config->{vfa})) {
714 $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species});
715 $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species});
716 }
717 else {
718 $config->{vfa}->db->include_failed_variations($config->{include_failed}) if defined($config->{vfa}->db) && $config->{vfa}->db->can('include_failed_variations');
719 }
720
721 $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice');
722 $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene');
723 $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript');
724 $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer');
725 $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem');
726
727 # cache schema version
728 $config->{mca}->get_schema_version if defined $config->{mca};
729
730 # check we got slice adaptor - can't continue without a core DB
731 die("ERROR: Could not connect to core database\n") unless defined $config->{sa};
732 }
733
734 # gets regulatory adaptors
735 sub get_reg_adaptors {
736 my $config = shift;
737
738 foreach my $type(@REG_FEAT_TYPES) {
739 next if defined($config->{$type.'_adaptor'});
740
741 my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type);
742 if(defined($adaptor)) {
743 $config->{$type.'_adaptor'} = $adaptor;
744 }
745 else {
746 delete $config->{regulatory};
747 last;
748 }
749 }
750 }
751
752 # gets file handle for input
753 sub get_in_file_handle {
754 my $config = shift;
755
756 # define the filehandle to read input from
757 my $in_file_handle = new FileHandle;
758
759 if(defined($config->{input_file})) {
760
761 # check defined input file exists
762 die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file};
763
764 if($config->{input_file} =~ /\.gz$/){
765 $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
766 }
767 else {
768 $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
769 }
770 }
771
772 # no file specified - try to read data off command line
773 else {
774 $in_file_handle = 'STDIN';
775 debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet};
776 }
777
778 return $in_file_handle;
779 }
780
781 # gets file handle for output and adds header
782 sub get_out_file_handle {
783 my $config = shift;
784
785 # define filehandle to write to
786 my $out_file_handle = new FileHandle;
787
788 # check if file exists
789 if(-e $config->{output_file} && !defined($config->{force_overwrite})) {
790 die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with --force_overwrite\n");
791 }
792
793 if($config->{output_file} =~ /stdout/i) {
794 $out_file_handle = *STDOUT;
795 }
796 else {
797 $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n");
798 }
799
800 # file conversion, don't want to add normal headers
801 if(defined($config->{convert})) {
802 # header for VCF
803 if($config->{convert} =~ /vcf/i) {
804 print $out_file_handle join "\t", (
805 '#CHROM',
806 'POS',
807 'ID',
808 'REF',
809 'ALT',
810 'QUAL',
811 'FILTER',
812 'INFO'
813 );
814 print $out_file_handle "\n";
815 }
816
817 return $out_file_handle;
818 }
819
820 # GVF output, no header
821 elsif(defined($config->{gvf})) {
822 return $out_file_handle;
823 }
824
825 # make header
826 my $time = &get_time;
827 my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca};
828 $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache});
829 my $version_string =
830 "Using API version ".$config->{reg}->software_version.
831 ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?');
832
833 my $header =<<HEAD;
834 ## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION
835 ## Output produced at $time
836 ## Connected to $db_string
837 ## $version_string
838 ## Extra column keys:
839 ## CANONICAL : Indicates if transcript is canonical for this gene
840 ## HGNC : HGNC gene identifier
841 ## ENSP : Ensembl protein identifer
842 ## HGVSc : HGVS coding sequence name
843 ## HGVSp : HGVS protein sequence name
844 ## SIFT : SIFT prediction
845 ## PolyPhen : PolyPhen prediction
846 ## Condel : Condel SIFT/PolyPhen consensus prediction
847 ## MATRIX : The source and identifier of a transcription factor binding profile aligned at this position
848 ## HIGH_INF_POS : A flag indicating if the variant falls in a high information position of a transcription factor binding profile
849 HEAD
850
851 # add headers
852 print $out_file_handle $header;
853
854 # add column headers
855 print $out_file_handle '#', (join "\t", @OUTPUT_COLS);
856 print $out_file_handle "\n";
857
858 return $out_file_handle;
859 }
860
861 # convert a variation feature to a line of output
862 sub convert_vf {
863 my $config = shift;
864 my $vf = shift;
865
866 my $convert_method = 'convert_to_'.lc($config->{convert});
867 my $method_ref = \&$convert_method;
868
869 my $line = &$method_ref($config, $vf);
870 my $handle = $config->{out_file_handle};
871
872 if(scalar @$line) {
873 print $handle join "\t", @$line;
874 print $handle "\n";
875 }
876 }
877
878 # converts to Ensembl format
879 sub convert_to_ensembl {
880 my $config = shift;
881 my $vf = shift;
882
883 return [
884 $vf->{chr} || $vf->seq_region_name,
885 $vf->start,
886 $vf->end,
887 $vf->allele_string,
888 $vf->strand,
889 $vf->variation_name
890 ];
891 }
892
893 # converts to VCF format
894 sub convert_to_vcf {
895 my $config = shift;
896 my $vf = shift;
897
898 # look for imbalance in the allele string
899 my %allele_lengths;
900 my @alleles = split /\//, $vf->allele_string;
901
902 foreach my $allele(@alleles) {
903 $allele =~ s/\-//g;
904 $allele_lengths{length($allele)} = 1;
905 }
906
907 # in/del/unbalanced
908 if(scalar keys %allele_lengths > 1) {
909
910 # we need the ref base before the variation
911 # default to N in case we can't get it
912 my $prev_base = 'N';
913
914 unless(defined($config->{cache})) {
915 my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1);
916 $prev_base = $slice->seq if defined($slice);
917 }
918
919 for my $i(0..$#alleles) {
920 $alleles[$i] =~ s/\-//g;
921 $alleles[$i] = $prev_base.$alleles[$i];
922 }
923
924 return [
925 $vf->{chr} || $vf->seq_region_name,
926 $vf->start - 1,
927 $vf->variation_name,
928 shift @alleles,
929 (join ",", @alleles),
930 '.', '.', '.'
931 ];
932
933 }
934
935 # balanced sub
936 else {
937 return [
938 $vf->{chr} || $vf->seq_region_name,
939 $vf->start,
940 $vf->variation_name,
941 shift @alleles,
942 (join ",", @alleles),
943 '.', '.', '.'
944 ];
945 }
946 }
947
948
949 # converts to pileup format
950 sub convert_to_pileup {
951 my $config = shift;
952 my $vf = shift;
953
954 # look for imbalance in the allele string
955 my %allele_lengths;
956 my @alleles = split /\//, $vf->allele_string;
957
958 foreach my $allele(@alleles) {
959 $allele =~ s/\-//g;
960 $allele_lengths{length($allele)} = 1;
961 }
962
963 # in/del
964 if(scalar keys %allele_lengths > 1) {
965
966 if($vf->allele_string =~ /\-/) {
967
968 # insertion?
969 if($alleles[0] eq '-') {
970 shift @alleles;
971
972 for my $i(0..$#alleles) {
973 $alleles[$i] =~ s/\-//g;
974 $alleles[$i] = '+'.$alleles[$i];
975 }
976 }
977
978 else {
979 @alleles = grep {$_ ne '-'} @alleles;
980
981 for my $i(0..$#alleles) {
982 $alleles[$i] =~ s/\-//g;
983 $alleles[$i] = '-'.$alleles[$i];
984 }
985 }
986
987 @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles;
988
989 return [
990 $vf->{chr} || $vf->seq_region_name,
991 $vf->start - 1,
992 '*',
993 (join "/", @alleles),
994 ];
995 }
996
997 else {
998 warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet});
999 return [];
1000 }
1001
1002 }
1003
1004 # balanced sub
1005 else {
1006 return [
1007 $vf->{chr} || $vf->seq_region_name,
1008 $vf->start,
1009 shift @alleles,
1010 (join "/", @alleles),
1011 ];
1012 }
1013 }
1014
1015 # prints a line of output from the hash
1016 sub print_line {
1017 my $config = shift;
1018 my $line = shift;
1019 return unless defined($line);
1020
1021 my $output;
1022
1023 # normal
1024 if(ref($line) eq 'HASH') {
1025 $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} };
1026 $output = join "\t", map { $line->{$_} || '-' } @OUTPUT_COLS;
1027 }
1028
1029 # gvf
1030 else {
1031 $output = $line;
1032 }
1033
1034 my $fh = $config->{out_file_handle};
1035 print $fh "$output\n";
1036 }
1037
1038 # outputs usage message
1039 sub usage {
1040 my $usage =<<END;
1041 #----------------------------------#
1042 # ENSEMBL VARIANT EFFECT PREDICTOR #
1043 #----------------------------------#
1044
1045 version $VERSION
1046
1047 By Will McLaren (wm2\@ebi.ac.uk)
1048
1049 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
1050
1051 Usage:
1052 perl variant_effect_predictor.pl [arguments]
1053
1054 Options
1055 =======
1056
1057 --help Display this message and quit
1058 --verbose Display verbose output as the script runs [default: off]
1059 --quiet Suppress status and warning messages [default: off]
1060 --no_progress Suppress progress bars [default: off]
1061
1062 --config Load configuration from file. Any command line options
1063 specified overwrite those in the file [default: off]
1064
1065 -i | --input_file Input file - if not specified, reads from STDIN. Files
1066 may be gzip compressed.
1067 --format Specify input file format - one of "ensembl", "pileup",
1068 "vcf", "hgvs", "id" or "guess" to try and work out format.
1069 -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this
1070 will force --quiet [default: "variant_effect_output.txt"]
1071 --force_overwrite Force overwriting of output file [default: quit if file
1072 exists]
1073
1074 --species [species] Species to use [default: "human"]
1075
1076 -t | --terms Type of consequence terms to output - one of "ensembl", "SO",
1077 "NCBI" [default: ensembl]
1078
1079 --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off]
1080 --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off]
1081 --condel=[p|s|b] Add Condel SIFT/PolyPhen consensus [p]rediction, [s]core or
1082 [b]oth. Add 1 (e.g. b1) to option to use old Condel algorithm
1083 [default: off]
1084
1085 NB: SIFT, PolyPhen and Condel predictions are currently available for human only
1086
1087 --regulatory Look for overlaps with regulatory regions. The script can
1088 also call if a variant falls in a high information position
1089 within a transcription factor binding site. Output lines have
1090 a Feature type of RegulatoryFeature or MotifFeature
1091 [default: off]
1092
1093 NB: Regulatory consequences are currently available for human and mouse only
1094
1095 --hgnc If specified, HGNC gene identifiers are output alongside the
1096 Ensembl Gene identifier [default: off]
1097 --hgvs Output HGVS identifiers (coding and protein). Requires database
1098 connection [default: off]
1099 --protein Output Ensembl protein identifer [default: off]
1100 --gene Force output of Ensembl gene identifer - disabled by default
1101 unless using --cache or --no_whole_genome [default: off]
1102 --canonical Indicate if the transcript for this consequence is the canonical
1103 transcript for this gene [default: off]
1104
1105 --coding_only Only return consequences that fall in the coding region of
1106 transcripts [default: off]
1107 --most_severe Ouptut only the most severe consequence per variation.
1108 Transcript-specific columns will be left blank. [default: off]
1109 --summary Output only a comma-separated list of all consequences per
1110 variation. Transcript-specific columns will be left blank.
1111 [default: off]
1112 --per_gene Output only the most severe consequence per gene. Where more
1113 than one transcript has the same consequence, the transcript
1114 chosen is arbitrary. [default: off]
1115
1116
1117 --check_ref If specified, checks supplied reference allele against stored
1118 entry in Ensembl Core database [default: off]
1119 --check_existing If specified, checks for existing co-located variations in the
1120 Ensembl Variation database [default: off]
1121 --check_alleles If specified, the alleles of existing co-located variations
1122 are compared to the input; an existing variation will only
1123 be reported if no novel allele is in the input (strand is
1124 accounted for) [default: off]
1125
1126 --no_intergenic Excludes intergenic consequences from the output [default: off]
1127
1128 --check_frequency Turns on frequency filtering. Use this to include or exclude
1129 variants based on the frequency of co-located existing
1130 variants in the Ensembl Variation database. You must also
1131 specify all of the following --freq flags [default: off]
1132 --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap,
1133 1kg_yri for YRI 1000 genomes. See documentation for more
1134 details
1135 --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5
1136 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or
1137 less than (lt) --freq_freq
1138 --freq_filter Specify whether variants that pass the above should be included
1139 [exclude|include] or excluded from analysis
1140
1141 --chr [list] Select a subset of chromosomes to analyse from your file. Any
1142 data not on this chromosome in the input will be skipped. The
1143 list can be comma separated, with "-" characters representing
1144 a range e.g. 1-5,8,15,X [default: off]
1145 --gp If specified, tries to read GRCh37 position from GP field in the
1146 INFO column of a VCF file. Only applies when VCF is the input
1147 format and human is the species [default: off]
1148
1149 --convert Convert the input file to the output format specified.
1150 [ensembl|vcf|pileup] Converted output is written to the file specified in
1151 --output_file. No consequence calculation is carried out when
1152 doing file conversion. [default: off]
1153
1154 --refseq Use the otherfeatures database to retrieve transcripts - this
1155 database contains RefSeq transcripts (as well as CCDS and
1156 Ensembl EST alignments) [default: off]
1157 --host Manually define database host [default: "ensembldb.ensembl.org"]
1158 -u | --user Database username [default: "anonymous"]
1159 --port Database port [default: 5306]
1160 --password Database password [default: no password]
1161 --genomes Sets DB connection params for Ensembl Genomes [default: off]
1162 --registry Registry file to use defines DB connections [default: off]
1163 Defining a registry file overrides above connection settings.
1164 --db_version=[number] Force script to load DBs from a specific Ensembl version. Not
1165 advised due to likely incompatibilities between API and DB
1166
1167 --no_whole_genome Run in old-style, non-whole genome mode [default: off]
1168 --buffer_size Sets the number of variants sent in each batch [default: 5000]
1169 Increasing buffer size can retrieve results more quickly
1170 but requires more memory. Only applies to whole genome mode.
1171
1172 --cache Enables read-only use of cache [default: off]
1173 --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"]
1174 --write_cache Enable writing to cache [default: off]
1175 --build [all|list] Build a complete cache for the selected species. Build for all
1176 chromosomes with --build all, or a list of chromosomes (see
1177 --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS
1178 VIOLATES OUR FAIR USAGE POLICY [default: off]
1179
1180 --compress Specify utility to decompress cache files - may be "gzcat" or
1181 "gzip -dc" Only use if default does not work [default: zcat]
1182
1183 --skip_db_check ADVANCED! Force the script to use a cache built from a different
1184 database than specified with --host. Only use this if you are
1185 sure the hosts are compatible (e.g. ensembldb.ensembl.org and
1186 useastdb.ensembl.org) [default: off]
1187 --cache_region_size ADVANCED! The size in base-pairs of the region covered by one
1188 file in the cache. [default: 1MB]
1189 END
1190
1191 print $usage;
1192 }