Mercurial > repos > mahtabm > ensembl
view variant_effect_predictor/variant_effect_predictor.pl @ 3:d30fa12e4cc5 default tip
Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author | devteam <devteam@galaxyproject.org> |
---|---|
date | Mon, 13 Jan 2014 10:38:30 -0500 |
parents | 1f6dce3d34e0 |
children |
line wrap: on
line source
#!/usr/bin/perl =head1 LICENSE Copyright (c) 1999-2012 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <dev@ensembl.org>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =cut =head1 NAME Variant Effect Predictor - a script to predict the consequences of genomic variants http://www.ensembl.org/info/docs/variation/vep/vep_script.html Version 2.6 by Will McLaren (wm2@ebi.ac.uk) =cut use strict; use Getopt::Long; use FileHandle; use FindBin qw($Bin); use lib $Bin; use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); use Bio::EnsEMBL::Variation::Utils::VEP qw( parse_line vf_to_consequences validate_vf convert_to_vcf load_dumped_adaptor_cache dump_adaptor_cache get_all_consequences get_slice build_full_cache read_cache_info get_time debug @OUTPUT_COLS @REG_FEAT_TYPES %FILTER_SHORTCUTS ); # global vars my $VERSION = '2.6'; # define headers that would normally go in the extra field # keyed on the config parameter used to turn it on my %extra_headers = ( protein => ['ENSP'], canonical => ['CANONICAL'], ccds => ['CCDS'], hgvs => ['HGVSc','HGVSp'], hgnc => ['HGNC'], sift => ['SIFT'], polyphen => ['PolyPhen'], numbers => ['EXON','INTRON'], domains => ['DOMAINS'], regulatory => ['MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE'], cell_type => ['CELL_TYPE'], individual => ['IND'], xref_refseq => ['RefSeq'], check_svs => ['SV'], check_frequency => ['FREQS'], gmaf => ['GMAF'], user => ['DISTANCE'], ); my %extra_descs = ( 'CANONICAL' => 'Indicates if transcript is canonical for this gene', 'CCDS' => 'Indicates if transcript is a CCDS transcript', 'HGNC' => 'HGNC gene identifier', 'ENSP' => 'Ensembl protein identifer', 'HGVSc' => 'HGVS coding sequence name', 'HGVSp' => 'HGVS protein sequence name', 'SIFT' => 'SIFT prediction', 'PolyPhen' => 'PolyPhen prediction', 'EXON' => 'Exon number(s) / total', 'INTRON' => 'Intron number(s) / total', 'DOMAINS' => 'The source and identifer of any overlapping protein domains', 'MOTIF_NAME' => 'The source and identifier of a transcription factor binding profile (TFBP) aligned at this position', 'MOTIF_POS' => 'The relative position of the variation in the aligned TFBP', 'HIGH_INF_POS' => 'A flag indicating if the variant falls in a high information position of the TFBP', 'MOTIF_SCORE_CHANGE' => 'The difference in motif score of the reference and variant sequences for the TFBP', 'CELL_TYPE' => 'List of cell types and classifications for regulatory feature', 'IND' => 'Individual name', 'SV' => 'IDs of overlapping structural variants', 'FREQS' => 'Frequencies of overlapping variants used in filtering', 'GMAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1', 'DISTANCE' => 'Shortest distance from variant to transcript', ); # set output autoflush for progress bars $| = 1; # configure from command line opts my $config = &configure(scalar @ARGV); # run the main sub routine &main($config); # this is the main sub-routine - it needs the configured $config hash sub main { my $config = shift; debug("Starting...") unless defined $config->{quiet}; $config->{start_time} = time(); $config->{last_time} = time(); my $tr_cache = {}; my $rf_cache = {}; # create a hash to hold slices so we don't get the same one twice my %slice_cache = (); my @vfs; my ($vf_count, $total_vf_count); my $in_file_handle = $config->{in_file_handle}; # initialize line number in config $config->{line_number} = 0; # read the file while(<$in_file_handle>) { chomp; $config->{line_number}++; # header line? if(/^\#/) { # retain header lines if we are outputting VCF if(defined($config->{vcf})) { push @{$config->{headers}}, $_; } # line with sample labels in VCF if(defined($config->{individual}) && /^#CHROM/) { my @split = split /\s+/; # no individuals die("ERROR: No individual data found in VCF\n") if scalar @split <= 9; # get individual column indices my %ind_cols = map {$split[$_] => $_} (9..$#split); # all? if(scalar @{$config->{individual}} == 1 && $config->{individual}->[0] =~ /^all$/i) { $config->{ind_cols} = \%ind_cols; } else { my %new_ind_cols; # check we have specified individual(s) foreach my $ind(@{$config->{individual}}) { die("ERROR: Individual named \"$ind\" not found in VCF\n") unless defined $ind_cols{$ind}; $new_ind_cols{$ind} = $ind_cols{$ind}; } $config->{ind_cols} = \%new_ind_cols; } } next; } # configure output file $config->{out_file_handle} ||= &get_out_file_handle($config); # some lines (pileup) may actually parse out into more than one variant foreach my $vf(@{&parse_line($config, $_)}) { $vf->{_line} = $_ ;#if defined($config->{vcf}) || defined($config->{original}); # now get the slice if(!defined($vf->{slice})) { my $slice; # don't get slices if we're using cache # we can steal them from transcript objects later if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) { # check if we have fetched this slice already if(defined $slice_cache{$vf->{chr}}) { $slice = $slice_cache{$vf->{chr}}; } # if not create a new one else { $slice = &get_slice($config, $vf->{chr}); # if failed, warn and skip this line if(!defined($slice)) { warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet}; next; } # store the hash $slice_cache{$vf->{chr}} = $slice; } } $vf->{slice} = $slice; } # validate the VF next unless validate_vf($config, $vf); # make a name if one doesn't exist $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.($vf->{allele_string} || $vf->{class_SO_term}); # jump out to convert here if(defined($config->{convert})) { &convert_vf($config, $vf); next; } if(defined $config->{whole_genome}) { push @vfs, $vf; $vf_count++; $total_vf_count++; if($vf_count == $config->{buffer_size}) { debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; # calculate stats my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); $config->{last_time} = time(); debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); @vfs = (); $vf_count = 0; } } else { print_line($config, $_) foreach @{vf_to_consequences($config, $vf)}; $vf_count++; $total_vf_count++; debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose}); } } } # if in whole-genome mode, finish off the rest of the buffer if(defined $config->{whole_genome} && scalar @vfs) { debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; # calculate stats my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); $config->{last_time} = time(); debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); debug($config->{filter_count}, "/$total_vf_count variants remain after filtering") if defined($config->{filter}) && !defined($config->{quiet}); } debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet}); debug("Finished!") unless defined $config->{quiet}; } # sets up configuration hash that is used throughout the script sub configure { my $args = shift; my $config = {}; GetOptions( $config, 'help', # displays help message # input options, 'config=s', # config file name 'input_file|i=s', # input file name 'format=s', # input file format # DB options 'species=s', # species e.g. human, homo_sapiens 'registry=s', # registry file 'host=s', # database host 'port=s', # database port 'user=s', # database user name 'password=s', # database password 'db_version=i', # Ensembl database version to use e.g. 62 'genomes', # automatically sets DB params for e!Genomes 'refseq', # use otherfeatures RefSeq DB instead of Ensembl #'no_disconnect', # disables disconnect_when_inactive # runtime options 'most_severe', # only return most severe consequence 'summary', # only return one line per variation with all consquence types 'per_gene', # only return most severe per gene 'buffer_size=i', # number of variations to read in before analysis 'chunk_size=s', # size in bases of "chunks" used in internal hash structure 'failed=i', # include failed variations when finding existing 'no_whole_genome', # disables now default whole-genome mode 'whole_genome', # proxy for whole genome mode - now just warns user 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG) 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT 'check_ref', # check supplied reference allele against DB 'check_existing', # find existing co-located variations 'check_svs', # find overlapping structural variations 'check_alleles', # only attribute co-located if alleles are the same 'check_frequency', # enable frequency checking 'gmaf', # add global MAF of existing var 'freq_filter=s', # exclude or include 'freq_freq=f', # frequency to filter on 'freq_gt_lt=s', # gt or lt (greater than or less than) 'freq_pop=s', # population to filter on 'allow_non_variant', # allow non-variant VCF lines through 'individual=s', # give results by genotype for individuals 'phased', # force VCF genotypes to be interpreted as phased 'fork=i', # fork into N processes # verbosity options 'verbose|v', # print out a bit more info while running 'quiet', # print nothing to STDOUT (unless using -o stdout) 'no_progress', # don't display progress bars # output options 'everything|e', # switch on EVERYTHING :-) 'output_file|o=s', # output file name 'force_overwrite', # force overwrite of output file if already exists 'terms|t=s', # consequence terms to use e.g. NCBI, SO 'coding_only', # only return results for consequences in coding regions 'canonical', # indicates if transcript is canonical 'ccds', # output CCDS identifer 'xref_refseq', # output refseq mrna xref 'protein', # add e! protein ID to extra column 'hgnc', # add HGNC gene ID to extra column 'hgvs', # add HGVS names to extra column 'sift=s', # SIFT predictions 'polyphen=s', # PolyPhen predictions 'condel=s', # Condel predictions 'regulatory', # enable regulatory stuff 'cell_type=s', # filter cell types for regfeats 'convert=s', # convert input to another format (doesn't run VEP) 'filter=s', # run in filtering mode 'no_intergenic', # don't print out INTERGENIC consequences 'gvf', # produce gvf output 'vcf', # produce vcf output 'original', # produce output in input format 'no_consequences', # don't calculate consequences 'lrg', # enable LRG-based features 'fields=s', # define your own output fields 'domains', # output overlapping protein features 'numbers', # include exon and intron numbers # cache stuff 'cache', # use cache 'write_cache', # enables writing to the cache 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs 'no_adaptor_cache', # don't write adaptor cache 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript 'strip', # strips adaptors etc from objects before caching them 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more 'dir=s', # dir where cache is found (defaults to $HOME/.vep/) 'cache_region_size=i', # size of region in bases for each cache file 'no_slice_cache', # tell API not to cache features on slice 'standalone', # standalone renamed offline 'offline', # offline mode uses minimal set of modules installed in same dir, no DB connection 'skip_db_check', # don't compare DB parameters with cached 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc" 'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped file with annotation 'tmpdir=s', # tmp dir used for BigWig retrieval 'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory # debug 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL 'count_queries', # counts SQL queries executed 'admin', # allows me to build off public hosts 'debug', # print out debug info 'tabix', # experimental use tabix cache files ) or die "ERROR: Failed to parse command-line flags\n"; # print usage message if requested or no args supplied if(defined($config->{help}) || !$args) { &usage; exit(0); } # dir is where the cache and plugins live $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep'); # dir gets set to the specific cache directory later on, so take a copy to use # when configuring plugins $config->{toplevel_dir} = $config->{dir}; # ini file? my $ini_file = $config->{dir}.'/vep.ini'; if(-e $ini_file) { read_config_from_file($config, $ini_file); } # config file? if(defined $config->{config}) { read_config_from_file($config, $config->{config}); } # can't be both quiet and verbose die "ERROR: Can't be both quiet and verbose!\n" if defined($config->{quiet}) && defined($config->{verbose}); # check forking if(defined($config->{fork})) { die "ERROR: Fork number must be greater than 1\n" if $config->{fork} <= 1; # check we can use MIME::Base64 eval q{ use MIME::Base64; }; if($@) { debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet}); delete $config->{fork}; } else { # try a practice fork my $pid = fork; if(!defined($pid)) { debug("WARNING: Fork test failed, forking disabled") unless defined($config->{quiet}); delete $config->{fork}; } elsif($pid) { waitpid($pid, 0); } elsif($pid == 0) { exit(0); } } } # check file format if(defined $config->{format}) { die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /^(pileup|vcf|guess|hgvs|ensembl|id|vep)$/i; } # check convert format if(defined $config->{convert}) { die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup|hgvs/i; } # check if user still using --standalone if(defined $config->{standalone}) { die "ERROR: --standalone replaced by --offline\n"; } # connection settings for Ensembl Genomes if($config->{genomes}) { $config->{host} ||= 'mysql.ebi.ac.uk'; $config->{port} ||= 4157; } # connection settings for main Ensembl else { $config->{species} ||= "homo_sapiens"; $config->{host} ||= 'ensembldb.ensembl.org'; $config->{port} ||= 5306; } # refseq or core? if(defined($config->{refseq})) { $config->{core_type} = 'otherfeatures'; } else { $config->{core_type} = 'core'; } # output term if(defined $config->{terms}) { die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i; if($config->{terms} =~ /ensembl|display/i) { $config->{terms} = 'display'; } else { $config->{terms} = uc($config->{terms}); } } # everything? if(defined($config->{everything})) { my %everything = ( sift => 'b', polyphen => 'b', ccds => 1, hgvs => 1, hgnc => 1, numbers => 1, domains => 1, regulatory => 1, canonical => 1, protein => 1, gmaf => 1, ); $config->{$_} = $everything{$_} for keys %everything; # these ones won't work with offline delete $config->{hgvs} if defined($config->{offline}); } # check nsSNP tools foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) { die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/; die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i; die "ERROR: $tool functionality is now available as a VEP Plugin - see http://www.ensembl.org/info/docs/variation/vep/vep_script.html#plugins\n" if $tool eq 'Condel'; } # force quiet if outputting to STDOUT if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { delete $config->{verbose} if defined($config->{verbose}); $config->{quiet} = 1; } # individual(s) specified? if(defined($config->{individual})) { $config->{individual} = [split /\,/, $config->{individual}]; # force allow_non_variant $config->{allow_non_variant} = 1; } # summarise options if verbose if(defined $config->{verbose}) { my $header =<<INTRO; #----------------------------------# # ENSEMBL VARIANT EFFECT PREDICTOR # #----------------------------------# version $VERSION By Will McLaren (wm2\@ebi.ac.uk) Configuration options: INTRO print $header; my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1]; foreach my $key(sort keys %$config) { next if ref($config->{$key}) eq 'ARRAY' && scalar @{$config->{$key}} == 0; print $key.(' ' x (($max_length - length($key)) + 4)).(ref($config->{$key}) eq 'ARRAY' ? join "\t", @{$config->{$key}} : $config->{$key})."\n"; } print "\n".("-" x 20)."\n\n"; } # check custom annotations for my $i(0..$#{$config->{custom}}) { my $custom = $config->{custom}->[$i]; my ($filepath, $shortname, $format, $type, $coords) = split /\,/, $custom; $type ||= 'exact'; $format ||= 'bed'; $coords ||= 0; # check type die "ERROR: Type $type for custom annotation file $filepath is not allowed (must be one of \"exact\", \"overlap\")\n" unless $type =~ /exact|overlap/; # check format die "ERROR: Format $format for custom annotation file $filepath is not allowed (must be one of \"bed\", \"vcf\", \"gtf\", \"gff\", \"bigwig\")\n" unless $format =~ /bed|vcf|gff|gtf|bigwig/; # bigwig format if($format eq 'bigwig') { # check for bigWigToWig die "ERROR: bigWigToWig does not seem to be in your path - this is required to use bigwig format custom annotations\n" unless `which bigWigToWig 2>&1` =~ /bigWigToWig$/; } else { # check for tabix die "ERROR: tabix does not seem to be in your path - this is required to use custom annotations\n" unless `which tabix 2>&1` =~ /tabix$/; # remote files? if($filepath =~ /tp\:\/\//) { my $remote_test = `tabix $filepath 1:1-1 2>&1`; if($remote_test =~ /fail/) { die "$remote_test\nERROR: Could not find file or index file for remote annotation file $filepath\n"; } elsif($remote_test =~ /get_local_version/) { debug("Downloaded tabix index file for remote annotation file $filepath") unless defined($config->{quiet}); } } # check files exist else { die "ERROR: Custom annotation file $filepath not found\n" unless -e $filepath; die "ERROR: Tabix index file $filepath\.tbi not found - perhaps you need to create it first?\n" unless -e $filepath.'.tbi'; } } $config->{custom}->[$i] = { 'file' => $filepath, 'name' => $shortname || 'CUSTOM'.($i + 1), 'type' => $type, 'format' => $format, 'coords' => $coords, }; } # check if using filter and original die "ERROR: You must also provide output filters using --filter to use --original\n" if defined($config->{original}) && !defined($config->{filter}); # filter by consequence? if(defined($config->{filter})) { my %filters = map {$_ => 1} split /\,/, $config->{filter}; # add in shortcuts foreach my $filter(keys %filters) { my $value = 1; if($filter =~ /^no_/) { delete $filters{$filter}; $filter =~ s/^no_//g; $value = 0; $filters{$filter} = $value; } if(defined($FILTER_SHORTCUTS{$filter})) { delete $filters{$filter}; $filters{$_} = $value for keys %{$FILTER_SHORTCUTS{$filter}}; } } $config->{filter} = \%filters; $config->{filter_count} = 0; } # set defaults $config->{user} ||= 'anonymous'; $config->{buffer_size} ||= 5000; $config->{chunk_size} ||= '50kb'; $config->{output_file} ||= "variant_effect_output.txt"; $config->{tmpdir} ||= '/tmp'; $config->{format} ||= 'guess'; $config->{terms} ||= 'SO'; $config->{cache_region_size} ||= 1000000; $config->{compress} ||= 'zcat'; # regulatory has to be on for cell_type if(defined($config->{cell_type})) { $config->{regulatory} = 1; $config->{cell_type} = [split /\,/, $config->{cell_type}] if defined($config->{cell_type}); } # can't use a whole bunch of options with most_severe if(defined($config->{most_severe})) { foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains summary)) { die "ERROR: --most_severe is not compatible with --$flag\n" if defined($config->{$flag}); } } # can't use a whole bunch of options with summary if(defined($config->{summary})) { foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains most_severe)) { die "ERROR: --summary is not compatible with --$flag\n" if defined($config->{$flag}); } } # frequency filtering if(defined($config->{check_frequency})) { foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) { die "ERROR: To use --check_frequency you must also specify flag --$flag\n" unless defined $config->{$flag}; } # need to set check_existing $config->{check_existing} = 1; } $config->{check_existing} = 1 if defined $config->{check_alleles} || defined $config->{gmaf}; # warn users still using whole_genome flag if(defined($config->{whole_genome})) { debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet}); } $config->{whole_genome} = 1 unless defined $config->{no_whole_genome}; $config->{failed} = 0 unless defined $config->{failed}; $config->{chunk_size} =~ s/mb?/000000/i; $config->{chunk_size} =~ s/kb?/000/i; $config->{cache_region_size} =~ s/mb?/000000/i; $config->{cache_region_size} =~ s/kb?/000/i; # cluck and display executed SQL? $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck}); # offline needs cache, can't use HGVS if(defined($config->{offline})) { $config->{cache} = 1; #die("ERROR: Cannot generate HGVS coordinates in offline mode\n") if defined($config->{hgvs}); die("ERROR: Cannot use HGVS as input in offline mode\n") if $config->{format} eq 'hgvs'; die("ERROR: Cannot use variant identifiers as input in offline mode\n") if $config->{format} eq 'id'; die("ERROR: Cannot do frequency filtering in offline mode\n") if defined($config->{check_frequency}); die("ERROR: Cannot retrieve overlapping structural variants in offline mode\n") if defined($config->{check_sv}); } # write_cache needs cache $config->{cache} = 1 if defined $config->{write_cache}; # no_slice_cache, prefetch and whole_genome have to be on to use cache if(defined($config->{cache})) { $config->{prefetch} = 1; $config->{no_slice_cache} = 1; $config->{whole_genome} = 1; $config->{strip} = 1; } $config->{build} = $config->{rebuild} if defined($config->{rebuild}); # force options for full build if(defined($config->{build})) { $config->{prefetch} = 1; $config->{hgnc} = 1; $config->{no_slice_cache} = 1; $config->{cache} = 1; $config->{strip} = 1; $config->{write_cache} = 1; $config->{cell_type} = 1 if defined($config->{regulatory}); } # connect to databases $config->{reg} = &connect_to_dbs($config); # complete dir with species name and db_version $config->{dir} .= '/'.( join '/', ( defined($config->{offline}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}), $config->{db_version} || $config->{reg}->software_version ) ); # warn user cache directory doesn't exist if(!-e $config->{dir}) { # if using write_cache if(defined($config->{write_cache})) { debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet}); } # want to read cache, not found elsif(defined($config->{cache})) { die("ERROR: Cache directory ", $config->{dir}, " not found"); } } if(defined($config->{cache})) { # read cache info if(read_cache_info($config)) { debug("Read existing cache info") unless defined $config->{quiet}; } } # we configure plugins here because they can sometimes switch on the # regulatory config option configure_plugins($config); # include regulatory modules if requested if(defined($config->{regulatory})) { # do the use statements here so that users don't have to have the # funcgen API installed to use the rest of the script eval q{ use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor; use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor; use Bio::EnsEMBL::Funcgen::MotifFeature; use Bio::EnsEMBL::Funcgen::RegulatoryFeature; use Bio::EnsEMBL::Funcgen::BindingMatrix; }; if($@) { die("ERROR: Ensembl Funcgen API must be installed to use --regulatory or plugins that deal with regulatory features\n"); } } # user defined custom output fields if(defined($config->{fields})) { $config->{fields} = [split ',', $config->{fields}]; debug("Output fields redefined (".scalar @{$config->{fields}}." defined)") unless defined($config->{quiet}); $config->{fields_redefined} = 1; } $config->{fields} ||= \@OUTPUT_COLS; # suppress warnings that the FeatureAdpators spit if using no_slice_cache Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache}); # get adaptors (don't get them in offline mode) unless(defined($config->{offline})) { if(defined($config->{cache}) && !defined($config->{write_cache})) { # try and load adaptors from cache if(!&load_dumped_adaptor_cache($config)) { &get_adaptors($config); &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); } # check cached adaptors match DB params else { my $dbc = $config->{sa}->{dbc}; my $ok = 1; if($dbc->{_host} ne $config->{host}) { # ens-livemirror, useastdb and ensembldb should all have identical DBs unless( ( $dbc->{_host} eq 'ens-livemirror' || $dbc->{_host} eq 'ensembldb.ensembl.org' || $dbc->{_host} eq 'useastdb.ensembl.org' ) && ( $config->{host} eq 'ens-livemirror' || $config->{host} eq 'ensembldb.ensembl.org' || $config->{host} eq 'useastdb.ensembl.org' ) ) { $ok = 0; } unless(defined($config->{skip_db_check})) { # but we still need to reconnect debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet}); &get_adaptors($config); } } if(!$ok) { if(defined($config->{skip_db_check})) { debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet}); } else { die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set"; } } } } else { &get_adaptors($config); &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); } # reg adaptors (only fetches if not retrieved from cache already) &get_reg_adaptors($config) if defined($config->{regulatory}); } # check cell types if(defined($config->{cell_type}) && !defined($config->{build})) { my $cls = ''; if(defined($config->{cache})) { $cls = $config->{cache_cell_types}; } else { my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor(); $cls = join ",", map {$_->name} @{$cta->fetch_all}; } foreach my $cl(@{$config->{cell_type}}) { die "ERROR: cell type $cl not recognised; available cell types are:\n$cls\n" unless $cls =~ /(^|,)$cl(,|$)/; } } # get terminal width for progress bars unless(defined($config->{quiet})) { my $width; # module may not be installed eval q{ use Term::ReadKey; }; if(!$@) { my ($w, $h); # module may be installed, but e.g. eval { #($w, $h) = GetTerminalSize(); $w = 167; $h = 30; }; $width = $w if defined $w; } $width ||= 60; $width -= 12; $config->{terminal_width} = $width; } # jump out to build cache if requested if(defined($config->{build})) { if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) { die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n"); } # build the cache debug("Building cache for ".$config->{species}) unless defined($config->{quiet}); build_full_cache($config); # exit script debug("Finished building cache") unless defined($config->{quiet}); exit(0); } # warn user DB will be used for SIFT/PolyPhen/HGVS/frequency/LRG if(defined($config->{cache})) { # these two def depend on DB foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency lrg check_sv)) { debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet}); } # as does using HGVS or IDs as input debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet}); # the rest may be in the cache foreach my $param(grep {defined $config->{$_}} qw(sift polyphen regulatory)) { next if defined($config->{'cache_'.$param}); debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet}); } } # get list of chrs if supplied if(defined($config->{chr})) { my %chrs; foreach my $val(split /\,/, $config->{chr}) { my @nnn = split /\-/, $val; foreach my $chr($nnn[0]..$nnn[-1]) { $chrs{$chr} = 1; } } $config->{chr} = \%chrs; } # get input file handle $config->{in_file_handle} = &get_in_file_handle($config); return $config; } # reads config from a file sub read_config_from_file { my $config = shift; my $file = shift; open CONFIG, $file or die "ERROR: Could not open config file \"$file\"\n"; while(<CONFIG>) { next if /^\#/; my @split = split /\s+|\=/; my $key = shift @split; $key =~ s/^\-//g; if(defined($config->{$key}) && ref($config->{$key}) eq 'ARRAY') { push @{$config->{$key}}, @split; } else { $config->{$key} ||= $split[0]; } } close CONFIG; # force quiet if outputting to STDOUT if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { delete $config->{verbose} if defined($config->{verbose}); $config->{quiet} = 1; } debug("Read configuration from $file") unless defined($config->{quiet}); } # configures custom VEP plugins sub configure_plugins { my $config = shift; $config->{plugins} = []; if (my @plugins = @{ $config->{plugin} }) { # add the Plugins directory onto @INC unshift @INC, $config->{toplevel_dir}."/Plugins"; for my $plugin (@plugins) { # parse out the module name and parameters my ($module, @params) = split /,/, $plugin; # check we can use the module eval qq{ use $module; }; if ($@) { debug("Failed to compile plugin $module: $@") unless defined($config->{quiet}); next; } # now check we can instantiate it, passing any parameters to the constructor my $instance; eval { $instance = $module->new($config, @params); }; if ($@) { debug("Failed to instantiate plugin $module: $@") unless defined($config->{quiet}); next; } # check that the versions match my $plugin_version; if ($instance->can('version')) { $plugin_version = $instance->version; } my $version_ok = 1; if ($plugin_version) { my ($plugin_major, $plugin_minor, $plugin_maintenance) = split /\./, $plugin_version; my ($major, $minor, $maintenance) = split /\./, $VERSION; if ($plugin_major != $major) { debug("Warning: plugin $plugin version ($plugin_version) does not match the current VEP version ($VERSION)") unless defined($config->{quiet}); $version_ok = 0; } } else { debug("Warning: plugin $plugin does not define a version number") unless defined($config->{quiet}); $version_ok = 0; } debug("You may experience unexpected behaviour with this plugin") unless defined($config->{quiet}) || $version_ok; # check that it implements all necessary methods for my $required(qw(run get_header_info check_feature_type check_variant_feature_type)) { unless ($instance->can($required)) { debug("Plugin $module doesn't implement a required method '$required', does it inherit from BaseVepPlugin?") unless defined($config->{quiet}); next; } } # all's good, so save the instance in our list of plugins push @{ $config->{plugins} }, $instance; debug("Loaded plugin: $module") unless defined($config->{quiet}); # for convenience, check if the plugin wants regulatory stuff and turn on the config option if so if (grep { $_ =~ /motif|regulatory/i } @{ $instance->feature_types }) { debug("Fetching regulatory features for plugin: $module") unless defined($config->{quiet}); $config->{regulatory} = 1; } } } } # connects to DBs (not done in offline mode) sub connect_to_dbs { my $config = shift; # get registry my $reg = 'Bio::EnsEMBL::Registry'; unless(defined($config->{offline})) { # load DB options from registry file if given if(defined($config->{registry})) { debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet}); $reg->load_all( $config->{registry}, $config->{verbose}, undef, $config->{no_slice_cache} ); } # otherwise manually connect to DB server else { $reg->load_registry_from_db( -host => $config->{host}, -user => $config->{user}, -pass => $config->{password}, -port => $config->{port}, -db_version => $config->{db_version}, -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef, -verbose => $config->{verbose}, -no_cache => $config->{no_slice_cache}, ); } eval { $reg->set_reconnect_when_lost() }; if(defined($config->{verbose})) { # get a meta container adaptors to check version my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer'); my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer'); if($core_mca && $var_mca) { debug( "Connected to core version ", $core_mca->get_schema_version, " database ", "and variation version ", $var_mca->get_schema_version, " database" ); } } } return $reg; } # get adaptors from DB sub get_adaptors { my $config = shift; die "ERROR: No registry" unless defined $config->{reg}; $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature'); $config->{svfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'structuralvariationfeature'); $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation'); $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix'); $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation'); # get fake ones for species with no var DB if(!defined($config->{vfa})) { $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species}); $config->{svfa} = Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor->new_fake($config->{species}); $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species}); } $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice'); $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene'); $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript'); $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer'); $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem'); # cache schema version $config->{mca}->get_schema_version if defined $config->{mca}; # check we got slice adaptor - can't continue without a core DB die("ERROR: Could not connect to core database\n") unless defined $config->{sa}; } # gets regulatory adaptors sub get_reg_adaptors { my $config = shift; foreach my $type(@REG_FEAT_TYPES) { next if defined($config->{$type.'_adaptor'}); my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type); if(defined($adaptor)) { $config->{$type.'_adaptor'} = $adaptor; } else { delete $config->{regulatory}; last; } } } # gets file handle for input sub get_in_file_handle { my $config = shift; # define the filehandle to read input from my $in_file_handle = new FileHandle; if(defined($config->{input_file})) { # check defined input file exists die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file}; if($config->{input_file} =~ /\.gz$/){ $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); } else { $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); } } # no file specified - try to read data off command line else { $in_file_handle = 'STDIN'; debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet}; } return $in_file_handle; } # gets file handle for output and adds header sub get_out_file_handle { my $config = shift; # define filehandle to write to my $out_file_handle = new FileHandle; # check if file exists if(-e $config->{output_file} && !defined($config->{force_overwrite})) { # die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with -- force_overwrite\n"); } if($config->{output_file} =~ /stdout/i) { $out_file_handle = *STDOUT; } else { $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n"); } # define headers for a VCF file my @vcf_headers = ( '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO' ); # file conversion, don't want to add normal headers if(defined($config->{convert})) { # header for VCF if($config->{convert} =~ /vcf/i) { print $out_file_handle "##fileformat=VCFv4.0\n"; print $out_file_handle join "\t", @vcf_headers; print $out_file_handle "\n"; } return $out_file_handle; } # GVF output, no header elsif(defined($config->{gvf}) || defined($config->{original})) { print $out_file_handle join "\n", @{$config->{headers}} if defined($config->{headers}) && defined($config->{original}); return $out_file_handle; } elsif(defined($config->{vcf})) { # create an info string for the VCF header my @new_headers; # if the user has defined the fields themselves, we don't need to worry if(defined $config->{fields_redefined}) { @new_headers = @{$config->{fields}}; } else { @new_headers = ( # get default headers, minus variation name and location (already encoded in VCF) grep { $_ ne 'Uploaded_variation' and $_ ne 'Location' and $_ ne 'Extra' } @{$config->{fields}}, # get extra headers map {@{$extra_headers{$_}}} grep {defined $config->{$_}} keys %extra_headers ); # plugin headers foreach my $plugin_header(split /\n/, get_plugin_headers($config)) { $plugin_header =~ /\#\# (.+?)\t\:.+/; push @new_headers, $1; } # redefine the main headers list in config $config->{fields} = \@new_headers; } # add the newly defined headers as a header to the VCF my $string = join '|', @{$config->{fields}}; my @vcf_info_strings = ('##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: '.$string.'">'); # add custom headers foreach my $custom(@{$config->{custom}}) { push @vcf_info_strings, '##INFO=<ID='.$custom->{name}.',Number=.,Type=String,Description="'.$custom->{file}.' ('.$custom->{type}.')">'; } # if this is already a VCF file, we need to add our new headers in the right place if(defined($config->{headers})) { for my $i(0..$#{$config->{headers}}) { if($config->{headers}->[$i] =~ /^\#CHROM\s+POS\s+ID/) { splice(@{$config->{headers}}, $i, 0, @vcf_info_strings); } } print $out_file_handle join "\n", @{$config->{headers}}; print $out_file_handle "\n"; } else { print $out_file_handle "##fileformat=VCFv4.0\n"; print $out_file_handle join "\n", @vcf_info_strings; print $out_file_handle "\n"; print $out_file_handle join "\t", @vcf_headers; print $out_file_handle "\n"; } return $out_file_handle; } # make header my $time = &get_time; my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca}; $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache}); my $version_string = "Using API version ".$config->{reg}->software_version. ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?'); # add key for extra column headers based on config my $extra_column_keys = join "\n", map {'## '.$_.' : '.$extra_descs{$_}} sort map {@{$extra_headers{$_}}} grep {defined $config->{$_}} keys %extra_headers; my $header =<<HEAD; ## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION ## Output produced at $time ## Connected to $db_string ## $version_string ## Extra column keys: $extra_column_keys HEAD $header .= get_plugin_headers($config); # add headers print $out_file_handle $header; # add custom data defs if(defined($config->{custom})) { foreach my $custom(@{$config->{custom}}) { print $out_file_handle '## '.$custom->{name}."\t: ".$custom->{file}.' ('.$custom->{type}.")\n"; } } # add column headers print $out_file_handle '#', (join "\t", @{$config->{fields}}); print $out_file_handle "\n"; return $out_file_handle; } sub get_plugin_headers { my $config = shift; my $header = ""; for my $plugin (@{ $config->{plugins} }) { if (my $hdr = $plugin->get_header_info) { for my $key (keys %$hdr) { my $val = $hdr->{$key}; $header .= "## $key\t: $val\n"; } } } return $header; } # convert a variation feature to a line of output sub convert_vf { my $config = shift; my $vf = shift; my $convert_method = 'convert_to_'.lc($config->{convert}); my $method_ref = \&$convert_method; my $line = &$method_ref($config, $vf); my $handle = $config->{out_file_handle}; if(scalar @$line) { print $handle join "\t", @$line; print $handle "\n"; } } # converts to Ensembl format sub convert_to_ensembl { my $config = shift; my $vf = shift; return [ $vf->{chr} || $vf->seq_region_name, $vf->start, $vf->end, $vf->allele_string, $vf->strand, $vf->variation_name ]; } # converts to pileup format sub convert_to_pileup { my $config = shift; my $vf = shift; # look for imbalance in the allele string my %allele_lengths; my @alleles = split /\//, $vf->allele_string; foreach my $allele(@alleles) { $allele =~ s/\-//g; $allele_lengths{length($allele)} = 1; } # in/del if(scalar keys %allele_lengths > 1) { if($vf->allele_string =~ /\-/) { # insertion? if($alleles[0] eq '-') { shift @alleles; for my $i(0..$#alleles) { $alleles[$i] =~ s/\-//g; $alleles[$i] = '+'.$alleles[$i]; } } else { @alleles = grep {$_ ne '-'} @alleles; for my $i(0..$#alleles) { $alleles[$i] =~ s/\-//g; $alleles[$i] = '-'.$alleles[$i]; } } @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles; return [ $vf->{chr} || $vf->seq_region_name, $vf->start - 1, '*', (join "/", @alleles), ]; } else { warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet}); return []; } } # balanced sub else { return [ $vf->{chr} || $vf->seq_region_name, $vf->start, shift @alleles, (join "/", @alleles), ]; } } # converts to HGVS (hackily returns many lines) sub convert_to_hgvs { my $config = shift; my $vf = shift; # ensure we have a slice $vf->{slice} ||= get_slice($config, $vf->{chr}); my $tvs = $vf->get_all_TranscriptVariations; my @return = values %{$vf->get_all_hgvs_notations()}; if(defined($tvs)) { push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'c')}} @$tvs; push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'p')}} @$tvs; } return [join "\n", @return]; } # prints a line of output from the hash sub print_line { my $config = shift; my $line = shift; return unless defined($line); my $output; # normal if(ref($line) eq 'HASH') { my %extra = %{$line->{Extra}}; $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} }; # if the fields have been redefined we need to search through in case # any of the defined fields are actually part of the Extra hash $output = join "\t", map { (defined $line->{$_} ? $line->{$_} : (defined $extra{$_} ? $extra{$_} : '-')) } @{$config->{fields}}; } # gvf/vcf else { $output = $$line; } my $fh = $config->{out_file_handle}; print $fh "$output\n"; } # outputs usage message sub usage { my $usage =<<END; #----------------------------------# # ENSEMBL VARIANT EFFECT PREDICTOR # #----------------------------------# version $VERSION By Will McLaren (wm2\@ebi.ac.uk) http://www.ensembl.org/info/docs/variation/vep/vep_script.html Usage: perl variant_effect_predictor.pl [arguments] Options ======= --help Display this message and quit --verbose Display verbose output as the script runs [default: off] --quiet Suppress status and warning messages [default: off] --no_progress Suppress progress bars [default: off] --config Load configuration from file. Any command line options specified overwrite those in the file [default: off] --everything Shortcut switch to turn on commonly used options. See web documentation for details [default: off] --fork [num_forks] Use forking to improve script runtime [default: off] -i | --input_file Input file - if not specified, reads from STDIN. Files may be gzip compressed. --format Specify input file format - one of "ensembl", "pileup", "vcf", "hgvs", "id" or "guess" to try and work out format. -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this will force --quiet [default: "variant_effect_output.txt"] --force_overwrite Force overwriting of output file [default: quit if file exists] --original Writes output as it was in input - must be used with --filter since no consequence data is added [default: off] --vcf Write output as VCF [default: off] --gvf Write output as GVF [default: off] --fields [field list] Define a custom output format by specifying a comma-separated list of field names. Field names normally present in the "Extra" field may also be specified, including those added by plugin modules. Can also be used to configure VCF output columns [default: off] --species [species] Species to use [default: "human"] -t | --terms Type of consequence terms to output - one of "SO", "ensembl" [default: SO] --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off] --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off] NB: SIFT and PolyPhen predictions are currently available for human only NB: Condel support has been moved to a VEP plugin module - see documentation --regulatory Look for overlaps with regulatory regions. The script can also call if a variant falls in a high information position within a transcription factor binding site. Output lines have a Feature type of RegulatoryFeature or MotifFeature [default: off] --cell_type [types] Report only regulatory regions that are found in the given cell type(s). Can be a single cell type or a comma-separated list. The functional type in each cell type is reported under CELL_TYPE in the output. To retrieve a list of cell types, use "--cell_type list" [default: off] NB: Regulatory consequences are currently available for human and mouse only --custom [file list] Add custom annotations from tabix-indexed files. See documentation for full details [default: off] --plugin [plugin_name] Use named plugin module [default: off] --hgnc Add HGNC gene identifiers to output [default: off] --hgvs Output HGVS identifiers (coding and protein). Requires database connection [default: off] --ccds Output CCDS transcript identifiers [default: off] --xref_refseq Output aligned RefSeq mRNA identifier for transcript. NB: the RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND FREQUENTLY WILL NOT, match exactly in sequence, exon structure and protein product [default: off] --protein Output Ensembl protein identifer [default: off] --canonical Indicate if the transcript for this consequence is the canonical transcript for this gene [default: off] --domains Include details of any overlapping protein domains [default: off] --numbers Include exon & intron numbers [default: off] --no_intergenic Excludes intergenic consequences from the output [default: off] --coding_only Only return consequences that fall in the coding region of transcripts [default: off] --most_severe Ouptut only the most severe consequence per variation. Transcript-specific columns will be left blank. [default: off] --summary Output only a comma-separated list of all consequences per variation. Transcript-specific columns will be left blank. [default: off] --per_gene Output only the most severe consequence per gene. Where more than one transcript has the same consequence, the transcript chosen is arbitrary. [default: off] --check_ref If specified, checks supplied reference allele against stored entry in Ensembl Core database [default: off] --check_existing If specified, checks for existing co-located variations in the Ensembl Variation database [default: off] --failed [0|1] Include (1) or exclude (0) variants that have been flagged as failed by Ensembl when checking for existing variants. [default: exclude] --check_alleles If specified, the alleles of existing co-located variations are compared to the input; an existing variation will only be reported if no novel allele is in the input (strand is accounted for) [default: off] --check_svs Report overlapping structural variants [default: off] --filter [filters] Filter output by consequence type. Use this to output only variants that have at least one consequence type matching the filter. Multiple filters can be used separated by ",". By combining this with --original it is possible to run the VEP iteratively to progressively filter a set of variants. See documentation for full details [default: off] --check_frequency Turns on frequency filtering. Use this to include or exclude variants based on the frequency of co-located existing variants in the Ensembl Variation database. You must also specify all of the following --freq flags [default: off] --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap, 1kg_yri for YRI 1000 genomes. See documentation for more details --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or less than (lt) --freq_freq --freq_filter Specify whether variants that pass the above should be included [exclude|include] or excluded from analysis --gmaf Include global MAF of existing variant from 1000 Genomes Phase 1 in output --individual [id] Consider only alternate alleles present in the genotypes of the specified individual(s). May be a single individual, a comma- separated list or "all" to assess all individuals separately. Each individual and variant combination is given on a separate line of output. Only works with VCF files containing individual genotype data; individual IDs are taken from column headers. --allow_non_variant Prints out non-variant lines when using VCF input --phased Force VCF individual genotypes to be interpreted as phased. For use with plugins that depend on phased state. --chr [list] Select a subset of chromosomes to analyse from your file. Any data not on this chromosome in the input will be skipped. The list can be comma separated, with "-" characters representing a range e.g. 1-5,8,15,X [default: off] --gp If specified, tries to read GRCh37 position from GP field in the INFO column of a VCF file. Only applies when VCF is the input format and human is the species [default: off] --convert Convert the input file to the output format specified. [ensembl|vcf|pileup] Converted output is written to the file specified in --output_file. No consequence calculation is carried out when doing file conversion. [default: off] --refseq Use the otherfeatures database to retrieve transcripts - this database contains RefSeq transcripts (as well as CCDS and Ensembl EST alignments) [default: off] --host Manually define database host [default: "ensembldb.ensembl.org"] -u | --user Database username [default: "anonymous"] --port Database port [default: 5306] --password Database password [default: no password] --genomes Sets DB connection params for Ensembl Genomes [default: off] --registry Registry file to use defines DB connections [default: off] Defining a registry file overrides above connection settings. --db_version=[number] Force script to load DBs from a specific Ensembl version. Not advised due to likely incompatibilities between API and DB --no_whole_genome Run in old-style, non-whole genome mode [default: off] --buffer_size Sets the number of variants sent in each batch [default: 5000] Increasing buffer size can retrieve results more quickly but requires more memory. Only applies to whole genome mode. --cache Enables read-only use of cache [default: off] --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"] --write_cache Enable writing to cache [default: off] --build [all|list] Build a complete cache for the selected species. Build for all chromosomes with --build all, or a list of chromosomes (see --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS VIOLATES OUR FAIR USAGE POLICY [default: off] --compress Specify utility to decompress cache files - may be "gzcat" or "gzip -dc" Only use if default does not work [default: zcat] --skip_db_check ADVANCED! Force the script to use a cache built from a different database than specified with --host. Only use this if you are sure the hosts are compatible (e.g. ensembldb.ensembl.org and useastdb.ensembl.org) [default: off] --cache_region_size ADVANCED! The size in base-pairs of the region covered by one file in the cache. [default: 1MB] END print $usage; }