Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/variant_effect_predictor.pl @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/variant_effect_predictor.pl Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1759 @@ +#!/usr/bin/perl + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Variant Effect Predictor - a script to predict the consequences of genomic variants + +http://www.ensembl.org/info/docs/variation/vep/vep_script.html + +Version 2.6 + +by Will McLaren (wm2@ebi.ac.uk) +=cut + +use strict; +use Getopt::Long; +use FileHandle; +use FindBin qw($Bin); +use lib $Bin; + +use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); +use Bio::EnsEMBL::Variation::Utils::VEP qw( + parse_line + vf_to_consequences + validate_vf + convert_to_vcf + load_dumped_adaptor_cache + dump_adaptor_cache + get_all_consequences + get_slice + build_full_cache + read_cache_info + get_time + debug + @OUTPUT_COLS + @REG_FEAT_TYPES + %FILTER_SHORTCUTS +); + +# global vars +my $VERSION = '2.6'; + + +# define headers that would normally go in the extra field +# keyed on the config parameter used to turn it on +my %extra_headers = ( + protein => ['ENSP'], + canonical => ['CANONICAL'], + ccds => ['CCDS'], + hgvs => ['HGVSc','HGVSp'], + hgnc => ['HGNC'], + sift => ['SIFT'], + polyphen => ['PolyPhen'], + numbers => ['EXON','INTRON'], + domains => ['DOMAINS'], + regulatory => ['MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE'], + cell_type => ['CELL_TYPE'], + individual => ['IND'], + xref_refseq => ['RefSeq'], + check_svs => ['SV'], + check_frequency => ['FREQS'], + gmaf => ['GMAF'], + user => ['DISTANCE'], +); + +my %extra_descs = ( + 'CANONICAL' => 'Indicates if transcript is canonical for this gene', + 'CCDS' => 'Indicates if transcript is a CCDS transcript', + 'HGNC' => 'HGNC gene identifier', + 'ENSP' => 'Ensembl protein identifer', + 'HGVSc' => 'HGVS coding sequence name', + 'HGVSp' => 'HGVS protein sequence name', + 'SIFT' => 'SIFT prediction', + 'PolyPhen' => 'PolyPhen prediction', + 'EXON' => 'Exon number(s) / total', + 'INTRON' => 'Intron number(s) / total', + 'DOMAINS' => 'The source and identifer of any overlapping protein domains', + 'MOTIF_NAME' => 'The source and identifier of a transcription factor binding profile (TFBP) aligned at this position', + 'MOTIF_POS' => 'The relative position of the variation in the aligned TFBP', + 'HIGH_INF_POS' => 'A flag indicating if the variant falls in a high information position of the TFBP', + 'MOTIF_SCORE_CHANGE' => 'The difference in motif score of the reference and variant sequences for the TFBP', + 'CELL_TYPE' => 'List of cell types and classifications for regulatory feature', + 'IND' => 'Individual name', + 'SV' => 'IDs of overlapping structural variants', + 'FREQS' => 'Frequencies of overlapping variants used in filtering', + 'GMAF' => 'Minor allele and frequency of existing variation in 1000 Genomes Phase 1', + 'DISTANCE' => 'Shortest distance from variant to transcript', +); + +# set output autoflush for progress bars +$| = 1; + +# configure from command line opts +my $config = &configure(scalar @ARGV); + +# run the main sub routine +&main($config); + +# this is the main sub-routine - it needs the configured $config hash +sub main { + my $config = shift; + + debug("Starting...") unless defined $config->{quiet}; + + $config->{start_time} = time(); + $config->{last_time} = time(); + + my $tr_cache = {}; + my $rf_cache = {}; + + # create a hash to hold slices so we don't get the same one twice + my %slice_cache = (); + + my @vfs; + my ($vf_count, $total_vf_count); + my $in_file_handle = $config->{in_file_handle}; + + # initialize line number in config + $config->{line_number} = 0; + + # read the file + while(<$in_file_handle>) { + chomp; + + $config->{line_number}++; + + # header line? + if(/^\#/) { + + # retain header lines if we are outputting VCF + if(defined($config->{vcf})) { + push @{$config->{headers}}, $_; + } + + # line with sample labels in VCF + if(defined($config->{individual}) && /^#CHROM/) { + my @split = split /\s+/; + + # no individuals + die("ERROR: No individual data found in VCF\n") if scalar @split <= 9; + + # get individual column indices + my %ind_cols = map {$split[$_] => $_} (9..$#split); + + # all? + if(scalar @{$config->{individual}} == 1 && $config->{individual}->[0] =~ /^all$/i) { + $config->{ind_cols} = \%ind_cols; + } + else { + my %new_ind_cols; + + # check we have specified individual(s) + foreach my $ind(@{$config->{individual}}) { + die("ERROR: Individual named \"$ind\" not found in VCF\n") unless defined $ind_cols{$ind}; + $new_ind_cols{$ind} = $ind_cols{$ind}; + } + + $config->{ind_cols} = \%new_ind_cols; + } + } + + next; + } + + # configure output file + $config->{out_file_handle} ||= &get_out_file_handle($config); + + # some lines (pileup) may actually parse out into more than one variant + foreach my $vf(@{&parse_line($config, $_)}) { + + $vf->{_line} = $_ ;#if defined($config->{vcf}) || defined($config->{original}); + + # now get the slice + if(!defined($vf->{slice})) { + my $slice; + + # don't get slices if we're using cache + # we can steal them from transcript objects later + if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) { + + # check if we have fetched this slice already + if(defined $slice_cache{$vf->{chr}}) { + $slice = $slice_cache{$vf->{chr}}; + } + + # if not create a new one + else { + + $slice = &get_slice($config, $vf->{chr}); + + # if failed, warn and skip this line + if(!defined($slice)) { + warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet}; + next; + } + + # store the hash + $slice_cache{$vf->{chr}} = $slice; + } + } + + $vf->{slice} = $slice; + } + + # validate the VF + next unless validate_vf($config, $vf); + + # make a name if one doesn't exist + $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.($vf->{allele_string} || $vf->{class_SO_term}); + + # jump out to convert here + if(defined($config->{convert})) { + &convert_vf($config, $vf); + next; + } + + if(defined $config->{whole_genome}) { + push @vfs, $vf; + $vf_count++; + $total_vf_count++; + + if($vf_count == $config->{buffer_size}) { + debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); + + print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; + + # calculate stats + my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); + my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); + $config->{last_time} = time(); + + debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); + + @vfs = (); + $vf_count = 0; + } + } + else { + print_line($config, $_) foreach @{vf_to_consequences($config, $vf)}; + $vf_count++; + $total_vf_count++; + debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose}); + } + } + } + + # if in whole-genome mode, finish off the rest of the buffer + if(defined $config->{whole_genome} && scalar @vfs) { + debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); + + print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)}; + + # calculate stats + my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1)); + my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1)); + $config->{last_time} = time(); + + debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet}); + + debug($config->{filter_count}, "/$total_vf_count variants remain after filtering") if defined($config->{filter}) && !defined($config->{quiet}); + } + + debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet}); + + debug("Finished!") unless defined $config->{quiet}; +} + +# sets up configuration hash that is used throughout the script +sub configure { + my $args = shift; + + my $config = {}; + + GetOptions( + $config, + 'help', # displays help message + + # input options, + 'config=s', # config file name + 'input_file|i=s', # input file name + 'format=s', # input file format + + # DB options + 'species=s', # species e.g. human, homo_sapiens + 'registry=s', # registry file + 'host=s', # database host + 'port=s', # database port + 'user=s', # database user name + 'password=s', # database password + 'db_version=i', # Ensembl database version to use e.g. 62 + 'genomes', # automatically sets DB params for e!Genomes + 'refseq', # use otherfeatures RefSeq DB instead of Ensembl + #'no_disconnect', # disables disconnect_when_inactive + + # runtime options + 'most_severe', # only return most severe consequence + 'summary', # only return one line per variation with all consquence types + 'per_gene', # only return most severe per gene + 'buffer_size=i', # number of variations to read in before analysis + 'chunk_size=s', # size in bases of "chunks" used in internal hash structure + 'failed=i', # include failed variations when finding existing + 'no_whole_genome', # disables now default whole-genome mode + 'whole_genome', # proxy for whole genome mode - now just warns user + 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG) + 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT + 'check_ref', # check supplied reference allele against DB + 'check_existing', # find existing co-located variations + 'check_svs', # find overlapping structural variations + 'check_alleles', # only attribute co-located if alleles are the same + 'check_frequency', # enable frequency checking + 'gmaf', # add global MAF of existing var + 'freq_filter=s', # exclude or include + 'freq_freq=f', # frequency to filter on + 'freq_gt_lt=s', # gt or lt (greater than or less than) + 'freq_pop=s', # population to filter on + 'allow_non_variant', # allow non-variant VCF lines through + 'individual=s', # give results by genotype for individuals + 'phased', # force VCF genotypes to be interpreted as phased + 'fork=i', # fork into N processes + + # verbosity options + 'verbose|v', # print out a bit more info while running + 'quiet', # print nothing to STDOUT (unless using -o stdout) + 'no_progress', # don't display progress bars + + # output options + 'everything|e', # switch on EVERYTHING :-) + 'output_file|o=s', # output file name + 'force_overwrite', # force overwrite of output file if already exists + 'terms|t=s', # consequence terms to use e.g. NCBI, SO + 'coding_only', # only return results for consequences in coding regions + 'canonical', # indicates if transcript is canonical + 'ccds', # output CCDS identifer + 'xref_refseq', # output refseq mrna xref + 'protein', # add e! protein ID to extra column + 'hgnc', # add HGNC gene ID to extra column + 'hgvs', # add HGVS names to extra column + 'sift=s', # SIFT predictions + 'polyphen=s', # PolyPhen predictions + 'condel=s', # Condel predictions + 'regulatory', # enable regulatory stuff + 'cell_type=s', # filter cell types for regfeats + 'convert=s', # convert input to another format (doesn't run VEP) + 'filter=s', # run in filtering mode + 'no_intergenic', # don't print out INTERGENIC consequences + 'gvf', # produce gvf output + 'vcf', # produce vcf output + 'original', # produce output in input format + 'no_consequences', # don't calculate consequences + 'lrg', # enable LRG-based features + 'fields=s', # define your own output fields + 'domains', # output overlapping protein features + 'numbers', # include exon and intron numbers + + # cache stuff + 'cache', # use cache + 'write_cache', # enables writing to the cache + 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs + 'no_adaptor_cache', # don't write adaptor cache + 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript + 'strip', # strips adaptors etc from objects before caching them + 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more + 'dir=s', # dir where cache is found (defaults to $HOME/.vep/) + 'cache_region_size=i', # size of region in bases for each cache file + 'no_slice_cache', # tell API not to cache features on slice + 'standalone', # standalone renamed offline + 'offline', # offline mode uses minimal set of modules installed in same dir, no DB connection + 'skip_db_check', # don't compare DB parameters with cached + 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc" + 'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped file with annotation + 'tmpdir=s', # tmp dir used for BigWig retrieval + 'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory + + # debug + 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL + 'count_queries', # counts SQL queries executed + 'admin', # allows me to build off public hosts + 'debug', # print out debug info + 'tabix', # experimental use tabix cache files + ) or die "ERROR: Failed to parse command-line flags\n"; + + # print usage message if requested or no args supplied + if(defined($config->{help}) || !$args) { + &usage; + exit(0); + } + + # dir is where the cache and plugins live + $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep'); + + # dir gets set to the specific cache directory later on, so take a copy to use + # when configuring plugins + + $config->{toplevel_dir} = $config->{dir}; + + # ini file? + my $ini_file = $config->{dir}.'/vep.ini'; + + if(-e $ini_file) { + read_config_from_file($config, $ini_file); + } + + # config file? + if(defined $config->{config}) { + read_config_from_file($config, $config->{config}); + } + + # can't be both quiet and verbose + die "ERROR: Can't be both quiet and verbose!\n" if defined($config->{quiet}) && defined($config->{verbose}); + + # check forking + if(defined($config->{fork})) { + die "ERROR: Fork number must be greater than 1\n" if $config->{fork} <= 1; + + # check we can use MIME::Base64 + eval q{ use MIME::Base64; }; + + if($@) { + debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet}); + delete $config->{fork}; + } + else { + + # try a practice fork + my $pid = fork; + + if(!defined($pid)) { + debug("WARNING: Fork test failed, forking disabled") unless defined($config->{quiet}); + delete $config->{fork}; + } + elsif($pid) { + waitpid($pid, 0); + } + elsif($pid == 0) { + exit(0); + } + } + } + + # check file format + if(defined $config->{format}) { + die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /^(pileup|vcf|guess|hgvs|ensembl|id|vep)$/i; + } + + # check convert format + if(defined $config->{convert}) { + die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup|hgvs/i; + } + + # check if user still using --standalone + if(defined $config->{standalone}) { + die "ERROR: --standalone replaced by --offline\n"; + } + + # connection settings for Ensembl Genomes + if($config->{genomes}) { + $config->{host} ||= 'mysql.ebi.ac.uk'; + $config->{port} ||= 4157; + } + + # connection settings for main Ensembl + else { + $config->{species} ||= "homo_sapiens"; + $config->{host} ||= 'ensembldb.ensembl.org'; + $config->{port} ||= 5306; + } + + # refseq or core? + if(defined($config->{refseq})) { + $config->{core_type} = 'otherfeatures'; + } + else { + $config->{core_type} = 'core'; + } + + # output term + if(defined $config->{terms}) { + die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i; + if($config->{terms} =~ /ensembl|display/i) { + $config->{terms} = 'display'; + } + else { + $config->{terms} = uc($config->{terms}); + } + } + + # everything? + if(defined($config->{everything})) { + my %everything = ( + sift => 'b', + polyphen => 'b', + ccds => 1, + hgvs => 1, + hgnc => 1, + numbers => 1, + domains => 1, + regulatory => 1, + canonical => 1, + protein => 1, + gmaf => 1, + ); + + $config->{$_} = $everything{$_} for keys %everything; + + # these ones won't work with offline + delete $config->{hgvs} if defined($config->{offline}); + } + + # check nsSNP tools + foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) { + die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/; + + die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i; + + die "ERROR: $tool functionality is now available as a VEP Plugin - see http://www.ensembl.org/info/docs/variation/vep/vep_script.html#plugins\n" if $tool eq 'Condel'; + } + + # force quiet if outputting to STDOUT + if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { + delete $config->{verbose} if defined($config->{verbose}); + $config->{quiet} = 1; + } + + # individual(s) specified? + if(defined($config->{individual})) { + $config->{individual} = [split /\,/, $config->{individual}]; + + # force allow_non_variant + $config->{allow_non_variant} = 1; + } + + # summarise options if verbose + if(defined $config->{verbose}) { + my $header =<<INTRO; +#----------------------------------# +# ENSEMBL VARIANT EFFECT PREDICTOR # +#----------------------------------# + +version $VERSION + +By Will McLaren (wm2\@ebi.ac.uk) + +Configuration options: + +INTRO + print $header; + + my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1]; + + foreach my $key(sort keys %$config) { + next if ref($config->{$key}) eq 'ARRAY' && scalar @{$config->{$key}} == 0; + print $key.(' ' x (($max_length - length($key)) + 4)).(ref($config->{$key}) eq 'ARRAY' ? join "\t", @{$config->{$key}} : $config->{$key})."\n"; + } + + print "\n".("-" x 20)."\n\n"; + } + + # check custom annotations + for my $i(0..$#{$config->{custom}}) { + my $custom = $config->{custom}->[$i]; + + my ($filepath, $shortname, $format, $type, $coords) = split /\,/, $custom; + $type ||= 'exact'; + $format ||= 'bed'; + $coords ||= 0; + + # check type + die "ERROR: Type $type for custom annotation file $filepath is not allowed (must be one of \"exact\", \"overlap\")\n" unless $type =~ /exact|overlap/; + + # check format + die "ERROR: Format $format for custom annotation file $filepath is not allowed (must be one of \"bed\", \"vcf\", \"gtf\", \"gff\", \"bigwig\")\n" unless $format =~ /bed|vcf|gff|gtf|bigwig/; + + # bigwig format + if($format eq 'bigwig') { + # check for bigWigToWig + die "ERROR: bigWigToWig does not seem to be in your path - this is required to use bigwig format custom annotations\n" unless `which bigWigToWig 2>&1` =~ /bigWigToWig$/; + } + + else { + # check for tabix + die "ERROR: tabix does not seem to be in your path - this is required to use custom annotations\n" unless `which tabix 2>&1` =~ /tabix$/; + + # remote files? + if($filepath =~ /tp\:\/\//) { + my $remote_test = `tabix $filepath 1:1-1 2>&1`; + if($remote_test =~ /fail/) { + die "$remote_test\nERROR: Could not find file or index file for remote annotation file $filepath\n"; + } + elsif($remote_test =~ /get_local_version/) { + debug("Downloaded tabix index file for remote annotation file $filepath") unless defined($config->{quiet}); + } + } + + # check files exist + else { + die "ERROR: Custom annotation file $filepath not found\n" unless -e $filepath; + die "ERROR: Tabix index file $filepath\.tbi not found - perhaps you need to create it first?\n" unless -e $filepath.'.tbi'; + } + } + + $config->{custom}->[$i] = { + 'file' => $filepath, + 'name' => $shortname || 'CUSTOM'.($i + 1), + 'type' => $type, + 'format' => $format, + 'coords' => $coords, + }; + } + + # check if using filter and original + die "ERROR: You must also provide output filters using --filter to use --original\n" if defined($config->{original}) && !defined($config->{filter}); + + # filter by consequence? + if(defined($config->{filter})) { + + my %filters = map {$_ => 1} split /\,/, $config->{filter}; + + # add in shortcuts + foreach my $filter(keys %filters) { + my $value = 1; + if($filter =~ /^no_/) { + delete $filters{$filter}; + $filter =~ s/^no_//g; + $value = 0; + $filters{$filter} = $value; + } + + if(defined($FILTER_SHORTCUTS{$filter})) { + delete $filters{$filter}; + $filters{$_} = $value for keys %{$FILTER_SHORTCUTS{$filter}}; + } + } + + $config->{filter} = \%filters; + + $config->{filter_count} = 0; + } + + # set defaults + $config->{user} ||= 'anonymous'; + $config->{buffer_size} ||= 5000; + $config->{chunk_size} ||= '50kb'; + $config->{output_file} ||= "variant_effect_output.txt"; + $config->{tmpdir} ||= '/tmp'; + $config->{format} ||= 'guess'; + $config->{terms} ||= 'SO'; + $config->{cache_region_size} ||= 1000000; + $config->{compress} ||= 'zcat'; + + # regulatory has to be on for cell_type + if(defined($config->{cell_type})) { + $config->{regulatory} = 1; + $config->{cell_type} = [split /\,/, $config->{cell_type}] if defined($config->{cell_type}); + } + + # can't use a whole bunch of options with most_severe + if(defined($config->{most_severe})) { + foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains summary)) { + die "ERROR: --most_severe is not compatible with --$flag\n" if defined($config->{$flag}); + } + } + + # can't use a whole bunch of options with summary + if(defined($config->{summary})) { + foreach my $flag(qw(no_intergenic protein hgnc sift polyphen coding_only ccds canonical xref_refseq numbers domains most_severe)) { + die "ERROR: --summary is not compatible with --$flag\n" if defined($config->{$flag}); + } + } + + # frequency filtering + if(defined($config->{check_frequency})) { + foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) { + die "ERROR: To use --check_frequency you must also specify flag --$flag\n" unless defined $config->{$flag}; + } + + # need to set check_existing + $config->{check_existing} = 1; + } + + $config->{check_existing} = 1 if defined $config->{check_alleles} || defined $config->{gmaf}; + + # warn users still using whole_genome flag + if(defined($config->{whole_genome})) { + debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet}); + } + + $config->{whole_genome} = 1 unless defined $config->{no_whole_genome}; + $config->{failed} = 0 unless defined $config->{failed}; + $config->{chunk_size} =~ s/mb?/000000/i; + $config->{chunk_size} =~ s/kb?/000/i; + $config->{cache_region_size} =~ s/mb?/000000/i; + $config->{cache_region_size} =~ s/kb?/000/i; + + # cluck and display executed SQL? + $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck}); + + # offline needs cache, can't use HGVS + if(defined($config->{offline})) { + $config->{cache} = 1; + + #die("ERROR: Cannot generate HGVS coordinates in offline mode\n") if defined($config->{hgvs}); + die("ERROR: Cannot use HGVS as input in offline mode\n") if $config->{format} eq 'hgvs'; + die("ERROR: Cannot use variant identifiers as input in offline mode\n") if $config->{format} eq 'id'; + die("ERROR: Cannot do frequency filtering in offline mode\n") if defined($config->{check_frequency}); + die("ERROR: Cannot retrieve overlapping structural variants in offline mode\n") if defined($config->{check_sv}); + } + + # write_cache needs cache + $config->{cache} = 1 if defined $config->{write_cache}; + + # no_slice_cache, prefetch and whole_genome have to be on to use cache + if(defined($config->{cache})) { + $config->{prefetch} = 1; + $config->{no_slice_cache} = 1; + $config->{whole_genome} = 1; + $config->{strip} = 1; + } + + $config->{build} = $config->{rebuild} if defined($config->{rebuild}); + + # force options for full build + if(defined($config->{build})) { + $config->{prefetch} = 1; + $config->{hgnc} = 1; + $config->{no_slice_cache} = 1; + $config->{cache} = 1; + $config->{strip} = 1; + $config->{write_cache} = 1; + $config->{cell_type} = 1 if defined($config->{regulatory}); + } + + # connect to databases + $config->{reg} = &connect_to_dbs($config); + + # complete dir with species name and db_version + $config->{dir} .= '/'.( + join '/', ( + defined($config->{offline}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}), + $config->{db_version} || $config->{reg}->software_version + ) + ); + + # warn user cache directory doesn't exist + if(!-e $config->{dir}) { + + # if using write_cache + if(defined($config->{write_cache})) { + debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet}); + } + + # want to read cache, not found + elsif(defined($config->{cache})) { + die("ERROR: Cache directory ", $config->{dir}, " not found"); + } + } + + if(defined($config->{cache})) { + # read cache info + if(read_cache_info($config)) { + debug("Read existing cache info") unless defined $config->{quiet}; + } + } + + # we configure plugins here because they can sometimes switch on the + # regulatory config option + configure_plugins($config); + + # include regulatory modules if requested + if(defined($config->{regulatory})) { + # do the use statements here so that users don't have to have the + # funcgen API installed to use the rest of the script + eval q{ + use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor; + use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor; + use Bio::EnsEMBL::Funcgen::MotifFeature; + use Bio::EnsEMBL::Funcgen::RegulatoryFeature; + use Bio::EnsEMBL::Funcgen::BindingMatrix; + }; + + if($@) { + die("ERROR: Ensembl Funcgen API must be installed to use --regulatory or plugins that deal with regulatory features\n"); + } + } + + # user defined custom output fields + if(defined($config->{fields})) { + $config->{fields} = [split ',', $config->{fields}]; + debug("Output fields redefined (".scalar @{$config->{fields}}." defined)") unless defined($config->{quiet}); + $config->{fields_redefined} = 1; + } + $config->{fields} ||= \@OUTPUT_COLS; + + # suppress warnings that the FeatureAdpators spit if using no_slice_cache + Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache}); + + # get adaptors (don't get them in offline mode) + unless(defined($config->{offline})) { + + if(defined($config->{cache}) && !defined($config->{write_cache})) { + + # try and load adaptors from cache + if(!&load_dumped_adaptor_cache($config)) { + &get_adaptors($config); + &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); + } + + # check cached adaptors match DB params + else { + my $dbc = $config->{sa}->{dbc}; + + my $ok = 1; + + if($dbc->{_host} ne $config->{host}) { + + # ens-livemirror, useastdb and ensembldb should all have identical DBs + unless( + ( + $dbc->{_host} eq 'ens-livemirror' + || $dbc->{_host} eq 'ensembldb.ensembl.org' + || $dbc->{_host} eq 'useastdb.ensembl.org' + ) && ( + $config->{host} eq 'ens-livemirror' + || $config->{host} eq 'ensembldb.ensembl.org' + || $config->{host} eq 'useastdb.ensembl.org' + ) + ) { + $ok = 0; + } + + unless(defined($config->{skip_db_check})) { + # but we still need to reconnect + debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet}); + + &get_adaptors($config); + } + } + + if(!$ok) { + if(defined($config->{skip_db_check})) { + debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet}); + } + else { + die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set"; + } + } + } + } + else { + &get_adaptors($config); + &dump_adaptor_cache($config) if defined($config->{write_cache}) && !defined($config->{no_adaptor_cache}); + } + + # reg adaptors (only fetches if not retrieved from cache already) + &get_reg_adaptors($config) if defined($config->{regulatory}); + } + + # check cell types + if(defined($config->{cell_type}) && !defined($config->{build})) { + my $cls = ''; + + if(defined($config->{cache})) { + $cls = $config->{cache_cell_types}; + } + else { + my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor(); + $cls = join ",", map {$_->name} @{$cta->fetch_all}; + } + + foreach my $cl(@{$config->{cell_type}}) { + die "ERROR: cell type $cl not recognised; available cell types are:\n$cls\n" unless $cls =~ /(^|,)$cl(,|$)/; + } + } + + # get terminal width for progress bars + unless(defined($config->{quiet})) { + my $width; + + # module may not be installed + eval q{ + use Term::ReadKey; + }; + + if(!$@) { + my ($w, $h); + + # module may be installed, but e.g. + eval { + #($w, $h) = GetTerminalSize(); + $w = 167; + $h = 30; + }; + + $width = $w if defined $w; + } + + $width ||= 60; + $width -= 12; + $config->{terminal_width} = $width; + } + + # jump out to build cache if requested + if(defined($config->{build})) { + + if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) { + die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n"); + } + + # build the cache + debug("Building cache for ".$config->{species}) unless defined($config->{quiet}); + build_full_cache($config); + + # exit script + debug("Finished building cache") unless defined($config->{quiet}); + exit(0); + } + + + # warn user DB will be used for SIFT/PolyPhen/HGVS/frequency/LRG + if(defined($config->{cache})) { + + # these two def depend on DB + foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency lrg check_sv)) { + debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet}); + } + + # as does using HGVS or IDs as input + debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet}); + + # the rest may be in the cache + foreach my $param(grep {defined $config->{$_}} qw(sift polyphen regulatory)) { + next if defined($config->{'cache_'.$param}); + debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet}); + } + } + + # get list of chrs if supplied + if(defined($config->{chr})) { + my %chrs; + + foreach my $val(split /\,/, $config->{chr}) { + my @nnn = split /\-/, $val; + + foreach my $chr($nnn[0]..$nnn[-1]) { + $chrs{$chr} = 1; + } + } + + $config->{chr} = \%chrs; + } + + # get input file handle + $config->{in_file_handle} = &get_in_file_handle($config); + + return $config; +} + +# reads config from a file +sub read_config_from_file { + my $config = shift; + my $file = shift; + + open CONFIG, $file or die "ERROR: Could not open config file \"$file\"\n"; + + while(<CONFIG>) { + next if /^\#/; + my @split = split /\s+|\=/; + my $key = shift @split; + $key =~ s/^\-//g; + + if(defined($config->{$key}) && ref($config->{$key}) eq 'ARRAY') { + push @{$config->{$key}}, @split; + } + else { + $config->{$key} ||= $split[0]; + } + } + + close CONFIG; + + # force quiet if outputting to STDOUT + if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { + delete $config->{verbose} if defined($config->{verbose}); + $config->{quiet} = 1; + } + + debug("Read configuration from $file") unless defined($config->{quiet}); +} + +# configures custom VEP plugins +sub configure_plugins { + + my $config = shift; + + $config->{plugins} = []; + + if (my @plugins = @{ $config->{plugin} }) { + + # add the Plugins directory onto @INC + + unshift @INC, $config->{toplevel_dir}."/Plugins"; + + for my $plugin (@plugins) { + + # parse out the module name and parameters + + my ($module, @params) = split /,/, $plugin; + + # check we can use the module + + eval qq{ + use $module; + }; + if ($@) { + debug("Failed to compile plugin $module: $@") unless defined($config->{quiet}); + next; + } + + # now check we can instantiate it, passing any parameters to the constructor + + my $instance; + + eval { + $instance = $module->new($config, @params); + }; + if ($@) { + debug("Failed to instantiate plugin $module: $@") unless defined($config->{quiet}); + next; + } + + # check that the versions match + + my $plugin_version; + + if ($instance->can('version')) { + $plugin_version = $instance->version; + } + + my $version_ok = 1; + + if ($plugin_version) { + my ($plugin_major, $plugin_minor, $plugin_maintenance) = split /\./, $plugin_version; + my ($major, $minor, $maintenance) = split /\./, $VERSION; + + if ($plugin_major != $major) { + debug("Warning: plugin $plugin version ($plugin_version) does not match the current VEP version ($VERSION)") unless defined($config->{quiet}); + $version_ok = 0; + } + } + else { + debug("Warning: plugin $plugin does not define a version number") unless defined($config->{quiet}); + $version_ok = 0; + } + + debug("You may experience unexpected behaviour with this plugin") unless defined($config->{quiet}) || $version_ok; + + # check that it implements all necessary methods + + for my $required(qw(run get_header_info check_feature_type check_variant_feature_type)) { + unless ($instance->can($required)) { + debug("Plugin $module doesn't implement a required method '$required', does it inherit from BaseVepPlugin?") unless defined($config->{quiet}); + next; + } + } + + # all's good, so save the instance in our list of plugins + + push @{ $config->{plugins} }, $instance; + + debug("Loaded plugin: $module") unless defined($config->{quiet}); + + # for convenience, check if the plugin wants regulatory stuff and turn on the config option if so + + if (grep { $_ =~ /motif|regulatory/i } @{ $instance->feature_types }) { + debug("Fetching regulatory features for plugin: $module") unless defined($config->{quiet}); + $config->{regulatory} = 1; + } + } + } +} + +# connects to DBs (not done in offline mode) +sub connect_to_dbs { + my $config = shift; + + # get registry + my $reg = 'Bio::EnsEMBL::Registry'; + + unless(defined($config->{offline})) { + # load DB options from registry file if given + if(defined($config->{registry})) { + debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet}); + $reg->load_all( + $config->{registry}, + $config->{verbose}, + undef, + $config->{no_slice_cache} + ); + } + + # otherwise manually connect to DB server + else { + $reg->load_registry_from_db( + -host => $config->{host}, + -user => $config->{user}, + -pass => $config->{password}, + -port => $config->{port}, + -db_version => $config->{db_version}, + -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef, + -verbose => $config->{verbose}, + -no_cache => $config->{no_slice_cache}, + ); + } + + eval { $reg->set_reconnect_when_lost() }; + + if(defined($config->{verbose})) { + # get a meta container adaptors to check version + my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer'); + my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer'); + + if($core_mca && $var_mca) { + debug( + "Connected to core version ", $core_mca->get_schema_version, " database ", + "and variation version ", $var_mca->get_schema_version, " database" + ); + } + } + } + + return $reg; +} + +# get adaptors from DB +sub get_adaptors { + my $config = shift; + + die "ERROR: No registry" unless defined $config->{reg}; + + $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature'); + $config->{svfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'structuralvariationfeature'); + $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation'); + $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix'); + $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation'); + + # get fake ones for species with no var DB + if(!defined($config->{vfa})) { + $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species}); + $config->{svfa} = Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor->new_fake($config->{species}); + $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species}); + } + + $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice'); + $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene'); + $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript'); + $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer'); + $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem'); + + # cache schema version + $config->{mca}->get_schema_version if defined $config->{mca}; + + # check we got slice adaptor - can't continue without a core DB + die("ERROR: Could not connect to core database\n") unless defined $config->{sa}; +} + +# gets regulatory adaptors +sub get_reg_adaptors { + my $config = shift; + + foreach my $type(@REG_FEAT_TYPES) { + next if defined($config->{$type.'_adaptor'}); + + my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type); + if(defined($adaptor)) { + $config->{$type.'_adaptor'} = $adaptor; + } + else { + delete $config->{regulatory}; + last; + } + } +} + +# gets file handle for input +sub get_in_file_handle { + my $config = shift; + + # define the filehandle to read input from + my $in_file_handle = new FileHandle; + + if(defined($config->{input_file})) { + + # check defined input file exists + die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file}; + + if($config->{input_file} =~ /\.gz$/){ + $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); + } + else { + $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); + } + } + + # no file specified - try to read data off command line + else { + $in_file_handle = 'STDIN'; + debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet}; + } + + return $in_file_handle; +} + +# gets file handle for output and adds header +sub get_out_file_handle { + my $config = shift; + + # define filehandle to write to + my $out_file_handle = new FileHandle; + + # check if file exists + if(-e $config->{output_file} && !defined($config->{force_overwrite})) { + # die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with -- force_overwrite\n"); + } + + if($config->{output_file} =~ /stdout/i) { + $out_file_handle = *STDOUT; + } + else { + $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n"); + } + + # define headers for a VCF file + my @vcf_headers = ( + '#CHROM', + 'POS', + 'ID', + 'REF', + 'ALT', + 'QUAL', + 'FILTER', + 'INFO' + ); + + # file conversion, don't want to add normal headers + if(defined($config->{convert})) { + # header for VCF + if($config->{convert} =~ /vcf/i) { + print $out_file_handle "##fileformat=VCFv4.0\n"; + print $out_file_handle join "\t", @vcf_headers; + print $out_file_handle "\n"; + } + + return $out_file_handle; + } + + # GVF output, no header + elsif(defined($config->{gvf}) || defined($config->{original})) { + print $out_file_handle join "\n", @{$config->{headers}} if defined($config->{headers}) && defined($config->{original}); + return $out_file_handle; + } + + elsif(defined($config->{vcf})) { + + # create an info string for the VCF header + my @new_headers; + + # if the user has defined the fields themselves, we don't need to worry + if(defined $config->{fields_redefined}) { + @new_headers = @{$config->{fields}}; + } + else { + @new_headers = ( + + # get default headers, minus variation name and location (already encoded in VCF) + grep { + $_ ne 'Uploaded_variation' and + $_ ne 'Location' and + $_ ne 'Extra' + } @{$config->{fields}}, + + # get extra headers + map {@{$extra_headers{$_}}} + grep {defined $config->{$_}} + keys %extra_headers + ); + + # plugin headers + foreach my $plugin_header(split /\n/, get_plugin_headers($config)) { + $plugin_header =~ /\#\# (.+?)\t\:.+/; + push @new_headers, $1; + } + + # redefine the main headers list in config + $config->{fields} = \@new_headers; + } + + # add the newly defined headers as a header to the VCF + my $string = join '|', @{$config->{fields}}; + my @vcf_info_strings = ('##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: '.$string.'">'); + + # add custom headers + foreach my $custom(@{$config->{custom}}) { + push @vcf_info_strings, '##INFO=<ID='.$custom->{name}.',Number=.,Type=String,Description="'.$custom->{file}.' ('.$custom->{type}.')">'; + } + + # if this is already a VCF file, we need to add our new headers in the right place + if(defined($config->{headers})) { + + for my $i(0..$#{$config->{headers}}) { + if($config->{headers}->[$i] =~ /^\#CHROM\s+POS\s+ID/) { + splice(@{$config->{headers}}, $i, 0, @vcf_info_strings); + } + } + + print $out_file_handle join "\n", @{$config->{headers}}; + print $out_file_handle "\n"; + } + + else { + print $out_file_handle "##fileformat=VCFv4.0\n"; + print $out_file_handle join "\n", @vcf_info_strings; + print $out_file_handle "\n"; + print $out_file_handle join "\t", @vcf_headers; + print $out_file_handle "\n"; + } + + return $out_file_handle; + } + + # make header + my $time = &get_time; + my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca}; + $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache}); + my $version_string = + "Using API version ".$config->{reg}->software_version. + ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?'); + + # add key for extra column headers based on config + my $extra_column_keys = join "\n", + map {'## '.$_.' : '.$extra_descs{$_}} + sort map {@{$extra_headers{$_}}} + grep {defined $config->{$_}} + keys %extra_headers; + + my $header =<<HEAD; +## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION +## Output produced at $time +## Connected to $db_string +## $version_string +## Extra column keys: +$extra_column_keys +HEAD + + $header .= get_plugin_headers($config); + + # add headers + print $out_file_handle $header; + + # add custom data defs + if(defined($config->{custom})) { + foreach my $custom(@{$config->{custom}}) { + print $out_file_handle '## '.$custom->{name}."\t: ".$custom->{file}.' ('.$custom->{type}.")\n"; + } + } + + # add column headers + print $out_file_handle '#', (join "\t", @{$config->{fields}}); + print $out_file_handle "\n"; + + return $out_file_handle; +} + +sub get_plugin_headers { + + my $config = shift; + + my $header = ""; + + for my $plugin (@{ $config->{plugins} }) { + if (my $hdr = $plugin->get_header_info) { + for my $key (keys %$hdr) { + my $val = $hdr->{$key}; + + $header .= "## $key\t: $val\n"; + } + } + } + + return $header; +} + +# convert a variation feature to a line of output +sub convert_vf { + my $config = shift; + my $vf = shift; + + my $convert_method = 'convert_to_'.lc($config->{convert}); + my $method_ref = \&$convert_method; + + my $line = &$method_ref($config, $vf); + my $handle = $config->{out_file_handle}; + + if(scalar @$line) { + print $handle join "\t", @$line; + print $handle "\n"; + } +} + +# converts to Ensembl format +sub convert_to_ensembl { + my $config = shift; + my $vf = shift; + + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start, + $vf->end, + $vf->allele_string, + $vf->strand, + $vf->variation_name + ]; +} + + +# converts to pileup format +sub convert_to_pileup { + my $config = shift; + my $vf = shift; + + # look for imbalance in the allele string + my %allele_lengths; + my @alleles = split /\//, $vf->allele_string; + + foreach my $allele(@alleles) { + $allele =~ s/\-//g; + $allele_lengths{length($allele)} = 1; + } + + # in/del + if(scalar keys %allele_lengths > 1) { + + if($vf->allele_string =~ /\-/) { + + # insertion? + if($alleles[0] eq '-') { + shift @alleles; + + for my $i(0..$#alleles) { + $alleles[$i] =~ s/\-//g; + $alleles[$i] = '+'.$alleles[$i]; + } + } + + else { + @alleles = grep {$_ ne '-'} @alleles; + + for my $i(0..$#alleles) { + $alleles[$i] =~ s/\-//g; + $alleles[$i] = '-'.$alleles[$i]; + } + } + + @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles; + + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start - 1, + '*', + (join "/", @alleles), + ]; + } + + else { + warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet}); + return []; + } + + } + + # balanced sub + else { + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start, + shift @alleles, + (join "/", @alleles), + ]; + } +} + +# converts to HGVS (hackily returns many lines) +sub convert_to_hgvs { + my $config = shift; + my $vf = shift; + + # ensure we have a slice + $vf->{slice} ||= get_slice($config, $vf->{chr}); + + my $tvs = $vf->get_all_TranscriptVariations; + + my @return = values %{$vf->get_all_hgvs_notations()}; + + if(defined($tvs)) { + push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'c')}} @$tvs; + push @return, map {values %{$vf->get_all_hgvs_notations($_->transcript, 'p')}} @$tvs; + } + + return [join "\n", @return]; +} + +# prints a line of output from the hash +sub print_line { + my $config = shift; + my $line = shift; + return unless defined($line); + + my $output; + + # normal + if(ref($line) eq 'HASH') { + my %extra = %{$line->{Extra}}; + + $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} }; + + # if the fields have been redefined we need to search through in case + # any of the defined fields are actually part of the Extra hash + $output = join "\t", map { + (defined $line->{$_} ? $line->{$_} : (defined $extra{$_} ? $extra{$_} : '-')) + } @{$config->{fields}}; + } + + # gvf/vcf + else { + $output = $$line; + } + + my $fh = $config->{out_file_handle}; + print $fh "$output\n"; +} + +# outputs usage message +sub usage { + my $usage =<<END; +#----------------------------------# +# ENSEMBL VARIANT EFFECT PREDICTOR # +#----------------------------------# + +version $VERSION + +By Will McLaren (wm2\@ebi.ac.uk) + +http://www.ensembl.org/info/docs/variation/vep/vep_script.html + +Usage: +perl variant_effect_predictor.pl [arguments] + +Options +======= + +--help Display this message and quit +--verbose Display verbose output as the script runs [default: off] +--quiet Suppress status and warning messages [default: off] +--no_progress Suppress progress bars [default: off] + +--config Load configuration from file. Any command line options + specified overwrite those in the file [default: off] + +--everything Shortcut switch to turn on commonly used options. See web + documentation for details [default: off] + +--fork [num_forks] Use forking to improve script runtime [default: off] + +-i | --input_file Input file - if not specified, reads from STDIN. Files + may be gzip compressed. +--format Specify input file format - one of "ensembl", "pileup", + "vcf", "hgvs", "id" or "guess" to try and work out format. +-o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this + will force --quiet [default: "variant_effect_output.txt"] +--force_overwrite Force overwriting of output file [default: quit if file + exists] +--original Writes output as it was in input - must be used with --filter + since no consequence data is added [default: off] +--vcf Write output as VCF [default: off] +--gvf Write output as GVF [default: off] +--fields [field list] Define a custom output format by specifying a comma-separated + list of field names. Field names normally present in the + "Extra" field may also be specified, including those added by + plugin modules. Can also be used to configure VCF output + columns [default: off] + +--species [species] Species to use [default: "human"] + +-t | --terms Type of consequence terms to output - one of "SO", "ensembl" + [default: SO] + +--sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off] +--polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off] + +NB: SIFT and PolyPhen predictions are currently available for human only +NB: Condel support has been moved to a VEP plugin module - see documentation + +--regulatory Look for overlaps with regulatory regions. The script can + also call if a variant falls in a high information position + within a transcription factor binding site. Output lines have + a Feature type of RegulatoryFeature or MotifFeature + [default: off] +--cell_type [types] Report only regulatory regions that are found in the given cell + type(s). Can be a single cell type or a comma-separated list. + The functional type in each cell type is reported under + CELL_TYPE in the output. To retrieve a list of cell types, use + "--cell_type list" [default: off] + +NB: Regulatory consequences are currently available for human and mouse only + +--custom [file list] Add custom annotations from tabix-indexed files. See + documentation for full details [default: off] +--plugin [plugin_name] Use named plugin module [default: off] +--hgnc Add HGNC gene identifiers to output [default: off] +--hgvs Output HGVS identifiers (coding and protein). Requires database + connection [default: off] +--ccds Output CCDS transcript identifiers [default: off] +--xref_refseq Output aligned RefSeq mRNA identifier for transcript. NB: the + RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND + FREQUENTLY WILL NOT, match exactly in sequence, exon structure + and protein product [default: off] +--protein Output Ensembl protein identifer [default: off] +--canonical Indicate if the transcript for this consequence is the canonical + transcript for this gene [default: off] +--domains Include details of any overlapping protein domains [default: off] +--numbers Include exon & intron numbers [default: off] + +--no_intergenic Excludes intergenic consequences from the output [default: off] +--coding_only Only return consequences that fall in the coding region of + transcripts [default: off] +--most_severe Ouptut only the most severe consequence per variation. + Transcript-specific columns will be left blank. [default: off] +--summary Output only a comma-separated list of all consequences per + variation. Transcript-specific columns will be left blank. + [default: off] +--per_gene Output only the most severe consequence per gene. Where more + than one transcript has the same consequence, the transcript + chosen is arbitrary. [default: off] + + +--check_ref If specified, checks supplied reference allele against stored + entry in Ensembl Core database [default: off] +--check_existing If specified, checks for existing co-located variations in the + Ensembl Variation database [default: off] +--failed [0|1] Include (1) or exclude (0) variants that have been flagged as + failed by Ensembl when checking for existing variants. + [default: exclude] +--check_alleles If specified, the alleles of existing co-located variations + are compared to the input; an existing variation will only + be reported if no novel allele is in the input (strand is + accounted for) [default: off] +--check_svs Report overlapping structural variants [default: off] + +--filter [filters] Filter output by consequence type. Use this to output only + variants that have at least one consequence type matching the + filter. Multiple filters can be used separated by ",". By + combining this with --original it is possible to run the VEP + iteratively to progressively filter a set of variants. See + documentation for full details [default: off] + +--check_frequency Turns on frequency filtering. Use this to include or exclude + variants based on the frequency of co-located existing + variants in the Ensembl Variation database. You must also + specify all of the following --freq flags [default: off] +--freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap, + 1kg_yri for YRI 1000 genomes. See documentation for more + details +--freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5 +--freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or + less than (lt) --freq_freq +--freq_filter Specify whether variants that pass the above should be included + [exclude|include] or excluded from analysis +--gmaf Include global MAF of existing variant from 1000 Genomes + Phase 1 in output + +--individual [id] Consider only alternate alleles present in the genotypes of the + specified individual(s). May be a single individual, a comma- + separated list or "all" to assess all individuals separately. + Each individual and variant combination is given on a separate + line of output. Only works with VCF files containing individual + genotype data; individual IDs are taken from column headers. +--allow_non_variant Prints out non-variant lines when using VCF input +--phased Force VCF individual genotypes to be interpreted as phased. + For use with plugins that depend on phased state. + +--chr [list] Select a subset of chromosomes to analyse from your file. Any + data not on this chromosome in the input will be skipped. The + list can be comma separated, with "-" characters representing + a range e.g. 1-5,8,15,X [default: off] +--gp If specified, tries to read GRCh37 position from GP field in the + INFO column of a VCF file. Only applies when VCF is the input + format and human is the species [default: off] + +--convert Convert the input file to the output format specified. + [ensembl|vcf|pileup] Converted output is written to the file specified in + --output_file. No consequence calculation is carried out when + doing file conversion. [default: off] + +--refseq Use the otherfeatures database to retrieve transcripts - this + database contains RefSeq transcripts (as well as CCDS and + Ensembl EST alignments) [default: off] +--host Manually define database host [default: "ensembldb.ensembl.org"] +-u | --user Database username [default: "anonymous"] +--port Database port [default: 5306] +--password Database password [default: no password] +--genomes Sets DB connection params for Ensembl Genomes [default: off] +--registry Registry file to use defines DB connections [default: off] + Defining a registry file overrides above connection settings. +--db_version=[number] Force script to load DBs from a specific Ensembl version. Not + advised due to likely incompatibilities between API and DB + +--no_whole_genome Run in old-style, non-whole genome mode [default: off] +--buffer_size Sets the number of variants sent in each batch [default: 5000] + Increasing buffer size can retrieve results more quickly + but requires more memory. Only applies to whole genome mode. + +--cache Enables read-only use of cache [default: off] +--dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"] +--write_cache Enable writing to cache [default: off] +--build [all|list] Build a complete cache for the selected species. Build for all + chromosomes with --build all, or a list of chromosomes (see + --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS + VIOLATES OUR FAIR USAGE POLICY [default: off] + +--compress Specify utility to decompress cache files - may be "gzcat" or + "gzip -dc" Only use if default does not work [default: zcat] + +--skip_db_check ADVANCED! Force the script to use a cache built from a different + database than specified with --host. Only use this if you are + sure the hosts are compatible (e.g. ensembldb.ensembl.org and + useastdb.ensembl.org) [default: off] +--cache_region_size ADVANCED! The size in base-pairs of the region covered by one + file in the cache. [default: 1MB] +END + + print $usage; +}
