Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/.#variant_effect_predictor.pl.1.3 @ 0:21066c0abaf5 draft
Uploaded
| author | willmclaren |
|---|---|
| date | Fri, 03 Aug 2012 10:04:48 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/.#variant_effect_predictor.pl.1.3 Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,1192 @@ +#!/usr/bin/perl + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Variant Effect Predictor - a script to predict the consequences of genomic variants + +http://www.ensembl.org/info/docs/variation/vep/vep_script.html + +Version 2.2 + +by Will McLaren (wm2@ebi.ac.uk) +=cut + +use strict; +use Getopt::Long; +use FileHandle; +use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); +use Bio::EnsEMBL::Variation::Utils::VEP qw( + parse_line + vf_to_consequences + validate_vf + load_dumped_adaptor_cache + dump_adaptor_cache + get_all_consequences + get_slice + build_full_cache + read_cache_info + get_time + debug + @OUTPUT_COLS + @REG_FEAT_TYPES +); + +# global vars +my $VERSION = '2.2'; + +# set output autoflush for progress bars +$| = 1; + +# configure from command line opts +my $config = &configure(scalar @ARGV); + +# run the main sub routine +&main($config); + +# this is the main sub-routine - it needs the configured $config hash +sub main { + my $config = shift; + + debug("Starting...") unless defined $config->{quiet}; + + my $tr_cache = {}; + my $rf_cache = {}; + + # create a hash to hold slices so we don't get the same one twice + my %slice_cache = (); + + my @vfs; + my ($vf_count, $total_vf_count); + my $in_file_handle = $config->{in_file_handle}; + + # initialize line number in config + $config->{line_number} = 0; + + # read the file + while(<$in_file_handle>) { + chomp; + + $config->{line_number}++; + + # header line? + next if /^\#/; + + # some lines (pileup) may actually parse out into more than one variant + foreach my $vf(@{&parse_line($config, $_)}) { + + # validate the VF + next unless validate_vf($config, $vf); + + # now get the slice + if(!defined($vf->{slice})) { + my $slice; + + # don't get slices if we're using cache + # we can steal them from transcript objects later + if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) { + + # check if we have fetched this slice already + if(defined $slice_cache{$vf->{chr}}) { + $slice = $slice_cache{$vf->{chr}}; + } + + # if not create a new one + else { + + $slice = &get_slice($config, $vf->{chr}); + + # if failed, warn and skip this line + if(!defined($slice)) { + warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet}; + next; + } + + # store the hash + $slice_cache{$vf->{chr}} = $slice; + } + } + + $vf->{slice} = $slice; + } + + # make a name if one doesn't exist + $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.$vf->{allele_string}; + + # jump out to convert here + if(defined($config->{convert})) { + &convert_vf($config, $vf); + next; + } + + if(defined $config->{whole_genome}) { + push @vfs, $vf; + $vf_count++; + $total_vf_count++; + + if($vf_count == $config->{buffer_size}) { + debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); + + print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)}; + + debug("Processed $total_vf_count total variants") unless defined($config->{quiet}); + + @vfs = (); + $vf_count = 0; + } + } + else { + print_line($config, $_) foreach @{vf_to_consequences($config, $vf)}; + $vf_count++; + $total_vf_count++; + debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose}); + } + } + } + + # if in whole-genome mode, finish off the rest of the buffer + if(defined $config->{whole_genome} && scalar @vfs) { + debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); + + print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)}; + + debug("Processed $total_vf_count total variants") unless defined($config->{quiet}); + } + + debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet}); + + debug("Finished!") unless defined $config->{quiet}; +} + +# sets up configuration hash that is used throughout the script +sub configure { + my $args = shift; + + my $config = {}; + + GetOptions( + $config, + 'help', # displays help message + + # input options, + 'config=s', # config file name + 'input_file=s', # input file name + 'format=s', # input file format + + # DB options + 'species=s', # species e.g. human, homo_sapiens + 'registry=s', # registry file + 'host=s', # database host + 'port=s', # database port + 'user=s', # database user name + 'password=s', # database password + 'db_version=i', # Ensembl database version to use e.g. 62 + 'genomes', # automatically sets DB params for e!Genomes + 'refseq', # use otherfeatures RefSeq DB instead of Ensembl + #'no_disconnect', # disables disconnect_when_inactive + + # runtime options + 'most_severe', # only return most severe consequence + 'summary', # only return one line per variation with all consquence types + 'per_gene', # only return most severe per gene + 'buffer_size=i', # number of variations to read in before analysis + 'chunk_size=s', # size in bases of "chunks" used in internal hash structure + 'failed=i', # include failed variations when finding existing + 'no_whole_genome', # disables now default whole-genome mode + 'whole_genome', # proxy for whole genome mode - now just warns user + 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG) + 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT + 'check_ref', # check supplied reference allele against DB + 'check_existing', # find existing co-located variations + 'check_alleles', # only attribute co-located if alleles are the same + 'check_frequency', # enable frequency checking + 'freq_filter=s', # exclude or include + 'freq_freq=f', # frequency to filter on + 'freq_gt_lt=s', # gt or lt (greater than or less than) + 'freq_pop=s', # population to filter on + + # verbosity options + 'verbose', # print out a bit more info while running + 'quiet', # print nothing to STDOUT (unless using -o stdout) + 'no_progress', # don't display progress bars + + # output options + 'output_file=s', # output file name + 'force_overwrite', # force overwrite of output file if already exists + 'terms=s', # consequence terms to use e.g. NCBI, SO + 'coding_only', # only return results for consequences in coding regions + 'canonical', # indicates if transcript is canonical + 'protein', # add e! protein ID to extra column + 'hgnc', # add HGNC gene ID to extra column + 'hgvs', # add HGVS names to extra column + 'sift=s', # SIFT predictions + 'polyphen=s', # PolyPhen predictions + 'condel=s', # Condel predictions + 'gene', # force gene column to be populated (disabled by default, enabled when using cache) + 'regulatory', # enable regulatory stuff + 'convert=s', # convert input to another format (doesn't run VEP) + 'no_intergenic', # don't print out INTERGENIC consequences + 'gvf', # produce gvf output + + # cache stuff + 'cache', # use cache + 'write_cache', # enables writing to the cache + 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs + 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript + 'strip', # strips adaptors etc from objects before caching them + 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more + 'dir=s', # dir where cache is found (defaults to $HOME/.vep/) + 'cache_region_size=i', # size of region in bases for each cache file + 'no_slice_cache', # tell API not to cache features on slice + 'standalone', # standalone mode uses minimal set of modules installed in same dir, no DB connection + 'skip_db_check', # don't compare DB parameters with cached + 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc" + + # debug + 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL + 'count_queries', # counts SQL queries executed + 'admin', # allows me to build off public hosts + 'debug', # print out debug info + ); + + # print usage message if requested or no args supplied + if(defined($config->{help}) || !$args) { + &usage; + exit(0); + } + + # config file? + if(defined $config->{config}) { + + open CONFIG, $config->{config} or die "ERROR: Could not open config file \"".$config->{config}."\"\n"; + + while(<CONFIG>) { + next if /^\#/; + my ($key, $value) = split /\s+|\=/; + $key =~ s/^\-//g; + $config->{$key} = $value unless defined $config->{$key}; + } + + close CONFIG; + } + + # can't be both quiet and verbose + die "ERROR: Can't be both quiet and verbose!" if defined($config->{quiet}) && defined($config->{verbose}); + + # check file format + if(defined $config->{format}) { + die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /pileup|vcf|guess|hgvs|ensembl|id/i; + } + + # check convert format + if(defined $config->{convert}) { + die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup/i; + } + + # connection settings for Ensembl Genomes + if($config->{genomes}) { + $config->{host} ||= 'mysql.ebi.ac.uk'; + $config->{port} ||= 4157; + } + + # connection settings for main Ensembl + else { + $config->{species} ||= "homo_sapiens"; + $config->{host} ||= 'ensembldb.ensembl.org'; + $config->{port} ||= 5306; + } + + # refseq or core? + if(defined($config->{refseq})) { + die "ERROR: SIFT, PolyPhen and Condel predictions not available fore RefSeq transcripts" if defined $config->{sift} || defined $config->{polyphen} || defined $config->{condel}; + + $config->{core_type} = 'otherfeatures'; + } + else { + $config->{core_type} = 'core'; + } + + # output term + if(defined $config->{terms}) { + die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i; + if($config->{terms} =~ /ensembl|display/i) { + $config->{terms} = 'display'; + } + else { + $config->{terms} = uc($config->{terms}); + } + } + + # check nsSNP tools + foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) { + die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/; + + die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i; + + die "ERROR: $tool not available in standalone mode\n" if defined($config->{standalone}); + + # use V2 of the Condel algorithm, possibly gives fewer false positives + if($tool eq 'Condel' && $config->{lc($tool)} =~ /1$/) { + $Bio::EnsEMBL::Variation::Utils::Condel::USE_V2 = 0; + } + } + + # force quiet if outputting to STDOUT + if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { + delete $config->{verbose} if defined($config->{verbose}); + $config->{quiet} = 1; + } + + # summarise options if verbose + if(defined $config->{verbose}) { + my $header =<<INTRO; +#----------------------------------# +# ENSEMBL VARIANT EFFECT PREDICTOR # +#----------------------------------# + +version $VERSION + +By Will McLaren (wm2\@ebi.ac.uk) + +Configuration options: + +INTRO + print $header; + + my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1]; + + foreach my $key(sort keys %$config) { + print $key.(' ' x (($max_length - length($key)) + 4)).$config->{$key}."\n"; + } + + print "\n".("-" x 20)."\n\n"; + } + + # set defaults + $config->{user} ||= 'anonymous'; + $config->{buffer_size} ||= 5000; + $config->{chunk_size} ||= '50kb'; + $config->{output_file} ||= "variant_effect_output.txt"; + $config->{tmpdir} ||= '/tmp'; + $config->{format} ||= 'guess'; + $config->{terms} ||= 'display'; + $config->{gene} ||= 1 unless defined($config->{whole_genome}); + $config->{cache_region_size} ||= 1000000; + $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep'); + $config->{compress} ||= 'zcat'; + + # frequency filtering + if(defined($config->{check_frequency})) { + foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) { + die "ERROR: To use --check_frequency you must also specify flag --$flag" unless defined $config->{$flag}; + } + + # need to set check_existing + $config->{check_existing} = 1; + } + + $config->{check_existing} = 1 if defined $config->{check_alleles}; + + # warn users still using whole_genome flag + if(defined($config->{whole_genome})) { + debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet}); + } + + $config->{whole_genome} = 1 unless defined $config->{no_whole_genome}; + $config->{include_failed} = 1 unless defined $config->{include_failed}; + $config->{chunk_size} =~ s/mb?/000000/i; + $config->{chunk_size} =~ s/kb?/000/i; + $config->{cache_region_size} =~ s/mb?/000000/i; + $config->{cache_region_size} =~ s/kb?/000/i; + + # cluck and display executed SQL? + $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck}); + + # standalone needs cache, can't use HGVS + if(defined($config->{standalone})) { + $config->{cache} = 1; + + die("ERROR: Cannot generate HGVS coordinates in standalone mode") if defined($config->{hgvs}); + die("ERROR: Cannot use HGVS as input in standalone mode") if $config->{format} eq 'hgvs'; + die("ERROR: Cannot use variant identifiers as input in standalone mode") if $config->{format} eq 'id'; + die("ERROR: Cannot do frequency filtering in standalone mode") if defined($config->{check_frequency}); + } + + # write_cache needs cache + $config->{cache} = 1 if defined $config->{write_cache}; + + # no_slice_cache, prefetch and whole_genome have to be on to use cache + if(defined($config->{cache})) { + $config->{prefetch} = 1; + $config->{no_slice_cache} = 1; + $config->{whole_genome} = 1; + $config->{strip} = 1; + } + + $config->{build} = $config->{rebuild} if defined($config->{rebuild}); + + # force options for full build + if(defined($config->{build})) { + $config->{prefetch} = 1; + $config->{gene} = 1; + $config->{hgnc} = 1; + $config->{no_slice_cache} = 1; + $config->{cache} = 1; + $config->{strip} = 1; + $config->{write_cache} = 1; + } + + # connect to databases + $config->{reg} = &connect_to_dbs($config); + + # complete dir with species name and db_version + $config->{dir} .= '/'.( + join '/', ( + defined($config->{standalone}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}), + $config->{db_version} || $config->{reg}->software_version + ) + ); + + if(defined($config->{cache})) { + # read cache info + if(read_cache_info($config)) { + debug("Read existing cache info") unless defined $config->{quiet}; + } + } + + # include regulatory modules if requested + if(defined($config->{regulatory})) { + # do the use statements here so that users don't have to have the + # funcgen API install to use the rest of the script + eval q{ + use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor; + use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor; + use Bio::EnsEMBL::Funcgen::MotifFeature; + use Bio::EnsEMBL::Funcgen::RegulatoryFeature; + use Bio::EnsEMBL::Funcgen::BindingMatrix; + }; + + if($@) { + die("ERROR: Ensembl Funcgen API must be installed to use --regulatory\n"); + } + } + + # warn user cache directory doesn't exist + if(!-e $config->{dir}) { + + # if using write_cache + if(defined($config->{write_cache})) { + debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet}); + } + + # want to read cache, not found + elsif(defined($config->{cache})) { + die("ERROR: Cache directory ", $config->{dir}, " not found"); + } + } + + # suppress warnings that the FeatureAdpators spit if using no_slice_cache + Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache}); + + # get adaptors + if(defined($config->{cache}) && !defined($config->{write_cache})) { + + # try and load adaptors from cache + if(!&load_dumped_adaptor_cache($config)) { + &get_adaptors($config); + &dump_adaptor_cache($config) if defined($config->{write_cache}); + } + + # check cached adaptors match DB params + else { + my $dbc = $config->{sa}->{dbc}; + + my $ok = 1; + + if($dbc->{_host} ne $config->{host}) { + + # ens-livemirror, useastdb and ensembldb should all have identical DBs + unless( + ( + $dbc->{_host} eq 'ens-livemirror' + || $dbc->{_host} eq 'ensembldb.ensembl.org' + || $dbc->{_host} eq 'useastdb.ensembl.org' + ) && ( + $config->{host} eq 'ens-livemirror' + || $config->{host} eq 'ensembldb.ensembl.org' + || $config->{host} eq 'useastdb.ensembl.org' + ) + ) { + $ok = 0; + } + + # but we still need to reconnect + debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet}); + + &get_adaptors($config); + } + + if(!$ok) { + if(defined($config->{skip_db_check})) { + debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet}); + } + else { + die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set"; + } + } + } + } + else { + &get_adaptors($config); + &dump_adaptor_cache($config) if defined($config->{write_cache}) + } + + # reg adaptors (only fetches if not retrieved from cache already) + &get_reg_adaptors($config) if defined($config->{regulatory}); + + # get terminal width for progress bars + unless(defined($config->{quiet})) { + my $width; + + # module may not be installed + eval q{ + use Term::ReadKey; + }; + + if(!$@) { + my ($w, $h); + + # module may be installed, but e.g. + eval { + ($w, $h) = GetTerminalSize(); + }; + + $width = $w if defined $w; + } + + $width ||= 60; + $width -= 12; + $config->{terminal_width} = $width; + } + + # jump out to build cache if requested + if(defined($config->{build})) { + + if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) { + die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n"); + } + + # build the cache + debug("Building cache for ".$config->{species}) unless defined($config->{quiet}); + build_full_cache($config); + + # exit script + debug("Finished building cache") unless defined($config->{quiet}); + exit(0); + } + + # warn user DB will be used for SIFT/PolyPhen/Condel/HGVS/frequency + if(defined($config->{cache})) { + + # these two def depend on DB + foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency)) { + debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet}); + } + + # as does using HGVS or IDs as input + debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet}); + + # the rest may be in the cache + foreach my $param(grep {defined $config->{$_}} qw(sift polyphen condel regulatory)) { + next if defined($config->{'cache_'.$param}); + debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet}); + } + } + + # get list of chrs if supplied + if(defined($config->{chr})) { + my %chrs; + + foreach my $val(split /\,/, $config->{chr}) { + my @nnn = split /\-/, $val; + + foreach my $chr($nnn[0]..$nnn[-1]) { + $chrs{$chr} = 1; + } + } + + $config->{chr} = \%chrs; + } + + # get input file handle + $config->{in_file_handle} = &get_in_file_handle($config); + + # configure output file + $config->{out_file_handle} = &get_out_file_handle($config); + + return $config; +} + +# connects to DBs; in standalone mode this just loads registry module +sub connect_to_dbs { + my $config = shift; + + # get registry + my $reg = 'Bio::EnsEMBL::Registry'; + + unless(defined($config->{standalone})) { + # load DB options from registry file if given + if(defined($config->{registry})) { + debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet}); + $reg->load_all( + $config->{registry}, + $config->{verbose}, + undef, + $config->{no_slice_cache} + ); + } + + # otherwise manually connect to DB server + else { + $reg->load_registry_from_db( + -host => $config->{host}, + -user => $config->{user}, + -pass => $config->{password}, + -port => $config->{port}, + -db_version => $config->{db_version}, + -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef, + -verbose => $config->{verbose}, + -no_cache => $config->{no_slice_cache}, + ); + } + + eval { $reg->set_reconnect_when_lost() }; + + if(defined($config->{verbose})) { + # get a meta container adaptors to check version + my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer'); + my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer'); + + if($core_mca && $var_mca) { + debug( + "Connected to core version ", $core_mca->get_schema_version, " database ", + "and variation version ", $var_mca->get_schema_version, " database" + ); + } + } + } + + return $reg; +} + +# get adaptors from DB +sub get_adaptors { + my $config = shift; + + die "ERROR: No registry" unless defined $config->{reg}; + + $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature'); + $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation'); + $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix'); + $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation'); + + # get fake ones for species with no var DB + if(!defined($config->{vfa})) { + $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species}); + $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species}); + } + else { + $config->{vfa}->db->include_failed_variations($config->{include_failed}) if defined($config->{vfa}->db) && $config->{vfa}->db->can('include_failed_variations'); + } + + $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice'); + $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene'); + $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript'); + $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer'); + $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem'); + + # cache schema version + $config->{mca}->get_schema_version if defined $config->{mca}; + + # check we got slice adaptor - can't continue without a core DB + die("ERROR: Could not connect to core database\n") unless defined $config->{sa}; +} + +# gets regulatory adaptors +sub get_reg_adaptors { + my $config = shift; + + foreach my $type(@REG_FEAT_TYPES) { + next if defined($config->{$type.'_adaptor'}); + + my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type); + if(defined($adaptor)) { + $config->{$type.'_adaptor'} = $adaptor; + } + else { + delete $config->{regulatory}; + last; + } + } +} + +# gets file handle for input +sub get_in_file_handle { + my $config = shift; + + # define the filehandle to read input from + my $in_file_handle = new FileHandle; + + if(defined($config->{input_file})) { + + # check defined input file exists + die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file}; + + if($config->{input_file} =~ /\.gz$/){ + $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); + } + else { + $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); + } + } + + # no file specified - try to read data off command line + else { + $in_file_handle = 'STDIN'; + debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet}; + } + + return $in_file_handle; +} + +# gets file handle for output and adds header +sub get_out_file_handle { + my $config = shift; + + # define filehandle to write to + my $out_file_handle = new FileHandle; + + # check if file exists + if(-e $config->{output_file} && !defined($config->{force_overwrite})) { + die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with --force_overwrite\n"); + } + + if($config->{output_file} =~ /stdout/i) { + $out_file_handle = *STDOUT; + } + else { + $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n"); + } + + # file conversion, don't want to add normal headers + if(defined($config->{convert})) { + # header for VCF + if($config->{convert} =~ /vcf/i) { + print $out_file_handle join "\t", ( + '#CHROM', + 'POS', + 'ID', + 'REF', + 'ALT', + 'QUAL', + 'FILTER', + 'INFO' + ); + print $out_file_handle "\n"; + } + + return $out_file_handle; + } + + # GVF output, no header + elsif(defined($config->{gvf})) { + return $out_file_handle; + } + + # make header + my $time = &get_time; + my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca}; + $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache}); + my $version_string = + "Using API version ".$config->{reg}->software_version. + ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?'); + + my $header =<<HEAD; +## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION +## Output produced at $time +## Connected to $db_string +## $version_string +## Extra column keys: +## CANONICAL : Indicates if transcript is canonical for this gene +## HGNC : HGNC gene identifier +## ENSP : Ensembl protein identifer +## HGVSc : HGVS coding sequence name +## HGVSp : HGVS protein sequence name +## SIFT : SIFT prediction +## PolyPhen : PolyPhen prediction +## Condel : Condel SIFT/PolyPhen consensus prediction +## MATRIX : The source and identifier of a transcription factor binding profile aligned at this position +## HIGH_INF_POS : A flag indicating if the variant falls in a high information position of a transcription factor binding profile +HEAD + + # add headers + print $out_file_handle $header; + + # add column headers + print $out_file_handle '#', (join "\t", @OUTPUT_COLS); + print $out_file_handle "\n"; + + return $out_file_handle; +} + +# convert a variation feature to a line of output +sub convert_vf { + my $config = shift; + my $vf = shift; + + my $convert_method = 'convert_to_'.lc($config->{convert}); + my $method_ref = \&$convert_method; + + my $line = &$method_ref($config, $vf); + my $handle = $config->{out_file_handle}; + + if(scalar @$line) { + print $handle join "\t", @$line; + print $handle "\n"; + } +} + +# converts to Ensembl format +sub convert_to_ensembl { + my $config = shift; + my $vf = shift; + + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start, + $vf->end, + $vf->allele_string, + $vf->strand, + $vf->variation_name + ]; +} + +# converts to VCF format +sub convert_to_vcf { + my $config = shift; + my $vf = shift; + + # look for imbalance in the allele string + my %allele_lengths; + my @alleles = split /\//, $vf->allele_string; + + foreach my $allele(@alleles) { + $allele =~ s/\-//g; + $allele_lengths{length($allele)} = 1; + } + + # in/del/unbalanced + if(scalar keys %allele_lengths > 1) { + + # we need the ref base before the variation + # default to N in case we can't get it + my $prev_base = 'N'; + + unless(defined($config->{cache})) { + my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1); + $prev_base = $slice->seq if defined($slice); + } + + for my $i(0..$#alleles) { + $alleles[$i] =~ s/\-//g; + $alleles[$i] = $prev_base.$alleles[$i]; + } + + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start - 1, + $vf->variation_name, + shift @alleles, + (join ",", @alleles), + '.', '.', '.' + ]; + + } + + # balanced sub + else { + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start, + $vf->variation_name, + shift @alleles, + (join ",", @alleles), + '.', '.', '.' + ]; + } +} + + +# converts to pileup format +sub convert_to_pileup { + my $config = shift; + my $vf = shift; + + # look for imbalance in the allele string + my %allele_lengths; + my @alleles = split /\//, $vf->allele_string; + + foreach my $allele(@alleles) { + $allele =~ s/\-//g; + $allele_lengths{length($allele)} = 1; + } + + # in/del + if(scalar keys %allele_lengths > 1) { + + if($vf->allele_string =~ /\-/) { + + # insertion? + if($alleles[0] eq '-') { + shift @alleles; + + for my $i(0..$#alleles) { + $alleles[$i] =~ s/\-//g; + $alleles[$i] = '+'.$alleles[$i]; + } + } + + else { + @alleles = grep {$_ ne '-'} @alleles; + + for my $i(0..$#alleles) { + $alleles[$i] =~ s/\-//g; + $alleles[$i] = '-'.$alleles[$i]; + } + } + + @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles; + + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start - 1, + '*', + (join "/", @alleles), + ]; + } + + else { + warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet}); + return []; + } + + } + + # balanced sub + else { + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start, + shift @alleles, + (join "/", @alleles), + ]; + } +} + +# prints a line of output from the hash +sub print_line { + my $config = shift; + my $line = shift; + return unless defined($line); + + my $output; + + # normal + if(ref($line) eq 'HASH') { + $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} }; + $output = join "\t", map { $line->{$_} || '-' } @OUTPUT_COLS; + } + + # gvf + else { + $output = $line; + } + + my $fh = $config->{out_file_handle}; + print $fh "$output\n"; +} + +# outputs usage message +sub usage { + my $usage =<<END; +#----------------------------------# +# ENSEMBL VARIANT EFFECT PREDICTOR # +#----------------------------------# + +version $VERSION + +By Will McLaren (wm2\@ebi.ac.uk) + +http://www.ensembl.org/info/docs/variation/vep/vep_script.html + +Usage: +perl variant_effect_predictor.pl [arguments] + +Options +======= + +--help Display this message and quit +--verbose Display verbose output as the script runs [default: off] +--quiet Suppress status and warning messages [default: off] +--no_progress Suppress progress bars [default: off] + +--config Load configuration from file. Any command line options + specified overwrite those in the file [default: off] + +-i | --input_file Input file - if not specified, reads from STDIN. Files + may be gzip compressed. +--format Specify input file format - one of "ensembl", "pileup", + "vcf", "hgvs", "id" or "guess" to try and work out format. +-o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this + will force --quiet [default: "variant_effect_output.txt"] +--force_overwrite Force overwriting of output file [default: quit if file + exists] + +--species [species] Species to use [default: "human"] + +-t | --terms Type of consequence terms to output - one of "ensembl", "SO", + "NCBI" [default: ensembl] + +--sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off] +--polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off] +--condel=[p|s|b] Add Condel SIFT/PolyPhen consensus [p]rediction, [s]core or + [b]oth. Add 1 (e.g. b1) to option to use old Condel algorithm + [default: off] + +NB: SIFT, PolyPhen and Condel predictions are currently available for human only + +--regulatory Look for overlaps with regulatory regions. The script can + also call if a variant falls in a high information position + within a transcription factor binding site. Output lines have + a Feature type of RegulatoryFeature or MotifFeature + [default: off] + +NB: Regulatory consequences are currently available for human and mouse only + +--hgnc If specified, HGNC gene identifiers are output alongside the + Ensembl Gene identifier [default: off] +--hgvs Output HGVS identifiers (coding and protein). Requires database + connection [default: off] +--protein Output Ensembl protein identifer [default: off] +--gene Force output of Ensembl gene identifer - disabled by default + unless using --cache or --no_whole_genome [default: off] +--canonical Indicate if the transcript for this consequence is the canonical + transcript for this gene [default: off] + +--coding_only Only return consequences that fall in the coding region of + transcripts [default: off] +--most_severe Ouptut only the most severe consequence per variation. + Transcript-specific columns will be left blank. [default: off] +--summary Output only a comma-separated list of all consequences per + variation. Transcript-specific columns will be left blank. + [default: off] +--per_gene Output only the most severe consequence per gene. Where more + than one transcript has the same consequence, the transcript + chosen is arbitrary. [default: off] + + +--check_ref If specified, checks supplied reference allele against stored + entry in Ensembl Core database [default: off] +--check_existing If specified, checks for existing co-located variations in the + Ensembl Variation database [default: off] +--check_alleles If specified, the alleles of existing co-located variations + are compared to the input; an existing variation will only + be reported if no novel allele is in the input (strand is + accounted for) [default: off] + +--no_intergenic Excludes intergenic consequences from the output [default: off] + +--check_frequency Turns on frequency filtering. Use this to include or exclude + variants based on the frequency of co-located existing + variants in the Ensembl Variation database. You must also + specify all of the following --freq flags [default: off] +--freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap, + 1kg_yri for YRI 1000 genomes. See documentation for more + details +--freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5 +--freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or + less than (lt) --freq_freq +--freq_filter Specify whether variants that pass the above should be included + [exclude|include] or excluded from analysis + +--chr [list] Select a subset of chromosomes to analyse from your file. Any + data not on this chromosome in the input will be skipped. The + list can be comma separated, with "-" characters representing + a range e.g. 1-5,8,15,X [default: off] +--gp If specified, tries to read GRCh37 position from GP field in the + INFO column of a VCF file. Only applies when VCF is the input + format and human is the species [default: off] + +--convert Convert the input file to the output format specified. + [ensembl|vcf|pileup] Converted output is written to the file specified in + --output_file. No consequence calculation is carried out when + doing file conversion. [default: off] + +--refseq Use the otherfeatures database to retrieve transcripts - this + database contains RefSeq transcripts (as well as CCDS and + Ensembl EST alignments) [default: off] +--host Manually define database host [default: "ensembldb.ensembl.org"] +-u | --user Database username [default: "anonymous"] +--port Database port [default: 5306] +--password Database password [default: no password] +--genomes Sets DB connection params for Ensembl Genomes [default: off] +--registry Registry file to use defines DB connections [default: off] + Defining a registry file overrides above connection settings. +--db_version=[number] Force script to load DBs from a specific Ensembl version. Not + advised due to likely incompatibilities between API and DB + +--no_whole_genome Run in old-style, non-whole genome mode [default: off] +--buffer_size Sets the number of variants sent in each batch [default: 5000] + Increasing buffer size can retrieve results more quickly + but requires more memory. Only applies to whole genome mode. + +--cache Enables read-only use of cache [default: off] +--dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"] +--write_cache Enable writing to cache [default: off] +--build [all|list] Build a complete cache for the selected species. Build for all + chromosomes with --build all, or a list of chromosomes (see + --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS + VIOLATES OUR FAIR USAGE POLICY [default: off] + +--compress Specify utility to decompress cache files - may be "gzcat" or + "gzip -dc" Only use if default does not work [default: zcat] + +--skip_db_check ADVANCED! Force the script to use a cache built from a different + database than specified with --host. Only use this if you are + sure the hosts are compatible (e.g. ensembldb.ensembl.org and + useastdb.ensembl.org) [default: off] +--cache_region_size ADVANCED! The size in base-pairs of the region covered by one + file in the cache. [default: 1MB] +END + + print $usage; +}
