diff variant_effect_predictor/.#variant_effect_predictor.pl.1.3 @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/.#variant_effect_predictor.pl.1.3	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,1192 @@
+#!/usr/bin/perl
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Variant Effect Predictor - a script to predict the consequences of genomic variants
+
+http://www.ensembl.org/info/docs/variation/vep/vep_script.html
+
+Version 2.2
+
+by Will McLaren (wm2@ebi.ac.uk)
+=cut
+
+use strict;
+use Getopt::Long;
+use FileHandle;
+use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
+use Bio::EnsEMBL::Variation::Utils::VEP qw(
+    parse_line
+    vf_to_consequences
+    validate_vf
+    load_dumped_adaptor_cache
+    dump_adaptor_cache
+    get_all_consequences
+    get_slice
+    build_full_cache
+    read_cache_info
+    get_time
+    debug
+    @OUTPUT_COLS
+    @REG_FEAT_TYPES
+);
+
+# global vars
+my $VERSION = '2.2';
+
+# set output autoflush for progress bars
+$| = 1;
+
+# configure from command line opts
+my $config = &configure(scalar @ARGV);
+
+# run the main sub routine
+&main($config);
+
+# this is the main sub-routine - it needs the configured $config hash
+sub main {
+    my $config = shift;
+    
+    debug("Starting...") unless defined $config->{quiet};
+    
+    my $tr_cache = {};
+    my $rf_cache = {};
+    
+    # create a hash to hold slices so we don't get the same one twice
+    my %slice_cache = ();
+    
+    my @vfs;    
+    my ($vf_count, $total_vf_count);
+    my $in_file_handle = $config->{in_file_handle};
+    
+    # initialize line number in config
+    $config->{line_number} = 0;
+    
+    # read the file
+    while(<$in_file_handle>) {
+        chomp;
+        
+        $config->{line_number}++;
+        
+        # header line?
+        next if /^\#/;
+        
+        # some lines (pileup) may actually parse out into more than one variant
+        foreach my $vf(@{&parse_line($config, $_)}) {
+            
+            # validate the VF
+            next unless validate_vf($config, $vf);
+            
+            # now get the slice
+            if(!defined($vf->{slice})) {
+                my $slice;
+                
+                # don't get slices if we're using cache
+                # we can steal them from transcript objects later
+                if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) {
+                    
+                    # check if we have fetched this slice already
+                    if(defined $slice_cache{$vf->{chr}}) {
+                        $slice = $slice_cache{$vf->{chr}};
+                    }
+                    
+                    # if not create a new one
+                    else {
+                        
+                        $slice = &get_slice($config, $vf->{chr});
+                        
+                        # if failed, warn and skip this line
+                        if(!defined($slice)) {
+                            warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet};
+                            next;
+                        }    
+                        
+                        # store the hash
+                        $slice_cache{$vf->{chr}} = $slice;
+                    }
+                }
+                
+                $vf->{slice} = $slice;
+            }
+            
+            # make a name if one doesn't exist
+            $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.$vf->{allele_string};
+            
+            # jump out to convert here
+            if(defined($config->{convert})) {
+                &convert_vf($config, $vf);
+                next;
+            }
+            
+            if(defined $config->{whole_genome}) {
+                push @vfs, $vf;
+                $vf_count++;
+                $total_vf_count++;
+                
+                if($vf_count == $config->{buffer_size}) {
+                    debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
+                    
+                    print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)};
+                    
+                    debug("Processed $total_vf_count total variants") unless defined($config->{quiet});
+                    
+                    @vfs = ();
+                    $vf_count = 0;
+                }
+            }
+            else {
+                print_line($config, $_) foreach @{vf_to_consequences($config, $vf)};
+                $vf_count++;
+                $total_vf_count++;
+                debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose});
+            }
+        }
+    }
+    
+    # if in whole-genome mode, finish off the rest of the buffer
+    if(defined $config->{whole_genome} && scalar @vfs) {
+        debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
+        
+        print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)};
+        
+        debug("Processed $total_vf_count total variants") unless defined($config->{quiet});
+    }
+    
+    debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet});
+    
+    debug("Finished!") unless defined $config->{quiet};
+}
+
+# sets up configuration hash that is used throughout the script
+sub configure {
+    my $args = shift;
+    
+    my $config = {};
+    
+    GetOptions(
+        $config,
+        'help',                    # displays help message
+        
+        # input options,
+        'config=s',                # config file name
+        'input_file=s',            # input file name
+        'format=s',                # input file format
+        
+        # DB options
+        'species=s',               # species e.g. human, homo_sapiens
+        'registry=s',              # registry file
+        'host=s',                  # database host
+        'port=s',                  # database port
+        'user=s',                  # database user name
+        'password=s',              # database password
+        'db_version=i',            # Ensembl database version to use e.g. 62
+        'genomes',                 # automatically sets DB params for e!Genomes
+        'refseq',                  # use otherfeatures RefSeq DB instead of Ensembl
+        #'no_disconnect',           # disables disconnect_when_inactive
+        
+        # runtime options
+        'most_severe',             # only return most severe consequence
+        'summary',                 # only return one line per variation with all consquence types
+        'per_gene',                # only return most severe per gene
+        'buffer_size=i',           # number of variations to read in before analysis
+        'chunk_size=s',            # size in bases of "chunks" used in internal hash structure
+        'failed=i',                # include failed variations when finding existing
+        'no_whole_genome',         # disables now default whole-genome mode
+        'whole_genome',            # proxy for whole genome mode - now just warns user
+        'gp',                      # read coords from GP part of INFO column in VCF (probably only relevant to 1KG)
+        'chr=s',                   # analyse only these chromosomes, e.g. 1-5,10,MT
+        'check_ref',               # check supplied reference allele against DB
+        'check_existing',          # find existing co-located variations
+        'check_alleles',           # only attribute co-located if alleles are the same
+        'check_frequency',         # enable frequency checking
+        'freq_filter=s',           # exclude or include
+        'freq_freq=f',             # frequency to filter on
+        'freq_gt_lt=s',            # gt or lt (greater than or less than)
+        'freq_pop=s',              # population to filter on
+        
+        # verbosity options
+        'verbose',                 # print out a bit more info while running
+        'quiet',                   # print nothing to STDOUT (unless using -o stdout)
+        'no_progress',             # don't display progress bars
+        
+        # output options
+        'output_file=s',           # output file name
+        'force_overwrite',         # force overwrite of output file if already exists
+        'terms=s',                 # consequence terms to use e.g. NCBI, SO
+        'coding_only',             # only return results for consequences in coding regions
+        'canonical',               # indicates if transcript is canonical
+        'protein',                 # add e! protein ID to extra column
+        'hgnc',                    # add HGNC gene ID to extra column
+        'hgvs',                    # add HGVS names to extra column
+        'sift=s',                  # SIFT predictions
+        'polyphen=s',              # PolyPhen predictions
+        'condel=s',                # Condel predictions
+        'gene',                    # force gene column to be populated (disabled by default, enabled when using cache)
+        'regulatory',              # enable regulatory stuff
+        'convert=s',               # convert input to another format (doesn't run VEP)
+        'no_intergenic',           # don't print out INTERGENIC consequences
+        'gvf',                     # produce gvf output
+        
+        # cache stuff
+        'cache',                   # use cache
+        'write_cache',             # enables writing to the cache
+        'build=s',                 # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs
+        'prefetch',                # prefetch exons, translation, introns, codon table etc for each transcript
+        'strip',                   # strips adaptors etc from objects before caching them
+        'rebuild=s',               # rebuilds cache by reading in existing then redumping - probably don't need to use this any more
+        'dir=s',                   # dir where cache is found (defaults to $HOME/.vep/)
+        'cache_region_size=i',     # size of region in bases for each cache file
+        'no_slice_cache',          # tell API not to cache features on slice
+        'standalone',              # standalone mode uses minimal set of modules installed in same dir, no DB connection
+        'skip_db_check',           # don't compare DB parameters with cached
+        'compress=s',              # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc"
+        
+        # debug
+        'cluck',                   # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL
+        'count_queries',           # counts SQL queries executed
+        'admin',                   # allows me to build off public hosts
+        'debug',                   # print out debug info
+    );
+    
+    # print usage message if requested or no args supplied
+    if(defined($config->{help}) || !$args) {
+        &usage;
+        exit(0);
+    }
+    
+    # config file?
+    if(defined $config->{config}) {
+        
+        open CONFIG, $config->{config} or die "ERROR: Could not open config file \"".$config->{config}."\"\n";
+        
+        while(<CONFIG>) {
+            next if /^\#/;
+            my ($key, $value) = split /\s+|\=/;
+            $key =~ s/^\-//g;
+            $config->{$key} = $value unless defined $config->{$key};
+        }
+        
+        close CONFIG;
+    }
+
+    # can't be both quiet and verbose
+    die "ERROR: Can't be both quiet and verbose!" if defined($config->{quiet}) && defined($config->{verbose});
+    
+    # check file format
+    if(defined $config->{format}) {
+        die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /pileup|vcf|guess|hgvs|ensembl|id/i;
+    }
+    
+    # check convert format
+    if(defined $config->{convert}) {
+        die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup/i;
+    }
+    
+    # connection settings for Ensembl Genomes
+    if($config->{genomes}) {
+        $config->{host} ||= 'mysql.ebi.ac.uk';
+        $config->{port} ||= 4157;
+    }
+    
+    # connection settings for main Ensembl
+    else {
+        $config->{species} ||= "homo_sapiens";
+        $config->{host}    ||= 'ensembldb.ensembl.org';
+        $config->{port}    ||= 5306;
+    }
+    
+    # refseq or core?
+    if(defined($config->{refseq})) {
+        die "ERROR: SIFT, PolyPhen and Condel predictions not available fore RefSeq transcripts" if defined $config->{sift} || defined $config->{polyphen} || defined $config->{condel};
+        
+        $config->{core_type} = 'otherfeatures';
+    }
+    else {
+        $config->{core_type} = 'core';
+    }
+    
+    # output term
+    if(defined $config->{terms}) {
+        die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i;
+        if($config->{terms} =~ /ensembl|display/i) {
+            $config->{terms} = 'display';
+        }
+        else {
+            $config->{terms} = uc($config->{terms});
+        }
+    }
+    
+    # check nsSNP tools
+    foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) {
+        die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/;
+        
+        die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i;
+        
+        die "ERROR: $tool not available in standalone mode\n" if defined($config->{standalone});
+        
+        # use V2 of the Condel algorithm, possibly gives fewer false positives
+        if($tool eq 'Condel' && $config->{lc($tool)} =~ /1$/) {
+            $Bio::EnsEMBL::Variation::Utils::Condel::USE_V2 = 0;
+        }
+    }
+    
+    # force quiet if outputting to STDOUT
+    if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
+        delete $config->{verbose} if defined($config->{verbose});
+        $config->{quiet} = 1;
+    }
+    
+    # summarise options if verbose
+    if(defined $config->{verbose}) {
+        my $header =<<INTRO;
+#----------------------------------#
+# ENSEMBL VARIANT EFFECT PREDICTOR #
+#----------------------------------#
+
+version $VERSION
+
+By Will McLaren (wm2\@ebi.ac.uk)
+
+Configuration options:
+
+INTRO
+        print $header;
+        
+        my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1];
+        
+        foreach my $key(sort keys %$config) {
+            print $key.(' ' x (($max_length - length($key)) + 4)).$config->{$key}."\n";
+        }
+        
+        print "\n".("-" x 20)."\n\n";
+    }
+    
+    # set defaults
+    $config->{user}              ||= 'anonymous';
+    $config->{buffer_size}       ||= 5000;
+    $config->{chunk_size}        ||= '50kb';
+    $config->{output_file}       ||= "variant_effect_output.txt";
+    $config->{tmpdir}            ||= '/tmp';
+    $config->{format}            ||= 'guess';
+    $config->{terms}             ||= 'display';
+    $config->{gene}              ||= 1 unless defined($config->{whole_genome});
+    $config->{cache_region_size} ||= 1000000;
+    $config->{dir}               ||= join '/', ($ENV{'HOME'}, '.vep');
+    $config->{compress}          ||= 'zcat';
+    
+    # frequency filtering
+    if(defined($config->{check_frequency})) {
+        foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) {
+            die "ERROR: To use --check_frequency you must also specify flag --$flag" unless defined $config->{$flag};
+        }
+        
+        # need to set check_existing
+        $config->{check_existing} = 1;
+    }
+    
+    $config->{check_existing} = 1 if defined $config->{check_alleles};
+    
+    # warn users still using whole_genome flag
+    if(defined($config->{whole_genome})) {
+        debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet});
+    }
+    
+    $config->{whole_genome}      = 1 unless defined $config->{no_whole_genome};
+    $config->{include_failed}    = 1 unless defined $config->{include_failed};
+    $config->{chunk_size}        =~ s/mb?/000000/i;
+    $config->{chunk_size}        =~ s/kb?/000/i;
+    $config->{cache_region_size} =~ s/mb?/000000/i;
+    $config->{cache_region_size} =~ s/kb?/000/i;
+    
+    # cluck and display executed SQL?
+    $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck});
+    
+    # standalone needs cache, can't use HGVS
+    if(defined($config->{standalone})) {
+        $config->{cache} = 1;
+        
+        die("ERROR: Cannot generate HGVS coordinates in standalone mode") if defined($config->{hgvs});
+        die("ERROR: Cannot use HGVS as input in standalone mode") if $config->{format} eq 'hgvs';
+        die("ERROR: Cannot use variant identifiers as input in standalone mode") if $config->{format} eq 'id';
+        die("ERROR: Cannot do frequency filtering in standalone mode") if defined($config->{check_frequency});
+    }
+    
+    # write_cache needs cache
+    $config->{cache} = 1 if defined $config->{write_cache};
+    
+    # no_slice_cache, prefetch and whole_genome have to be on to use cache
+    if(defined($config->{cache})) {
+        $config->{prefetch} = 1;
+        $config->{no_slice_cache} = 1;
+        $config->{whole_genome} = 1;
+        $config->{strip} = 1;
+    }
+    
+    $config->{build} = $config->{rebuild} if defined($config->{rebuild});
+    
+    # force options for full build
+    if(defined($config->{build})) {
+        $config->{prefetch} = 1;
+        $config->{gene} = 1;
+        $config->{hgnc} = 1;
+        $config->{no_slice_cache} = 1;
+        $config->{cache} = 1;
+        $config->{strip} = 1;
+        $config->{write_cache} = 1;
+    }
+    
+    # connect to databases
+    $config->{reg} = &connect_to_dbs($config);
+    
+    # complete dir with species name and db_version
+    $config->{dir} .= '/'.(
+        join '/', (
+            defined($config->{standalone}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}),
+            $config->{db_version} || $config->{reg}->software_version
+        )
+    );
+    
+    if(defined($config->{cache})) {
+        # read cache info
+        if(read_cache_info($config)) {
+            debug("Read existing cache info") unless defined $config->{quiet};
+        }
+    }
+    
+    # include regulatory modules if requested
+    if(defined($config->{regulatory})) {
+        # do the use statements here so that users don't have to have the
+        # funcgen API install to use the rest of the script
+        eval q{
+            use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor;
+            use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor;
+            use Bio::EnsEMBL::Funcgen::MotifFeature;
+            use Bio::EnsEMBL::Funcgen::RegulatoryFeature;
+            use Bio::EnsEMBL::Funcgen::BindingMatrix;
+        };
+        
+        if($@) {
+            die("ERROR: Ensembl Funcgen API must be installed to use --regulatory\n");
+        }
+    }
+    
+    # warn user cache directory doesn't exist
+    if(!-e $config->{dir}) {
+        
+        # if using write_cache
+        if(defined($config->{write_cache})) {
+            debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet});
+        }
+        
+        # want to read cache, not found
+        elsif(defined($config->{cache})) {
+            die("ERROR: Cache directory ", $config->{dir}, " not found");
+        }
+    }
+    
+    # suppress warnings that the FeatureAdpators spit if using no_slice_cache
+    Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache});
+    
+    # get adaptors
+    if(defined($config->{cache}) && !defined($config->{write_cache})) {
+        
+        # try and load adaptors from cache
+        if(!&load_dumped_adaptor_cache($config)) {
+            &get_adaptors($config);
+            &dump_adaptor_cache($config) if defined($config->{write_cache});
+        }
+        
+        # check cached adaptors match DB params
+        else {
+            my $dbc = $config->{sa}->{dbc};
+        
+            my $ok = 1;
+            
+            if($dbc->{_host} ne $config->{host}) {
+                
+                # ens-livemirror, useastdb and ensembldb should all have identical DBs
+                unless(
+                    (
+                        $dbc->{_host} eq 'ens-livemirror'
+                        || $dbc->{_host} eq 'ensembldb.ensembl.org'
+                        || $dbc->{_host} eq 'useastdb.ensembl.org'
+                    ) && (
+                        $config->{host} eq 'ens-livemirror'
+                        || $config->{host} eq 'ensembldb.ensembl.org'
+                        || $config->{host} eq 'useastdb.ensembl.org'
+                    )
+                ) {
+                    $ok = 0;
+                }
+                
+                # but we still need to reconnect
+                debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet});
+                
+                &get_adaptors($config);
+            }
+            
+            if(!$ok) {
+                if(defined($config->{skip_db_check})) {
+                    debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet});
+                }
+                else {
+                    die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set";
+                }
+            }
+        }
+    }
+    else {
+        &get_adaptors($config);
+        &dump_adaptor_cache($config) if defined($config->{write_cache})
+    }
+    
+    # reg adaptors (only fetches if not retrieved from cache already)
+    &get_reg_adaptors($config) if defined($config->{regulatory});
+    
+    # get terminal width for progress bars
+    unless(defined($config->{quiet})) {
+        my $width;
+        
+        # module may not be installed
+        eval q{
+            use Term::ReadKey;
+        };
+        
+        if(!$@) {
+            my ($w, $h);
+            
+            # module may be installed, but e.g.
+            eval {
+                ($w, $h) = GetTerminalSize();
+            };
+            
+            $width = $w if defined $w;
+        }
+        
+        $width ||= 60;
+        $width -= 12;
+        $config->{terminal_width} = $width;
+    }
+    
+    # jump out to build cache if requested
+    if(defined($config->{build})) {
+        
+        if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) {
+            die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n");
+        }
+        
+        # build the cache
+        debug("Building cache for ".$config->{species}) unless defined($config->{quiet});
+        build_full_cache($config);
+        
+        # exit script
+        debug("Finished building cache") unless defined($config->{quiet});
+        exit(0);
+    }
+    
+    # warn user DB will be used for SIFT/PolyPhen/Condel/HGVS/frequency
+    if(defined($config->{cache})) {
+        
+        # these two def depend on DB
+        foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency)) {
+            debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet});
+        }
+        
+        # as does using HGVS or IDs as input
+        debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet});
+        
+        # the rest may be in the cache
+        foreach my $param(grep {defined $config->{$_}} qw(sift polyphen condel regulatory)) {
+            next if defined($config->{'cache_'.$param});
+            debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet});
+        }
+    }
+    
+    # get list of chrs if supplied
+    if(defined($config->{chr})) {
+        my %chrs;
+        
+        foreach my $val(split /\,/, $config->{chr}) {
+            my @nnn = split /\-/, $val;
+            
+            foreach my $chr($nnn[0]..$nnn[-1]) {
+                $chrs{$chr} = 1;
+            }
+        }
+        
+        $config->{chr} = \%chrs;
+    }
+    
+    # get input file handle
+    $config->{in_file_handle} = &get_in_file_handle($config);
+    
+    # configure output file
+    $config->{out_file_handle} = &get_out_file_handle($config);
+    
+    return $config;
+}
+
+# connects to DBs; in standalone mode this just loads registry module
+sub connect_to_dbs {
+    my $config = shift;
+    
+    # get registry
+    my $reg = 'Bio::EnsEMBL::Registry';
+    
+    unless(defined($config->{standalone})) {
+        # load DB options from registry file if given
+        if(defined($config->{registry})) {
+            debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet});
+            $reg->load_all(
+                $config->{registry},
+                $config->{verbose},
+                undef,
+                $config->{no_slice_cache}
+            );
+        }
+        
+        # otherwise manually connect to DB server
+        else {
+            $reg->load_registry_from_db(
+                -host       => $config->{host},
+                -user       => $config->{user},
+                -pass       => $config->{password},
+                -port       => $config->{port},
+                -db_version => $config->{db_version},
+                -species    => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef,
+                -verbose    => $config->{verbose},
+                -no_cache   => $config->{no_slice_cache},
+            );
+        }
+        
+        eval { $reg->set_reconnect_when_lost() };
+        
+        if(defined($config->{verbose})) {
+            # get a meta container adaptors to check version
+            my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer');
+            my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer');
+            
+            if($core_mca && $var_mca) {
+                debug(
+                    "Connected to core version ", $core_mca->get_schema_version, " database ",
+                    "and variation version ", $var_mca->get_schema_version, " database"
+                );
+            }
+        }
+    }
+    
+    return $reg;
+}
+
+# get adaptors from DB
+sub get_adaptors {
+    my $config = shift;
+    
+    die "ERROR: No registry" unless defined $config->{reg};
+    
+    $config->{vfa}   = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature');
+    $config->{tva}   = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation');
+    $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix');
+    $config->{va}    = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation');
+    
+    # get fake ones for species with no var DB
+    if(!defined($config->{vfa})) {
+        $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species});
+        $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species});
+    }
+    else {
+        $config->{vfa}->db->include_failed_variations($config->{include_failed}) if defined($config->{vfa}->db) && $config->{vfa}->db->can('include_failed_variations');
+    }
+    
+    $config->{sa}  = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice');
+    $config->{ga}  = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene');
+    $config->{ta}  = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript');
+    $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer');
+    $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem');
+    
+    # cache schema version
+    $config->{mca}->get_schema_version if defined $config->{mca};
+    
+    # check we got slice adaptor - can't continue without a core DB
+    die("ERROR: Could not connect to core database\n") unless defined $config->{sa};
+}
+
+# gets regulatory adaptors
+sub get_reg_adaptors {
+    my $config = shift;
+
+    foreach my $type(@REG_FEAT_TYPES) {
+        next if defined($config->{$type.'_adaptor'});
+        
+        my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type);
+        if(defined($adaptor)) {
+            $config->{$type.'_adaptor'} = $adaptor;
+        }
+        else {
+            delete $config->{regulatory};
+            last;
+        }
+    }
+}
+
+# gets file handle for input
+sub get_in_file_handle {
+    my $config = shift;
+
+    # define the filehandle to read input from
+    my $in_file_handle = new FileHandle;
+    
+    if(defined($config->{input_file})) {
+        
+        # check defined input file exists
+        die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file};
+        
+        if($config->{input_file} =~ /\.gz$/){
+            $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
+        }
+        else {
+            $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n");
+        }
+    }
+    
+    # no file specified - try to read data off command line
+    else {
+        $in_file_handle = 'STDIN';
+        debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet};
+    }
+    
+    return $in_file_handle;
+}
+
+# gets file handle for output and adds header
+sub get_out_file_handle {
+    my $config = shift;
+    
+    # define filehandle to write to
+    my $out_file_handle = new FileHandle;
+    
+    # check if file exists
+    if(-e $config->{output_file} && !defined($config->{force_overwrite})) {
+        die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with --force_overwrite\n");
+    }
+    
+    if($config->{output_file} =~ /stdout/i) {
+        $out_file_handle = *STDOUT;
+    }
+    else {
+        $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n");
+    }
+    
+    # file conversion, don't want to add normal headers
+    if(defined($config->{convert})) {
+        # header for VCF
+        if($config->{convert} =~ /vcf/i) {
+            print $out_file_handle join "\t", (
+                '#CHROM',
+                'POS',
+                'ID',
+                'REF',
+                'ALT',
+                'QUAL',
+                'FILTER',
+                'INFO'
+            );
+            print $out_file_handle "\n";
+        }
+        
+        return $out_file_handle;
+    }
+    
+    # GVF output, no header
+    elsif(defined($config->{gvf})) {
+        return $out_file_handle;
+    }
+    
+    # make header
+    my $time = &get_time;
+    my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca};
+    $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache});
+    my $version_string =
+        "Using API version ".$config->{reg}->software_version.
+        ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?');
+    
+    my $header =<<HEAD;
+## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION
+## Output produced at $time
+## Connected to $db_string
+## $version_string
+## Extra column keys:
+## CANONICAL    : Indicates if transcript is canonical for this gene
+## HGNC         : HGNC gene identifier
+## ENSP         : Ensembl protein identifer
+## HGVSc        : HGVS coding sequence name
+## HGVSp        : HGVS protein sequence name
+## SIFT         : SIFT prediction
+## PolyPhen     : PolyPhen prediction
+## Condel       : Condel SIFT/PolyPhen consensus prediction
+## MATRIX       : The source and identifier of a transcription factor binding profile aligned at this position
+## HIGH_INF_POS : A flag indicating if the variant falls in a high information position of a transcription factor binding profile
+HEAD
+    
+    # add headers
+    print $out_file_handle $header;
+    
+    # add column headers
+    print $out_file_handle '#', (join "\t", @OUTPUT_COLS);
+    print $out_file_handle "\n";
+    
+    return $out_file_handle;
+}
+
+# convert a variation feature to a line of output
+sub convert_vf {
+    my $config = shift;
+    my $vf = shift;
+    
+    my $convert_method = 'convert_to_'.lc($config->{convert});
+    my $method_ref   = \&$convert_method; 
+    
+    my $line = &$method_ref($config, $vf);
+    my $handle = $config->{out_file_handle};
+    
+    if(scalar @$line) {
+        print $handle join "\t", @$line;
+        print $handle "\n";
+    }
+}
+
+# converts to Ensembl format
+sub convert_to_ensembl {
+    my $config = shift;
+    my $vf = shift;
+    
+    return [
+        $vf->{chr} || $vf->seq_region_name,
+        $vf->start,
+        $vf->end,
+        $vf->allele_string,
+        $vf->strand,
+        $vf->variation_name
+    ];
+}
+
+# converts to VCF format
+sub convert_to_vcf {
+    my $config = shift;
+    my $vf = shift;
+    
+    # look for imbalance in the allele string
+    my %allele_lengths;
+    my @alleles = split /\//, $vf->allele_string;
+    
+    foreach my $allele(@alleles) {
+        $allele =~ s/\-//g;
+        $allele_lengths{length($allele)} = 1;
+    }
+    
+    # in/del/unbalanced
+    if(scalar keys %allele_lengths > 1) {
+        
+        # we need the ref base before the variation
+        # default to N in case we can't get it
+        my $prev_base = 'N';
+        
+        unless(defined($config->{cache})) {
+            my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1);
+            $prev_base = $slice->seq if defined($slice);
+        }
+        
+        for my $i(0..$#alleles) {
+            $alleles[$i] =~ s/\-//g;
+            $alleles[$i] = $prev_base.$alleles[$i];
+        }
+        
+        return [
+            $vf->{chr} || $vf->seq_region_name,
+            $vf->start - 1,
+            $vf->variation_name,
+            shift @alleles,
+            (join ",", @alleles),
+            '.', '.', '.'
+        ];
+        
+    }
+    
+    # balanced sub
+    else {
+        return [
+            $vf->{chr} || $vf->seq_region_name,
+            $vf->start,
+            $vf->variation_name,
+            shift @alleles,
+            (join ",", @alleles),
+            '.', '.', '.'
+        ];
+    }
+}
+
+
+# converts to pileup format
+sub convert_to_pileup {
+    my $config = shift;
+    my $vf = shift;
+    
+    # look for imbalance in the allele string
+    my %allele_lengths;
+    my @alleles = split /\//, $vf->allele_string;
+    
+    foreach my $allele(@alleles) {
+        $allele =~ s/\-//g;
+        $allele_lengths{length($allele)} = 1;
+    }
+    
+    # in/del
+    if(scalar keys %allele_lengths > 1) {
+        
+        if($vf->allele_string =~ /\-/) {
+            
+            # insertion?
+            if($alleles[0] eq '-') {
+                shift @alleles;
+            
+                for my $i(0..$#alleles) {
+                    $alleles[$i] =~ s/\-//g;
+                    $alleles[$i] = '+'.$alleles[$i];
+                }
+            }
+            
+            else {
+                @alleles = grep {$_ ne '-'} @alleles;
+                
+                for my $i(0..$#alleles) {
+                    $alleles[$i] =~ s/\-//g;
+                    $alleles[$i] = '-'.$alleles[$i];
+                }
+            }
+            
+            @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles;
+            
+            return [
+                $vf->{chr} || $vf->seq_region_name,
+                $vf->start - 1,
+                '*',
+                (join "/", @alleles),
+            ];
+        }
+        
+        else {
+            warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet});
+            return [];
+        }
+        
+    }
+    
+    # balanced sub
+    else {
+        return [
+            $vf->{chr} || $vf->seq_region_name,
+            $vf->start,
+            shift @alleles,
+            (join "/", @alleles),
+        ];
+    }
+}
+
+# prints a line of output from the hash
+sub print_line {
+    my $config = shift;
+    my $line = shift;
+    return unless defined($line);
+    
+    my $output;
+    
+    # normal
+    if(ref($line) eq 'HASH') {
+        $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} };
+        $output = join "\t", map { $line->{$_} || '-' } @OUTPUT_COLS;
+    }
+    
+    # gvf
+    else {
+        $output = $line;
+    }
+    
+    my $fh = $config->{out_file_handle};
+    print $fh "$output\n";
+}
+
+# outputs usage message
+sub usage {
+    my $usage =<<END;
+#----------------------------------#
+# ENSEMBL VARIANT EFFECT PREDICTOR #
+#----------------------------------#
+
+version $VERSION
+
+By Will McLaren (wm2\@ebi.ac.uk)
+
+http://www.ensembl.org/info/docs/variation/vep/vep_script.html
+
+Usage:
+perl variant_effect_predictor.pl [arguments]
+
+Options
+=======
+
+--help                 Display this message and quit
+--verbose              Display verbose output as the script runs [default: off]
+--quiet                Suppress status and warning messages [default: off]
+--no_progress          Suppress progress bars [default: off]
+
+--config               Load configuration from file. Any command line options
+                       specified overwrite those in the file [default: off]
+
+-i | --input_file      Input file - if not specified, reads from STDIN. Files
+                       may be gzip compressed.
+--format               Specify input file format - one of "ensembl", "pileup",
+                       "vcf", "hgvs", "id" or "guess" to try and work out format.
+-o | --output_file     Output file. Write to STDOUT by specifying -o STDOUT - this
+                       will force --quiet [default: "variant_effect_output.txt"]
+--force_overwrite      Force overwriting of output file [default: quit if file
+                       exists]
+                       
+--species [species]    Species to use [default: "human"]
+
+-t | --terms           Type of consequence terms to output - one of "ensembl", "SO",
+                       "NCBI" [default: ensembl]
+ 
+--sift=[p|s|b]         Add SIFT [p]rediction, [s]core or [b]oth [default: off]
+--polyphen=[p|s|b]     Add PolyPhen [p]rediction, [s]core or [b]oth [default: off]
+--condel=[p|s|b]       Add Condel SIFT/PolyPhen consensus [p]rediction, [s]core or
+                       [b]oth. Add 1 (e.g. b1) to option to use old Condel algorithm
+                       [default: off]
+
+NB: SIFT, PolyPhen and Condel predictions are currently available for human only
+
+--regulatory           Look for overlaps with regulatory regions. The script can
+                       also call if a variant falls in a high information position
+                       within a transcription factor binding site. Output lines have
+                       a Feature type of RegulatoryFeature or MotifFeature
+                       [default: off]
+                       
+NB: Regulatory consequences are currently available for human and mouse only
+
+--hgnc                 If specified, HGNC gene identifiers are output alongside the
+                       Ensembl Gene identifier [default: off]
+--hgvs                 Output HGVS identifiers (coding and protein). Requires database
+                       connection [default: off]
+--protein              Output Ensembl protein identifer [default: off]
+--gene                 Force output of Ensembl gene identifer - disabled by default
+                       unless using --cache or --no_whole_genome [default: off]
+--canonical            Indicate if the transcript for this consequence is the canonical
+                       transcript for this gene [default: off]
+
+--coding_only          Only return consequences that fall in the coding region of
+                       transcripts [default: off]
+--most_severe          Ouptut only the most severe consequence per variation.
+                       Transcript-specific columns will be left blank. [default: off]
+--summary              Output only a comma-separated list of all consequences per
+                       variation. Transcript-specific columns will be left blank.
+                       [default: off]
+--per_gene             Output only the most severe consequence per gene. Where more
+                       than one transcript has the same consequence, the transcript
+                       chosen is arbitrary. [default: off]
+
+
+--check_ref            If specified, checks supplied reference allele against stored
+                       entry in Ensembl Core database [default: off]
+--check_existing       If specified, checks for existing co-located variations in the
+                       Ensembl Variation database [default: off]
+--check_alleles        If specified, the alleles of existing co-located variations
+                       are compared to the input; an existing variation will only
+                       be reported if no novel allele is in the input (strand is
+                       accounted for) [default: off]
+                       
+--no_intergenic        Excludes intergenic consequences from the output [default: off]
+
+--check_frequency      Turns on frequency filtering. Use this to include or exclude
+                       variants based on the frequency of co-located existing
+                       variants in the Ensembl Variation database. You must also
+                       specify all of the following --freq flags [default: off]
+--freq_pop [pop]       Name of the population to use e.g. hapmap_ceu for CEU HapMap,
+                       1kg_yri for YRI 1000 genomes. See documentation for more
+                       details
+--freq_freq [freq]     Frequency to use in filter. Must be a number between 0 and 0.5
+--freq_gt_lt [gt|lt]   Specify whether the frequency should be greater than (gt) or
+                       less than (lt) --freq_freq
+--freq_filter          Specify whether variants that pass the above should be included
+  [exclude|include]    or excluded from analysis
+
+--chr [list]           Select a subset of chromosomes to analyse from your file. Any
+                       data not on this chromosome in the input will be skipped. The
+                       list can be comma separated, with "-" characters representing
+                       a range e.g. 1-5,8,15,X [default: off]
+--gp                   If specified, tries to read GRCh37 position from GP field in the
+                       INFO column of a VCF file. Only applies when VCF is the input
+                       format and human is the species [default: off]
+                       
+--convert              Convert the input file to the output format specified.
+  [ensembl|vcf|pileup] Converted output is written to the file specified in
+                       --output_file. No consequence calculation is carried out when
+                       doing file conversion. [default: off]
+
+--refseq               Use the otherfeatures database to retrieve transcripts - this
+                       database contains RefSeq transcripts (as well as CCDS and
+                       Ensembl EST alignments) [default: off]
+--host                 Manually define database host [default: "ensembldb.ensembl.org"]
+-u | --user            Database username [default: "anonymous"]
+--port                 Database port [default: 5306]
+--password             Database password [default: no password]
+--genomes              Sets DB connection params for Ensembl Genomes [default: off]
+--registry             Registry file to use defines DB connections [default: off]
+                       Defining a registry file overrides above connection settings.
+--db_version=[number]  Force script to load DBs from a specific Ensembl version. Not
+                       advised due to likely incompatibilities between API and DB
+
+--no_whole_genome      Run in old-style, non-whole genome mode [default: off]
+--buffer_size          Sets the number of variants sent in each batch [default: 5000]
+                       Increasing buffer size can retrieve results more quickly
+                       but requires more memory. Only applies to whole genome mode.
+                       
+--cache                Enables read-only use of cache [default: off]
+--dir [directory]      Specify the base cache directory to use [default: "\$HOME/.vep/"]
+--write_cache          Enable writing to cache [default: off]
+--build [all|list]     Build a complete cache for the selected species. Build for all
+                       chromosomes with --build all, or a list of chromosomes (see
+                       --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS
+                       VIOLATES OUR FAIR USAGE POLICY [default: off]
+                       
+--compress             Specify utility to decompress cache files - may be "gzcat" or
+                       "gzip -dc" Only use if default does not work [default: zcat]
+                       
+--skip_db_check        ADVANCED! Force the script to use a cache built from a different
+                       database than specified with --host. Only use this if you are
+                       sure the hosts are compatible (e.g. ensembldb.ensembl.org and
+                       useastdb.ensembl.org) [default: off]
+--cache_region_size    ADVANCED! The size in base-pairs of the region covered by one
+                       file in the cache. [default: 1MB]
+END
+
+    print $usage;
+}