Mercurial > repos > willmclaren > ensembl_vep
view variant_effect_predictor/.#variant_effect_predictor.pl.1.3 @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl =head1 LICENSE Copyright (c) 1999-2011 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <dev@ensembl.org>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =cut =head1 NAME Variant Effect Predictor - a script to predict the consequences of genomic variants http://www.ensembl.org/info/docs/variation/vep/vep_script.html Version 2.2 by Will McLaren (wm2@ebi.ac.uk) =cut use strict; use Getopt::Long; use FileHandle; use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); use Bio::EnsEMBL::Variation::Utils::VEP qw( parse_line vf_to_consequences validate_vf load_dumped_adaptor_cache dump_adaptor_cache get_all_consequences get_slice build_full_cache read_cache_info get_time debug @OUTPUT_COLS @REG_FEAT_TYPES ); # global vars my $VERSION = '2.2'; # set output autoflush for progress bars $| = 1; # configure from command line opts my $config = &configure(scalar @ARGV); # run the main sub routine &main($config); # this is the main sub-routine - it needs the configured $config hash sub main { my $config = shift; debug("Starting...") unless defined $config->{quiet}; my $tr_cache = {}; my $rf_cache = {}; # create a hash to hold slices so we don't get the same one twice my %slice_cache = (); my @vfs; my ($vf_count, $total_vf_count); my $in_file_handle = $config->{in_file_handle}; # initialize line number in config $config->{line_number} = 0; # read the file while(<$in_file_handle>) { chomp; $config->{line_number}++; # header line? next if /^\#/; # some lines (pileup) may actually parse out into more than one variant foreach my $vf(@{&parse_line($config, $_)}) { # validate the VF next unless validate_vf($config, $vf); # now get the slice if(!defined($vf->{slice})) { my $slice; # don't get slices if we're using cache # we can steal them from transcript objects later if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) { # check if we have fetched this slice already if(defined $slice_cache{$vf->{chr}}) { $slice = $slice_cache{$vf->{chr}}; } # if not create a new one else { $slice = &get_slice($config, $vf->{chr}); # if failed, warn and skip this line if(!defined($slice)) { warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet}; next; } # store the hash $slice_cache{$vf->{chr}} = $slice; } } $vf->{slice} = $slice; } # make a name if one doesn't exist $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.$vf->{allele_string}; # jump out to convert here if(defined($config->{convert})) { &convert_vf($config, $vf); next; } if(defined $config->{whole_genome}) { push @vfs, $vf; $vf_count++; $total_vf_count++; if($vf_count == $config->{buffer_size}) { debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)}; debug("Processed $total_vf_count total variants") unless defined($config->{quiet}); @vfs = (); $vf_count = 0; } } else { print_line($config, $_) foreach @{vf_to_consequences($config, $vf)}; $vf_count++; $total_vf_count++; debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose}); } } } # if in whole-genome mode, finish off the rest of the buffer if(defined $config->{whole_genome} && scalar @vfs) { debug("Read $vf_count variants into buffer") unless defined($config->{quiet}); print_line($config, $_) foreach @{get_all_consequences($config, \@vfs, $tr_cache, $rf_cache)}; debug("Processed $total_vf_count total variants") unless defined($config->{quiet}); } debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet}); debug("Finished!") unless defined $config->{quiet}; } # sets up configuration hash that is used throughout the script sub configure { my $args = shift; my $config = {}; GetOptions( $config, 'help', # displays help message # input options, 'config=s', # config file name 'input_file=s', # input file name 'format=s', # input file format # DB options 'species=s', # species e.g. human, homo_sapiens 'registry=s', # registry file 'host=s', # database host 'port=s', # database port 'user=s', # database user name 'password=s', # database password 'db_version=i', # Ensembl database version to use e.g. 62 'genomes', # automatically sets DB params for e!Genomes 'refseq', # use otherfeatures RefSeq DB instead of Ensembl #'no_disconnect', # disables disconnect_when_inactive # runtime options 'most_severe', # only return most severe consequence 'summary', # only return one line per variation with all consquence types 'per_gene', # only return most severe per gene 'buffer_size=i', # number of variations to read in before analysis 'chunk_size=s', # size in bases of "chunks" used in internal hash structure 'failed=i', # include failed variations when finding existing 'no_whole_genome', # disables now default whole-genome mode 'whole_genome', # proxy for whole genome mode - now just warns user 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG) 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT 'check_ref', # check supplied reference allele against DB 'check_existing', # find existing co-located variations 'check_alleles', # only attribute co-located if alleles are the same 'check_frequency', # enable frequency checking 'freq_filter=s', # exclude or include 'freq_freq=f', # frequency to filter on 'freq_gt_lt=s', # gt or lt (greater than or less than) 'freq_pop=s', # population to filter on # verbosity options 'verbose', # print out a bit more info while running 'quiet', # print nothing to STDOUT (unless using -o stdout) 'no_progress', # don't display progress bars # output options 'output_file=s', # output file name 'force_overwrite', # force overwrite of output file if already exists 'terms=s', # consequence terms to use e.g. NCBI, SO 'coding_only', # only return results for consequences in coding regions 'canonical', # indicates if transcript is canonical 'protein', # add e! protein ID to extra column 'hgnc', # add HGNC gene ID to extra column 'hgvs', # add HGVS names to extra column 'sift=s', # SIFT predictions 'polyphen=s', # PolyPhen predictions 'condel=s', # Condel predictions 'gene', # force gene column to be populated (disabled by default, enabled when using cache) 'regulatory', # enable regulatory stuff 'convert=s', # convert input to another format (doesn't run VEP) 'no_intergenic', # don't print out INTERGENIC consequences 'gvf', # produce gvf output # cache stuff 'cache', # use cache 'write_cache', # enables writing to the cache 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript 'strip', # strips adaptors etc from objects before caching them 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more 'dir=s', # dir where cache is found (defaults to $HOME/.vep/) 'cache_region_size=i', # size of region in bases for each cache file 'no_slice_cache', # tell API not to cache features on slice 'standalone', # standalone mode uses minimal set of modules installed in same dir, no DB connection 'skip_db_check', # don't compare DB parameters with cached 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc" # debug 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL 'count_queries', # counts SQL queries executed 'admin', # allows me to build off public hosts 'debug', # print out debug info ); # print usage message if requested or no args supplied if(defined($config->{help}) || !$args) { &usage; exit(0); } # config file? if(defined $config->{config}) { open CONFIG, $config->{config} or die "ERROR: Could not open config file \"".$config->{config}."\"\n"; while(<CONFIG>) { next if /^\#/; my ($key, $value) = split /\s+|\=/; $key =~ s/^\-//g; $config->{$key} = $value unless defined $config->{$key}; } close CONFIG; } # can't be both quiet and verbose die "ERROR: Can't be both quiet and verbose!" if defined($config->{quiet}) && defined($config->{verbose}); # check file format if(defined $config->{format}) { die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /pileup|vcf|guess|hgvs|ensembl|id/i; } # check convert format if(defined $config->{convert}) { die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup/i; } # connection settings for Ensembl Genomes if($config->{genomes}) { $config->{host} ||= 'mysql.ebi.ac.uk'; $config->{port} ||= 4157; } # connection settings for main Ensembl else { $config->{species} ||= "homo_sapiens"; $config->{host} ||= 'ensembldb.ensembl.org'; $config->{port} ||= 5306; } # refseq or core? if(defined($config->{refseq})) { die "ERROR: SIFT, PolyPhen and Condel predictions not available fore RefSeq transcripts" if defined $config->{sift} || defined $config->{polyphen} || defined $config->{condel}; $config->{core_type} = 'otherfeatures'; } else { $config->{core_type} = 'core'; } # output term if(defined $config->{terms}) { die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i; if($config->{terms} =~ /ensembl|display/i) { $config->{terms} = 'display'; } else { $config->{terms} = uc($config->{terms}); } } # check nsSNP tools foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) { die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/; die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i; die "ERROR: $tool not available in standalone mode\n" if defined($config->{standalone}); # use V2 of the Condel algorithm, possibly gives fewer false positives if($tool eq 'Condel' && $config->{lc($tool)} =~ /1$/) { $Bio::EnsEMBL::Variation::Utils::Condel::USE_V2 = 0; } } # force quiet if outputting to STDOUT if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) { delete $config->{verbose} if defined($config->{verbose}); $config->{quiet} = 1; } # summarise options if verbose if(defined $config->{verbose}) { my $header =<<INTRO; #----------------------------------# # ENSEMBL VARIANT EFFECT PREDICTOR # #----------------------------------# version $VERSION By Will McLaren (wm2\@ebi.ac.uk) Configuration options: INTRO print $header; my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1]; foreach my $key(sort keys %$config) { print $key.(' ' x (($max_length - length($key)) + 4)).$config->{$key}."\n"; } print "\n".("-" x 20)."\n\n"; } # set defaults $config->{user} ||= 'anonymous'; $config->{buffer_size} ||= 5000; $config->{chunk_size} ||= '50kb'; $config->{output_file} ||= "variant_effect_output.txt"; $config->{tmpdir} ||= '/tmp'; $config->{format} ||= 'guess'; $config->{terms} ||= 'display'; $config->{gene} ||= 1 unless defined($config->{whole_genome}); $config->{cache_region_size} ||= 1000000; $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep'); $config->{compress} ||= 'zcat'; # frequency filtering if(defined($config->{check_frequency})) { foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) { die "ERROR: To use --check_frequency you must also specify flag --$flag" unless defined $config->{$flag}; } # need to set check_existing $config->{check_existing} = 1; } $config->{check_existing} = 1 if defined $config->{check_alleles}; # warn users still using whole_genome flag if(defined($config->{whole_genome})) { debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet}); } $config->{whole_genome} = 1 unless defined $config->{no_whole_genome}; $config->{include_failed} = 1 unless defined $config->{include_failed}; $config->{chunk_size} =~ s/mb?/000000/i; $config->{chunk_size} =~ s/kb?/000/i; $config->{cache_region_size} =~ s/mb?/000000/i; $config->{cache_region_size} =~ s/kb?/000/i; # cluck and display executed SQL? $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck}); # standalone needs cache, can't use HGVS if(defined($config->{standalone})) { $config->{cache} = 1; die("ERROR: Cannot generate HGVS coordinates in standalone mode") if defined($config->{hgvs}); die("ERROR: Cannot use HGVS as input in standalone mode") if $config->{format} eq 'hgvs'; die("ERROR: Cannot use variant identifiers as input in standalone mode") if $config->{format} eq 'id'; die("ERROR: Cannot do frequency filtering in standalone mode") if defined($config->{check_frequency}); } # write_cache needs cache $config->{cache} = 1 if defined $config->{write_cache}; # no_slice_cache, prefetch and whole_genome have to be on to use cache if(defined($config->{cache})) { $config->{prefetch} = 1; $config->{no_slice_cache} = 1; $config->{whole_genome} = 1; $config->{strip} = 1; } $config->{build} = $config->{rebuild} if defined($config->{rebuild}); # force options for full build if(defined($config->{build})) { $config->{prefetch} = 1; $config->{gene} = 1; $config->{hgnc} = 1; $config->{no_slice_cache} = 1; $config->{cache} = 1; $config->{strip} = 1; $config->{write_cache} = 1; } # connect to databases $config->{reg} = &connect_to_dbs($config); # complete dir with species name and db_version $config->{dir} .= '/'.( join '/', ( defined($config->{standalone}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}), $config->{db_version} || $config->{reg}->software_version ) ); if(defined($config->{cache})) { # read cache info if(read_cache_info($config)) { debug("Read existing cache info") unless defined $config->{quiet}; } } # include regulatory modules if requested if(defined($config->{regulatory})) { # do the use statements here so that users don't have to have the # funcgen API install to use the rest of the script eval q{ use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor; use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor; use Bio::EnsEMBL::Funcgen::MotifFeature; use Bio::EnsEMBL::Funcgen::RegulatoryFeature; use Bio::EnsEMBL::Funcgen::BindingMatrix; }; if($@) { die("ERROR: Ensembl Funcgen API must be installed to use --regulatory\n"); } } # warn user cache directory doesn't exist if(!-e $config->{dir}) { # if using write_cache if(defined($config->{write_cache})) { debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet}); } # want to read cache, not found elsif(defined($config->{cache})) { die("ERROR: Cache directory ", $config->{dir}, " not found"); } } # suppress warnings that the FeatureAdpators spit if using no_slice_cache Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache}); # get adaptors if(defined($config->{cache}) && !defined($config->{write_cache})) { # try and load adaptors from cache if(!&load_dumped_adaptor_cache($config)) { &get_adaptors($config); &dump_adaptor_cache($config) if defined($config->{write_cache}); } # check cached adaptors match DB params else { my $dbc = $config->{sa}->{dbc}; my $ok = 1; if($dbc->{_host} ne $config->{host}) { # ens-livemirror, useastdb and ensembldb should all have identical DBs unless( ( $dbc->{_host} eq 'ens-livemirror' || $dbc->{_host} eq 'ensembldb.ensembl.org' || $dbc->{_host} eq 'useastdb.ensembl.org' ) && ( $config->{host} eq 'ens-livemirror' || $config->{host} eq 'ensembldb.ensembl.org' || $config->{host} eq 'useastdb.ensembl.org' ) ) { $ok = 0; } # but we still need to reconnect debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet}); &get_adaptors($config); } if(!$ok) { if(defined($config->{skip_db_check})) { debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet}); } else { die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set"; } } } } else { &get_adaptors($config); &dump_adaptor_cache($config) if defined($config->{write_cache}) } # reg adaptors (only fetches if not retrieved from cache already) &get_reg_adaptors($config) if defined($config->{regulatory}); # get terminal width for progress bars unless(defined($config->{quiet})) { my $width; # module may not be installed eval q{ use Term::ReadKey; }; if(!$@) { my ($w, $h); # module may be installed, but e.g. eval { ($w, $h) = GetTerminalSize(); }; $width = $w if defined $w; } $width ||= 60; $width -= 12; $config->{terminal_width} = $width; } # jump out to build cache if requested if(defined($config->{build})) { if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) { die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n"); } # build the cache debug("Building cache for ".$config->{species}) unless defined($config->{quiet}); build_full_cache($config); # exit script debug("Finished building cache") unless defined($config->{quiet}); exit(0); } # warn user DB will be used for SIFT/PolyPhen/Condel/HGVS/frequency if(defined($config->{cache})) { # these two def depend on DB foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency)) { debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet}); } # as does using HGVS or IDs as input debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet}); # the rest may be in the cache foreach my $param(grep {defined $config->{$_}} qw(sift polyphen condel regulatory)) { next if defined($config->{'cache_'.$param}); debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet}); } } # get list of chrs if supplied if(defined($config->{chr})) { my %chrs; foreach my $val(split /\,/, $config->{chr}) { my @nnn = split /\-/, $val; foreach my $chr($nnn[0]..$nnn[-1]) { $chrs{$chr} = 1; } } $config->{chr} = \%chrs; } # get input file handle $config->{in_file_handle} = &get_in_file_handle($config); # configure output file $config->{out_file_handle} = &get_out_file_handle($config); return $config; } # connects to DBs; in standalone mode this just loads registry module sub connect_to_dbs { my $config = shift; # get registry my $reg = 'Bio::EnsEMBL::Registry'; unless(defined($config->{standalone})) { # load DB options from registry file if given if(defined($config->{registry})) { debug("Loading DB config from registry file ", $config->{registry}) unless defined($config->{quiet}); $reg->load_all( $config->{registry}, $config->{verbose}, undef, $config->{no_slice_cache} ); } # otherwise manually connect to DB server else { $reg->load_registry_from_db( -host => $config->{host}, -user => $config->{user}, -pass => $config->{password}, -port => $config->{port}, -db_version => $config->{db_version}, -species => $config->{species} =~ /^[a-z]+\_[a-z]+/i ? $config->{species} : undef, -verbose => $config->{verbose}, -no_cache => $config->{no_slice_cache}, ); } eval { $reg->set_reconnect_when_lost() }; if(defined($config->{verbose})) { # get a meta container adaptors to check version my $core_mca = $reg->get_adaptor($config->{species}, 'core', 'metacontainer'); my $var_mca = $reg->get_adaptor($config->{species}, 'variation', 'metacontainer'); if($core_mca && $var_mca) { debug( "Connected to core version ", $core_mca->get_schema_version, " database ", "and variation version ", $var_mca->get_schema_version, " database" ); } } } return $reg; } # get adaptors from DB sub get_adaptors { my $config = shift; die "ERROR: No registry" unless defined $config->{reg}; $config->{vfa} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variationfeature'); $config->{tva} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'transcriptvariation'); $config->{pfpma} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'proteinfunctionpredictionmatrix'); $config->{va} = $config->{reg}->get_adaptor($config->{species}, 'variation', 'variation'); # get fake ones for species with no var DB if(!defined($config->{vfa})) { $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species}); $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species}); } else { $config->{vfa}->db->include_failed_variations($config->{include_failed}) if defined($config->{vfa}->db) && $config->{vfa}->db->can('include_failed_variations'); } $config->{sa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'slice'); $config->{ga} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'gene'); $config->{ta} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'transcript'); $config->{mca} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'metacontainer'); $config->{csa} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, 'coordsystem'); # cache schema version $config->{mca}->get_schema_version if defined $config->{mca}; # check we got slice adaptor - can't continue without a core DB die("ERROR: Could not connect to core database\n") unless defined $config->{sa}; } # gets regulatory adaptors sub get_reg_adaptors { my $config = shift; foreach my $type(@REG_FEAT_TYPES) { next if defined($config->{$type.'_adaptor'}); my $adaptor = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $type); if(defined($adaptor)) { $config->{$type.'_adaptor'} = $adaptor; } else { delete $config->{regulatory}; last; } } } # gets file handle for input sub get_in_file_handle { my $config = shift; # define the filehandle to read input from my $in_file_handle = new FileHandle; if(defined($config->{input_file})) { # check defined input file exists die("ERROR: Could not find input file ", $config->{input_file}, "\n") unless -e $config->{input_file}; if($config->{input_file} =~ /\.gz$/){ $in_file_handle->open($config->{compress}." ". $config->{input_file} . " | " ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); } else { $in_file_handle->open( $config->{input_file} ) or die("ERROR: Could not read from input file ", $config->{input_file}, "\n"); } } # no file specified - try to read data off command line else { $in_file_handle = 'STDIN'; debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet}; } return $in_file_handle; } # gets file handle for output and adds header sub get_out_file_handle { my $config = shift; # define filehandle to write to my $out_file_handle = new FileHandle; # check if file exists if(-e $config->{output_file} && !defined($config->{force_overwrite})) { die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with --force_overwrite\n"); } if($config->{output_file} =~ /stdout/i) { $out_file_handle = *STDOUT; } else { $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n"); } # file conversion, don't want to add normal headers if(defined($config->{convert})) { # header for VCF if($config->{convert} =~ /vcf/i) { print $out_file_handle join "\t", ( '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO' ); print $out_file_handle "\n"; } return $out_file_handle; } # GVF output, no header elsif(defined($config->{gvf})) { return $out_file_handle; } # make header my $time = &get_time; my $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host if defined $config->{mca}; $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache}); my $version_string = "Using API version ".$config->{reg}->software_version. ", DB version ".(defined $config->{mca} && $config->{mca}->get_schema_version ? $config->{mca}->get_schema_version : '?'); my $header =<<HEAD; ## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION ## Output produced at $time ## Connected to $db_string ## $version_string ## Extra column keys: ## CANONICAL : Indicates if transcript is canonical for this gene ## HGNC : HGNC gene identifier ## ENSP : Ensembl protein identifer ## HGVSc : HGVS coding sequence name ## HGVSp : HGVS protein sequence name ## SIFT : SIFT prediction ## PolyPhen : PolyPhen prediction ## Condel : Condel SIFT/PolyPhen consensus prediction ## MATRIX : The source and identifier of a transcription factor binding profile aligned at this position ## HIGH_INF_POS : A flag indicating if the variant falls in a high information position of a transcription factor binding profile HEAD # add headers print $out_file_handle $header; # add column headers print $out_file_handle '#', (join "\t", @OUTPUT_COLS); print $out_file_handle "\n"; return $out_file_handle; } # convert a variation feature to a line of output sub convert_vf { my $config = shift; my $vf = shift; my $convert_method = 'convert_to_'.lc($config->{convert}); my $method_ref = \&$convert_method; my $line = &$method_ref($config, $vf); my $handle = $config->{out_file_handle}; if(scalar @$line) { print $handle join "\t", @$line; print $handle "\n"; } } # converts to Ensembl format sub convert_to_ensembl { my $config = shift; my $vf = shift; return [ $vf->{chr} || $vf->seq_region_name, $vf->start, $vf->end, $vf->allele_string, $vf->strand, $vf->variation_name ]; } # converts to VCF format sub convert_to_vcf { my $config = shift; my $vf = shift; # look for imbalance in the allele string my %allele_lengths; my @alleles = split /\//, $vf->allele_string; foreach my $allele(@alleles) { $allele =~ s/\-//g; $allele_lengths{length($allele)} = 1; } # in/del/unbalanced if(scalar keys %allele_lengths > 1) { # we need the ref base before the variation # default to N in case we can't get it my $prev_base = 'N'; unless(defined($config->{cache})) { my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1); $prev_base = $slice->seq if defined($slice); } for my $i(0..$#alleles) { $alleles[$i] =~ s/\-//g; $alleles[$i] = $prev_base.$alleles[$i]; } return [ $vf->{chr} || $vf->seq_region_name, $vf->start - 1, $vf->variation_name, shift @alleles, (join ",", @alleles), '.', '.', '.' ]; } # balanced sub else { return [ $vf->{chr} || $vf->seq_region_name, $vf->start, $vf->variation_name, shift @alleles, (join ",", @alleles), '.', '.', '.' ]; } } # converts to pileup format sub convert_to_pileup { my $config = shift; my $vf = shift; # look for imbalance in the allele string my %allele_lengths; my @alleles = split /\//, $vf->allele_string; foreach my $allele(@alleles) { $allele =~ s/\-//g; $allele_lengths{length($allele)} = 1; } # in/del if(scalar keys %allele_lengths > 1) { if($vf->allele_string =~ /\-/) { # insertion? if($alleles[0] eq '-') { shift @alleles; for my $i(0..$#alleles) { $alleles[$i] =~ s/\-//g; $alleles[$i] = '+'.$alleles[$i]; } } else { @alleles = grep {$_ ne '-'} @alleles; for my $i(0..$#alleles) { $alleles[$i] =~ s/\-//g; $alleles[$i] = '-'.$alleles[$i]; } } @alleles = grep {$_ ne '-' && $_ ne '+'} @alleles; return [ $vf->{chr} || $vf->seq_region_name, $vf->start - 1, '*', (join "/", @alleles), ]; } else { warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet}); return []; } } # balanced sub else { return [ $vf->{chr} || $vf->seq_region_name, $vf->start, shift @alleles, (join "/", @alleles), ]; } } # prints a line of output from the hash sub print_line { my $config = shift; my $line = shift; return unless defined($line); my $output; # normal if(ref($line) eq 'HASH') { $line->{Extra} = join ';', map { $_.'='.$line->{Extra}->{$_} } keys %{ $line->{Extra} || {} }; $output = join "\t", map { $line->{$_} || '-' } @OUTPUT_COLS; } # gvf else { $output = $line; } my $fh = $config->{out_file_handle}; print $fh "$output\n"; } # outputs usage message sub usage { my $usage =<<END; #----------------------------------# # ENSEMBL VARIANT EFFECT PREDICTOR # #----------------------------------# version $VERSION By Will McLaren (wm2\@ebi.ac.uk) http://www.ensembl.org/info/docs/variation/vep/vep_script.html Usage: perl variant_effect_predictor.pl [arguments] Options ======= --help Display this message and quit --verbose Display verbose output as the script runs [default: off] --quiet Suppress status and warning messages [default: off] --no_progress Suppress progress bars [default: off] --config Load configuration from file. Any command line options specified overwrite those in the file [default: off] -i | --input_file Input file - if not specified, reads from STDIN. Files may be gzip compressed. --format Specify input file format - one of "ensembl", "pileup", "vcf", "hgvs", "id" or "guess" to try and work out format. -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this will force --quiet [default: "variant_effect_output.txt"] --force_overwrite Force overwriting of output file [default: quit if file exists] --species [species] Species to use [default: "human"] -t | --terms Type of consequence terms to output - one of "ensembl", "SO", "NCBI" [default: ensembl] --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off] --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off] --condel=[p|s|b] Add Condel SIFT/PolyPhen consensus [p]rediction, [s]core or [b]oth. Add 1 (e.g. b1) to option to use old Condel algorithm [default: off] NB: SIFT, PolyPhen and Condel predictions are currently available for human only --regulatory Look for overlaps with regulatory regions. The script can also call if a variant falls in a high information position within a transcription factor binding site. Output lines have a Feature type of RegulatoryFeature or MotifFeature [default: off] NB: Regulatory consequences are currently available for human and mouse only --hgnc If specified, HGNC gene identifiers are output alongside the Ensembl Gene identifier [default: off] --hgvs Output HGVS identifiers (coding and protein). Requires database connection [default: off] --protein Output Ensembl protein identifer [default: off] --gene Force output of Ensembl gene identifer - disabled by default unless using --cache or --no_whole_genome [default: off] --canonical Indicate if the transcript for this consequence is the canonical transcript for this gene [default: off] --coding_only Only return consequences that fall in the coding region of transcripts [default: off] --most_severe Ouptut only the most severe consequence per variation. Transcript-specific columns will be left blank. [default: off] --summary Output only a comma-separated list of all consequences per variation. Transcript-specific columns will be left blank. [default: off] --per_gene Output only the most severe consequence per gene. Where more than one transcript has the same consequence, the transcript chosen is arbitrary. [default: off] --check_ref If specified, checks supplied reference allele against stored entry in Ensembl Core database [default: off] --check_existing If specified, checks for existing co-located variations in the Ensembl Variation database [default: off] --check_alleles If specified, the alleles of existing co-located variations are compared to the input; an existing variation will only be reported if no novel allele is in the input (strand is accounted for) [default: off] --no_intergenic Excludes intergenic consequences from the output [default: off] --check_frequency Turns on frequency filtering. Use this to include or exclude variants based on the frequency of co-located existing variants in the Ensembl Variation database. You must also specify all of the following --freq flags [default: off] --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap, 1kg_yri for YRI 1000 genomes. See documentation for more details --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or less than (lt) --freq_freq --freq_filter Specify whether variants that pass the above should be included [exclude|include] or excluded from analysis --chr [list] Select a subset of chromosomes to analyse from your file. Any data not on this chromosome in the input will be skipped. The list can be comma separated, with "-" characters representing a range e.g. 1-5,8,15,X [default: off] --gp If specified, tries to read GRCh37 position from GP field in the INFO column of a VCF file. Only applies when VCF is the input format and human is the species [default: off] --convert Convert the input file to the output format specified. [ensembl|vcf|pileup] Converted output is written to the file specified in --output_file. No consequence calculation is carried out when doing file conversion. [default: off] --refseq Use the otherfeatures database to retrieve transcripts - this database contains RefSeq transcripts (as well as CCDS and Ensembl EST alignments) [default: off] --host Manually define database host [default: "ensembldb.ensembl.org"] -u | --user Database username [default: "anonymous"] --port Database port [default: 5306] --password Database password [default: no password] --genomes Sets DB connection params for Ensembl Genomes [default: off] --registry Registry file to use defines DB connections [default: off] Defining a registry file overrides above connection settings. --db_version=[number] Force script to load DBs from a specific Ensembl version. Not advised due to likely incompatibilities between API and DB --no_whole_genome Run in old-style, non-whole genome mode [default: off] --buffer_size Sets the number of variants sent in each batch [default: 5000] Increasing buffer size can retrieve results more quickly but requires more memory. Only applies to whole genome mode. --cache Enables read-only use of cache [default: off] --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"] --write_cache Enable writing to cache [default: off] --build [all|list] Build a complete cache for the selected species. Build for all chromosomes with --build all, or a list of chromosomes (see --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS VIOLATES OUR FAIR USAGE POLICY [default: off] --compress Specify utility to decompress cache files - may be "gzcat" or "gzip -dc" Only use if default does not work [default: zcat] --skip_db_check ADVANCED! Force the script to use a cache built from a different database than specified with --host. Only use this if you are sure the hosts are compatible (e.g. ensembldb.ensembl.org and useastdb.ensembl.org) [default: off] --cache_region_size ADVANCED! The size in base-pairs of the region covered by one file in the cache. [default: 1MB] END print $usage; }