Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/VEP.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/VEP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,4713 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +# EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::VEP - Methods used by the Variant Effect Predictor + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::Utils::VEP qw(configure); + + my $config = configure(); + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Utils::VEP; + +# module list +use Getopt::Long; +use FileHandle; +use File::Path qw(make_path); +use Storable qw(nstore_fd fd_retrieve freeze thaw); +use Scalar::Util qw(weaken); +use Digest::MD5 qw(md5_hex); + +use Bio::EnsEMBL::Registry; +use Bio::EnsEMBL::Variation::VariationFeature; +use Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor; +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT overlap); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); +use Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3; +use Bio::EnsEMBL::Variation::StructuralVariationFeature; +use Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor; +use Bio::EnsEMBL::Variation::TranscriptStructuralVariation; + +# we need to manually include all these modules for caching to work +use Bio::EnsEMBL::CoordSystem; +use Bio::EnsEMBL::Transcript; +use 
Bio::EnsEMBL::Translation; +use Bio::EnsEMBL::Exon; +use Bio::EnsEMBL::ProteinFeature; +use Bio::EnsEMBL::Analysis; +use Bio::EnsEMBL::DBSQL::GeneAdaptor; +use Bio::EnsEMBL::DBSQL::SliceAdaptor; +use Bio::EnsEMBL::DBSQL::TranslationAdaptor; +use Bio::EnsEMBL::DBSQL::TranscriptAdaptor; +use Bio::EnsEMBL::DBSQL::MetaContainer; +use Bio::EnsEMBL::DBSQL::CoordSystemAdaptor; + +use Exporter; +use vars qw(@ISA @EXPORT_OK); +@ISA = qw(Exporter); + +# open socket pairs for cross-process comms +use Socket; +socketpair(CHILD, PARENT, AF_UNIX, SOCK_STREAM, PF_UNSPEC) or die "ERROR: Failed to open socketpair: $!"; +CHILD->autoflush(1); +PARENT->autoflush(1); + +@EXPORT_OK = qw( + &parse_line + &vf_to_consequences + &validate_vf + &read_cache_info + &dump_adaptor_cache + &load_dumped_adaptor_cache + &load_dumped_variation_cache + &get_all_consequences + &get_slice + &build_slice_cache + &build_full_cache + ®ions_from_hash + &get_time + &debug + &convert_to_vcf + &progress + &end_progress + @REG_FEAT_TYPES + @OUTPUT_COLS + @VEP_WEB_CONFIG + %FILTER_SHORTCUTS +); + +our @OUTPUT_COLS = qw( + Uploaded_variation + Location + Allele + Gene + Feature + Feature_type + Consequence + cDNA_position + CDS_position + Protein_position + Amino_acids + Codons + Existing_variation + Extra +); + +our @REG_FEAT_TYPES = qw( + RegulatoryFeature + MotifFeature +); + +our @VEP_WEB_CONFIG = qw( + format + check_existing + coding_only + core_type + hgnc + protein + hgvs + terms + check_frequency + freq_filter + freq_gt_lt + freq_freq + freq_pop + sift + polyphen + regulatory +); + +our %FILTER_SHORTCUTS = ( + upstream => { + '5KB_upstream_variant' => 1, + '2KB_upstream_variant' => 1, + }, + downstream => { + '5KB_downstream_variant' => 1, + '2KB_downstream_variant' => 1, + '500B_downstream_variant' => 1, + }, + utr => { + '5_prime_UTR_variant' => 1, + '3_prime_UTR_variant' => 1, + }, + splice => { + splice_donor_variant => 1, + splice_acceptor_variant => 1, + splice_region_variant => 1, + }, + 
coding_change => { + stop_lost => 1, + stop_gained => 1, + missense_variant => 1, + frameshift_variant => 1, + inframe_insertion => 1, + inframe_deletion => 1, + }, + regulatory => { + regulatory_region_variant => 1, + TF_binding_site_variant => 1, + }, +); + +# parses a line of input, returns VF object(s) +sub parse_line { + my $config = shift; + my $line = shift; + + # find out file format - will only do this on first line + if(!defined($config->{format}) || (defined($config->{format}) && $config->{format} eq 'guess')) { + $config->{format} = &detect_format($line); + debug("Detected format of input file as ", $config->{format}) unless defined($config->{quiet}); + + # HGVS and ID formats need DB + die("ERROR: Can't use ".uc($config->{format})." format in offline mode") if $config->{format} =~ /id|hgvs/ && defined($config->{offline}); + + # force certain options if format is VEP output + if($config->{format} eq 'vep') { + $config->{no_consequence} = 1; + delete $config->{regulatory}; + debug("Forcing no consequence calculation") unless defined($config->{quiet}); + } + } + + # check that format is vcf when using --individual + die("ERROR: --individual only compatible with VCF input files\n") if defined($config->{individual}) && $config->{format} ne 'vcf'; + + my $parse_method = 'parse_'.$config->{format}; + $parse_method =~ s/vep_//; + my $method_ref = \&$parse_method; + + my $vfs = &$method_ref($config, $line); + + $vfs = add_lrg_mappings($config, $vfs) if defined($config->{lrg}); + + return $vfs; +} + +# sub-routine to detect format of input +sub detect_format { + my $line = shift; + my @data = split /\s+/, $line; + + # HGVS: ENST00000285667.3:c.1047_1048insC + if ( + scalar @data == 1 && + $data[0] =~ /^([^\:]+)\:.*?([cgmrp]?)\.?([\*\-0-9]+.*)$/i + ) { + return 'hgvs'; + } + + # variant identifier: rs123456 + elsif ( + scalar @data == 1 + ) { + return 'id'; + } + + # VCF: 20 14370 rs6054257 G A 29 0 NS=58;DP=258;AF=0.786;DB;H2 GT:GQ:DP:HQ + elsif ( + $data[0] =~ 
/(chr)?\w+/ && + $data[1] =~ /^\d+$/ && + $data[3] =~ /^[ACGTN-]+$/i && + $data[4] =~ /^([\.ACGTN-]+\,?)+$/i + ) { + return 'vcf'; + } + + # pileup: chr1 60 T A + elsif ( + $data[0] =~ /(chr)?\w+/ && + $data[1] =~ /^\d+$/ && + $data[2] =~ /^[\*ACGTN-]+$/i && + $data[3] =~ /^[\*ACGTNRYSWKM\+\/-]+$/i + ) { + return 'pileup'; + } + + # ensembl: 20 14370 14370 A/G + + elsif ( + $data[0] =~ /\w+/ && + $data[1] =~ /^\d+$/ && + $data[2] =~ /^\d+$/ && + $data[3] =~ /[ACGTN-]+\/[ACGTN-]+/i + ) { + return 'ensembl'; + } + + # vep output: ID 1:142849179 - - - - INTERGENIC + elsif ( + $data[0] =~ /\w+/ && + $data[1] =~ /^\w+?\:\d+(\-\d+)*$/ && + scalar @data == 14 + ) { + return 'vep'; + } + + else { + die("ERROR: Could not detect input file format\n"); + } +} + +# parse a line of Ensembl format input into a variation feature object +sub parse_ensembl { + my $config = shift; + my $line = shift; + + my ($chr, $start, $end, $allele_string, $strand, $var_name) = split /\s+/, $line; + + my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $start, + end => $end, + allele_string => $allele_string, + strand => $strand, + map_weight => 1, + adaptor => $config->{vfa}, + variation_name => $var_name, + chr => $chr, + }); + + return [$vf]; +} + +# parse a line of VCF input into a variation feature object +sub parse_vcf { + my $config = shift; + my $line = shift; + + my @data = split /\s+/, $line; + + # non-variant + my $non_variant = 0; + + if($data[4] eq '.') { + if(defined($config->{allow_non_variant})) { + $non_variant = 1; + } + else { + return []; + } + } + + # get relevant data + my ($chr, $start, $end, $ref, $alt) = ($data[0], $data[1], $data[1], $data[3], $data[4]); + + # some VCF files have a GRCh37 pos defined in GP flag in INFO column + # if user has requested, we can use that as the position instead + if(defined $config->{gp}) { + $chr = undef; + $start = undef; + + foreach my $pair(split /\;/, $data[7]) { + my ($key, $value) = split /\=/, $pair; + if($key 
eq 'GP') { + ($chr, $start) = split /\:/, $value; + $end = $start; + } + } + + unless(defined($chr) and defined($start)) { + warn "No GP flag found in INFO column" unless defined $config->{quiet}; + return []; + } + } + + # adjust end coord + $end += (length($ref) - 1); + + # structural variation + if((defined($data[7]) && $data[7] =~ /SVTYPE/) || $alt =~ /\<|\[|\]|\>/) { + + # parse INFO field + my %info = (); + + foreach my $bit(split /\;/, $data[7]) { + my ($key, $value) = split /\=/, $bit; + $info{$key} = $value; + } + + # like indels, SVs have the base before included for reference + $start++; + + # work out the end coord + if(defined($info{END})) { + $end = $info{END}; + } + elsif(defined($info{SVLEN})) { + $end = $start + abs($info{SVLEN}) - 1; + } + + # check for imprecise breakpoints + my ($min_start, $max_start, $min_end, $max_end); + + if(defined($info{CIPOS})) { + my ($low, $high) = split /\,/, $info{CIPOS}; + $min_start = $start + $low; + $max_start = $start + $high; + } + + if(defined($info{CIEND})) { + my ($low, $high) = split /\,/, $info{CIEND}; + $min_end = $end + $low; + $max_end = $end + $high; + } + + # get type + my $type = $info{SVTYPE}; + my $so_term; + + if(defined($type)) { + # convert to SO term + my %terms = ( + INS => 'insertion', + DEL => 'deletion', + TDUP => 'tandem_duplication', + DUP => 'duplication' + ); + + $so_term = defined $terms{$type} ? $terms{$type} : $type; + } + + my $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new_fast({ + start => $start, + inner_start => $max_start, + outer_start => $min_start, + end => $end, + inner_end => $min_end, + outer_end => $max_end, + strand => 1, + adaptor => $config->{svfa}, + variation_name => $data[2] eq '.' ? 
undef : $data[2], + chr => $chr, + class_SO_term => $so_term, + }); + + return [$svf]; + } + + # normal variation + else { + # find out if any of the alt alleles make this an insertion or a deletion + my ($is_indel, $is_sub, $ins_count, $total_count); + foreach my $alt_allele(split /\,/, $alt) { + $is_indel = 1 if $alt_allele =~ /D|I/; + $is_indel = 1 if length($alt_allele) != length($ref); + $is_sub = 1 if length($alt_allele) == length($ref); + $ins_count++ if length($alt_allele) > length($ref); + $total_count++; + } + + # multiple alt alleles? + if($alt =~ /\,/) { + if($is_indel) { + my @alts; + + if($alt =~ /D|I/) { + foreach my $alt_allele(split /\,/, $alt) { + # deletion (VCF <4) + if($alt_allele =~ /D/) { + push @alts, '-'; + } + + elsif($alt_allele =~ /I/) { + $alt_allele =~ s/^I//g; + push @alts, $alt_allele; + } + } + } + + else { + $ref = substr($ref, 1) || '-'; + $start++; + + foreach my $alt_allele(split /\,/, $alt) { + $alt_allele = substr($alt_allele, 1); + $alt_allele = '-' if $alt_allele eq ''; + push @alts, $alt_allele; + } + } + + $alt = join "/", @alts; + } + + else { + # for substitutions we just need to replace ',' with '/' in $alt + $alt =~ s/\,/\//g; + } + } + + elsif($is_indel) { + # deletion (VCF <4) + if($alt =~ /D/) { + my $num_deleted = $alt; + $num_deleted =~ s/\D+//g; + $end += $num_deleted - 1; + $alt = "-"; + $ref .= ("N" x ($num_deleted - 1)) unless length($ref) > 1; + } + + # insertion (VCF <4) + elsif($alt =~ /I/) { + $ref = '-'; + $alt =~ s/^I//g; + $start++; + } + + # insertion or deletion (VCF 4+) + elsif(substr($ref, 0, 1) eq substr($alt, 0, 1)) { + + # chop off first base + $ref = substr($ref, 1) || '-'; + $alt = substr($alt, 1) || '-'; + + $start++; + } + } + + # create VF object + my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $start, + end => $end, + allele_string => $non_variant ? 
$ref : $ref.'/'.$alt, + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + variation_name => $data[2] eq '.' ? undef : $data[2], + chr => $chr, + }); + + # flag as non-variant + $vf->{non_variant} = 1 if $non_variant; + + # individuals? + if(defined($config->{individual})) { + my @alleles = split /\//, $ref.'/'.$alt; + + my @return; + + foreach my $ind(keys %{$config->{ind_cols}}) { + + # get alleles present in this individual + my @bits; + my $gt = (split /\:/, $data[$config->{ind_cols}->{$ind}])[0]; + + my $phased = ($gt =~ /\|/ ? 1 : 0); + + foreach my $bit(split /\||\/|\\/, $gt) { + push @bits, $alleles[$bit] unless $bit eq '.'; + } + + # shallow copy VF + my $vf_copy = { %$vf }; + bless $vf_copy, ref($vf); + + # get non-refs + my %non_ref = map {$_ => 1} grep {$_ ne $ref} @bits; + + # construct allele_string + if(scalar keys %non_ref) { + $vf_copy->{allele_string} = $ref."/".(join "/", keys %non_ref); + } + else { + $vf_copy->{allele_string} = $ref; + $vf_copy->{non_variant} = 1; + } + + # store phasing info + $vf_copy->{phased} = defined($config->{phased} ? 
1 : $phased); + + # store GT + $vf_copy->{genotype} = \@bits; + + # store individual name + $vf_copy->{individual} = $ind; + + push @return, $vf_copy; + } + + return \@return; + } + else { + return [$vf]; + } + } +} + +# parse a line of pileup input into variation feature objects +sub parse_pileup { + my $config = shift; + my $line = shift; + + my @data = split /\s+/, $line; + + # pileup can produce more than one VF per line + my @return; + + # normal variant + if($data[2] ne "*"){ + my $var; + + if($data[3] =~ /^[A|C|G|T]$/) { + $var = $data[3]; + } + else { + ($var = unambiguity_code($data[3])) =~ s/$data[2]//ig; + } + + for my $alt(split //, $var){ + push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $data[1], + end => $data[1], + allele_string => $data[2].'/'.$alt, + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + chr => $data[0], + }); + } + } + + # in/del + else { + my %tmp_hash = map {$_ => 1} split /\//, $data[3]; + my @genotype = keys %tmp_hash; + + foreach my $allele(@genotype){ + if(substr($allele,0,1) eq "+") { #ins + push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $data[1] + 1, + end => $data[1], + allele_string => '-/'.substr($allele, 1), + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + chr => $data[0], + }); + } + elsif(substr($allele,0,1) eq "-"){ #del + push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $data[1] + 1, + end => $data[1] + length(substr($allele, 1)), + allele_string => substr($allele, 1).'/-', + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + chr => $data[0], + }); + } + elsif($allele ne "*"){ + warn("WARNING: invalid pileup indel genotype: $line\n") unless defined $config->{quiet}; + } + } + } + + return \@return; +} + +# parse a line of HGVS input into a variation feature object +sub parse_hgvs { + my $config = shift; + my $line = shift; + + my $vf; + + # not all hgvs notations are supported yet, so we 
have to wrap it in an eval + eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{sa}, $config->{ta}) }; + + if((!defined($vf) || (defined $@ && length($@) > 1)) && defined($config->{coordinator})) { + eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{ofsa}, $config->{ofta}) }; + } + + if(!defined($vf) || (defined $@ && length($@) > 1)) { + warn("WARNING: Unable to parse HGVS notation \'$line\'\n$@") unless defined $config->{quiet}; + return []; + } + + # get whole chromosome slice + my $slice = $vf->slice->adaptor->fetch_by_region($vf->slice->coord_system->name, $vf->slice->seq_region_name); + + $vf = $vf->transfer($slice); + + # name it after the HGVS + $vf->{variation_name} = $line; + + # add chr attrib + $vf->{chr} = $vf->slice->seq_region_name; + + return [$vf]; +} + +# parse a variation identifier e.g. a dbSNP rsID +sub parse_id { + my $config = shift; + my $line = shift; + + my $v_obj = $config->{va}->fetch_by_name($line); + + return [] unless defined $v_obj; + + my @vfs = @{$v_obj->get_all_VariationFeatures}; + delete $_->{dbID} for @vfs; + delete $_->{overlap_consequences} for @vfs; + $_->{chr} = $_->seq_region_name for @vfs; + + return \@vfs; +} + +# parse a line of VEP output +sub parse_vep { + my $config = shift; + my $line = shift; + + my @data = split /\t/, $line; + + my ($chr, $start, $end) = split /\:|\-/, $data[1]; + $end ||= $start; + + # might get allele string from ID + my $allele_string; + + if($data[0] =~ /^\w\_\w\_\w$/) { + my @split = split /\_/, $data[0]; + $allele_string = $split[-1] if $split[-1] =~ /[ACGTN-]+\/[ACGTN-]+/; + } + + $allele_string ||= 'N/'.($data[6] =~ /intergenic/ ? 
'N' : $data[2]); + + my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ + start => $start, + end => $end, + allele_string => $allele_string, + strand => 1, + map_weight => 1, + adaptor => $config->{vfa}, + chr => $chr, + variation_name => $data[0], + }); + + return [$vf]; +} + + + +# converts to VCF format +sub convert_to_vcf { + my $config = shift; + my $vf = shift; + + # look for imbalance in the allele string + my %allele_lengths; + my @alleles = split /\//, $vf->allele_string; + + foreach my $allele(@alleles) { + $allele =~ s/\-//g; + $allele_lengths{length($allele)} = 1; + } + + # in/del/unbalanced + if(scalar keys %allele_lengths > 1) { + + # we need the ref base before the variation + # default to N in case we can't get it + my $prev_base = 'N'; + + unless(defined($config->{cache})) { + my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1); + $prev_base = $slice->seq if defined($slice); + } + + for my $i(0..$#alleles) { + $alleles[$i] =~ s/\-//g; + $alleles[$i] = $prev_base.$alleles[$i]; + } + + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start - 1, + $vf->variation_name, + shift @alleles, + (join ",", @alleles), + '.', '.', '.' + ]; + + } + + # balanced sub + else { + return [ + $vf->{chr} || $vf->seq_region_name, + $vf->start, + $vf->variation_name, + shift @alleles, + (join ",", @alleles), + '.', '.', '.' + ]; + } +} + + +# tries to map a VF to the LRG coordinate system +sub add_lrg_mappings { + my $config = shift; + my $vfs = shift; + + my @new_vfs; + + foreach my $vf(@$vfs) { + + # add the unmapped VF to the array + push @new_vfs, $vf; + + # make sure the VF has an attached slice + $vf->{slice} ||= get_slice($config, $vf->{chr}); + next unless defined($vf->{slice}); + + # transform LRG <-> chromosome + my $new_vf = $vf->transform($vf->{slice}->coord_system->name eq 'lrg' ? 
'chromosome' : 'lrg'); + + # add it to the array if transformation worked + if(defined($new_vf)) { + + # update new VF's chr entry + $new_vf->{chr} = $new_vf->seq_region_name; + push @new_vfs, $new_vf; + } + } + + return \@new_vfs; +} + + +# wrapper for whole_genome_fetch and vf_to_consequences +# takes config and a listref of VFs, returns listref of line hashes for printing +sub get_all_consequences { + my $config = shift; + my $listref = shift; + + if ($config->{extra}) { + eval "use Plugin qw($config);" + } + + # initialize caches + $config->{$_.'_cache'} ||= {} for qw(tr rf slice); + + # build hash + my %vf_hash; + push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref; + + my @non_variant = grep {defined($_->{non_variant})} @$listref; + debug("Skipping ".(scalar @non_variant)." non-variant loci\n") unless defined($config->{quiet}); + + # get regions + my $regions = ®ions_from_hash($config, \%vf_hash); + my $trim_regions = $regions; + + # get trimmed regions - allows us to discard out-of-range transcripts + # when using cache + #if(defined($config->{cache})) { + # my $tmp = $config->{cache}; + # delete $config->{cache}; + # $trim_regions = regions_from_hash($config, \%vf_hash); + # $config->{cache} = $tmp; + #} + + # prune caches + prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr}); + prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf}); + + # get chr list + my %chrs = map {$_->{chr} => 1} @$listref; + + my $fetched_tr_count = 0; + $fetched_tr_count = fetch_transcripts($config, $regions, $trim_regions) + unless defined($config->{no_consequences}); + + my $fetched_rf_count = 0; + $fetched_rf_count = fetch_regfeats($config, $regions, $trim_regions) + if defined($config->{regulatory}) + && !defined($config->{no_consequences}); + + # check we can use MIME::Base64 + if(defined($config->{fork})) { + eval q{ use MIME::Base64; }; + + if($@) { + 
debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet}); + delete $config->{fork}; + } + } + + my (@temp_array, @return, @pids); + if(defined($config->{fork})) { + my $size = scalar @$listref; + + while(my $tmp_vf = shift @$listref) { + push @temp_array, $tmp_vf; + + # fork + if(scalar @temp_array >= ($size / $config->{fork}) || scalar @$listref == 0) { + + my $pid = fork; + + if(!defined($pid)) { + debug("WARNING: Failed to fork - will attempt to continue without forking") unless defined($config->{quiet}); + push @temp_array, @$listref; + push @return, @{vf_list_to_cons($config, \@temp_array, $regions)}; + last; + } + elsif($pid) { + push @pids, $pid; + @temp_array = (); + } + elsif($pid == 0) { + + $config->{forked} = $$; + $config->{quiet} = 1; + + # redirect STDERR to PARENT so we can catch errors + *STDERR = *PARENT; + + my $cons = vf_list_to_cons($config, \@temp_array, $regions); + + # what we're doing here is sending a serialised hash of the + # results through to the parent process through the socket. + # This is then thawed by the parent process. + # $$, or the PID, is added so that the input can be sorted + # back into the correct order for output + print PARENT $$." ".encode_base64(freeze($_), "\t")."\n" for @$cons; + + # some plugins may cache stuff, check for this and try and + # reconstitute it into parent's plugin cache + foreach my $plugin(@{$config->{plugins}}) { + + next unless defined($plugin->{has_cache}); + + # delete unnecessary stuff and stuff that can't be serialised + delete $plugin->{$_} for qw(config feature_types variant_feature_types version feature_types_wanted variant_feature_types_wanted params); + print PARENT $$." PLUGIN ".ref($plugin)." 
".encode_base64(freeze($plugin), "\t")."\n"; + } + + # we need to tell the parent this child is finished + # otherwise it keeps listening + print PARENT "DONE $$\n"; + close PARENT; + + exit(0); + } + } + } + + debug("Calculating consequences") unless defined($config->{quiet}); + + my $fh = $config->{out_file_handle}; + my $done_processes = 0; + my $done_vars = 0; + my $total_size = $size; + my $pruned_count; + + # create a hash to store the returned data by PID + # this means we can sort it correctly on output + my %by_pid; + + # read child input + while(<CHILD>) { + + # child finished + if(/^DONE/) { + $done_processes++; + last if $done_processes == scalar @pids; + } + + # variant finished / progress indicator + elsif(/^BUMP/) { + progress($config, ++$done_vars, $total_size);# if $pruned_count == scalar @pids; + } + + # output + elsif(/^\-?\d+ /) { + + # plugin + if(/^\-?\d+ PLUGIN/) { + + m/^(\-?\d+) PLUGIN (\w+) /; + my ($pid, $plugin) = ($1, $2); + + # remove the PID + s/^\-?\d+ PLUGIN \w+ //; + chomp; + + my $tmp = thaw(decode_base64($_)); + + next unless defined($plugin); + + # copy data to parent plugin + my ($parent_plugin) = grep {ref($_) eq $plugin} @{$config->{plugins}}; + + next unless defined($parent_plugin); + + merge_hashes($parent_plugin, $tmp); + } + + else { + # grab the PID + m/^(\-?\d+)\s/; + my $pid = $1; + die "ERROR: Could not parse forked PID from line $_" unless defined($pid); + + # remove the PID + s/^\-?\d+\s//; + chomp; + + # decode and thaw "output" from forked process + push @{$by_pid{$pid}}, thaw(decode_base64($_)); + } + } + + elsif(/^PRUNED/) { + s/PRUNED //g; + chomp; + $pruned_count++; + $total_size += $_; + } + + elsif(/^DEBUG/) { + print STDERR; + } + + # something's wrong + else { + # kill the other pids + kill(15, $_) for @pids; + die("\nERROR: Forked process failed\n$_\n"); + } + } + + end_progress($config); + + debug("Writing output") unless defined($config->{quiet}); + + waitpid($_, 0) for @pids; + + # add the sorted data 
to the return array + push @return, @{$by_pid{$_} || []} for @pids; + } + + # no forking + else { + push @return, @{vf_list_to_cons($config, $listref, $regions)}; + } + + if(defined($config->{debug})) { + eval q{use Devel::Size qw(total_size)}; + my $mem = memory(); + my $tot; + $tot += $_ for @$mem; + + if($tot > 1000000) { + $tot = sprintf("%.2fGB", $tot / (1024 * 1024)); + } + + elsif($tot > 1000) { + $tot = sprintf("%.2fMB", $tot / 1024); + } + + my $mem_diff = mem_diff($config); + debug( + "LINES ", $config->{line_number}, + "\tMEMORY $tot ", (join " ", @$mem), + "\tDIFF ", (join " ", @$mem_diff), + "\tCACHE ", total_size($config->{tr_cache}). + "\tRF ", total_size($config->{rf_cache}), + "\tVF ", total_size(\%vf_hash), + ); + #exit(0) if grep {$_ < 0} @$mem_diff; + } + + return \@return; +} + +sub vf_list_to_cons { + my $config = shift; + my $listref = shift; + my $regions = shift; + + # get non-variants + my @non_variants = grep {$_->{non_variant}} @$listref; + + my %vf_hash = (); + push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref; + + # check existing VFs + &check_existing_hash($config, \%vf_hash) if defined($config->{check_existing});# && scalar @$listref > 10; + + # get overlapping SVs + &check_svs_hash($config, \%vf_hash) if defined($config->{check_svs}); + + # if we are forked, we can trim off some stuff + if(defined($config->{forked})) { + my $tmp = $config->{cache}; + delete $config->{cache}; + $regions = regions_from_hash($config, \%vf_hash); + $config->{cache} = $tmp; + + # prune caches + my $new_count = 0; + + $new_count += prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr}); + $new_count += prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf}); + + print PARENT "PRUNED $new_count\n"; + } + + my @return; + + foreach my $chr(sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? 
$a cmp $b : $a <=> $b} keys %vf_hash) { + my $finished_vfs = whole_genome_fetch($config, $chr, \%vf_hash); + + # non-variants? + if(scalar @non_variants) { + push @$finished_vfs, grep {$_->{chr} eq $chr} @non_variants; + + # need to re-sort + @$finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @$finished_vfs; + } + + debug("Calculating consequences") unless defined($config->{quiet}); + + my $vf_count = scalar @$finished_vfs; + my $vf_counter = 0; + + while(my $vf = shift @$finished_vfs) { + progress($config, $vf_counter++, $vf_count) unless $vf_count == 1; + + my $filter_ok = 1; + + # filtered output + if(defined($config->{filter})) { + $filter_ok = filter_by_consequence($config, $vf); + $config->{filter_count} += $filter_ok; + } + + # skip filtered lines + next unless $filter_ok; + + # original output + if(defined($config->{original})) { + push @return, $vf->{_line}; + } + + # GVF output + elsif(defined($config->{gvf})) { + $vf->source("User"); + + $config->{gvf_id} ||= 1; + + # get custom annotation + my $custom_annotation = defined($config->{custom}) ? get_custom_annotation($config, $vf) : {}; + $custom_annotation->{ID} = $config->{gvf_id}++; + + + my $tmp = $vf->to_gvf( + include_consequences => defined($config->{no_consequences}) ? 0 : 1, + extra_attrs => $custom_annotation, + ); + push @return, \$tmp; + } + + # VCF output + elsif(defined($config->{vcf})) { + + # convert to VCF, otherwise get line + my $line = $config->{format} eq 'vcf' ? 
[split /\s+/, $vf->{_line}] : convert_to_vcf($config, $vf); + + if(!defined($line->[7]) || $line->[7] eq '.') { + $line->[7] = ''; + } + + # get all the lines the normal way + # and process them into VCF-compatible string + my $string = 'CSQ='; + + foreach my $line(@{vf_to_consequences($config, $vf)}) { + + # use the field list (can be user-defined by setting --fields) + for my $col(@{$config->{fields}}) { + + # skip fields already represented in the VCF + next if $col eq 'Uploaded_variation' or $col eq 'Location' or $col eq 'Extra'; + + # search for data in main line hash as well as extra field + my $data = defined $line->{$col} ? $line->{$col} : $line->{Extra}->{$col}; + + # "-" means null for everything except the Allele field (confusing...) + $data = undef if defined($data) and $data eq '-' and $col ne 'Allele'; + $data =~ s/\,/\&/g if defined $data; + $string .= defined($data) ? $data : ''; + $string .= '|'; + } + + $string =~ s/\|$//; + $string .= ','; + } + + $string =~ s/\,$//; + + if(!defined($config->{no_consequences}) && $string ne 'CSQ=') { + $line->[7] .= ($line->[7] ? ';' : '').$string; + } + + # get custom annotation + if(defined($config->{custom}) && scalar @{$config->{custom}}) { + my $custom_annotation = get_custom_annotation($config, $vf); + foreach my $key(keys %{$custom_annotation}) { + $line->[7] .= ($line->[7] ? ';' : '').$key.'='.$custom_annotation->{$key}; + } + } + + my $tmp = join "\t", @$line; + push @return, \$tmp; + } + + # no consequence output from vep input + elsif(defined($config->{no_consequences}) && $config->{format} eq 'vep') { + + my $line = [split /\s+/, $vf->{_line}]; + + if($line->[13] eq '-') { + $line->[13] = ''; + } + + # get custom annotation + if(defined($config->{custom})) { + my $custom_annotation = get_custom_annotation($config, $vf); + foreach my $key(keys %{$custom_annotation}) { + $line->[13] .= ($line->[13] ? 
';' : '').$key.'='.$custom_annotation->{$key}; + } + } + + my $tmp = join "\t", @$line; + push @return, \$tmp; + } + + # normal output + else { + push @return, @{vf_to_consequences($config, $vf)}; + } + + print PARENT "BUMP\n" if defined($config->{forked}); + } + + end_progress($config) unless scalar @$listref == 1; + } + + return \@return; +} + +# takes a variation feature and returns ready to print consequence information +sub vf_to_consequences { + my $config = shift; + my $vf = shift; + + # use a different method for SVs + return svf_to_consequences($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature'); + + my @return = (); + + # method name for consequence terms + my $term_method = $config->{terms}.'_term'; + + # find any co-located existing VFs + $vf->{existing} ||= find_existing($config, $vf) if defined $config->{check_existing}; + + # skip based on frequency checks? + if(defined($config->{check_frequency}) && defined($vf->{existing}) && $vf->{existing} ne '-' && defined($config->{va})) { + return [] unless grep {$_} map {check_frequencies($config, $_)} reverse split(/\,/, $vf->{existing}); + $vf->{freqs} = $config->{filtered_freqs}; + } + + # force empty hash into object's transcript_variations if undefined from whole_genome_fetch + # this will stop the API trying to go off and fill it again + $vf->{transcript_variations} ||= {} if defined $config->{whole_genome}; + + # regulatory stuff + if(!defined $config->{coding_only} && defined $config->{regulatory}) { + + for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) { + + my $rf = $rfv->regulatory_feature; + + my $base_line = { + Feature_type => 'RegulatoryFeature', + Feature => $rf->stable_id, + }; + + if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { + $base_line->{Extra}->{CELL_TYPE} = join ",", + map {$_.':'.$rf->{cell_types}->{$_}} + grep {$rf->{cell_types}->{$_}} + @{$config->{cell_type}}; + + $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g; + } + + 
# this currently always returns 'RegulatoryFeature', so we ignore it for now + #$base_line->{Extra}->{REG_FEAT_TYPE} = $rf->feature_type->name; + + for my $rfva (@{ $rfv->get_all_alternate_RegulatoryFeatureVariationAlleles }) { + + my $line = init_line($config, $vf, $base_line); + + $line->{Allele} = $rfva->variation_feature_seq; + $line->{Consequence} = join ',', + map { $_->$term_method || $_->SO_term } + @{ $rfva->get_all_OverlapConsequences }; + + $line = run_plugins($rfva, $line, $config); + + push @return, $line; + } + } + + for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) { + + my $mf = $mfv->motif_feature; + + # check that the motif has a binding matrix, if not there's not + # much we can do so don't return anything + + next unless defined $mf->binding_matrix; + + my $matrix = $mf->binding_matrix->description.' '.$mf->display_label; + $matrix =~ s/\s+/\_/g; + + my $base_line = { + Feature_type => 'MotifFeature', + Feature => $mf->binding_matrix->name, + Extra => { + MOTIF_NAME => $matrix, + } + }; + + if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { + $base_line->{Extra}->{CELL_TYPE} = join ",", + map {$_.':'.$mf->{cell_types}->{$_}} + grep {$mf->{cell_types}->{$_}} + @{$config->{cell_type}}; + + $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g; + } + + for my $mfva (@{ $mfv->get_all_alternate_MotifFeatureVariationAlleles }) { + + my $line = init_line($config, $vf, $base_line); + + $line->{Extra}->{MOTIF_POS} = $mfva->motif_start if defined $mfva->motif_start; + $line->{Extra}->{HIGH_INF_POS} = ($mfva->in_informative_position ? 
'Y' : 'N'); + + my $delta = $mfva->motif_score_delta if $mfva->variation_feature_seq =~ /^[ACGT]+$/; + + $line->{Extra}->{MOTIF_SCORE_CHANGE} = sprintf("%.3f", $delta) if defined $delta; + + $line->{Allele} = $mfva->variation_feature_seq; + $line->{Consequence} = join ',', + map { $_->$term_method || $_->SO_term } + @{ $mfva->get_all_OverlapConsequences }; + + $line = run_plugins($mfva, $line, $config); + + push @return, $line; + } + } + } + + # get TVs + my $tvs = $vf->get_all_TranscriptVariations; + + # only most severe + if(defined($config->{most_severe}) || defined($config->{summary})) { + + my $line = init_line($config, $vf); + + if(defined($config->{summary})) { + $line->{Consequence} = join ",", @{$vf->consequence_type($config->{terms}) || $vf->consequence_type}; + } + else { + $line->{Consequence} = $vf->display_consequence($config->{terms}) || $vf->display_consequence; + } + + push @return, $line; + } + + # pass a true argument to get_IntergenicVariation to stop it doing a reference allele check + # (to stay consistent with the rest of the VEP) + elsif ((my $iv = $vf->get_IntergenicVariation(1)) && !defined($config->{no_intergenic})) { + + for my $iva (@{ $iv->get_all_alternate_IntergenicVariationAlleles }) { + + my $line = init_line($config, $vf); + + $line->{Allele} = $iva->variation_feature_seq; + + my $cons = $iva->get_all_OverlapConsequences->[0]; + + $line->{Consequence} = $cons->$term_method || $cons->SO_term; + + $line = run_plugins($iva, $line, $config); + + push @return, $line; + } + } + + # user wants only one conseqeunce per gene + elsif(defined($config->{per_gene})) { + + # sort the TVA objects into a hash by gene + my %by_gene; + + foreach my $tv(@$tvs) { + next if(defined $config->{coding_only} && !($tv->affects_cds)); + + my $gene = $tv->transcript->{_gene_stable_id} || $config->{ga}->fetch_by_transcript_stable_id($tv->transcript->stable_id)->stable_id; + + push @{$by_gene{$gene}}, @{$tv->get_all_alternate_TranscriptVariationAlleles}; + } + 
+ foreach my $gene(keys %by_gene) { + my ($lowest, $lowest_tva); + + # at the moment this means the one that comes out last will be picked + # if there is more than one TVA with the same rank of consequence + foreach my $tva(@{$by_gene{$gene}}) { + foreach my $oc(@{$tva->get_all_OverlapConsequences}) { + if(!defined($lowest) || $oc->rank < $lowest) { + $lowest = $oc->rank; + $lowest_tva = $tva; + } + } + } + + push @return, tva_to_line($config, $lowest_tva); + } + } + + else { + foreach my $tv(@$tvs) { + next if(defined $config->{coding_only} && !($tv->affects_cds)); + + push @return, map {tva_to_line($config, $_)} @{$tv->get_all_alternate_TranscriptVariationAlleles}; + + undef $tv->{$_} for keys %$tv; + } + } + + return \@return; +} + +# get consequences for a structural variation feature +sub svf_to_consequences { + my $config = shift; + my $svf = shift; + + my @return = (); + + my $term_method = $config->{terms}.'_term'; + + if(defined $config->{whole_genome}) { + $svf->{transcript_structural_variations} ||= []; + $svf->{regulation_structural_variations}->{$_} ||= [] for @REG_FEAT_TYPES; + } + + if ((my $iv = $svf->get_IntergenicStructuralVariation(1)) && !defined($config->{no_intergenic})) { + + for my $iva (@{ $iv->get_all_alternate_IntergenicStructuralVariationAlleles }) { + + my $line = init_line($config, $svf); + + $line->{Allele} = '-'; + + my $cons = $iva->get_all_OverlapConsequences->[0]; + + $line->{Consequence} = $cons->$term_method || $cons->SO_term; + + $line = run_plugins($iva, $line, $config); + + push @return, $line; + } + } + + foreach my $svo(@{$svf->get_all_StructuralVariationOverlaps}) { + + next if $svo->isa('Bio::EnsEMBL::Variation::IntergenicStructuralVariation'); + + my $feature = $svo->feature; + + # get feature type + my $feature_type = (split '::', ref($feature))[-1]; + + my $base_line = { + Feature_type => $feature_type, + Feature => $feature->stable_id, + Allele => $svf->class_SO_term, + }; + + 
if($svo->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariation')) { + $base_line->{cDNA_position} = format_coords($svo->cdna_start, $svo->cdna_end); + $base_line->{CDS_position} = format_coords($svo->cds_start, $svo->cds_end); + $base_line->{Protein_position} = format_coords($svo->translation_start, $svo->translation_end); + } + + foreach my $svoa(@{$svo->get_all_StructuralVariationOverlapAlleles}) { + my $line = init_line($config, $svf, $base_line); + + $line->{Consequence} = join ",", + #map {s/feature/$feature_type/e; $_} + map {$_->$term_method} + sort {$a->rank <=> $b->rank} + @{$svoa->get_all_OverlapConsequences}; + + # work out overlap amounts + my $overlap_start = (sort {$a <=> $b} ($svf->start, $feature->start))[-1]; + my $overlap_end = (sort {$a <=> $b} ($svf->end, $feature->end))[0]; + my $overlap_length = ($overlap_end - $overlap_start) + 1; + my $overlap_pc = 100 * ($overlap_length / (($feature->end - $feature->start) + 1)); + + $line->{Extra}->{OverlapBP} = $overlap_length if $overlap_length > 0; + $line->{Extra}->{OverlapPC} = sprintf("%.2f", $overlap_pc) if $overlap_pc > 0; + + add_extra_fields($config, $line, $svoa); + + push @return, $line; + } + } + + return \@return; +} + +# run all of the configured plugins on a VariationFeatureOverlapAllele instance +# and store any results in the provided line hash +sub run_plugins { + + my ($bvfoa, $line_hash, $config) = @_; + + my $skip_line = 0; + + for my $plugin (@{ $config->{plugins} || [] }) { + + # check that this plugin is interested in this type of variation feature + + if ($plugin->check_variant_feature_type(ref $bvfoa->base_variation_feature)) { + + # check that this plugin is interested in this type of feature + + if ($plugin->check_feature_type(ref $bvfoa->feature || 'Intergenic')) { + + eval { + my $plugin_results = $plugin->run($bvfoa, $line_hash); + + if (defined $plugin_results) { + if (ref $plugin_results eq 'HASH') { + for my $key (keys %$plugin_results) { + $line_hash->{Extra}->{$key} = 
$plugin_results->{$key}; + } + } + else { + warn "Plugin '".(ref $plugin)."' did not return a hashref, output ignored!\n"; + } + } + else { + # if a plugin returns undef, that means it want to filter out this line + $skip_line = 1; + } + }; + if ($@) { + warn "Plugin '".(ref $plugin)."' went wrong: $@"; + } + + # there's no point running any other plugins if we're filtering this line, + # because the first plugin to skip the line wins, so we might as well last + # out of the loop now and avoid any unnecessary computation + + last if $skip_line; + } + } + } + + return $skip_line ? undef : $line_hash; +} + +# turn a TranscriptVariationAllele into a line hash +sub tva_to_line { + my $config = shift; + my $tva = shift; + + my $tv = $tva->transcript_variation; + my $t = $tv->transcript; + + # method name for consequence terms + my $term_method = $config->{terms}.'_term'; + + my $base_line = { + Feature_type => 'Transcript', + Feature => (defined $t ? $t->stable_id : undef), + cDNA_position => format_coords($tv->cdna_start, $tv->cdna_end), + CDS_position => format_coords($tv->cds_start, $tv->cds_end), + Protein_position => format_coords($tv->translation_start, $tv->translation_end), + Allele => $tva->variation_feature_seq, + Amino_acids => $tva->pep_allele_string, + Codons => $tva->display_codon_allele_string, + Consequence => join ",", map {$_->$term_method || $_->SO_term} sort {$a->rank <=> $b->rank} @{$tva->get_all_OverlapConsequences}, + }; + + my $line = init_line($config, $tva->variation_feature, $base_line); + + # HGVS + if(defined $config->{hgvs}) { + my $hgvs_t = $tva->hgvs_transcript; + my $hgvs_p = $tva->hgvs_protein; + + $line->{Extra}->{HGVSc} = $hgvs_t if $hgvs_t; + $line->{Extra}->{HGVSp} = $hgvs_p if $hgvs_p; + } + + foreach my $tool (qw(SIFT PolyPhen)) { + my $lc_tool = lc($tool); + + if (my $opt = $config->{$lc_tool}) { + my $want_pred = $opt =~ /^p/i; + my $want_score = $opt =~ /^s/i; + my $want_both = $opt =~ /^b/i; + + if ($want_both) { + $want_pred 
= 1;
                $want_score = 1;
            }

            next unless $want_pred || $want_score;

            my $pred_meth  = $lc_tool.'_prediction';
            my $score_meth = $lc_tool.'_score';

            my $pred = $tva->$pred_meth;

            if($pred) {

                if ($want_pred) {
                    # NOTE(review): only the first whitespace run is replaced
                    # (no /g) — confirm this matches the intended output format
                    $pred =~ s/\s+/\_/;
                    $line->{Extra}->{$tool} = $pred;
                }

                if ($want_score) {
                    my $score = $tva->$score_meth;

                    if(defined $score) {
                        if($want_pred) {
                            $line->{Extra}->{$tool} .= "($score)";
                        }
                        else {
                            $line->{Extra}->{$tool} = $score;
                        }
                    }
                }
            }
        }
    }

    $line = add_extra_fields($config, $line, $tva);

    return $line;
}

# add extra fields shared by all allele types: overlapping SVs,
# transcript-specific fields (when applicable) and plugin output
sub add_extra_fields {
    my $config = shift;
    my $line   = shift;
    my $bvfoa  = shift;

    # overlapping SVs
    if(defined $config->{check_svs} && defined $bvfoa->base_variation_feature->{overlapping_svs}) {
        $line->{Extra}->{SV} = $bvfoa->base_variation_feature->{overlapping_svs};
    }

    # add transcript-specific fields
    $line = add_extra_fields_transcript($config, $line, $bvfoa) if $bvfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele');

    # run plugins
    $line = run_plugins($bvfoa, $line, $config);

    return $line;
}

# add transcript-specific extra fields (gene, exon/intron numbers, domains,
# HGNC, CCDS, RefSeq xrefs, protein ID, canonical flag)
sub add_extra_fields_transcript {
    my $config = shift;
    my $line   = shift;
    my $tva    = shift;

    my $tv = $tva->base_variation_feature_overlap;
    my $tr = $tva->transcript;

    # get gene; prefer the stable ID cached on the transcript,
    # fall back to a gene adaptor fetch
    my $gene;

    $line->{Gene} = $tr->{_gene_stable_id};

    if(!defined($line->{Gene})) {
        $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id);
        $line->{Gene} = $gene ? $gene->stable_id : '-';
    }

    # exon/intron numbers
    if ($config->{numbers}) {
        $line->{Extra}->{EXON}   = $tv->exon_number if defined $tv->exon_number;
        $line->{Extra}->{INTRON} = $tv->intron_number if defined $tv->intron_number;
    }

    if ($config->{domains}) {
        my $feats = $tv->get_overlapping_ProteinFeatures;

        my @strings;

        for my $feat (@$feats) {
            my $label = $feat->analysis->display_label.':'.$feat->hseqname;

            # replace any special characters
            $label =~ s/[\s;=]/_/g;

            push @strings, $label;
        }

        $line->{Extra}->{DOMAINS} = join ',', @strings if @strings;
    }

    # distance to transcript
    if($line->{Consequence} =~ /(up|down)stream/i) {
        $line->{Extra}->{DISTANCE} = $tv->distance_to_transcript;
    }

    # HGNC
    if(defined $config->{hgnc}) {
        my $hgnc;
        $hgnc = $tr->{_gene_hgnc};

        if(!defined($hgnc)) {
            if(!defined($gene)) {
                $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id);
            }

            my @entries = grep {$_->database eq 'HGNC'} @{$gene->get_all_DBEntries()};
            if(scalar @entries) {
                $hgnc = $entries[0]->display_id;
            }
        }

        $hgnc = undef if defined($hgnc) && $hgnc eq '-';

        $line->{Extra}->{HGNC} = $hgnc if defined($hgnc);
    }

    # CCDS
    if(defined($config->{ccds})) {
        my $ccds = $tr->{_ccds};

        if(!defined($ccds)) {
            my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries};
            $ccds = $entries[0]->display_id if scalar @entries;
        }

        $ccds = undef if defined($ccds) && $ccds eq '-';

        $line->{Extra}->{CCDS} = $ccds if defined($ccds);
    }

    # refseq xref
    if(defined($config->{xref_refseq})) {
        my $refseq = $tr->{_refseq};

        if(!defined($refseq)) {
            my @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries};
            if(scalar @entries) {
                $refseq = join ",", map {$_->display_id."-".$_->database} @entries;
            }
        }

        $refseq = undef if defined($refseq) && $refseq eq '-';

        $line->{Extra}->{RefSeq} = $refseq if defined($refseq);
    }

    # protein ID
    if(defined $config->{protein}) {
        my $protein = $tr->{_protein};

        if(!defined($protein)) {
            $protein = $tr->translation->stable_id if defined($tr->translation);
        }

        $protein = undef if defined($protein) && $protein eq '-';

        $line->{Extra}->{ENSP} = $protein if defined($protein);
    }

    # canonical transcript
    if(defined $config->{canonical}) {
        $line->{Extra}->{CANONICAL} = 'YES' if $tr->is_canonical;
    }

    return $line;
}

# initialize a line hash for one output row; $base_line entries (if given)
# are copied over the defaults
sub init_line {
    my $config = shift;
    my $vf = shift;
    my $base_line = shift;

    my $line = {
        Uploaded_variation => $vf->variation_name,
        Location           => ($vf->{chr} || $vf->seq_region_name).':'.format_coords($vf->start, $vf->end),
        Existing_variation => $vf->{existing},
        Extra              => {},
    };

    # add custom info
    if(defined($config->{custom}) && scalar @{$config->{custom}}) {
        # merge the custom hash with the extra hash
        my $custom = get_custom_annotation($config, $vf);

        for my $key (keys %$custom) {
            $line->{Extra}->{$key} = $custom->{$key};
        }
    }

    # individual?
    $line->{Extra}->{IND} = $vf->{individual} if defined($vf->{individual});

    # frequencies?
    $line->{Extra}->{FREQS} = join ",", @{$vf->{freqs}} if defined($vf->{freqs});

    # gmaf?
+ $line->{Extra}->{GMAF} = $vf->{gmaf} if defined($config->{gmaf}) && defined($vf->{gmaf}); + + # copy entries from base_line + if(defined($base_line)) { + $line->{$_} = $base_line->{$_} for keys %$base_line; + } + + return $line; +} + + +# get custom annotation for a single VF +sub get_custom_annotation { + my $config = shift; + my $vf = shift; + my $cache = shift; + + return $vf->{custom} if defined($vf->{custom}); + + my $annotation = {}; + + my $chr = $vf->{chr}; + + if(!defined($cache)) { + # spoof regions + my $regions; + $regions->{$chr} = [$vf->{start}.'-'.$vf->{end}]; + $cache = cache_custom_annotation($config, $regions, $chr); + } + + foreach my $custom(@{$config->{custom}}) { + next unless defined($cache->{$chr}->{$custom->{name}}); + + # exact type must match coords of variant exactly + if($custom->{type} eq 'exact') { + foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$vf->{start}}}) { + + next unless + $feature->{chr} eq $chr && + $feature->{start} eq $vf->{start} && + $feature->{end} eq $vf->{end}; + + $annotation->{$custom->{name}} .= $feature->{name}.','; + } + } + + # overlap type only needs to overlap, but we need to search the whole range + elsif($custom->{type} eq 'overlap') { + foreach my $pos(keys %{$cache->{$chr}->{$custom->{name}}}) { + foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$pos}}) { + + next unless + $feature->{chr} eq $chr && + $feature->{end} >= $vf->{start} && + $feature->{start} <= $vf->{end}; + + $annotation->{$custom->{name}} .= $feature->{name}.','; + } + } + } + + # trim off trailing commas + $annotation->{$custom->{name}} =~ s/\,$//g if defined($annotation->{$custom->{name}}); + } + + return $annotation; +} + +# decides whether to print a VF based on user defined consequences +sub filter_by_consequence { + my $config = shift; + my $vf = shift; + my $filters = $config->{filter}; + + # find it if we only have "no"s + my $only_nos = 0; + $only_nos = 1 if (sort {$a <=> $b} values 
%$filters)[-1] == 0; + + my ($yes, $no) = (0, 0); + + # get all consequences across all term types + my @types = ('SO', 'display'); + + my @cons; + push @cons, @{$vf->consequence_type($_)} for @types; + + # add regulatory consequences + if(defined($config->{regulatory})) { + foreach my $term_type(@types) { + my $term_method = $term_type.'_term'; + + for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) { + for my $rfva(@{$rfv->get_all_alternate_RegulatoryFeatureVariationAlleles}) { + push @cons, map {$_->$term_method} @{ $rfva->get_all_OverlapConsequences }; + } + } + for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) { + for my $mfva(@{$mfv->get_all_alternate_MotifFeatureVariationAlleles}) { + push @cons, map {$_->$term_method} @{ $mfva->get_all_OverlapConsequences }; + } + } + } + } + + foreach my $con(grep {defined($_) && defined($filters->{$_})} @cons) { + if($filters->{$con} == 1) { + $yes = 1; + } + else { + $no = 1; + } + } + + # check special case, coding + if(defined($filters->{coding})) { + if(grep {$_->affects_cds} @{$vf->get_all_TranscriptVariations}) { + if($filters->{coding} == 1) { + $yes = 1; + } + else { + $no = 1; + } + } + } + + my $ok = 0; + if($only_nos) { + $ok = 1 if !$no; + } + else { + $ok = 1 if $yes && !$no; + } + + return $ok; +} + + +# takes VFs created from input, fixes and checks various things +sub validate_vf { + my $config = shift; + my $vf = shift; + + # user specified chr skip list + return 0 if defined($config->{chr}) && !$config->{chr}->{$vf->{chr}}; + + # fix inputs + $vf->{chr} =~ s/chr//ig unless $vf->{chr} =~ /^chromosome$/i; + $vf->{chr} = 'MT' if $vf->{chr} eq 'M'; + $vf->{strand} ||= 1; + $vf->{strand} = ($vf->{strand} =~ /\-/ ? "-1" : "1"); + + # sanity checks + unless($vf->{start} =~ /^\d+$/ && $vf->{end} =~ /^\d+$/) { + warn("WARNING: Start ".$vf->{start}." or end ".$vf->{end}." 
coordinate invalid on line ".$config->{line_number}."\n") unless defined $config->{quiet}; + return 0; + } + + # structural variation? + return validate_svf($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature'); + + # uppercase allele string + $vf->{allele_string} =~ tr/[a-z]/[A-Z]/; + + unless($vf->{allele_string} =~ /([ACGT-]+\/*)+/) { + warn("WARNING: Invalid allele string ".$vf->{allele_string}." on line ".$config->{line_number}." or possible parsing error\n") unless defined $config->{quiet}; + return 0; + } + + # insertion should have start = end + 1 + if($vf->{allele_string} =~ /^\-\// && $vf->{start} != $vf->{end} + 1) { + warn( + "WARNING: Alleles look like an insertion (". + $vf->{allele_string}. + ") but coordinates are not start = end + 1 (START=". + $vf->{start}.", END=".$vf->{end}. + ") on line ".$config->{line_number}."\n" + ) unless defined($config->{quiet}); + return 0; + } + + # check length of reference matches seq length spanned + my @alleles = split /\//, $vf->{allele_string}; + my $ref_allele = shift @alleles; + my $tmp_ref_allele = $ref_allele; + $tmp_ref_allele =~ s/\-//g; + + #if(($vf->{end} - $vf->{start}) + 1 != length($tmp_ref_allele)) { + # warn( + # "WARNING: Length of reference allele (".$ref_allele. + # " length ".length($tmp_ref_allele).") does not match co-ordinates ".$vf->{start}."-".$vf->{end}. 
+ # " on line ".$config->{line_number} + # ) unless defined($config->{quiet}); + # return 0; + #} + + # flag as unbalanced + foreach my $allele(@alleles) { + $allele =~ s/\-//g; + $vf->{indel} = 1 unless length($allele) == length($tmp_ref_allele); + } + + # check reference allele if requested + if(defined $config->{check_ref}) { + my $ok = 0; + my $slice_ref_allele; + + # insertion, therefore no ref allele to check + if($ref_allele eq '-') { + $ok = 1; + } + else { + my $slice_ref = $vf->{slice}->sub_Slice($vf->{start}, $vf->{end}, $vf->{strand}); + + if(!defined($slice_ref)) { + warn "WARNING: Could not fetch sub-slice from ".$vf->{start}."\-".$vf->{end}."\(".$vf->{strand}."\) on line ".$config->{line_number} unless defined $config->{quiet}; + } + + else { + $slice_ref_allele = $slice_ref->seq; + $ok = ($slice_ref_allele eq $ref_allele ? 1 : 0); + } + } + + if(!$ok) { + warn + "WARNING: Specified reference allele $ref_allele ", + "does not match Ensembl reference allele", + ($slice_ref_allele ? 
" $slice_ref_allele" : ""),
                " on line ".$config->{line_number} unless defined $config->{quiet};
            return 0;
        }
    }

    return 1;
}


# validate a structural variation (currently a no-op placeholder)
sub validate_svf {
    my $config = shift;
    my $svf = shift;

    return 1;
}


# takes a hash of VFs and fetches consequences by pre-fetching overlapping transcripts
# from database and/or cache; returns a sorted arrayref of finished VFs for $chr
sub whole_genome_fetch {
    my $config = shift;
    my $chr = shift;
    my $vf_hash = shift;

    my (%vf_done, @finished_vfs, %seen_rfs);

    if(defined($config->{offline}) && !-e $config->{dir}.'/'.$chr) {
        debug("No cache found for chromosome $chr") unless defined($config->{quiet});

        foreach my $chunk(keys %{$vf_hash->{$chr}}) {
            foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
                push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}};
            }
        }

        return \@finished_vfs;
    }

    my $slice_cache = $config->{slice_cache};
    build_slice_cache($config, $config->{tr_cache}) unless defined($slice_cache->{$chr});

    debug("Analyzing chromosome $chr") unless defined($config->{quiet});

    # custom annotations
    whole_genome_fetch_custom($config, $vf_hash, $chr) if defined($config->{custom});

    # split up normal variations from SVs
    my ($tmp_vf_hash, @svfs);

    foreach my $chunk(keys %{$vf_hash->{$chr}}) {
        foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
            foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) {
                if($vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) {
                    push @svfs, $vf;
                }
                else {
                    push @{$tmp_vf_hash->{$chr}{$chunk}{$pos}}, $vf;
                }
            }
        }
    }

    $vf_hash = $tmp_vf_hash;

    # transcript annotations
    whole_genome_fetch_transcript($config, $vf_hash, $chr)
        unless defined($config->{no_consequences});

    # regulatory annotations
    whole_genome_fetch_reg($config, $vf_hash, $chr)
        if defined($config->{regulatory})
        && !defined($config->{no_consequences});

    # structural variations
    @finished_vfs = @{whole_genome_fetch_sv($config, \@svfs, $chr)}
        if scalar @svfs;

    # sort results into @finished_vfs array
    foreach my $chunk(keys %{$vf_hash->{$chr}}) {
        foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {

            # pinch slice from slice cache if we don't already have it
            $_->{slice} ||= $slice_cache->{$chr} for @{$vf_hash->{$chr}{$chunk}{$pos}};

            if(defined($config->{regulatory})) {
                foreach my $type(@REG_FEAT_TYPES) {
                    $_->{regulation_variations}->{$type} ||= [] for @{$vf_hash->{$chr}{$chunk}{$pos}};
                }
            }

            if(defined($config->{custom})) {
                $_->{custom} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}};
            }

            $_->{transcript_variations} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}};

            # add to final array
            push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}};
        }
    }

    # sort
    @finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @finished_vfs;

    # clean hash
    delete $vf_hash->{$chr};

    return \@finished_vfs;
}

# fetch and attach custom annotations for every VF on $chr
sub whole_genome_fetch_custom {
    my $config = shift;
    my $vf_hash = shift;
    my $chr = shift;

    return unless scalar @{$config->{custom}};

    # create regions based on VFs instead of chunks
    my $tmp_regions;

    foreach my $chunk(keys %{$vf_hash->{$chr}}) {
        foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
            foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) {
                push @{$tmp_regions->{$chr}}, ($vf->{start}-1).'-'.($vf->{end}+1);
            }
        }
    }

    return unless defined($tmp_regions->{$chr});

    # cache annotations
    my $annotation_cache = cache_custom_annotation($config, $tmp_regions, $chr);

    # count and report
    my $total_annotations = 0;
    $total_annotations += scalar keys %{$annotation_cache->{$chr}->{$_}} for keys %{$annotation_cache->{$chr}};
    debug("Retrieved $total_annotations custom annotations (", (join ", ", map {(scalar keys %{$annotation_cache->{$chr}->{$_}}).' '.$_} keys %{$annotation_cache->{$chr}}), ")");

    # compare annotations to variations in hash
    debug("Analyzing custom annotations") unless defined($config->{quiet});
    my $total = scalar keys %{$vf_hash->{$chr}};
    my $i = 0;

    foreach my $chunk(keys %{$vf_hash->{$chr}}) {
        progress($config, $i++, $total);

        foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
            foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) {
                $vf->{custom} = get_custom_annotation($config, $vf, $annotation_cache);
            }
        }
    }

    end_progress($config);
}

# build TranscriptVariations for every VF overlapping a cached transcript on $chr
sub whole_genome_fetch_transcript {
    my $config = shift;
    my $vf_hash = shift;
    my $chr = shift;

    my $tr_cache = $config->{tr_cache};
    my $slice_cache = $config->{slice_cache};

    my $up_down_size = MAX_DISTANCE_FROM_TRANSCRIPT;

    # check we have defined regions
    return unless defined($vf_hash->{$chr}) && defined($tr_cache->{$chr});

    # copy slice from transcript to slice cache
    $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr});

    debug("Analyzing variants") unless defined($config->{quiet});

    my $tr_counter = 0;
    my $tr_count   = scalar @{$tr_cache->{$chr}};

    while($tr_counter < $tr_count) {

        progress($config, $tr_counter, $tr_count);
        print PARENT "BUMP\n" if defined($config->{forked});

        my $tr = $tr_cache->{$chr}->[$tr_counter++];

        # do each overlapping VF
        my $s = $tr->start - $up_down_size;
        my $e = $tr->end + $up_down_size;

        # get the chunks this transcript overlaps
        my %chunks;
        $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size}));
        delete $chunks{$_} for grep {!defined($vf_hash->{$chr}{$_})} keys %chunks;

        # pointer to previous VF
        # used to tell plugins this is the last variant analysed in this transcript
        my $previous_vf;

        foreach my $chunk(keys %chunks) {
            foreach my $vf(
                grep {$_->{start} <= $e && $_->{end} >= $s}
                map {@{$vf_hash->{$chr}{$chunk}{$_}}}
                keys %{$vf_hash->{$chr}{$chunk}}
            ) {
                #
# pinch slice from slice cache if we don't already have it
                $vf->{slice} ||= $slice_cache->{$chr};

                my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new(
                    -transcript        => $tr,
                    -variation_feature => $vf,
                    -adaptor           => $config->{tva},
                    -no_ref_check      => 1,
                    -no_transfer       => 1
                );

                # prefetching stuff here prevents doing loads at the
                # end and makes progress reporting more useful
                $tv->_prefetch_for_vep;

                $vf->add_TranscriptVariation($tv);

                # cache VF on the transcript if it is an unbalanced sub
                push @{$tr->{indels}}, $vf if defined($vf->{indel});

                if(defined($config->{individual})) {

                    # store VF on transcript, weaken reference to avoid circularity
                    push @{$tr->{vfs}}, $vf;
                    weaken($tr->{vfs}->[-1]);

                    delete $previous_vf->{last_in_transcript}->{$tr->stable_id};
                    $vf->{last_in_transcript}->{$tr->stable_id} = 1;
                }

                $previous_vf = $vf;
            }
        }
    }

    end_progress($config);
}

# build RegulatoryFeature/MotifFeature variations for every VF overlapping a
# cached regulatory feature on $chr
sub whole_genome_fetch_reg {
    my $config = shift;
    my $vf_hash = shift;
    my $chr = shift;

    my $rf_cache = $config->{rf_cache};
    my $slice_cache = $config->{slice_cache};

    foreach my $type(keys %{$rf_cache->{$chr}}) {
        debug("Analyzing ".$type."s") unless defined($config->{quiet});

        my $constructor = 'Bio::EnsEMBL::Variation::'.$type.'Variation';

        my $rf_counter = 0;
        my $rf_count = scalar @{$rf_cache->{$chr}->{$type}};

        while($rf_counter < $rf_count) {

            progress($config, $rf_counter, $rf_count);
            print PARENT "BUMP\n" if defined($config->{forked});

            my $rf = $rf_cache->{$chr}->{$type}->[$rf_counter++];

            # do each overlapping VF
            my $s = $rf->{start};
            my $e = $rf->{end};

            # get the chunks this feature overlaps
            my %chunks;
            $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size}));
            delete $chunks{$_} for grep {!defined($vf_hash->{$chr}{$_})} keys %chunks;

            foreach my $chunk(keys %chunks) {
                foreach my $vf(
                    grep {$_->{start} <= $e && $_->{end} >= $s}
                    map {@{$vf_hash->{$chr}{$chunk}{$_}}}
                    keys %{$vf_hash->{$chr}{$chunk}}
                ) {
                    push @{$vf->{regulation_variations}->{$type}}, $constructor->new(
                        -variation_feature => $vf,
                        -feature           => $rf,
                        -no_ref_check      => 1,
                        -no_transfer       => 1
                    );
                }
            }
        }

        end_progress($config);
    }
}

# fetch consequences for structural variation features on $chr
sub whole_genome_fetch_sv {
    my $config = shift;
    my $svfs = shift;
    my $chr = shift;

    my $tr_cache = $config->{tr_cache};
    my $rf_cache = $config->{rf_cache};
    my $slice_cache = $config->{slice_cache};

    debug("Analyzing structural variations") unless defined($config->{quiet});

    my($i, $total) = (0, scalar @$svfs);

    my @finished_vfs;

    foreach my $svf(@$svfs) {
        progress($config, $i++, $total);

        my %done_genes = ();

        if(defined($tr_cache->{$chr})) {
            foreach my $tr(grep {overlap($_->{start} - MAX_DISTANCE_FROM_TRANSCRIPT, $_->{end} + MAX_DISTANCE_FROM_TRANSCRIPT, $svf->{start}, $svf->{end})} @{$tr_cache->{$chr}}) {
                my $svo = Bio::EnsEMBL::Variation::TranscriptStructuralVariation->new(
                    -transcript                   => $tr,
                    -structural_variation_feature => $svf,
                    -no_transfer                  => 1
                );

                $svf->add_TranscriptStructuralVariation($svo);
            }
        }

        $svf->{transcript_structural_variations} ||= {};

        # do regulatory features
        if(defined($config->{regulatory}) && defined($rf_cache->{$chr})) {
            foreach my $rf_type(qw/RegulatoryFeature/) { #keys %{$rf_cache->{$chr}}) {
                # NOTE(review): this condition requires the feature to span the
                # SVF's end rather than using a standard overlap test
                # (end >= $svf->{start}) — confirm this is intentional
                foreach my $rf(grep {$_->{start} <= $svf->{end} && $_->end >= $svf->{end}} @{$rf_cache->{$chr}->{$rf_type}}) {
                    my $svo = Bio::EnsEMBL::Variation::StructuralVariationOverlap->new(
                        -feature                      => $rf,
                        -structural_variation_feature => $svf,
                        -no_transfer                  => 1
                    );

                    push @{$svf->{regulation_structural_variations}->{$rf_type}}, $svo;
                }

                $svf->{regulation_structural_variations}->{$rf_type} ||= [];
            }
        }

        # sort them
        #$svf->_sort_svos;
        push @finished_vfs, $svf;
    }

    end_progress($config);

    return \@finished_vfs;
}

# retrieves transcripts given region list
sub fetch_transcripts {
my $config = shift;
    my $regions = shift;
    my $trim_regions = shift;

    my $tr_cache = $config->{tr_cache};
    my $slice_cache = $config->{slice_cache};

    my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0);

    my %seen_trs;

    $count_from_mem = 0;
    my $region_count = 0;
    foreach my $chr(keys %{$regions}) {
        $count_from_mem += scalar @{$tr_cache->{$chr}} if defined($tr_cache->{$chr}) && ref($tr_cache->{$chr}) eq 'ARRAY';
        $region_count += scalar @{$regions->{$chr}};
    }

    # initialise to 0 (matches fetch_regfeats; avoids an uninitialised-value
    # warning on the first progress() call)
    my $counter = 0;

    debug("Reading transcript data from cache and/or database") unless defined($config->{quiet});

    foreach my $chr(keys %{$regions}) {
        # sort regions by start coordinate; both operands must use field [0] —
        # comparing start-of-$a with end-of-$b gives an inconsistent comparator
        foreach my $region(sort {(split /\-/, $a)[0] <=> (split /\-/, $b)[0]} @{$regions->{$chr}}) {
            progress($config, $counter++, $region_count);

            # skip regions beyond the end of the chr
            next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length;

            next if defined($config->{loaded_tr}->{$chr}->{$region});

            # force quiet so other methods don't mess up the progress bar
            my $quiet = $config->{quiet};
            $config->{quiet} = 1;

            # try and load cache from disk if using cache
            my $tmp_cache;
            if(defined($config->{cache})) {
                #$tmp_cache = (
                #    defined($config->{tabix}) ?
                #    load_dumped_transcript_cache_tabix($config, $chr, $region, $trim_regions) :
                #    load_dumped_transcript_cache($config, $chr, $region)
                #);
                $tmp_cache = load_dumped_transcript_cache($config, $chr, $region);
                $count_from_cache += scalar @{$tmp_cache->{$chr}} if defined($tmp_cache->{$chr});
                $config->{loaded_tr}->{$chr}->{$region} = 1;
            }

            # no cache found on disk or not using cache
            if(!defined($tmp_cache->{$chr})) {

                if(defined($config->{offline})) {
                    # restore quiet status
                    $config->{quiet} = $quiet;

                    debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet});
                    next;
                }

                # spoof temporary region hash
                my $tmp_hash;
                push @{$tmp_hash->{$chr}}, $region;

                $tmp_cache = cache_transcripts($config, $tmp_hash);

                # make it an empty arrayref that gets cached
                # so we don't get confused and reload next time round
                $tmp_cache->{$chr} ||= [];

                $count_from_db += scalar @{$tmp_cache->{$chr}};

                # dump to disk if writing to cache
                (defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $region) : dump_transcript_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache});

                $config->{loaded_tr}->{$chr}->{$region} = 1;
            }

            # add loaded transcripts to main cache
            if(defined($tmp_cache->{$chr})) {
                while(my $tr = shift @{$tmp_cache->{$chr}}) {

                    # track already added transcripts by dbID
                    my $dbID = $tr->dbID;
                    if($seen_trs{$dbID}) {
                        $count_duplicates++;
                        next;
                    }

                    # trim out?
+ #if(defined($trim_regions) && defined($trim_regions->{$chr})) { + # my $tmp_count = scalar grep { + # overlap( + # (split /\-/, $_)[0], (split /\-/, $_)[1], + # $tr->{start}, $tr->{end} + # ) + # } @{$trim_regions->{$chr}}; + # + # if(!$tmp_count) { + # $count_trimmed++; + # next; + # } + #} + + $seen_trs{$dbID} = 1; + + push @{$tr_cache->{$chr}}, $tr; + } + } + + $tr_cache->{$chr} ||= []; + + undef $tmp_cache; + + # restore quiet status + $config->{quiet} = $quiet; + + # build slice cache + $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr}); + } + } + + end_progress($config); + + my $tr_count = 0; + $tr_count += scalar @{$tr_cache->{$_}} for keys %$tr_cache; + + debug("Retrieved $tr_count transcripts ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet}); + + return $tr_count; +} + +sub fetch_regfeats { + my $config = shift; + my $regions = shift; + my $trim_regions = shift; + + my $rf_cache = $config->{rf_cache}; + my $slice_cache = $config->{slice_cache}; + + my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0); + + my %seen_rfs; + + $count_from_mem = 0; + my $region_count = 0; + + foreach my $chr(keys %$regions) { + if(defined($rf_cache->{$chr}) && ref($rf_cache->{$chr}) eq 'HASH') { + $count_from_mem += scalar @{$rf_cache->{$chr}->{$_}} for keys %{$rf_cache->{$chr}}; + } + $region_count += scalar @{$regions->{$chr}}; + } + + my $counter = 0; + + debug("Reading regulatory data from cache and/or database") unless defined($config->{quiet}); + + foreach my $chr(keys %$regions) { + foreach my $region(sort {(split /\-/, $a)[0] cmp (split /\-/, $b)[1]} @{$regions->{$chr}}) { + progress($config, $counter++, $region_count); + + next if defined($config->{loaded_rf}->{$chr}->{$region}); + + # skip regions beyond the end of the chr + next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > 
$slice_cache->{$chr}->length;

            # force quiet so other methods don't mess up the progress bar
            my $quiet = $config->{quiet};
            $config->{quiet} = 1;

            # try and load cache from disk if using cache
            my $tmp_cache;
            if(defined($config->{cache})) {
                $tmp_cache = load_dumped_reg_feat_cache($config, $chr, $region);

                # tabix-backed loading is currently disabled
                #$tmp_cache =
                #    defined($config->{tabix}) ?
                #    load_dumped_reg_feat_cache_tabix($config, $chr, $region, $trim_regions) :
                #    load_dumped_reg_feat_cache($config, $chr, $region);

                if(defined($tmp_cache->{$chr})) {
                    $count_from_cache += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}};
                }

                # flag as loaded
                $config->{loaded_rf}->{$chr}->{$region} = 1;
            }

            # no cache found on disk or not using cache
            if(!defined($tmp_cache->{$chr})) {

                # in offline mode there is no DB to fall back to; warn and skip
                if(defined($config->{offline})) {

                    # restore quiet status
                    $config->{quiet} = $quiet;

                    debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet});
                    next;
                }

                # spoof temporary region hash
                my $tmp_hash;
                push @{$tmp_hash->{$chr}}, $region;

                # fetch this single region's regfeats from the database
                $tmp_cache = cache_reg_feats($config, $tmp_hash);

                # make it an empty arrayref that gets cached
                # so we don't get confused and reload next time round
                $tmp_cache->{$chr} ||= {};

                $count_from_db += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}};

                # dump to disk if writing to cache
                #dump_reg_feat_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache});
                (defined($config->{tabix}) ?
dump_reg_feat_cache_tabix($config, $tmp_cache, $chr, $region) : dump_reg_feat_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache});

                # restore deleted coord_system adaptor
                foreach my $type(keys %{$tmp_cache->{$chr}}) {
                    $_->{slice}->{coord_system}->{adaptor} = $config->{csa} for @{$tmp_cache->{$chr}->{$type}};
                }

                # flag as loaded
                $config->{loaded_rf}->{$chr}->{$region} = 1;
            }

            # add loaded reg_feats to main cache
            if(defined($tmp_cache->{$chr})) {
                foreach my $type(keys %{$tmp_cache->{$chr}}) {
                    # shift empties $tmp_cache as we go, freeing memory
                    while(my $rf = shift @{$tmp_cache->{$chr}->{$type}}) {

                        # filter on cell type
                        if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
                            next unless grep {$rf->{cell_types}->{$_}} @{$config->{cell_type}};
                        }

                        # trim out? (currently disabled)
                        #if(defined($trim_regions) && defined($trim_regions->{$chr})) {
                        #    my $tmp_count = scalar grep {
                        #        overlap(
                        #            (split /\-/, $_)[0], (split /\-/, $_)[1],
                        #            $rf->{start}, $rf->{end}
                        #        )
                        #    } @{$trim_regions->{$chr}};
                        #
                        #    if(!$tmp_count) {
                        #        $count_trimmed++;
                        #        next;
                        #    }
                        #}

                        # track already added reg_feats by dbID
                        my $dbID = $rf->{dbID};
                        if($seen_rfs{$dbID}) {
                            $count_duplicates++;
                            next;
                        }
                        $seen_rfs{$dbID} = 1;

                        push @{$rf_cache->{$chr}->{$type}}, $rf;
                    }
                }
            }

            undef $tmp_cache;

            # restore quiet status
            $config->{quiet} = $quiet;
        }
    }

    end_progress($config);

    # total up everything now in the main cache for the summary line
    my $rf_count = 0;

    foreach my $chr(keys %$rf_cache) {
        foreach my $type(keys %{$rf_cache->{$chr}}) {
            $rf_count += scalar @{$rf_cache->{$chr}->{$type}};
        }
    }

    debug("Retrieved $rf_count regulatory features ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet});

    return $rf_count;
}

# gets existing VFs for a vf_hash
# Annotates each VariationFeature in $vf_hash with {existing} (comma-joined
# names of known co-located variants, or '-') and {gmaf} (allele:frequency
# pairs, or undef), using the variation cache and/or the database.
sub check_existing_hash {
    my $config = shift;
    my $vf_hash = shift;
    my $variation_cache;

    # we only care about non-SVs here
    my %new_hash;

    foreach my $chr(keys
%{$vf_hash}) {
        foreach my $chunk(keys %{$vf_hash->{$chr}}) {
            foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
                # keep only true VariationFeatures (drop structural variants)
                foreach my $var(grep {$_->isa('Bio::EnsEMBL::Variation::VariationFeature')} @{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
                    push @{$new_hash{$chr}->{$chunk}->{$pos}}, $var;
                }
            }
        }
    }

    $vf_hash = \%new_hash;

    debug("Checking for existing variations") unless defined($config->{quiet});

    my ($chunk_count, $counter);
    $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash};

    foreach my $chr(keys %{$vf_hash}) {

        # regions already pulled into $variation_cache for this chr
        my %loaded_regions;

        foreach my $chunk(keys %{$vf_hash->{$chr}}) {
            progress($config, $counter++, $chunk_count);

            # get the VFs for this chunk
            my ($start, $end);

            # work out start and end using chunk_size
            $start = $config->{chunk_size} * $chunk;
            $end = $config->{chunk_size} * ($chunk + 1);

            # using cache?
            if(defined($config->{cache})) {
                my $tmp_regions;
                push @{$tmp_regions->{$chr}}, $start.'-'.$end;

                # map the chunk onto cache_region_size-aligned regions
                my $converted_regions = convert_regions($config, $tmp_regions);

                foreach my $region(@{$converted_regions->{$chr}}) {

                    unless($loaded_regions{$region}) {
                        my $tmp_cache = load_dumped_variation_cache($config, $chr, $region);

                        # load from DB if not found in cache
                        if(!defined($tmp_cache->{$chr})) {
                            # offline mode has no DB fallback
                            if(defined($config->{offline})) {
                                debug("WARNING: Could not find variation cache for $chr\:$region") unless defined($config->{quiet});
                                next;
                            }

                            $tmp_cache->{$chr} = get_variations_in_region($config, $chr, $region);
                            dump_variation_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache});
                        }

                        # merge tmp_cache with the main cache
                        foreach my $key(keys %{$tmp_cache->{$chr}}) {
                            $variation_cache->{$chr}->{$key} = $tmp_cache->{$chr}->{$key};
                            delete $tmp_cache->{$chr}->{$key};
                        }

                        # clear memory
                        undef $tmp_cache;

                        # record this region as fetched
                        $loaded_regions{$region} = 1;
                    }
                }
            }

            # no cache, get all variations in region from DB
            else
{

                my ($min, $max);

                # we can fetch smaller region when using DB:
                # span only the min..max extent of the variants in this chunk
                foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
                    foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
                        foreach my $coord(qw(start end)) {
                            $min = $var->{$coord} if !defined($min) || $var->{$coord} < $min;
                            $max = $var->{$coord} if !defined($max) || $var->{$coord} > $max;
                        }
                    }
                }

                $variation_cache->{$chr} = get_variations_in_region($config, $chr, $min.'-'.$max);
            }

            # now compare retrieved vars with vf_hash
            foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
                foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
                    my @found;
                    my @gmafs;

                    if(defined($variation_cache->{$chr})) {
                        if(my $existing_vars = $variation_cache->{$chr}->{$pos}) {
                            # element [1] is the failed flag; keep vars at or
                            # below the configured failed threshold
                            foreach my $existing_var(grep {$_->[1] <= $config->{failed}} @$existing_vars) {
                                unless(is_var_novel($config, $existing_var, $var)) {
                                    push @found, $existing_var->[0];
                                    # elements [6]/[7] look like minor allele and
                                    # its frequency - TODO confirm against cache format
                                    push @gmafs, $existing_var->[6].":".$existing_var->[7] if defined($existing_var->[6]) && defined($existing_var->[7]);
                                }
                            }
                        }
                    }

                    $var->{existing} = join ",", @found;
                    $var->{existing} ||= '-';

                    $var->{gmaf} = join ",", @gmafs;
                    $var->{gmaf} ||= undef;
                }
            }
        }

        # free this chromosome's variation cache before moving on
        delete $variation_cache->{$chr};
    }

    end_progress($config);
}

# gets overlapping SVs for a vf_hash
# Sets $var->{overlapping_svs} to a comma-joined list of structural variation
# names whose span overlaps each variant (only when a slice adaptor exists).
sub check_svs_hash {
    my $config = shift;
    my $vf_hash = shift;

    debug("Checking for overlapping structural variations") unless defined($config->{quiet});

    my ($chunk_count, $counter);
    $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash};

    foreach my $chr(keys %{$vf_hash}) {
        foreach my $chunk(keys %{$vf_hash->{$chr}}) {

            progress($config, $counter++, $chunk_count);

            # work out start and end using chunk_size
            my ($start, $end);
            $start = $config->{chunk_size} * $chunk;
            $end = $config->{chunk_size} * ($chunk + 1);

            # check for structural variations
            if(defined($config->{sa})) {
                my $slice =
$config->{sa}->fetch_by_region('chromosome', $chr, $start, $end);

                if(defined($slice)) {
                    my $svs = $config->{svfa}->fetch_all_by_Slice($slice);

                    foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
                        foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
                            # names of SVs whose seq-region span overlaps this variant
                            my $string = join ",",
                                map {$_->variation_name}
                                grep {$_->seq_region_start <= $var->end && $_->seq_region_end >= $var->start}
                                @$svs;

                            $var->{overlapping_svs} = $string if $string;
                        }
                    }
                }
            }
        }
    }

    end_progress($config);
}

# gets a slice from the slice adaptor
# Tries a 'chromosome' coord-system fetch first, then falls back to any
# seq region. Returns undef if no adaptor/chr is available or both fail.
sub get_slice {
    my $config = shift;
    my $chr = shift;
    my $otherfeatures = shift;
    $otherfeatures ||= '';

    return undef unless defined($config->{sa}) && defined($chr);

    my $slice;

    # first try to get a chromosome
    eval { $slice = $config->{$otherfeatures.'sa'}->fetch_by_region('chromosome', $chr); };

    # if failed, try to get any seq region
    # FIX: wrap the fallback in eval as well so a failed lookup returns
    # undef (per this sub's contract) instead of propagating a die
    if(!defined($slice)) {
        eval { $slice = $config->{$otherfeatures.'sa'}->fetch_by_region(undef, $chr); };
    }

    return $slice;
}




# METHODS THAT DEAL WITH "REGIONS"
##################################

# gets regions from VF hash
# Returns a hashref of chr => [ "start-end", ... ] covering all variants,
# either aligned to cache_region_size (cache mode) or minimal merged spans.
sub regions_from_hash {
    my $config = shift;
    my $vf_hash = shift;

    my %include_regions;

    # if using cache we just want the regions of cache_region_size
    # since that's what we'll get from the cache (or DB if no cache found)
    if(defined($config->{cache})) {

        my $region_size = $config->{cache_region_size};

        foreach my $chr(keys %$vf_hash) {
            $include_regions{$chr} = [];
            my %temp_regions;

            foreach my $chunk(keys %{$vf_hash->{$chr}}) {
                foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
                    # pad by the max transcript distance so nearby features are caught
                    my ($s, $e) = ($pos - MAX_DISTANCE_FROM_TRANSCRIPT, $pos + MAX_DISTANCE_FROM_TRANSCRIPT);

                    my $low = int ($s / $region_size);
                    my $high = int ($e / $region_size) + 1;

                    # mark every region-size bucket the padded span touches
                    for my $i($low..($high - 1)) {
                        $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1;
                    }
                }
            }

            @{$include_regions{$chr}} = keys
%temp_regions;
        }
    }

    # if no cache we don't want to fetch more than is necessary, so find the
    # minimum covered region of the variations in the hash
    else {
        foreach my $chr(keys %$vf_hash) {
            $include_regions{$chr} = [];

            foreach my $chunk(keys %{$vf_hash->{$chr}}) {
                foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
                    add_region($_->start, $_->end, $include_regions{$chr}) for @{$vf_hash->{$chr}{$chunk}{$pos}};
                }
            }
        }

        # merge regions
        merge_regions(\%include_regions, $config);
    }

    return \%include_regions;
}

# adds a region to region list, expanding existing one if overlaps
# Regions are "start-end" strings; new spans are padded by
# MAX_DISTANCE_FROM_TRANSCRIPT on both sides and clamped to >= 1.
sub add_region {
    my $start = shift;
    my $end = shift;
    my $region_list = shift;

    # fix end for insertions
    $end = $start if $end < $start;

    my $added = 0;
    my $i = 0;

    while ($i < scalar @$region_list) {
        my ($region_start, $region_end) = split /\-/, $region_list->[$i];

        # overlap with an existing region: widen it in place
        if($start <= $region_end && $end >= $region_start) {
            my $new_region_start = ($start < $end ? $start : $end) - MAX_DISTANCE_FROM_TRANSCRIPT;
            my $new_region_end = ($start > $end ?
$start : $end) + MAX_DISTANCE_FROM_TRANSCRIPT;

            $new_region_start = 1 if $new_region_start < 1;

            $region_start = $new_region_start if $new_region_start < $region_start;
            $region_end = $new_region_end if $new_region_end > $region_end;

            $region_list->[$i] = $region_start.'-'.$region_end;
            $added = 1;
        }

        $i++;
    }

    # no overlap found: append a new padded region
    unless($added) {
        my $s = $start - MAX_DISTANCE_FROM_TRANSCRIPT;
        $s = 1 if $s < 1;

        push @{$region_list}, $s.'-'.($end + MAX_DISTANCE_FROM_TRANSCRIPT);
    }
}

# merges overlapping regions from scans
# In-place merge of "start-end" strings per chromosome; $consecutive extends
# the overlap test so regions within that many bases are also merged.
# Also accumulates $config->{region_count}.
sub merge_regions {
    my $include_regions = shift;
    my $config = shift;
    my $consecutive = shift;
    $consecutive ||= 0;

    # now merge overlapping regions
    foreach my $chr(keys %$include_regions) {
        my $max_index = $#{$include_regions->{$chr}};
        my (@new_regions, %skip);

        # O(n^2) pairwise merge; region lists are expected to be small
        for my $i(0..$max_index) {
            next if $skip{$i};
            my ($s, $e) = split /\-/, $include_regions->{$chr}[$i];

            for my $j(($i+1)..$max_index) {
                next if $skip{$j};
                my ($ns, $ne) = split /\-/, $include_regions->{$chr}[$j];

                if($s <= ($ne + $consecutive) && $e >= ($ns - $consecutive)) {
                    $s = $ns if $ns < $s;
                    $e = $ne if $ne > $e;

                    # absorbed into region $i; don't emit $j separately
                    $skip{$j} = 1;
                }
            }

            push @new_regions, $s.'-'.$e;
        }

        # replace original
        $include_regions->{$chr} = \@new_regions;

        $config->{region_count} += scalar @new_regions;
    }

    return $include_regions;
}

# converts regions as determined by scan_file to regions loadable from cache
# Expands each arbitrary "start-end" span into the set of
# cache_region_size-aligned buckets it touches. Returns undef for undef input.
sub convert_regions {
    my $config = shift;
    my $regions = shift;

    return undef unless defined $regions;

    my $region_size = $config->{cache_region_size};

    my %new_regions;

    foreach my $chr(keys %$regions) {
        # use a hash to de-duplicate buckets shared by several input spans
        my %temp_regions;

        foreach my $region(@{$regions->{$chr}}) {
            my ($s, $e) = split /\-/, $region;

            my $low = int ($s / $region_size);
            my $high = int ($e / $region_size) + 1;

            for my $i($low..($high - 1)) {
                $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1;
            }
        }

        @{$new_regions{$chr}} = keys %temp_regions;
    }

    return \%new_regions;
}





# CACHE METHODS
###############

# prunes a cache to get rid of features not in regions in use
# Works for both the transcript cache (chr => arrayref) and the regfeat
# cache (chr => { type => arrayref }); also drops stale entries from the
# "loaded regions" bookkeeping hash. Returns the new total feature count.
sub prune_cache {
    my $config = shift;
    my $cache = shift;
    my $regions = shift;
    my $loaded = shift;

    # delete no longer in use chroms
    foreach my $chr(keys %$cache) {
        delete $cache->{$chr} unless defined $regions->{$chr} && scalar @{$regions->{$chr}};
    }

    my $new_count = 0;

    foreach my $chr(keys %$cache) {

        # get total area spanned by regions
        my ($min, $max);
        foreach my $region(@{$regions->{$chr}}) {
            my ($s, $e) = split /\-/, $region;
            $min = $s if !defined($min) or $s < $min;
            $max = $e if !defined($max) or $e > $max;
        }

        # transcript cache
        if(ref($cache->{$chr}) eq 'ARRAY') {
            $cache->{$chr} = prune_min_max($cache->{$chr}, $min, $max);
            $new_count += scalar @{$cache->{$chr}};
        }
        # regfeat cache
        elsif(ref($cache->{$chr}) eq 'HASH') {
            for(keys %{$cache->{$chr}}) {
                $cache->{$chr}->{$_} = prune_min_max($cache->{$chr}->{$_}, $min, $max);
                $new_count += scalar @{$cache->{$chr}->{$_}};
            }
        }

        # update loaded regions
        my %have_regions = map {$_ => 1} @{$regions->{$chr}};

        foreach my $region(keys %{$loaded->{$chr}}) {
            delete $loaded->{$chr}->{$region} unless defined $have_regions{$region};
        }
    }

    return $new_count;
}

# does the actual pruning
# Keeps features overlapping the min..max window; for dropped transcripts the
# translation back-reference is broken to avoid keeping them alive.
sub prune_min_max {
    my $array = shift;
    my $min = shift;
    my $max = shift;

    # splice out features not in area spanned by min/max
    my $i = 0;
    my $f_count = scalar @{$array};
    my @new_cache;

    while($i < $f_count) {
        my $f = $array->[$i];

        $i++;

        # strict ">" means features merely touching the boundary are dropped
        if($max - $f->start() > 0 && $f->end - $min > 0) {
            push @new_cache, $f;
        }

        # do some cleaning for transcripts
        elsif(defined $f->{translation}) {
            delete $f->{translation}->{transcript};
            delete $f->{translation};
        }
    }

    undef $array;
    return \@new_cache;
}

# get transcripts for slices
sub
cache_transcripts {
    # Fetches genes/transcripts from the database for each requested region
    # and builds a chr => [transcripts] cache, optionally prefetching
    # per-transcript data and stripping objects for writing to disk.
    my $config = shift;
    my $include_regions = shift;

    my $tr_cache;
    my $i;

    debug("Caching transcripts") unless defined($config->{quiet});

    foreach my $chr(keys %$include_regions) {

        my $slice = get_slice($config, $chr);

        next unless defined $slice;

        # prefetch some things
        $slice->is_circular;

        # trim bumf off the slice
        delete $slice->{coord_system}->{adaptor} if defined($config->{write_cache});

        # no regions? cover the whole chromosome in cache_region_size steps
        if(!scalar @{$include_regions->{$chr}}) {
            my $start = 1;
            my $end = $config->{cache_region_size};

            while($start < $slice->end) {
                push @{$include_regions->{$chr}}, $start.'-'.$end;
                $start += $config->{cache_region_size};
                $end += $config->{cache_region_size};
            }
        }

        my $region_count;

        if(scalar keys %$include_regions == 1) {
            my ($chr) = keys %$include_regions;
            $region_count = scalar @{$include_regions->{$chr}};
            debug("Caching transcripts for chromosome $chr") unless defined($config->{quiet});
        }

        foreach my $region(@{$include_regions->{$chr}}) {
            progress($config, $i++, $region_count || $config->{region_count});

            my ($s, $e) = split /\-/, $region;

            # sanity check start and end
            $s = 1 if $s < 1;
            $e = $slice->end if $e > $slice->end;

            # get sub-slice
            my $sub_slice = $slice->sub_Slice($s, $e);

            # for some reason unless seq is called here the sequence becomes Ns later
            # NOTE(review): seq is called before the defined($sub_slice) check
            # below - would die if sub_Slice returned undef; confirm intent
            $sub_slice->seq;

            # add transcripts to the cache, via a transfer to the chrom's slice
            if(defined($sub_slice)) {
                foreach my $gene(map {$_->transfer($slice)} @{$sub_slice->get_all_Genes(undef, undef, 1)}) {
                    my $gene_stable_id = $gene->stable_id;
                    my $canonical_tr_id = $gene->{canonical_transcript_id};

                    my @trs;

                    foreach my $tr(@{$gene->get_all_Transcripts}) {
                        # tag each transcript with its parent gene
                        $tr->{_gene_stable_id} = $gene_stable_id;
                        $tr->{_gene} = $gene;

                        # indicate if canonical
                        $tr->{is_canonical} = 1 if defined $canonical_tr_id and $tr->dbID eq $canonical_tr_id;

                        if(defined($config->{prefetch})) {
prefetch_transcript_data($config, $tr);
                        }

                        # CCDS
                        elsif(defined($config->{ccds})) {
                            my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries};
                            $tr->{_ccds} = $entries[0]->display_id if scalar @entries;
                        }

                        # strip some unnecessary data from the transcript object
                        clean_transcript($tr) if defined($config->{write_cache});

                        push @trs, $tr;
                    }

                    # sort the transcripts by translation so we can share sift/polyphen stuff
                    # between transcripts and save cache space
                    if(defined($config->{write_cache}) && (defined($config->{sift}) || defined($config->{polyphen}))) {

                        my $prev_tr;

                        # sort them by peptide sequence as transcripts with identical peptides
                        # will have identical SIFT/PolyPhen prediction strings
                        foreach my $tr(sort {$a->{_variation_effect_feature_cache}->{peptide} cmp $b->{_variation_effect_feature_cache}->{peptide}} grep {$_->{_variation_effect_feature_cache}->{peptide}} @trs) {

                            if(
                                defined($prev_tr) &&
                                $prev_tr->{_variation_effect_feature_cache}->{peptide}
                                    eq $tr->{_variation_effect_feature_cache}->{peptide}
                            ) {

                                # share the previous transcript's predictions
                                foreach my $analysis(qw(sift polyphen)) {
                                    next unless defined($config->{$analysis});
                                    $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis} = $prev_tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis};
                                }
                            }

                            $prev_tr = $tr;
                        }
                    }

                    # clean the gene
                    clean_gene($gene);

                    push @{$tr_cache->{$chr}}, @trs;
                }
            }
        }
    }

    end_progress($config);

    return $tr_cache;
}

# gets rid of extra bits of info attached to the transcript that we don't need
sub clean_transcript {
    my $tr = shift;

    foreach my $key(qw(display_xref external_db external_display_name external_name external_status created_date status description edits_enabled modified_date dbentries is_current analysis transcript_mapper)) {
        delete $tr->{$key} if defined($tr->{$key});
    }

    # clean all attributes but miRNA
    if(defined($tr->{attributes})) {

my @new_atts;
        foreach my $att(@{$tr->{attributes}}) {
            push @new_atts, $att if $att->{code} eq 'miRNA';
        }
        $tr->{attributes} = \@new_atts;
    }

    # clean the translation
    if(defined($tr->translation)) {

        # sometimes the translation points to a different transcript?
        # re-point it at this one and weaken to avoid a reference cycle
        $tr->{translation}->{transcript} = $tr;
        weaken($tr->{translation}->{transcript});

        for my $key(qw(attributes protein_features created_date modified_date)) {
            delete $tr->translation->{$key};
        }
    }
}

# gets rid of extra bits of info attached to genes. At the moment this is almost
# everything as genes are only used for their locations when looking at
# structural variations
sub clean_gene {
    my $gene = shift;

    # delete almost everything in the gene
    map {delete $gene->{$_}}
        grep {
            $_ ne 'start' &&
            $_ ne 'end' &&
            $_ ne 'strand' &&
            $_ ne 'stable_id'
        }
        keys %{$gene};
}

# build slice cache from transcript cache
# Reuses the slice attached to the first cached transcript per chromosome,
# falling back to a fresh fetch; chromosomes with no slice are dropped.
sub build_slice_cache {
    my $config = shift;
    my $tr_cache = shift;

    my %slice_cache;

    foreach my $chr(keys %$tr_cache) {
        $slice_cache{$chr} = scalar @{$tr_cache->{$chr}} ?
$tr_cache->{$chr}[0]->slice : &get_slice($config, $chr);

        if(!defined($slice_cache{$chr})) {
            delete $slice_cache{$chr}
        }

        else {
            # reattach adaptor to the coord system
            $slice_cache{$chr}->{coord_system}->{adaptor} ||= $config->{csa};
        }
    }

    return \%slice_cache;
}

# pre-fetches per-transcript data
# Populates $tr->{_variation_effect_feature_cache} (introns, mapper, peptide,
# protein features, codon table, SIFT/PolyPhen predictions) plus gene/xref
# shortcuts (_gene, _gene_hgnc, _ccds, _refseq, _protein). Returns $tr.
sub prefetch_transcript_data {
    my $config = shift;
    my $tr = shift;

    # introns
    my $introns = $tr->get_all_Introns;

    if(defined($introns)) {
        # strip adaptors and linkage so the introns can be serialised
        foreach my $intron(@$introns) {
            foreach my $key(qw(adaptor analysis dbID next prev seqname)) {
                delete $intron->{$key};
            }
        }
    }

    $tr->{_variation_effect_feature_cache}->{introns} ||= $introns;

    # translateable_seq, mapper
    $tr->{_variation_effect_feature_cache}->{translateable_seq} ||= $tr->translateable_seq;
    $tr->{_variation_effect_feature_cache}->{mapper} ||= $tr->get_TranscriptMapper;

    # peptide (undef for non-coding transcripts)
    unless ($tr->{_variation_effect_feature_cache}->{peptide}) {
        my $translation = $tr->translate;
        $tr->{_variation_effect_feature_cache}->{peptide} = $translation ? $translation->seq : undef;
    }

    # protein features
    if(defined($config->{domains}) || defined($config->{write_cache})) {
        my $pfs = $tr->translation ?
$tr->translation->get_all_ProteinFeatures : [];

        # clean them to save cache space
        foreach my $pf(@$pfs) {

            # remove everything but the coord, analysis and ID fields
            foreach my $key(keys %$pf) {
                delete $pf->{$key} unless
                    $key eq 'start' ||
                    $key eq 'end' ||
                    $key eq 'analysis' ||
                    $key eq 'hseqname';
            }

            # remove everything from the analysis but the display label
            foreach my $key(keys %{$pf->{analysis}}) {
                delete $pf->{analysis}->{$key} unless $key eq '_display_label';
            }
        }

        $tr->{_variation_effect_feature_cache}->{protein_features} = $pfs;
    }

    # codon table
    unless ($tr->{_variation_effect_feature_cache}->{codon_table}) {
        # for mitochondrial dna we need to use a different codon table
        my $attrib = $tr->slice->get_all_Attributes('codon_table')->[0];

        # default to the standard codon table (1) when no attribute is set
        $tr->{_variation_effect_feature_cache}->{codon_table} = $attrib ? $attrib->value : 1;
    }

    # sift/polyphen
    if(defined($config->{pfpma}) && defined($tr->{_variation_effect_feature_cache}->{peptide})) {
        foreach my $analysis(qw(sift polyphen)) {
            next unless defined($config->{$analysis});
            my $a = $analysis;
            # polyphen predictions are keyed by the humvar-trained analysis
            $a .= '_humvar' if $a eq 'polyphen';
            $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$a} ||= $config->{pfpma}->fetch_by_analysis_translation_md5($a, md5_hex($tr->{_variation_effect_feature_cache}->{peptide}))
        }
    }

    # gene
    $tr->{_gene} ||= $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id);

    # gene HGNC
    if(defined $config->{hgnc}) {

        # get from gene cache if found already
        if(defined($tr->{_gene}->{_hgnc})) {
            $tr->{_gene_hgnc} = $tr->{_gene}->{_hgnc};
        }
        else {
            my @entries = grep {$_->database eq 'HGNC'} @{$tr->{_gene}->get_all_DBEntries()};
            if(scalar @entries) {
                $tr->{_gene_hgnc} = $entries[0]->display_id;
            }

            $tr->{_gene_hgnc} ||= '-';

            # cache it on the gene object too
            $tr->{_gene}->{_hgnc} = $tr->{_gene_hgnc};
        }
    }

    # CCDS
    my @entries = grep {$_->database eq 'CCDS'}
@{$tr->get_all_DBEntries};
    $tr->{_ccds} = $entries[0]->display_id if scalar @entries;
    $tr->{_ccds} ||= '-';

    # refseq
    @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries};
    if(scalar @entries) {
        $tr->{_refseq} = join ",", map {$_->display_id} @entries;
    }
    else {
        $tr->{_refseq} = '-';
    }

    # protein stable ID
    $tr->{_protein} = $tr->translation ? $tr->translation->stable_id : '-';

    return $tr;
}

# Builds the on-disk cache file path for a chr/region/type; 'transcript'
# (the default) gets no suffix. Creates the directory when writing.
sub get_dump_file_name {
    my $config = shift;
    my $chr = shift;
    my $region = shift;
    my $type = shift;

    $type ||= 'transcript';

    if($type eq 'transcript') {
        $type = '';
    }
    else {
        $type = '_'.$type;
    }

    # per-megabase subdirectories are currently disabled
    #my ($s, $e) = split /\-/, $region;
    #my $subdir = int($s / 1e6);
    #
    #my $dir = $config->{dir}.'/'.$chr.'/'.$subdir;

    my $dir = $config->{dir}.'/'.$chr;
    my $dump_file = $dir.'/'.$region.$type.(defined($config->{tabix}) ? '_tabix' : '').'.gz';

    # make directory if it doesn't exist
    if(!(-e $dir) && defined($config->{write_cache})) {
        make_path($dir);
    }

    return $dump_file;
}

# dumps out transcript cache to file
# Strips adaptors/DB handles, then serialises the cache with Storable
# through an external gzip pipe.
sub dump_transcript_cache {
    my $config = shift;
    my $tr_cache = shift;
    my $chr = shift;
    my $region = shift;

    debug("Dumping cached transcript data") unless defined($config->{quiet});

    # clean the slice adaptor before storing
    clean_slice_adaptor($config);

    strip_transcript_cache($config, $tr_cache);

    # drop live DB connections so they aren't serialised
    $config->{reg}->disconnect_all;
    delete $config->{sa}->{dbc}->{_sql_helper};

    my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript');

    debug("Writing to $dump_file") unless defined($config->{quiet});

    # storable
    open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
    nstore_fd($tr_cache, $fh);
    close $fh;
}

# tabix-based transcript dumping - disabled, kept for reference
#sub dump_transcript_cache_tabix {
#    my $config = shift;
#    my $tr_cache = shift;
#    my $chr = shift;
#    my $region = shift;
#
#    debug("Dumping cached transcript data") unless
#        defined($config->{quiet});
#
#    # clean the slice adaptor before storing
#    clean_slice_adaptor($config);
#
#    strip_transcript_cache($config, $tr_cache);
#
#    $config->{reg}->disconnect_all;
#
#    my $dir = $config->{dir}.'/'.$chr;
#    my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz';
#
#    # make directory if it doesn't exist
#    if(!(-e $dir)) {
#        make_path($dir);
#    }
#
#    debug("Writing to $dump_file") unless defined($config->{quiet});
#
#    use Storable qw(nfreeze);
#    use MIME::Base64 qw(encode_base64);
#    #open NEW, "| bgzip -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
#    #
#    #foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) {
#    #    print NEW join "\t", (
#    #        $chr,
#    #        $tr->start,
#    #        $tr->end,
#    #        encode_base64(freeze($tr), "")
#    #    );
#    #    print NEW "\n";
#    #}
#    #close NEW;
#    #
#    ## tabix it
#    #my $output = `tabix -s 1 -b 2 -e 3 -f $dump_file 2>&1`;
#    #die("ERROR: Failed during tabix indexing\n$output\n") if $output;
#    open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
#
#    foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) {
#        print NEW join "\t", (
#            $chr,
#            $tr->start,
#            $tr->end,
#            encode_base64(freeze($tr), "")
#        );
#        print NEW "\n";
#    }
#    close NEW;
#}

# loads in dumped transcript cache to memory
# Reads a Storable dump written by dump_transcript_cache and reattaches
# adaptors; returns undef if the file is missing or cannot be opened.
sub load_dumped_transcript_cache {
    my $config = shift;
    my $chr = shift;
    my $region = shift;

    my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript');

    return undef unless -e $dump_file;

    debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});

    # $config->{compress} is the decompression command (e.g. "zcat")
    open my $fh, $config->{compress}." ".$dump_file."
|" or return undef;
    my $tr_cache;
    $tr_cache = fd_retrieve($fh);
    close $fh;

    # reattach adaptors
    foreach my $t(@{$tr_cache->{$chr}}) {
        if(defined($t->{translation})) {
            $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor};
            $t->{translation}->{transcript} = $t;
            weaken($t->{translation}->{transcript});
        }

        $t->{slice}->{adaptor} = $config->{sa};
    }

    return $tr_cache;
}

# tabix-based transcript loading - disabled, kept for reference
#sub load_dumped_transcript_cache_tabix {
#    my $config = shift;
#    my $chr = shift;
#    my $region = shift;
#    my $trim_regions = shift;
#
#    my $dir = $config->{dir}.'/'.$chr;
#    my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz';
#
#    #print STDERR "Reading from $dump_file\n";
#
#    return undef unless -e $dump_file;
#
#    debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
#
#    my $tr_cache;
#
#    use MIME::Base64 qw(decode_base64);
#    use Storable qw(thaw);
#
#    my ($s, $e) = split /\-/, $region;
#    my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}};
#    my $regions = "";
#    $regions .= " $chr\:$_" for @regions;
#
#    #print STDERR "tabix $dump_file $regions |\n";
#    #open IN, "tabix $dump_file $regions |";
#    open IN, "gzip -dc $dump_file |";
#    while(<IN>) {
#
#        #$DB::single = 1;
#        my ($chr, $start, $end, $blob) = split /\t/, $_;
#        next unless grep {overlap($start, $end, (split /\-/, $_))} @regions;
#        my $tr = thaw(decode_base64($blob));
#        push @{$tr_cache->{$chr}}, $tr;
#    }
#    close IN;
#
#    # reattach adaptors
#    foreach my $t(@{$tr_cache->{$chr}}) {
#        if(defined($t->{translation})) {
#            $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor};
#            $t->{translation}->{transcript} = $t;
#            weaken($t->{translation}->{transcript});
#        }
#
#        $t->{slice}->{adaptor} = $config->{sa};
#    }
#
#    # add empty array ref so code doesn't try and fetch from DB too
#
#    $tr_cache->{$chr} ||= [];
#
#    return $tr_cache;
#}

# strips cache before writing to disk
# Removes adaptors and DB-only exon fields so the serialised cache is small
# and carries no live database handles.
sub strip_transcript_cache {
    my $config = shift;
    my $cache = shift;

    foreach my $chr(keys %$cache) {
        foreach my $tr(@{$cache->{$chr}}) {
            foreach my $exon(@{$tr->{_trans_exon_array}}) {
                delete $exon->{slice}->{adaptor};

                for(qw(adaptor created_date modified_date is_current version is_constitutive _seq_cache dbID slice)) {
                    delete $exon->{$_};
                }
            }

            delete $tr->{adaptor};
            delete $tr->{slice}->{adaptor};
        }
    }
}

# cleans slice adaptor before storing in cache
sub clean_slice_adaptor{
    my $config = shift;

    # clean some stuff off the slice adaptor
    delete $config->{sa}->{asm_exc_cache};
    $config->{sa}->{sr_name_cache} = {};
    $config->{sa}->{sr_id_cache} = {};
    delete $config->{sa}->{db}->{seq_region_cache};
    delete $config->{sa}->{db}->{name_cache};
}


# dump adaptors to cache
# Serialises the whole $config (which holds the adaptors) to dir/adaptors.gz.
sub dump_adaptor_cache {
    my $config = shift;

    # drop live DB connections so they aren't serialised
    $config->{reg}->disconnect_all;
    delete $config->{sa}->{dbc}->{_sql_helper};

    my $dir = $config->{dir};
    my $dump_file = $dir.'/adaptors.gz';

    # make directory if it doesn't exist
    if(!(-e $dir)) {
        make_path($dir);
    }

    open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
    nstore_fd($config, $fh);
    close $fh;
}

# load dumped adaptors
# Restores the adaptor set saved by dump_adaptor_cache into $config;
# returns 1 on success, undef if no dump exists or it cannot be opened.
sub load_dumped_adaptor_cache {
    my $config = shift;

    my $dir = $config->{dir};
    my $dump_file = $dir.'/adaptors.gz';

    return undef unless -e $dump_file;

    debug("Reading cached adaptor data") unless defined($config->{quiet});

    open my $fh, $config->{compress}." ".$dump_file."
|" or return undef;
    my $cached_config;
    $cached_config = fd_retrieve($fh);
    close $fh;

    # copy only the adaptor entries across into the live config
    $config->{$_} = $cached_config->{$_} for qw(sa ga ta vfa svfa tva pfpma mca csa RegulatoryFeature_adaptor MotifFeature_adaptor);

    return 1;
}

# dumps cached variations to disk
# Writes one space-separated line per variant; fields equal to their default
# (failed 0, end==start, strand 1, undef ma/maf) are written as empty strings
# and restored by load_dumped_variation_cache.
sub dump_variation_cache {
    my $config = shift;
    my $v_cache = shift;
    my $chr = shift;
    my $region = shift;

    my $dump_file = get_dump_file_name($config, $chr, $region, 'var');

    open DUMP, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to adaptor dump file $dump_file";

    foreach my $pos(keys %{$v_cache->{$chr}}) {
        foreach my $v(@{$v_cache->{$chr}->{$pos}}) {
            my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = @$v;

            print DUMP join " ", (
                $name,
                $failed == 0 ? '' : $failed,
                $start,
                $end == $start ? '' : $end,
                $as,
                $strand == 1 ? '' : $strand,
                $ma || '',
                defined($maf) ? sprintf("%.2f", $maf) : '',
            );
            print DUMP "\n";
        }
    }

    close DUMP;
}

# loads dumped variation cache
# Inverse of dump_variation_cache: parses each line back into an arrayref
# keyed by chromosome and start position; returns undef if no dump exists.
sub load_dumped_variation_cache {
    my $config = shift;
    my $chr = shift;
    my $region = shift;

    my $dump_file = get_dump_file_name($config, $chr, $region, 'var');

    return undef unless -e $dump_file;

    open DUMP, $config->{compress}." ".$dump_file."
|" or return undef;

    my $v_cache;

    while(<DUMP>) {
        chomp;
        my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = split / /, $_;
        # restore the defaults that dump_variation_cache wrote as empty fields
        $failed ||= 0;
        $end ||= $start;
        $strand ||= 1;
        $ma ||= undef;
        $maf ||= undef;

        my @v = ($name, $failed, $start, $end, $as, $strand, $ma, $maf);
        push @{$v_cache->{$chr}->{$start}}, \@v;
    }

    close DUMP;

    return $v_cache;
}

# caches regulatory features
# Fetches regulatory features and motif features from the database for each
# requested region and builds a chr => { type => [features] } cache,
# annotating cell types when requested.
sub cache_reg_feats {
    my $config = shift;
    my $include_regions = shift;

    my $rf_cache;
    my $i;

    debug("Caching regulatory features") unless defined($config->{quiet});

    foreach my $chr(keys %$include_regions) {

        my $slice = get_slice($config, $chr);

        next unless defined $slice;

        # prefetch some things
        $slice->is_circular;

        # no regions? cover the whole chromosome in cache_region_size steps
        if(!scalar @{$include_regions->{$chr}}) {
            my $start = 1;
            my $end = $config->{cache_region_size};

            while($start < $slice->end) {
                push @{$include_regions->{$chr}}, $start.'-'.$end;
                $start += $config->{cache_region_size};
                $end += $config->{cache_region_size};
            }
        }

        my $region_count;

        if(scalar keys %$include_regions == 1) {
            my ($chr) = keys %$include_regions;
            $region_count = scalar @{$include_regions->{$chr}};
            # FIX: message said "Caching transcripts" - copy-paste from
            # cache_transcripts; this sub caches regulatory features
            debug("Caching regulatory features for chromosome $chr") unless defined($config->{quiet});
        }

        foreach my $region(@{$include_regions->{$chr}}) {
            progress($config, $i++, $region_count || $config->{region_count});

            my ($s, $e) = split /\-/, $region;

            # sanity check start and end
            $s = 1 if $s < 1;
            $e = $slice->end if $e > $slice->end;

            # get sub-slice
            my $sub_slice = $slice->sub_Slice($s, $e);

            # FIX: check definedness BEFORE dereferencing; the original
            # assigned into $sub_slice->{coord_system} first, which
            # autovivified an undef $sub_slice and made this guard dead code
            next unless defined($sub_slice);

            $sub_slice->{coord_system}->{adaptor} = $config->{csa};

            foreach my $type(@REG_FEAT_TYPES) {
                my $features = $config->{$type.'_adaptor'}->fetch_all_by_Slice($sub_slice);
                next unless defined($features);

                # cell types
                if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
                    foreach my $rf(@$features)
{ + + my %cl; + + # get cell type by fetching all from stable ID + if($type eq 'RegulatoryFeature') { + %cl = map { + $_->feature_set->cell_type->name => $_->feature_type->name + } @{$rf->adaptor->fetch_all_by_stable_ID($rf->stable_id)}; + } + + # get cell type by fetching regfeats that contain this MotifFeature + elsif($type eq 'MotifFeature') { + %cl = map { + $_->feature_set->cell_type->name => $_->feature_type->name + } @{$config->{'RegulatoryFeature_adaptor'}->fetch_all_by_attribute_feature($rf)}; + } + + $rf->{cell_types} = \%cl; + } + } + + push @{$rf_cache->{$chr}->{$type}}, + map { clean_reg_feat($_) } + map { $_->transfer($slice) } + @{$features}; + } + } + } + + end_progress($config); + + return $rf_cache; +} + + +# cleans reg feats for caching +sub clean_reg_feat { + my $rf = shift; + + foreach my $key(qw/adaptor binary_string bound_start bound_end attribute_cache feature_type feature_set analysis/) { + delete $rf->{$key}; + } + + if(defined($rf->{binding_matrix})) { + $rf->{_variation_effect_feature_cache}->{seq} = $rf->seq; + + foreach my $key(qw/adaptor feature_type analysis dbID/) { + delete $rf->{binding_matrix}->{$key}; + } + } + + return $rf; +} + + +# dumps out reg feat cache to file +sub dump_reg_feat_cache { + my $config = shift; + my $rf_cache = shift; + my $chr = shift; + my $region = shift; + + debug("Dumping cached reg feat data for $chr:$region") unless defined($config->{quiet}); + + # clean the slice adaptor before storing + clean_slice_adaptor($config); + + $config->{reg}->disconnect_all; + delete $config->{sa}->{dbc}->{_sql_helper}; + + foreach my $chr(keys %{$rf_cache}) { + foreach my $type(keys %{$rf_cache->{$chr}}) { + delete $_->{slice}->{coord_system}->{adaptor} for @{$rf_cache->{$chr}->{$type}}; + } + } + + my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); + + debug("Writing to $dump_file") unless defined($config->{quiet}); + + # storable + open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write 
to dump file $dump_file"; + nstore_fd($rf_cache, $fh); + close $fh; +} + +#sub dump_reg_feat_cache_tabix { +# my $config = shift; +# my $rf_cache = shift; +# my $chr = shift; +# my $region = shift; +# +# debug("Dumping cached reg feat data") unless defined($config->{quiet}); +# +# # clean the slice adaptor before storing +# clean_slice_adaptor($config); +# +# $config->{reg}->disconnect_all; +# delete $config->{sa}->{dbc}->{_sql_helper}; +# +# $config->{reg}->disconnect_all; +# +# my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); +# +# debug("Writing to $dump_file") unless defined($config->{quiet}); +# +# use Storable qw(nfreeze); +# use MIME::Base64 qw(encode_base64); +# open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; +# +# foreach my $type(keys %{$rf_cache->{$chr}}) { +# foreach my $rf(sort {$a->start <=> $b->start} @{$rf_cache->{$chr}->{$type}}) { +# print NEW join "\t", ( +# $chr, +# $rf->start, +# $rf->end, +# $type, +# encode_base64(freeze($rf), "") +# ); +# print NEW "\n"; +# } +# } +# close NEW; +#} + +# loads in dumped transcript cache to memory +sub load_dumped_reg_feat_cache { + my $config = shift; + my $chr = shift; + my $region = shift; + + my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); + + return undef unless -e $dump_file; + + debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); + + open my $fh, $config->{compress}." ".$dump_file." 
|" or return undef;
    my $rf_cache;
    # read the whole Storable structure back from the gzip pipe
    $rf_cache = fd_retrieve($fh);
    close $fh;

    return $rf_cache;
}



#sub load_dumped_reg_feat_cache_tabix {
#    my $config = shift;
#    my $chr = shift;
#    my $region = shift;
#    my $trim_regions = shift;
#
#    my $dump_file = get_dump_file_name($config, $chr, $region, 'reg');
#
#    #print STDERR "Reading from $dump_file\n";
#
#    return undef unless -e $dump_file;
#
#    debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
#
#    my $rf_cache;
#
#    use MIME::Base64 qw(decode_base64);
#    use Storable qw(thaw);
#
#    my ($s, $e) = split /\-/, $region;
#    my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}};
#    my $regions = "";
#    $regions .= " $chr\:$_" for @regions;
#
#    #print STDERR "tabix $dump_file $regions |\n";
#    #open IN, "tabix $dump_file $regions |";
#    open IN, "gzip -dc $dump_file |";
#    while(<IN>) {
#        my ($chr, $start, $end, $type, $blob) = split /\t/, $_;
#        next unless grep {overlap($start, $end, (split /\-/, $_))} @regions;
#        my $rf = thaw(decode_base64($blob));
#        push @{$rf_cache->{$chr}->{$type}}, $rf;
#    }
#    close IN;
#
#    $rf_cache->{$chr}->{$_} ||= [] for @REG_FEAT_TYPES;
#
#    return $rf_cache;
#}


# get custom annotation for a region
# Reads user-supplied annotation files (bed/vcf/gff/gtf/bigwig) for each
# region of $chr and returns a nested hashref:
# {chr}{custom name}{start}{feature name} => feature hashref.
# bigwig files are read via the external bigWigToWig utility; all other
# formats are fetched through a tabix pipe.
sub cache_custom_annotation {
    my $config = shift;
    my $include_regions = shift;
    my $chr = shift;

    #$include_regions = merge_regions($include_regions, $config, 1);

    my $annotation = {};

    my $total = scalar @{$config->{custom}} * scalar @{$include_regions->{$chr}};
    my $counter = 0;

    # tabix is invoked with batches of regions to keep command lines short
    my $max_regions_per_tabix = 1000;

    debug("Caching custom annotations") unless defined($config->{quiet});

    foreach my $custom(@{$config->{custom}}) {

        my @regions = @{$include_regions->{$chr}};

        while(scalar @regions) {
            my $got_features = 0;

            my @tmp_regions = splice @regions, 0, $max_regions_per_tabix;

            progress($config, $counter, $total);
            $counter += scalar @tmp_regions;

            # some files may have e.g. chr10 instead of 10
            for my $tmp_chr($chr, 'chr'.$chr) {

                # bigwig needs to use bigWigToWig utility
                if($custom->{format} eq 'bigwig') {
                    foreach my $region(@tmp_regions) {
                        my ($s, $e) = split /\-/, $region;
                        my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$.'_'.$tmp_chr.'_'.$s.'_'.$e;
                        my $bigwig_file = $custom->{file};
                        # any output from bigWigToWig is treated as an error
                        my $bigwig_output = `bigWigToWig -chrom=$tmp_chr -start=$s -end=$e $bigwig_file $tmp_file 2>&1`;

                        die "\nERROR: Problem using bigwig file $bigwig_file\n$bigwig_output" if $bigwig_output;
                    }

                    # concatenate all the files together
                    my $string = $config->{tmpdir}.'/vep_tmp_'.$$.'_*';
                    my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$;
                    `cat $string > $tmp_file; rm $string`;
                    open CUSTOM, $tmp_file
                        or die "\nERROR: Could not read from temporary WIG file $tmp_file\n";
                }

                # otherwise use tabix
                else {
                    # tabix can fetch multiple regions, so construct a string
                    my $region_string = join " ", map {$tmp_chr.':'.$_} @tmp_regions;

                    open CUSTOM, "tabix ".$custom->{file}." $region_string 2>&1 |"
                        or die "\nERROR: Could not open tabix pipe for ".$custom->{file}."\n";
                }

                # set an error flag so we don't have to check every line
                my $error_flag = 1;

                while(<CUSTOM>) {
                    chomp;

                    # check for errors (tabix writes its errors to the merged
                    # stream, so only the first line needs inspecting)
                    if($error_flag) {
                        die "\nERROR: Problem using annotation file ".$custom->{file}."\n$_\n" if /invalid pointer|tabix|get_intv/;
                        $error_flag = 0;
                    }

                    my @data = split /\t/, $_;

                    my $feature;

                    # normalise each format into a simple hashref with
                    # chr/start/end/name keys
                    if($custom->{format} eq 'bed') {
                        $feature = {
                            chr => $chr,
                            start => $data[1],
                            end => $data[2],
                            name => $data[3],
                        };
                    }

                    elsif($custom->{format} eq 'vcf') {
                        # reuse the VCF parser to get normalised coordinates
                        my $tmp_vf = parse_vcf($config, $_)->[0];

                        $feature = {
                            chr => $chr,
                            start => $tmp_vf->{start},
                            end => $tmp_vf->{end},
                            name => $tmp_vf->{variation_name},
                        };
                    }

                    elsif($custom->{format} eq 'gff' || $custom->{format} eq 'gtf') {

                        my $name;

                        # try and get a feature name from the attributes column
                        foreach my $attrib(split /\s*\;\s*/, $data[8]) {
                            my ($key, $value) = split /\=/, $attrib;
                            $name = $value if $key eq 'ID';
                        }

                        # fall back to type_chr:start-end
                        $name ||= $data[2]."_".$data[0].":".$data[3]."-".$data[4];

                        $feature = {
                            chr => $chr,
                            start => $data[3],
                            end => $data[4],
                            name => $name,
                        };
                    }

                    elsif($custom->{format} eq 'bigwig') {
                        # wig output: position and value per line
                        $feature = {
                            chr => $chr,
                            start => $data[0],
                            end => $data[0],
                            name => $data[1],
                        };
                    }

                    if(defined($feature)) {
                        $got_features = 1;

                        # use coordinates as the name if none found or if the
                        # user asked for coordinate naming
                        if(!defined($feature->{name}) || $custom->{coords}) {
                            $feature->{name} = $feature->{chr}.":".$feature->{start}."-".$feature->{end};
                        }

                        # add the feature to the cache
                        $annotation->{$chr}->{$custom->{name}}->{$feature->{start}}->{$feature->{name}} = $feature;
                    }
                }
                close CUSTOM;

                # unlink temporary wig files
                unlink($config->{tmpdir}.'/vep_tmp_'.$$) if $custom->{format} eq 'bigwig';

                # no need to fetch e.g. 
"chr21" features if just "21" worked
                last if $got_features;
            }
        }
    }

    end_progress($config);

    return $annotation;
}

# builds a full cache for this species
# Iterates over all requested chromosomes (or toplevel slices for "all"),
# and for each cache_region_size window dumps transcripts, optionally
# regulatory features, and known variations to the on-disk cache.
sub build_full_cache {
    my $config = shift;

    my @slices;

    if($config->{build} =~ /all/i) {
        @slices = @{$config->{sa}->fetch_all('toplevel')};
        # include LRG slices only when requested
        push @slices, @{$config->{sa}->fetch_all('lrg', undef, 1, undef, 1)} if defined($config->{lrg});
    }
    else {
        # --build accepts a comma-separated list of chromosomes or N-M ranges
        foreach my $val(split /\,/, $config->{build}) {
            my @nnn = split /\-/, $val;

            foreach my $chr($nnn[0]..$nnn[-1]) {
                my $slice = get_slice($config, $chr);
                push @slices, $slice if defined($slice);
            }
        }
    }

    foreach my $slice(@slices) {
        my $chr = $slice->seq_region_name;

        # check for features, we don't want a load of effectively empty dirs
        my $dbc = $config->{sa}->db->dbc;
        my $sth = $dbc->prepare("SELECT COUNT(*) FROM transcript WHERE seq_region_id = ?");
        $sth->execute($slice->get_seq_region_id);

        my $count;
        $sth->bind_columns(\$count);
        $sth->fetch;
        $sth->finish;

        next unless $count > 0;

        my $regions;

        # for progress
        my $region_count = int($slice->end / $config->{cache_region_size}) + 1;
        my $counter = 0;

        # initial region
        my ($start, $end) = (1, $config->{cache_region_size});

        debug((defined($config->{rebuild}) ? "Rebuild" : "Creat")."ing cache for chromosome $chr") unless defined($config->{quiet});

        while($start < $slice->end) {

            progress($config, $counter++, $region_count);

            # store quiet status - the nested cache/dump calls would
            # otherwise spam per-region debug output
            my $quiet = $config->{quiet};
            $config->{quiet} = 1;

            # spoof regions
            $regions->{$chr} = [$start.'-'.$end];

            # store transcripts; in rebuild mode reuse the existing dump
            my $tmp_cache = (defined($config->{rebuild}) ? load_dumped_transcript_cache($config, $chr, $start.'-'.$end) : cache_transcripts($config, $regions));
            $tmp_cache->{$chr} ||= [];

            #(defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $start.'-'.$end) : dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end));
            dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end);
            undef $tmp_cache;

            # store reg feats
            if(defined($config->{regulatory})) {
                my $rf_cache = cache_reg_feats($config, $regions);
                $rf_cache->{$chr} ||= {};

                dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end);
                #(defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $rf_cache, $chr, $start.'-'.$end) : dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end));
                undef $rf_cache;

                # this gets cleaned off but needs to be there for the next loop
                $slice->{coord_system}->{adaptor} = $config->{csa};
            }

            # store variations
            my $variation_cache;
            $variation_cache->{$chr} = get_variations_in_region($config, $chr, $start.'-'.$end);
            $variation_cache->{$chr} ||= {};

            dump_variation_cache($config, $variation_cache, $chr, $start.'-'.$end);
            undef $variation_cache;

            # restore quiet status
            $config->{quiet} = $quiet;

            # increment by cache_region_size to get next region
            $start += $config->{cache_region_size};
            $end += $config->{cache_region_size};
        }

        end_progress($config);

        undef $regions;
    }

    write_cache_info($config);
}

# write an info file that defines what is in the cache
# Appends (not overwrites) a timestamped parameter block to dir/info.txt so
# successive builds leave an audit trail; read back by read_cache_info().
sub write_cache_info {
    my $config = shift;

    my $info_file = $config->{dir}.'/info.txt';

    open OUT, ">>$info_file" or die "ERROR: Could not write to cache info file $info_file\n";

    print OUT "# CACHE UPDATED ".get_time()."\n";

    # undefined params are recorded as '-' so read_cache_info can skip them
    foreach my $param(qw(
        host
        port
        user
        build
        regulatory
        sift
        polyphen
    )) {
        print OUT "$param\t".(defined $config->{$param} ? 
$config->{$param} : '-')."\n";
    }

    # cell types
    if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
        my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor();
        print OUT "cell_types\t".(join ",", map {$_->name} @{$cta->fetch_all});
        print OUT "\n";
    }

    close OUT;
}

# reads in cache info file
# Loads dir/info.txt, storing each "param<TAB>value" pair as
# $config->{cache_<param>}; returns 0 when no info file exists.
sub read_cache_info {
    my $config = shift;

    my $info_file = $config->{dir}.'/info.txt';

    open IN, $info_file or return 0;

    while(<IN>) {
        next if /^#/;
        chomp;
        my ($param, $value) = split /\t/;
        # '-' is the write_cache_info placeholder for "not set"
        $config->{'cache_'.$param} = $value unless defined $value && $value eq '-';
    }

    close IN;

    return 1;
}

# format coords for printing
# Renders a (start, end) pair as "start-end", a single position when they
# are equal, '?' for a missing bound, or '-' when both are missing; swaps
# the bounds when start > end (e.g. insertions).
sub format_coords {
    my ($start, $end) = @_;

    if(defined($start)) {
        if(defined($end)) {
            if($start > $end) {
                return $end.'-'.$start;
            }
            elsif($start == $end) {
                return $start;
            }
            else {
                return $start.'-'.$end;
            }
        }
        else {
            return $start.'-?';
        }
    }
    elsif(defined($end)) {
        return '?-'.$end;
    }
    else {
        return '-';
    }
}




# METHODS TO FIND CO-LOCATED / EXISTING VARIATIONS
##################################################

# finds an existing VF in the db
# Looks up variation_features co-located with $new_vf (same seq region,
# start and end), excluding ones deemed novel by is_var_novel() and failed
# variants above the configured --failed level; returns a comma-joined
# string of names, or undef when none found or no DB is attached.
sub find_existing {
    my $config = shift;
    my $new_vf = shift;

    if(defined($config->{vfa}->db)) {

        # minor allele columns only exist in newer schema versions
        my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL';

        my $sth = $config->{vfa}->db->dbc->prepare(qq{
            SELECT variation_name, IF(fv.variation_id IS NULL, 0, 1), seq_region_start, seq_region_end, allele_string, seq_region_strand, $maf_cols
            FROM variation_feature vf LEFT JOIN failed_variation fv
            ON vf.variation_id = fv.variation_id
            WHERE vf.seq_region_id = ?
            AND vf.seq_region_start = ?
            AND vf.seq_region_end = ?
            ORDER BY vf.source_id ASC
        });

        $sth->execute($new_vf->slice->get_seq_region_id, $new_vf->start, $new_vf->end);

        # bind the 8 selected columns: name, failed, start, end, alleles,
        # strand, minor allele, MAF
        my @v;
        for my $i(0..7) {
            $v[$i] = undef;
        }

        $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]);

        my @found;

        while($sth->fetch) {
            push @found, $v[0] unless is_var_novel($config, \@v, $new_vf) || $v[1] > $config->{failed};
        }

        $sth->finish();

        return (scalar @found ? join ",", @found : undef);
    }

    return undef;
}

# compare a new vf to one from the cache / DB
# Returns 0 (known) when coordinates match; with --check_alleles also
# requires every new allele to be present in the existing allele string
# (reverse-complemented when strands differ).
sub is_var_novel {
    my $config = shift;
    my $existing_var = shift;
    my $new_var = shift;

    my $is_novel = 1;

    # [2]/[3] are seq_region_start/end of the existing record
    $is_novel = 0 if $existing_var->[2] == $new_var->start && $existing_var->[3] == $new_var->end;

    if(defined($config->{check_alleles})) {
        my %existing_alleles;

        $existing_alleles{$_} = 1 for split /\//, $existing_var->[4];

        my $seen_new = 0;
        foreach my $a(split /\//, ($new_var->allele_string || "")) {
            # NOTE(review): strand values compared with string 'ne' - assumes
            # both sides use the same numeric representation (1/-1); confirm
            reverse_comp(\$a) if $new_var->strand ne $existing_var->[5];
            $seen_new = 1 unless defined $existing_alleles{$a};
        }

        $is_novel = 1 if $seen_new;
    }

    return $is_novel;
}

# check frequencies of existing var against requested params
# Applies the --freq_* filters to the named variation's allele frequencies;
# returns 1 when the variant survives filtering, 0 when it is filtered out.
# Matching population frequencies are recorded in $config->{filtered_freqs}.
sub check_frequencies {
    my $config = shift;
    my $var_name = shift;

    my $v = $config->{va}->fetch_by_name($var_name);

    my $pass = 0;

    my $freq_pop = $config->{freq_pop};
    my $freq_freq = $config->{freq_freq};
    my $freq_gt_lt = $config->{freq_gt_lt};

    # the part after the last underscore names a sub-population, except for
    # the special 1kg/hap keywords
    my $freq_pop_name = (split /\_/, $freq_pop)[-1];
    $freq_pop_name = undef if $freq_pop_name =~ /1kg|hap/;

    delete $config->{filtered_freqs};

    foreach my $a(@{$v->get_all_Alleles}) {
        next unless defined $a->{population} || defined $a->{'_population_id'};
        next unless defined $a->frequency;
        # skip major alleles - presumably only minor-allele frequencies are
        # compared; confirm against caller expectations
        next if $a->frequency > 0.5;

        my $pop_name = $a->population->name;

        if($freq_pop =~ /1kg/) { next unless $pop_name =~ /^1000.+(low|phase).+/i; }
        if($freq_pop =~ /hap/) { next unless $pop_name =~ 
/^CSHL-HAPMAP/i; }
        if($freq_pop =~ /any/) { next unless $pop_name =~ /^(CSHL-HAPMAP)|(1000.+(low|phase).+)/i; }
        if(defined $freq_pop_name) { next unless $pop_name =~ /$freq_pop_name/i; }

        # a single matching allele is enough to set the pass flag
        $pass = 1 if $a->frequency >= $freq_freq and $freq_gt_lt eq 'gt';
        $pass = 1 if $a->frequency <= $freq_freq and $freq_gt_lt eq 'lt';

        $pop_name =~ s/\:/\_/g;
        push @{$config->{filtered_freqs}}, $pop_name.':'.$a->frequency;

        #warn "Comparing allele ", $a->allele, " ", $a->frequency, " for $var_name in population ", $a->population->name, " PASS $pass";
    }

    # exclude mode drops passing variants; include mode drops failing ones
    return 0 if $config->{freq_filter} eq 'exclude' and $pass == 1;
    return 0 if $config->{freq_filter} eq 'include' and $pass == 0;
    return 1;
}

# gets all variations in a region
# Returns a hashref keyed by seq_region_start, each value an arrayref of
# [name, failed, start, end, alleles, strand, minor allele, MAF] records
# for variation features whose start falls inside $region ("start-end").
sub get_variations_in_region {
    my $config = shift;
    my $chr = shift;
    my $region = shift;

    my ($start, $end) = split /\-/, $region;

    my %variations;

    if(defined($config->{vfa}->db)) {
        # lazily build and memoise the chromosome-name => seq_region_id map
        my $sr_cache = $config->{seq_region_cache};

        if(!defined($sr_cache)) {
            $sr_cache = cache_seq_region_ids($config);
            $config->{seq_region_cache} = $sr_cache;
        }

        # no seq_region_id?
        return {} unless defined($sr_cache) && defined($sr_cache->{$chr});

        my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL';

        $sth = $config->{vfa}->db->dbc->prepare(qq{
            SELECT vf.variation_name, IF(fv.variation_id IS NULL, 0, 1), vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand, $maf_cols
            FROM variation_feature vf
            LEFT JOIN failed_variation fv ON fv.variation_id = vf.variation_id
            WHERE vf.seq_region_id = ?
            AND vf.seq_region_start >= ?
            AND vf.seq_region_start <= ?
        });

        $sth->execute($sr_cache->{$chr}, $start, $end);

        my @v;
        for my $i(0..7) {
            $v[$i] = undef;
        }

        $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]);

        while($sth->fetch) {
            # copy the bound row - @v is reused by the next fetch
            my @v_copy = @v;
            push @{$variations{$v[2]}}, \@v_copy;
        }

        $sth->finish();
    }

    return \%variations;
}

# Builds a chromosome-name => seq_region_id lookup from the seq_region table.
sub cache_seq_region_ids {
    my $config = shift;

    my (%cache, $chr, $id);

    my $sth = $config->{vfa}->db->dbc->prepare(qq{
        SELECT seq_region_id, name FROM seq_region
    });

    $sth->execute();
    $sth->bind_columns(\$id, \$chr);
    $cache{$chr} = $id while $sth->fetch();
    $sth->finish;

    return \%cache;
}

# Detects (once, then memoised in $config) whether the variation_feature
# table carries the minor_allele/minor_allele_freq columns.
sub have_maf_cols {
    my $config = shift;

    if(!defined($config->{have_maf_cols})) {
        my $sth = $config->{vfa}->db->dbc->prepare(qq{
            DESCRIBE variation_feature
        });

        $sth->execute();
        my @cols = map {$_->[0]} @{$sth->fetchall_arrayref};
        $sth->finish();

        $config->{have_maf_cols} = 0;
        $config->{have_maf_cols} = 1 if grep {$_ eq 'minor_allele'} @cols;
    }

    return $config->{have_maf_cols};
}

# Recursively merges hash $y into $x in place (and returns $x): array values
# are unioned, hash values merged recursively, scalars overwritten by $y.
sub merge_hashes {
    my ($x, $y) = @_;

    foreach my $k (keys %$y) {
        if (!defined($x->{$k})) {
            $x->{$k} = $y->{$k};
        } else {
            if(ref($x->{$k}) eq 'ARRAY') {
                $x->{$k} = merge_arrays($x->{$k}, $y->{$k});
            }
            elsif(ref($x->{$k}) eq 'HASH') {
                $x->{$k} = merge_hashes($x->{$k}, $y->{$k});
            }
            else {
                $x->{$k} = $y->{$k};
            }
        }
    }
    return $x;
}

# Returns the deduplicated union of two arrayrefs; element order is not
# preserved (hash-key order).
sub merge_arrays {
    my ($x, $y) = @_;

    my %tmp = map {$_ => 1} (@$x, @$y);

    return [keys %tmp];
}




# DEBUG AND STATUS METHODS
##########################

# gets time
# Returns the current local time as "YYYY-MM-DD HH:MM:SS".
# NOTE(review): the empty prototype () is cosmetic here; callers invoke it
# without arguments
sub get_time() {
    my @time = localtime(time());

    # increment the month (Jan = 0)
    $time[4]++;

    # add leading zeroes as required
    for my $i(0..4) {
        $time[$i] = "0".$time[$i] if $time[$i] < 10;
    }

    # put the components together in a string
    my $time =
        ($time[5] + 1900)."-".
        $time[4]."-".
        $time[3]." ".
        $time[2].":".
        $time[1].":".
$time[0];

    return $time;
}

# prints debug output with time
# Joins all arguments into one message, prefixes a timestamp and ensures a
# trailing newline.
sub debug {
    my $text = (@_ ? (join "", @_) : "No message");
    my $time = get_time;

    print $time." - ".$text.($text =~ /\n$/ ? "" : "\n");
}

# finds out memory usage
# Returns [rss, vsz] for the current process by parsing `ps` output
# (platform-dependent - assumes a ps supporting -o rss,vsz; confirm on
# non-Unix systems).
sub memory {
    my @mem;

    open IN, "ps -o rss,vsz $$ |";
    while(<IN>) {
        next if $_ =~ /rss/i;    # skip the header row
        chomp;
        @mem = split;
    }
    close IN;

    return \@mem;
}

# Returns the change in [rss, vsz] since the previous call, storing the
# current reading in $config->{memory}; the first call returns the absolute
# values.
sub mem_diff {
    my $config = shift;
    my $mem = memory();
    my @diffs;

    if(defined($config->{memory})) {
        for my $i(0..(scalar @{$config->{memory}} - 1)) {
            push @diffs, $mem->[$i] - $config->{memory}->[$i];
        }
    }
    else {
        @diffs = @$mem;
    }

    $config->{memory} = $mem;

    return \@diffs;
}

# update or initiate progress bar
# Redraws a terminal progress bar for $i of $total; suppressed under
# --quiet/--no_progress and throttled so identical frames are not rewritten.
sub progress {
    my ($config, $i, $total) = @_;

    return if defined($config->{quiet}) || defined($config->{no_progress});

    my $width = $config->{terminal_width} || 60;
    my $percent = int(($i/$total) * 100);
    my $numblobs = int((($i/$total) * $width) - 2);

    # this ensures we're not writing to the terminal too much
    return if(defined($config->{prev_prog})) && $numblobs.'-'.$percent eq $config->{prev_prog};
    $config->{prev_prog} = $numblobs.'-'.$percent;

    printf("\r% -${width}s% 1s% 10s", '['.('=' x $numblobs).($numblobs == $width - 2 ? '=' : '>'), ']', "[ " . $percent . "% ]");
}

# end progress bar
# Draws the bar at 100%, terminates the line and resets the throttle state.
sub end_progress {
    my $config = shift;
    return if defined($config->{quiet}) || defined($config->{no_progress});
    progress($config, 1,1);
    print "\n";
    delete $config->{prev_prog};
}

1;