comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/VEP.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 # EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence
22 #
23 #
24
25 =head1 NAME
26
27 Bio::EnsEMBL::Variation::Utils::VEP - Methods used by the Variant Effect Predictor
28
29 =head1 SYNOPSIS
30
31 use Bio::EnsEMBL::Variation::Utils::VEP qw(configure);
32
33 my $config = configure();
34
35 =head1 METHODS
36
37 =cut
38
39
40 use strict;
41 use warnings;
42
43 package Bio::EnsEMBL::Variation::Utils::VEP;
44
45 # module list
46 use Getopt::Long;
47 use FileHandle;
48 use File::Path qw(make_path);
49 use Storable qw(nstore_fd fd_retrieve freeze thaw);
50 use Scalar::Util qw(weaken);
51 use Digest::MD5 qw(md5_hex);
52
53 use Bio::EnsEMBL::Registry;
54 use Bio::EnsEMBL::Variation::VariationFeature;
55 use Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor;
56 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT overlap);
57 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
58 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
59 use Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3;
60 use Bio::EnsEMBL::Variation::StructuralVariationFeature;
61 use Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor;
62 use Bio::EnsEMBL::Variation::TranscriptStructuralVariation;
63
64 # we need to manually include all these modules for caching to work
65 use Bio::EnsEMBL::CoordSystem;
66 use Bio::EnsEMBL::Transcript;
67 use Bio::EnsEMBL::Translation;
68 use Bio::EnsEMBL::Exon;
69 use Bio::EnsEMBL::ProteinFeature;
70 use Bio::EnsEMBL::Analysis;
71 use Bio::EnsEMBL::DBSQL::GeneAdaptor;
72 use Bio::EnsEMBL::DBSQL::SliceAdaptor;
73 use Bio::EnsEMBL::DBSQL::TranslationAdaptor;
74 use Bio::EnsEMBL::DBSQL::TranscriptAdaptor;
75 use Bio::EnsEMBL::DBSQL::MetaContainer;
76 use Bio::EnsEMBL::DBSQL::CoordSystemAdaptor;
77
78 use Exporter;
79 use vars qw(@ISA @EXPORT_OK);
80 @ISA = qw(Exporter);
81
82 # open socket pairs for cross-process comms
83 use Socket;
84 socketpair(CHILD, PARENT, AF_UNIX, SOCK_STREAM, PF_UNSPEC) or die "ERROR: Failed to open socketpair: $!";
85 CHILD->autoflush(1);
86 PARENT->autoflush(1);
87
88 @EXPORT_OK = qw(
89 &parse_line
90 &vf_to_consequences
91 &validate_vf
92 &read_cache_info
93 &dump_adaptor_cache
94 &load_dumped_adaptor_cache
95 &load_dumped_variation_cache
96 &get_all_consequences
97 &get_slice
98 &build_slice_cache
99 &build_full_cache
100 &regions_from_hash
101 &get_time
102 &debug
103 &convert_to_vcf
104 &progress
105 &end_progress
106 @REG_FEAT_TYPES
107 @OUTPUT_COLS
108 @VEP_WEB_CONFIG
109 %FILTER_SHORTCUTS
110 );
111
112 our @OUTPUT_COLS = qw(
113 Uploaded_variation
114 Location
115 Allele
116 Gene
117 Feature
118 Feature_type
119 Consequence
120 cDNA_position
121 CDS_position
122 Protein_position
123 Amino_acids
124 Codons
125 Existing_variation
126 Extra
127 );
128
129 our @REG_FEAT_TYPES = qw(
130 RegulatoryFeature
131 MotifFeature
132 );
133
134 our @VEP_WEB_CONFIG = qw(
135 format
136 check_existing
137 coding_only
138 core_type
139 hgnc
140 protein
141 hgvs
142 terms
143 check_frequency
144 freq_filter
145 freq_gt_lt
146 freq_freq
147 freq_pop
148 sift
149 polyphen
150 regulatory
151 );
152
153 our %FILTER_SHORTCUTS = (
154 upstream => {
155 '5KB_upstream_variant' => 1,
156 '2KB_upstream_variant' => 1,
157 },
158 downstream => {
159 '5KB_downstream_variant' => 1,
160 '2KB_downstream_variant' => 1,
161 '500B_downstream_variant' => 1,
162 },
163 utr => {
164 '5_prime_UTR_variant' => 1,
165 '3_prime_UTR_variant' => 1,
166 },
167 splice => {
168 splice_donor_variant => 1,
169 splice_acceptor_variant => 1,
170 splice_region_variant => 1,
171 },
172 coding_change => {
173 stop_lost => 1,
174 stop_gained => 1,
175 missense_variant => 1,
176 frameshift_variant => 1,
177 inframe_insertion => 1,
178 inframe_deletion => 1,
179 },
180 regulatory => {
181 regulatory_region_variant => 1,
182 TF_binding_site_variant => 1,
183 },
184 );
185
186 # parses a line of input, returns VF object(s)
187 sub parse_line {
188 my $config = shift;
189 my $line = shift;
190
191 # find out file format - will only do this on first line
192 if(!defined($config->{format}) || (defined($config->{format}) && $config->{format} eq 'guess')) {
193 $config->{format} = &detect_format($line);
194 debug("Detected format of input file as ", $config->{format}) unless defined($config->{quiet});
195
196 # HGVS and ID formats need DB
197 die("ERROR: Can't use ".uc($config->{format})." format in offline mode") if $config->{format} =~ /id|hgvs/ && defined($config->{offline});
198
199 # force certain options if format is VEP output
200 if($config->{format} eq 'vep') {
201 $config->{no_consequence} = 1;
202 delete $config->{regulatory};
203 debug("Forcing no consequence calculation") unless defined($config->{quiet});
204 }
205 }
206
207 # check that format is vcf when using --individual
208 die("ERROR: --individual only compatible with VCF input files\n") if defined($config->{individual}) && $config->{format} ne 'vcf';
209
210 my $parse_method = 'parse_'.$config->{format};
211 $parse_method =~ s/vep_//;
212 my $method_ref = \&$parse_method;
213
214 my $vfs = &$method_ref($config, $line);
215
216 $vfs = add_lrg_mappings($config, $vfs) if defined($config->{lrg});
217
218 return $vfs;
219 }
220
221 # sub-routine to detect format of input
222 sub detect_format {
223 my $line = shift;
224 my @data = split /\s+/, $line;
225
226 # HGVS: ENST00000285667.3:c.1047_1048insC
227 if (
228 scalar @data == 1 &&
229 $data[0] =~ /^([^\:]+)\:.*?([cgmrp]?)\.?([\*\-0-9]+.*)$/i
230 ) {
231 return 'hgvs';
232 }
233
234 # variant identifier: rs123456
235 elsif (
236 scalar @data == 1
237 ) {
238 return 'id';
239 }
240
241 # VCF: 20 14370 rs6054257 G A 29 0 NS=58;DP=258;AF=0.786;DB;H2 GT:GQ:DP:HQ
242 elsif (
243 $data[0] =~ /(chr)?\w+/ &&
244 $data[1] =~ /^\d+$/ &&
245 $data[3] =~ /^[ACGTN-]+$/i &&
246 $data[4] =~ /^([\.ACGTN-]+\,?)+$/i
247 ) {
248 return 'vcf';
249 }
250
251 # pileup: chr1 60 T A
252 elsif (
253 $data[0] =~ /(chr)?\w+/ &&
254 $data[1] =~ /^\d+$/ &&
255 $data[2] =~ /^[\*ACGTN-]+$/i &&
256 $data[3] =~ /^[\*ACGTNRYSWKM\+\/-]+$/i
257 ) {
258 return 'pileup';
259 }
260
261 # ensembl: 20 14370 14370 A/G +
262 elsif (
263 $data[0] =~ /\w+/ &&
264 $data[1] =~ /^\d+$/ &&
265 $data[2] =~ /^\d+$/ &&
266 $data[3] =~ /[ACGTN-]+\/[ACGTN-]+/i
267 ) {
268 return 'ensembl';
269 }
270
271 # vep output: ID 1:142849179 - - - - INTERGENIC
272 elsif (
273 $data[0] =~ /\w+/ &&
274 $data[1] =~ /^\w+?\:\d+(\-\d+)*$/ &&
275 scalar @data == 14
276 ) {
277 return 'vep';
278 }
279
280 else {
281 die("ERROR: Could not detect input file format\n");
282 }
283 }
284
285 # parse a line of Ensembl format input into a variation feature object
286 sub parse_ensembl {
287 my $config = shift;
288 my $line = shift;
289
290 my ($chr, $start, $end, $allele_string, $strand, $var_name) = split /\s+/, $line;
291
292 my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({
293 start => $start,
294 end => $end,
295 allele_string => $allele_string,
296 strand => $strand,
297 map_weight => 1,
298 adaptor => $config->{vfa},
299 variation_name => $var_name,
300 chr => $chr,
301 });
302
303 return [$vf];
304 }
305
306 # parse a line of VCF input into a variation feature object
307 sub parse_vcf {
308 my $config = shift;
309 my $line = shift;
310
311 my @data = split /\s+/, $line;
312
313 # non-variant
314 my $non_variant = 0;
315
316 if($data[4] eq '.') {
317 if(defined($config->{allow_non_variant})) {
318 $non_variant = 1;
319 }
320 else {
321 return [];
322 }
323 }
324
325 # get relevant data
326 my ($chr, $start, $end, $ref, $alt) = ($data[0], $data[1], $data[1], $data[3], $data[4]);
327
328 # some VCF files have a GRCh37 pos defined in GP flag in INFO column
329 # if user has requested, we can use that as the position instead
330 if(defined $config->{gp}) {
331 $chr = undef;
332 $start = undef;
333
334 foreach my $pair(split /\;/, $data[7]) {
335 my ($key, $value) = split /\=/, $pair;
336 if($key eq 'GP') {
337 ($chr, $start) = split /\:/, $value;
338 $end = $start;
339 }
340 }
341
342 unless(defined($chr) and defined($start)) {
343 warn "No GP flag found in INFO column" unless defined $config->{quiet};
344 return [];
345 }
346 }
347
348 # adjust end coord
349 $end += (length($ref) - 1);
350
351 # structural variation
352 if((defined($data[7]) && $data[7] =~ /SVTYPE/) || $alt =~ /\<|\[|\]|\>/) {
353
354 # parse INFO field
355 my %info = ();
356
357 foreach my $bit(split /\;/, $data[7]) {
358 my ($key, $value) = split /\=/, $bit;
359 $info{$key} = $value;
360 }
361
362 # like indels, SVs have the base before included for reference
363 $start++;
364
365 # work out the end coord
366 if(defined($info{END})) {
367 $end = $info{END};
368 }
369 elsif(defined($info{SVLEN})) {
370 $end = $start + abs($info{SVLEN}) - 1;
371 }
372
373 # check for imprecise breakpoints
374 my ($min_start, $max_start, $min_end, $max_end);
375
376 if(defined($info{CIPOS})) {
377 my ($low, $high) = split /\,/, $info{CIPOS};
378 $min_start = $start + $low;
379 $max_start = $start + $high;
380 }
381
382 if(defined($info{CIEND})) {
383 my ($low, $high) = split /\,/, $info{CIEND};
384 $min_end = $end + $low;
385 $max_end = $end + $high;
386 }
387
388 # get type
389 my $type = $info{SVTYPE};
390 my $so_term;
391
392 if(defined($type)) {
393 # convert to SO term
394 my %terms = (
395 INS => 'insertion',
396 DEL => 'deletion',
397 TDUP => 'tandem_duplication',
398 DUP => 'duplication'
399 );
400
401 $so_term = defined $terms{$type} ? $terms{$type} : $type;
402 }
403
404 my $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new_fast({
405 start => $start,
406 inner_start => $max_start,
407 outer_start => $min_start,
408 end => $end,
409 inner_end => $min_end,
410 outer_end => $max_end,
411 strand => 1,
412 adaptor => $config->{svfa},
413 variation_name => $data[2] eq '.' ? undef : $data[2],
414 chr => $chr,
415 class_SO_term => $so_term,
416 });
417
418 return [$svf];
419 }
420
421 # normal variation
422 else {
423 # find out if any of the alt alleles make this an insertion or a deletion
424 my ($is_indel, $is_sub, $ins_count, $total_count);
425 foreach my $alt_allele(split /\,/, $alt) {
426 $is_indel = 1 if $alt_allele =~ /D|I/;
427 $is_indel = 1 if length($alt_allele) != length($ref);
428 $is_sub = 1 if length($alt_allele) == length($ref);
429 $ins_count++ if length($alt_allele) > length($ref);
430 $total_count++;
431 }
432
433 # multiple alt alleles?
434 if($alt =~ /\,/) {
435 if($is_indel) {
436 my @alts;
437
438 if($alt =~ /D|I/) {
439 foreach my $alt_allele(split /\,/, $alt) {
440 # deletion (VCF <4)
441 if($alt_allele =~ /D/) {
442 push @alts, '-';
443 }
444
445 elsif($alt_allele =~ /I/) {
446 $alt_allele =~ s/^I//g;
447 push @alts, $alt_allele;
448 }
449 }
450 }
451
452 else {
453 $ref = substr($ref, 1) || '-';
454 $start++;
455
456 foreach my $alt_allele(split /\,/, $alt) {
457 $alt_allele = substr($alt_allele, 1);
458 $alt_allele = '-' if $alt_allele eq '';
459 push @alts, $alt_allele;
460 }
461 }
462
463 $alt = join "/", @alts;
464 }
465
466 else {
467 # for substitutions we just need to replace ',' with '/' in $alt
468 $alt =~ s/\,/\//g;
469 }
470 }
471
472 elsif($is_indel) {
473 # deletion (VCF <4)
474 if($alt =~ /D/) {
475 my $num_deleted = $alt;
476 $num_deleted =~ s/\D+//g;
477 $end += $num_deleted - 1;
478 $alt = "-";
479 $ref .= ("N" x ($num_deleted - 1)) unless length($ref) > 1;
480 }
481
482 # insertion (VCF <4)
483 elsif($alt =~ /I/) {
484 $ref = '-';
485 $alt =~ s/^I//g;
486 $start++;
487 }
488
489 # insertion or deletion (VCF 4+)
490 elsif(substr($ref, 0, 1) eq substr($alt, 0, 1)) {
491
492 # chop off first base
493 $ref = substr($ref, 1) || '-';
494 $alt = substr($alt, 1) || '-';
495
496 $start++;
497 }
498 }
499
500 # create VF object
501 my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({
502 start => $start,
503 end => $end,
504 allele_string => $non_variant ? $ref : $ref.'/'.$alt,
505 strand => 1,
506 map_weight => 1,
507 adaptor => $config->{vfa},
508 variation_name => $data[2] eq '.' ? undef : $data[2],
509 chr => $chr,
510 });
511
512 # flag as non-variant
513 $vf->{non_variant} = 1 if $non_variant;
514
515 # individuals?
516 if(defined($config->{individual})) {
517 my @alleles = split /\//, $ref.'/'.$alt;
518
519 my @return;
520
521 foreach my $ind(keys %{$config->{ind_cols}}) {
522
523 # get alleles present in this individual
524 my @bits;
525 my $gt = (split /\:/, $data[$config->{ind_cols}->{$ind}])[0];
526
527 my $phased = ($gt =~ /\|/ ? 1 : 0);
528
529 foreach my $bit(split /\||\/|\\/, $gt) {
530 push @bits, $alleles[$bit] unless $bit eq '.';
531 }
532
533 # shallow copy VF
534 my $vf_copy = { %$vf };
535 bless $vf_copy, ref($vf);
536
537 # get non-refs
538 my %non_ref = map {$_ => 1} grep {$_ ne $ref} @bits;
539
540 # construct allele_string
541 if(scalar keys %non_ref) {
542 $vf_copy->{allele_string} = $ref."/".(join "/", keys %non_ref);
543 }
544 else {
545 $vf_copy->{allele_string} = $ref;
546 $vf_copy->{non_variant} = 1;
547 }
548
549 # store phasing info
550 $vf_copy->{phased} = defined($config->{phased} ? 1 : $phased);
551
552 # store GT
553 $vf_copy->{genotype} = \@bits;
554
555 # store individual name
556 $vf_copy->{individual} = $ind;
557
558 push @return, $vf_copy;
559 }
560
561 return \@return;
562 }
563 else {
564 return [$vf];
565 }
566 }
567 }
568
569 # parse a line of pileup input into variation feature objects
570 sub parse_pileup {
571 my $config = shift;
572 my $line = shift;
573
574 my @data = split /\s+/, $line;
575
576 # pileup can produce more than one VF per line
577 my @return;
578
579 # normal variant
580 if($data[2] ne "*"){
581 my $var;
582
583 if($data[3] =~ /^[A|C|G|T]$/) {
584 $var = $data[3];
585 }
586 else {
587 ($var = unambiguity_code($data[3])) =~ s/$data[2]//ig;
588 }
589
590 for my $alt(split //, $var){
591 push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({
592 start => $data[1],
593 end => $data[1],
594 allele_string => $data[2].'/'.$alt,
595 strand => 1,
596 map_weight => 1,
597 adaptor => $config->{vfa},
598 chr => $data[0],
599 });
600 }
601 }
602
603 # in/del
604 else {
605 my %tmp_hash = map {$_ => 1} split /\//, $data[3];
606 my @genotype = keys %tmp_hash;
607
608 foreach my $allele(@genotype){
609 if(substr($allele,0,1) eq "+") { #ins
610 push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({
611 start => $data[1] + 1,
612 end => $data[1],
613 allele_string => '-/'.substr($allele, 1),
614 strand => 1,
615 map_weight => 1,
616 adaptor => $config->{vfa},
617 chr => $data[0],
618 });
619 }
620 elsif(substr($allele,0,1) eq "-"){ #del
621 push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({
622 start => $data[1] + 1,
623 end => $data[1] + length(substr($allele, 1)),
624 allele_string => substr($allele, 1).'/-',
625 strand => 1,
626 map_weight => 1,
627 adaptor => $config->{vfa},
628 chr => $data[0],
629 });
630 }
631 elsif($allele ne "*"){
632 warn("WARNING: invalid pileup indel genotype: $line\n") unless defined $config->{quiet};
633 }
634 }
635 }
636
637 return \@return;
638 }
639
640 # parse a line of HGVS input into a variation feature object
641 sub parse_hgvs {
642 my $config = shift;
643 my $line = shift;
644
645 my $vf;
646
647 # not all hgvs notations are supported yet, so we have to wrap it in an eval
648 eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{sa}, $config->{ta}) };
649
650 if((!defined($vf) || (defined $@ && length($@) > 1)) && defined($config->{coordinator})) {
651 eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{ofsa}, $config->{ofta}) };
652 }
653
654 if(!defined($vf) || (defined $@ && length($@) > 1)) {
655 warn("WARNING: Unable to parse HGVS notation \'$line\'\n$@") unless defined $config->{quiet};
656 return [];
657 }
658
659 # get whole chromosome slice
660 my $slice = $vf->slice->adaptor->fetch_by_region($vf->slice->coord_system->name, $vf->slice->seq_region_name);
661
662 $vf = $vf->transfer($slice);
663
664 # name it after the HGVS
665 $vf->{variation_name} = $line;
666
667 # add chr attrib
668 $vf->{chr} = $vf->slice->seq_region_name;
669
670 return [$vf];
671 }
672
673 # parse a variation identifier e.g. a dbSNP rsID
674 sub parse_id {
675 my $config = shift;
676 my $line = shift;
677
678 my $v_obj = $config->{va}->fetch_by_name($line);
679
680 return [] unless defined $v_obj;
681
682 my @vfs = @{$v_obj->get_all_VariationFeatures};
683 delete $_->{dbID} for @vfs;
684 delete $_->{overlap_consequences} for @vfs;
685 $_->{chr} = $_->seq_region_name for @vfs;
686
687 return \@vfs;
688 }
689
690 # parse a line of VEP output
691 sub parse_vep {
692 my $config = shift;
693 my $line = shift;
694
695 my @data = split /\t/, $line;
696
697 my ($chr, $start, $end) = split /\:|\-/, $data[1];
698 $end ||= $start;
699
700 # might get allele string from ID
701 my $allele_string;
702
703 if($data[0] =~ /^\w\_\w\_\w$/) {
704 my @split = split /\_/, $data[0];
705 $allele_string = $split[-1] if $split[-1] =~ /[ACGTN-]+\/[ACGTN-]+/;
706 }
707
708 $allele_string ||= 'N/'.($data[6] =~ /intergenic/ ? 'N' : $data[2]);
709
710 my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({
711 start => $start,
712 end => $end,
713 allele_string => $allele_string,
714 strand => 1,
715 map_weight => 1,
716 adaptor => $config->{vfa},
717 chr => $chr,
718 variation_name => $data[0],
719 });
720
721 return [$vf];
722 }
723
724
725
726 # converts to VCF format
727 sub convert_to_vcf {
728 my $config = shift;
729 my $vf = shift;
730
731 # look for imbalance in the allele string
732 my %allele_lengths;
733 my @alleles = split /\//, $vf->allele_string;
734
735 foreach my $allele(@alleles) {
736 $allele =~ s/\-//g;
737 $allele_lengths{length($allele)} = 1;
738 }
739
740 # in/del/unbalanced
741 if(scalar keys %allele_lengths > 1) {
742
743 # we need the ref base before the variation
744 # default to N in case we can't get it
745 my $prev_base = 'N';
746
747 unless(defined($config->{cache})) {
748 my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1);
749 $prev_base = $slice->seq if defined($slice);
750 }
751
752 for my $i(0..$#alleles) {
753 $alleles[$i] =~ s/\-//g;
754 $alleles[$i] = $prev_base.$alleles[$i];
755 }
756
757 return [
758 $vf->{chr} || $vf->seq_region_name,
759 $vf->start - 1,
760 $vf->variation_name,
761 shift @alleles,
762 (join ",", @alleles),
763 '.', '.', '.'
764 ];
765
766 }
767
768 # balanced sub
769 else {
770 return [
771 $vf->{chr} || $vf->seq_region_name,
772 $vf->start,
773 $vf->variation_name,
774 shift @alleles,
775 (join ",", @alleles),
776 '.', '.', '.'
777 ];
778 }
779 }
780
781
782 # tries to map a VF to the LRG coordinate system
783 sub add_lrg_mappings {
784 my $config = shift;
785 my $vfs = shift;
786
787 my @new_vfs;
788
789 foreach my $vf(@$vfs) {
790
791 # add the unmapped VF to the array
792 push @new_vfs, $vf;
793
794 # make sure the VF has an attached slice
795 $vf->{slice} ||= get_slice($config, $vf->{chr});
796 next unless defined($vf->{slice});
797
798 # transform LRG <-> chromosome
799 my $new_vf = $vf->transform($vf->{slice}->coord_system->name eq 'lrg' ? 'chromosome' : 'lrg');
800
801 # add it to the array if transformation worked
802 if(defined($new_vf)) {
803
804 # update new VF's chr entry
805 $new_vf->{chr} = $new_vf->seq_region_name;
806 push @new_vfs, $new_vf;
807 }
808 }
809
810 return \@new_vfs;
811 }
812
813
814 # wrapper for whole_genome_fetch and vf_to_consequences
815 # takes config and a listref of VFs, returns listref of line hashes for printing
816 sub get_all_consequences {
817 my $config = shift;
818 my $listref = shift;
819
820 if ($config->{extra}) {
821 eval "use Plugin qw($config);"
822 }
823
824 # initialize caches
825 $config->{$_.'_cache'} ||= {} for qw(tr rf slice);
826
827 # build hash
828 my %vf_hash;
829 push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref;
830
831 my @non_variant = grep {defined($_->{non_variant})} @$listref;
832 debug("Skipping ".(scalar @non_variant)." non-variant loci\n") unless defined($config->{quiet});
833
834 # get regions
835 my $regions = &regions_from_hash($config, \%vf_hash);
836 my $trim_regions = $regions;
837
838 # get trimmed regions - allows us to discard out-of-range transcripts
839 # when using cache
840 #if(defined($config->{cache})) {
841 # my $tmp = $config->{cache};
842 # delete $config->{cache};
843 # $trim_regions = regions_from_hash($config, \%vf_hash);
844 # $config->{cache} = $tmp;
845 #}
846
847 # prune caches
848 prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr});
849 prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf});
850
851 # get chr list
852 my %chrs = map {$_->{chr} => 1} @$listref;
853
854 my $fetched_tr_count = 0;
855 $fetched_tr_count = fetch_transcripts($config, $regions, $trim_regions)
856 unless defined($config->{no_consequences});
857
858 my $fetched_rf_count = 0;
859 $fetched_rf_count = fetch_regfeats($config, $regions, $trim_regions)
860 if defined($config->{regulatory})
861 && !defined($config->{no_consequences});
862
863 # check we can use MIME::Base64
864 if(defined($config->{fork})) {
865 eval q{ use MIME::Base64; };
866
867 if($@) {
868 debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet});
869 delete $config->{fork};
870 }
871 }
872
873 my (@temp_array, @return, @pids);
874 if(defined($config->{fork})) {
875 my $size = scalar @$listref;
876
877 while(my $tmp_vf = shift @$listref) {
878 push @temp_array, $tmp_vf;
879
880 # fork
881 if(scalar @temp_array >= ($size / $config->{fork}) || scalar @$listref == 0) {
882
883 my $pid = fork;
884
885 if(!defined($pid)) {
886 debug("WARNING: Failed to fork - will attempt to continue without forking") unless defined($config->{quiet});
887 push @temp_array, @$listref;
888 push @return, @{vf_list_to_cons($config, \@temp_array, $regions)};
889 last;
890 }
891 elsif($pid) {
892 push @pids, $pid;
893 @temp_array = ();
894 }
895 elsif($pid == 0) {
896
897 $config->{forked} = $$;
898 $config->{quiet} = 1;
899
900 # redirect STDERR to PARENT so we can catch errors
901 *STDERR = *PARENT;
902
903 my $cons = vf_list_to_cons($config, \@temp_array, $regions);
904
905 # what we're doing here is sending a serialised hash of the
906 # results through to the parent process through the socket.
907 # This is then thawed by the parent process.
908 # $$, or the PID, is added so that the input can be sorted
909 # back into the correct order for output
910 print PARENT $$." ".encode_base64(freeze($_), "\t")."\n" for @$cons;
911
912 # some plugins may cache stuff, check for this and try and
913 # reconstitute it into parent's plugin cache
914 foreach my $plugin(@{$config->{plugins}}) {
915
916 next unless defined($plugin->{has_cache});
917
918 # delete unnecessary stuff and stuff that can't be serialised
919 delete $plugin->{$_} for qw(config feature_types variant_feature_types version feature_types_wanted variant_feature_types_wanted params);
920 print PARENT $$." PLUGIN ".ref($plugin)." ".encode_base64(freeze($plugin), "\t")."\n";
921 }
922
923 # we need to tell the parent this child is finished
924 # otherwise it keeps listening
925 print PARENT "DONE $$\n";
926 close PARENT;
927
928 exit(0);
929 }
930 }
931 }
932
933 debug("Calculating consequences") unless defined($config->{quiet});
934
935 my $fh = $config->{out_file_handle};
936 my $done_processes = 0;
937 my $done_vars = 0;
938 my $total_size = $size;
939 my $pruned_count;
940
941 # create a hash to store the returned data by PID
942 # this means we can sort it correctly on output
943 my %by_pid;
944
945 # read child input
946 while(<CHILD>) {
947
948 # child finished
949 if(/^DONE/) {
950 $done_processes++;
951 last if $done_processes == scalar @pids;
952 }
953
954 # variant finished / progress indicator
955 elsif(/^BUMP/) {
956 progress($config, ++$done_vars, $total_size);# if $pruned_count == scalar @pids;
957 }
958
959 # output
960 elsif(/^\-?\d+ /) {
961
962 # plugin
963 if(/^\-?\d+ PLUGIN/) {
964
965 m/^(\-?\d+) PLUGIN (\w+) /;
966 my ($pid, $plugin) = ($1, $2);
967
968 # remove the PID
969 s/^\-?\d+ PLUGIN \w+ //;
970 chomp;
971
972 my $tmp = thaw(decode_base64($_));
973
974 next unless defined($plugin);
975
976 # copy data to parent plugin
977 my ($parent_plugin) = grep {ref($_) eq $plugin} @{$config->{plugins}};
978
979 next unless defined($parent_plugin);
980
981 merge_hashes($parent_plugin, $tmp);
982 }
983
984 else {
985 # grab the PID
986 m/^(\-?\d+)\s/;
987 my $pid = $1;
988 die "ERROR: Could not parse forked PID from line $_" unless defined($pid);
989
990 # remove the PID
991 s/^\-?\d+\s//;
992 chomp;
993
994 # decode and thaw "output" from forked process
995 push @{$by_pid{$pid}}, thaw(decode_base64($_));
996 }
997 }
998
999 elsif(/^PRUNED/) {
1000 s/PRUNED //g;
1001 chomp;
1002 $pruned_count++;
1003 $total_size += $_;
1004 }
1005
1006 elsif(/^DEBUG/) {
1007 print STDERR;
1008 }
1009
1010 # something's wrong
1011 else {
1012 # kill the other pids
1013 kill(15, $_) for @pids;
1014 die("\nERROR: Forked process failed\n$_\n");
1015 }
1016 }
1017
1018 end_progress($config);
1019
1020 debug("Writing output") unless defined($config->{quiet});
1021
1022 waitpid($_, 0) for @pids;
1023
1024 # add the sorted data to the return array
1025 push @return, @{$by_pid{$_} || []} for @pids;
1026 }
1027
1028 # no forking
1029 else {
1030 push @return, @{vf_list_to_cons($config, $listref, $regions)};
1031 }
1032
1033 if(defined($config->{debug})) {
1034 eval q{use Devel::Size qw(total_size)};
1035 my $mem = memory();
1036 my $tot;
1037 $tot += $_ for @$mem;
1038
1039 if($tot > 1000000) {
1040 $tot = sprintf("%.2fGB", $tot / (1024 * 1024));
1041 }
1042
1043 elsif($tot > 1000) {
1044 $tot = sprintf("%.2fMB", $tot / 1024);
1045 }
1046
1047 my $mem_diff = mem_diff($config);
1048 debug(
1049 "LINES ", $config->{line_number},
1050 "\tMEMORY $tot ", (join " ", @$mem),
1051 "\tDIFF ", (join " ", @$mem_diff),
1052 "\tCACHE ", total_size($config->{tr_cache}).
1053 "\tRF ", total_size($config->{rf_cache}),
1054 "\tVF ", total_size(\%vf_hash),
1055 );
1056 #exit(0) if grep {$_ < 0} @$mem_diff;
1057 }
1058
1059 return \@return;
1060 }
1061
1062 sub vf_list_to_cons {
1063 my $config = shift;
1064 my $listref = shift;
1065 my $regions = shift;
1066
1067 # get non-variants
1068 my @non_variants = grep {$_->{non_variant}} @$listref;
1069
1070 my %vf_hash = ();
1071 push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref;
1072
1073 # check existing VFs
1074 &check_existing_hash($config, \%vf_hash) if defined($config->{check_existing});# && scalar @$listref > 10;
1075
1076 # get overlapping SVs
1077 &check_svs_hash($config, \%vf_hash) if defined($config->{check_svs});
1078
1079 # if we are forked, we can trim off some stuff
1080 if(defined($config->{forked})) {
1081 my $tmp = $config->{cache};
1082 delete $config->{cache};
1083 $regions = regions_from_hash($config, \%vf_hash);
1084 $config->{cache} = $tmp;
1085
1086 # prune caches
1087 my $new_count = 0;
1088
1089 $new_count += prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr});
1090 $new_count += prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf});
1091
1092 print PARENT "PRUNED $new_count\n";
1093 }
1094
1095 my @return;
1096
1097 foreach my $chr(sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %vf_hash) {
1098 my $finished_vfs = whole_genome_fetch($config, $chr, \%vf_hash);
1099
1100 # non-variants?
1101 if(scalar @non_variants) {
1102 push @$finished_vfs, grep {$_->{chr} eq $chr} @non_variants;
1103
1104 # need to re-sort
1105 @$finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @$finished_vfs;
1106 }
1107
1108 debug("Calculating consequences") unless defined($config->{quiet});
1109
1110 my $vf_count = scalar @$finished_vfs;
1111 my $vf_counter = 0;
1112
1113 while(my $vf = shift @$finished_vfs) {
1114 progress($config, $vf_counter++, $vf_count) unless $vf_count == 1;
1115
1116 my $filter_ok = 1;
1117
1118 # filtered output
1119 if(defined($config->{filter})) {
1120 $filter_ok = filter_by_consequence($config, $vf);
1121 $config->{filter_count} += $filter_ok;
1122 }
1123
1124 # skip filtered lines
1125 next unless $filter_ok;
1126
1127 # original output
1128 if(defined($config->{original})) {
1129 push @return, $vf->{_line};
1130 }
1131
1132 # GVF output
1133 elsif(defined($config->{gvf})) {
1134 $vf->source("User");
1135
1136 $config->{gvf_id} ||= 1;
1137
1138 # get custom annotation
1139 my $custom_annotation = defined($config->{custom}) ? get_custom_annotation($config, $vf) : {};
1140 $custom_annotation->{ID} = $config->{gvf_id}++;
1141
1142
1143 my $tmp = $vf->to_gvf(
1144 include_consequences => defined($config->{no_consequences}) ? 0 : 1,
1145 extra_attrs => $custom_annotation,
1146 );
1147 push @return, \$tmp;
1148 }
1149
1150 # VCF output
1151 elsif(defined($config->{vcf})) {
1152
1153 # convert to VCF, otherwise get line
1154 my $line = $config->{format} eq 'vcf' ? [split /\s+/, $vf->{_line}] : convert_to_vcf($config, $vf);
1155
1156 if(!defined($line->[7]) || $line->[7] eq '.') {
1157 $line->[7] = '';
1158 }
1159
1160 # get all the lines the normal way
1161 # and process them into VCF-compatible string
1162 my $string = 'CSQ=';
1163
1164 foreach my $line(@{vf_to_consequences($config, $vf)}) {
1165
1166 # use the field list (can be user-defined by setting --fields)
1167 for my $col(@{$config->{fields}}) {
1168
1169 # skip fields already represented in the VCF
1170 next if $col eq 'Uploaded_variation' or $col eq 'Location' or $col eq 'Extra';
1171
1172 # search for data in main line hash as well as extra field
1173 my $data = defined $line->{$col} ? $line->{$col} : $line->{Extra}->{$col};
1174
1175 # "-" means null for everything except the Allele field (confusing...)
1176 $data = undef if defined($data) and $data eq '-' and $col ne 'Allele';
1177 $data =~ s/\,/\&/g if defined $data;
1178 $string .= defined($data) ? $data : '';
1179 $string .= '|';
1180 }
1181
1182 $string =~ s/\|$//;
1183 $string .= ',';
1184 }
1185
1186 $string =~ s/\,$//;
1187
1188 if(!defined($config->{no_consequences}) && $string ne 'CSQ=') {
1189 $line->[7] .= ($line->[7] ? ';' : '').$string;
1190 }
1191
1192 # get custom annotation
1193 if(defined($config->{custom}) && scalar @{$config->{custom}}) {
1194 my $custom_annotation = get_custom_annotation($config, $vf);
1195 foreach my $key(keys %{$custom_annotation}) {
1196 $line->[7] .= ($line->[7] ? ';' : '').$key.'='.$custom_annotation->{$key};
1197 }
1198 }
1199
1200 my $tmp = join "\t", @$line;
1201 push @return, \$tmp;
1202 }
1203
1204 # no consequence output from vep input
1205 elsif(defined($config->{no_consequences}) && $config->{format} eq 'vep') {
1206
1207 my $line = [split /\s+/, $vf->{_line}];
1208
1209 if($line->[13] eq '-') {
1210 $line->[13] = '';
1211 }
1212
1213 # get custom annotation
1214 if(defined($config->{custom})) {
1215 my $custom_annotation = get_custom_annotation($config, $vf);
1216 foreach my $key(keys %{$custom_annotation}) {
1217 $line->[13] .= ($line->[13] ? ';' : '').$key.'='.$custom_annotation->{$key};
1218 }
1219 }
1220
1221 my $tmp = join "\t", @$line;
1222 push @return, \$tmp;
1223 }
1224
1225 # normal output
1226 else {
1227 push @return, @{vf_to_consequences($config, $vf)};
1228 }
1229
1230 print PARENT "BUMP\n" if defined($config->{forked});
1231 }
1232
1233 end_progress($config) unless scalar @$listref == 1;
1234 }
1235
1236 return \@return;
1237 }
1238
1239 # takes a variation feature and returns ready to print consequence information
1240 sub vf_to_consequences {
1241 my $config = shift;
1242 my $vf = shift;
1243
1244 # use a different method for SVs
1245 return svf_to_consequences($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature');
1246
1247 my @return = ();
1248
1249 # method name for consequence terms
1250 my $term_method = $config->{terms}.'_term';
1251
1252 # find any co-located existing VFs
1253 $vf->{existing} ||= find_existing($config, $vf) if defined $config->{check_existing};
1254
1255 # skip based on frequency checks?
1256 if(defined($config->{check_frequency}) && defined($vf->{existing}) && $vf->{existing} ne '-' && defined($config->{va})) {
1257 return [] unless grep {$_} map {check_frequencies($config, $_)} reverse split(/\,/, $vf->{existing});
1258 $vf->{freqs} = $config->{filtered_freqs};
1259 }
1260
1261 # force empty hash into object's transcript_variations if undefined from whole_genome_fetch
1262 # this will stop the API trying to go off and fill it again
1263 $vf->{transcript_variations} ||= {} if defined $config->{whole_genome};
1264
1265 # regulatory stuff
1266 if(!defined $config->{coding_only} && defined $config->{regulatory}) {
1267
1268 for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) {
1269
1270 my $rf = $rfv->regulatory_feature;
1271
1272 my $base_line = {
1273 Feature_type => 'RegulatoryFeature',
1274 Feature => $rf->stable_id,
1275 };
1276
1277 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
1278 $base_line->{Extra}->{CELL_TYPE} = join ",",
1279 map {$_.':'.$rf->{cell_types}->{$_}}
1280 grep {$rf->{cell_types}->{$_}}
1281 @{$config->{cell_type}};
1282
1283 $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g;
1284 }
1285
1286 # this currently always returns 'RegulatoryFeature', so we ignore it for now
1287 #$base_line->{Extra}->{REG_FEAT_TYPE} = $rf->feature_type->name;
1288
1289 for my $rfva (@{ $rfv->get_all_alternate_RegulatoryFeatureVariationAlleles }) {
1290
1291 my $line = init_line($config, $vf, $base_line);
1292
1293 $line->{Allele} = $rfva->variation_feature_seq;
1294 $line->{Consequence} = join ',',
1295 map { $_->$term_method || $_->SO_term }
1296 @{ $rfva->get_all_OverlapConsequences };
1297
1298 $line = run_plugins($rfva, $line, $config);
1299
1300 push @return, $line;
1301 }
1302 }
1303
1304 for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) {
1305
1306 my $mf = $mfv->motif_feature;
1307
1308 # check that the motif has a binding matrix, if not there's not
1309 # much we can do so don't return anything
1310
1311 next unless defined $mf->binding_matrix;
1312
1313 my $matrix = $mf->binding_matrix->description.' '.$mf->display_label;
1314 $matrix =~ s/\s+/\_/g;
1315
1316 my $base_line = {
1317 Feature_type => 'MotifFeature',
1318 Feature => $mf->binding_matrix->name,
1319 Extra => {
1320 MOTIF_NAME => $matrix,
1321 }
1322 };
1323
1324 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
1325 $base_line->{Extra}->{CELL_TYPE} = join ",",
1326 map {$_.':'.$mf->{cell_types}->{$_}}
1327 grep {$mf->{cell_types}->{$_}}
1328 @{$config->{cell_type}};
1329
1330 $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g;
1331 }
1332
1333 for my $mfva (@{ $mfv->get_all_alternate_MotifFeatureVariationAlleles }) {
1334
1335 my $line = init_line($config, $vf, $base_line);
1336
1337 $line->{Extra}->{MOTIF_POS} = $mfva->motif_start if defined $mfva->motif_start;
1338 $line->{Extra}->{HIGH_INF_POS} = ($mfva->in_informative_position ? 'Y' : 'N');
1339
1340 my $delta = $mfva->motif_score_delta if $mfva->variation_feature_seq =~ /^[ACGT]+$/;
1341
1342 $line->{Extra}->{MOTIF_SCORE_CHANGE} = sprintf("%.3f", $delta) if defined $delta;
1343
1344 $line->{Allele} = $mfva->variation_feature_seq;
1345 $line->{Consequence} = join ',',
1346 map { $_->$term_method || $_->SO_term }
1347 @{ $mfva->get_all_OverlapConsequences };
1348
1349 $line = run_plugins($mfva, $line, $config);
1350
1351 push @return, $line;
1352 }
1353 }
1354 }
1355
1356 # get TVs
1357 my $tvs = $vf->get_all_TranscriptVariations;
1358
1359 # only most severe
1360 if(defined($config->{most_severe}) || defined($config->{summary})) {
1361
1362 my $line = init_line($config, $vf);
1363
1364 if(defined($config->{summary})) {
1365 $line->{Consequence} = join ",", @{$vf->consequence_type($config->{terms}) || $vf->consequence_type};
1366 }
1367 else {
1368 $line->{Consequence} = $vf->display_consequence($config->{terms}) || $vf->display_consequence;
1369 }
1370
1371 push @return, $line;
1372 }
1373
1374 # pass a true argument to get_IntergenicVariation to stop it doing a reference allele check
1375 # (to stay consistent with the rest of the VEP)
1376 elsif ((my $iv = $vf->get_IntergenicVariation(1)) && !defined($config->{no_intergenic})) {
1377
1378 for my $iva (@{ $iv->get_all_alternate_IntergenicVariationAlleles }) {
1379
1380 my $line = init_line($config, $vf);
1381
1382 $line->{Allele} = $iva->variation_feature_seq;
1383
1384 my $cons = $iva->get_all_OverlapConsequences->[0];
1385
1386 $line->{Consequence} = $cons->$term_method || $cons->SO_term;
1387
1388 $line = run_plugins($iva, $line, $config);
1389
1390 push @return, $line;
1391 }
1392 }
1393
1394 # user wants only one conseqeunce per gene
1395 elsif(defined($config->{per_gene})) {
1396
1397 # sort the TVA objects into a hash by gene
1398 my %by_gene;
1399
1400 foreach my $tv(@$tvs) {
1401 next if(defined $config->{coding_only} && !($tv->affects_cds));
1402
1403 my $gene = $tv->transcript->{_gene_stable_id} || $config->{ga}->fetch_by_transcript_stable_id($tv->transcript->stable_id)->stable_id;
1404
1405 push @{$by_gene{$gene}}, @{$tv->get_all_alternate_TranscriptVariationAlleles};
1406 }
1407
1408 foreach my $gene(keys %by_gene) {
1409 my ($lowest, $lowest_tva);
1410
1411 # at the moment this means the one that comes out last will be picked
1412 # if there is more than one TVA with the same rank of consequence
1413 foreach my $tva(@{$by_gene{$gene}}) {
1414 foreach my $oc(@{$tva->get_all_OverlapConsequences}) {
1415 if(!defined($lowest) || $oc->rank < $lowest) {
1416 $lowest = $oc->rank;
1417 $lowest_tva = $tva;
1418 }
1419 }
1420 }
1421
1422 push @return, tva_to_line($config, $lowest_tva);
1423 }
1424 }
1425
1426 else {
1427 foreach my $tv(@$tvs) {
1428 next if(defined $config->{coding_only} && !($tv->affects_cds));
1429
1430 push @return, map {tva_to_line($config, $_)} @{$tv->get_all_alternate_TranscriptVariationAlleles};
1431
1432 undef $tv->{$_} for keys %$tv;
1433 }
1434 }
1435
1436 return \@return;
1437 }
1438
1439 # get consequences for a structural variation feature
1440 sub svf_to_consequences {
1441 my $config = shift;
1442 my $svf = shift;
1443
1444 my @return = ();
1445
1446 my $term_method = $config->{terms}.'_term';
1447
1448 if(defined $config->{whole_genome}) {
1449 $svf->{transcript_structural_variations} ||= [];
1450 $svf->{regulation_structural_variations}->{$_} ||= [] for @REG_FEAT_TYPES;
1451 }
1452
1453 if ((my $iv = $svf->get_IntergenicStructuralVariation(1)) && !defined($config->{no_intergenic})) {
1454
1455 for my $iva (@{ $iv->get_all_alternate_IntergenicStructuralVariationAlleles }) {
1456
1457 my $line = init_line($config, $svf);
1458
1459 $line->{Allele} = '-';
1460
1461 my $cons = $iva->get_all_OverlapConsequences->[0];
1462
1463 $line->{Consequence} = $cons->$term_method || $cons->SO_term;
1464
1465 $line = run_plugins($iva, $line, $config);
1466
1467 push @return, $line;
1468 }
1469 }
1470
1471 foreach my $svo(@{$svf->get_all_StructuralVariationOverlaps}) {
1472
1473 next if $svo->isa('Bio::EnsEMBL::Variation::IntergenicStructuralVariation');
1474
1475 my $feature = $svo->feature;
1476
1477 # get feature type
1478 my $feature_type = (split '::', ref($feature))[-1];
1479
1480 my $base_line = {
1481 Feature_type => $feature_type,
1482 Feature => $feature->stable_id,
1483 Allele => $svf->class_SO_term,
1484 };
1485
1486 if($svo->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariation')) {
1487 $base_line->{cDNA_position} = format_coords($svo->cdna_start, $svo->cdna_end);
1488 $base_line->{CDS_position} = format_coords($svo->cds_start, $svo->cds_end);
1489 $base_line->{Protein_position} = format_coords($svo->translation_start, $svo->translation_end);
1490 }
1491
1492 foreach my $svoa(@{$svo->get_all_StructuralVariationOverlapAlleles}) {
1493 my $line = init_line($config, $svf, $base_line);
1494
1495 $line->{Consequence} = join ",",
1496 #map {s/feature/$feature_type/e; $_}
1497 map {$_->$term_method}
1498 sort {$a->rank <=> $b->rank}
1499 @{$svoa->get_all_OverlapConsequences};
1500
1501 # work out overlap amounts
1502 my $overlap_start = (sort {$a <=> $b} ($svf->start, $feature->start))[-1];
1503 my $overlap_end = (sort {$a <=> $b} ($svf->end, $feature->end))[0];
1504 my $overlap_length = ($overlap_end - $overlap_start) + 1;
1505 my $overlap_pc = 100 * ($overlap_length / (($feature->end - $feature->start) + 1));
1506
1507 $line->{Extra}->{OverlapBP} = $overlap_length if $overlap_length > 0;
1508 $line->{Extra}->{OverlapPC} = sprintf("%.2f", $overlap_pc) if $overlap_pc > 0;
1509
1510 add_extra_fields($config, $line, $svoa);
1511
1512 push @return, $line;
1513 }
1514 }
1515
1516 return \@return;
1517 }
1518
1519 # run all of the configured plugins on a VariationFeatureOverlapAllele instance
1520 # and store any results in the provided line hash
1521 sub run_plugins {
1522
1523 my ($bvfoa, $line_hash, $config) = @_;
1524
1525 my $skip_line = 0;
1526
1527 for my $plugin (@{ $config->{plugins} || [] }) {
1528
1529 # check that this plugin is interested in this type of variation feature
1530
1531 if ($plugin->check_variant_feature_type(ref $bvfoa->base_variation_feature)) {
1532
1533 # check that this plugin is interested in this type of feature
1534
1535 if ($plugin->check_feature_type(ref $bvfoa->feature || 'Intergenic')) {
1536
1537 eval {
1538 my $plugin_results = $plugin->run($bvfoa, $line_hash);
1539
1540 if (defined $plugin_results) {
1541 if (ref $plugin_results eq 'HASH') {
1542 for my $key (keys %$plugin_results) {
1543 $line_hash->{Extra}->{$key} = $plugin_results->{$key};
1544 }
1545 }
1546 else {
1547 warn "Plugin '".(ref $plugin)."' did not return a hashref, output ignored!\n";
1548 }
1549 }
1550 else {
1551 # if a plugin returns undef, that means it want to filter out this line
1552 $skip_line = 1;
1553 }
1554 };
1555 if ($@) {
1556 warn "Plugin '".(ref $plugin)."' went wrong: $@";
1557 }
1558
1559 # there's no point running any other plugins if we're filtering this line,
1560 # because the first plugin to skip the line wins, so we might as well last
1561 # out of the loop now and avoid any unnecessary computation
1562
1563 last if $skip_line;
1564 }
1565 }
1566 }
1567
1568 return $skip_line ? undef : $line_hash;
1569 }
1570
1571 # turn a TranscriptVariationAllele into a line hash
1572 sub tva_to_line {
1573 my $config = shift;
1574 my $tva = shift;
1575
1576 my $tv = $tva->transcript_variation;
1577 my $t = $tv->transcript;
1578
1579 # method name for consequence terms
1580 my $term_method = $config->{terms}.'_term';
1581
1582 my $base_line = {
1583 Feature_type => 'Transcript',
1584 Feature => (defined $t ? $t->stable_id : undef),
1585 cDNA_position => format_coords($tv->cdna_start, $tv->cdna_end),
1586 CDS_position => format_coords($tv->cds_start, $tv->cds_end),
1587 Protein_position => format_coords($tv->translation_start, $tv->translation_end),
1588 Allele => $tva->variation_feature_seq,
1589 Amino_acids => $tva->pep_allele_string,
1590 Codons => $tva->display_codon_allele_string,
1591 Consequence => join ",", map {$_->$term_method || $_->SO_term} sort {$a->rank <=> $b->rank} @{$tva->get_all_OverlapConsequences},
1592 };
1593
1594 my $line = init_line($config, $tva->variation_feature, $base_line);
1595
1596 # HGVS
1597 if(defined $config->{hgvs}) {
1598 my $hgvs_t = $tva->hgvs_transcript;
1599 my $hgvs_p = $tva->hgvs_protein;
1600
1601 $line->{Extra}->{HGVSc} = $hgvs_t if $hgvs_t;
1602 $line->{Extra}->{HGVSp} = $hgvs_p if $hgvs_p;
1603 }
1604
1605 foreach my $tool (qw(SIFT PolyPhen)) {
1606 my $lc_tool = lc($tool);
1607
1608 if (my $opt = $config->{$lc_tool}) {
1609 my $want_pred = $opt =~ /^p/i;
1610 my $want_score = $opt =~ /^s/i;
1611 my $want_both = $opt =~ /^b/i;
1612
1613 if ($want_both) {
1614 $want_pred = 1;
1615 $want_score = 1;
1616 }
1617
1618 next unless $want_pred || $want_score;
1619
1620 my $pred_meth = $lc_tool.'_prediction';
1621 my $score_meth = $lc_tool.'_score';
1622
1623 my $pred = $tva->$pred_meth;
1624
1625 if($pred) {
1626
1627 if ($want_pred) {
1628 $pred =~ s/\s+/\_/;
1629 $line->{Extra}->{$tool} = $pred;
1630 }
1631
1632 if ($want_score) {
1633 my $score = $tva->$score_meth;
1634
1635 if(defined $score) {
1636 if($want_pred) {
1637 $line->{Extra}->{$tool} .= "($score)";
1638 }
1639 else {
1640 $line->{Extra}->{$tool} = $score;
1641 }
1642 }
1643 }
1644 }
1645 }
1646 }
1647
1648 $line = add_extra_fields($config, $line, $tva);
1649
1650 return $line;
1651 }
1652
1653 sub add_extra_fields {
1654 my $config = shift;
1655 my $line = shift;
1656 my $bvfoa = shift;
1657
1658 # overlapping SVs
1659 if(defined $config->{check_svs} && defined $bvfoa->base_variation_feature->{overlapping_svs}) {
1660 $line->{Extra}->{SV} = $bvfoa->base_variation_feature->{overlapping_svs};
1661 }
1662
1663 # add transcript-specific fields
1664 $line = add_extra_fields_transcript($config, $line, $bvfoa) if $bvfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele');
1665
1666 # run plugins
1667 $line = run_plugins($bvfoa, $line, $config);
1668
1669 return $line;
1670 }
1671
1672 sub add_extra_fields_transcript {
1673 my $config = shift;
1674 my $line = shift;
1675 my $tva = shift;
1676
1677 my $tv = $tva->base_variation_feature_overlap;
1678 my $tr = $tva->transcript;
1679
1680 # get gene
1681 my $gene;
1682
1683 $line->{Gene} = $tr->{_gene_stable_id};
1684
1685 if(!defined($line->{Gene})) {
1686 $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id);
1687 $line->{Gene} = $gene ? $gene->stable_id : '-';
1688 }
1689
1690 # exon/intron numbers
1691 if ($config->{numbers}) {
1692 $line->{Extra}->{EXON} = $tv->exon_number if defined $tv->exon_number;
1693 $line->{Extra}->{INTRON} = $tv->intron_number if defined $tv->intron_number;
1694 }
1695
1696 if ($config->{domains}) {
1697 my $feats = $tv->get_overlapping_ProteinFeatures;
1698
1699 my @strings;
1700
1701 for my $feat (@$feats) {
1702 my $label = $feat->analysis->display_label.':'.$feat->hseqname;
1703
1704 # replace any special characters
1705 $label =~ s/[\s;=]/_/g;
1706
1707 push @strings, $label;
1708 }
1709
1710 $line->{Extra}->{DOMAINS} = join ',', @strings if @strings;
1711 }
1712
1713 # distance to transcript
1714 if($line->{Consequence} =~ /(up|down)stream/i) {
1715 $line->{Extra}->{DISTANCE} = $tv->distance_to_transcript;
1716 }
1717
1718 # HGNC
1719 if(defined $config->{hgnc}) {
1720 my $hgnc;
1721 $hgnc = $tr->{_gene_hgnc};
1722
1723 if(!defined($hgnc)) {
1724 if(!defined($gene)) {
1725 $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id);
1726 }
1727
1728 my @entries = grep {$_->database eq 'HGNC'} @{$gene->get_all_DBEntries()};
1729 if(scalar @entries) {
1730 $hgnc = $entries[0]->display_id;
1731 }
1732 }
1733
1734 $hgnc = undef if defined($hgnc) && $hgnc eq '-';
1735
1736 $line->{Extra}->{HGNC} = $hgnc if defined($hgnc);
1737 }
1738
1739 # CCDS
1740 if(defined($config->{ccds})) {
1741 my $ccds = $tr->{_ccds};
1742
1743 if(!defined($ccds)) {
1744 my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries};
1745 $ccds = $entries[0]->display_id if scalar @entries;
1746 }
1747
1748 $ccds = undef if defined($ccds) && $ccds eq '-';
1749
1750 $line->{Extra}->{CCDS} = $ccds if defined($ccds);
1751 }
1752
1753 # refseq xref
1754 if(defined($config->{xref_refseq})) {
1755 my $refseq = $tr->{_refseq};
1756
1757 if(!defined($refseq)) {
1758 my @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries};
1759 if(scalar @entries) {
1760 $refseq = join ",", map {$_->display_id."-".$_->database} @entries;
1761 }
1762 }
1763
1764 $refseq = undef if defined($refseq) && $refseq eq '-';
1765
1766 $line->{Extra}->{RefSeq} = $refseq if defined($refseq);
1767 }
1768
1769 # protein ID
1770 if(defined $config->{protein}) {
1771 my $protein = $tr->{_protein};
1772
1773 if(!defined($protein)) {
1774 $protein = $tr->translation->stable_id if defined($tr->translation);
1775 }
1776
1777 $protein = undef if defined($protein) && $protein eq '-';
1778
1779 $line->{Extra}->{ENSP} = $protein if defined($protein);
1780 }
1781
1782 # canonical transcript
1783 if(defined $config->{canonical}) {
1784 $line->{Extra}->{CANONICAL} = 'YES' if $tr->is_canonical;
1785 }
1786
1787 return $line;
1788 }
1789
1790 # initialize a line hash
1791 sub init_line {
1792 my $config = shift;
1793 my $vf = shift;
1794 my $base_line = shift;
1795
1796 my $line = {
1797 Uploaded_variation => $vf->variation_name,
1798 Location => ($vf->{chr} || $vf->seq_region_name).':'.format_coords($vf->start, $vf->end),
1799 Existing_variation => $vf->{existing},
1800 Extra => {},
1801 };
1802
1803 # add custom info
1804 if(defined($config->{custom}) && scalar @{$config->{custom}}) {
1805 # merge the custom hash with the extra hash
1806 my $custom = get_custom_annotation($config, $vf);
1807
1808 for my $key (keys %$custom) {
1809 $line->{Extra}->{$key} = $custom->{$key};
1810 }
1811 }
1812
1813 # individual?
1814 $line->{Extra}->{IND} = $vf->{individual} if defined($vf->{individual});
1815
1816 # frequencies?
1817 $line->{Extra}->{FREQS} = join ",", @{$vf->{freqs}} if defined($vf->{freqs});
1818
1819 # gmaf?
1820 $line->{Extra}->{GMAF} = $vf->{gmaf} if defined($config->{gmaf}) && defined($vf->{gmaf});
1821
1822 # copy entries from base_line
1823 if(defined($base_line)) {
1824 $line->{$_} = $base_line->{$_} for keys %$base_line;
1825 }
1826
1827 return $line;
1828 }
1829
1830
1831 # get custom annotation for a single VF
1832 sub get_custom_annotation {
1833 my $config = shift;
1834 my $vf = shift;
1835 my $cache = shift;
1836
1837 return $vf->{custom} if defined($vf->{custom});
1838
1839 my $annotation = {};
1840
1841 my $chr = $vf->{chr};
1842
1843 if(!defined($cache)) {
1844 # spoof regions
1845 my $regions;
1846 $regions->{$chr} = [$vf->{start}.'-'.$vf->{end}];
1847 $cache = cache_custom_annotation($config, $regions, $chr);
1848 }
1849
1850 foreach my $custom(@{$config->{custom}}) {
1851 next unless defined($cache->{$chr}->{$custom->{name}});
1852
1853 # exact type must match coords of variant exactly
1854 if($custom->{type} eq 'exact') {
1855 foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$vf->{start}}}) {
1856
1857 next unless
1858 $feature->{chr} eq $chr &&
1859 $feature->{start} eq $vf->{start} &&
1860 $feature->{end} eq $vf->{end};
1861
1862 $annotation->{$custom->{name}} .= $feature->{name}.',';
1863 }
1864 }
1865
1866 # overlap type only needs to overlap, but we need to search the whole range
1867 elsif($custom->{type} eq 'overlap') {
1868 foreach my $pos(keys %{$cache->{$chr}->{$custom->{name}}}) {
1869 foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$pos}}) {
1870
1871 next unless
1872 $feature->{chr} eq $chr &&
1873 $feature->{end} >= $vf->{start} &&
1874 $feature->{start} <= $vf->{end};
1875
1876 $annotation->{$custom->{name}} .= $feature->{name}.',';
1877 }
1878 }
1879 }
1880
1881 # trim off trailing commas
1882 $annotation->{$custom->{name}} =~ s/\,$//g if defined($annotation->{$custom->{name}});
1883 }
1884
1885 return $annotation;
1886 }
1887
1888 # decides whether to print a VF based on user defined consequences
1889 sub filter_by_consequence {
1890 my $config = shift;
1891 my $vf = shift;
1892 my $filters = $config->{filter};
1893
1894 # find it if we only have "no"s
1895 my $only_nos = 0;
1896 $only_nos = 1 if (sort {$a <=> $b} values %$filters)[-1] == 0;
1897
1898 my ($yes, $no) = (0, 0);
1899
1900 # get all consequences across all term types
1901 my @types = ('SO', 'display');
1902
1903 my @cons;
1904 push @cons, @{$vf->consequence_type($_)} for @types;
1905
1906 # add regulatory consequences
1907 if(defined($config->{regulatory})) {
1908 foreach my $term_type(@types) {
1909 my $term_method = $term_type.'_term';
1910
1911 for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) {
1912 for my $rfva(@{$rfv->get_all_alternate_RegulatoryFeatureVariationAlleles}) {
1913 push @cons, map {$_->$term_method} @{ $rfva->get_all_OverlapConsequences };
1914 }
1915 }
1916 for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) {
1917 for my $mfva(@{$mfv->get_all_alternate_MotifFeatureVariationAlleles}) {
1918 push @cons, map {$_->$term_method} @{ $mfva->get_all_OverlapConsequences };
1919 }
1920 }
1921 }
1922 }
1923
1924 foreach my $con(grep {defined($_) && defined($filters->{$_})} @cons) {
1925 if($filters->{$con} == 1) {
1926 $yes = 1;
1927 }
1928 else {
1929 $no = 1;
1930 }
1931 }
1932
1933 # check special case, coding
1934 if(defined($filters->{coding})) {
1935 if(grep {$_->affects_cds} @{$vf->get_all_TranscriptVariations}) {
1936 if($filters->{coding} == 1) {
1937 $yes = 1;
1938 }
1939 else {
1940 $no = 1;
1941 }
1942 }
1943 }
1944
1945 my $ok = 0;
1946 if($only_nos) {
1947 $ok = 1 if !$no;
1948 }
1949 else {
1950 $ok = 1 if $yes && !$no;
1951 }
1952
1953 return $ok;
1954 }
1955
1956
1957 # takes VFs created from input, fixes and checks various things
1958 sub validate_vf {
1959 my $config = shift;
1960 my $vf = shift;
1961
1962 # user specified chr skip list
1963 return 0 if defined($config->{chr}) && !$config->{chr}->{$vf->{chr}};
1964
1965 # fix inputs
1966 $vf->{chr} =~ s/chr//ig unless $vf->{chr} =~ /^chromosome$/i;
1967 $vf->{chr} = 'MT' if $vf->{chr} eq 'M';
1968 $vf->{strand} ||= 1;
1969 $vf->{strand} = ($vf->{strand} =~ /\-/ ? "-1" : "1");
1970
1971 # sanity checks
1972 unless($vf->{start} =~ /^\d+$/ && $vf->{end} =~ /^\d+$/) {
1973 warn("WARNING: Start ".$vf->{start}." or end ".$vf->{end}." coordinate invalid on line ".$config->{line_number}."\n") unless defined $config->{quiet};
1974 return 0;
1975 }
1976
1977 # structural variation?
1978 return validate_svf($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature');
1979
1980 # uppercase allele string
1981 $vf->{allele_string} =~ tr/[a-z]/[A-Z]/;
1982
1983 unless($vf->{allele_string} =~ /([ACGT-]+\/*)+/) {
1984 warn("WARNING: Invalid allele string ".$vf->{allele_string}." on line ".$config->{line_number}." or possible parsing error\n") unless defined $config->{quiet};
1985 return 0;
1986 }
1987
1988 # insertion should have start = end + 1
1989 if($vf->{allele_string} =~ /^\-\// && $vf->{start} != $vf->{end} + 1) {
1990 warn(
1991 "WARNING: Alleles look like an insertion (".
1992 $vf->{allele_string}.
1993 ") but coordinates are not start = end + 1 (START=".
1994 $vf->{start}.", END=".$vf->{end}.
1995 ") on line ".$config->{line_number}."\n"
1996 ) unless defined($config->{quiet});
1997 return 0;
1998 }
1999
2000 # check length of reference matches seq length spanned
2001 my @alleles = split /\//, $vf->{allele_string};
2002 my $ref_allele = shift @alleles;
2003 my $tmp_ref_allele = $ref_allele;
2004 $tmp_ref_allele =~ s/\-//g;
2005
2006 #if(($vf->{end} - $vf->{start}) + 1 != length($tmp_ref_allele)) {
2007 # warn(
2008 # "WARNING: Length of reference allele (".$ref_allele.
2009 # " length ".length($tmp_ref_allele).") does not match co-ordinates ".$vf->{start}."-".$vf->{end}.
2010 # " on line ".$config->{line_number}
2011 # ) unless defined($config->{quiet});
2012 # return 0;
2013 #}
2014
2015 # flag as unbalanced
2016 foreach my $allele(@alleles) {
2017 $allele =~ s/\-//g;
2018 $vf->{indel} = 1 unless length($allele) == length($tmp_ref_allele);
2019 }
2020
2021 # check reference allele if requested
2022 if(defined $config->{check_ref}) {
2023 my $ok = 0;
2024 my $slice_ref_allele;
2025
2026 # insertion, therefore no ref allele to check
2027 if($ref_allele eq '-') {
2028 $ok = 1;
2029 }
2030 else {
2031 my $slice_ref = $vf->{slice}->sub_Slice($vf->{start}, $vf->{end}, $vf->{strand});
2032
2033 if(!defined($slice_ref)) {
2034 warn "WARNING: Could not fetch sub-slice from ".$vf->{start}."\-".$vf->{end}."\(".$vf->{strand}."\) on line ".$config->{line_number} unless defined $config->{quiet};
2035 }
2036
2037 else {
2038 $slice_ref_allele = $slice_ref->seq;
2039 $ok = ($slice_ref_allele eq $ref_allele ? 1 : 0);
2040 }
2041 }
2042
2043 if(!$ok) {
2044 warn
2045 "WARNING: Specified reference allele $ref_allele ",
2046 "does not match Ensembl reference allele",
2047 ($slice_ref_allele ? " $slice_ref_allele" : ""),
2048 " on line ".$config->{line_number} unless defined $config->{quiet};
2049 return 0;
2050 }
2051 }
2052
2053 return 1;
2054 }
2055
2056
2057 # validate a structural variation
2058 sub validate_svf {
2059 my $config = shift;
2060 my $svf = shift;
2061
2062 return 1;
2063 }
2064
2065
2066 # takes a hash of VFs and fetches consequences by pre-fetching overlapping transcripts
2067 # from database and/or cache
2068 sub whole_genome_fetch {
2069 my $config = shift;
2070 my $chr = shift;
2071 my $vf_hash = shift;
2072
2073 my (%vf_done, @finished_vfs, %seen_rfs);
2074
2075 if(defined($config->{offline}) && !-e $config->{dir}.'/'.$chr) {
2076 debug("No cache found for chromsome $chr") unless defined($config->{quiet});
2077
2078 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2079 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
2080 push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}};
2081 }
2082 }
2083
2084 return \@finished_vfs;
2085 }
2086
2087 my $slice_cache = $config->{slice_cache};
2088 build_slice_cache($config, $config->{tr_cache}) unless defined($slice_cache->{$chr});
2089
2090 debug("Analyzing chromosome $chr") unless defined($config->{quiet});
2091
2092 # custom annotations
2093 whole_genome_fetch_custom($config, $vf_hash, $chr) if defined($config->{custom});
2094
2095 # split up normal variations from SVs
2096 my ($tmp_vf_hash, @svfs);
2097
2098 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2099 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
2100 foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) {
2101 if($vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) {
2102 push @svfs, $vf;
2103 }
2104 else {
2105 push @{$tmp_vf_hash->{$chr}{$chunk}{$pos}}, $vf;
2106 }
2107 }
2108 }
2109 }
2110
2111 $vf_hash = $tmp_vf_hash;
2112
2113 # transcript annotations
2114 whole_genome_fetch_transcript($config, $vf_hash, $chr)
2115 unless defined($config->{no_consequences});
2116
2117 # regulatory annotations
2118 whole_genome_fetch_reg($config, $vf_hash, $chr)
2119 if defined($config->{regulatory})
2120 && !defined($config->{no_consequences});
2121
2122 # structural variations
2123 @finished_vfs = @{whole_genome_fetch_sv($config, \@svfs, $chr)}
2124 if scalar @svfs;
2125
2126 # sort results into @finished_vfs array
2127 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2128 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
2129
2130 # pinch slice from slice cache if we don't already have it
2131 $_->{slice} ||= $slice_cache->{$chr} for @{$vf_hash->{$chr}{$chunk}{$pos}};
2132
2133 if(defined($config->{regulatory})) {
2134 foreach my $type(@REG_FEAT_TYPES) {
2135 $_->{regulation_variations}->{$type} ||= [] for @{$vf_hash->{$chr}{$chunk}{$pos}};
2136 }
2137 }
2138
2139 if(defined($config->{custom})) {
2140 $_->{custom} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}};
2141 }
2142
2143 $_->{transcript_variations} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}};
2144
2145 # add to final array
2146 push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}};
2147 }
2148 }
2149
2150 # sort
2151 @finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @finished_vfs;
2152
2153 # clean hash
2154 delete $vf_hash->{$chr};
2155
2156 return \@finished_vfs;
2157 }
2158
2159 sub whole_genome_fetch_custom {
2160 my $config = shift;
2161 my $vf_hash = shift;
2162 my $chr = shift;
2163
2164 return unless scalar @{$config->{custom}};
2165
2166 # create regions based on VFs instead of chunks
2167 my $tmp_regions;
2168
2169 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2170 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
2171 foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) {
2172 push @{$tmp_regions->{$chr}}, ($vf->{start}-1).'-'.($vf->{end}+1);
2173 }
2174 }
2175 }
2176
2177 return unless defined($tmp_regions->{$chr});
2178
2179 # cache annotations
2180 my $annotation_cache = cache_custom_annotation($config, $tmp_regions, $chr);
2181
2182 # count and report
2183 my $total_annotations = 0;
2184 $total_annotations += scalar keys %{$annotation_cache->{$chr}->{$_}} for keys %{$annotation_cache->{$chr}};
2185 debug("Retrieved $total_annotations custom annotations (", (join ", ", map {(scalar keys %{$annotation_cache->{$chr}->{$_}}).' '.$_} keys %{$annotation_cache->{$chr}}), ")");
2186
2187 # compare annotations to variations in hash
2188 debug("Analyzing custom annotations") unless defined($config->{quiet});
2189 my $total = scalar keys %{$vf_hash->{$chr}};
2190 my $i = 0;
2191
2192 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2193 progress($config, $i++, $total);
2194
2195 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
2196 foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) {
2197
2198 $vf->{custom} = get_custom_annotation($config, $vf, $annotation_cache);
2199 }
2200 }
2201 }
2202
2203 end_progress($config);
2204 }
2205
2206 sub whole_genome_fetch_transcript {
2207 my $config = shift;
2208 my $vf_hash = shift;
2209 my $chr = shift;
2210
2211 my $tr_cache = $config->{tr_cache};
2212 my $slice_cache = $config->{slice_cache};
2213
2214 my $up_down_size = MAX_DISTANCE_FROM_TRANSCRIPT;
2215
2216 # check we have defined regions
2217 return unless defined($vf_hash->{$chr}) && defined($tr_cache->{$chr});
2218
2219 # copy slice from transcript to slice cache
2220 $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr});
2221
2222 debug("Analyzing variants") unless defined($config->{quiet});
2223
2224 my $tr_counter = 0;
2225 my $tr_count = scalar @{$tr_cache->{$chr}};
2226
2227 while($tr_counter < $tr_count) {
2228
2229 progress($config, $tr_counter, $tr_count);
2230 print PARENT "BUMP\n" if defined($config->{forked});
2231
2232 my $tr = $tr_cache->{$chr}->[$tr_counter++];
2233
2234 # do each overlapping VF
2235 my $s = $tr->start - $up_down_size;
2236 my $e = $tr->end + $up_down_size;
2237
2238 # get the chunks this transcript overlaps
2239 my %chunks;
2240 $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size}));
2241 map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks;
2242
2243 # pointer to previous VF
2244 # used to tell plugins this is the last variant analysed in this transcript
2245 my $previous_vf;
2246
2247 foreach my $chunk(keys %chunks) {
2248 foreach my $vf(
2249 grep {$_->{start} <= $e && $_->{end} >= $s}
2250 map {@{$vf_hash->{$chr}{$chunk}{$_}}}
2251 keys %{$vf_hash->{$chr}{$chunk}}
2252 ) {
2253 # pinch slice from slice cache if we don't already have it
2254 $vf->{slice} ||= $slice_cache->{$chr};
2255
2256 my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new(
2257 -transcript => $tr,
2258 -variation_feature => $vf,
2259 -adaptor => $config->{tva},
2260 -no_ref_check => 1,
2261 -no_transfer => 1
2262 );
2263
2264 # prefetching stuff here prevents doing loads at the
2265 # end and makes progress reporting more useful
2266 $tv->_prefetch_for_vep;
2267
2268 $vf->add_TranscriptVariation($tv);
2269
2270 # cache VF on the transcript if it is an unbalanced sub
2271 push @{$tr->{indels}}, $vf if defined($vf->{indel});
2272
2273 if(defined($config->{individual})) {
2274
2275 # store VF on transcript, weaken reference to avoid circularity
2276 push @{$tr->{vfs}}, $vf;
2277 weaken($tr->{vfs}->[-1]);
2278
2279 delete $previous_vf->{last_in_transcript}->{$tr->stable_id};
2280 $vf->{last_in_transcript}->{$tr->stable_id} = 1;
2281 }
2282
2283 $previous_vf = $vf;
2284 }
2285 }
2286 }
2287
2288 end_progress($config);
2289 }
2290
2291 sub whole_genome_fetch_reg {
2292 my $config = shift;
2293 my $vf_hash = shift;
2294 my $chr = shift;
2295
2296 my $rf_cache = $config->{rf_cache};
2297 my $slice_cache = $config->{slice_cache};
2298
2299 foreach my $type(keys %{$rf_cache->{$chr}}) {
2300 debug("Analyzing ".$type."s") unless defined($config->{quiet});
2301
2302 my $constructor = 'Bio::EnsEMBL::Variation::'.$type.'Variation';
2303
2304 my $rf_counter = 0;
2305 my $rf_count = scalar @{$rf_cache->{$chr}->{$type}};
2306
2307 while($rf_counter < $rf_count) {
2308
2309 progress($config, $rf_counter, $rf_count);
2310 print PARENT "BUMP\n" if defined($config->{forked});
2311
2312 my $rf = $rf_cache->{$chr}->{$type}->[$rf_counter++];
2313
2314 # do each overlapping VF
2315 my $s = $rf->{start};
2316 my $e = $rf->{end};
2317
2318 # get the chunks this transcript overlaps
2319 my %chunks;
2320 $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size}));
2321 map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks;
2322
2323 foreach my $chunk(keys %chunks) {
2324 foreach my $vf(
2325 grep {$_->{start} <= $e && $_->{end} >= $s}
2326 map {@{$vf_hash->{$chr}{$chunk}{$_}}}
2327 keys %{$vf_hash->{$chr}{$chunk}}
2328 ) {
2329 push @{$vf->{regulation_variations}->{$type}}, $constructor->new(
2330 -variation_feature => $vf,
2331 -feature => $rf,
2332 -no_ref_check => 1,
2333 -no_transfer => 1
2334 );
2335 }
2336 }
2337 }
2338
2339 end_progress($config);
2340 }
2341 }
2342
2343 sub whole_genome_fetch_sv {
2344 my $config = shift;
2345 my $svfs = shift;
2346 my $chr = shift;
2347
2348 my $tr_cache = $config->{tr_cache};
2349 my $rf_cache = $config->{rf_cache};
2350 my $slice_cache = $config->{slice_cache};
2351
2352 debug("Analyzing structural variations") unless defined($config->{quiet});
2353
2354 my($i, $total) = (0, scalar @$svfs);
2355
2356 my @finished_vfs;
2357
2358 foreach my $svf(@$svfs) {
2359 progress($config, $i++, $total);
2360
2361 my %done_genes = ();
2362
2363 if(defined($tr_cache->{$chr})) {
2364 foreach my $tr(grep {overlap($_->{start} - MAX_DISTANCE_FROM_TRANSCRIPT, $_->{end} + MAX_DISTANCE_FROM_TRANSCRIPT, $svf->{start}, $svf->{end})} @{$tr_cache->{$chr}}) {
2365 my $svo = Bio::EnsEMBL::Variation::TranscriptStructuralVariation->new(
2366 -transcript => $tr,
2367 -structural_variation_feature => $svf,
2368 -no_transfer => 1
2369 );
2370
2371 $svf->add_TranscriptStructuralVariation($svo);
2372 }
2373 }
2374
2375 $svf->{transcript_structural_variations} ||= {};
2376
2377 # do regulatory features
2378 if(defined($config->{regulatory}) && defined($rf_cache->{$chr})) {
2379 foreach my $rf_type(qw/RegulatoryFeature/) {#keys %{$rf_cache->{$chr}}) {
2380 foreach my $rf(grep {$_->{start} <= $svf->{end} && $_->end >= $svf->{end}} @{$rf_cache->{$chr}->{$rf_type}}) {
2381 my $svo = Bio::EnsEMBL::Variation::StructuralVariationOverlap->new(
2382 -feature => $rf,
2383 -structural_variation_feature => $svf,
2384 -no_transfer => 1
2385 );
2386
2387 push @{$svf->{regulation_structural_variations}->{$rf_type}}, $svo;
2388 }
2389
2390 $svf->{regulation_structural_variations}->{$rf_type} ||= [];
2391 }
2392 }
2393
2394 # sort them
2395 #$svf->_sort_svos;
2396 push @finished_vfs, $svf;
2397 }
2398
2399 end_progress($config);
2400
2401 return \@finished_vfs;
2402 }
2403
2404 # retrieves transcripts given region list
2405 sub fetch_transcripts {
2406 my $config = shift;
2407 my $regions = shift;
2408 my $trim_regions = shift;
2409
2410 my $tr_cache = $config->{tr_cache};
2411 my $slice_cache = $config->{slice_cache};
2412
2413 my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0);
2414
2415 my %seen_trs;
2416
2417 $count_from_mem = 0;
2418 my $region_count = 0;
2419 foreach my $chr(keys %{$regions}) {
2420 $count_from_mem += scalar @{$tr_cache->{$chr}} if defined($tr_cache->{$chr}) && ref($tr_cache->{$chr}) eq 'ARRAY';
2421 $region_count += scalar @{$regions->{$chr}};
2422 }
2423
2424 my $counter;
2425
2426 debug("Reading transcript data from cache and/or database") unless defined($config->{quiet});
2427
2428 foreach my $chr(keys %{$regions}) {
2429 foreach my $region(sort {(split /\-/, $a)[0] <=> (split /\-/, $b)[1]} @{$regions->{$chr}}) {
2430 progress($config, $counter++, $region_count);
2431
2432 # skip regions beyond the end of the chr
2433 next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length;
2434
2435 next if defined($config->{loaded_tr}->{$chr}->{$region});
2436
2437 # force quiet so other methods don't mess up the progress bar
2438 my $quiet = $config->{quiet};
2439 $config->{quiet} = 1;
2440
2441 # try and load cache from disk if using cache
2442 my $tmp_cache;
2443 if(defined($config->{cache})) {
2444 #$tmp_cache = (
2445 # defined($config->{tabix}) ?
2446 # load_dumped_transcript_cache_tabix($config, $chr, $region, $trim_regions) :
2447 # load_dumped_transcript_cache($config, $chr, $region)
2448 #);
2449 $tmp_cache = load_dumped_transcript_cache($config, $chr, $region);
2450 $count_from_cache += scalar @{$tmp_cache->{$chr}} if defined($tmp_cache->{$chr});
2451 $config->{loaded_tr}->{$chr}->{$region} = 1;
2452 }
2453
2454 # no cache found on disk or not using cache
2455 if(!defined($tmp_cache->{$chr})) {
2456
2457 if(defined($config->{offline})) {
2458 # restore quiet status
2459 $config->{quiet} = $quiet;
2460
2461 debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet});
2462 next;
2463 }
2464
2465 # spoof temporary region hash
2466 my $tmp_hash;
2467 push @{$tmp_hash->{$chr}}, $region;
2468
2469 $tmp_cache = cache_transcripts($config, $tmp_hash);
2470
2471 # make it an empty arrayref that gets cached
2472 # so we don't get confused and reload next time round
2473 $tmp_cache->{$chr} ||= [];
2474
2475 $count_from_db += scalar @{$tmp_cache->{$chr}};
2476
2477 # dump to disk if writing to cache
2478 (defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $region) : dump_transcript_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache});
2479
2480 $config->{loaded_tr}->{$chr}->{$region} = 1;
2481 }
2482
2483 # add loaded transcripts to main cache
2484 if(defined($tmp_cache->{$chr})) {
2485 while(my $tr = shift @{$tmp_cache->{$chr}}) {
2486
2487 # track already added transcripts by dbID
2488 my $dbID = $tr->dbID;
2489 if($seen_trs{$dbID}) {
2490 $count_duplicates++;
2491 next;
2492 }
2493
2494 # trim out?
2495 #if(defined($trim_regions) && defined($trim_regions->{$chr})) {
2496 # my $tmp_count = scalar grep {
2497 # overlap(
2498 # (split /\-/, $_)[0], (split /\-/, $_)[1],
2499 # $tr->{start}, $tr->{end}
2500 # )
2501 # } @{$trim_regions->{$chr}};
2502 #
2503 # if(!$tmp_count) {
2504 # $count_trimmed++;
2505 # next;
2506 # }
2507 #}
2508
2509 $seen_trs{$dbID} = 1;
2510
2511 push @{$tr_cache->{$chr}}, $tr;
2512 }
2513 }
2514
2515 $tr_cache->{$chr} ||= [];
2516
2517 undef $tmp_cache;
2518
2519 # restore quiet status
2520 $config->{quiet} = $quiet;
2521
2522 # build slice cache
2523 $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr});
2524 }
2525 }
2526
2527 end_progress($config);
2528
2529 my $tr_count = 0;
2530 $tr_count += scalar @{$tr_cache->{$_}} for keys %$tr_cache;
2531
2532 debug("Retrieved $tr_count transcripts ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet});
2533
2534 return $tr_count;
2535 }
2536
2537 sub fetch_regfeats {
2538 my $config = shift;
2539 my $regions = shift;
2540 my $trim_regions = shift;
2541
2542 my $rf_cache = $config->{rf_cache};
2543 my $slice_cache = $config->{slice_cache};
2544
2545 my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0);
2546
2547 my %seen_rfs;
2548
2549 $count_from_mem = 0;
2550 my $region_count = 0;
2551
2552 foreach my $chr(keys %$regions) {
2553 if(defined($rf_cache->{$chr}) && ref($rf_cache->{$chr}) eq 'HASH') {
2554 $count_from_mem += scalar @{$rf_cache->{$chr}->{$_}} for keys %{$rf_cache->{$chr}};
2555 }
2556 $region_count += scalar @{$regions->{$chr}};
2557 }
2558
2559 my $counter = 0;
2560
2561 debug("Reading regulatory data from cache and/or database") unless defined($config->{quiet});
2562
2563 foreach my $chr(keys %$regions) {
2564 foreach my $region(sort {(split /\-/, $a)[0] cmp (split /\-/, $b)[1]} @{$regions->{$chr}}) {
2565 progress($config, $counter++, $region_count);
2566
2567 next if defined($config->{loaded_rf}->{$chr}->{$region});
2568
2569 # skip regions beyond the end of the chr
2570 next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length;
2571
2572 # force quiet so other methods don't mess up the progress bar
2573 my $quiet = $config->{quiet};
2574 $config->{quiet} = 1;
2575
2576 # try and load cache from disk if using cache
2577 my $tmp_cache;
2578 if(defined($config->{cache})) {
2579 $tmp_cache = load_dumped_reg_feat_cache($config, $chr, $region);
2580
2581 #$tmp_cache =
2582 # defined($config->{tabix}) ?
2583 # load_dumped_reg_feat_cache_tabix($config, $chr, $region, $trim_regions) :
2584 # load_dumped_reg_feat_cache($config, $chr, $region);
2585
2586
2587 if(defined($tmp_cache->{$chr})) {
2588 $count_from_cache += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}};
2589 }
2590
2591 # flag as loaded
2592 $config->{loaded_rf}->{$chr}->{$region} = 1;
2593 }
2594
2595 # no cache found on disk or not using cache
2596 if(!defined($tmp_cache->{$chr})) {
2597
2598 if(defined($config->{offline})) {
2599
2600 # restore quiet status
2601 $config->{quiet} = $quiet;
2602
2603 debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet});
2604 next;
2605 }
2606
2607 # spoof temporary region hash
2608 my $tmp_hash;
2609 push @{$tmp_hash->{$chr}}, $region;
2610
2611 $tmp_cache = cache_reg_feats($config, $tmp_hash);
2612
2613 # make it an empty arrayref that gets cached
2614 # so we don't get confused and reload next time round
2615 $tmp_cache->{$chr} ||= {};
2616
2617 $count_from_db += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}};
2618
2619 # dump to disk if writing to cache
2620 #dump_reg_feat_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache});
2621 (defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $tmp_cache, $chr, $region) : dump_reg_feat_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache});
2622
2623 # restore deleted coord_system adaptor
2624 foreach my $type(keys %{$tmp_cache->{$chr}}) {
2625 $_->{slice}->{coord_system}->{adaptor} = $config->{csa} for @{$tmp_cache->{$chr}->{$type}};
2626 }
2627
2628 # flag as loaded
2629 $config->{loaded_rf}->{$chr}->{$region} = 1;
2630 }
2631
2632 # add loaded reg_feats to main cache
2633 if(defined($tmp_cache->{$chr})) {
2634 foreach my $type(keys %{$tmp_cache->{$chr}}) {
2635 while(my $rf = shift @{$tmp_cache->{$chr}->{$type}}) {
2636
2637 # filter on cell type
2638 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
2639 next unless grep {$rf->{cell_types}->{$_}} @{$config->{cell_type}};
2640 }
2641
2642 # trim out?
2643 #if(defined($trim_regions) && defined($trim_regions->{$chr})) {
2644 # my $tmp_count = scalar grep {
2645 # overlap(
2646 # (split /\-/, $_)[0], (split /\-/, $_)[1],
2647 # $rf->{start}, $rf->{end}
2648 # )
2649 # } @{$trim_regions->{$chr}};
2650 #
2651 # if(!$tmp_count) {
2652 # $count_trimmed++;
2653 # next;
2654 # }
2655 #}
2656
2657 # track already added reg_feats by dbID
2658 my $dbID = $rf->{dbID};
2659 if($seen_rfs{$dbID}) {
2660 $count_duplicates++;
2661 next;
2662 }
2663 $seen_rfs{$dbID} = 1;
2664
2665 push @{$rf_cache->{$chr}->{$type}}, $rf;
2666 }
2667 }
2668 }
2669
2670 undef $tmp_cache;
2671
2672 # restore quiet status
2673 $config->{quiet} = $quiet;
2674 }
2675 }
2676
2677 end_progress($config);
2678
2679 my $rf_count = 0;
2680
2681 foreach my $chr(keys %$rf_cache) {
2682 foreach my $type(keys %{$rf_cache->{$chr}}) {
2683 $rf_count += scalar @{$rf_cache->{$chr}->{$type}};
2684 }
2685 }
2686
2687 debug("Retrieved $rf_count regulatory features ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet});
2688
2689 return $rf_count;
2690 }
2691
2692 # gets existing VFs for a vf_hash
2693 sub check_existing_hash {
2694 my $config = shift;
2695 my $vf_hash = shift;
2696 my $variation_cache;
2697
2698 # we only care about non-SVs here
2699 my %new_hash;
2700
2701 foreach my $chr(keys %{$vf_hash}) {
2702 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2703 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
2704 foreach my $var(grep {$_->isa('Bio::EnsEMBL::Variation::VariationFeature')} @{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
2705 push @{$new_hash{$chr}->{$chunk}->{$pos}}, $var;
2706 }
2707 }
2708 }
2709 }
2710
2711 $vf_hash = \%new_hash;
2712
2713 debug("Checking for existing variations") unless defined($config->{quiet});
2714
2715 my ($chunk_count, $counter);
2716 $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash};
2717
2718 foreach my $chr(keys %{$vf_hash}) {
2719
2720 my %loaded_regions;
2721
2722 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2723 progress($config, $counter++, $chunk_count);
2724
2725 # get the VFs for this chunk
2726 my ($start, $end);
2727
2728 # work out start and end using chunk_size
2729 $start = $config->{chunk_size} * $chunk;
2730 $end = $config->{chunk_size} * ($chunk + 1);
2731
2732 # using cache?
2733 if(defined($config->{cache})) {
2734 my $tmp_regions;
2735 push @{$tmp_regions->{$chr}}, $start.'-'.$end;
2736
2737 my $converted_regions = convert_regions($config, $tmp_regions);
2738
2739 foreach my $region(@{$converted_regions->{$chr}}) {
2740
2741 unless($loaded_regions{$region}) {
2742 my $tmp_cache = load_dumped_variation_cache($config, $chr, $region);
2743
2744 # load from DB if not found in cache
2745 if(!defined($tmp_cache->{$chr})) {
2746 if(defined($config->{offline})) {
2747 debug("WARNING: Could not find variation cache for $chr\:$region") unless defined($config->{quiet});
2748 next;
2749 }
2750
2751 $tmp_cache->{$chr} = get_variations_in_region($config, $chr, $region);
2752 dump_variation_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache});
2753 }
2754
2755 # merge tmp_cache with the main cache
2756 foreach my $key(keys %{$tmp_cache->{$chr}}) {
2757 $variation_cache->{$chr}->{$key} = $tmp_cache->{$chr}->{$key};
2758 delete $tmp_cache->{$chr}->{$key};
2759 }
2760
2761 # clear memory
2762 undef $tmp_cache;
2763
2764 # record this region as fetched
2765 $loaded_regions{$region} = 1;
2766 }
2767 }
2768 }
2769
2770 # no cache, get all variations in region from DB
2771 else {
2772
2773 my ($min, $max);
2774
2775 # we can fetch smaller region when using DB
2776 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
2777 foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
2778 foreach my $coord(qw(start end)) {
2779 $min = $var->{$coord} if !defined($min) || $var->{$coord} < $min;
2780 $max = $var->{$coord} if !defined($max) || $var->{$coord} > $max;
2781 }
2782 }
2783 }
2784
2785 $variation_cache->{$chr} = get_variations_in_region($config, $chr, $min.'-'.$max);
2786 }
2787
2788 # now compare retrieved vars with vf_hash
2789 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
2790 foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
2791 my @found;
2792 my @gmafs;
2793
2794 if(defined($variation_cache->{$chr})) {
2795 if(my $existing_vars = $variation_cache->{$chr}->{$pos}) {
2796 foreach my $existing_var(grep {$_->[1] <= $config->{failed}} @$existing_vars) {
2797 unless(is_var_novel($config, $existing_var, $var)) {
2798 push @found, $existing_var->[0];
2799 push @gmafs, $existing_var->[6].":".$existing_var->[7] if defined($existing_var->[6]) && defined($existing_var->[7]);
2800 }
2801 }
2802 }
2803 }
2804
2805 $var->{existing} = join ",", @found;
2806 $var->{existing} ||= '-';
2807
2808 $var->{gmaf} = join ",", @gmafs;
2809 $var->{gmaf} ||= undef;
2810 }
2811 }
2812 }
2813
2814 delete $variation_cache->{$chr};
2815 }
2816
2817 end_progress($config);
2818 }
2819
2820 # gets overlapping SVs for a vf_hash
2821 sub check_svs_hash {
2822 my $config = shift;
2823 my $vf_hash = shift;
2824
2825 debug("Checking for overlapping structural variations") unless defined($config->{quiet});
2826
2827 my ($chunk_count, $counter);
2828 $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash};
2829
2830 foreach my $chr(keys %{$vf_hash}) {
2831 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2832
2833 progress($config, $counter++, $chunk_count);
2834
2835 # work out start and end using chunk_size
2836 my ($start, $end);
2837 $start = $config->{chunk_size} * $chunk;
2838 $end = $config->{chunk_size} * ($chunk + 1);
2839
2840 # check for structural variations
2841 if(defined($config->{sa})) {
2842 my $slice = $config->{sa}->fetch_by_region('chromosome', $chr, $start, $end);
2843
2844 if(defined($slice)) {
2845 my $svs = $config->{svfa}->fetch_all_by_Slice($slice);
2846
2847 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
2848 foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
2849 my $string = join ",",
2850 map {$_->variation_name}
2851 grep {$_->seq_region_start <= $var->end && $_->seq_region_end >= $var->start}
2852 @$svs;
2853
2854 $var->{overlapping_svs} = $string if $string;
2855 }
2856 }
2857 }
2858 }
2859 }
2860 }
2861
2862 end_progress($config);
2863 }
2864
2865 # gets a slice from the slice adaptor
2866 sub get_slice {
2867 my $config = shift;
2868 my $chr = shift;
2869 my $otherfeatures = shift;
2870 $otherfeatures ||= '';
2871
2872 return undef unless defined($config->{sa}) && defined($chr);
2873
2874 my $slice;
2875
2876 # first try to get a chromosome
2877 eval { $slice = $config->{$otherfeatures.'sa'}->fetch_by_region('chromosome', $chr); };
2878
2879 # if failed, try to get any seq region
2880 if(!defined($slice)) {
2881 $slice = $config->{$otherfeatures.'sa'}->fetch_by_region(undef, $chr);
2882 }
2883
2884 return $slice;
2885 }
2886
2887
2888
2889
2890 # METHODS THAT DEAL WITH "REGIONS"
2891 ##################################
2892
2893 # gets regions from VF hash
2894 sub regions_from_hash {
2895 my $config = shift;
2896 my $vf_hash = shift;
2897
2898 my %include_regions;
2899
2900 # if using cache we just want the regions of cache_region_size
2901 # since that's what we'll get from the cache (or DB if no cache found)
2902 if(defined($config->{cache})) {
2903
2904 my $region_size = $config->{cache_region_size};
2905
2906 foreach my $chr(keys %$vf_hash) {
2907 $include_regions{$chr} = [];
2908 my %temp_regions;
2909
2910 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2911 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
2912 my ($s, $e) = ($pos - MAX_DISTANCE_FROM_TRANSCRIPT, $pos + MAX_DISTANCE_FROM_TRANSCRIPT);
2913
2914 my $low = int ($s / $region_size);
2915 my $high = int ($e / $region_size) + 1;
2916
2917 for my $i($low..($high - 1)) {
2918 $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1;
2919 }
2920 }
2921 }
2922
2923 @{$include_regions{$chr}} = keys %temp_regions;
2924 }
2925 }
2926
2927 # if no cache we don't want to fetch more than is necessary, so find the
2928 # minimum covered region of the variations in the hash
2929 else {
2930 foreach my $chr(keys %$vf_hash) {
2931 $include_regions{$chr} = [];
2932
2933 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
2934 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
2935 add_region($_->start, $_->end, $include_regions{$chr}) for @{$vf_hash->{$chr}{$chunk}{$pos}};
2936 }
2937 }
2938 }
2939
2940 # merge regions
2941 merge_regions(\%include_regions, $config);
2942 }
2943
2944 return \%include_regions;
2945 }
2946
2947 # adds a region to region list, expanding existing one if overlaps
2948 sub add_region {
2949 my $start = shift;
2950 my $end = shift;
2951 my $region_list = shift;
2952
2953 # fix end for insertions
2954 $end = $start if $end < $start;
2955
2956 my $added = 0;
2957 my $i = 0;
2958
2959 while ($i < scalar @$region_list) {
2960 my ($region_start, $region_end) = split /\-/, $region_list->[$i];
2961
2962 if($start <= $region_end && $end >= $region_start) {
2963 my $new_region_start = ($start < $end ? $start : $end) - MAX_DISTANCE_FROM_TRANSCRIPT;
2964 my $new_region_end = ($start > $end ? $start : $end) + MAX_DISTANCE_FROM_TRANSCRIPT;
2965
2966 $new_region_start = 1 if $new_region_start < 1;
2967
2968 $region_start = $new_region_start if $new_region_start < $region_start;
2969 $region_end = $new_region_end if $new_region_end > $region_end;
2970
2971 $region_list->[$i] = $region_start.'-'.$region_end;
2972 $added = 1;
2973 }
2974
2975 $i++;
2976 }
2977
2978 unless($added) {
2979 my $s = $start - MAX_DISTANCE_FROM_TRANSCRIPT;
2980 $s = 1 if $s < 1;
2981
2982 push @{$region_list}, $s.'-'.($end + MAX_DISTANCE_FROM_TRANSCRIPT);
2983 }
2984 }
2985
2986 # merges overlapping regions from scans
2987 sub merge_regions {
2988 my $include_regions = shift;
2989 my $config = shift;
2990 my $consecutive = shift;
2991 $consecutive ||= 0;
2992
2993 # now merge overlapping regions
2994 foreach my $chr(keys %$include_regions) {
2995 my $max_index = $#{$include_regions->{$chr}};
2996 my (@new_regions, %skip);
2997
2998 for my $i(0..$max_index) {
2999 next if $skip{$i};
3000 my ($s, $e) = split /\-/, $include_regions->{$chr}[$i];
3001
3002 for my $j(($i+1)..$max_index) {
3003 next if $skip{$j};
3004 my ($ns, $ne) = split /\-/, $include_regions->{$chr}[$j];
3005
3006 if($s <= ($ne + $consecutive) && $e >= ($ns - $consecutive)) {
3007 $s = $ns if $ns < $s;
3008 $e = $ne if $ne > $e;
3009
3010 $skip{$j} = 1;
3011 }
3012 }
3013
3014 push @new_regions, $s.'-'.$e;
3015 }
3016
3017 # replace original
3018 $include_regions->{$chr} = \@new_regions;
3019
3020 $config->{region_count} += scalar @new_regions;
3021 }
3022
3023 return $include_regions;
3024 }
3025
3026 # converts regions as determined by scan_file to regions loadable from cache
3027 sub convert_regions {
3028 my $config = shift;
3029 my $regions = shift;
3030
3031 return undef unless defined $regions;
3032
3033 my $region_size = $config->{cache_region_size};
3034
3035 my %new_regions;
3036
3037 foreach my $chr(keys %$regions) {
3038 my %temp_regions;
3039
3040 foreach my $region(@{$regions->{$chr}}) {
3041 my ($s, $e) = split /\-/, $region;
3042
3043 my $low = int ($s / $region_size);
3044 my $high = int ($e / $region_size) + 1;
3045
3046 for my $i($low..($high - 1)) {
3047 $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1;
3048 }
3049 }
3050
3051 @{$new_regions{$chr}} = keys %temp_regions;
3052 }
3053
3054 return \%new_regions;
3055 }
3056
3057
3058
3059
3060
3061 # CACHE METHODS
3062 ###############
3063
3064 # prunes a cache to get rid of features not in regions in use
3065 sub prune_cache {
3066 my $config = shift;
3067 my $cache = shift;
3068 my $regions = shift;
3069 my $loaded = shift;
3070
3071 # delete no longer in use chroms
3072 foreach my $chr(keys %$cache) {
3073 delete $cache->{$chr} unless defined $regions->{$chr} && scalar @{$regions->{$chr}};
3074 }
3075
3076 my $new_count = 0;
3077
3078 foreach my $chr(keys %$cache) {
3079
3080 # get total area spanned by regions
3081 my ($min, $max);
3082 foreach my $region(@{$regions->{$chr}}) {
3083 my ($s, $e) = split /\-/, $region;
3084 $min = $s if !defined($min) or $s < $min;
3085 $max = $e if !defined($max) or $e > $max;
3086 }
3087
3088 # transcript cache
3089 if(ref($cache->{$chr}) eq 'ARRAY') {
3090 $cache->{$chr} = prune_min_max($cache->{$chr}, $min, $max);
3091 $new_count += scalar @{$cache->{$chr}};
3092 }
3093 # regfeat cache
3094 elsif(ref($cache->{$chr}) eq 'HASH') {
3095 for(keys %{$cache->{$chr}}) {
3096 $cache->{$chr}->{$_} = prune_min_max($cache->{$chr}->{$_}, $min, $max);
3097 $new_count += scalar @{$cache->{$chr}->{$_}};
3098 }
3099 }
3100
3101 # update loaded regions
3102 my %have_regions = map {$_ => 1} @{$regions->{$chr}};
3103
3104 foreach my $region(keys %{$loaded->{$chr}}) {
3105 delete $loaded->{$chr}->{$region} unless defined $have_regions{$region};
3106 }
3107 }
3108
3109 return $new_count;
3110 }
3111
3112 # does the actual pruning
3113 sub prune_min_max {
3114 my $array = shift;
3115 my $min = shift;
3116 my $max = shift;
3117
3118 # splice out features not in area spanned by min/max
3119 my $i = 0;
3120 my $f_count = scalar @{$array};
3121 my @new_cache;
3122
3123 while($i < $f_count) {
3124 my $f = $array->[$i];
3125
3126 $i++;
3127
3128 if($max - $f->start() > 0 && $f->end - $min > 0) {
3129 push @new_cache, $f;
3130 }
3131
3132 # do some cleaning for transcripts
3133 elsif(defined $f->{translation}) {
3134 delete $f->{translation}->{transcript};
3135 delete $f->{translation};
3136 }
3137 }
3138
3139 undef $array;
3140 return \@new_cache;
3141 }
3142
3143 # get transcripts for slices
3144 sub cache_transcripts {
3145 my $config = shift;
3146 my $include_regions = shift;
3147
3148 my $tr_cache;
3149 my $i;
3150
3151 debug("Caching transcripts") unless defined($config->{quiet});
3152
3153 foreach my $chr(keys %$include_regions) {
3154
3155 my $slice = get_slice($config, $chr);
3156
3157 next unless defined $slice;
3158
3159 # prefetch some things
3160 $slice->is_circular;
3161
3162 # trim bumf off the slice
3163 delete $slice->{coord_system}->{adaptor} if defined($config->{write_cache});
3164
3165 # no regions?
3166 if(!scalar @{$include_regions->{$chr}}) {
3167 my $start = 1;
3168 my $end = $config->{cache_region_size};
3169
3170 while($start < $slice->end) {
3171 push @{$include_regions->{$chr}}, $start.'-'.$end;
3172 $start += $config->{cache_region_size};
3173 $end += $config->{cache_region_size};
3174 }
3175 }
3176
3177 my $region_count;
3178
3179 if(scalar keys %$include_regions == 1) {
3180 my ($chr) = keys %$include_regions;
3181 $region_count = scalar @{$include_regions->{$chr}};
3182 debug("Caching transcripts for chromosome $chr") unless defined($config->{quiet});
3183 }
3184
3185 foreach my $region(@{$include_regions->{$chr}}) {
3186 progress($config, $i++, $region_count || $config->{region_count});
3187
3188 my ($s, $e) = split /\-/, $region;
3189
3190 # sanity check start and end
3191 $s = 1 if $s < 1;
3192 $e = $slice->end if $e > $slice->end;
3193
3194 # get sub-slice
3195 my $sub_slice = $slice->sub_Slice($s, $e);
3196
3197 # for some reason unless seq is called here the sequence becomes Ns later
3198 $sub_slice->seq;
3199
3200 # add transcripts to the cache, via a transfer to the chrom's slice
3201 if(defined($sub_slice)) {
3202 foreach my $gene(map {$_->transfer($slice)} @{$sub_slice->get_all_Genes(undef, undef, 1)}) {
3203 my $gene_stable_id = $gene->stable_id;
3204 my $canonical_tr_id = $gene->{canonical_transcript_id};
3205
3206 my @trs;
3207
3208 foreach my $tr(@{$gene->get_all_Transcripts}) {
3209 $tr->{_gene_stable_id} = $gene_stable_id;
3210 $tr->{_gene} = $gene;
3211
3212 # indicate if canonical
3213 $tr->{is_canonical} = 1 if defined $canonical_tr_id and $tr->dbID eq $canonical_tr_id;
3214
3215 if(defined($config->{prefetch})) {
3216 prefetch_transcript_data($config, $tr);
3217 }
3218
3219 # CCDS
3220 elsif(defined($config->{ccds})) {
3221 my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries};
3222 $tr->{_ccds} = $entries[0]->display_id if scalar @entries;
3223 }
3224
3225 # strip some unnecessary data from the transcript object
3226 clean_transcript($tr) if defined($config->{write_cache});
3227
3228 push @trs, $tr;
3229 }
3230
3231 # sort the transcripts by translation so we can share sift/polyphen stuff
3232 # between transcripts and save cache space
3233 if(defined($config->{write_cache}) && (defined($config->{sift}) || defined($config->{polyphen}))) {
3234
3235 my $prev_tr;
3236
3237 # sort them by peptide seqeuence as transcripts with identical peptides
3238 # will have identical SIFT/PolyPhen prediction strings
3239 foreach my $tr(sort {$a->{_variation_effect_feature_cache}->{peptide} cmp $b->{_variation_effect_feature_cache}->{peptide}} grep {$_->{_variation_effect_feature_cache}->{peptide}} @trs) {
3240
3241 if(
3242 defined($prev_tr) &&
3243 $prev_tr->{_variation_effect_feature_cache}->{peptide}
3244 eq $tr->{_variation_effect_feature_cache}->{peptide}
3245 ) {
3246
3247 foreach my $analysis(qw(sift polyphen)) {
3248 next unless defined($config->{$analysis});
3249 $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis} = $prev_tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis};
3250 }
3251 }
3252
3253 $prev_tr = $tr;
3254 }
3255 }
3256
3257 # clean the gene
3258 clean_gene($gene);
3259
3260 push @{$tr_cache->{$chr}}, @trs;
3261 }
3262 }
3263 }
3264 }
3265
3266 end_progress($config);
3267
3268 return $tr_cache;
3269 }
3270
3271 # gets rid of extra bits of info attached to the transcript that we don't need
3272 sub clean_transcript {
3273 my $tr = shift;
3274
3275 foreach my $key(qw(display_xref external_db external_display_name external_name external_status created_date status description edits_enabled modified_date dbentries is_current analysis transcript_mapper)) {
3276 delete $tr->{$key} if defined($tr->{$key});
3277 }
3278
3279 # clean all attributes but miRNA
3280 if(defined($tr->{attributes})) {
3281 my @new_atts;
3282 foreach my $att(@{$tr->{attributes}}) {
3283 push @new_atts, $att if $att->{code} eq 'miRNA';
3284 }
3285 $tr->{attributes} = \@new_atts;
3286 }
3287
3288 # clean the translation
3289 if(defined($tr->translation)) {
3290
3291 # sometimes the translation points to a different transcript?
3292 $tr->{translation}->{transcript} = $tr;
3293 weaken($tr->{translation}->{transcript});
3294
3295 for my $key(qw(attributes protein_features created_date modified_date)) {
3296 delete $tr->translation->{$key};
3297 }
3298 }
3299 }
3300
3301 # gets rid of extra bits of info attached to genes. At the moment this is almost
3302 # everything as genes are only used for their locations when looking at
3303 # structural variations
3304 sub clean_gene {
3305 my $gene = shift;
3306
3307 # delete almost everything in the gene
3308 map {delete $gene->{$_}}
3309 grep {
3310 $_ ne 'start' &&
3311 $_ ne 'end' &&
3312 $_ ne 'strand' &&
3313 $_ ne 'stable_id'
3314 }
3315 keys %{$gene};
3316 }
3317
3318 # build slice cache from transcript cache
3319 sub build_slice_cache {
3320 my $config = shift;
3321 my $tr_cache = shift;
3322
3323 my %slice_cache;
3324
3325 foreach my $chr(keys %$tr_cache) {
3326 $slice_cache{$chr} = scalar @{$tr_cache->{$chr}} ? $tr_cache->{$chr}[0]->slice : &get_slice($config, $chr);
3327
3328 if(!defined($slice_cache{$chr})) {
3329 delete $slice_cache{$chr}
3330 }
3331
3332 else {
3333 # reattach adaptor to the coord system
3334 $slice_cache{$chr}->{coord_system}->{adaptor} ||= $config->{csa};
3335 }
3336 }
3337
3338 return \%slice_cache;
3339 }
3340
3341 # pre-fetches per-transcript data
3342 sub prefetch_transcript_data {
3343 my $config = shift;
3344 my $tr = shift;
3345
3346 # introns
3347 my $introns = $tr->get_all_Introns;
3348
3349 if(defined($introns)) {
3350 foreach my $intron(@$introns) {
3351 foreach my $key(qw(adaptor analysis dbID next prev seqname)) {
3352 delete $intron->{$key};
3353 }
3354 }
3355 }
3356
3357 $tr->{_variation_effect_feature_cache}->{introns} ||= $introns;
3358
3359 # translateable_seq, mapper
3360 $tr->{_variation_effect_feature_cache}->{translateable_seq} ||= $tr->translateable_seq;
3361 $tr->{_variation_effect_feature_cache}->{mapper} ||= $tr->get_TranscriptMapper;
3362
3363 # peptide
3364 unless ($tr->{_variation_effect_feature_cache}->{peptide}) {
3365 my $translation = $tr->translate;
3366 $tr->{_variation_effect_feature_cache}->{peptide} = $translation ? $translation->seq : undef;
3367 }
3368
3369 # protein features
3370 if(defined($config->{domains}) || defined($config->{write_cache})) {
3371 my $pfs = $tr->translation ? $tr->translation->get_all_ProteinFeatures : [];
3372
3373 # clean them to save cache space
3374 foreach my $pf(@$pfs) {
3375
3376 # remove everything but the coord, analysis and ID fields
3377 foreach my $key(keys %$pf) {
3378 delete $pf->{$key} unless
3379 $key eq 'start' ||
3380 $key eq 'end' ||
3381 $key eq 'analysis' ||
3382 $key eq 'hseqname';
3383 }
3384
3385 # remove everything from the analysis but the display label
3386 foreach my $key(keys %{$pf->{analysis}}) {
3387 delete $pf->{analysis}->{$key} unless $key eq '_display_label';
3388 }
3389 }
3390
3391 $tr->{_variation_effect_feature_cache}->{protein_features} = $pfs;
3392 }
3393
3394 # codon table
3395 unless ($tr->{_variation_effect_feature_cache}->{codon_table}) {
3396 # for mithocondrial dna we need to to use a different codon table
3397 my $attrib = $tr->slice->get_all_Attributes('codon_table')->[0];
3398
3399 $tr->{_variation_effect_feature_cache}->{codon_table} = $attrib ? $attrib->value : 1;
3400 }
3401
3402 # sift/polyphen
3403 if(defined($config->{pfpma}) && defined($tr->{_variation_effect_feature_cache}->{peptide})) {
3404 foreach my $analysis(qw(sift polyphen)) {
3405 next unless defined($config->{$analysis});
3406 my $a = $analysis;
3407 $a .= '_humvar' if $a eq 'polyphen';
3408 $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$a} ||= $config->{pfpma}->fetch_by_analysis_translation_md5($a, md5_hex($tr->{_variation_effect_feature_cache}->{peptide}))
3409 }
3410 }
3411
3412 # gene
3413 $tr->{_gene} ||= $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id);
3414
3415 # gene HGNC
3416 if(defined $config->{hgnc}) {
3417
3418 # get from gene cache if found already
3419 if(defined($tr->{_gene}->{_hgnc})) {
3420 $tr->{_gene_hgnc} = $tr->{_gene}->{_hgnc};
3421 }
3422 else {
3423 my @entries = grep {$_->database eq 'HGNC'} @{$tr->{_gene}->get_all_DBEntries()};
3424 if(scalar @entries) {
3425 $tr->{_gene_hgnc} = $entries[0]->display_id;
3426 }
3427
3428 $tr->{_gene_hgnc} ||= '-';
3429
3430 # cache it on the gene object too
3431 $tr->{_gene}->{_hgnc} = $tr->{_gene_hgnc};
3432 }
3433 }
3434
3435 # CCDS
3436 my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries};
3437 $tr->{_ccds} = $entries[0]->display_id if scalar @entries;
3438 $tr->{_ccds} ||= '-';
3439
3440 # refseq
3441 @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries};
3442 if(scalar @entries) {
3443 $tr->{_refseq} = join ",", map {$_->display_id} @entries;
3444 }
3445 else {
3446 $tr->{_refseq} = '-';
3447 }
3448
3449 # protein stable ID
3450 $tr->{_protein} = $tr->translation ? $tr->translation->stable_id : '-';
3451
3452 return $tr;
3453 }
3454
3455 sub get_dump_file_name {
3456 my $config = shift;
3457 my $chr = shift;
3458 my $region = shift;
3459 my $type = shift;
3460
3461 $type ||= 'transcript';
3462
3463 if($type eq 'transcript') {
3464 $type = '';
3465 }
3466 else {
3467 $type = '_'.$type;
3468 }
3469
3470 #my ($s, $e) = split /\-/, $region;
3471 #my $subdir = int($s / 1e6);
3472 #
3473 #my $dir = $config->{dir}.'/'.$chr.'/'.$subdir;
3474
3475 my $dir = $config->{dir}.'/'.$chr;
3476 my $dump_file = $dir.'/'.$region.$type.(defined($config->{tabix}) ? '_tabix' : '').'.gz';
3477
3478 # make directory if it doesn't exist
3479 if(!(-e $dir) && defined($config->{write_cache})) {
3480 make_path($dir);
3481 }
3482
3483 return $dump_file;
3484 }
3485
3486 # dumps out transcript cache to file
3487 sub dump_transcript_cache {
3488 my $config = shift;
3489 my $tr_cache = shift;
3490 my $chr = shift;
3491 my $region = shift;
3492
3493 debug("Dumping cached transcript data") unless defined($config->{quiet});
3494
3495 # clean the slice adaptor before storing
3496 clean_slice_adaptor($config);
3497
3498 strip_transcript_cache($config, $tr_cache);
3499
3500 $config->{reg}->disconnect_all;
3501 delete $config->{sa}->{dbc}->{_sql_helper};
3502
3503 my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript');
3504
3505 debug("Writing to $dump_file") unless defined($config->{quiet});
3506
3507 # storable
3508 open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
3509 nstore_fd($tr_cache, $fh);
3510 close $fh;
3511 }
3512
3513 #sub dump_transcript_cache_tabix {
3514 # my $config = shift;
3515 # my $tr_cache = shift;
3516 # my $chr = shift;
3517 # my $region = shift;
3518 #
3519 # debug("Dumping cached transcript data") unless defined($config->{quiet});
3520 #
3521 # # clean the slice adaptor before storing
3522 # clean_slice_adaptor($config);
3523 #
3524 # strip_transcript_cache($config, $tr_cache);
3525 #
3526 # $config->{reg}->disconnect_all;
3527 #
3528 # my $dir = $config->{dir}.'/'.$chr;
3529 # my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz';
3530 #
3531 # # make directory if it doesn't exist
3532 # if(!(-e $dir)) {
3533 # make_path($dir);
3534 # }
3535 #
3536 # debug("Writing to $dump_file") unless defined($config->{quiet});
3537 #
3538 # use Storable qw(nfreeze);
3539 # use MIME::Base64 qw(encode_base64);
3540 # #open NEW, "| bgzip -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
3541 # #
3542 # #foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) {
3543 # # print NEW join "\t", (
3544 # # $chr,
3545 # # $tr->start,
3546 # # $tr->end,
3547 # # encode_base64(freeze($tr), "")
3548 # # );
3549 # # print NEW "\n";
3550 # #}
3551 # #close NEW;
3552 # #
3553 # ## tabix it
3554 # #my $output = `tabix -s 1 -b 2 -e 3 -f $dump_file 2>&1`;
3555 # #die("ERROR: Failed during tabix indexing\n$output\n") if $output;
3556 # open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
3557 #
3558 # foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) {
3559 # print NEW join "\t", (
3560 # $chr,
3561 # $tr->start,
3562 # $tr->end,
3563 # encode_base64(freeze($tr), "")
3564 # );
3565 # print NEW "\n";
3566 # }
3567 # close NEW;
3568 #}
3569
3570 # loads in dumped transcript cache to memory
3571 sub load_dumped_transcript_cache {
3572 my $config = shift;
3573 my $chr = shift;
3574 my $region = shift;
3575
3576 my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript');
3577
3578 return undef unless -e $dump_file;
3579
3580 debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
3581
3582 open my $fh, $config->{compress}." ".$dump_file." |" or return undef;
3583 my $tr_cache;
3584 $tr_cache = fd_retrieve($fh);
3585 close $fh;
3586
3587 # reattach adaptors
3588 foreach my $t(@{$tr_cache->{$chr}}) {
3589 if(defined($t->{translation})) {
3590 $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor};
3591 $t->{translation}->{transcript} = $t;
3592 weaken($t->{translation}->{transcript});
3593 }
3594
3595 $t->{slice}->{adaptor} = $config->{sa};
3596 }
3597
3598 return $tr_cache;
3599 }
3600
3601 #sub load_dumped_transcript_cache_tabix {
3602 # my $config = shift;
3603 # my $chr = shift;
3604 # my $region = shift;
3605 # my $trim_regions = shift;
3606 #
3607 # my $dir = $config->{dir}.'/'.$chr;
3608 # my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz';
3609 #
3610 # #print STDERR "Reading from $dump_file\n";
3611 #
3612 # return undef unless -e $dump_file;
3613 #
3614 # debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
3615 #
3616 # my $tr_cache;
3617 #
3618 # use MIME::Base64 qw(decode_base64);
3619 # use Storable qw(thaw);
3620 #
3621 # my ($s, $e) = split /\-/, $region;
3622 # my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}};
3623 # my $regions = "";
3624 # $regions .= " $chr\:$_" for @regions;
3625 #
3626 # #print STDERR "tabix $dump_file $regions |\n";
3627 # #open IN, "tabix $dump_file $regions |";
3628 # open IN, "gzip -dc $dump_file |";
3629 # while(<IN>) {
3630 #
3631 # #$DB::single = 1;
3632 # my ($chr, $start, $end, $blob) = split /\t/, $_;
3633 # next unless grep {overlap($start, $end, (split /\-/, $_))} @regions;
3634 # my $tr = thaw(decode_base64($blob));
3635 # push @{$tr_cache->{$chr}}, $tr;
3636 # }
3637 # close IN;
3638 #
3639 # # reattach adaptors
3640 # foreach my $t(@{$tr_cache->{$chr}}) {
3641 # if(defined($t->{translation})) {
3642 # $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor};
3643 # $t->{translation}->{transcript} = $t;
3644 # weaken($t->{translation}->{transcript});
3645 # }
3646 #
3647 # $t->{slice}->{adaptor} = $config->{sa};
3648 # }
3649 #
3650 # # add empty array ref so code doesn't try and fetch from DB too
3651 # $tr_cache->{$chr} ||= [];
3652 #
3653 # return $tr_cache;
3654 #}
3655
3656 # strips cache before writing to disk
3657 sub strip_transcript_cache {
3658 my $config = shift;
3659 my $cache = shift;
3660
3661 foreach my $chr(keys %$cache) {
3662 foreach my $tr(@{$cache->{$chr}}) {
3663 foreach my $exon(@{$tr->{_trans_exon_array}}) {
3664 delete $exon->{slice}->{adaptor};
3665
3666 for(qw(adaptor created_date modified_date is_current version is_constitutive _seq_cache dbID slice)) {
3667 delete $exon->{$_};
3668 }
3669 }
3670
3671 delete $tr->{adaptor};
3672 delete $tr->{slice}->{adaptor};
3673 }
3674 }
3675 }
3676
3677 # cleans slice adaptor before storing in cache
3678 sub clean_slice_adaptor{
3679 my $config = shift;
3680
3681 # clean some stuff off the slice adaptor
3682 delete $config->{sa}->{asm_exc_cache};
3683 $config->{sa}->{sr_name_cache} = {};
3684 $config->{sa}->{sr_id_cache} = {};
3685 delete $config->{sa}->{db}->{seq_region_cache};
3686 delete $config->{sa}->{db}->{name_cache};
3687 }
3688
3689
3690 # dump adaptors to cache
3691 sub dump_adaptor_cache {
3692 my $config = shift;
3693
3694 $config->{reg}->disconnect_all;
3695 delete $config->{sa}->{dbc}->{_sql_helper};
3696
3697 my $dir = $config->{dir};
3698 my $dump_file = $dir.'/adaptors.gz';
3699
3700 # make directory if it doesn't exist
3701 if(!(-e $dir)) {
3702 make_path($dir);
3703 }
3704
3705 open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
3706 nstore_fd($config, $fh);
3707 close $fh;
3708 }
3709
3710 # load dumped adaptors
3711 sub load_dumped_adaptor_cache {
3712 my $config = shift;
3713
3714 my $dir = $config->{dir};
3715 my $dump_file = $dir.'/adaptors.gz';
3716
3717 return undef unless -e $dump_file;
3718
3719 debug("Reading cached adaptor data") unless defined($config->{quiet});
3720
3721 open my $fh, $config->{compress}." ".$dump_file." |" or return undef;
3722 my $cached_config;
3723 $cached_config = fd_retrieve($fh);
3724 close $fh;
3725
3726 $config->{$_} = $cached_config->{$_} for qw(sa ga ta vfa svfa tva pfpma mca csa RegulatoryFeature_adaptor MotifFeature_adaptor);
3727
3728 return 1;
3729 }
3730
3731 # dumps cached variations to disk
3732 sub dump_variation_cache {
3733 my $config = shift;
3734 my $v_cache = shift;
3735 my $chr = shift;
3736 my $region = shift;
3737
3738 my $dump_file = get_dump_file_name($config, $chr, $region, 'var');
3739
3740 open DUMP, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to adaptor dump file $dump_file";
3741
3742 foreach my $pos(keys %{$v_cache->{$chr}}) {
3743 foreach my $v(@{$v_cache->{$chr}->{$pos}}) {
3744 my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = @$v;
3745
3746 print DUMP join " ", (
3747 $name,
3748 $failed == 0 ? '' : $failed,
3749 $start,
3750 $end == $start ? '' : $end,
3751 $as,
3752 $strand == 1 ? '' : $strand,
3753 $ma || '',
3754 defined($maf) ? sprintf("%.2f", $maf) : '',
3755 );
3756 print DUMP "\n";
3757 }
3758 }
3759
3760 close DUMP;
3761 }
3762
3763 # loads dumped variation cache
3764 sub load_dumped_variation_cache {
3765 my $config = shift;
3766 my $chr = shift;
3767 my $region = shift;
3768
3769 my $dump_file = get_dump_file_name($config, $chr, $region, 'var');
3770
3771 return undef unless -e $dump_file;
3772
3773 open DUMP, $config->{compress}." ".$dump_file." |" or return undef;
3774
3775 my $v_cache;
3776
3777 while(<DUMP>) {
3778 chomp;
3779 my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = split / /, $_;
3780 $failed ||= 0;
3781 $end ||= $start;
3782 $strand ||= 1;
3783 $ma ||= undef;
3784 $maf ||= undef;
3785
3786 my @v = ($name, $failed, $start, $end, $as, $strand, $ma, $maf);
3787 push @{$v_cache->{$chr}->{$start}}, \@v;
3788 }
3789
3790 close DUMP;
3791
3792 return $v_cache;
3793 }
3794
3795 # caches regulatory features
3796 sub cache_reg_feats {
3797 my $config = shift;
3798 my $include_regions = shift;
3799
3800 my $rf_cache;
3801 my $i;
3802
3803 debug("Caching regulatory features") unless defined($config->{quiet});
3804
3805 foreach my $chr(keys %$include_regions) {
3806
3807 my $slice = get_slice($config, $chr);
3808
3809 next unless defined $slice;
3810
3811 # prefetch some things
3812 $slice->is_circular;
3813
3814 # no regions?
3815 if(!scalar @{$include_regions->{$chr}}) {
3816 my $start = 1;
3817 my $end = $config->{cache_region_size};
3818
3819 while($start < $slice->end) {
3820 push @{$include_regions->{$chr}}, $start.'-'.$end;
3821 $start += $config->{cache_region_size};
3822 $end += $config->{cache_region_size};
3823 }
3824 }
3825
3826 my $region_count;
3827
3828 if(scalar keys %$include_regions == 1) {
3829 my ($chr) = keys %$include_regions;
3830 $region_count = scalar @{$include_regions->{$chr}};
3831 debug("Caching transcripts for chromosome $chr") unless defined($config->{quiet});
3832 }
3833
3834 foreach my $region(@{$include_regions->{$chr}}) {
3835 progress($config, $i++, $region_count || $config->{region_count});
3836
3837 my ($s, $e) = split /\-/, $region;
3838
3839 # sanity check start and end
3840 $s = 1 if $s < 1;
3841 $e = $slice->end if $e > $slice->end;
3842
3843 # get sub-slice
3844 my $sub_slice = $slice->sub_Slice($s, $e);
3845 $sub_slice->{coord_system}->{adaptor} = $config->{csa};
3846
3847 next unless defined($sub_slice);
3848
3849 foreach my $type(@REG_FEAT_TYPES) {
3850 my $features = $config->{$type.'_adaptor'}->fetch_all_by_Slice($sub_slice);
3851 next unless defined($features);
3852
3853 # cell types
3854 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
3855 foreach my $rf(@$features) {
3856
3857 my %cl;
3858
3859 # get cell type by fetching all from stable ID
3860 if($type eq 'RegulatoryFeature') {
3861 %cl = map {
3862 $_->feature_set->cell_type->name => $_->feature_type->name
3863 } @{$rf->adaptor->fetch_all_by_stable_ID($rf->stable_id)};
3864 }
3865
3866 # get cell type by fetching regfeats that contain this MotifFeature
3867 elsif($type eq 'MotifFeature') {
3868 %cl = map {
3869 $_->feature_set->cell_type->name => $_->feature_type->name
3870 } @{$config->{'RegulatoryFeature_adaptor'}->fetch_all_by_attribute_feature($rf)};
3871 }
3872
3873 $rf->{cell_types} = \%cl;
3874 }
3875 }
3876
3877 push @{$rf_cache->{$chr}->{$type}},
3878 map { clean_reg_feat($_) }
3879 map { $_->transfer($slice) }
3880 @{$features};
3881 }
3882 }
3883 }
3884
3885 end_progress($config);
3886
3887 return $rf_cache;
3888 }
3889
3890
3891 # cleans reg feats for caching
3892 sub clean_reg_feat {
3893 my $rf = shift;
3894
3895 foreach my $key(qw/adaptor binary_string bound_start bound_end attribute_cache feature_type feature_set analysis/) {
3896 delete $rf->{$key};
3897 }
3898
3899 if(defined($rf->{binding_matrix})) {
3900 $rf->{_variation_effect_feature_cache}->{seq} = $rf->seq;
3901
3902 foreach my $key(qw/adaptor feature_type analysis dbID/) {
3903 delete $rf->{binding_matrix}->{$key};
3904 }
3905 }
3906
3907 return $rf;
3908 }
3909
3910
3911 # dumps out reg feat cache to file
3912 sub dump_reg_feat_cache {
3913 my $config = shift;
3914 my $rf_cache = shift;
3915 my $chr = shift;
3916 my $region = shift;
3917
3918 debug("Dumping cached reg feat data for $chr:$region") unless defined($config->{quiet});
3919
3920 # clean the slice adaptor before storing
3921 clean_slice_adaptor($config);
3922
3923 $config->{reg}->disconnect_all;
3924 delete $config->{sa}->{dbc}->{_sql_helper};
3925
3926 foreach my $chr(keys %{$rf_cache}) {
3927 foreach my $type(keys %{$rf_cache->{$chr}}) {
3928 delete $_->{slice}->{coord_system}->{adaptor} for @{$rf_cache->{$chr}->{$type}};
3929 }
3930 }
3931
3932 my $dump_file = get_dump_file_name($config, $chr, $region, 'reg');
3933
3934 debug("Writing to $dump_file") unless defined($config->{quiet});
3935
3936 # storable
3937 open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
3938 nstore_fd($rf_cache, $fh);
3939 close $fh;
3940 }
3941
3942 #sub dump_reg_feat_cache_tabix {
3943 # my $config = shift;
3944 # my $rf_cache = shift;
3945 # my $chr = shift;
3946 # my $region = shift;
3947 #
3948 # debug("Dumping cached reg feat data") unless defined($config->{quiet});
3949 #
3950 # # clean the slice adaptor before storing
3951 # clean_slice_adaptor($config);
3952 #
3953 # $config->{reg}->disconnect_all;
3954 # delete $config->{sa}->{dbc}->{_sql_helper};
3955 #
3956 # $config->{reg}->disconnect_all;
3957 #
3958 # my $dump_file = get_dump_file_name($config, $chr, $region, 'reg');
3959 #
3960 # debug("Writing to $dump_file") unless defined($config->{quiet});
3961 #
3962 # use Storable qw(nfreeze);
3963 # use MIME::Base64 qw(encode_base64);
3964 # open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
3965 #
3966 # foreach my $type(keys %{$rf_cache->{$chr}}) {
3967 # foreach my $rf(sort {$a->start <=> $b->start} @{$rf_cache->{$chr}->{$type}}) {
3968 # print NEW join "\t", (
3969 # $chr,
3970 # $rf->start,
3971 # $rf->end,
3972 # $type,
3973 # encode_base64(freeze($rf), "")
3974 # );
3975 # print NEW "\n";
3976 # }
3977 # }
3978 # close NEW;
3979 #}
3980
3981 # loads in dumped transcript cache to memory
3982 sub load_dumped_reg_feat_cache {
3983 my $config = shift;
3984 my $chr = shift;
3985 my $region = shift;
3986
3987 my $dump_file = get_dump_file_name($config, $chr, $region, 'reg');
3988
3989 return undef unless -e $dump_file;
3990
3991 debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
3992
3993 open my $fh, $config->{compress}." ".$dump_file." |" or return undef;
3994 my $rf_cache;
3995 $rf_cache = fd_retrieve($fh);
3996 close $fh;
3997
3998 return $rf_cache;
3999 }
4000
4001
4002
4003 #sub load_dumped_reg_feat_cache_tabix {
4004 # my $config = shift;
4005 # my $chr = shift;
4006 # my $region = shift;
4007 # my $trim_regions = shift;
4008 #
4009 # my $dump_file = get_dump_file_name($config, $chr, $region, 'reg');
4010 #
4011 # #print STDERR "Reading from $dump_file\n";
4012 #
4013 # return undef unless -e $dump_file;
4014 #
4015 # debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
4016 #
4017 # my $rf_cache;
4018 #
4019 # use MIME::Base64 qw(decode_base64);
4020 # use Storable qw(thaw);
4021 #
4022 # my ($s, $e) = split /\-/, $region;
4023 # my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}};
4024 # my $regions = "";
4025 # $regions .= " $chr\:$_" for @regions;
4026 #
4027 # #print STDERR "tabix $dump_file $regions |\n";
4028 # #open IN, "tabix $dump_file $regions |";
4029 # open IN, "gzip -dc $dump_file |";
4030 # while(<IN>) {
4031 # my ($chr, $start, $end, $type, $blob) = split /\t/, $_;
4032 # next unless grep {overlap($start, $end, (split /\-/, $_))} @regions;
4033 # my $rf = thaw(decode_base64($blob));
4034 # push @{$rf_cache->{$chr}->{$type}}, $rf;
4035 # }
4036 # close IN;
4037 #
4038 # $rf_cache->{$chr}->{$_} ||= [] for @REG_FEAT_TYPES;
4039 #
4040 # return $rf_cache;
4041 #}
4042
4043
4044 # get custom annotation for a region
4045 sub cache_custom_annotation {
4046 my $config = shift;
4047 my $include_regions = shift;
4048 my $chr = shift;
4049
4050 #$include_regions = merge_regions($include_regions, $config, 1);
4051
4052 my $annotation = {};
4053
4054 my $total = scalar @{$config->{custom}} * scalar @{$include_regions->{$chr}};
4055 my $counter = 0;
4056
4057 my $max_regions_per_tabix = 1000;
4058
4059 debug("Caching custom annotations") unless defined($config->{quiet});
4060
4061 foreach my $custom(@{$config->{custom}}) {
4062
4063 my @regions = @{$include_regions->{$chr}};
4064
4065 while(scalar @regions) {
4066 my $got_features = 0;
4067
4068 my @tmp_regions = splice @regions, 0, $max_regions_per_tabix;
4069
4070 progress($config, $counter, $total);
4071 $counter += scalar @tmp_regions;
4072
4073 # some files may have e.g. chr10 instead of 10
4074 for my $tmp_chr($chr, 'chr'.$chr) {
4075
4076 # bigwig needs to use bigWigToWig utility
4077 if($custom->{format} eq 'bigwig') {
4078 foreach my $region(@tmp_regions) {
4079 my ($s, $e) = split /\-/, $region;
4080 my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$.'_'.$tmp_chr.'_'.$s.'_'.$e;
4081 my $bigwig_file = $custom->{file};
4082 my $bigwig_output = `bigWigToWig -chrom=$tmp_chr -start=$s -end=$e $bigwig_file $tmp_file 2>&1`;
4083
4084 die "\nERROR: Problem using bigwig file $bigwig_file\n$bigwig_output" if $bigwig_output;
4085 }
4086
4087 # concatenate all the files together
4088 my $string = $config->{tmpdir}.'/vep_tmp_'.$$.'_*';
4089 my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$;
4090 `cat $string > $tmp_file; rm $string`;
4091 open CUSTOM, $tmp_file
4092 or die "\nERROR: Could not read from temporary WIG file $tmp_file\n";
4093 }
4094
4095 # otherwise use tabix
4096 else {
4097 # tabix can fetch multiple regions, so construct a string
4098 my $region_string = join " ", map {$tmp_chr.':'.$_} @tmp_regions;
4099
4100 open CUSTOM, "tabix ".$custom->{file}." $region_string 2>&1 |"
4101 or die "\nERROR: Could not open tabix pipe for ".$custom->{file}."\n";
4102 }
4103
4104 # set an error flag so we don't have to check every line
4105 my $error_flag = 1;
4106
4107 while(<CUSTOM>) {
4108 chomp;
4109
4110 # check for errors
4111 if($error_flag) {
4112 die "\nERROR: Problem using annotation file ".$custom->{file}."\n$_\n" if /invalid pointer|tabix|get_intv/;
4113 $error_flag = 0;
4114 }
4115
4116 my @data = split /\t/, $_;
4117
4118 my $feature;
4119
4120 if($custom->{format} eq 'bed') {
4121 $feature = {
4122 chr => $chr,
4123 start => $data[1],
4124 end => $data[2],
4125 name => $data[3],
4126 };
4127 }
4128
4129 elsif($custom->{format} eq 'vcf') {
4130 my $tmp_vf = parse_vcf($config, $_)->[0];
4131
4132 $feature = {
4133 chr => $chr,
4134 start => $tmp_vf->{start},
4135 end => $tmp_vf->{end},
4136 name => $tmp_vf->{variation_name},
4137 };
4138 }
4139
4140 elsif($custom->{format} eq 'gff' || $custom->{format} eq 'gtf') {
4141
4142 my $name;
4143
4144 # try and get a feature name from the attributes column
4145 foreach my $attrib(split /\s*\;\s*/, $data[8]) {
4146 my ($key, $value) = split /\=/, $attrib;
4147 $name = $value if $key eq 'ID';
4148 }
4149
4150 $name ||= $data[2]."_".$data[0].":".$data[3]."-".$data[4];
4151
4152 $feature = {
4153 chr => $chr,
4154 start => $data[3],
4155 end => $data[4],
4156 name => $name,
4157 };
4158 }
4159
4160 elsif($custom->{format} eq 'bigwig') {
4161 $feature = {
4162 chr => $chr,
4163 start => $data[0],
4164 end => $data[0],
4165 name => $data[1],
4166 };
4167 }
4168
4169 if(defined($feature)) {
4170 $got_features = 1;
4171
4172 if(!defined($feature->{name}) || $custom->{coords}) {
4173 $feature->{name} = $feature->{chr}.":".$feature->{start}."-".$feature->{end};
4174 }
4175
4176 # add the feature to the cache
4177 $annotation->{$chr}->{$custom->{name}}->{$feature->{start}}->{$feature->{name}} = $feature;
4178 }
4179 }
4180 close CUSTOM;
4181
4182 # unlink temporary wig files
4183 unlink($config->{tmpdir}.'/vep_tmp_'.$$) if $custom->{format} eq 'bigwig';
4184
4185 # no need to fetch e.g. "chr21" features if just "21" worked
4186 last if $got_features;
4187 }
4188 }
4189 }
4190
4191 end_progress($config);
4192
4193 return $annotation;
4194 }
4195
4196 # builds a full cache for this species
4197 sub build_full_cache {
4198 my $config = shift;
4199
4200 my @slices;
4201
4202 if($config->{build} =~ /all/i) {
4203 @slices = @{$config->{sa}->fetch_all('toplevel')};
4204 push @slices, @{$config->{sa}->fetch_all('lrg', undef, 1, undef, 1)} if defined($config->{lrg});
4205 }
4206 else {
4207 foreach my $val(split /\,/, $config->{build}) {
4208 my @nnn = split /\-/, $val;
4209
4210 foreach my $chr($nnn[0]..$nnn[-1]) {
4211 my $slice = get_slice($config, $chr);
4212 push @slices, $slice if defined($slice);
4213 }
4214 }
4215 }
4216
4217 foreach my $slice(@slices) {
4218 my $chr = $slice->seq_region_name;
4219
4220 # check for features, we don't want a load of effectively empty dirs
4221 my $dbc = $config->{sa}->db->dbc;
4222 my $sth = $dbc->prepare("SELECT COUNT(*) FROM transcript WHERE seq_region_id = ?");
4223 $sth->execute($slice->get_seq_region_id);
4224
4225 my $count;
4226 $sth->bind_columns(\$count);
4227 $sth->fetch;
4228 $sth->finish;
4229
4230 next unless $count > 0;
4231
4232 my $regions;
4233
4234 # for progress
4235 my $region_count = int($slice->end / $config->{cache_region_size}) + 1;
4236 my $counter = 0;
4237
4238 # initial region
4239 my ($start, $end) = (1, $config->{cache_region_size});
4240
4241 debug((defined($config->{rebuild}) ? "Rebuild" : "Creat")."ing cache for chromosome $chr") unless defined($config->{quiet});
4242
4243 while($start < $slice->end) {
4244
4245 progress($config, $counter++, $region_count);
4246
4247 # store quiet status
4248 my $quiet = $config->{quiet};
4249 $config->{quiet} = 1;
4250
4251 # spoof regions
4252 $regions->{$chr} = [$start.'-'.$end];
4253
4254 # store transcripts
4255 my $tmp_cache = (defined($config->{rebuild}) ? load_dumped_transcript_cache($config, $chr, $start.'-'.$end) : cache_transcripts($config, $regions));
4256 $tmp_cache->{$chr} ||= [];
4257
4258 #(defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $start.'-'.$end) : dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end));
4259 dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end);
4260 undef $tmp_cache;
4261
4262 # store reg feats
4263 if(defined($config->{regulatory})) {
4264 my $rf_cache = cache_reg_feats($config, $regions);
4265 $rf_cache->{$chr} ||= {};
4266
4267 dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end);
4268 #(defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $rf_cache, $chr, $start.'-'.$end) : dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end));
4269 undef $rf_cache;
4270
4271 # this gets cleaned off but needs to be there for the next loop
4272 $slice->{coord_system}->{adaptor} = $config->{csa};
4273 }
4274
4275 # store variations
4276 my $variation_cache;
4277 $variation_cache->{$chr} = get_variations_in_region($config, $chr, $start.'-'.$end);
4278 $variation_cache->{$chr} ||= {};
4279
4280 dump_variation_cache($config, $variation_cache, $chr, $start.'-'.$end);
4281 undef $variation_cache;
4282
4283 # restore quiet status
4284 $config->{quiet} = $quiet;
4285
4286 # increment by cache_region_size to get next region
4287 $start += $config->{cache_region_size};
4288 $end += $config->{cache_region_size};
4289 }
4290
4291 end_progress($config);
4292
4293 undef $regions;
4294 }
4295
4296 write_cache_info($config);
4297 }
4298
4299 # write an info file that defines what is in the cache
4300 sub write_cache_info {
4301 my $config = shift;
4302
4303 my $info_file = $config->{dir}.'/info.txt';
4304
4305 open OUT, ">>$info_file" or die "ERROR: Could not write to cache info file $info_file\n";
4306
4307 print OUT "# CACHE UPDATED ".get_time()."\n";
4308
4309 foreach my $param(qw(
4310 host
4311 port
4312 user
4313 build
4314 regulatory
4315 sift
4316 polyphen
4317 )) {
4318 print OUT "$param\t".(defined $config->{$param} ? $config->{$param} : '-')."\n";
4319 }
4320
4321 # cell types
4322 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
4323 my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor();
4324 print OUT "cell_types\t".(join ",", map {$_->name} @{$cta->fetch_all});
4325 print OUT "\n";
4326 }
4327
4328 close OUT;
4329 }
4330
4331 # reads in cache info file
4332 sub read_cache_info {
4333 my $config = shift;
4334
4335 my $info_file = $config->{dir}.'/info.txt';
4336
4337 open IN, $info_file or return 0;
4338
4339 while(<IN>) {
4340 next if /^#/;
4341 chomp;
4342 my ($param, $value) = split /\t/;
4343 $config->{'cache_'.$param} = $value unless defined $value && $value eq '-';
4344 }
4345
4346 close IN;
4347
4348 return 1;
4349 }
4350
4351 # format coords for printing
4352 sub format_coords {
4353 my ($start, $end) = @_;
4354
4355 if(defined($start)) {
4356 if(defined($end)) {
4357 if($start > $end) {
4358 return $end.'-'.$start;
4359 }
4360 elsif($start == $end) {
4361 return $start;
4362 }
4363 else {
4364 return $start.'-'.$end;
4365 }
4366 }
4367 else {
4368 return $start.'-?';
4369 }
4370 }
4371 elsif(defined($end)) {
4372 return '?-'.$end;
4373 }
4374 else {
4375 return '-';
4376 }
4377 }
4378
4379
4380
4381
4382 # METHODS TO FIND CO-LOCATED / EXISTING VARIATIONS
4383 ##################################################
4384
4385 # finds an existing VF in the db
4386 sub find_existing {
4387 my $config = shift;
4388 my $new_vf = shift;
4389
4390 if(defined($config->{vfa}->db)) {
4391
4392 my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL';
4393
4394 my $sth = $config->{vfa}->db->dbc->prepare(qq{
4395 SELECT variation_name, IF(fv.variation_id IS NULL, 0, 1), seq_region_start, seq_region_end, allele_string, seq_region_strand, $maf_cols
4396 FROM variation_feature vf LEFT JOIN failed_variation fv
4397 ON vf.variation_id = fv.variation_id
4398 WHERE vf.seq_region_id = ?
4399 AND vf.seq_region_start = ?
4400 AND vf.seq_region_end = ?
4401 ORDER BY vf.source_id ASC
4402 });
4403
4404 $sth->execute($new_vf->slice->get_seq_region_id, $new_vf->start, $new_vf->end);
4405
4406 my @v;
4407 for my $i(0..7) {
4408 $v[$i] = undef;
4409 }
4410
4411 $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]);
4412
4413 my @found;
4414
4415 while($sth->fetch) {
4416 push @found, $v[0] unless is_var_novel($config, \@v, $new_vf) || $v[1] > $config->{failed};
4417 }
4418
4419 $sth->finish();
4420
4421 return (scalar @found ? join ",", @found : undef);
4422 }
4423
4424 return undef;
4425 }
4426
4427 # compare a new vf to one from the cache / DB
4428 sub is_var_novel {
4429 my $config = shift;
4430 my $existing_var = shift;
4431 my $new_var = shift;
4432
4433 my $is_novel = 1;
4434
4435 $is_novel = 0 if $existing_var->[2] == $new_var->start && $existing_var->[3] == $new_var->end;
4436
4437 if(defined($config->{check_alleles})) {
4438 my %existing_alleles;
4439
4440 $existing_alleles{$_} = 1 for split /\//, $existing_var->[4];
4441
4442 my $seen_new = 0;
4443 foreach my $a(split /\//, ($new_var->allele_string || "")) {
4444 reverse_comp(\$a) if $new_var->strand ne $existing_var->[5];
4445 $seen_new = 1 unless defined $existing_alleles{$a};
4446 }
4447
4448 $is_novel = 1 if $seen_new;
4449 }
4450
4451 return $is_novel;
4452 }
4453
4454 # check frequencies of existing var against requested params
4455 sub check_frequencies {
4456 my $config = shift;
4457 my $var_name = shift;
4458
4459 my $v = $config->{va}->fetch_by_name($var_name);
4460
4461 my $pass = 0;
4462
4463 my $freq_pop = $config->{freq_pop};
4464 my $freq_freq = $config->{freq_freq};
4465 my $freq_gt_lt = $config->{freq_gt_lt};
4466
4467 my $freq_pop_name = (split /\_/, $freq_pop)[-1];
4468 $freq_pop_name = undef if $freq_pop_name =~ /1kg|hap/;
4469
4470 delete $config->{filtered_freqs};
4471
4472 foreach my $a(@{$v->get_all_Alleles}) {
4473 next unless defined $a->{population} || defined $a->{'_population_id'};
4474 next unless defined $a->frequency;
4475 next if $a->frequency > 0.5;
4476
4477 my $pop_name = $a->population->name;
4478
4479 if($freq_pop =~ /1kg/) { next unless $pop_name =~ /^1000.+(low|phase).+/i; }
4480 if($freq_pop =~ /hap/) { next unless $pop_name =~ /^CSHL-HAPMAP/i; }
4481 if($freq_pop =~ /any/) { next unless $pop_name =~ /^(CSHL-HAPMAP)|(1000.+(low|phase).+)/i; }
4482 if(defined $freq_pop_name) { next unless $pop_name =~ /$freq_pop_name/i; }
4483
4484 $pass = 1 if $a->frequency >= $freq_freq and $freq_gt_lt eq 'gt';
4485 $pass = 1 if $a->frequency <= $freq_freq and $freq_gt_lt eq 'lt';
4486
4487 $pop_name =~ s/\:/\_/g;
4488 push @{$config->{filtered_freqs}}, $pop_name.':'.$a->frequency;
4489
4490 #warn "Comparing allele ", $a->allele, " ", $a->frequency, " for $var_name in population ", $a->population->name, " PASS $pass";
4491 }
4492
4493 return 0 if $config->{freq_filter} eq 'exclude' and $pass == 1;
4494 return 0 if $config->{freq_filter} eq 'include' and $pass == 0;
4495 return 1;
4496 }
4497
4498 # gets all variations in a region
4499 sub get_variations_in_region {
4500 my $config = shift;
4501 my $chr = shift;
4502 my $region = shift;
4503
4504 my ($start, $end) = split /\-/, $region;
4505
4506 my %variations;
4507
4508 if(defined($config->{vfa}->db)) {
4509 my $sr_cache = $config->{seq_region_cache};
4510
4511 if(!defined($sr_cache)) {
4512 $sr_cache = cache_seq_region_ids($config);
4513 $config->{seq_region_cache} = $sr_cache;
4514 }
4515
4516 # no seq_region_id?
4517 return {} unless defined($sr_cache) && defined($sr_cache->{$chr});
4518
4519 my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL';
4520
4521 my $sth = $config->{vfa}->db->dbc->prepare(qq{
4522 SELECT vf.variation_name, IF(fv.variation_id IS NULL, 0, 1), vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand, $maf_cols
4523 FROM variation_feature vf
4524 LEFT JOIN failed_variation fv ON fv.variation_id = vf.variation_id
4525 WHERE vf.seq_region_id = ?
4526 AND vf.seq_region_start >= ?
4527 AND vf.seq_region_start <= ?
4528 });
4529
4530 $sth->execute($sr_cache->{$chr}, $start, $end);
4531
4532 my @v;
4533 for my $i(0..7) {
4534 $v[$i] = undef;
4535 }
4536
4537 $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]);
4538
4539 while($sth->fetch) {
4540 my @v_copy = @v;
4541 push @{$variations{$v[2]}}, \@v_copy;
4542 }
4543
4544 $sth->finish();
4545 }
4546
4547 return \%variations;
4548 }
4549
4550 sub cache_seq_region_ids {
4551 my $config = shift;
4552
4553 my (%cache, $chr, $id);
4554
4555 my $sth = $config->{vfa}->db->dbc->prepare(qq{
4556 SELECT seq_region_id, name FROM seq_region
4557 });
4558
4559 $sth->execute();
4560 $sth->bind_columns(\$id, \$chr);
4561 $cache{$chr} = $id while $sth->fetch();
4562 $sth->finish;
4563
4564 return \%cache;
4565 }
4566
4567 sub have_maf_cols {
4568 my $config = shift;
4569
4570 if(!defined($config->{have_maf_cols})) {
4571 my $sth = $config->{vfa}->db->dbc->prepare(qq{
4572 DESCRIBE variation_feature
4573 });
4574
4575 $sth->execute();
4576 my @cols = map {$_->[0]} @{$sth->fetchall_arrayref};
4577 $sth->finish();
4578
4579 $config->{have_maf_cols} = 0;
4580 $config->{have_maf_cols} = 1 if grep {$_ eq 'minor_allele'} @cols;
4581 }
4582
4583 return $config->{have_maf_cols};
4584 }
4585
4586 sub merge_hashes {
4587 my ($x, $y) = @_;
4588
4589 foreach my $k (keys %$y) {
4590 if (!defined($x->{$k})) {
4591 $x->{$k} = $y->{$k};
4592 } else {
4593 if(ref($x->{$k}) eq 'ARRAY') {
4594 $x->{$k} = merge_arrays($x->{$k}, $y->{$k});
4595 }
4596 elsif(ref($x->{$k}) eq 'HASH') {
4597 $x->{$k} = merge_hashes($x->{$k}, $y->{$k});
4598 }
4599 else {
4600 $x->{$k} = $y->{$k};
4601 }
4602 }
4603 }
4604 return $x;
4605 }
4606
4607 sub merge_arrays {
4608 my ($x, $y) = @_;
4609
4610 my %tmp = map {$_ => 1} (@$x, @$y);
4611
4612 return [keys %tmp];
4613 }
4614
4615
4616
4617
4618 # DEBUG AND STATUS METHODS
4619 ##########################
4620
4621 # gets time
4622 sub get_time() {
4623 my @time = localtime(time());
4624
4625 # increment the month (Jan = 0)
4626 $time[4]++;
4627
4628 # add leading zeroes as required
4629 for my $i(0..4) {
4630 $time[$i] = "0".$time[$i] if $time[$i] < 10;
4631 }
4632
4633 # put the components together in a string
4634 my $time =
4635 ($time[5] + 1900)."-".
4636 $time[4]."-".
4637 $time[3]." ".
4638 $time[2].":".
4639 $time[1].":".
4640 $time[0];
4641
4642 return $time;
4643 }
4644
4645 # prints debug output with time
4646 sub debug {
4647 my $text = (@_ ? (join "", @_) : "No message");
4648 my $time = get_time;
4649
4650 print $time." - ".$text.($text =~ /\n$/ ? "" : "\n");
4651 }
4652
4653 # finds out memory usage
4654 sub memory {
4655 my @mem;
4656
4657 open IN, "ps -o rss,vsz $$ |";
4658 while(<IN>) {
4659 next if $_ =~ /rss/i;
4660 chomp;
4661 @mem = split;
4662 }
4663 close IN;
4664
4665 return \@mem;
4666 }
4667
4668 sub mem_diff {
4669 my $config = shift;
4670 my $mem = memory();
4671 my @diffs;
4672
4673 if(defined($config->{memory})) {
4674 for my $i(0..(scalar @{$config->{memory}} - 1)) {
4675 push @diffs, $mem->[$i] - $config->{memory}->[$i];
4676 }
4677 }
4678 else {
4679 @diffs = @$mem;
4680 }
4681
4682 $config->{memory} = $mem;
4683
4684 return \@diffs;
4685 }
4686
4687 # update or initiate progress bar
4688 sub progress {
4689 my ($config, $i, $total) = @_;
4690
4691 return if defined($config->{quiet}) || defined($config->{no_progress});
4692
4693 my $width = $config->{terminal_width} || 60;
4694 my $percent = int(($i/$total) * 100);
4695 my $numblobs = int((($i/$total) * $width) - 2);
4696
4697 # this ensures we're not writing to the terminal too much
4698 return if(defined($config->{prev_prog})) && $numblobs.'-'.$percent eq $config->{prev_prog};
4699 $config->{prev_prog} = $numblobs.'-'.$percent;
4700
4701 printf("\r% -${width}s% 1s% 10s", '['.('=' x $numblobs).($numblobs == $width - 2 ? '=' : '>'), ']', "[ " . $percent . "% ]");
4702 }
4703
4704 # end progress bar
4705 sub end_progress {
4706 my $config = shift;
4707 return if defined($config->{quiet}) || defined($config->{no_progress});
4708 progress($config, 1,1);
4709 print "\n";
4710 delete $config->{prev_prog};
4711 }
4712
4713 1;