Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/VEP.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 # EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence | |
22 # | |
23 # | |
24 | |
25 =head1 NAME | |
26 | |
27 Bio::EnsEMBL::Variation::Utils::VEP - Methods used by the Variant Effect Predictor | |
28 | |
29 =head1 SYNOPSIS | |
30 | |
31 use Bio::EnsEMBL::Variation::Utils::VEP qw(configure); | |
32 | |
33 my $config = configure(); | |
34 | |
35 =head1 METHODS | |
36 | |
37 =cut | |
38 | |
39 | |
40 use strict; | |
41 use warnings; | |
42 | |
43 package Bio::EnsEMBL::Variation::Utils::VEP; | |
44 | |
45 # module list | |
46 use Getopt::Long; | |
47 use FileHandle; | |
48 use File::Path qw(make_path); | |
49 use Storable qw(nstore_fd fd_retrieve freeze thaw); | |
50 use Scalar::Util qw(weaken); | |
51 use Digest::MD5 qw(md5_hex); | |
52 | |
53 use Bio::EnsEMBL::Registry; | |
54 use Bio::EnsEMBL::Variation::VariationFeature; | |
55 use Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor; | |
56 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT overlap); | |
57 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
58 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code); | |
59 use Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3; | |
60 use Bio::EnsEMBL::Variation::StructuralVariationFeature; | |
61 use Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor; | |
62 use Bio::EnsEMBL::Variation::TranscriptStructuralVariation; | |
63 | |
64 # we need to manually include all these modules for caching to work | |
65 use Bio::EnsEMBL::CoordSystem; | |
66 use Bio::EnsEMBL::Transcript; | |
67 use Bio::EnsEMBL::Translation; | |
68 use Bio::EnsEMBL::Exon; | |
69 use Bio::EnsEMBL::ProteinFeature; | |
70 use Bio::EnsEMBL::Analysis; | |
71 use Bio::EnsEMBL::DBSQL::GeneAdaptor; | |
72 use Bio::EnsEMBL::DBSQL::SliceAdaptor; | |
73 use Bio::EnsEMBL::DBSQL::TranslationAdaptor; | |
74 use Bio::EnsEMBL::DBSQL::TranscriptAdaptor; | |
75 use Bio::EnsEMBL::DBSQL::MetaContainer; | |
76 use Bio::EnsEMBL::DBSQL::CoordSystemAdaptor; | |
77 | |
78 use Exporter; | |
79 use vars qw(@ISA @EXPORT_OK); | |
80 @ISA = qw(Exporter); | |
81 | |
82 # open socket pairs for cross-process comms | |
83 use Socket; | |
84 socketpair(CHILD, PARENT, AF_UNIX, SOCK_STREAM, PF_UNSPEC) or die "ERROR: Failed to open socketpair: $!"; | |
85 CHILD->autoflush(1); | |
86 PARENT->autoflush(1); | |
87 | |
88 @EXPORT_OK = qw( | |
89 &parse_line | |
90 &vf_to_consequences | |
91 &validate_vf | |
92 &read_cache_info | |
93 &dump_adaptor_cache | |
94 &load_dumped_adaptor_cache | |
95 &load_dumped_variation_cache | |
96 &get_all_consequences | |
97 &get_slice | |
98 &build_slice_cache | |
99 &build_full_cache | |
100 ®ions_from_hash | |
101 &get_time | |
102 &debug | |
103 &convert_to_vcf | |
104 &progress | |
105 &end_progress | |
106 @REG_FEAT_TYPES | |
107 @OUTPUT_COLS | |
108 @VEP_WEB_CONFIG | |
109 %FILTER_SHORTCUTS | |
110 ); | |
111 | |
112 our @OUTPUT_COLS = qw( | |
113 Uploaded_variation | |
114 Location | |
115 Allele | |
116 Gene | |
117 Feature | |
118 Feature_type | |
119 Consequence | |
120 cDNA_position | |
121 CDS_position | |
122 Protein_position | |
123 Amino_acids | |
124 Codons | |
125 Existing_variation | |
126 Extra | |
127 ); | |
128 | |
129 our @REG_FEAT_TYPES = qw( | |
130 RegulatoryFeature | |
131 MotifFeature | |
132 ); | |
133 | |
134 our @VEP_WEB_CONFIG = qw( | |
135 format | |
136 check_existing | |
137 coding_only | |
138 core_type | |
139 hgnc | |
140 protein | |
141 hgvs | |
142 terms | |
143 check_frequency | |
144 freq_filter | |
145 freq_gt_lt | |
146 freq_freq | |
147 freq_pop | |
148 sift | |
149 polyphen | |
150 regulatory | |
151 ); | |
152 | |
153 our %FILTER_SHORTCUTS = ( | |
154 upstream => { | |
155 '5KB_upstream_variant' => 1, | |
156 '2KB_upstream_variant' => 1, | |
157 }, | |
158 downstream => { | |
159 '5KB_downstream_variant' => 1, | |
160 '2KB_downstream_variant' => 1, | |
161 '500B_downstream_variant' => 1, | |
162 }, | |
163 utr => { | |
164 '5_prime_UTR_variant' => 1, | |
165 '3_prime_UTR_variant' => 1, | |
166 }, | |
167 splice => { | |
168 splice_donor_variant => 1, | |
169 splice_acceptor_variant => 1, | |
170 splice_region_variant => 1, | |
171 }, | |
172 coding_change => { | |
173 stop_lost => 1, | |
174 stop_gained => 1, | |
175 missense_variant => 1, | |
176 frameshift_variant => 1, | |
177 inframe_insertion => 1, | |
178 inframe_deletion => 1, | |
179 }, | |
180 regulatory => { | |
181 regulatory_region_variant => 1, | |
182 TF_binding_site_variant => 1, | |
183 }, | |
184 ); | |
185 | |
186 # parses a line of input, returns VF object(s) | |
187 sub parse_line { | |
188 my $config = shift; | |
189 my $line = shift; | |
190 | |
191 # find out file format - will only do this on first line | |
192 if(!defined($config->{format}) || (defined($config->{format}) && $config->{format} eq 'guess')) { | |
193 $config->{format} = &detect_format($line); | |
194 debug("Detected format of input file as ", $config->{format}) unless defined($config->{quiet}); | |
195 | |
196 # HGVS and ID formats need DB | |
197 die("ERROR: Can't use ".uc($config->{format})." format in offline mode") if $config->{format} =~ /id|hgvs/ && defined($config->{offline}); | |
198 | |
199 # force certain options if format is VEP output | |
200 if($config->{format} eq 'vep') { | |
201 $config->{no_consequence} = 1; | |
202 delete $config->{regulatory}; | |
203 debug("Forcing no consequence calculation") unless defined($config->{quiet}); | |
204 } | |
205 } | |
206 | |
207 # check that format is vcf when using --individual | |
208 die("ERROR: --individual only compatible with VCF input files\n") if defined($config->{individual}) && $config->{format} ne 'vcf'; | |
209 | |
210 my $parse_method = 'parse_'.$config->{format}; | |
211 $parse_method =~ s/vep_//; | |
212 my $method_ref = \&$parse_method; | |
213 | |
214 my $vfs = &$method_ref($config, $line); | |
215 | |
216 $vfs = add_lrg_mappings($config, $vfs) if defined($config->{lrg}); | |
217 | |
218 return $vfs; | |
219 } | |
220 | |
221 # sub-routine to detect format of input | |
222 sub detect_format { | |
223 my $line = shift; | |
224 my @data = split /\s+/, $line; | |
225 | |
226 # HGVS: ENST00000285667.3:c.1047_1048insC | |
227 if ( | |
228 scalar @data == 1 && | |
229 $data[0] =~ /^([^\:]+)\:.*?([cgmrp]?)\.?([\*\-0-9]+.*)$/i | |
230 ) { | |
231 return 'hgvs'; | |
232 } | |
233 | |
234 # variant identifier: rs123456 | |
235 elsif ( | |
236 scalar @data == 1 | |
237 ) { | |
238 return 'id'; | |
239 } | |
240 | |
241 # VCF: 20 14370 rs6054257 G A 29 0 NS=58;DP=258;AF=0.786;DB;H2 GT:GQ:DP:HQ | |
242 elsif ( | |
243 $data[0] =~ /(chr)?\w+/ && | |
244 $data[1] =~ /^\d+$/ && | |
245 $data[3] =~ /^[ACGTN-]+$/i && | |
246 $data[4] =~ /^([\.ACGTN-]+\,?)+$/i | |
247 ) { | |
248 return 'vcf'; | |
249 } | |
250 | |
251 # pileup: chr1 60 T A | |
252 elsif ( | |
253 $data[0] =~ /(chr)?\w+/ && | |
254 $data[1] =~ /^\d+$/ && | |
255 $data[2] =~ /^[\*ACGTN-]+$/i && | |
256 $data[3] =~ /^[\*ACGTNRYSWKM\+\/-]+$/i | |
257 ) { | |
258 return 'pileup'; | |
259 } | |
260 | |
261 # ensembl: 20 14370 14370 A/G + | |
262 elsif ( | |
263 $data[0] =~ /\w+/ && | |
264 $data[1] =~ /^\d+$/ && | |
265 $data[2] =~ /^\d+$/ && | |
266 $data[3] =~ /[ACGTN-]+\/[ACGTN-]+/i | |
267 ) { | |
268 return 'ensembl'; | |
269 } | |
270 | |
271 # vep output: ID 1:142849179 - - - - INTERGENIC | |
272 elsif ( | |
273 $data[0] =~ /\w+/ && | |
274 $data[1] =~ /^\w+?\:\d+(\-\d+)*$/ && | |
275 scalar @data == 14 | |
276 ) { | |
277 return 'vep'; | |
278 } | |
279 | |
280 else { | |
281 die("ERROR: Could not detect input file format\n"); | |
282 } | |
283 } | |
284 | |
285 # parse a line of Ensembl format input into a variation feature object | |
286 sub parse_ensembl { | |
287 my $config = shift; | |
288 my $line = shift; | |
289 | |
290 my ($chr, $start, $end, $allele_string, $strand, $var_name) = split /\s+/, $line; | |
291 | |
292 my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ | |
293 start => $start, | |
294 end => $end, | |
295 allele_string => $allele_string, | |
296 strand => $strand, | |
297 map_weight => 1, | |
298 adaptor => $config->{vfa}, | |
299 variation_name => $var_name, | |
300 chr => $chr, | |
301 }); | |
302 | |
303 return [$vf]; | |
304 } | |
305 | |
306 # parse a line of VCF input into a variation feature object | |
307 sub parse_vcf { | |
308 my $config = shift; | |
309 my $line = shift; | |
310 | |
311 my @data = split /\s+/, $line; | |
312 | |
313 # non-variant | |
314 my $non_variant = 0; | |
315 | |
316 if($data[4] eq '.') { | |
317 if(defined($config->{allow_non_variant})) { | |
318 $non_variant = 1; | |
319 } | |
320 else { | |
321 return []; | |
322 } | |
323 } | |
324 | |
325 # get relevant data | |
326 my ($chr, $start, $end, $ref, $alt) = ($data[0], $data[1], $data[1], $data[3], $data[4]); | |
327 | |
328 # some VCF files have a GRCh37 pos defined in GP flag in INFO column | |
329 # if user has requested, we can use that as the position instead | |
330 if(defined $config->{gp}) { | |
331 $chr = undef; | |
332 $start = undef; | |
333 | |
334 foreach my $pair(split /\;/, $data[7]) { | |
335 my ($key, $value) = split /\=/, $pair; | |
336 if($key eq 'GP') { | |
337 ($chr, $start) = split /\:/, $value; | |
338 $end = $start; | |
339 } | |
340 } | |
341 | |
342 unless(defined($chr) and defined($start)) { | |
343 warn "No GP flag found in INFO column" unless defined $config->{quiet}; | |
344 return []; | |
345 } | |
346 } | |
347 | |
348 # adjust end coord | |
349 $end += (length($ref) - 1); | |
350 | |
351 # structural variation | |
352 if((defined($data[7]) && $data[7] =~ /SVTYPE/) || $alt =~ /\<|\[|\]|\>/) { | |
353 | |
354 # parse INFO field | |
355 my %info = (); | |
356 | |
357 foreach my $bit(split /\;/, $data[7]) { | |
358 my ($key, $value) = split /\=/, $bit; | |
359 $info{$key} = $value; | |
360 } | |
361 | |
362 # like indels, SVs have the base before included for reference | |
363 $start++; | |
364 | |
365 # work out the end coord | |
366 if(defined($info{END})) { | |
367 $end = $info{END}; | |
368 } | |
369 elsif(defined($info{SVLEN})) { | |
370 $end = $start + abs($info{SVLEN}) - 1; | |
371 } | |
372 | |
373 # check for imprecise breakpoints | |
374 my ($min_start, $max_start, $min_end, $max_end); | |
375 | |
376 if(defined($info{CIPOS})) { | |
377 my ($low, $high) = split /\,/, $info{CIPOS}; | |
378 $min_start = $start + $low; | |
379 $max_start = $start + $high; | |
380 } | |
381 | |
382 if(defined($info{CIEND})) { | |
383 my ($low, $high) = split /\,/, $info{CIEND}; | |
384 $min_end = $end + $low; | |
385 $max_end = $end + $high; | |
386 } | |
387 | |
388 # get type | |
389 my $type = $info{SVTYPE}; | |
390 my $so_term; | |
391 | |
392 if(defined($type)) { | |
393 # convert to SO term | |
394 my %terms = ( | |
395 INS => 'insertion', | |
396 DEL => 'deletion', | |
397 TDUP => 'tandem_duplication', | |
398 DUP => 'duplication' | |
399 ); | |
400 | |
401 $so_term = defined $terms{$type} ? $terms{$type} : $type; | |
402 } | |
403 | |
404 my $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new_fast({ | |
405 start => $start, | |
406 inner_start => $max_start, | |
407 outer_start => $min_start, | |
408 end => $end, | |
409 inner_end => $min_end, | |
410 outer_end => $max_end, | |
411 strand => 1, | |
412 adaptor => $config->{svfa}, | |
413 variation_name => $data[2] eq '.' ? undef : $data[2], | |
414 chr => $chr, | |
415 class_SO_term => $so_term, | |
416 }); | |
417 | |
418 return [$svf]; | |
419 } | |
420 | |
421 # normal variation | |
422 else { | |
423 # find out if any of the alt alleles make this an insertion or a deletion | |
424 my ($is_indel, $is_sub, $ins_count, $total_count); | |
425 foreach my $alt_allele(split /\,/, $alt) { | |
426 $is_indel = 1 if $alt_allele =~ /D|I/; | |
427 $is_indel = 1 if length($alt_allele) != length($ref); | |
428 $is_sub = 1 if length($alt_allele) == length($ref); | |
429 $ins_count++ if length($alt_allele) > length($ref); | |
430 $total_count++; | |
431 } | |
432 | |
433 # multiple alt alleles? | |
434 if($alt =~ /\,/) { | |
435 if($is_indel) { | |
436 my @alts; | |
437 | |
438 if($alt =~ /D|I/) { | |
439 foreach my $alt_allele(split /\,/, $alt) { | |
440 # deletion (VCF <4) | |
441 if($alt_allele =~ /D/) { | |
442 push @alts, '-'; | |
443 } | |
444 | |
445 elsif($alt_allele =~ /I/) { | |
446 $alt_allele =~ s/^I//g; | |
447 push @alts, $alt_allele; | |
448 } | |
449 } | |
450 } | |
451 | |
452 else { | |
453 $ref = substr($ref, 1) || '-'; | |
454 $start++; | |
455 | |
456 foreach my $alt_allele(split /\,/, $alt) { | |
457 $alt_allele = substr($alt_allele, 1); | |
458 $alt_allele = '-' if $alt_allele eq ''; | |
459 push @alts, $alt_allele; | |
460 } | |
461 } | |
462 | |
463 $alt = join "/", @alts; | |
464 } | |
465 | |
466 else { | |
467 # for substitutions we just need to replace ',' with '/' in $alt | |
468 $alt =~ s/\,/\//g; | |
469 } | |
470 } | |
471 | |
472 elsif($is_indel) { | |
473 # deletion (VCF <4) | |
474 if($alt =~ /D/) { | |
475 my $num_deleted = $alt; | |
476 $num_deleted =~ s/\D+//g; | |
477 $end += $num_deleted - 1; | |
478 $alt = "-"; | |
479 $ref .= ("N" x ($num_deleted - 1)) unless length($ref) > 1; | |
480 } | |
481 | |
482 # insertion (VCF <4) | |
483 elsif($alt =~ /I/) { | |
484 $ref = '-'; | |
485 $alt =~ s/^I//g; | |
486 $start++; | |
487 } | |
488 | |
489 # insertion or deletion (VCF 4+) | |
490 elsif(substr($ref, 0, 1) eq substr($alt, 0, 1)) { | |
491 | |
492 # chop off first base | |
493 $ref = substr($ref, 1) || '-'; | |
494 $alt = substr($alt, 1) || '-'; | |
495 | |
496 $start++; | |
497 } | |
498 } | |
499 | |
500 # create VF object | |
501 my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ | |
502 start => $start, | |
503 end => $end, | |
504 allele_string => $non_variant ? $ref : $ref.'/'.$alt, | |
505 strand => 1, | |
506 map_weight => 1, | |
507 adaptor => $config->{vfa}, | |
508 variation_name => $data[2] eq '.' ? undef : $data[2], | |
509 chr => $chr, | |
510 }); | |
511 | |
512 # flag as non-variant | |
513 $vf->{non_variant} = 1 if $non_variant; | |
514 | |
515 # individuals? | |
516 if(defined($config->{individual})) { | |
517 my @alleles = split /\//, $ref.'/'.$alt; | |
518 | |
519 my @return; | |
520 | |
521 foreach my $ind(keys %{$config->{ind_cols}}) { | |
522 | |
523 # get alleles present in this individual | |
524 my @bits; | |
525 my $gt = (split /\:/, $data[$config->{ind_cols}->{$ind}])[0]; | |
526 | |
527 my $phased = ($gt =~ /\|/ ? 1 : 0); | |
528 | |
529 foreach my $bit(split /\||\/|\\/, $gt) { | |
530 push @bits, $alleles[$bit] unless $bit eq '.'; | |
531 } | |
532 | |
533 # shallow copy VF | |
534 my $vf_copy = { %$vf }; | |
535 bless $vf_copy, ref($vf); | |
536 | |
537 # get non-refs | |
538 my %non_ref = map {$_ => 1} grep {$_ ne $ref} @bits; | |
539 | |
540 # construct allele_string | |
541 if(scalar keys %non_ref) { | |
542 $vf_copy->{allele_string} = $ref."/".(join "/", keys %non_ref); | |
543 } | |
544 else { | |
545 $vf_copy->{allele_string} = $ref; | |
546 $vf_copy->{non_variant} = 1; | |
547 } | |
548 | |
549 # store phasing info | |
550 $vf_copy->{phased} = defined($config->{phased} ? 1 : $phased); | |
551 | |
552 # store GT | |
553 $vf_copy->{genotype} = \@bits; | |
554 | |
555 # store individual name | |
556 $vf_copy->{individual} = $ind; | |
557 | |
558 push @return, $vf_copy; | |
559 } | |
560 | |
561 return \@return; | |
562 } | |
563 else { | |
564 return [$vf]; | |
565 } | |
566 } | |
567 } | |
568 | |
569 # parse a line of pileup input into variation feature objects | |
570 sub parse_pileup { | |
571 my $config = shift; | |
572 my $line = shift; | |
573 | |
574 my @data = split /\s+/, $line; | |
575 | |
576 # pileup can produce more than one VF per line | |
577 my @return; | |
578 | |
579 # normal variant | |
580 if($data[2] ne "*"){ | |
581 my $var; | |
582 | |
583 if($data[3] =~ /^[A|C|G|T]$/) { | |
584 $var = $data[3]; | |
585 } | |
586 else { | |
587 ($var = unambiguity_code($data[3])) =~ s/$data[2]//ig; | |
588 } | |
589 | |
590 for my $alt(split //, $var){ | |
591 push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ | |
592 start => $data[1], | |
593 end => $data[1], | |
594 allele_string => $data[2].'/'.$alt, | |
595 strand => 1, | |
596 map_weight => 1, | |
597 adaptor => $config->{vfa}, | |
598 chr => $data[0], | |
599 }); | |
600 } | |
601 } | |
602 | |
603 # in/del | |
604 else { | |
605 my %tmp_hash = map {$_ => 1} split /\//, $data[3]; | |
606 my @genotype = keys %tmp_hash; | |
607 | |
608 foreach my $allele(@genotype){ | |
609 if(substr($allele,0,1) eq "+") { #ins | |
610 push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ | |
611 start => $data[1] + 1, | |
612 end => $data[1], | |
613 allele_string => '-/'.substr($allele, 1), | |
614 strand => 1, | |
615 map_weight => 1, | |
616 adaptor => $config->{vfa}, | |
617 chr => $data[0], | |
618 }); | |
619 } | |
620 elsif(substr($allele,0,1) eq "-"){ #del | |
621 push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({ | |
622 start => $data[1] + 1, | |
623 end => $data[1] + length(substr($allele, 1)), | |
624 allele_string => substr($allele, 1).'/-', | |
625 strand => 1, | |
626 map_weight => 1, | |
627 adaptor => $config->{vfa}, | |
628 chr => $data[0], | |
629 }); | |
630 } | |
631 elsif($allele ne "*"){ | |
632 warn("WARNING: invalid pileup indel genotype: $line\n") unless defined $config->{quiet}; | |
633 } | |
634 } | |
635 } | |
636 | |
637 return \@return; | |
638 } | |
639 | |
640 # parse a line of HGVS input into a variation feature object | |
641 sub parse_hgvs { | |
642 my $config = shift; | |
643 my $line = shift; | |
644 | |
645 my $vf; | |
646 | |
647 # not all hgvs notations are supported yet, so we have to wrap it in an eval | |
648 eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{sa}, $config->{ta}) }; | |
649 | |
650 if((!defined($vf) || (defined $@ && length($@) > 1)) && defined($config->{coordinator})) { | |
651 eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{ofsa}, $config->{ofta}) }; | |
652 } | |
653 | |
654 if(!defined($vf) || (defined $@ && length($@) > 1)) { | |
655 warn("WARNING: Unable to parse HGVS notation \'$line\'\n$@") unless defined $config->{quiet}; | |
656 return []; | |
657 } | |
658 | |
659 # get whole chromosome slice | |
660 my $slice = $vf->slice->adaptor->fetch_by_region($vf->slice->coord_system->name, $vf->slice->seq_region_name); | |
661 | |
662 $vf = $vf->transfer($slice); | |
663 | |
664 # name it after the HGVS | |
665 $vf->{variation_name} = $line; | |
666 | |
667 # add chr attrib | |
668 $vf->{chr} = $vf->slice->seq_region_name; | |
669 | |
670 return [$vf]; | |
671 } | |
672 | |
673 # parse a variation identifier e.g. a dbSNP rsID | |
674 sub parse_id { | |
675 my $config = shift; | |
676 my $line = shift; | |
677 | |
678 my $v_obj = $config->{va}->fetch_by_name($line); | |
679 | |
680 return [] unless defined $v_obj; | |
681 | |
682 my @vfs = @{$v_obj->get_all_VariationFeatures}; | |
683 delete $_->{dbID} for @vfs; | |
684 delete $_->{overlap_consequences} for @vfs; | |
685 $_->{chr} = $_->seq_region_name for @vfs; | |
686 | |
687 return \@vfs; | |
688 } | |
689 | |
690 # parse a line of VEP output | |
691 sub parse_vep { | |
692 my $config = shift; | |
693 my $line = shift; | |
694 | |
695 my @data = split /\t/, $line; | |
696 | |
697 my ($chr, $start, $end) = split /\:|\-/, $data[1]; | |
698 $end ||= $start; | |
699 | |
700 # might get allele string from ID | |
701 my $allele_string; | |
702 | |
703 if($data[0] =~ /^\w\_\w\_\w$/) { | |
704 my @split = split /\_/, $data[0]; | |
705 $allele_string = $split[-1] if $split[-1] =~ /[ACGTN-]+\/[ACGTN-]+/; | |
706 } | |
707 | |
708 $allele_string ||= 'N/'.($data[6] =~ /intergenic/ ? 'N' : $data[2]); | |
709 | |
710 my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ | |
711 start => $start, | |
712 end => $end, | |
713 allele_string => $allele_string, | |
714 strand => 1, | |
715 map_weight => 1, | |
716 adaptor => $config->{vfa}, | |
717 chr => $chr, | |
718 variation_name => $data[0], | |
719 }); | |
720 | |
721 return [$vf]; | |
722 } | |
723 | |
724 | |
725 | |
726 # converts to VCF format | |
727 sub convert_to_vcf { | |
728 my $config = shift; | |
729 my $vf = shift; | |
730 | |
731 # look for imbalance in the allele string | |
732 my %allele_lengths; | |
733 my @alleles = split /\//, $vf->allele_string; | |
734 | |
735 foreach my $allele(@alleles) { | |
736 $allele =~ s/\-//g; | |
737 $allele_lengths{length($allele)} = 1; | |
738 } | |
739 | |
740 # in/del/unbalanced | |
741 if(scalar keys %allele_lengths > 1) { | |
742 | |
743 # we need the ref base before the variation | |
744 # default to N in case we can't get it | |
745 my $prev_base = 'N'; | |
746 | |
747 unless(defined($config->{cache})) { | |
748 my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1); | |
749 $prev_base = $slice->seq if defined($slice); | |
750 } | |
751 | |
752 for my $i(0..$#alleles) { | |
753 $alleles[$i] =~ s/\-//g; | |
754 $alleles[$i] = $prev_base.$alleles[$i]; | |
755 } | |
756 | |
757 return [ | |
758 $vf->{chr} || $vf->seq_region_name, | |
759 $vf->start - 1, | |
760 $vf->variation_name, | |
761 shift @alleles, | |
762 (join ",", @alleles), | |
763 '.', '.', '.' | |
764 ]; | |
765 | |
766 } | |
767 | |
768 # balanced sub | |
769 else { | |
770 return [ | |
771 $vf->{chr} || $vf->seq_region_name, | |
772 $vf->start, | |
773 $vf->variation_name, | |
774 shift @alleles, | |
775 (join ",", @alleles), | |
776 '.', '.', '.' | |
777 ]; | |
778 } | |
779 } | |
780 | |
781 | |
782 # tries to map a VF to the LRG coordinate system | |
783 sub add_lrg_mappings { | |
784 my $config = shift; | |
785 my $vfs = shift; | |
786 | |
787 my @new_vfs; | |
788 | |
789 foreach my $vf(@$vfs) { | |
790 | |
791 # add the unmapped VF to the array | |
792 push @new_vfs, $vf; | |
793 | |
794 # make sure the VF has an attached slice | |
795 $vf->{slice} ||= get_slice($config, $vf->{chr}); | |
796 next unless defined($vf->{slice}); | |
797 | |
798 # transform LRG <-> chromosome | |
799 my $new_vf = $vf->transform($vf->{slice}->coord_system->name eq 'lrg' ? 'chromosome' : 'lrg'); | |
800 | |
801 # add it to the array if transformation worked | |
802 if(defined($new_vf)) { | |
803 | |
804 # update new VF's chr entry | |
805 $new_vf->{chr} = $new_vf->seq_region_name; | |
806 push @new_vfs, $new_vf; | |
807 } | |
808 } | |
809 | |
810 return \@new_vfs; | |
811 } | |
812 | |
813 | |
814 # wrapper for whole_genome_fetch and vf_to_consequences | |
815 # takes config and a listref of VFs, returns listref of line hashes for printing | |
816 sub get_all_consequences { | |
817 my $config = shift; | |
818 my $listref = shift; | |
819 | |
820 if ($config->{extra}) { | |
821 eval "use Plugin qw($config);" | |
822 } | |
823 | |
824 # initialize caches | |
825 $config->{$_.'_cache'} ||= {} for qw(tr rf slice); | |
826 | |
827 # build hash | |
828 my %vf_hash; | |
829 push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref; | |
830 | |
831 my @non_variant = grep {defined($_->{non_variant})} @$listref; | |
832 debug("Skipping ".(scalar @non_variant)." non-variant loci\n") unless defined($config->{quiet}); | |
833 | |
834 # get regions | |
835 my $regions = ®ions_from_hash($config, \%vf_hash); | |
836 my $trim_regions = $regions; | |
837 | |
838 # get trimmed regions - allows us to discard out-of-range transcripts | |
839 # when using cache | |
840 #if(defined($config->{cache})) { | |
841 # my $tmp = $config->{cache}; | |
842 # delete $config->{cache}; | |
843 # $trim_regions = regions_from_hash($config, \%vf_hash); | |
844 # $config->{cache} = $tmp; | |
845 #} | |
846 | |
847 # prune caches | |
848 prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr}); | |
849 prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf}); | |
850 | |
851 # get chr list | |
852 my %chrs = map {$_->{chr} => 1} @$listref; | |
853 | |
854 my $fetched_tr_count = 0; | |
855 $fetched_tr_count = fetch_transcripts($config, $regions, $trim_regions) | |
856 unless defined($config->{no_consequences}); | |
857 | |
858 my $fetched_rf_count = 0; | |
859 $fetched_rf_count = fetch_regfeats($config, $regions, $trim_regions) | |
860 if defined($config->{regulatory}) | |
861 && !defined($config->{no_consequences}); | |
862 | |
863 # check we can use MIME::Base64 | |
864 if(defined($config->{fork})) { | |
865 eval q{ use MIME::Base64; }; | |
866 | |
867 if($@) { | |
868 debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet}); | |
869 delete $config->{fork}; | |
870 } | |
871 } | |
872 | |
873 my (@temp_array, @return, @pids); | |
874 if(defined($config->{fork})) { | |
875 my $size = scalar @$listref; | |
876 | |
877 while(my $tmp_vf = shift @$listref) { | |
878 push @temp_array, $tmp_vf; | |
879 | |
880 # fork | |
881 if(scalar @temp_array >= ($size / $config->{fork}) || scalar @$listref == 0) { | |
882 | |
883 my $pid = fork; | |
884 | |
885 if(!defined($pid)) { | |
886 debug("WARNING: Failed to fork - will attempt to continue without forking") unless defined($config->{quiet}); | |
887 push @temp_array, @$listref; | |
888 push @return, @{vf_list_to_cons($config, \@temp_array, $regions)}; | |
889 last; | |
890 } | |
891 elsif($pid) { | |
892 push @pids, $pid; | |
893 @temp_array = (); | |
894 } | |
895 elsif($pid == 0) { | |
896 | |
897 $config->{forked} = $$; | |
898 $config->{quiet} = 1; | |
899 | |
900 # redirect STDERR to PARENT so we can catch errors | |
901 *STDERR = *PARENT; | |
902 | |
903 my $cons = vf_list_to_cons($config, \@temp_array, $regions); | |
904 | |
905 # what we're doing here is sending a serialised hash of the | |
906 # results through to the parent process through the socket. | |
907 # This is then thawed by the parent process. | |
908 # $$, or the PID, is added so that the input can be sorted | |
909 # back into the correct order for output | |
910 print PARENT $$." ".encode_base64(freeze($_), "\t")."\n" for @$cons; | |
911 | |
912 # some plugins may cache stuff, check for this and try and | |
913 # reconstitute it into parent's plugin cache | |
914 foreach my $plugin(@{$config->{plugins}}) { | |
915 | |
916 next unless defined($plugin->{has_cache}); | |
917 | |
918 # delete unnecessary stuff and stuff that can't be serialised | |
919 delete $plugin->{$_} for qw(config feature_types variant_feature_types version feature_types_wanted variant_feature_types_wanted params); | |
920 print PARENT $$." PLUGIN ".ref($plugin)." ".encode_base64(freeze($plugin), "\t")."\n"; | |
921 } | |
922 | |
923 # we need to tell the parent this child is finished | |
924 # otherwise it keeps listening | |
925 print PARENT "DONE $$\n"; | |
926 close PARENT; | |
927 | |
928 exit(0); | |
929 } | |
930 } | |
931 } | |
932 | |
933 debug("Calculating consequences") unless defined($config->{quiet}); | |
934 | |
935 my $fh = $config->{out_file_handle}; | |
936 my $done_processes = 0; | |
937 my $done_vars = 0; | |
938 my $total_size = $size; | |
939 my $pruned_count; | |
940 | |
941 # create a hash to store the returned data by PID | |
942 # this means we can sort it correctly on output | |
943 my %by_pid; | |
944 | |
945 # read child input | |
946 while(<CHILD>) { | |
947 | |
948 # child finished | |
949 if(/^DONE/) { | |
950 $done_processes++; | |
951 last if $done_processes == scalar @pids; | |
952 } | |
953 | |
954 # variant finished / progress indicator | |
955 elsif(/^BUMP/) { | |
956 progress($config, ++$done_vars, $total_size);# if $pruned_count == scalar @pids; | |
957 } | |
958 | |
959 # output | |
960 elsif(/^\-?\d+ /) { | |
961 | |
962 # plugin | |
963 if(/^\-?\d+ PLUGIN/) { | |
964 | |
965 m/^(\-?\d+) PLUGIN (\w+) /; | |
966 my ($pid, $plugin) = ($1, $2); | |
967 | |
968 # remove the PID | |
969 s/^\-?\d+ PLUGIN \w+ //; | |
970 chomp; | |
971 | |
972 my $tmp = thaw(decode_base64($_)); | |
973 | |
974 next unless defined($plugin); | |
975 | |
976 # copy data to parent plugin | |
977 my ($parent_plugin) = grep {ref($_) eq $plugin} @{$config->{plugins}}; | |
978 | |
979 next unless defined($parent_plugin); | |
980 | |
981 merge_hashes($parent_plugin, $tmp); | |
982 } | |
983 | |
984 else { | |
985 # grab the PID | |
986 m/^(\-?\d+)\s/; | |
987 my $pid = $1; | |
988 die "ERROR: Could not parse forked PID from line $_" unless defined($pid); | |
989 | |
990 # remove the PID | |
991 s/^\-?\d+\s//; | |
992 chomp; | |
993 | |
994 # decode and thaw "output" from forked process | |
995 push @{$by_pid{$pid}}, thaw(decode_base64($_)); | |
996 } | |
997 } | |
998 | |
999 elsif(/^PRUNED/) { | |
1000 s/PRUNED //g; | |
1001 chomp; | |
1002 $pruned_count++; | |
1003 $total_size += $_; | |
1004 } | |
1005 | |
1006 elsif(/^DEBUG/) { | |
1007 print STDERR; | |
1008 } | |
1009 | |
1010 # something's wrong | |
1011 else { | |
1012 # kill the other pids | |
1013 kill(15, $_) for @pids; | |
1014 die("\nERROR: Forked process failed\n$_\n"); | |
1015 } | |
1016 } | |
1017 | |
1018 end_progress($config); | |
1019 | |
1020 debug("Writing output") unless defined($config->{quiet}); | |
1021 | |
1022 waitpid($_, 0) for @pids; | |
1023 | |
1024 # add the sorted data to the return array | |
1025 push @return, @{$by_pid{$_} || []} for @pids; | |
1026 } | |
1027 | |
1028 # no forking | |
1029 else { | |
1030 push @return, @{vf_list_to_cons($config, $listref, $regions)}; | |
1031 } | |
1032 | |
1033 if(defined($config->{debug})) { | |
1034 eval q{use Devel::Size qw(total_size)}; | |
1035 my $mem = memory(); | |
1036 my $tot; | |
1037 $tot += $_ for @$mem; | |
1038 | |
1039 if($tot > 1000000) { | |
1040 $tot = sprintf("%.2fGB", $tot / (1024 * 1024)); | |
1041 } | |
1042 | |
1043 elsif($tot > 1000) { | |
1044 $tot = sprintf("%.2fMB", $tot / 1024); | |
1045 } | |
1046 | |
1047 my $mem_diff = mem_diff($config); | |
1048 debug( | |
1049 "LINES ", $config->{line_number}, | |
1050 "\tMEMORY $tot ", (join " ", @$mem), | |
1051 "\tDIFF ", (join " ", @$mem_diff), | |
1052 "\tCACHE ", total_size($config->{tr_cache}). | |
1053 "\tRF ", total_size($config->{rf_cache}), | |
1054 "\tVF ", total_size(\%vf_hash), | |
1055 ); | |
1056 #exit(0) if grep {$_ < 0} @$mem_diff; | |
1057 } | |
1058 | |
1059 return \@return; | |
1060 } | |
1061 | |
1062 sub vf_list_to_cons { | |
1063 my $config = shift; | |
1064 my $listref = shift; | |
1065 my $regions = shift; | |
1066 | |
1067 # get non-variants | |
1068 my @non_variants = grep {$_->{non_variant}} @$listref; | |
1069 | |
1070 my %vf_hash = (); | |
1071 push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref; | |
1072 | |
1073 # check existing VFs | |
1074 &check_existing_hash($config, \%vf_hash) if defined($config->{check_existing});# && scalar @$listref > 10; | |
1075 | |
1076 # get overlapping SVs | |
1077 &check_svs_hash($config, \%vf_hash) if defined($config->{check_svs}); | |
1078 | |
1079 # if we are forked, we can trim off some stuff | |
1080 if(defined($config->{forked})) { | |
1081 my $tmp = $config->{cache}; | |
1082 delete $config->{cache}; | |
1083 $regions = regions_from_hash($config, \%vf_hash); | |
1084 $config->{cache} = $tmp; | |
1085 | |
1086 # prune caches | |
1087 my $new_count = 0; | |
1088 | |
1089 $new_count += prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr}); | |
1090 $new_count += prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf}); | |
1091 | |
1092 print PARENT "PRUNED $new_count\n"; | |
1093 } | |
1094 | |
1095 my @return; | |
1096 | |
1097 foreach my $chr(sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %vf_hash) { | |
1098 my $finished_vfs = whole_genome_fetch($config, $chr, \%vf_hash); | |
1099 | |
1100 # non-variants? | |
1101 if(scalar @non_variants) { | |
1102 push @$finished_vfs, grep {$_->{chr} eq $chr} @non_variants; | |
1103 | |
1104 # need to re-sort | |
1105 @$finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @$finished_vfs; | |
1106 } | |
1107 | |
1108 debug("Calculating consequences") unless defined($config->{quiet}); | |
1109 | |
1110 my $vf_count = scalar @$finished_vfs; | |
1111 my $vf_counter = 0; | |
1112 | |
1113 while(my $vf = shift @$finished_vfs) { | |
1114 progress($config, $vf_counter++, $vf_count) unless $vf_count == 1; | |
1115 | |
1116 my $filter_ok = 1; | |
1117 | |
1118 # filtered output | |
1119 if(defined($config->{filter})) { | |
1120 $filter_ok = filter_by_consequence($config, $vf); | |
1121 $config->{filter_count} += $filter_ok; | |
1122 } | |
1123 | |
1124 # skip filtered lines | |
1125 next unless $filter_ok; | |
1126 | |
1127 # original output | |
1128 if(defined($config->{original})) { | |
1129 push @return, $vf->{_line}; | |
1130 } | |
1131 | |
1132 # GVF output | |
1133 elsif(defined($config->{gvf})) { | |
1134 $vf->source("User"); | |
1135 | |
1136 $config->{gvf_id} ||= 1; | |
1137 | |
1138 # get custom annotation | |
1139 my $custom_annotation = defined($config->{custom}) ? get_custom_annotation($config, $vf) : {}; | |
1140 $custom_annotation->{ID} = $config->{gvf_id}++; | |
1141 | |
1142 | |
1143 my $tmp = $vf->to_gvf( | |
1144 include_consequences => defined($config->{no_consequences}) ? 0 : 1, | |
1145 extra_attrs => $custom_annotation, | |
1146 ); | |
1147 push @return, \$tmp; | |
1148 } | |
1149 | |
1150 # VCF output | |
1151 elsif(defined($config->{vcf})) { | |
1152 | |
1153 # convert to VCF, otherwise get line | |
1154 my $line = $config->{format} eq 'vcf' ? [split /\s+/, $vf->{_line}] : convert_to_vcf($config, $vf); | |
1155 | |
1156 if(!defined($line->[7]) || $line->[7] eq '.') { | |
1157 $line->[7] = ''; | |
1158 } | |
1159 | |
1160 # get all the lines the normal way | |
1161 # and process them into VCF-compatible string | |
1162 my $string = 'CSQ='; | |
1163 | |
1164 foreach my $line(@{vf_to_consequences($config, $vf)}) { | |
1165 | |
1166 # use the field list (can be user-defined by setting --fields) | |
1167 for my $col(@{$config->{fields}}) { | |
1168 | |
1169 # skip fields already represented in the VCF | |
1170 next if $col eq 'Uploaded_variation' or $col eq 'Location' or $col eq 'Extra'; | |
1171 | |
1172 # search for data in main line hash as well as extra field | |
1173 my $data = defined $line->{$col} ? $line->{$col} : $line->{Extra}->{$col}; | |
1174 | |
1175 # "-" means null for everything except the Allele field (confusing...) | |
1176 $data = undef if defined($data) and $data eq '-' and $col ne 'Allele'; | |
1177 $data =~ s/\,/\&/g if defined $data; | |
1178 $string .= defined($data) ? $data : ''; | |
1179 $string .= '|'; | |
1180 } | |
1181 | |
1182 $string =~ s/\|$//; | |
1183 $string .= ','; | |
1184 } | |
1185 | |
1186 $string =~ s/\,$//; | |
1187 | |
1188 if(!defined($config->{no_consequences}) && $string ne 'CSQ=') { | |
1189 $line->[7] .= ($line->[7] ? ';' : '').$string; | |
1190 } | |
1191 | |
1192 # get custom annotation | |
1193 if(defined($config->{custom}) && scalar @{$config->{custom}}) { | |
1194 my $custom_annotation = get_custom_annotation($config, $vf); | |
1195 foreach my $key(keys %{$custom_annotation}) { | |
1196 $line->[7] .= ($line->[7] ? ';' : '').$key.'='.$custom_annotation->{$key}; | |
1197 } | |
1198 } | |
1199 | |
1200 my $tmp = join "\t", @$line; | |
1201 push @return, \$tmp; | |
1202 } | |
1203 | |
1204 # no consequence output from vep input | |
1205 elsif(defined($config->{no_consequences}) && $config->{format} eq 'vep') { | |
1206 | |
1207 my $line = [split /\s+/, $vf->{_line}]; | |
1208 | |
1209 if($line->[13] eq '-') { | |
1210 $line->[13] = ''; | |
1211 } | |
1212 | |
1213 # get custom annotation | |
1214 if(defined($config->{custom})) { | |
1215 my $custom_annotation = get_custom_annotation($config, $vf); | |
1216 foreach my $key(keys %{$custom_annotation}) { | |
1217 $line->[13] .= ($line->[13] ? ';' : '').$key.'='.$custom_annotation->{$key}; | |
1218 } | |
1219 } | |
1220 | |
1221 my $tmp = join "\t", @$line; | |
1222 push @return, \$tmp; | |
1223 } | |
1224 | |
1225 # normal output | |
1226 else { | |
1227 push @return, @{vf_to_consequences($config, $vf)}; | |
1228 } | |
1229 | |
1230 print PARENT "BUMP\n" if defined($config->{forked}); | |
1231 } | |
1232 | |
1233 end_progress($config) unless scalar @$listref == 1; | |
1234 } | |
1235 | |
1236 return \@return; | |
1237 } | |
1238 | |
1239 # takes a variation feature and returns ready to print consequence information | |
1240 sub vf_to_consequences { | |
1241 my $config = shift; | |
1242 my $vf = shift; | |
1243 | |
1244 # use a different method for SVs | |
1245 return svf_to_consequences($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature'); | |
1246 | |
1247 my @return = (); | |
1248 | |
1249 # method name for consequence terms | |
1250 my $term_method = $config->{terms}.'_term'; | |
1251 | |
1252 # find any co-located existing VFs | |
1253 $vf->{existing} ||= find_existing($config, $vf) if defined $config->{check_existing}; | |
1254 | |
1255 # skip based on frequency checks? | |
1256 if(defined($config->{check_frequency}) && defined($vf->{existing}) && $vf->{existing} ne '-' && defined($config->{va})) { | |
1257 return [] unless grep {$_} map {check_frequencies($config, $_)} reverse split(/\,/, $vf->{existing}); | |
1258 $vf->{freqs} = $config->{filtered_freqs}; | |
1259 } | |
1260 | |
1261 # force empty hash into object's transcript_variations if undefined from whole_genome_fetch | |
1262 # this will stop the API trying to go off and fill it again | |
1263 $vf->{transcript_variations} ||= {} if defined $config->{whole_genome}; | |
1264 | |
1265 # regulatory stuff | |
1266 if(!defined $config->{coding_only} && defined $config->{regulatory}) { | |
1267 | |
1268 for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) { | |
1269 | |
1270 my $rf = $rfv->regulatory_feature; | |
1271 | |
1272 my $base_line = { | |
1273 Feature_type => 'RegulatoryFeature', | |
1274 Feature => $rf->stable_id, | |
1275 }; | |
1276 | |
1277 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { | |
1278 $base_line->{Extra}->{CELL_TYPE} = join ",", | |
1279 map {$_.':'.$rf->{cell_types}->{$_}} | |
1280 grep {$rf->{cell_types}->{$_}} | |
1281 @{$config->{cell_type}}; | |
1282 | |
1283 $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g; | |
1284 } | |
1285 | |
1286 # this currently always returns 'RegulatoryFeature', so we ignore it for now | |
1287 #$base_line->{Extra}->{REG_FEAT_TYPE} = $rf->feature_type->name; | |
1288 | |
1289 for my $rfva (@{ $rfv->get_all_alternate_RegulatoryFeatureVariationAlleles }) { | |
1290 | |
1291 my $line = init_line($config, $vf, $base_line); | |
1292 | |
1293 $line->{Allele} = $rfva->variation_feature_seq; | |
1294 $line->{Consequence} = join ',', | |
1295 map { $_->$term_method || $_->SO_term } | |
1296 @{ $rfva->get_all_OverlapConsequences }; | |
1297 | |
1298 $line = run_plugins($rfva, $line, $config); | |
1299 | |
1300 push @return, $line; | |
1301 } | |
1302 } | |
1303 | |
1304 for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) { | |
1305 | |
1306 my $mf = $mfv->motif_feature; | |
1307 | |
1308 # check that the motif has a binding matrix, if not there's not | |
1309 # much we can do so don't return anything | |
1310 | |
1311 next unless defined $mf->binding_matrix; | |
1312 | |
1313 my $matrix = $mf->binding_matrix->description.' '.$mf->display_label; | |
1314 $matrix =~ s/\s+/\_/g; | |
1315 | |
1316 my $base_line = { | |
1317 Feature_type => 'MotifFeature', | |
1318 Feature => $mf->binding_matrix->name, | |
1319 Extra => { | |
1320 MOTIF_NAME => $matrix, | |
1321 } | |
1322 }; | |
1323 | |
1324 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { | |
1325 $base_line->{Extra}->{CELL_TYPE} = join ",", | |
1326 map {$_.':'.$mf->{cell_types}->{$_}} | |
1327 grep {$mf->{cell_types}->{$_}} | |
1328 @{$config->{cell_type}}; | |
1329 | |
1330 $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g; | |
1331 } | |
1332 | |
1333 for my $mfva (@{ $mfv->get_all_alternate_MotifFeatureVariationAlleles }) { | |
1334 | |
1335 my $line = init_line($config, $vf, $base_line); | |
1336 | |
1337 $line->{Extra}->{MOTIF_POS} = $mfva->motif_start if defined $mfva->motif_start; | |
1338 $line->{Extra}->{HIGH_INF_POS} = ($mfva->in_informative_position ? 'Y' : 'N'); | |
1339 | |
1340 my $delta = $mfva->motif_score_delta if $mfva->variation_feature_seq =~ /^[ACGT]+$/; | |
1341 | |
1342 $line->{Extra}->{MOTIF_SCORE_CHANGE} = sprintf("%.3f", $delta) if defined $delta; | |
1343 | |
1344 $line->{Allele} = $mfva->variation_feature_seq; | |
1345 $line->{Consequence} = join ',', | |
1346 map { $_->$term_method || $_->SO_term } | |
1347 @{ $mfva->get_all_OverlapConsequences }; | |
1348 | |
1349 $line = run_plugins($mfva, $line, $config); | |
1350 | |
1351 push @return, $line; | |
1352 } | |
1353 } | |
1354 } | |
1355 | |
1356 # get TVs | |
1357 my $tvs = $vf->get_all_TranscriptVariations; | |
1358 | |
1359 # only most severe | |
1360 if(defined($config->{most_severe}) || defined($config->{summary})) { | |
1361 | |
1362 my $line = init_line($config, $vf); | |
1363 | |
1364 if(defined($config->{summary})) { | |
1365 $line->{Consequence} = join ",", @{$vf->consequence_type($config->{terms}) || $vf->consequence_type}; | |
1366 } | |
1367 else { | |
1368 $line->{Consequence} = $vf->display_consequence($config->{terms}) || $vf->display_consequence; | |
1369 } | |
1370 | |
1371 push @return, $line; | |
1372 } | |
1373 | |
1374 # pass a true argument to get_IntergenicVariation to stop it doing a reference allele check | |
1375 # (to stay consistent with the rest of the VEP) | |
1376 elsif ((my $iv = $vf->get_IntergenicVariation(1)) && !defined($config->{no_intergenic})) { | |
1377 | |
1378 for my $iva (@{ $iv->get_all_alternate_IntergenicVariationAlleles }) { | |
1379 | |
1380 my $line = init_line($config, $vf); | |
1381 | |
1382 $line->{Allele} = $iva->variation_feature_seq; | |
1383 | |
1384 my $cons = $iva->get_all_OverlapConsequences->[0]; | |
1385 | |
1386 $line->{Consequence} = $cons->$term_method || $cons->SO_term; | |
1387 | |
1388 $line = run_plugins($iva, $line, $config); | |
1389 | |
1390 push @return, $line; | |
1391 } | |
1392 } | |
1393 | |
1394 # user wants only one conseqeunce per gene | |
1395 elsif(defined($config->{per_gene})) { | |
1396 | |
1397 # sort the TVA objects into a hash by gene | |
1398 my %by_gene; | |
1399 | |
1400 foreach my $tv(@$tvs) { | |
1401 next if(defined $config->{coding_only} && !($tv->affects_cds)); | |
1402 | |
1403 my $gene = $tv->transcript->{_gene_stable_id} || $config->{ga}->fetch_by_transcript_stable_id($tv->transcript->stable_id)->stable_id; | |
1404 | |
1405 push @{$by_gene{$gene}}, @{$tv->get_all_alternate_TranscriptVariationAlleles}; | |
1406 } | |
1407 | |
1408 foreach my $gene(keys %by_gene) { | |
1409 my ($lowest, $lowest_tva); | |
1410 | |
1411 # at the moment this means the one that comes out last will be picked | |
1412 # if there is more than one TVA with the same rank of consequence | |
1413 foreach my $tva(@{$by_gene{$gene}}) { | |
1414 foreach my $oc(@{$tva->get_all_OverlapConsequences}) { | |
1415 if(!defined($lowest) || $oc->rank < $lowest) { | |
1416 $lowest = $oc->rank; | |
1417 $lowest_tva = $tva; | |
1418 } | |
1419 } | |
1420 } | |
1421 | |
1422 push @return, tva_to_line($config, $lowest_tva); | |
1423 } | |
1424 } | |
1425 | |
1426 else { | |
1427 foreach my $tv(@$tvs) { | |
1428 next if(defined $config->{coding_only} && !($tv->affects_cds)); | |
1429 | |
1430 push @return, map {tva_to_line($config, $_)} @{$tv->get_all_alternate_TranscriptVariationAlleles}; | |
1431 | |
1432 undef $tv->{$_} for keys %$tv; | |
1433 } | |
1434 } | |
1435 | |
1436 return \@return; | |
1437 } | |
1438 | |
1439 # get consequences for a structural variation feature | |
1440 sub svf_to_consequences { | |
1441 my $config = shift; | |
1442 my $svf = shift; | |
1443 | |
1444 my @return = (); | |
1445 | |
1446 my $term_method = $config->{terms}.'_term'; | |
1447 | |
1448 if(defined $config->{whole_genome}) { | |
1449 $svf->{transcript_structural_variations} ||= []; | |
1450 $svf->{regulation_structural_variations}->{$_} ||= [] for @REG_FEAT_TYPES; | |
1451 } | |
1452 | |
1453 if ((my $iv = $svf->get_IntergenicStructuralVariation(1)) && !defined($config->{no_intergenic})) { | |
1454 | |
1455 for my $iva (@{ $iv->get_all_alternate_IntergenicStructuralVariationAlleles }) { | |
1456 | |
1457 my $line = init_line($config, $svf); | |
1458 | |
1459 $line->{Allele} = '-'; | |
1460 | |
1461 my $cons = $iva->get_all_OverlapConsequences->[0]; | |
1462 | |
1463 $line->{Consequence} = $cons->$term_method || $cons->SO_term; | |
1464 | |
1465 $line = run_plugins($iva, $line, $config); | |
1466 | |
1467 push @return, $line; | |
1468 } | |
1469 } | |
1470 | |
1471 foreach my $svo(@{$svf->get_all_StructuralVariationOverlaps}) { | |
1472 | |
1473 next if $svo->isa('Bio::EnsEMBL::Variation::IntergenicStructuralVariation'); | |
1474 | |
1475 my $feature = $svo->feature; | |
1476 | |
1477 # get feature type | |
1478 my $feature_type = (split '::', ref($feature))[-1]; | |
1479 | |
1480 my $base_line = { | |
1481 Feature_type => $feature_type, | |
1482 Feature => $feature->stable_id, | |
1483 Allele => $svf->class_SO_term, | |
1484 }; | |
1485 | |
1486 if($svo->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariation')) { | |
1487 $base_line->{cDNA_position} = format_coords($svo->cdna_start, $svo->cdna_end); | |
1488 $base_line->{CDS_position} = format_coords($svo->cds_start, $svo->cds_end); | |
1489 $base_line->{Protein_position} = format_coords($svo->translation_start, $svo->translation_end); | |
1490 } | |
1491 | |
1492 foreach my $svoa(@{$svo->get_all_StructuralVariationOverlapAlleles}) { | |
1493 my $line = init_line($config, $svf, $base_line); | |
1494 | |
1495 $line->{Consequence} = join ",", | |
1496 #map {s/feature/$feature_type/e; $_} | |
1497 map {$_->$term_method} | |
1498 sort {$a->rank <=> $b->rank} | |
1499 @{$svoa->get_all_OverlapConsequences}; | |
1500 | |
1501 # work out overlap amounts | |
1502 my $overlap_start = (sort {$a <=> $b} ($svf->start, $feature->start))[-1]; | |
1503 my $overlap_end = (sort {$a <=> $b} ($svf->end, $feature->end))[0]; | |
1504 my $overlap_length = ($overlap_end - $overlap_start) + 1; | |
1505 my $overlap_pc = 100 * ($overlap_length / (($feature->end - $feature->start) + 1)); | |
1506 | |
1507 $line->{Extra}->{OverlapBP} = $overlap_length if $overlap_length > 0; | |
1508 $line->{Extra}->{OverlapPC} = sprintf("%.2f", $overlap_pc) if $overlap_pc > 0; | |
1509 | |
1510 add_extra_fields($config, $line, $svoa); | |
1511 | |
1512 push @return, $line; | |
1513 } | |
1514 } | |
1515 | |
1516 return \@return; | |
1517 } | |
1518 | |
1519 # run all of the configured plugins on a VariationFeatureOverlapAllele instance | |
1520 # and store any results in the provided line hash | |
1521 sub run_plugins { | |
1522 | |
1523 my ($bvfoa, $line_hash, $config) = @_; | |
1524 | |
1525 my $skip_line = 0; | |
1526 | |
1527 for my $plugin (@{ $config->{plugins} || [] }) { | |
1528 | |
1529 # check that this plugin is interested in this type of variation feature | |
1530 | |
1531 if ($plugin->check_variant_feature_type(ref $bvfoa->base_variation_feature)) { | |
1532 | |
1533 # check that this plugin is interested in this type of feature | |
1534 | |
1535 if ($plugin->check_feature_type(ref $bvfoa->feature || 'Intergenic')) { | |
1536 | |
1537 eval { | |
1538 my $plugin_results = $plugin->run($bvfoa, $line_hash); | |
1539 | |
1540 if (defined $plugin_results) { | |
1541 if (ref $plugin_results eq 'HASH') { | |
1542 for my $key (keys %$plugin_results) { | |
1543 $line_hash->{Extra}->{$key} = $plugin_results->{$key}; | |
1544 } | |
1545 } | |
1546 else { | |
1547 warn "Plugin '".(ref $plugin)."' did not return a hashref, output ignored!\n"; | |
1548 } | |
1549 } | |
1550 else { | |
1551 # if a plugin returns undef, that means it want to filter out this line | |
1552 $skip_line = 1; | |
1553 } | |
1554 }; | |
1555 if ($@) { | |
1556 warn "Plugin '".(ref $plugin)."' went wrong: $@"; | |
1557 } | |
1558 | |
1559 # there's no point running any other plugins if we're filtering this line, | |
1560 # because the first plugin to skip the line wins, so we might as well last | |
1561 # out of the loop now and avoid any unnecessary computation | |
1562 | |
1563 last if $skip_line; | |
1564 } | |
1565 } | |
1566 } | |
1567 | |
1568 return $skip_line ? undef : $line_hash; | |
1569 } | |
1570 | |
1571 # turn a TranscriptVariationAllele into a line hash | |
1572 sub tva_to_line { | |
1573 my $config = shift; | |
1574 my $tva = shift; | |
1575 | |
1576 my $tv = $tva->transcript_variation; | |
1577 my $t = $tv->transcript; | |
1578 | |
1579 # method name for consequence terms | |
1580 my $term_method = $config->{terms}.'_term'; | |
1581 | |
1582 my $base_line = { | |
1583 Feature_type => 'Transcript', | |
1584 Feature => (defined $t ? $t->stable_id : undef), | |
1585 cDNA_position => format_coords($tv->cdna_start, $tv->cdna_end), | |
1586 CDS_position => format_coords($tv->cds_start, $tv->cds_end), | |
1587 Protein_position => format_coords($tv->translation_start, $tv->translation_end), | |
1588 Allele => $tva->variation_feature_seq, | |
1589 Amino_acids => $tva->pep_allele_string, | |
1590 Codons => $tva->display_codon_allele_string, | |
1591 Consequence => join ",", map {$_->$term_method || $_->SO_term} sort {$a->rank <=> $b->rank} @{$tva->get_all_OverlapConsequences}, | |
1592 }; | |
1593 | |
1594 my $line = init_line($config, $tva->variation_feature, $base_line); | |
1595 | |
1596 # HGVS | |
1597 if(defined $config->{hgvs}) { | |
1598 my $hgvs_t = $tva->hgvs_transcript; | |
1599 my $hgvs_p = $tva->hgvs_protein; | |
1600 | |
1601 $line->{Extra}->{HGVSc} = $hgvs_t if $hgvs_t; | |
1602 $line->{Extra}->{HGVSp} = $hgvs_p if $hgvs_p; | |
1603 } | |
1604 | |
1605 foreach my $tool (qw(SIFT PolyPhen)) { | |
1606 my $lc_tool = lc($tool); | |
1607 | |
1608 if (my $opt = $config->{$lc_tool}) { | |
1609 my $want_pred = $opt =~ /^p/i; | |
1610 my $want_score = $opt =~ /^s/i; | |
1611 my $want_both = $opt =~ /^b/i; | |
1612 | |
1613 if ($want_both) { | |
1614 $want_pred = 1; | |
1615 $want_score = 1; | |
1616 } | |
1617 | |
1618 next unless $want_pred || $want_score; | |
1619 | |
1620 my $pred_meth = $lc_tool.'_prediction'; | |
1621 my $score_meth = $lc_tool.'_score'; | |
1622 | |
1623 my $pred = $tva->$pred_meth; | |
1624 | |
1625 if($pred) { | |
1626 | |
1627 if ($want_pred) { | |
1628 $pred =~ s/\s+/\_/; | |
1629 $line->{Extra}->{$tool} = $pred; | |
1630 } | |
1631 | |
1632 if ($want_score) { | |
1633 my $score = $tva->$score_meth; | |
1634 | |
1635 if(defined $score) { | |
1636 if($want_pred) { | |
1637 $line->{Extra}->{$tool} .= "($score)"; | |
1638 } | |
1639 else { | |
1640 $line->{Extra}->{$tool} = $score; | |
1641 } | |
1642 } | |
1643 } | |
1644 } | |
1645 } | |
1646 } | |
1647 | |
1648 $line = add_extra_fields($config, $line, $tva); | |
1649 | |
1650 return $line; | |
1651 } | |
1652 | |
1653 sub add_extra_fields { | |
1654 my $config = shift; | |
1655 my $line = shift; | |
1656 my $bvfoa = shift; | |
1657 | |
1658 # overlapping SVs | |
1659 if(defined $config->{check_svs} && defined $bvfoa->base_variation_feature->{overlapping_svs}) { | |
1660 $line->{Extra}->{SV} = $bvfoa->base_variation_feature->{overlapping_svs}; | |
1661 } | |
1662 | |
1663 # add transcript-specific fields | |
1664 $line = add_extra_fields_transcript($config, $line, $bvfoa) if $bvfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele'); | |
1665 | |
1666 # run plugins | |
1667 $line = run_plugins($bvfoa, $line, $config); | |
1668 | |
1669 return $line; | |
1670 } | |
1671 | |
1672 sub add_extra_fields_transcript { | |
1673 my $config = shift; | |
1674 my $line = shift; | |
1675 my $tva = shift; | |
1676 | |
1677 my $tv = $tva->base_variation_feature_overlap; | |
1678 my $tr = $tva->transcript; | |
1679 | |
1680 # get gene | |
1681 my $gene; | |
1682 | |
1683 $line->{Gene} = $tr->{_gene_stable_id}; | |
1684 | |
1685 if(!defined($line->{Gene})) { | |
1686 $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id); | |
1687 $line->{Gene} = $gene ? $gene->stable_id : '-'; | |
1688 } | |
1689 | |
1690 # exon/intron numbers | |
1691 if ($config->{numbers}) { | |
1692 $line->{Extra}->{EXON} = $tv->exon_number if defined $tv->exon_number; | |
1693 $line->{Extra}->{INTRON} = $tv->intron_number if defined $tv->intron_number; | |
1694 } | |
1695 | |
1696 if ($config->{domains}) { | |
1697 my $feats = $tv->get_overlapping_ProteinFeatures; | |
1698 | |
1699 my @strings; | |
1700 | |
1701 for my $feat (@$feats) { | |
1702 my $label = $feat->analysis->display_label.':'.$feat->hseqname; | |
1703 | |
1704 # replace any special characters | |
1705 $label =~ s/[\s;=]/_/g; | |
1706 | |
1707 push @strings, $label; | |
1708 } | |
1709 | |
1710 $line->{Extra}->{DOMAINS} = join ',', @strings if @strings; | |
1711 } | |
1712 | |
1713 # distance to transcript | |
1714 if($line->{Consequence} =~ /(up|down)stream/i) { | |
1715 $line->{Extra}->{DISTANCE} = $tv->distance_to_transcript; | |
1716 } | |
1717 | |
1718 # HGNC | |
1719 if(defined $config->{hgnc}) { | |
1720 my $hgnc; | |
1721 $hgnc = $tr->{_gene_hgnc}; | |
1722 | |
1723 if(!defined($hgnc)) { | |
1724 if(!defined($gene)) { | |
1725 $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id); | |
1726 } | |
1727 | |
1728 my @entries = grep {$_->database eq 'HGNC'} @{$gene->get_all_DBEntries()}; | |
1729 if(scalar @entries) { | |
1730 $hgnc = $entries[0]->display_id; | |
1731 } | |
1732 } | |
1733 | |
1734 $hgnc = undef if defined($hgnc) && $hgnc eq '-'; | |
1735 | |
1736 $line->{Extra}->{HGNC} = $hgnc if defined($hgnc); | |
1737 } | |
1738 | |
1739 # CCDS | |
1740 if(defined($config->{ccds})) { | |
1741 my $ccds = $tr->{_ccds}; | |
1742 | |
1743 if(!defined($ccds)) { | |
1744 my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries}; | |
1745 $ccds = $entries[0]->display_id if scalar @entries; | |
1746 } | |
1747 | |
1748 $ccds = undef if defined($ccds) && $ccds eq '-'; | |
1749 | |
1750 $line->{Extra}->{CCDS} = $ccds if defined($ccds); | |
1751 } | |
1752 | |
1753 # refseq xref | |
1754 if(defined($config->{xref_refseq})) { | |
1755 my $refseq = $tr->{_refseq}; | |
1756 | |
1757 if(!defined($refseq)) { | |
1758 my @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries}; | |
1759 if(scalar @entries) { | |
1760 $refseq = join ",", map {$_->display_id."-".$_->database} @entries; | |
1761 } | |
1762 } | |
1763 | |
1764 $refseq = undef if defined($refseq) && $refseq eq '-'; | |
1765 | |
1766 $line->{Extra}->{RefSeq} = $refseq if defined($refseq); | |
1767 } | |
1768 | |
1769 # protein ID | |
1770 if(defined $config->{protein}) { | |
1771 my $protein = $tr->{_protein}; | |
1772 | |
1773 if(!defined($protein)) { | |
1774 $protein = $tr->translation->stable_id if defined($tr->translation); | |
1775 } | |
1776 | |
1777 $protein = undef if defined($protein) && $protein eq '-'; | |
1778 | |
1779 $line->{Extra}->{ENSP} = $protein if defined($protein); | |
1780 } | |
1781 | |
1782 # canonical transcript | |
1783 if(defined $config->{canonical}) { | |
1784 $line->{Extra}->{CANONICAL} = 'YES' if $tr->is_canonical; | |
1785 } | |
1786 | |
1787 return $line; | |
1788 } | |
1789 | |
1790 # initialize a line hash | |
1791 sub init_line { | |
1792 my $config = shift; | |
1793 my $vf = shift; | |
1794 my $base_line = shift; | |
1795 | |
1796 my $line = { | |
1797 Uploaded_variation => $vf->variation_name, | |
1798 Location => ($vf->{chr} || $vf->seq_region_name).':'.format_coords($vf->start, $vf->end), | |
1799 Existing_variation => $vf->{existing}, | |
1800 Extra => {}, | |
1801 }; | |
1802 | |
1803 # add custom info | |
1804 if(defined($config->{custom}) && scalar @{$config->{custom}}) { | |
1805 # merge the custom hash with the extra hash | |
1806 my $custom = get_custom_annotation($config, $vf); | |
1807 | |
1808 for my $key (keys %$custom) { | |
1809 $line->{Extra}->{$key} = $custom->{$key}; | |
1810 } | |
1811 } | |
1812 | |
1813 # individual? | |
1814 $line->{Extra}->{IND} = $vf->{individual} if defined($vf->{individual}); | |
1815 | |
1816 # frequencies? | |
1817 $line->{Extra}->{FREQS} = join ",", @{$vf->{freqs}} if defined($vf->{freqs}); | |
1818 | |
1819 # gmaf? | |
1820 $line->{Extra}->{GMAF} = $vf->{gmaf} if defined($config->{gmaf}) && defined($vf->{gmaf}); | |
1821 | |
1822 # copy entries from base_line | |
1823 if(defined($base_line)) { | |
1824 $line->{$_} = $base_line->{$_} for keys %$base_line; | |
1825 } | |
1826 | |
1827 return $line; | |
1828 } | |
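# Usage sketch for init_line(): assuming a populated $config and a parsed
# variation feature $vf, it returns the skeleton hashref that the output
# columns are built from. Field values below are hypothetical.
#
#   my $line = init_line($config, $vf);
#   # $line = {
#   #   Uploaded_variation => 'rs699',        # hypothetical
#   #   Location           => '1:230710048',  # chr:start[-end]
#   #   Existing_variation => '-',
#   #   Extra              => {},             # filled in later (HGNC, CCDS, custom data etc.)
#   # };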
1829 | |
1830 | |
1831 # get custom annotation for a single VF | |
1832 sub get_custom_annotation { | |
1833 my $config = shift; | |
1834 my $vf = shift; | |
1835 my $cache = shift; | |
1836 | |
1837 return $vf->{custom} if defined($vf->{custom}); | |
1838 | |
1839 my $annotation = {}; | |
1840 | |
1841 my $chr = $vf->{chr}; | |
1842 | |
1843 if(!defined($cache)) { | |
1844 # spoof regions | |
1845 my $regions; | |
1846 $regions->{$chr} = [$vf->{start}.'-'.$vf->{end}]; | |
1847 $cache = cache_custom_annotation($config, $regions, $chr); | |
1848 } | |
1849 | |
1850 foreach my $custom(@{$config->{custom}}) { | |
1851 next unless defined($cache->{$chr}->{$custom->{name}}); | |
1852 | |
1853 # exact type must match coords of variant exactly | |
1854 if($custom->{type} eq 'exact') { | |
1855 foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$vf->{start}}}) { | |
1856 | |
1857 next unless | |
1858 $feature->{chr} eq $chr && | |
1859 $feature->{start} eq $vf->{start} && | |
1860 $feature->{end} eq $vf->{end}; | |
1861 | |
1862 $annotation->{$custom->{name}} .= $feature->{name}.','; | |
1863 } | |
1864 } | |
1865 | |
1866 # overlap type only needs to overlap, but we need to search the whole range | |
1867 elsif($custom->{type} eq 'overlap') { | |
1868 foreach my $pos(keys %{$cache->{$chr}->{$custom->{name}}}) { | |
1869 foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$pos}}) { | |
1870 | |
1871 next unless | |
1872 $feature->{chr} eq $chr && | |
1873 $feature->{end} >= $vf->{start} && | |
1874 $feature->{start} <= $vf->{end}; | |
1875 | |
1876 $annotation->{$custom->{name}} .= $feature->{name}.','; | |
1877 } | |
1878 } | |
1879 } | |
1880 | |
1881 # trim off trailing commas | |
1882 $annotation->{$custom->{name}} =~ s/\,$//g if defined($annotation->{$custom->{name}}); | |
1883 } | |
1884 | |
1885 return $annotation; | |
1886 } | |
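# Illustrative call to get_custom_annotation(): assuming $config->{custom}
# holds entries with 'name' and 'type' keys ('exact' or 'overlap') as used
# above, the return value maps each custom track name to a comma-separated
# list of overlapping feature names. Track and feature names are hypothetical.
#
#   my $annotation = get_custom_annotation($config, $vf);
#   # e.g. { my_bed_track => 'feat1,feat2' }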
1887 | |
1888 # decides whether to print a VF based on user defined consequences | |
1889 sub filter_by_consequence { | |
1890 my $config = shift; | |
1891 my $vf = shift; | |
1892 my $filters = $config->{filter}; | |
1893 | |
1894 # find out if we only have "no"s | |
1895 my $only_nos = 0; | |
1896 $only_nos = 1 if (sort {$a <=> $b} values %$filters)[-1] == 0; | |
1897 | |
1898 my ($yes, $no) = (0, 0); | |
1899 | |
1900 # get all consequences across all term types | |
1901 my @types = ('SO', 'display'); | |
1902 | |
1903 my @cons; | |
1904 push @cons, @{$vf->consequence_type($_)} for @types; | |
1905 | |
1906 # add regulatory consequences | |
1907 if(defined($config->{regulatory})) { | |
1908 foreach my $term_type(@types) { | |
1909 my $term_method = $term_type.'_term'; | |
1910 | |
1911 for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) { | |
1912 for my $rfva(@{$rfv->get_all_alternate_RegulatoryFeatureVariationAlleles}) { | |
1913 push @cons, map {$_->$term_method} @{ $rfva->get_all_OverlapConsequences }; | |
1914 } | |
1915 } | |
1916 for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) { | |
1917 for my $mfva(@{$mfv->get_all_alternate_MotifFeatureVariationAlleles}) { | |
1918 push @cons, map {$_->$term_method} @{ $mfva->get_all_OverlapConsequences }; | |
1919 } | |
1920 } | |
1921 } | |
1922 } | |
1923 | |
1924 foreach my $con(grep {defined($_) && defined($filters->{$_})} @cons) { | |
1925 if($filters->{$con} == 1) { | |
1926 $yes = 1; | |
1927 } | |
1928 else { | |
1929 $no = 1; | |
1930 } | |
1931 } | |
1932 | |
1933 # check special case, coding | |
1934 if(defined($filters->{coding})) { | |
1935 if(grep {$_->affects_cds} @{$vf->get_all_TranscriptVariations}) { | |
1936 if($filters->{coding} == 1) { | |
1937 $yes = 1; | |
1938 } | |
1939 else { | |
1940 $no = 1; | |
1941 } | |
1942 } | |
1943 } | |
1944 | |
1945 my $ok = 0; | |
1946 if($only_nos) { | |
1947 $ok = 1 if !$no; | |
1948 } | |
1949 else { | |
1950 $ok = 1 if $yes && !$no; | |
1951 } | |
1952 | |
1953 return $ok; | |
1954 } | |
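# Sketch of the filter hash consumed by filter_by_consequence(): keys are
# consequence terms (plus the special key 'coding'), value 1 means "keep
# variants with this consequence" and 0 means "exclude". Terms below are
# hypothetical user input.
#
#   $config->{filter} = {
#     missense_variant => 1,   # at least one "yes" term required...
#     intron_variant   => 0,   # ...and no "no" term may be present
#   };
#   next unless filter_by_consequence($config, $vf);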
1955 | |
1956 | |
1957 # takes VFs created from input, fixes and checks various things | |
1958 sub validate_vf { | |
1959 my $config = shift; | |
1960 my $vf = shift; | |
1961 | |
1962 # user specified chr skip list | |
1963 return 0 if defined($config->{chr}) && !$config->{chr}->{$vf->{chr}}; | |
1964 | |
1965 # fix inputs | |
1966 $vf->{chr} =~ s/chr//ig unless $vf->{chr} =~ /^chromosome$/i; | |
1967 $vf->{chr} = 'MT' if $vf->{chr} eq 'M'; | |
1968 $vf->{strand} ||= 1; | |
1969 $vf->{strand} = ($vf->{strand} =~ /\-/ ? "-1" : "1"); | |
1970 | |
1971 # sanity checks | |
1972 unless($vf->{start} =~ /^\d+$/ && $vf->{end} =~ /^\d+$/) { | |
1973 warn("WARNING: Start ".$vf->{start}." or end ".$vf->{end}." coordinate invalid on line ".$config->{line_number}."\n") unless defined $config->{quiet}; | |
1974 return 0; | |
1975 } | |
1976 | |
1977 # structural variation? | |
1978 return validate_svf($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature'); | |
1979 | |
1980 # uppercase allele string | |
1981 $vf->{allele_string} =~ tr/[a-z]/[A-Z]/; | |
1982 | |
1983 unless($vf->{allele_string} =~ /([ACGT-]+\/*)+/) { | |
1984 warn("WARNING: Invalid allele string ".$vf->{allele_string}." on line ".$config->{line_number}." or possible parsing error\n") unless defined $config->{quiet}; | |
1985 return 0; | |
1986 } | |
1987 | |
1988 # insertion should have start = end + 1 | |
1989 if($vf->{allele_string} =~ /^\-\// && $vf->{start} != $vf->{end} + 1) { | |
1990 warn( | |
1991 "WARNING: Alleles look like an insertion (". | |
1992 $vf->{allele_string}. | |
1993 ") but coordinates are not start = end + 1 (START=". | |
1994 $vf->{start}.", END=".$vf->{end}. | |
1995 ") on line ".$config->{line_number}."\n" | |
1996 ) unless defined($config->{quiet}); | |
1997 return 0; | |
1998 } | |
1999 | |
2000 # check length of reference matches seq length spanned | |
2001 my @alleles = split /\//, $vf->{allele_string}; | |
2002 my $ref_allele = shift @alleles; | |
2003 my $tmp_ref_allele = $ref_allele; | |
2004 $tmp_ref_allele =~ s/\-//g; | |
2005 | |
2006 #if(($vf->{end} - $vf->{start}) + 1 != length($tmp_ref_allele)) { | |
2007 # warn( | |
2008 # "WARNING: Length of reference allele (".$ref_allele. | |
2009 # " length ".length($tmp_ref_allele).") does not match co-ordinates ".$vf->{start}."-".$vf->{end}. | |
2010 # " on line ".$config->{line_number} | |
2011 # ) unless defined($config->{quiet}); | |
2012 # return 0; | |
2013 #} | |
2014 | |
2015 # flag as unbalanced (indel) if any alt allele length differs from the ref | |
2016 foreach my $allele(@alleles) { | |
2017 $allele =~ s/\-//g; | |
2018 $vf->{indel} = 1 unless length($allele) == length($tmp_ref_allele); | |
2019 } | |
2020 | |
2021 # check reference allele if requested | |
2022 if(defined $config->{check_ref}) { | |
2023 my $ok = 0; | |
2024 my $slice_ref_allele; | |
2025 | |
2026 # insertion, therefore no ref allele to check | |
2027 if($ref_allele eq '-') { | |
2028 $ok = 1; | |
2029 } | |
2030 else { | |
2031 my $slice_ref = $vf->{slice}->sub_Slice($vf->{start}, $vf->{end}, $vf->{strand}); | |
2032 | |
2033 if(!defined($slice_ref)) { | |
2034 warn "WARNING: Could not fetch sub-slice from ".$vf->{start}."\-".$vf->{end}."\(".$vf->{strand}."\) on line ".$config->{line_number} unless defined $config->{quiet}; | |
2035 } | |
2036 | |
2037 else { | |
2038 $slice_ref_allele = $slice_ref->seq; | |
2039 $ok = ($slice_ref_allele eq $ref_allele ? 1 : 0); | |
2040 } | |
2041 } | |
2042 | |
2043 if(!$ok) { | |
2044 warn | |
2045 "WARNING: Specified reference allele $ref_allele ", | |
2046 "does not match Ensembl reference allele", | |
2047 ($slice_ref_allele ? " $slice_ref_allele" : ""), | |
2048 " on line ".$config->{line_number} unless defined $config->{quiet}; | |
2049 return 0; | |
2050 } | |
2051 } | |
2052 | |
2053 return 1; | |
2054 } | |
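# Minimal sketch of a variant that would pass validate_vf(), assuming the
# new_fast() constructor available on Ensembl feature objects; coordinates
# and alleles are hypothetical.
#
#   my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({
#     chr           => '7',
#     start         => 86442404,
#     end           => 86442404,
#     strand        => 1,
#     allele_string => 'G/A',
#   });
#   next unless validate_vf($config, $vf);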
2055 | |
2056 | |
2057 # validate a structural variation | |
2058 sub validate_svf { | |
2059 my $config = shift; | |
2060 my $svf = shift; | |
2061 | |
2062 return 1; | |
2063 } | |
2064 | |
2065 | |
2066 # takes a hash of VFs and fetches consequences by pre-fetching overlapping transcripts | |
2067 # from database and/or cache | |
2068 sub whole_genome_fetch { | |
2069 my $config = shift; | |
2070 my $chr = shift; | |
2071 my $vf_hash = shift; | |
2072 | |
2073 my (%vf_done, @finished_vfs, %seen_rfs); | |
2074 | |
2075 if(defined($config->{offline}) && !-e $config->{dir}.'/'.$chr) { | |
2076 debug("No cache found for chromosome $chr") unless defined($config->{quiet}); | |
2077 | |
2078 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2079 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { | |
2080 push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}}; | |
2081 } | |
2082 } | |
2083 | |
2084 return \@finished_vfs; | |
2085 } | |
2086 | |
2087 my $slice_cache = $config->{slice_cache}; | |
2088 build_slice_cache($config, $config->{tr_cache}) unless defined($slice_cache->{$chr}); | |
2089 | |
2090 debug("Analyzing chromosome $chr") unless defined($config->{quiet}); | |
2091 | |
2092 # custom annotations | |
2093 whole_genome_fetch_custom($config, $vf_hash, $chr) if defined($config->{custom}); | |
2094 | |
2095 # split up normal variations from SVs | |
2096 my ($tmp_vf_hash, @svfs); | |
2097 | |
2098 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2099 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { | |
2100 foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) { | |
2101 if($vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) { | |
2102 push @svfs, $vf; | |
2103 } | |
2104 else { | |
2105 push @{$tmp_vf_hash->{$chr}{$chunk}{$pos}}, $vf; | |
2106 } | |
2107 } | |
2108 } | |
2109 } | |
2110 | |
2111 $vf_hash = $tmp_vf_hash; | |
2112 | |
2113 # transcript annotations | |
2114 whole_genome_fetch_transcript($config, $vf_hash, $chr) | |
2115 unless defined($config->{no_consequences}); | |
2116 | |
2117 # regulatory annotations | |
2118 whole_genome_fetch_reg($config, $vf_hash, $chr) | |
2119 if defined($config->{regulatory}) | |
2120 && !defined($config->{no_consequences}); | |
2121 | |
2122 # structural variations | |
2123 @finished_vfs = @{whole_genome_fetch_sv($config, \@svfs, $chr)} | |
2124 if scalar @svfs; | |
2125 | |
2126 # sort results into @finished_vfs array | |
2127 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2128 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { | |
2129 | |
2130 # pinch slice from slice cache if we don't already have it | |
2131 $_->{slice} ||= $slice_cache->{$chr} for @{$vf_hash->{$chr}{$chunk}{$pos}}; | |
2132 | |
2133 if(defined($config->{regulatory})) { | |
2134 foreach my $type(@REG_FEAT_TYPES) { | |
2135 $_->{regulation_variations}->{$type} ||= [] for @{$vf_hash->{$chr}{$chunk}{$pos}}; | |
2136 } | |
2137 } | |
2138 | |
2139 if(defined($config->{custom})) { | |
2140 $_->{custom} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}}; | |
2141 } | |
2142 | |
2143 $_->{transcript_variations} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}}; | |
2144 | |
2145 # add to final array | |
2146 push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}}; | |
2147 } | |
2148 } | |
2149 | |
2150 # sort | |
2151 @finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @finished_vfs; | |
2152 | |
2153 # clean hash | |
2154 delete $vf_hash->{$chr}; | |
2155 | |
2156 return \@finished_vfs; | |
2157 } | |
2158 | |
2159 sub whole_genome_fetch_custom { | |
2160 my $config = shift; | |
2161 my $vf_hash = shift; | |
2162 my $chr = shift; | |
2163 | |
2164 return unless scalar @{$config->{custom}}; | |
2165 | |
2166 # create regions based on VFs instead of chunks | |
2167 my $tmp_regions; | |
2168 | |
2169 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2170 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { | |
2171 foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) { | |
2172 push @{$tmp_regions->{$chr}}, ($vf->{start}-1).'-'.($vf->{end}+1); | |
2173 } | |
2174 } | |
2175 } | |
2176 | |
2177 return unless defined($tmp_regions->{$chr}); | |
2178 | |
2179 # cache annotations | |
2180 my $annotation_cache = cache_custom_annotation($config, $tmp_regions, $chr); | |
2181 | |
2182 # count and report | |
2183 my $total_annotations = 0; | |
2184 $total_annotations += scalar keys %{$annotation_cache->{$chr}->{$_}} for keys %{$annotation_cache->{$chr}}; | |
2185 debug("Retrieved $total_annotations custom annotations (", (join ", ", map {(scalar keys %{$annotation_cache->{$chr}->{$_}}).' '.$_} keys %{$annotation_cache->{$chr}}), ")"); | |
2186 | |
2187 # compare annotations to variations in hash | |
2188 debug("Analyzing custom annotations") unless defined($config->{quiet}); | |
2189 my $total = scalar keys %{$vf_hash->{$chr}}; | |
2190 my $i = 0; | |
2191 | |
2192 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2193 progress($config, $i++, $total); | |
2194 | |
2195 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { | |
2196 foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) { | |
2197 | |
2198 $vf->{custom} = get_custom_annotation($config, $vf, $annotation_cache); | |
2199 } | |
2200 } | |
2201 } | |
2202 | |
2203 end_progress($config); | |
2204 } | |
2205 | |
2206 sub whole_genome_fetch_transcript { | |
2207 my $config = shift; | |
2208 my $vf_hash = shift; | |
2209 my $chr = shift; | |
2210 | |
2211 my $tr_cache = $config->{tr_cache}; | |
2212 my $slice_cache = $config->{slice_cache}; | |
2213 | |
2214 my $up_down_size = MAX_DISTANCE_FROM_TRANSCRIPT; | |
2215 | |
2216 # check we have defined regions | |
2217 return unless defined($vf_hash->{$chr}) && defined($tr_cache->{$chr}); | |
2218 | |
2219 # copy slice from transcript cache to slice cache | |
2220 $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr}); | |
2221 | |
2222 debug("Analyzing variants") unless defined($config->{quiet}); | |
2223 | |
2224 my $tr_counter = 0; | |
2225 my $tr_count = scalar @{$tr_cache->{$chr}}; | |
2226 | |
2227 while($tr_counter < $tr_count) { | |
2228 | |
2229 progress($config, $tr_counter, $tr_count); | |
2230 print PARENT "BUMP\n" if defined($config->{forked}); | |
2231 | |
2232 my $tr = $tr_cache->{$chr}->[$tr_counter++]; | |
2233 | |
2234 # do each overlapping VF | |
2235 my $s = $tr->start - $up_down_size; | |
2236 my $e = $tr->end + $up_down_size; | |
2237 | |
2238 # get the chunks this transcript overlaps | |
2239 my %chunks; | |
2240 $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size})); | |
2241 map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks; | |
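# worked example of the chunk arithmetic above, assuming
# $config->{chunk_size} is 50_000 and MAX_DISTANCE_FROM_TRANSCRIPT is 5_000:
# a transcript at 1_230_000-1_260_000 gives $s = 1_225_000 and $e = 1_265_000,
# so chunks int(1225000/50000)..int(1265000/50000) = 24..25 are considered,
# and any chunk with no VFs in $vf_hash is then dropped again.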
2242 | |
2243 # pointer to previous VF | |
2244 # used to tell plugins this is the last variant analysed in this transcript | |
2245 my $previous_vf; | |
2246 | |
2247 foreach my $chunk(keys %chunks) { | |
2248 foreach my $vf( | |
2249 grep {$_->{start} <= $e && $_->{end} >= $s} | |
2250 map {@{$vf_hash->{$chr}{$chunk}{$_}}} | |
2251 keys %{$vf_hash->{$chr}{$chunk}} | |
2252 ) { | |
2253 # pinch slice from slice cache if we don't already have it | |
2254 $vf->{slice} ||= $slice_cache->{$chr}; | |
2255 | |
2256 my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new( | |
2257 -transcript => $tr, | |
2258 -variation_feature => $vf, | |
2259 -adaptor => $config->{tva}, | |
2260 -no_ref_check => 1, | |
2261 -no_transfer => 1 | |
2262 ); | |
2263 | |
2264 # prefetching stuff here prevents doing loads at the | |
2265 # end and makes progress reporting more useful | |
2266 $tv->_prefetch_for_vep; | |
2267 | |
2268 $vf->add_TranscriptVariation($tv); | |
2269 | |
2270 # cache VF on the transcript if it is an unbalanced substitution (indel) | |
2271 push @{$tr->{indels}}, $vf if defined($vf->{indel}); | |
2272 | |
2273 if(defined($config->{individual})) { | |
2274 | |
2275 # store VF on transcript, weaken reference to avoid circularity | |
2276 push @{$tr->{vfs}}, $vf; | |
2277 weaken($tr->{vfs}->[-1]); | |
2278 | |
2279 delete $previous_vf->{last_in_transcript}->{$tr->stable_id}; | |
2280 $vf->{last_in_transcript}->{$tr->stable_id} = 1; | |
2281 } | |
2282 | |
2283 $previous_vf = $vf; | |
2284 } | |
2285 } | |
2286 } | |
2287 | |
2288 end_progress($config); | |
2289 } | |
2290 | |
2291 sub whole_genome_fetch_reg { | |
2292 my $config = shift; | |
2293 my $vf_hash = shift; | |
2294 my $chr = shift; | |
2295 | |
2296 my $rf_cache = $config->{rf_cache}; | |
2297 my $slice_cache = $config->{slice_cache}; | |
2298 | |
2299 foreach my $type(keys %{$rf_cache->{$chr}}) { | |
2300 debug("Analyzing ".$type."s") unless defined($config->{quiet}); | |
2301 | |
2302 my $constructor = 'Bio::EnsEMBL::Variation::'.$type.'Variation'; | |
2303 | |
2304 my $rf_counter = 0; | |
2305 my $rf_count = scalar @{$rf_cache->{$chr}->{$type}}; | |
2306 | |
2307 while($rf_counter < $rf_count) { | |
2308 | |
2309 progress($config, $rf_counter, $rf_count); | |
2310 print PARENT "BUMP\n" if defined($config->{forked}); | |
2311 | |
2312 my $rf = $rf_cache->{$chr}->{$type}->[$rf_counter++]; | |
2313 | |
2314 # do each overlapping VF | |
2315 my $s = $rf->{start}; | |
2316 my $e = $rf->{end}; | |
2317 | |
2318 # get the chunks this regulatory feature overlaps | |
2319 my %chunks; | |
2320 $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size})); | |
2321 map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks; | |
2322 | |
2323 foreach my $chunk(keys %chunks) { | |
2324 foreach my $vf( | |
2325 grep {$_->{start} <= $e && $_->{end} >= $s} | |
2326 map {@{$vf_hash->{$chr}{$chunk}{$_}}} | |
2327 keys %{$vf_hash->{$chr}{$chunk}} | |
2328 ) { | |
2329 push @{$vf->{regulation_variations}->{$type}}, $constructor->new( | |
2330 -variation_feature => $vf, | |
2331 -feature => $rf, | |
2332 -no_ref_check => 1, | |
2333 -no_transfer => 1 | |
2334 ); | |
2335 } | |
2336 } | |
2337 } | |
2338 | |
2339 end_progress($config); | |
2340 } | |
2341 } | |
2342 | |
2343 sub whole_genome_fetch_sv { | |
2344 my $config = shift; | |
2345 my $svfs = shift; | |
2346 my $chr = shift; | |
2347 | |
2348 my $tr_cache = $config->{tr_cache}; | |
2349 my $rf_cache = $config->{rf_cache}; | |
2350 my $slice_cache = $config->{slice_cache}; | |
2351 | |
2352 debug("Analyzing structural variations") unless defined($config->{quiet}); | |
2353 | |
2354 my($i, $total) = (0, scalar @$svfs); | |
2355 | |
2356 my @finished_vfs; | |
2357 | |
2358 foreach my $svf(@$svfs) { | |
2359 progress($config, $i++, $total); | |
2360 | |
2361 my %done_genes = (); | |
2362 | |
2363 if(defined($tr_cache->{$chr})) { | |
2364 foreach my $tr(grep {overlap($_->{start} - MAX_DISTANCE_FROM_TRANSCRIPT, $_->{end} + MAX_DISTANCE_FROM_TRANSCRIPT, $svf->{start}, $svf->{end})} @{$tr_cache->{$chr}}) { | |
2365 my $svo = Bio::EnsEMBL::Variation::TranscriptStructuralVariation->new( | |
2366 -transcript => $tr, | |
2367 -structural_variation_feature => $svf, | |
2368 -no_transfer => 1 | |
2369 ); | |
2370 | |
2371 $svf->add_TranscriptStructuralVariation($svo); | |
2372 } | |
2373 } | |
2374 | |
2375 $svf->{transcript_structural_variations} ||= {}; | |
2376 | |
2377 # do regulatory features | |
2378 if(defined($config->{regulatory}) && defined($rf_cache->{$chr})) { | |
2379 foreach my $rf_type(qw/RegulatoryFeature/) {#keys %{$rf_cache->{$chr}}) { | |
2380 foreach my $rf(grep {$_->{start} <= $svf->{end} && $_->{end} >= $svf->{start}} @{$rf_cache->{$chr}->{$rf_type}}) { | |
2381 my $svo = Bio::EnsEMBL::Variation::StructuralVariationOverlap->new( | |
2382 -feature => $rf, | |
2383 -structural_variation_feature => $svf, | |
2384 -no_transfer => 1 | |
2385 ); | |
2386 | |
2387 push @{$svf->{regulation_structural_variations}->{$rf_type}}, $svo; | |
2388 } | |
2389 | |
2390 $svf->{regulation_structural_variations}->{$rf_type} ||= []; | |
2391 } | |
2392 } | |
2393 | |
2394 # sort them | |
2395 #$svf->_sort_svos; | |
2396 push @finished_vfs, $svf; | |
2397 } | |
2398 | |
2399 end_progress($config); | |
2400 | |
2401 return \@finished_vfs; | |
2402 } | |
2403 | |
2404 # retrieves transcripts given region list | |
2405 sub fetch_transcripts { | |
2406 my $config = shift; | |
2407 my $regions = shift; | |
2408 my $trim_regions = shift; | |
2409 | |
2410 my $tr_cache = $config->{tr_cache}; | |
2411 my $slice_cache = $config->{slice_cache}; | |
2412 | |
2413 my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0); | |
2414 | |
2415 my %seen_trs; | |
2416 | |
2417 $count_from_mem = 0; | |
2418 my $region_count = 0; | |
2419 foreach my $chr(keys %{$regions}) { | |
2420 $count_from_mem += scalar @{$tr_cache->{$chr}} if defined($tr_cache->{$chr}) && ref($tr_cache->{$chr}) eq 'ARRAY'; | |
2421 $region_count += scalar @{$regions->{$chr}}; | |
2422 } | |
2423 | |
2424 my $counter; | |
2425 | |
2426 debug("Reading transcript data from cache and/or database") unless defined($config->{quiet}); | |
2427 | |
2428 foreach my $chr(keys %{$regions}) { | |
2429 foreach my $region(sort {(split /\-/, $a)[0] <=> (split /\-/, $b)[1]} @{$regions->{$chr}}) { | |
2430 progress($config, $counter++, $region_count); | |
2431 | |
2432 # skip regions beyond the end of the chr | |
2433 next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length; | |
2434 | |
2435 next if defined($config->{loaded_tr}->{$chr}->{$region}); | |
2436 | |
2437 # force quiet so other methods don't mess up the progress bar | |
2438 my $quiet = $config->{quiet}; | |
2439 $config->{quiet} = 1; | |
2440 | |
2441 # try and load cache from disk if using cache | |
2442 my $tmp_cache; | |
2443 if(defined($config->{cache})) { | |
2444 #$tmp_cache = ( | |
2445 # defined($config->{tabix}) ? | |
2446 # load_dumped_transcript_cache_tabix($config, $chr, $region, $trim_regions) : | |
2447 # load_dumped_transcript_cache($config, $chr, $region) | |
2448 #); | |
2449 $tmp_cache = load_dumped_transcript_cache($config, $chr, $region); | |
2450 $count_from_cache += scalar @{$tmp_cache->{$chr}} if defined($tmp_cache->{$chr}); | |
2451 $config->{loaded_tr}->{$chr}->{$region} = 1; | |
2452 } | |
2453 | |
2454 # no cache found on disk or not using cache | |
2455 if(!defined($tmp_cache->{$chr})) { | |
2456 | |
2457 if(defined($config->{offline})) { | |
2458 # restore quiet status | |
2459 $config->{quiet} = $quiet; | |
2460 | |
2461 debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet}); | |
2462 next; | |
2463 } | |
2464 | |
2465 # spoof temporary region hash | |
2466 my $tmp_hash; | |
2467 push @{$tmp_hash->{$chr}}, $region; | |
2468 | |
2469 $tmp_cache = cache_transcripts($config, $tmp_hash); | |
2470 | |
2471 # make it an empty arrayref that gets cached | |
2472 # so we don't get confused and reload next time round | |
2473 $tmp_cache->{$chr} ||= []; | |
2474 | |
2475 $count_from_db += scalar @{$tmp_cache->{$chr}}; | |
2476 | |
2477 # dump to disk if writing to cache | |
2478 (defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $region) : dump_transcript_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache}); | |
2479 | |
2480 $config->{loaded_tr}->{$chr}->{$region} = 1; | |
2481 } | |
2482 | |
2483 # add loaded transcripts to main cache | |
2484 if(defined($tmp_cache->{$chr})) { | |
2485 while(my $tr = shift @{$tmp_cache->{$chr}}) { | |
2486 | |
2487 # track already added transcripts by dbID | |
2488 my $dbID = $tr->dbID; | |
2489 if($seen_trs{$dbID}) { | |
2490 $count_duplicates++; | |
2491 next; | |
2492 } | |
2493 | |
2494 # trim out? | |
2495 #if(defined($trim_regions) && defined($trim_regions->{$chr})) { | |
2496 # my $tmp_count = scalar grep { | |
2497 # overlap( | |
2498 # (split /\-/, $_)[0], (split /\-/, $_)[1], | |
2499 # $tr->{start}, $tr->{end} | |
2500 # ) | |
2501 # } @{$trim_regions->{$chr}}; | |
2502 # | |
2503 # if(!$tmp_count) { | |
2504 # $count_trimmed++; | |
2505 # next; | |
2506 # } | |
2507 #} | |
2508 | |
2509 $seen_trs{$dbID} = 1; | |
2510 | |
2511 push @{$tr_cache->{$chr}}, $tr; | |
2512 } | |
2513 } | |
2514 | |
2515 $tr_cache->{$chr} ||= []; | |
2516 | |
2517 undef $tmp_cache; | |
2518 | |
2519 # restore quiet status | |
2520 $config->{quiet} = $quiet; | |
2521 | |
2522 # build slice cache | |
2523 $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr}); | |
2524 } | |
2525 } | |
2526 | |
2527 end_progress($config); | |
2528 | |
2529 my $tr_count = 0; | |
2530 $tr_count += scalar @{$tr_cache->{$_}} for keys %$tr_cache; | |
2531 | |
2532 debug("Retrieved $tr_count transcripts ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet}); | |
2533 | |
2534 return $tr_count; | |
2535 } | |
2536 | |
2537 sub fetch_regfeats { | |
2538 my $config = shift; | |
2539 my $regions = shift; | |
2540 my $trim_regions = shift; | |
2541 | |
2542 my $rf_cache = $config->{rf_cache}; | |
2543 my $slice_cache = $config->{slice_cache}; | |
2544 | |
2545 my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0); | |
2546 | |
2547 my %seen_rfs; | |
2548 | |
2549 $count_from_mem = 0; | |
2550 my $region_count = 0; | |
2551 | |
2552 foreach my $chr(keys %$regions) { | |
2553 if(defined($rf_cache->{$chr}) && ref($rf_cache->{$chr}) eq 'HASH') { | |
2554 $count_from_mem += scalar @{$rf_cache->{$chr}->{$_}} for keys %{$rf_cache->{$chr}}; | |
2555 } | |
2556 $region_count += scalar @{$regions->{$chr}}; | |
2557 } | |
2558 | |
2559 my $counter = 0; | |
2560 | |
2561 debug("Reading regulatory data from cache and/or database") unless defined($config->{quiet}); | |
2562 | |
2563 foreach my $chr(keys %$regions) { | |
2564 foreach my $region(sort {(split /\-/, $a)[0] cmp (split /\-/, $b)[1]} @{$regions->{$chr}}) { | |
2565 progress($config, $counter++, $region_count); | |
2566 | |
2567 next if defined($config->{loaded_rf}->{$chr}->{$region}); | |
2568 | |
2569 # skip regions beyond the end of the chr | |
2570 next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length; | |
2571 | |
2572 # force quiet so other methods don't mess up the progress bar | |
2573 my $quiet = $config->{quiet}; | |
2574 $config->{quiet} = 1; | |
2575 | |
2576 # try and load cache from disk if using cache | |
2577 my $tmp_cache; | |
2578 if(defined($config->{cache})) { | |
2579 $tmp_cache = load_dumped_reg_feat_cache($config, $chr, $region); | |
2580 | |
2581 #$tmp_cache = | |
2582 # defined($config->{tabix}) ? | |
2583 # load_dumped_reg_feat_cache_tabix($config, $chr, $region, $trim_regions) : | |
2584 # load_dumped_reg_feat_cache($config, $chr, $region); | |
2585 | |
2586 | |
2587 if(defined($tmp_cache->{$chr})) { | |
2588 $count_from_cache += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}}; | |
2589 } | |
2590 | |
2591 # flag as loaded | |
2592 $config->{loaded_rf}->{$chr}->{$region} = 1; | |
2593 } | |
2594 | |
2595 # no cache found on disk or not using cache | |
2596 if(!defined($tmp_cache->{$chr})) { | |
2597 | |
2598 if(defined($config->{offline})) { | |
2599 | |
2600 # restore quiet status | |
2601 $config->{quiet} = $quiet; | |
2602 | |
2603 debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet}); | |
2604 next; | |
2605 } | |
2606 | |
2607 # spoof temporary region hash | |
2608 my $tmp_hash; | |
2609 push @{$tmp_hash->{$chr}}, $region; | |
2610 | |
2611 $tmp_cache = cache_reg_feats($config, $tmp_hash); | |
2612 | |
2613 # make it an empty arrayref that gets cached | |
2614 # so we don't get confused and reload next time round | |
2615 $tmp_cache->{$chr} ||= {}; | |
2616 | |
2617 $count_from_db += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}}; | |
2618 | |
2619 # dump to disk if writing to cache | |
2620 #dump_reg_feat_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache}); | |
2621 (defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $tmp_cache, $chr, $region) : dump_reg_feat_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache}); | |
2622 | |
2623 # restore deleted coord_system adaptor | |
2624 foreach my $type(keys %{$tmp_cache->{$chr}}) { | |
2625 $_->{slice}->{coord_system}->{adaptor} = $config->{csa} for @{$tmp_cache->{$chr}->{$type}}; | |
2626 } | |
2627 | |
2628 # flag as loaded | |
2629 $config->{loaded_rf}->{$chr}->{$region} = 1; | |
2630 } | |
2631 | |
2632 # add loaded reg_feats to main cache | |
2633 if(defined($tmp_cache->{$chr})) { | |
2634 foreach my $type(keys %{$tmp_cache->{$chr}}) { | |
2635 while(my $rf = shift @{$tmp_cache->{$chr}->{$type}}) { | |
2636 | |
2637 # filter on cell type | |
2638 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { | |
2639 next unless grep {$rf->{cell_types}->{$_}} @{$config->{cell_type}}; | |
2640 } | |
2641 | |
2642 # trim out? | |
2643 #if(defined($trim_regions) && defined($trim_regions->{$chr})) { | |
2644 # my $tmp_count = scalar grep { | |
2645 # overlap( | |
2646 # (split /\-/, $_)[0], (split /\-/, $_)[1], | |
2647 # $rf->{start}, $rf->{end} | |
2648 # ) | |
2649 # } @{$trim_regions->{$chr}}; | |
2650 # | |
2651 # if(!$tmp_count) { | |
2652 # $count_trimmed++; | |
2653 # next; | |
2654 # } | |
2655 #} | |
2656 | |
2657 # track already added reg_feats by dbID | |
2658 my $dbID = $rf->{dbID}; | |
2659 if($seen_rfs{$dbID}) { | |
2660 $count_duplicates++; | |
2661 next; | |
2662 } | |
2663 $seen_rfs{$dbID} = 1; | |
2664 | |
2665 push @{$rf_cache->{$chr}->{$type}}, $rf; | |
2666 } | |
2667 } | |
2668 } | |
2669 | |
2670 undef $tmp_cache; | |
2671 | |
2672 # restore quiet status | |
2673 $config->{quiet} = $quiet; | |
2674 } | |
2675 } | |
2676 | |
2677 end_progress($config); | |
2678 | |
2679 my $rf_count = 0; | |
2680 | |
2681 foreach my $chr(keys %$rf_cache) { | |
2682 foreach my $type(keys %{$rf_cache->{$chr}}) { | |
2683 $rf_count += scalar @{$rf_cache->{$chr}->{$type}}; | |
2684 } | |
2685 } | |
2686 | |
2687 debug("Retrieved $rf_count regulatory features ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet}); | |
2688 | |
2689 return $rf_count; | |
2690 } | |
2691 | |
2692 # gets existing VFs for a vf_hash | |
2693 sub check_existing_hash { | |
2694 my $config = shift; | |
2695 my $vf_hash = shift; | |
2696 my $variation_cache; | |
2697 | |
2698 # we only care about non-SVs here | |
2699 my %new_hash; | |
2700 | |
2701 foreach my $chr(keys %{$vf_hash}) { | |
2702 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2703 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { | |
2704 foreach my $var(grep {$_->isa('Bio::EnsEMBL::Variation::VariationFeature')} @{$vf_hash->{$chr}->{$chunk}->{$pos}}) { | |
2705 push @{$new_hash{$chr}->{$chunk}->{$pos}}, $var; | |
2706 } | |
2707 } | |
2708 } | |
2709 } | |
2710 | |
2711 $vf_hash = \%new_hash; | |
2712 | |
2713 debug("Checking for existing variations") unless defined($config->{quiet}); | |
2714 | |
2715 my ($chunk_count, $counter); | |
2716 $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash}; | |
2717 | |
2718 foreach my $chr(keys %{$vf_hash}) { | |
2719 | |
2720 my %loaded_regions; | |
2721 | |
2722 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2723 progress($config, $counter++, $chunk_count); | |
2724 | |
2725 # get the VFs for this chunk | |
2726 my ($start, $end); | |
2727 | |
2728 # work out start and end using chunk_size | |
2729 $start = $config->{chunk_size} * $chunk; | |
2730 $end = $config->{chunk_size} * ($chunk + 1); | |
2731 | |
2732 # using cache? | |
2733 if(defined($config->{cache})) { | |
2734 my $tmp_regions; | |
2735 push @{$tmp_regions->{$chr}}, $start.'-'.$end; | |
2736 | |
2737 my $converted_regions = convert_regions($config, $tmp_regions); | |
2738 | |
2739 foreach my $region(@{$converted_regions->{$chr}}) { | |
2740 | |
2741 unless($loaded_regions{$region}) { | |
2742 my $tmp_cache = load_dumped_variation_cache($config, $chr, $region); | |
2743 | |
2744 # load from DB if not found in cache | |
2745 if(!defined($tmp_cache->{$chr})) { | |
2746 if(defined($config->{offline})) { | |
2747 debug("WARNING: Could not find variation cache for $chr\:$region") unless defined($config->{quiet}); | |
2748 next; | |
2749 } | |
2750 | |
2751 $tmp_cache->{$chr} = get_variations_in_region($config, $chr, $region); | |
2752 dump_variation_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache}); | |
2753 } | |
2754 | |
2755 # merge tmp_cache with the main cache | |
2756 foreach my $key(keys %{$tmp_cache->{$chr}}) { | |
2757 $variation_cache->{$chr}->{$key} = $tmp_cache->{$chr}->{$key}; | |
2758 delete $tmp_cache->{$chr}->{$key}; | |
2759 } | |
2760 | |
2761 # clear memory | |
2762 undef $tmp_cache; | |
2763 | |
2764 # record this region as fetched | |
2765 $loaded_regions{$region} = 1; | |
2766 } | |
2767 } | |
2768 } | |
2769 | |
2770 # no cache, get all variations in region from DB | |
2771 else { | |
2772 | |
2773 my ($min, $max); | |
2774 | |
2775 # we can fetch a smaller region when using the DB | |
2776 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { | |
2777 foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) { | |
2778 foreach my $coord(qw(start end)) { | |
2779 $min = $var->{$coord} if !defined($min) || $var->{$coord} < $min; | |
2780 $max = $var->{$coord} if !defined($max) || $var->{$coord} > $max; | |
2781 } | |
2782 } | |
2783 } | |
2784 | |
2785 $variation_cache->{$chr} = get_variations_in_region($config, $chr, $min.'-'.$max); | |
2786 } | |
2787 | |
2788 # now compare retrieved vars with vf_hash | |
2789 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { | |
2790 foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) { | |
2791 my @found; | |
2792 my @gmafs; | |
2793 | |
2794 if(defined($variation_cache->{$chr})) { | |
2795 if(my $existing_vars = $variation_cache->{$chr}->{$pos}) { | |
2796 foreach my $existing_var(grep {$_->[1] <= $config->{failed}} @$existing_vars) { | |
2797 unless(is_var_novel($config, $existing_var, $var)) { | |
2798 push @found, $existing_var->[0]; | |
2799 push @gmafs, $existing_var->[6].":".$existing_var->[7] if defined($existing_var->[6]) && defined($existing_var->[7]); | |
2800 } | |
2801 } | |
2802 } | |
2803 } | |
2804 | |
2805 $var->{existing} = join ",", @found; | |
2806 $var->{existing} ||= '-'; | |
2807 | |
2808 $var->{gmaf} = join ",", @gmafs; | |
2809 $var->{gmaf} ||= undef; | |
2810 } | |
2811 } | |
2812 } | |
2813 | |
2814 delete $variation_cache->{$chr}; | |
2815 } | |
2816 | |
2817 end_progress($config); | |
2818 } | |
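# Layout of the per-position arrayrefs in the variation cache, as inferred
# from the indices used above (other columns are not interpreted here):
# [0] = variation name, [1] = failed status (kept when <= $config->{failed}),
# [6]/[7] = GMAF allele and frequency, reported as "allele:frequency".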
2819 | |
2820 # gets overlapping SVs for a vf_hash | |
2821 sub check_svs_hash { | |
2822 my $config = shift; | |
2823 my $vf_hash = shift; | |
2824 | |
2825 debug("Checking for overlapping structural variations") unless defined($config->{quiet}); | |
2826 | |
2827 my ($chunk_count, $counter); | |
2828 $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash}; | |
2829 | |
2830 foreach my $chr(keys %{$vf_hash}) { | |
2831 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2832 | |
2833 progress($config, $counter++, $chunk_count); | |
2834 | |
2835 # work out start and end using chunk_size | |
2836 my ($start, $end); | |
2837 $start = $config->{chunk_size} * $chunk; | |
2838 $end = $config->{chunk_size} * ($chunk + 1); | |
2839 | |
2840 # check for structural variations | |
2841 if(defined($config->{sa})) { | |
2842 my $slice = $config->{sa}->fetch_by_region('chromosome', $chr, $start, $end); | |
2843 | |
2844 if(defined($slice)) { | |
2845 my $svs = $config->{svfa}->fetch_all_by_Slice($slice); | |
2846 | |
2847 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) { | |
2848 foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) { | |
2849 my $string = join ",", | |
2850 map {$_->variation_name} | |
2851 grep {$_->seq_region_start <= $var->end && $_->seq_region_end >= $var->start} | |
2852 @$svs; | |
2853 | |
2854 $var->{overlapping_svs} = $string if $string; | |
2855 } | |
2856 } | |
2857 } | |
2858 } | |
2859 } | |
2860 } | |
2861 | |
2862 end_progress($config); | |
2863 } | |
2864 | |
2865 # gets a slice from the slice adaptor | |
2866 sub get_slice { | |
2867 my $config = shift; | |
2868 my $chr = shift; | |
2869 my $otherfeatures = shift; | |
2870 $otherfeatures ||= ''; | |
2871 | |
2872 return undef unless defined($config->{sa}) && defined($chr); | |
2873 | |
2874 my $slice; | |
2875 | |
2876 # first try to get a chromosome | |
2877 eval { $slice = $config->{$otherfeatures.'sa'}->fetch_by_region('chromosome', $chr); }; | |
2878 | |
2879 # if failed, try to get any seq region | |
2880 if(!defined($slice)) { | |
2881 $slice = $config->{$otherfeatures.'sa'}->fetch_by_region(undef, $chr); | |
2882 } | |
2883 | |
2884 return $slice; | |
2885 } | |
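# Example use of get_slice(), assuming $config->{sa} holds a connected
# Bio::EnsEMBL::DBSQL::SliceAdaptor; the chromosome name is arbitrary:
#
#   my $slice = get_slice($config, '13');
#   # tries fetch_by_region('chromosome', '13') first, then falls back to
#   # fetch_by_region(undef, '13') for non-chromosome seq regions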
2886 | |
2887 | |
2888 | |
2889 | |
2890 # METHODS THAT DEAL WITH "REGIONS" | |
2891 ################################## | |
2892 | |
2893 # gets regions from VF hash | |
2894 sub regions_from_hash { | |
2895 my $config = shift; | |
2896 my $vf_hash = shift; | |
2897 | |
2898 my %include_regions; | |
2899 | |
2900 # if using cache we just want the regions of cache_region_size | |
2901 # since that's what we'll get from the cache (or DB if no cache found) | |
2902 if(defined($config->{cache})) { | |
2903 | |
2904 my $region_size = $config->{cache_region_size}; | |
2905 | |
2906 foreach my $chr(keys %$vf_hash) { | |
2907 $include_regions{$chr} = []; | |
2908 my %temp_regions; | |
2909 | |
2910 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2911 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { | |
2912 my ($s, $e) = ($pos - MAX_DISTANCE_FROM_TRANSCRIPT, $pos + MAX_DISTANCE_FROM_TRANSCRIPT); | |
2913 | |
2914 my $low = int ($s / $region_size); | |
2915 my $high = int ($e / $region_size) + 1; | |
2916 | |
2917 for my $i($low..($high - 1)) { | |
2918 $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1; | |
2919 } | |
2920 } | |
2921 } | |
2922 | |
2923 @{$include_regions{$chr}} = keys %temp_regions; | |
2924 } | |
2925 } | |
2926 | |
2927 # if no cache we don't want to fetch more than is necessary, so find the | |
2928 # minimum covered region of the variations in the hash | |
2929 else { | |
2930 foreach my $chr(keys %$vf_hash) { | |
2931 $include_regions{$chr} = []; | |
2932 | |
2933 foreach my $chunk(keys %{$vf_hash->{$chr}}) { | |
2934 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) { | |
2935 add_region($_->start, $_->end, $include_regions{$chr}) for @{$vf_hash->{$chr}{$chunk}{$pos}}; | |
2936 } | |
2937 } | |
2938 } | |
2939 | |
2940 # merge regions | |
2941 merge_regions(\%include_regions, $config); | |
2942 } | |
2943 | |
2944 return \%include_regions; | |
2945 } | |
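# Worked example of the cache-region arithmetic above, assuming a
# $config->{cache_region_size} of 1_000_000 and MAX_DISTANCE_FROM_TRANSCRIPT
# of 5_000: a variant at 21:25_586_000 gives $s = 25_581_000 and
# $e = 25_591_000, so $low = 25, $high = 26 and the single region string
# '25000001-26000000' is recorded for chromosome 21.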
2946 | |
2947 # adds a region to region list, expanding existing one if overlaps | |
2948 sub add_region { | |
2949 my $start = shift; | |
2950 my $end = shift; | |
2951 my $region_list = shift; | |
2952 | |
2953 # fix end for insertions | |
2954 $end = $start if $end < $start; | |
2955 | |
2956 my $added = 0; | |
2957 my $i = 0; | |
2958 | |
2959 while ($i < scalar @$region_list) { | |
2960 my ($region_start, $region_end) = split /\-/, $region_list->[$i]; | |
2961 | |
2962 if($start <= $region_end && $end >= $region_start) { | |
2963 my $new_region_start = ($start < $end ? $start : $end) - MAX_DISTANCE_FROM_TRANSCRIPT; | |
2964 my $new_region_end = ($start > $end ? $start : $end) + MAX_DISTANCE_FROM_TRANSCRIPT; | |
2965 | |
2966 $new_region_start = 1 if $new_region_start < 1; | |
2967 | |
2968 $region_start = $new_region_start if $new_region_start < $region_start; | |
2969 $region_end = $new_region_end if $new_region_end > $region_end; | |
2970 | |
2971 $region_list->[$i] = $region_start.'-'.$region_end; | |
2972 $added = 1; | |
2973 } | |
2974 | |
2975 $i++; | |
2976 } | |
2977 | |
2978 unless($added) { | |
2979 my $s = $start - MAX_DISTANCE_FROM_TRANSCRIPT; | |
2980 $s = 1 if $s < 1; | |
2981 | |
2982 push @{$region_list}, $s.'-'.($end + MAX_DISTANCE_FROM_TRANSCRIPT); | |
2983 } | |
2984 } | |
2985 | |
2986 # merges overlapping regions from scans | |
2987 sub merge_regions { | |
2988 my $include_regions = shift; | |
2989 my $config = shift; | |
2990 my $consecutive = shift; | |
2991 $consecutive ||= 0; | |
2992 | |
2993 # now merge overlapping regions | |
2994 foreach my $chr(keys %$include_regions) { | |
2995 my $max_index = $#{$include_regions->{$chr}}; | |
2996 my (@new_regions, %skip); | |
2997 | |
2998 for my $i(0..$max_index) { | |
2999 next if $skip{$i}; | |
3000 my ($s, $e) = split /\-/, $include_regions->{$chr}[$i]; | |
3001 | |
3002 for my $j(($i+1)..$max_index) { | |
3003 next if $skip{$j}; | |
3004 my ($ns, $ne) = split /\-/, $include_regions->{$chr}[$j]; | |
3005 | |
3006 if($s <= ($ne + $consecutive) && $e >= ($ns - $consecutive)) { | |
3007 $s = $ns if $ns < $s; | |
3008 $e = $ne if $ne > $e; | |
3009 | |
3010 $skip{$j} = 1; | |
3011 } | |
3012 } | |
3013 | |
3014 push @new_regions, $s.'-'.$e; | |
3015 } | |
3016 | |
3017 # replace original | |
3018 $include_regions->{$chr} = \@new_regions; | |
3019 | |
3020 $config->{region_count} += scalar @new_regions; | |
3021 } | |
3022 | |
3023 return $include_regions; | |
3024 } | |
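# Worked example for add_region()/merge_regions(), assuming
# MAX_DISTANCE_FROM_TRANSCRIPT is 5_000 and $consecutive is left at 0:
#
#   my @regions;
#   add_region(10_000, 10_000, \@regions);    # regions: ('5000-15000')
#   add_region(14_000, 14_000, \@regions);    # overlaps, expands to '5000-19000'
#   add_region(100_000, 100_000, \@regions);  # no overlap, adds '95000-105000'
#   merge_regions({21 => \@regions}, $config);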
3025 | |
3026 # converts regions as determined by scan_file to regions loadable from cache | |
3027 sub convert_regions { | |
3028 my $config = shift; | |
3029 my $regions = shift; | |
3030 | |
3031 return undef unless defined $regions; | |
3032 | |
3033 my $region_size = $config->{cache_region_size}; | |
3034 | |
3035 my %new_regions; | |
3036 | |
3037 foreach my $chr(keys %$regions) { | |
3038 my %temp_regions; | |
3039 | |
3040 foreach my $region(@{$regions->{$chr}}) { | |
3041 my ($s, $e) = split /\-/, $region; | |
3042 | |
3043 my $low = int ($s / $region_size); | |
3044 my $high = int ($e / $region_size) + 1; | |
3045 | |
3046 for my $i($low..($high - 1)) { | |
3047 $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1; | |
3048 } | |
3049 } | |
3050 | |
3051 @{$new_regions{$chr}} = keys %temp_regions; | |
3052 } | |
3053 | |
3054 return \%new_regions; | |
3055 } | |
3056 | |
3057 | |
3058 | |
3059 | |
3060 | |
3061 # CACHE METHODS | |
3062 ############### | |
3063 | |
3064 # prunes a cache to get rid of features not in regions in use | |
3065 sub prune_cache { | |
3066 my $config = shift; | |
3067 my $cache = shift; | |
3068 my $regions = shift; | |
3069 my $loaded = shift; | |
3070 | |
3071 # delete no longer in use chroms | |
3072 foreach my $chr(keys %$cache) { | |
3073 delete $cache->{$chr} unless defined $regions->{$chr} && scalar @{$regions->{$chr}}; | |
3074 } | |
3075 | |
3076 my $new_count = 0; | |
3077 | |
3078 foreach my $chr(keys %$cache) { | |
3079 | |
3080 # get total area spanned by regions | |
3081 my ($min, $max); | |
3082 foreach my $region(@{$regions->{$chr}}) { | |
3083 my ($s, $e) = split /\-/, $region; | |
3084 $min = $s if !defined($min) or $s < $min; | |
3085 $max = $e if !defined($max) or $e > $max; | |
3086 } | |
3087 | |
3088 # transcript cache | |
3089 if(ref($cache->{$chr}) eq 'ARRAY') { | |
3090 $cache->{$chr} = prune_min_max($cache->{$chr}, $min, $max); | |
3091 $new_count += scalar @{$cache->{$chr}}; | |
3092 } | |
3093 # regfeat cache | |
3094 elsif(ref($cache->{$chr}) eq 'HASH') { | |
3095 for(keys %{$cache->{$chr}}) { | |
3096 $cache->{$chr}->{$_} = prune_min_max($cache->{$chr}->{$_}, $min, $max); | |
3097 $new_count += scalar @{$cache->{$chr}->{$_}}; | |
3098 } | |
3099 } | |
3100 | |
3101 # update loaded regions | |
3102 my %have_regions = map {$_ => 1} @{$regions->{$chr}}; | |
3103 | |
3104 foreach my $region(keys %{$loaded->{$chr}}) { | |
3105 delete $loaded->{$chr}->{$region} unless defined $have_regions{$region}; | |
3106 } | |
3107 } | |
3108 | |
3109 return $new_count; | |
3110 } | |
3111 | |
3112 # does the actual pruning | |
3113 sub prune_min_max { | |
3114 my $array = shift; | |
3115 my $min = shift; | |
3116 my $max = shift; | |
3117 | |
3118 # splice out features not in area spanned by min/max | |
3119 my $i = 0; | |
3120 my $f_count = scalar @{$array}; | |
3121 my @new_cache; | |
3122 | |
3123 while($i < $f_count) { | |
3124 my $f = $array->[$i]; | |
3125 | |
3126 $i++; | |
3127 | |
3128 if($max - $f->start() > 0 && $f->end - $min > 0) { | |
3129 push @new_cache, $f; | |
3130 } | |
3131 | |
3132 # do some cleaning for transcripts | |
3133 elsif(defined $f->{translation}) { | |
3134 delete $f->{translation}->{transcript}; | |
3135 delete $f->{translation}; | |
3136 } | |
3137 } | |
3138 | |
3139 undef $array; | |
3140 return \@new_cache; | |
3141 } | |
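# Note on the retention test above: a feature is kept when
# $max - $f->start > 0 && $f->end - $min > 0, i.e. when it overlaps the
# min..max window spanned by the regions still in use. For example, with
# $min = 1_000_001 and $max = 2_000_000 a transcript at 1_999_500-2_050_000
# is kept, while one starting at 2_000_001 is spliced out.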
3142 | |
3143 # get transcripts for slices | |
3144 sub cache_transcripts { | |
3145 my $config = shift; | |
3146 my $include_regions = shift; | |
3147 | |
3148 my $tr_cache; | |
3149 my $i; | |
3150 | |
3151 debug("Caching transcripts") unless defined($config->{quiet}); | |
3152 | |
3153 foreach my $chr(keys %$include_regions) { | |
3154 | |
3155 my $slice = get_slice($config, $chr); | |
3156 | |
3157 next unless defined $slice; | |
3158 | |
3159 # prefetch some things | |
3160 $slice->is_circular; | |
3161 | |
3162 # trim bumf off the slice | |
3163 delete $slice->{coord_system}->{adaptor} if defined($config->{write_cache}); | |
3164 | |
3165 # no regions? | |
3166 if(!scalar @{$include_regions->{$chr}}) { | |
3167 my $start = 1; | |
3168 my $end = $config->{cache_region_size}; | |
3169 | |
3170 while($start < $slice->end) { | |
3171 push @{$include_regions->{$chr}}, $start.'-'.$end; | |
3172 $start += $config->{cache_region_size}; | |
3173 $end += $config->{cache_region_size}; | |
3174 } | |
3175 } | |
3176 | |
3177 my $region_count; | |
3178 | |
3179 if(scalar keys %$include_regions == 1) { | |
3180 my ($chr) = keys %$include_regions; | |
3181 $region_count = scalar @{$include_regions->{$chr}}; | |
3182 debug("Caching transcripts for chromosome $chr") unless defined($config->{quiet}); | |
3183 } | |
3184 | |
3185 foreach my $region(@{$include_regions->{$chr}}) { | |
3186 progress($config, $i++, $region_count || $config->{region_count}); | |
3187 | |
3188 my ($s, $e) = split /\-/, $region; | |
3189 | |
3190 # sanity check start and end | |
3191 $s = 1 if $s < 1; | |
3192 $e = $slice->end if $e > $slice->end; | |
3193 | |
3194 # get sub-slice | |
3195 my $sub_slice = $slice->sub_Slice($s, $e); | |
3196 | |
3197 # for some reason unless seq is called here the sequence becomes Ns later | |
3198 $sub_slice->seq; | |
3199 | |
3200 # add transcripts to the cache, via a transfer to the chrom's slice | |
3201 if(defined($sub_slice)) { | |
3202 foreach my $gene(map {$_->transfer($slice)} @{$sub_slice->get_all_Genes(undef, undef, 1)}) { | |
3203 my $gene_stable_id = $gene->stable_id; | |
3204 my $canonical_tr_id = $gene->{canonical_transcript_id}; | |
3205 | |
3206 my @trs; | |
3207 | |
3208 foreach my $tr(@{$gene->get_all_Transcripts}) { | |
3209 $tr->{_gene_stable_id} = $gene_stable_id; | |
3210 $tr->{_gene} = $gene; | |
3211 | |
3212 # indicate if canonical | |
3213 $tr->{is_canonical} = 1 if defined $canonical_tr_id and $tr->dbID eq $canonical_tr_id; | |
3214 | |
3215 if(defined($config->{prefetch})) { | |
3216 prefetch_transcript_data($config, $tr); | |
3217 } | |
3218 | |
3219 # CCDS | |
3220 elsif(defined($config->{ccds})) { | |
3221 my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries}; | |
3222 $tr->{_ccds} = $entries[0]->display_id if scalar @entries; | |
3223 } | |
3224 | |
3225 # strip some unnecessary data from the transcript object | |
3226 clean_transcript($tr) if defined($config->{write_cache}); | |
3227 | |
3228 push @trs, $tr; | |
3229 } | |
3230 | |
3231 # sort the transcripts by translation so we can share sift/polyphen stuff | |
3232 # between transcripts and save cache space | |
3233 if(defined($config->{write_cache}) && (defined($config->{sift}) || defined($config->{polyphen}))) { | |
3234 | |
3235 my $prev_tr; | |
3236 | |
3237 # sort them by peptide sequence as transcripts with identical peptides | |
3238 # will have identical SIFT/PolyPhen prediction strings | |
3239 foreach my $tr(sort {$a->{_variation_effect_feature_cache}->{peptide} cmp $b->{_variation_effect_feature_cache}->{peptide}} grep {$_->{_variation_effect_feature_cache}->{peptide}} @trs) { | |
3240 | |
3241 if( | |
3242 defined($prev_tr) && | |
3243 $prev_tr->{_variation_effect_feature_cache}->{peptide} | |
3244 eq $tr->{_variation_effect_feature_cache}->{peptide} | |
3245 ) { | |
3246 | |
3247 foreach my $analysis(qw(sift polyphen)) { | |
3248 next unless defined($config->{$analysis}); | |
3249 $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis} = $prev_tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis}; | |
3250 } | |
3251 } | |
3252 | |
3253 $prev_tr = $tr; | |
3254 } | |
3255 } | |
3256 | |
3257 # clean the gene | |
3258 clean_gene($gene); | |
3259 | |
3260 push @{$tr_cache->{$chr}}, @trs; | |
3261 } | |
3262 } | |
3263 } | |
3264 } | |
3265 | |
3266 end_progress($config); | |
3267 | |
3268 return $tr_cache; | |
3269 } | |
3270 | |
3271 # gets rid of extra bits of info attached to the transcript that we don't need | |
3272 sub clean_transcript { | |
3273 my $tr = shift; | |
3274 | |
3275 foreach my $key(qw(display_xref external_db external_display_name external_name external_status created_date status description edits_enabled modified_date dbentries is_current analysis transcript_mapper)) { | |
3276 delete $tr->{$key} if defined($tr->{$key}); | |
3277 } | |
3278 | |
3279 # clean all attributes but miRNA | |
3280 if(defined($tr->{attributes})) { | |
3281 my @new_atts; | |
3282 foreach my $att(@{$tr->{attributes}}) { | |
3283 push @new_atts, $att if $att->{code} eq 'miRNA'; | |
3284 } | |
3285 $tr->{attributes} = \@new_atts; | |
3286 } | |
3287 | |
3288 # clean the translation | |
3289 if(defined($tr->translation)) { | |
3290 | |
3291 # sometimes the translation points to a different transcript? | |
3292 $tr->{translation}->{transcript} = $tr; | |
3293 weaken($tr->{translation}->{transcript}); | |
3294 | |
3295 for my $key(qw(attributes protein_features created_date modified_date)) { | |
3296 delete $tr->translation->{$key}; | |
3297 } | |
3298 } | |
3299 } | |
3300 | |
3301 # gets rid of extra bits of info attached to genes. At the moment this is almost | |
3302 # everything as genes are only used for their locations when looking at | |
3303 # structural variations | |
3304 sub clean_gene { | |
3305 my $gene = shift; | |
3306 | |
3307 # delete almost everything in the gene | |
3308 map {delete $gene->{$_}} | |
3309 grep { | |
3310 $_ ne 'start' && | |
3311 $_ ne 'end' && | |
3312 $_ ne 'strand' && | |
3313 $_ ne 'stable_id' | |
3314 } | |
3315 keys %{$gene}; | |
3316 } | |
3317 | |
3318 # build slice cache from transcript cache | |
3319 sub build_slice_cache { | |
3320 my $config = shift; | |
3321 my $tr_cache = shift; | |
3322 | |
3323 my %slice_cache; | |
3324 | |
3325 foreach my $chr(keys %$tr_cache) { | |
3326 $slice_cache{$chr} = scalar @{$tr_cache->{$chr}} ? $tr_cache->{$chr}[0]->slice : &get_slice($config, $chr); | |
3327 | |
3328 if(!defined($slice_cache{$chr})) { | |
3329 delete $slice_cache{$chr} | |
3330 } | |
3331 | |
3332 else { | |
3333 # reattach adaptor to the coord system | |
3334 $slice_cache{$chr}->{coord_system}->{adaptor} ||= $config->{csa}; | |
3335 } | |
3336 } | |
3337 | |
3338 return \%slice_cache; | |
3339 } | |
3340 | |
3341 # pre-fetches per-transcript data | |
3342 sub prefetch_transcript_data { | |
3343 my $config = shift; | |
3344 my $tr = shift; | |
3345 | |
3346 # introns | |
3347 my $introns = $tr->get_all_Introns; | |
3348 | |
3349 if(defined($introns)) { | |
3350 foreach my $intron(@$introns) { | |
3351 foreach my $key(qw(adaptor analysis dbID next prev seqname)) { | |
3352 delete $intron->{$key}; | |
3353 } | |
3354 } | |
3355 } | |
3356 | |
3357 $tr->{_variation_effect_feature_cache}->{introns} ||= $introns; | |
3358 | |
3359 # translateable_seq, mapper | |
3360 $tr->{_variation_effect_feature_cache}->{translateable_seq} ||= $tr->translateable_seq; | |
3361 $tr->{_variation_effect_feature_cache}->{mapper} ||= $tr->get_TranscriptMapper; | |
3362 | |
3363 # peptide | |
3364 unless ($tr->{_variation_effect_feature_cache}->{peptide}) { | |
3365 my $translation = $tr->translate; | |
3366 $tr->{_variation_effect_feature_cache}->{peptide} = $translation ? $translation->seq : undef; | |
3367 } | |
3368 | |
3369 # protein features | |
3370 if(defined($config->{domains}) || defined($config->{write_cache})) { | |
3371 my $pfs = $tr->translation ? $tr->translation->get_all_ProteinFeatures : []; | |
3372 | |
3373 # clean them to save cache space | |
3374 foreach my $pf(@$pfs) { | |
3375 | |
3376 # remove everything but the coord, analysis and ID fields | |
3377 foreach my $key(keys %$pf) { | |
3378 delete $pf->{$key} unless | |
3379 $key eq 'start' || | |
3380 $key eq 'end' || | |
3381 $key eq 'analysis' || | |
3382 $key eq 'hseqname'; | |
3383 } | |
3384 | |
3385 # remove everything from the analysis but the display label | |
3386 foreach my $key(keys %{$pf->{analysis}}) { | |
3387 delete $pf->{analysis}->{$key} unless $key eq '_display_label'; | |
3388 } | |
3389 } | |
3390 | |
3391 $tr->{_variation_effect_feature_cache}->{protein_features} = $pfs; | |
3392 } | |
3393 | |
3394 # codon table | |
3395 unless ($tr->{_variation_effect_feature_cache}->{codon_table}) { | |
3396 # for mitochondrial DNA we need to use a different codon table | |
3397 my $attrib = $tr->slice->get_all_Attributes('codon_table')->[0]; | |
3398 | |
3399 $tr->{_variation_effect_feature_cache}->{codon_table} = $attrib ? $attrib->value : 1; | |
3400 } | |
3401 | |
3402 # sift/polyphen | |
3403 if(defined($config->{pfpma}) && defined($tr->{_variation_effect_feature_cache}->{peptide})) { | |
3404 foreach my $analysis(qw(sift polyphen)) { | |
3405 next unless defined($config->{$analysis}); | |
3406 my $a = $analysis; | |
3407 $a .= '_humvar' if $a eq 'polyphen'; | |
3408 $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$a} ||= $config->{pfpma}->fetch_by_analysis_translation_md5($a, md5_hex($tr->{_variation_effect_feature_cache}->{peptide})) | |
3409 } | |
3410 } | |
3411 | |
3412 # gene | |
3413 $tr->{_gene} ||= $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id); | |
3414 | |
3415 # gene HGNC | |
3416 if(defined $config->{hgnc}) { | |
3417 | |
3418 # get from gene cache if found already | |
3419 if(defined($tr->{_gene}->{_hgnc})) { | |
3420 $tr->{_gene_hgnc} = $tr->{_gene}->{_hgnc}; | |
3421 } | |
3422 else { | |
3423 my @entries = grep {$_->database eq 'HGNC'} @{$tr->{_gene}->get_all_DBEntries()}; | |
3424 if(scalar @entries) { | |
3425 $tr->{_gene_hgnc} = $entries[0]->display_id; | |
3426 } | |
3427 | |
3428 $tr->{_gene_hgnc} ||= '-'; | |
3429 | |
3430 # cache it on the gene object too | |
3431 $tr->{_gene}->{_hgnc} = $tr->{_gene_hgnc}; | |
3432 } | |
3433 } | |
3434 | |
3435 # CCDS | |
3436 my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries}; | |
3437 $tr->{_ccds} = $entries[0]->display_id if scalar @entries; | |
3438 $tr->{_ccds} ||= '-'; | |
3439 | |
3440 # refseq | |
3441 @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries}; | |
3442 if(scalar @entries) { | |
3443 $tr->{_refseq} = join ",", map {$_->display_id} @entries; | |
3444 } | |
3445 else { | |
3446 $tr->{_refseq} = '-'; | |
3447 } | |
3448 | |
3449 # protein stable ID | |
3450 $tr->{_protein} = $tr->translation ? $tr->translation->stable_id : '-'; | |
3451 | |
3452 return $tr; | |
3453 } | |
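# Summary of what prefetch_transcript_data() caches, as populated above:
# $tr->{_variation_effect_feature_cache} gains introns, translateable_seq,
# mapper, peptide, protein_features, codon_table and (when SIFT/PolyPhen are
# enabled) protein_function_predictions, while _gene, _gene_hgnc, _ccds,
# _refseq and _protein are stored directly on the transcript.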
3454 | |
3455 sub get_dump_file_name { | |
3456 my $config = shift; | |
3457 my $chr = shift; | |
3458 my $region = shift; | |
3459 my $type = shift; | |
3460 | |
3461 $type ||= 'transcript'; | |
3462 | |
3463 if($type eq 'transcript') { | |
3464 $type = ''; | |
3465 } | |
3466 else { | |
3467 $type = '_'.$type; | |
3468 } | |
3469 | |
3470 #my ($s, $e) = split /\-/, $region; | |
3471 #my $subdir = int($s / 1e6); | |
3472 # | |
3473 #my $dir = $config->{dir}.'/'.$chr.'/'.$subdir; | |
3474 | |
3475 my $dir = $config->{dir}.'/'.$chr; | |
3476 my $dump_file = $dir.'/'.$region.$type.(defined($config->{tabix}) ? '_tabix' : '').'.gz'; | |
3477 | |
3478 # make directory if it doesn't exist | |
3479 if(!(-e $dir) && defined($config->{write_cache})) { | |
3480 make_path($dir); | |
3481 } | |
3482 | |
3483 return $dump_file; | |
3484 } | |
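
# A minimal usage sketch (illustrative values; assumes $config->{dir} is set
# and, when writing, that $config->{write_cache} is defined):
#   get_dump_file_name($config, '21', '25000001-26000000');
#     # => "$config->{dir}/21/25000001-26000000.gz"       (transcript cache)
#   get_dump_file_name($config, '21', '25000001-26000000', 'var');
#     # => "$config->{dir}/21/25000001-26000000_var.gz"   (variation cache)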
3485 | |
3486 # dumps out transcript cache to file | |
3487 sub dump_transcript_cache { | |
3488 my $config = shift; | |
3489 my $tr_cache = shift; | |
3490 my $chr = shift; | |
3491 my $region = shift; | |
3492 | |
3493 debug("Dumping cached transcript data") unless defined($config->{quiet}); | |
3494 | |
3495 # clean the slice adaptor before storing | |
3496 clean_slice_adaptor($config); | |
3497 | |
3498 strip_transcript_cache($config, $tr_cache); | |
3499 | |
3500 $config->{reg}->disconnect_all; | |
3501 delete $config->{sa}->{dbc}->{_sql_helper}; | |
3502 | |
3503 my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript'); | |
3504 | |
3505 debug("Writing to $dump_file") unless defined($config->{quiet}); | |
3506 | |
3507 # storable | |
3508 open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; | |
3509 nstore_fd($tr_cache, $fh); | |
3510 close $fh; | |
3511 } | |
3512 | |
3513 #sub dump_transcript_cache_tabix { | |
3514 # my $config = shift; | |
3515 # my $tr_cache = shift; | |
3516 # my $chr = shift; | |
3517 # my $region = shift; | |
3518 # | |
3519 # debug("Dumping cached transcript data") unless defined($config->{quiet}); | |
3520 # | |
3521 # # clean the slice adaptor before storing | |
3522 # clean_slice_adaptor($config); | |
3523 # | |
3524 # strip_transcript_cache($config, $tr_cache); | |
3525 # | |
3526 # $config->{reg}->disconnect_all; | |
3527 # | |
3528 # my $dir = $config->{dir}.'/'.$chr; | |
3529 # my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz'; | |
3530 # | |
3531 # # make directory if it doesn't exist | |
3532 # if(!(-e $dir)) { | |
3533 # make_path($dir); | |
3534 # } | |
3535 # | |
3536 # debug("Writing to $dump_file") unless defined($config->{quiet}); | |
3537 # | |
3538 # use Storable qw(nfreeze); | |
3539 # use MIME::Base64 qw(encode_base64); | |
3540 # #open NEW, "| bgzip -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; | |
3541 # # | |
3542 # #foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) { | |
3543 # # print NEW join "\t", ( | |
3544 # # $chr, | |
3545 # # $tr->start, | |
3546 # # $tr->end, | |
3547 # # encode_base64(freeze($tr), "") | |
3548 # # ); | |
3549 # # print NEW "\n"; | |
3550 # #} | |
3551 # #close NEW; | |
3552 # # | |
3553 # ## tabix it | |
3554 # #my $output = `tabix -s 1 -b 2 -e 3 -f $dump_file 2>&1`; | |
3555 # #die("ERROR: Failed during tabix indexing\n$output\n") if $output; | |
3556 # open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; | |
3557 # | |
3558 # foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) { | |
3559 # print NEW join "\t", ( | |
3560 # $chr, | |
3561 # $tr->start, | |
3562 # $tr->end, | |
3563 # encode_base64(freeze($tr), "") | |
3564 # ); | |
3565 # print NEW "\n"; | |
3566 # } | |
3567 # close NEW; | |
3568 #} | |
3569 | |
3570 # loads in dumped transcript cache to memory | |
3571 sub load_dumped_transcript_cache { | |
3572 my $config = shift; | |
3573 my $chr = shift; | |
3574 my $region = shift; | |
3575 | |
3576 my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript'); | |
3577 | |
3578 return undef unless -e $dump_file; | |
3579 | |
3580 debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); | |
3581 | |
3582 open my $fh, $config->{compress}." ".$dump_file." |" or return undef; | |
3583 my $tr_cache; | |
3584 $tr_cache = fd_retrieve($fh); | |
3585 close $fh; | |
3586 | |
3587 # reattach adaptors | |
3588 foreach my $t(@{$tr_cache->{$chr}}) { | |
3589 if(defined($t->{translation})) { | |
3590 $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor}; | |
3591 $t->{translation}->{transcript} = $t; | |
3592 weaken($t->{translation}->{transcript}); | |
3593 } | |
3594 | |
3595 $t->{slice}->{adaptor} = $config->{sa}; | |
3596 } | |
3597 | |
3598 return $tr_cache; | |
3599 } | |
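
# A minimal usage sketch (assumes the adaptor cache has been loaded first so
# that $config->{sa} and $config->{tra} are available for reattachment, and
# that $config->{compress} holds a decompression command such as "zcat"):
#   my $tr_cache = load_dumped_transcript_cache($config, '21', '25000001-26000000');
#   my @transcripts = defined($tr_cache) ? @{$tr_cache->{'21'} || []} : ();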
3600 | |
3601 #sub load_dumped_transcript_cache_tabix { | |
3602 # my $config = shift; | |
3603 # my $chr = shift; | |
3604 # my $region = shift; | |
3605 # my $trim_regions = shift; | |
3606 # | |
3607 # my $dir = $config->{dir}.'/'.$chr; | |
3608 # my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz'; | |
3609 # | |
3610 # #print STDERR "Reading from $dump_file\n"; | |
3611 # | |
3612 # return undef unless -e $dump_file; | |
3613 # | |
3614 # debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); | |
3615 # | |
3616 # my $tr_cache; | |
3617 # | |
3618 # use MIME::Base64 qw(decode_base64); | |
3619 # use Storable qw(thaw); | |
3620 # | |
3621 # my ($s, $e) = split /\-/, $region; | |
3622 # my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}}; | |
3623 # my $regions = ""; | |
3624 # $regions .= " $chr\:$_" for @regions; | |
3625 # | |
3626 # #print STDERR "tabix $dump_file $regions |\n"; | |
3627 # #open IN, "tabix $dump_file $regions |"; | |
3628 # open IN, "gzip -dc $dump_file |"; | |
3629 # while(<IN>) { | |
3630 # | |
3631 # #$DB::single = 1; | |
3632 # my ($chr, $start, $end, $blob) = split /\t/, $_; | |
3633 # next unless grep {overlap($start, $end, (split /\-/, $_))} @regions; | |
3634 # my $tr = thaw(decode_base64($blob)); | |
3635 # push @{$tr_cache->{$chr}}, $tr; | |
3636 # } | |
3637 # close IN; | |
3638 # | |
3639 # # reattach adaptors | |
3640 # foreach my $t(@{$tr_cache->{$chr}}) { | |
3641 # if(defined($t->{translation})) { | |
3642 # $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor}; | |
3643 # $t->{translation}->{transcript} = $t; | |
3644 # weaken($t->{translation}->{transcript}); | |
3645 # } | |
3646 # | |
3647 # $t->{slice}->{adaptor} = $config->{sa}; | |
3648 # } | |
3649 # | |
3650 # # add empty array ref so code doesn't try and fetch from DB too | |
3651 # $tr_cache->{$chr} ||= []; | |
3652 # | |
3653 # return $tr_cache; | |
3654 #} | |
3655 | |
3656 # strips cache before writing to disk | |
3657 sub strip_transcript_cache { | |
3658 my $config = shift; | |
3659 my $cache = shift; | |
3660 | |
3661 foreach my $chr(keys %$cache) { | |
3662 foreach my $tr(@{$cache->{$chr}}) { | |
3663 foreach my $exon(@{$tr->{_trans_exon_array}}) { | |
3664 delete $exon->{slice}->{adaptor}; | |
3665 | |
3666 for(qw(adaptor created_date modified_date is_current version is_constitutive _seq_cache dbID slice)) { | |
3667 delete $exon->{$_}; | |
3668 } | |
3669 } | |
3670 | |
3671 delete $tr->{adaptor}; | |
3672 delete $tr->{slice}->{adaptor}; | |
3673 } | |
3674 } | |
3675 } | |
3676 | |
3677 # cleans slice adaptor before storing in cache | |
3678 sub clean_slice_adaptor{ | |
3679 my $config = shift; | |
3680 | |
3681 # clean some stuff off the slice adaptor | |
3682 delete $config->{sa}->{asm_exc_cache}; | |
3683 $config->{sa}->{sr_name_cache} = {}; | |
3684 $config->{sa}->{sr_id_cache} = {}; | |
3685 delete $config->{sa}->{db}->{seq_region_cache}; | |
3686 delete $config->{sa}->{db}->{name_cache}; | |
3687 } | |
3688 | |
3689 | |
3690 # dump adaptors to cache | |
3691 sub dump_adaptor_cache { | |
3692 my $config = shift; | |
3693 | |
3694 $config->{reg}->disconnect_all; | |
3695 delete $config->{sa}->{dbc}->{_sql_helper}; | |
3696 | |
3697 my $dir = $config->{dir}; | |
3698 my $dump_file = $dir.'/adaptors.gz'; | |
3699 | |
3700 # make directory if it doesn't exist | |
3701 if(!(-e $dir)) { | |
3702 make_path($dir); | |
3703 } | |
3704 | |
3705 open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; | |
3706 nstore_fd($config, $fh); | |
3707 close $fh; | |
3708 } | |
3709 | |
3710 # load dumped adaptors | |
3711 sub load_dumped_adaptor_cache { | |
3712 my $config = shift; | |
3713 | |
3714 my $dir = $config->{dir}; | |
3715 my $dump_file = $dir.'/adaptors.gz'; | |
3716 | |
3717 return undef unless -e $dump_file; | |
3718 | |
3719 debug("Reading cached adaptor data") unless defined($config->{quiet}); | |
3720 | |
3721 open my $fh, $config->{compress}." ".$dump_file." |" or return undef; | |
3722 my $cached_config; | |
3723 $cached_config = fd_retrieve($fh); | |
3724 close $fh; | |
3725 | |
3726 $config->{$_} = $cached_config->{$_} for qw(sa ga ta vfa svfa tva pfpma mca csa RegulatoryFeature_adaptor MotifFeature_adaptor); | |
3727 | |
3728 return 1; | |
3729 } | |
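
# A minimal usage sketch (assumes $config->{dir} points at a cache previously
# written by dump_adaptor_cache and that $config->{compress} is a working
# decompression command):
#   load_dumped_adaptor_cache($config)
#     or debug("No adaptor cache found in ".$config->{dir});
#   # on success $config->{sa}, $config->{ga}, $config->{vfa} etc. are populated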
3730 | |
3731 # dumps cached variations to disk | |
3732 sub dump_variation_cache { | |
3733 my $config = shift; | |
3734 my $v_cache = shift; | |
3735 my $chr = shift; | |
3736 my $region = shift; | |
3737 | |
3738 my $dump_file = get_dump_file_name($config, $chr, $region, 'var'); | |
3739 | |
3740 open DUMP, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to variation dump file $dump_file"; | |
3741 | |
3742 foreach my $pos(keys %{$v_cache->{$chr}}) { | |
3743 foreach my $v(@{$v_cache->{$chr}->{$pos}}) { | |
3744 my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = @$v; | |
3745 | |
3746 print DUMP join " ", ( | |
3747 $name, | |
3748 $failed == 0 ? '' : $failed, | |
3749 $start, | |
3750 $end == $start ? '' : $end, | |
3751 $as, | |
3752 $strand == 1 ? '' : $strand, | |
3753 $ma || '', | |
3754 defined($maf) ? sprintf("%.2f", $maf) : '', | |
3755 ); | |
3756 print DUMP "\n"; | |
3757 } | |
3758 } | |
3759 | |
3760 close DUMP; | |
3761 } | |
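
# Each dumped line holds eight space-separated fields:
#   name failed start end allele_string strand minor_allele minor_allele_freq
# Common defaults are written as empty fields to keep the files small
# (failed == 0, end == start, strand == 1), so a typical SNP line (made-up
# values) looks like:
#   rs123456  25000123  A/G  G 0.05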
3762 | |
3763 # loads dumped variation cache | |
3764 sub load_dumped_variation_cache { | |
3765 my $config = shift; | |
3766 my $chr = shift; | |
3767 my $region = shift; | |
3768 | |
3769 my $dump_file = get_dump_file_name($config, $chr, $region, 'var'); | |
3770 | |
3771 return undef unless -e $dump_file; | |
3772 | |
3773 open DUMP, $config->{compress}." ".$dump_file." |" or return undef; | |
3774 | |
3775 my $v_cache; | |
3776 | |
3777 while(<DUMP>) { | |
3778 chomp; | |
3779 my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = split / /, $_; | |
3780 $failed ||= 0; | |
3781 $end ||= $start; | |
3782 $strand ||= 1; | |
3783 $ma ||= undef; | |
3784 $maf ||= undef; | |
3785 | |
3786 my @v = ($name, $failed, $start, $end, $as, $strand, $ma, $maf); | |
3787 push @{$v_cache->{$chr}->{$start}}, \@v; | |
3788 } | |
3789 | |
3790 close DUMP; | |
3791 | |
3792 return $v_cache; | |
3793 } | |
3794 | |
3795 # caches regulatory features | |
3796 sub cache_reg_feats { | |
3797 my $config = shift; | |
3798 my $include_regions = shift; | |
3799 | |
3800 my $rf_cache; | |
3801 my $i; | |
3802 | |
3803 debug("Caching regulatory features") unless defined($config->{quiet}); | |
3804 | |
3805 foreach my $chr(keys %$include_regions) { | |
3806 | |
3807 my $slice = get_slice($config, $chr); | |
3808 | |
3809 next unless defined $slice; | |
3810 | |
3811 # prefetch some things | |
3812 $slice->is_circular; | |
3813 | |
3814 # no regions? | |
3815 if(!scalar @{$include_regions->{$chr}}) { | |
3816 my $start = 1; | |
3817 my $end = $config->{cache_region_size}; | |
3818 | |
3819 while($start < $slice->end) { | |
3820 push @{$include_regions->{$chr}}, $start.'-'.$end; | |
3821 $start += $config->{cache_region_size}; | |
3822 $end += $config->{cache_region_size}; | |
3823 } | |
3824 } | |
3825 | |
3826 my $region_count; | |
3827 | |
3828 if(scalar keys %$include_regions == 1) { | |
3829 my ($chr) = keys %$include_regions; | |
3830 $region_count = scalar @{$include_regions->{$chr}}; | |
3831 debug("Caching regulatory features for chromosome $chr") unless defined($config->{quiet}); | |
3832 } | |
3833 | |
3834 foreach my $region(@{$include_regions->{$chr}}) { | |
3835 progress($config, $i++, $region_count || $config->{region_count}); | |
3836 | |
3837 my ($s, $e) = split /\-/, $region; | |
3838 | |
3839 # sanity check start and end | |
3840 $s = 1 if $s < 1; | |
3841 $e = $slice->end if $e > $slice->end; | |
3842 | |
3843 # get sub-slice | |
3844 my $sub_slice = $slice->sub_Slice($s, $e); | |
3845 next unless defined($sub_slice); | |
3846 | |
3847 $sub_slice->{coord_system}->{adaptor} = $config->{csa}; | |
3848 | |
3849 foreach my $type(@REG_FEAT_TYPES) { | |
3850 my $features = $config->{$type.'_adaptor'}->fetch_all_by_Slice($sub_slice); | |
3851 next unless defined($features); | |
3852 | |
3853 # cell types | |
3854 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { | |
3855 foreach my $rf(@$features) { | |
3856 | |
3857 my %cl; | |
3858 | |
3859 # get cell type by fetching all from stable ID | |
3860 if($type eq 'RegulatoryFeature') { | |
3861 %cl = map { | |
3862 $_->feature_set->cell_type->name => $_->feature_type->name | |
3863 } @{$rf->adaptor->fetch_all_by_stable_ID($rf->stable_id)}; | |
3864 } | |
3865 | |
3866 # get cell type by fetching regfeats that contain this MotifFeature | |
3867 elsif($type eq 'MotifFeature') { | |
3868 %cl = map { | |
3869 $_->feature_set->cell_type->name => $_->feature_type->name | |
3870 } @{$config->{'RegulatoryFeature_adaptor'}->fetch_all_by_attribute_feature($rf)}; | |
3871 } | |
3872 | |
3873 $rf->{cell_types} = \%cl; | |
3874 } | |
3875 } | |
3876 | |
3877 push @{$rf_cache->{$chr}->{$type}}, | |
3878 map { clean_reg_feat($_) } | |
3879 map { $_->transfer($slice) } | |
3880 @{$features}; | |
3881 } | |
3882 } | |
3883 } | |
3884 | |
3885 end_progress($config); | |
3886 | |
3887 return $rf_cache; | |
3888 } | |
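
# A sketch of the structure returned above (feature types come from
# @REG_FEAT_TYPES, i.e. RegulatoryFeature and MotifFeature here):
#   $rf_cache->{$chr}->{'RegulatoryFeature'} = [ ...cleaned, transferred features... ];
#   $rf_cache->{$chr}->{'MotifFeature'}      = [ ...cleaned, transferred features... ];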
3889 | |
3890 | |
3891 # cleans reg feats for caching | |
3892 sub clean_reg_feat { | |
3893 my $rf = shift; | |
3894 | |
3895 foreach my $key(qw/adaptor binary_string bound_start bound_end attribute_cache feature_type feature_set analysis/) { | |
3896 delete $rf->{$key}; | |
3897 } | |
3898 | |
3899 if(defined($rf->{binding_matrix})) { | |
3900 $rf->{_variation_effect_feature_cache}->{seq} = $rf->seq; | |
3901 | |
3902 foreach my $key(qw/adaptor feature_type analysis dbID/) { | |
3903 delete $rf->{binding_matrix}->{$key}; | |
3904 } | |
3905 } | |
3906 | |
3907 return $rf; | |
3908 } | |
3909 | |
3910 | |
3911 # dumps out reg feat cache to file | |
3912 sub dump_reg_feat_cache { | |
3913 my $config = shift; | |
3914 my $rf_cache = shift; | |
3915 my $chr = shift; | |
3916 my $region = shift; | |
3917 | |
3918 debug("Dumping cached reg feat data for $chr:$region") unless defined($config->{quiet}); | |
3919 | |
3920 # clean the slice adaptor before storing | |
3921 clean_slice_adaptor($config); | |
3922 | |
3923 $config->{reg}->disconnect_all; | |
3924 delete $config->{sa}->{dbc}->{_sql_helper}; | |
3925 | |
3926 foreach my $chr(keys %{$rf_cache}) { | |
3927 foreach my $type(keys %{$rf_cache->{$chr}}) { | |
3928 delete $_->{slice}->{coord_system}->{adaptor} for @{$rf_cache->{$chr}->{$type}}; | |
3929 } | |
3930 } | |
3931 | |
3932 my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); | |
3933 | |
3934 debug("Writing to $dump_file") unless defined($config->{quiet}); | |
3935 | |
3936 # storable | |
3937 open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; | |
3938 nstore_fd($rf_cache, $fh); | |
3939 close $fh; | |
3940 } | |
3941 | |
3942 #sub dump_reg_feat_cache_tabix { | |
3943 # my $config = shift; | |
3944 # my $rf_cache = shift; | |
3945 # my $chr = shift; | |
3946 # my $region = shift; | |
3947 # | |
3948 # debug("Dumping cached reg feat data") unless defined($config->{quiet}); | |
3949 # | |
3950 # # clean the slice adaptor before storing | |
3951 # clean_slice_adaptor($config); | |
3952 # | |
3953 # $config->{reg}->disconnect_all; | |
3954 # delete $config->{sa}->{dbc}->{_sql_helper}; | |
3955 # | |
3956 # $config->{reg}->disconnect_all; | |
3957 # | |
3958 # my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); | |
3959 # | |
3960 # debug("Writing to $dump_file") unless defined($config->{quiet}); | |
3961 # | |
3962 # use Storable qw(nfreeze); | |
3963 # use MIME::Base64 qw(encode_base64); | |
3964 # open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file"; | |
3965 # | |
3966 # foreach my $type(keys %{$rf_cache->{$chr}}) { | |
3967 # foreach my $rf(sort {$a->start <=> $b->start} @{$rf_cache->{$chr}->{$type}}) { | |
3968 # print NEW join "\t", ( | |
3969 # $chr, | |
3970 # $rf->start, | |
3971 # $rf->end, | |
3972 # $type, | |
3973 # encode_base64(freeze($rf), "") | |
3974 # ); | |
3975 # print NEW "\n"; | |
3976 # } | |
3977 # } | |
3978 # close NEW; | |
3979 #} | |
3980 | |
3981 # loads in dumped reg feat cache to memory | |
3982 sub load_dumped_reg_feat_cache { | |
3983 my $config = shift; | |
3984 my $chr = shift; | |
3985 my $region = shift; | |
3986 | |
3987 my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); | |
3988 | |
3989 return undef unless -e $dump_file; | |
3990 | |
3991 debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); | |
3992 | |
3993 open my $fh, $config->{compress}." ".$dump_file." |" or return undef; | |
3994 my $rf_cache; | |
3995 $rf_cache = fd_retrieve($fh); | |
3996 close $fh; | |
3997 | |
3998 return $rf_cache; | |
3999 } | |
4000 | |
4001 | |
4002 | |
4003 #sub load_dumped_reg_feat_cache_tabix { | |
4004 # my $config = shift; | |
4005 # my $chr = shift; | |
4006 # my $region = shift; | |
4007 # my $trim_regions = shift; | |
4008 # | |
4009 # my $dump_file = get_dump_file_name($config, $chr, $region, 'reg'); | |
4010 # | |
4011 # #print STDERR "Reading from $dump_file\n"; | |
4012 # | |
4013 # return undef unless -e $dump_file; | |
4014 # | |
4015 # debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet}); | |
4016 # | |
4017 # my $rf_cache; | |
4018 # | |
4019 # use MIME::Base64 qw(decode_base64); | |
4020 # use Storable qw(thaw); | |
4021 # | |
4022 # my ($s, $e) = split /\-/, $region; | |
4023 # my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}}; | |
4024 # my $regions = ""; | |
4025 # $regions .= " $chr\:$_" for @regions; | |
4026 # | |
4027 # #print STDERR "tabix $dump_file $regions |\n"; | |
4028 # #open IN, "tabix $dump_file $regions |"; | |
4029 # open IN, "gzip -dc $dump_file |"; | |
4030 # while(<IN>) { | |
4031 # my ($chr, $start, $end, $type, $blob) = split /\t/, $_; | |
4032 # next unless grep {overlap($start, $end, (split /\-/, $_))} @regions; | |
4033 # my $rf = thaw(decode_base64($blob)); | |
4034 # push @{$rf_cache->{$chr}->{$type}}, $rf; | |
4035 # } | |
4036 # close IN; | |
4037 # | |
4038 # $rf_cache->{$chr}->{$_} ||= [] for @REG_FEAT_TYPES; | |
4039 # | |
4040 # return $rf_cache; | |
4041 #} | |
4042 | |
4043 | |
4044 # get custom annotation for a region | |
4045 sub cache_custom_annotation { | |
4046 my $config = shift; | |
4047 my $include_regions = shift; | |
4048 my $chr = shift; | |
4049 | |
4050 #$include_regions = merge_regions($include_regions, $config, 1); | |
4051 | |
4052 my $annotation = {}; | |
4053 | |
4054 my $total = scalar @{$config->{custom}} * scalar @{$include_regions->{$chr}}; | |
4055 my $counter = 0; | |
4056 | |
4057 my $max_regions_per_tabix = 1000; | |
4058 | |
4059 debug("Caching custom annotations") unless defined($config->{quiet}); | |
4060 | |
4061 foreach my $custom(@{$config->{custom}}) { | |
4062 | |
4063 my @regions = @{$include_regions->{$chr}}; | |
4064 | |
4065 while(scalar @regions) { | |
4066 my $got_features = 0; | |
4067 | |
4068 my @tmp_regions = splice @regions, 0, $max_regions_per_tabix; | |
4069 | |
4070 progress($config, $counter, $total); | |
4071 $counter += scalar @tmp_regions; | |
4072 | |
4073 # some files may have e.g. chr10 instead of 10 | |
4074 for my $tmp_chr($chr, 'chr'.$chr) { | |
4075 | |
4076 # bigwig needs to use bigWigToWig utility | |
4077 if($custom->{format} eq 'bigwig') { | |
4078 foreach my $region(@tmp_regions) { | |
4079 my ($s, $e) = split /\-/, $region; | |
4080 my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$.'_'.$tmp_chr.'_'.$s.'_'.$e; | |
4081 my $bigwig_file = $custom->{file}; | |
4082 my $bigwig_output = `bigWigToWig -chrom=$tmp_chr -start=$s -end=$e $bigwig_file $tmp_file 2>&1`; | |
4083 | |
4084 die "\nERROR: Problem using bigwig file $bigwig_file\n$bigwig_output" if $bigwig_output; | |
4085 } | |
4086 | |
4087 # concatenate all the files together | |
4088 my $string = $config->{tmpdir}.'/vep_tmp_'.$$.'_*'; | |
4089 my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$; | |
4090 `cat $string > $tmp_file; rm $string`; | |
4091 open CUSTOM, $tmp_file | |
4092 or die "\nERROR: Could not read from temporary WIG file $tmp_file\n"; | |
4093 } | |
4094 | |
4095 # otherwise use tabix | |
4096 else { | |
4097 # tabix can fetch multiple regions, so construct a string | |
4098 my $region_string = join " ", map {$tmp_chr.':'.$_} @tmp_regions; | |
4099 | |
4100 open CUSTOM, "tabix ".$custom->{file}." $region_string 2>&1 |" | |
4101 or die "\nERROR: Could not open tabix pipe for ".$custom->{file}."\n"; | |
4102 } | |
4103 | |
4104 # set an error flag so we don't have to check every line | |
4105 my $error_flag = 1; | |
4106 | |
4107 while(<CUSTOM>) { | |
4108 chomp; | |
4109 | |
4110 # check for errors | |
4111 if($error_flag) { | |
4112 die "\nERROR: Problem using annotation file ".$custom->{file}."\n$_\n" if /invalid pointer|tabix|get_intv/; | |
4113 $error_flag = 0; | |
4114 } | |
4115 | |
4116 my @data = split /\t/, $_; | |
4117 | |
4118 my $feature; | |
4119 | |
4120 if($custom->{format} eq 'bed') { | |
4121 $feature = { | |
4122 chr => $chr, | |
4123 start => $data[1], | |
4124 end => $data[2], | |
4125 name => $data[3], | |
4126 }; | |
4127 } | |
4128 | |
4129 elsif($custom->{format} eq 'vcf') { | |
4130 my $tmp_vf = parse_vcf($config, $_)->[0]; | |
4131 | |
4132 $feature = { | |
4133 chr => $chr, | |
4134 start => $tmp_vf->{start}, | |
4135 end => $tmp_vf->{end}, | |
4136 name => $tmp_vf->{variation_name}, | |
4137 }; | |
4138 } | |
4139 | |
4140 elsif($custom->{format} eq 'gff' || $custom->{format} eq 'gtf') { | |
4141 | |
4142 my $name; | |
4143 | |
4144 # try and get a feature name from the attributes column | |
4145 foreach my $attrib(split /\s*\;\s*/, $data[8]) { | |
4146 my ($key, $value) = split /\=/, $attrib; | |
4147 $name = $value if $key eq 'ID'; | |
4148 } | |
4149 | |
4150 $name ||= $data[2]."_".$data[0].":".$data[3]."-".$data[4]; | |
4151 | |
4152 $feature = { | |
4153 chr => $chr, | |
4154 start => $data[3], | |
4155 end => $data[4], | |
4156 name => $name, | |
4157 }; | |
4158 } | |
4159 | |
4160 elsif($custom->{format} eq 'bigwig') { | |
4161 $feature = { | |
4162 chr => $chr, | |
4163 start => $data[0], | |
4164 end => $data[0], | |
4165 name => $data[1], | |
4166 }; | |
4167 } | |
4168 | |
4169 if(defined($feature)) { | |
4170 $got_features = 1; | |
4171 | |
4172 if(!defined($feature->{name}) || $custom->{coords}) { | |
4173 $feature->{name} = $feature->{chr}.":".$feature->{start}."-".$feature->{end}; | |
4174 } | |
4175 | |
4176 # add the feature to the cache | |
4177 $annotation->{$chr}->{$custom->{name}}->{$feature->{start}}->{$feature->{name}} = $feature; | |
4178 } | |
4179 } | |
4180 close CUSTOM; | |
4181 | |
4182 # unlink temporary wig files | |
4183 unlink($config->{tmpdir}.'/vep_tmp_'.$$) if $custom->{format} eq 'bigwig'; | |
4184 | |
4185 # no need to fetch e.g. "chr21" features if just "21" worked | |
4186 last if $got_features; | |
4187 } | |
4188 } | |
4189 } | |
4190 | |
4191 end_progress($config); | |
4192 | |
4193 return $annotation; | |
4194 } | |
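
# A sketch of the structure returned above, keyed on chromosome, custom
# annotation name, feature start and feature name:
#   $annotation->{$chr}->{$custom->{name}}->{$start}->{$name} = {
#     chr => $chr, start => $start, end => $end, name => $name,
#   };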
4195 | |
4196 # builds a full cache for this species | |
4197 sub build_full_cache { | |
4198 my $config = shift; | |
4199 | |
4200 my @slices; | |
4201 | |
4202 if($config->{build} =~ /all/i) { | |
4203 @slices = @{$config->{sa}->fetch_all('toplevel')}; | |
4204 push @slices, @{$config->{sa}->fetch_all('lrg', undef, 1, undef, 1)} if defined($config->{lrg}); | |
4205 } | |
4206 else { | |
4207 foreach my $val(split /\,/, $config->{build}) { | |
4208 my @nnn = split /\-/, $val; | |
4209 | |
4210 foreach my $chr($nnn[0]..$nnn[-1]) { | |
4211 my $slice = get_slice($config, $chr); | |
4212 push @slices, $slice if defined($slice); | |
4213 } | |
4214 } | |
4215 } | |
4216 | |
4217 foreach my $slice(@slices) { | |
4218 my $chr = $slice->seq_region_name; | |
4219 | |
4220 # check for features, we don't want a load of effectively empty dirs | |
4221 my $dbc = $config->{sa}->db->dbc; | |
4222 my $sth = $dbc->prepare("SELECT COUNT(*) FROM transcript WHERE seq_region_id = ?"); | |
4223 $sth->execute($slice->get_seq_region_id); | |
4224 | |
4225 my $count; | |
4226 $sth->bind_columns(\$count); | |
4227 $sth->fetch; | |
4228 $sth->finish; | |
4229 | |
4230 next unless $count > 0; | |
4231 | |
4232 my $regions; | |
4233 | |
4234 # for progress | |
4235 my $region_count = int($slice->end / $config->{cache_region_size}) + 1; | |
4236 my $counter = 0; | |
4237 | |
4238 # initial region | |
4239 my ($start, $end) = (1, $config->{cache_region_size}); | |
4240 | |
4241 debug((defined($config->{rebuild}) ? "Rebuild" : "Creat")."ing cache for chromosome $chr") unless defined($config->{quiet}); | |
4242 | |
4243 while($start < $slice->end) { | |
4244 | |
4245 progress($config, $counter++, $region_count); | |
4246 | |
4247 # store quiet status | |
4248 my $quiet = $config->{quiet}; | |
4249 $config->{quiet} = 1; | |
4250 | |
4251 # spoof regions | |
4252 $regions->{$chr} = [$start.'-'.$end]; | |
4253 | |
4254 # store transcripts | |
4255 my $tmp_cache = (defined($config->{rebuild}) ? load_dumped_transcript_cache($config, $chr, $start.'-'.$end) : cache_transcripts($config, $regions)); | |
4256 $tmp_cache->{$chr} ||= []; | |
4257 | |
4258 #(defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $start.'-'.$end) : dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end)); | |
4259 dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end); | |
4260 undef $tmp_cache; | |
4261 | |
4262 # store reg feats | |
4263 if(defined($config->{regulatory})) { | |
4264 my $rf_cache = cache_reg_feats($config, $regions); | |
4265 $rf_cache->{$chr} ||= {}; | |
4266 | |
4267 dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end); | |
4268 #(defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $rf_cache, $chr, $start.'-'.$end) : dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end)); | |
4269 undef $rf_cache; | |
4270 | |
4271 # this gets cleaned off but needs to be there for the next loop | |
4272 $slice->{coord_system}->{adaptor} = $config->{csa}; | |
4273 } | |
4274 | |
4275 # store variations | |
4276 my $variation_cache; | |
4277 $variation_cache->{$chr} = get_variations_in_region($config, $chr, $start.'-'.$end); | |
4278 $variation_cache->{$chr} ||= {}; | |
4279 | |
4280 dump_variation_cache($config, $variation_cache, $chr, $start.'-'.$end); | |
4281 undef $variation_cache; | |
4282 | |
4283 # restore quiet status | |
4284 $config->{quiet} = $quiet; | |
4285 | |
4286 # increment by cache_region_size to get next region | |
4287 $start += $config->{cache_region_size}; | |
4288 $end += $config->{cache_region_size}; | |
4289 } | |
4290 | |
4291 end_progress($config); | |
4292 | |
4293 undef $regions; | |
4294 } | |
4295 | |
4296 write_cache_info($config); | |
4297 } | |
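
# The resulting on-disk layout per chromosome, stepping through the genome in
# chunks of $config->{cache_region_size} (e.g. with a 1Mb region size):
#   <dir>/<chr>/1-1000000.gz        transcript cache (gzipped Storable)
#   <dir>/<chr>/1-1000000_reg.gz    regulatory feature cache (when $config->{regulatory} is set)
#   <dir>/<chr>/1-1000000_var.gz    variation cache (gzipped plain text)
#   <dir>/info.txt                  cache metadata written by write_cache_info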
4298 | |
4299 # write an info file that defines what is in the cache | |
4300 sub write_cache_info { | |
4301 my $config = shift; | |
4302 | |
4303 my $info_file = $config->{dir}.'/info.txt'; | |
4304 | |
4305 open OUT, ">>$info_file" or die "ERROR: Could not write to cache info file $info_file\n"; | |
4306 | |
4307 print OUT "# CACHE UPDATED ".get_time()."\n"; | |
4308 | |
4309 foreach my $param(qw( | |
4310 host | |
4311 port | |
4312 user | |
4313 build | |
4314 regulatory | |
4315 sift | |
4316 polyphen | |
4317 )) { | |
4318 print OUT "$param\t".(defined $config->{$param} ? $config->{$param} : '-')."\n"; | |
4319 } | |
4320 | |
4321 # cell types | |
4322 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) { | |
4323 my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor(); | |
4324 print OUT "cell_types\t".(join ",", map {$_->name} @{$cta->fetch_all}); | |
4325 print OUT "\n"; | |
4326 } | |
4327 | |
4328 close OUT; | |
4329 } | |
4330 | |
4331 # reads in cache info file | |
4332 sub read_cache_info { | |
4333 my $config = shift; | |
4334 | |
4335 my $info_file = $config->{dir}.'/info.txt'; | |
4336 | |
4337 open IN, $info_file or return 0; | |
4338 | |
4339 while(<IN>) { | |
4340 next if /^#/; | |
4341 chomp; | |
4342 my ($param, $value) = split /\t/; | |
4343 $config->{'cache_'.$param} = $value unless defined $value && $value eq '-'; | |
4344 } | |
4345 | |
4346 close IN; | |
4347 | |
4348 return 1; | |
4349 } | |
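
# info.txt is a simple tab-separated key/value list; an illustrative file
# (values made up) might contain:
#   # CACHE UPDATED 2012-06-01 12:00:00
#   host    ensembldb.ensembl.org
#   port    5306
#   sift    b
# read_cache_info exposes each non-'-' value as $config->{'cache_'.$param},
# e.g. $config->{cache_sift}.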
4350 | |
4351 # format coords for printing | |
4352 sub format_coords { | |
4353 my ($start, $end) = @_; | |
4354 | |
4355 if(defined($start)) { | |
4356 if(defined($end)) { | |
4357 if($start > $end) { | |
4358 return $end.'-'.$start; | |
4359 } | |
4360 elsif($start == $end) { | |
4361 return $start; | |
4362 } | |
4363 else { | |
4364 return $start.'-'.$end; | |
4365 } | |
4366 } | |
4367 else { | |
4368 return $start.'-?'; | |
4369 } | |
4370 } | |
4371 elsif(defined($end)) { | |
4372 return '?-'.$end; | |
4373 } | |
4374 else { | |
4375 return '-'; | |
4376 } | |
4377 } | |
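
# Examples of the rules above:
#   format_coords(100, 200);   # "100-200"
#   format_coords(200, 100);   # "100-200" (swapped so start <= end)
#   format_coords(100, 100);   # "100"
#   format_coords(100, undef); # "100-?"
#   format_coords(undef, 200); # "?-200"
#   format_coords();           # "-"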
4378 | |
4379 | |
4380 | |
4381 | |
4382 # METHODS TO FIND CO-LOCATED / EXISTING VARIATIONS | |
4383 ################################################## | |
4384 | |
4385 # finds an existing VF in the db | |
4386 sub find_existing { | |
4387 my $config = shift; | |
4388 my $new_vf = shift; | |
4389 | |
4390 if(defined($config->{vfa}->db)) { | |
4391 | |
4392 my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL'; | |
4393 | |
4394 my $sth = $config->{vfa}->db->dbc->prepare(qq{ | |
4395 SELECT variation_name, IF(fv.variation_id IS NULL, 0, 1), seq_region_start, seq_region_end, allele_string, seq_region_strand, $maf_cols | |
4396 FROM variation_feature vf LEFT JOIN failed_variation fv | |
4397 ON vf.variation_id = fv.variation_id | |
4398 WHERE vf.seq_region_id = ? | |
4399 AND vf.seq_region_start = ? | |
4400 AND vf.seq_region_end = ? | |
4401 ORDER BY vf.source_id ASC | |
4402 }); | |
4403 | |
4404 $sth->execute($new_vf->slice->get_seq_region_id, $new_vf->start, $new_vf->end); | |
4405 | |
4406 my @v; | |
4407 for my $i(0..7) { | |
4408 $v[$i] = undef; | |
4409 } | |
4410 | |
4411 $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]); | |
4412 | |
4413 my @found; | |
4414 | |
4415 while($sth->fetch) { | |
4416 push @found, $v[0] unless is_var_novel($config, \@v, $new_vf) || $v[1] > $config->{failed}; | |
4417 } | |
4418 | |
4419 $sth->finish(); | |
4420 | |
4421 return (scalar @found ? join ",", @found : undef); | |
4422 } | |
4423 | |
4424 return undef; | |
4425 } | |
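
# Returns a comma-separated string of co-located variant names (e.g.
# "rs123,COSM456", both illustrative) or undef when nothing suitable is found;
# entries are skipped if is_var_novel() says they don't match or if their
# failed status exceeds $config->{failed}.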
4426 | |
4427 # compare a new vf to one from the cache / DB | |
4428 sub is_var_novel { | |
4429 my $config = shift; | |
4430 my $existing_var = shift; | |
4431 my $new_var = shift; | |
4432 | |
4433 my $is_novel = 1; | |
4434 | |
4435 $is_novel = 0 if $existing_var->[2] == $new_var->start && $existing_var->[3] == $new_var->end; | |
4436 | |
4437 if(defined($config->{check_alleles})) { | |
4438 my %existing_alleles; | |
4439 | |
4440 $existing_alleles{$_} = 1 for split /\//, $existing_var->[4]; | |
4441 | |
4442 my $seen_new = 0; | |
4443 foreach my $a(split /\//, ($new_var->allele_string || "")) { | |
4444 reverse_comp(\$a) if $new_var->strand ne $existing_var->[5]; | |
4445 $seen_new = 1 unless defined $existing_alleles{$a}; | |
4446 } | |
4447 | |
4448 $is_novel = 1 if $seen_new; | |
4449 } | |
4450 | |
4451 return $is_novel; | |
4452 } | |
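
# A small worked example (made-up data, row order as used in find_existing):
#   my $existing = ['rs1', 0, 100, 100, 'A/G', 1, undef, undef];
#   # a new VF spanning 100-100 with allele string A/T:
#   #   without check_alleles: not novel (coordinates match)
#   #   with check_alleles:    novel, because T is absent from A/G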
4453 | |
4454 # check frequencies of existing var against requested params | |
4455 sub check_frequencies { | |
4456 my $config = shift; | |
4457 my $var_name = shift; | |
4458 | |
4459 my $v = $config->{va}->fetch_by_name($var_name); | |
4460 | |
4461 my $pass = 0; | |
4462 | |
4463 my $freq_pop = $config->{freq_pop}; | |
4464 my $freq_freq = $config->{freq_freq}; | |
4465 my $freq_gt_lt = $config->{freq_gt_lt}; | |
4466 | |
4467 my $freq_pop_name = (split /\_/, $freq_pop)[-1]; | |
4468 $freq_pop_name = undef if $freq_pop_name =~ /1kg|hap/; | |
4469 | |
4470 delete $config->{filtered_freqs}; | |
4471 | |
4472 foreach my $a(@{$v->get_all_Alleles}) { | |
4473 next unless defined $a->{population} || defined $a->{'_population_id'}; | |
4474 next unless defined $a->frequency; | |
4475 next if $a->frequency > 0.5; | |
4476 | |
4477 my $pop_name = $a->population->name; | |
4478 | |
4479 if($freq_pop =~ /1kg/) { next unless $pop_name =~ /^1000.+(low|phase).+/i; } | |
4480 if($freq_pop =~ /hap/) { next unless $pop_name =~ /^CSHL-HAPMAP/i; } | |
4481 if($freq_pop =~ /any/) { next unless $pop_name =~ /^(CSHL-HAPMAP)|(1000.+(low|phase).+)/i; } | |
4482 if(defined $freq_pop_name) { next unless $pop_name =~ /$freq_pop_name/i; } | |
4483 | |
4484 $pass = 1 if $a->frequency >= $freq_freq and $freq_gt_lt eq 'gt'; | |
4485 $pass = 1 if $a->frequency <= $freq_freq and $freq_gt_lt eq 'lt'; | |
4486 | |
4487 $pop_name =~ s/\:/\_/g; | |
4488 push @{$config->{filtered_freqs}}, $pop_name.':'.$a->frequency; | |
4489 | |
4490 #warn "Comparing allele ", $a->allele, " ", $a->frequency, " for $var_name in population ", $a->population->name, " PASS $pass"; | |
4491 } | |
4492 | |
4493 return 0 if $config->{freq_filter} eq 'exclude' and $pass == 1; | |
4494 return 0 if $config->{freq_filter} eq 'include' and $pass == 0; | |
4495 return 1; | |
4496 } | |
4497 | |
4498 # gets all variations in a region | |
4499 sub get_variations_in_region { | |
4500 my $config = shift; | |
4501 my $chr = shift; | |
4502 my $region = shift; | |
4503 | |
4504 my ($start, $end) = split /\-/, $region; | |
4505 | |
4506 my %variations; | |
4507 | |
4508 if(defined($config->{vfa}->db)) { | |
4509 my $sr_cache = $config->{seq_region_cache}; | |
4510 | |
4511 if(!defined($sr_cache)) { | |
4512 $sr_cache = cache_seq_region_ids($config); | |
4513 $config->{seq_region_cache} = $sr_cache; | |
4514 } | |
4515 | |
4516 # no seq_region_id? | |
4517 return {} unless defined($sr_cache) && defined($sr_cache->{$chr}); | |
4518 | |
4519 my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL'; | |
4520 | |
4521 my $sth = $config->{vfa}->db->dbc->prepare(qq{ | |
4522 SELECT vf.variation_name, IF(fv.variation_id IS NULL, 0, 1), vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand, $maf_cols | |
4523 FROM variation_feature vf | |
4524 LEFT JOIN failed_variation fv ON fv.variation_id = vf.variation_id | |
4525 WHERE vf.seq_region_id = ? | |
4526 AND vf.seq_region_start >= ? | |
4527 AND vf.seq_region_start <= ? | |
4528 }); | |
4529 | |
4530 $sth->execute($sr_cache->{$chr}, $start, $end); | |
4531 | |
4532 my @v; | |
4533 for my $i(0..7) { | |
4534 $v[$i] = undef; | |
4535 } | |
4536 | |
4537 $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]); | |
4538 | |
4539 while($sth->fetch) { | |
4540 my @v_copy = @v; | |
4541 push @{$variations{$v[2]}}, \@v_copy; | |
4542 } | |
4543 | |
4544 $sth->finish(); | |
4545 } | |
4546 | |
4547 return \%variations; | |
4548 } | |
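
# A sketch of the returned structure (a hashref keyed on seq_region_start,
# each value an arrayref of rows in the same column order as the SELECT above):
#   my $vars = get_variations_in_region($config, '21', '25000001-26000000');
#   # $vars->{$start} = [
#   #   [ $name, $failed, $start, $end, $allele_string, $strand, $minor_allele, $maf ],
#   #   ...
#   # ];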
4549 | |
4550 sub cache_seq_region_ids { | |
4551 my $config = shift; | |
4552 | |
4553 my (%cache, $chr, $id); | |
4554 | |
4555 my $sth = $config->{vfa}->db->dbc->prepare(qq{ | |
4556 SELECT seq_region_id, name FROM seq_region | |
4557 }); | |
4558 | |
4559 $sth->execute(); | |
4560 $sth->bind_columns(\$id, \$chr); | |
4561 $cache{$chr} = $id while $sth->fetch(); | |
4562 $sth->finish; | |
4563 | |
4564 return \%cache; | |
4565 } | |
4566 | |
4567 sub have_maf_cols { | |
4568 my $config = shift; | |
4569 | |
4570 if(!defined($config->{have_maf_cols})) { | |
4571 my $sth = $config->{vfa}->db->dbc->prepare(qq{ | |
4572 DESCRIBE variation_feature | |
4573 }); | |
4574 | |
4575 $sth->execute(); | |
4576 my @cols = map {$_->[0]} @{$sth->fetchall_arrayref}; | |
4577 $sth->finish(); | |
4578 | |
4579 $config->{have_maf_cols} = 0; | |
4580 $config->{have_maf_cols} = 1 if grep {$_ eq 'minor_allele'} @cols; | |
4581 } | |
4582 | |
4583 return $config->{have_maf_cols}; | |
4584 } | |
4585 | |
4586 sub merge_hashes { | |
4587 my ($x, $y) = @_; | |
4588 | |
4589 foreach my $k (keys %$y) { | |
4590 if (!defined($x->{$k})) { | |
4591 $x->{$k} = $y->{$k}; | |
4592 } else { | |
4593 if(ref($x->{$k}) eq 'ARRAY') { | |
4594 $x->{$k} = merge_arrays($x->{$k}, $y->{$k}); | |
4595 } | |
4596 elsif(ref($x->{$k}) eq 'HASH') { | |
4597 $x->{$k} = merge_hashes($x->{$k}, $y->{$k}); | |
4598 } | |
4599 else { | |
4600 $x->{$k} = $y->{$k}; | |
4601 } | |
4602 } | |
4603 } | |
4604 return $x; | |
4605 } | |
4606 | |
4607 sub merge_arrays { | |
4608 my ($x, $y) = @_; | |
4609 | |
4610 my %tmp = map {$_ => 1} (@$x, @$y); | |
4611 | |
4612 return [keys %tmp]; | |
4613 } | |
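
# A small illustration of the merge behaviour (made-up data):
#   my $merged = merge_hashes(
#     { a => [1, 2], b => { x => 1 } },
#     { a => [2, 3], b => { y => 2 }, c => 4 },
#   );
#   # $merged->{a} holds 1, 2, 3 (order not guaranteed)
#   # $merged->{b} is { x => 1, y => 2 }
#   # $merged->{c} is 4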
4614 | |
4615 | |
4616 | |
4617 | |
4618 # DEBUG AND STATUS METHODS | |
4619 ########################## | |
4620 | |
4621 # gets time | |
4622 sub get_time() { | |
4623 my @time = localtime(time()); | |
4624 | |
4625 # increment the month (Jan = 0) | |
4626 $time[4]++; | |
4627 | |
4628 # add leading zeroes as required | |
4629 for my $i(0..4) { | |
4630 $time[$i] = "0".$time[$i] if $time[$i] < 10; | |
4631 } | |
4632 | |
4633 # put the components together in a string | |
4634 my $time = | |
4635 ($time[5] + 1900)."-". | |
4636 $time[4]."-". | |
4637 $time[3]." ". | |
4638 $time[2].":". | |
4639 $time[1].":". | |
4640 $time[0]; | |
4641 | |
4642 return $time; | |
4643 } | |
4644 | |
4645 # prints debug output with time | |
4646 sub debug { | |
4647 my $text = (@_ ? (join "", @_) : "No message"); | |
4648 my $time = get_time; | |
4649 | |
4650 print $time." - ".$text.($text =~ /\n$/ ? "" : "\n"); | |
4651 } | |
4652 | |
4653 # finds out memory usage | |
4654 sub memory { | |
4655 my @mem; | |
4656 | |
4657 open IN, "ps -o rss,vsz $$ |"; | |
4658 while(<IN>) { | |
4659 next if $_ =~ /rss/i; | |
4660 chomp; | |
4661 @mem = split; | |
4662 } | |
4663 close IN; | |
4664 | |
4665 return \@mem; | |
4666 } | |
4667 | |
4668 sub mem_diff { | |
4669 my $config = shift; | |
4670 my $mem = memory(); | |
4671 my @diffs; | |
4672 | |
4673 if(defined($config->{memory})) { | |
4674 for my $i(0..(scalar @{$config->{memory}} - 1)) { | |
4675 push @diffs, $mem->[$i] - $config->{memory}->[$i]; | |
4676 } | |
4677 } | |
4678 else { | |
4679 @diffs = @$mem; | |
4680 } | |
4681 | |
4682 $config->{memory} = $mem; | |
4683 | |
4684 return \@diffs; | |
4685 } | |
4686 | |
4687 # update or initiate progress bar | |
4688 sub progress { | |
4689 my ($config, $i, $total) = @_; | |
4690 | |
4691 return if defined($config->{quiet}) || defined($config->{no_progress}); | |
4692 | |
4693 my $width = $config->{terminal_width} || 60; | |
4694 my $percent = int(($i/$total) * 100); | |
4695 my $numblobs = int((($i/$total) * $width) - 2); | |
4696 | |
4697 # this ensures we're not writing to the terminal too much | |
4698 return if(defined($config->{prev_prog})) && $numblobs.'-'.$percent eq $config->{prev_prog}; | |
4699 $config->{prev_prog} = $numblobs.'-'.$percent; | |
4700 | |
4701 printf("\r% -${width}s% 1s% 10s", '['.('=' x $numblobs).($numblobs == $width - 2 ? '=' : '>'), ']', "[ " . $percent . "% ]"); | |
4702 } | |
4703 | |
4704 # end progress bar | |
4705 sub end_progress { | |
4706 my $config = shift; | |
4707 return if defined($config->{quiet}) || defined($config->{no_progress}); | |
4708 progress($config, 1,1); | |
4709 print "\n"; | |
4710 delete $config->{prev_prog}; | |
4711 } | |
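
# A minimal usage sketch for the progress helpers (assumes a $config hash as
# set up elsewhere in this module and a hypothetical work list @items):
#   for my $i(0..$#items) {
#     progress($config, $i, scalar @items);
#     # ... process $items[$i] ...
#   }
#   end_progress($config);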
4712 | |
4713 1; |