=head1 LICENSE

Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.

This software is distributed under a modified Apache license.
For license details, please see

  http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.

Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.

=cut

# EnsEMBL module for Bio::EnsEMBL::Variation::Utils::VEP
#
#

=head1 NAME

Bio::EnsEMBL::Variation::Utils::VEP - Methods used by the Variant Effect Predictor

=head1 SYNOPSIS

  use Bio::EnsEMBL::Variation::Utils::VEP qw(configure);

  my $config = configure();

=head1 METHODS

=cut

use strict;
use warnings;

package Bio::EnsEMBL::Variation::Utils::VEP;

# module list
use Getopt::Long;
use FileHandle;
use File::Path qw(make_path);
use Storable qw(nstore_fd fd_retrieve freeze thaw);
use Scalar::Util qw(weaken);
use Digest::MD5 qw(md5_hex);

use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Variation::VariationFeature;
use Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor;
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(MAX_DISTANCE_FROM_TRANSCRIPT overlap);
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
use Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3;
use Bio::EnsEMBL::Variation::StructuralVariationFeature;
use Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor;
use Bio::EnsEMBL::Variation::TranscriptStructuralVariation;

# we need to manually include all these modules for caching to work
use Bio::EnsEMBL::CoordSystem;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Translation;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::ProteinFeature;
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::DBSQL::GeneAdaptor;
use Bio::EnsEMBL::DBSQL::SliceAdaptor;
use Bio::EnsEMBL::DBSQL::TranslationAdaptor;
use Bio::EnsEMBL::DBSQL::TranscriptAdaptor;
use Bio::EnsEMBL::DBSQL::MetaContainer;
use Bio::EnsEMBL::DBSQL::CoordSystemAdaptor;

use Exporter;
use vars qw(@ISA @EXPORT_OK);
@ISA = qw(Exporter);

# open socket pairs for cross-process comms
use Socket;
socketpair(CHILD, PARENT, AF_UNIX, SOCK_STREAM, PF_UNSPEC) or die "ERROR: Failed to open socketpair: $!";
CHILD->autoflush(1);
PARENT->autoflush(1);

@EXPORT_OK = qw(
  &parse_line
  &vf_to_consequences
  &validate_vf
  &read_cache_info
  &dump_adaptor_cache
  &load_dumped_adaptor_cache
  &load_dumped_variation_cache
  &get_all_consequences
  &get_slice
  &build_slice_cache
  &build_full_cache
  &regions_from_hash
  &get_time
  &debug
  &convert_to_vcf
  &progress
  &end_progress
  @REG_FEAT_TYPES
  @OUTPUT_COLS
  @VEP_WEB_CONFIG
  %FILTER_SHORTCUTS
);

our @OUTPUT_COLS = qw(
  Uploaded_variation
  Location
  Allele
  Gene
  Feature
  Feature_type
  Consequence
  cDNA_position
  CDS_position
  Protein_position
  Amino_acids
  Codons
  Existing_variation
  Extra
);

our @REG_FEAT_TYPES = qw(
  RegulatoryFeature
  MotifFeature
);

our @VEP_WEB_CONFIG = qw(
  format
  check_existing
  coding_only
  core_type
  hgnc
  protein
  hgvs
  terms
  check_frequency
  freq_filter
  freq_gt_lt
  freq_freq
  freq_pop
  sift
  polyphen
  regulatory
);

our %FILTER_SHORTCUTS = (
  upstream => {
    '5KB_upstream_variant' => 1,
    '2KB_upstream_variant' => 1,
  },
  downstream => {
    '5KB_downstream_variant' => 1,
    '2KB_downstream_variant' => 1,
    '500B_downstream_variant' => 1,
  },
  utr => {
    '5_prime_UTR_variant' => 1,
    '3_prime_UTR_variant' => 1,
  },
  splice => {
    splice_donor_variant => 1,
    splice_acceptor_variant => 1,
    splice_region_variant => 1,
  },
  coding_change => {
    stop_lost => 1,
    stop_gained => 1,
    missense_variant => 1,
    frameshift_variant => 1,
    inframe_insertion => 1,
    inframe_deletion => 1,
  },
  regulatory => {
    regulatory_region_variant => 1,
    TF_binding_site_variant => 1,
  },
);

# parses a line of input, returns VF object(s)
sub parse_line {
  my $config = shift;
  my $line = shift;

  # find out file format - will only do this on first line
  if(!defined($config->{format}) || (defined($config->{format}) && $config->{format} eq 'guess')) {
    $config->{format} = &detect_format($line);
    debug("Detected format of input file as ", $config->{format}) unless defined($config->{quiet});

    # HGVS and ID formats need DB
    die("ERROR: Can't use ".uc($config->{format})." format in offline mode") if $config->{format} =~ /id|hgvs/ && defined($config->{offline});

    # force certain options if format is VEP output
    if($config->{format} eq 'vep') {
      $config->{no_consequence} = 1;
      delete $config->{regulatory};
      debug("Forcing no consequence calculation") unless defined($config->{quiet});
    }
  }

  # check that format is vcf when using --individual
  die("ERROR: --individual only compatible with VCF input files\n") if defined($config->{individual}) && $config->{format} ne 'vcf';

  my $parse_method = 'parse_'.$config->{format};
  $parse_method =~ s/vep_//;
  my $method_ref = \&$parse_method;

  my $vfs = &$method_ref($config, $line);

  $vfs = add_lrg_mappings($config, $vfs) if defined($config->{lrg});

  return $vfs;
}
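
=head2 parse_line usage

A minimal sketch of how parse_line() is typically driven, assuming a $config
hash already populated by configure() and an open input filehandle; the
variable names below are illustrative only, not part of this module.

  while(my $input_line = <$in_fh>) {
    chomp $input_line;
    next if $input_line =~ /^#/;    # skip header/comment lines
    # returns a listref of VariationFeature or StructuralVariationFeature objects
    my $vfs = parse_line($config, $input_line);
  }

=cut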

# sub-routine to detect format of input
sub detect_format {
  my $line = shift;
  my @data = split /\s+/, $line;

  # HGVS: ENST00000285667.3:c.1047_1048insC
  if (
    scalar @data == 1 &&
    $data[0] =~ /^([^\:]+)\:.*?([cgmrp]?)\.?([\*\-0-9]+.*)$/i
  ) {
    return 'hgvs';
  }

  # variant identifier: rs123456
  elsif (
    scalar @data == 1
  ) {
    return 'id';
  }

  # VCF: 20  14370  rs6054257  G  A  29  0  NS=58;DP=258;AF=0.786;DB;H2  GT:GQ:DP:HQ
  elsif (
    $data[0] =~ /(chr)?\w+/ &&
    $data[1] =~ /^\d+$/ &&
    $data[3] =~ /^[ACGTN-]+$/i &&
    $data[4] =~ /^([\.ACGTN-]+\,?)+$/i
  ) {
    return 'vcf';
  }

  # pileup: chr1  60  T  A
  elsif (
    $data[0] =~ /(chr)?\w+/ &&
    $data[1] =~ /^\d+$/ &&
    $data[2] =~ /^[\*ACGTN-]+$/i &&
    $data[3] =~ /^[\*ACGTNRYSWKM\+\/-]+$/i
  ) {
    return 'pileup';
  }

  # ensembl: 20  14370  14370  A/G  +
  elsif (
    $data[0] =~ /\w+/ &&
    $data[1] =~ /^\d+$/ &&
    $data[2] =~ /^\d+$/ &&
    $data[3] =~ /[ACGTN-]+\/[ACGTN-]+/i
  ) {
    return 'ensembl';
  }

  # vep output: ID  1:142849179  -  -  -  -  INTERGENIC
  elsif (
    $data[0] =~ /\w+/ &&
    $data[1] =~ /^\w+?\:\d+(\-\d+)*$/ &&
    scalar @data == 14
  ) {
    return 'vep';
  }

  else {
    die("ERROR: Could not detect input file format\n");
  }
}
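
=head2 detect_format examples

Example single-line inputs and the format the checks above would assign to
each (fields are whitespace-separated); these calls are illustrative only.

  detect_format('ENST00000285667.3:c.1047_1048insC');          # 'hgvs'
  detect_format('rs699');                                       # 'id'
  detect_format("20\t14370\trs6054257\tG\tA\t29\t0\tNS=58");    # 'vcf'
  detect_format("chr1\t60\tT\tA");                              # 'pileup'
  detect_format("20\t14370\t14370\tA/G\t+");                    # 'ensembl'

=cut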

# parse a line of Ensembl format input into a variation feature object
sub parse_ensembl {
  my $config = shift;
  my $line = shift;

  my ($chr, $start, $end, $allele_string, $strand, $var_name) = split /\s+/, $line;

  my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({
    start => $start,
    end => $end,
    allele_string => $allele_string,
    strand => $strand,
    map_weight => 1,
    adaptor => $config->{vfa},
    variation_name => $var_name,
    chr => $chr,
  });

  return [$vf];
}

# parse a line of VCF input into a variation feature object
sub parse_vcf {
  my $config = shift;
  my $line = shift;

  my @data = split /\s+/, $line;

  # non-variant
  my $non_variant = 0;

  if($data[4] eq '.') {
    if(defined($config->{allow_non_variant})) {
      $non_variant = 1;
    }
    else {
      return [];
    }
  }

  # get relevant data
  my ($chr, $start, $end, $ref, $alt) = ($data[0], $data[1], $data[1], $data[3], $data[4]);

  # some VCF files have a GRCh37 pos defined in GP flag in INFO column
  # if user has requested, we can use that as the position instead
  if(defined $config->{gp}) {
    $chr = undef;
    $start = undef;

    foreach my $pair(split /\;/, $data[7]) {
      my ($key, $value) = split /\=/, $pair;
      if($key eq 'GP') {
        ($chr, $start) = split /\:/, $value;
        $end = $start;
      }
    }

    unless(defined($chr) and defined($start)) {
      warn "No GP flag found in INFO column" unless defined $config->{quiet};
      return [];
    }
  }

  # adjust end coord
  $end += (length($ref) - 1);

  # structural variation
  if((defined($data[7]) && $data[7] =~ /SVTYPE/) || $alt =~ /\<|\[|\]|\>/) {

    # parse INFO field
    my %info = ();

    foreach my $bit(split /\;/, $data[7]) {
      my ($key, $value) = split /\=/, $bit;
      $info{$key} = $value;
    }

    # like indels, SVs have the base before included for reference
    $start++;

    # work out the end coord
    if(defined($info{END})) {
      $end = $info{END};
    }
    elsif(defined($info{SVLEN})) {
      $end = $start + abs($info{SVLEN}) - 1;
    }

    # check for imprecise breakpoints
    my ($min_start, $max_start, $min_end, $max_end);

    if(defined($info{CIPOS})) {
      my ($low, $high) = split /\,/, $info{CIPOS};
      $min_start = $start + $low;
      $max_start = $start + $high;
    }

    if(defined($info{CIEND})) {
      my ($low, $high) = split /\,/, $info{CIEND};
      $min_end = $end + $low;
      $max_end = $end + $high;
    }

    # get type
    my $type = $info{SVTYPE};
    my $so_term;

    if(defined($type)) {
      # convert to SO term
      my %terms = (
        INS => 'insertion',
        DEL => 'deletion',
        TDUP => 'tandem_duplication',
        DUP => 'duplication'
      );

      $so_term = defined $terms{$type} ? $terms{$type} : $type;
    }

    my $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new_fast({
      start => $start,
      inner_start => $max_start,
      outer_start => $min_start,
      end => $end,
      inner_end => $min_end,
      outer_end => $max_end,
      strand => 1,
      adaptor => $config->{svfa},
      variation_name => $data[2] eq '.' ? undef : $data[2],
      chr => $chr,
      class_SO_term => $so_term,
    });

    return [$svf];
  }

  # normal variation
  else {
    # find out if any of the alt alleles make this an insertion or a deletion
    my ($is_indel, $is_sub, $ins_count, $total_count);
    foreach my $alt_allele(split /\,/, $alt) {
      $is_indel = 1 if $alt_allele =~ /D|I/;
      $is_indel = 1 if length($alt_allele) != length($ref);
      $is_sub = 1 if length($alt_allele) == length($ref);
      $ins_count++ if length($alt_allele) > length($ref);
      $total_count++;
    }

    # multiple alt alleles?
    if($alt =~ /\,/) {
      if($is_indel) {
        my @alts;

        if($alt =~ /D|I/) {
          foreach my $alt_allele(split /\,/, $alt) {
            # deletion (VCF <4)
            if($alt_allele =~ /D/) {
              push @alts, '-';
            }

            elsif($alt_allele =~ /I/) {
              $alt_allele =~ s/^I//g;
              push @alts, $alt_allele;
            }
          }
        }

        else {
          $ref = substr($ref, 1) || '-';
          $start++;

          foreach my $alt_allele(split /\,/, $alt) {
            $alt_allele = substr($alt_allele, 1);
            $alt_allele = '-' if $alt_allele eq '';
            push @alts, $alt_allele;
          }
        }

        $alt = join "/", @alts;
      }

      else {
        # for substitutions we just need to replace ',' with '/' in $alt
        $alt =~ s/\,/\//g;
      }
    }

    elsif($is_indel) {
      # deletion (VCF <4)
      if($alt =~ /D/) {
        my $num_deleted = $alt;
        $num_deleted =~ s/\D+//g;
        $end += $num_deleted - 1;
        $alt = "-";
        $ref .= ("N" x ($num_deleted - 1)) unless length($ref) > 1;
      }

      # insertion (VCF <4)
      elsif($alt =~ /I/) {
        $ref = '-';
        $alt =~ s/^I//g;
        $start++;
      }

      # insertion or deletion (VCF 4+)
      elsif(substr($ref, 0, 1) eq substr($alt, 0, 1)) {

        # chop off first base
        $ref = substr($ref, 1) || '-';
        $alt = substr($alt, 1) || '-';

        $start++;
      }
    }

    # create VF object
    my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({
      start => $start,
      end => $end,
      allele_string => $non_variant ? $ref : $ref.'/'.$alt,
      strand => 1,
      map_weight => 1,
      adaptor => $config->{vfa},
      variation_name => $data[2] eq '.' ? undef : $data[2],
      chr => $chr,
    });

    # flag as non-variant
    $vf->{non_variant} = 1 if $non_variant;

    # individuals?
    if(defined($config->{individual})) {
      my @alleles = split /\//, $ref.'/'.$alt;

      my @return;

      foreach my $ind(keys %{$config->{ind_cols}}) {

        # get alleles present in this individual
        my @bits;
        my $gt = (split /\:/, $data[$config->{ind_cols}->{$ind}])[0];

        my $phased = ($gt =~ /\|/ ? 1 : 0);

        foreach my $bit(split /\||\/|\\/, $gt) {
          push @bits, $alleles[$bit] unless $bit eq '.';
        }

        # shallow copy VF
        my $vf_copy = { %$vf };
        bless $vf_copy, ref($vf);

        # get non-refs
        my %non_ref = map {$_ => 1} grep {$_ ne $ref} @bits;

        # construct allele_string
        if(scalar keys %non_ref) {
          $vf_copy->{allele_string} = $ref."/".(join "/", keys %non_ref);
        }
        else {
          $vf_copy->{allele_string} = $ref;
          $vf_copy->{non_variant} = 1;
        }

        # store phasing info
        $vf_copy->{phased} = defined($config->{phased}) ? 1 : $phased;

        # store GT
        $vf_copy->{genotype} = \@bits;

        # store individual name
        $vf_copy->{individual} = $ind;

        push @return, $vf_copy;
      }

      return \@return;
    }
    else {
      return [$vf];
    }
  }
}
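
=head2 parse_vcf coordinate handling

A worked example of the indel handling above for VCF 4.x input, where REF and
ALT share their leading (padding) base: the shared base is stripped and the
start coordinate is bumped by one. Coordinates and alleles shown are
illustrative.

  # input:  20  1234567  .  GTC  G    (deletion of TC)
  # result: start 1234568, end 1234569, allele_string 'TC/-'

  # input:  20  1234567  .  G  GTC    (insertion of TC)
  # result: start 1234568, end 1234567, allele_string '-/TC'

=cut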

# parse a line of pileup input into variation feature objects
sub parse_pileup {
  my $config = shift;
  my $line = shift;

  my @data = split /\s+/, $line;

  # pileup can produce more than one VF per line
  my @return;

  # normal variant
  if($data[2] ne "*"){
    my $var;

    if($data[3] =~ /^[A|C|G|T]$/) {
      $var = $data[3];
    }
    else {
      ($var = unambiguity_code($data[3])) =~ s/$data[2]//ig;
    }

    for my $alt(split //, $var){
      push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({
        start => $data[1],
        end => $data[1],
        allele_string => $data[2].'/'.$alt,
        strand => 1,
        map_weight => 1,
        adaptor => $config->{vfa},
        chr => $data[0],
      });
    }
  }

  # in/del
  else {
    my %tmp_hash = map {$_ => 1} split /\//, $data[3];
    my @genotype = keys %tmp_hash;

    foreach my $allele(@genotype){
      if(substr($allele,0,1) eq "+") { #ins
        push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({
          start => $data[1] + 1,
          end => $data[1],
          allele_string => '-/'.substr($allele, 1),
          strand => 1,
          map_weight => 1,
          adaptor => $config->{vfa},
          chr => $data[0],
        });
      }
      elsif(substr($allele,0,1) eq "-"){ #del
        push @return, Bio::EnsEMBL::Variation::VariationFeature->new_fast({
          start => $data[1] + 1,
          end => $data[1] + length(substr($allele, 1)),
          allele_string => substr($allele, 1).'/-',
          strand => 1,
          map_weight => 1,
          adaptor => $config->{vfa},
          chr => $data[0],
        });
      }
      elsif($allele ne "*"){
        warn("WARNING: invalid pileup indel genotype: $line\n") unless defined $config->{quiet};
      }
    }
  }

  return \@return;
}

# parse a line of HGVS input into a variation feature object
sub parse_hgvs {
  my $config = shift;
  my $line = shift;

  my $vf;

  # not all hgvs notations are supported yet, so we have to wrap it in an eval
  eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{sa}, $config->{ta}) };

  if((!defined($vf) || (defined $@ && length($@) > 1)) && defined($config->{coordinator})) {
    eval { $vf = $config->{vfa}->fetch_by_hgvs_notation($line, $config->{ofsa}, $config->{ofta}) };
  }

  if(!defined($vf) || (defined $@ && length($@) > 1)) {
    warn("WARNING: Unable to parse HGVS notation \'$line\'\n$@") unless defined $config->{quiet};
    return [];
  }

  # get whole chromosome slice
  my $slice = $vf->slice->adaptor->fetch_by_region($vf->slice->coord_system->name, $vf->slice->seq_region_name);

  $vf = $vf->transfer($slice);

  # name it after the HGVS
  $vf->{variation_name} = $line;

  # add chr attrib
  $vf->{chr} = $vf->slice->seq_region_name;

  return [$vf];
}

# parse a variation identifier e.g. a dbSNP rsID
sub parse_id {
  my $config = shift;
  my $line = shift;

  my $v_obj = $config->{va}->fetch_by_name($line);

  return [] unless defined $v_obj;

  my @vfs = @{$v_obj->get_all_VariationFeatures};
  delete $_->{dbID} for @vfs;
  delete $_->{overlap_consequences} for @vfs;
  $_->{chr} = $_->seq_region_name for @vfs;

  return \@vfs;
}

# parse a line of VEP output
sub parse_vep {
  my $config = shift;
  my $line = shift;

  my @data = split /\t/, $line;

  my ($chr, $start, $end) = split /\:|\-/, $data[1];
  $end ||= $start;

  # might get allele string from ID
  my $allele_string;

  if($data[0] =~ /^\w\_\w\_\w$/) {
    my @split = split /\_/, $data[0];
    $allele_string = $split[-1] if $split[-1] =~ /[ACGTN-]+\/[ACGTN-]+/;
  }

  $allele_string ||= 'N/'.($data[6] =~ /intergenic/ ? 'N' : $data[2]);

  my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({
    start => $start,
    end => $end,
    allele_string => $allele_string,
    strand => 1,
    map_weight => 1,
    adaptor => $config->{vfa},
    chr => $chr,
    variation_name => $data[0],
  });

  return [$vf];
}



# converts to VCF format
sub convert_to_vcf {
  my $config = shift;
  my $vf = shift;

  # look for imbalance in the allele string
  my %allele_lengths;
  my @alleles = split /\//, $vf->allele_string;

  foreach my $allele(@alleles) {
    $allele =~ s/\-//g;
    $allele_lengths{length($allele)} = 1;
  }

  # in/del/unbalanced
  if(scalar keys %allele_lengths > 1) {

    # we need the ref base before the variation
    # default to N in case we can't get it
    my $prev_base = 'N';

    unless(defined($config->{cache})) {
      my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1);
      $prev_base = $slice->seq if defined($slice);
    }

    for my $i(0..$#alleles) {
      $alleles[$i] =~ s/\-//g;
      $alleles[$i] = $prev_base.$alleles[$i];
    }

    return [
      $vf->{chr} || $vf->seq_region_name,
      $vf->start - 1,
      $vf->variation_name,
      shift @alleles,
      (join ",", @alleles),
      '.', '.', '.'
    ];

  }

  # balanced sub
  else {
    return [
      $vf->{chr} || $vf->seq_region_name,
      $vf->start,
      $vf->variation_name,
      shift @alleles,
      (join ",", @alleles),
      '.', '.', '.'
    ];
  }
}
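
=head2 convert_to_vcf example

A sketch of the unbalanced-allele branch above, assuming database access is
available so the preceding reference base can be fetched (with a cache-only
run it falls back to 'N'). Taking the insertion from the parse_vcf example:

  # VF: chr 20, start 1234568, end 1234567, allele_string '-/TC'
  # if the reference base at position 1234567 is 'G', the returned arrayref is
  #   [ '20', 1234567, $name, 'G', 'GTC', '.', '.', '.' ]

=cut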


# tries to map a VF to the LRG coordinate system
sub add_lrg_mappings {
  my $config = shift;
  my $vfs = shift;

  my @new_vfs;

  foreach my $vf(@$vfs) {

    # add the unmapped VF to the array
    push @new_vfs, $vf;

    # make sure the VF has an attached slice
    $vf->{slice} ||= get_slice($config, $vf->{chr});
    next unless defined($vf->{slice});

    # transform LRG <-> chromosome
    my $new_vf = $vf->transform($vf->{slice}->coord_system->name eq 'lrg' ? 'chromosome' : 'lrg');

    # add it to the array if transformation worked
    if(defined($new_vf)) {

      # update new VF's chr entry
      $new_vf->{chr} = $new_vf->seq_region_name;
      push @new_vfs, $new_vf;
    }
  }

  return \@new_vfs;
}


# wrapper for whole_genome_fetch and vf_to_consequences
# takes config and a listref of VFs, returns listref of line hashes for printing
sub get_all_consequences {
  my $config = shift;
  my $listref = shift;

  if ($config->{extra}) {
    eval "use Plugin qw($config);"
  }

  # initialize caches
  $config->{$_.'_cache'} ||= {} for qw(tr rf slice);

  # build hash
  my %vf_hash;
  push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref;

  my @non_variant = grep {defined($_->{non_variant})} @$listref;
  debug("Skipping ".(scalar @non_variant)." non-variant loci\n") unless defined($config->{quiet});

  # get regions
  my $regions = &regions_from_hash($config, \%vf_hash);
  my $trim_regions = $regions;

  # get trimmed regions - allows us to discard out-of-range transcripts
  # when using cache
  #if(defined($config->{cache})) {
  #  my $tmp = $config->{cache};
  #  delete $config->{cache};
  #  $trim_regions = regions_from_hash($config, \%vf_hash);
  #  $config->{cache} = $tmp;
  #}

  # prune caches
  prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr});
  prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf});

  # get chr list
  my %chrs = map {$_->{chr} => 1} @$listref;

  my $fetched_tr_count = 0;
  $fetched_tr_count = fetch_transcripts($config, $regions, $trim_regions)
    unless defined($config->{no_consequences});

  my $fetched_rf_count = 0;
  $fetched_rf_count = fetch_regfeats($config, $regions, $trim_regions)
    if defined($config->{regulatory})
    && !defined($config->{no_consequences});

  # check we can use MIME::Base64
  if(defined($config->{fork})) {
    eval q{ use MIME::Base64; };

    if($@) {
      debug("WARNING: Unable to load MIME::Base64, forking disabled") unless defined($config->{quiet});
      delete $config->{fork};
    }
  }

  my (@temp_array, @return, @pids);
  if(defined($config->{fork})) {
    my $size = scalar @$listref;

    while(my $tmp_vf = shift @$listref) {
      push @temp_array, $tmp_vf;

      # fork
      if(scalar @temp_array >= ($size / $config->{fork}) || scalar @$listref == 0) {

        my $pid = fork;

        if(!defined($pid)) {
          debug("WARNING: Failed to fork - will attempt to continue without forking") unless defined($config->{quiet});
          push @temp_array, @$listref;
          push @return, @{vf_list_to_cons($config, \@temp_array, $regions)};
          last;
        }
        elsif($pid) {
          push @pids, $pid;
          @temp_array = ();
        }
        elsif($pid == 0) {

          $config->{forked} = $$;
          $config->{quiet} = 1;

          # redirect STDERR to PARENT so we can catch errors
          *STDERR = *PARENT;

          my $cons = vf_list_to_cons($config, \@temp_array, $regions);

          # what we're doing here is sending a serialised hash of the
          # results through to the parent process through the socket.
          # This is then thawed by the parent process.
          # $$, or the PID, is added so that the input can be sorted
          # back into the correct order for output
          print PARENT $$." ".encode_base64(freeze($_), "\t")."\n" for @$cons;

          # some plugins may cache stuff, check for this and try and
          # reconstitute it into parent's plugin cache
          foreach my $plugin(@{$config->{plugins}}) {

            next unless defined($plugin->{has_cache});

            # delete unnecessary stuff and stuff that can't be serialised
            delete $plugin->{$_} for qw(config feature_types variant_feature_types version feature_types_wanted variant_feature_types_wanted params);
            print PARENT $$." PLUGIN ".ref($plugin)." ".encode_base64(freeze($plugin), "\t")."\n";
          }

          # we need to tell the parent this child is finished
          # otherwise it keeps listening
          print PARENT "DONE $$\n";
          close PARENT;

          exit(0);
        }
      }
    }

    debug("Calculating consequences") unless defined($config->{quiet});

    my $fh = $config->{out_file_handle};
    my $done_processes = 0;
    my $done_vars = 0;
    my $total_size = $size;
    my $pruned_count;

    # create a hash to store the returned data by PID
    # this means we can sort it correctly on output
    my %by_pid;

    # read child input
    while(<CHILD>) {

      # child finished
      if(/^DONE/) {
        $done_processes++;
        last if $done_processes == scalar @pids;
      }

      # variant finished / progress indicator
      elsif(/^BUMP/) {
        progress($config, ++$done_vars, $total_size);# if $pruned_count == scalar @pids;
      }

      # output
      elsif(/^\-?\d+ /) {

        # plugin
        if(/^\-?\d+ PLUGIN/) {

          m/^(\-?\d+) PLUGIN (\w+) /;
          my ($pid, $plugin) = ($1, $2);

          # remove the PID
          s/^\-?\d+ PLUGIN \w+ //;
          chomp;

          my $tmp = thaw(decode_base64($_));

          next unless defined($plugin);

          # copy data to parent plugin
          my ($parent_plugin) = grep {ref($_) eq $plugin} @{$config->{plugins}};

          next unless defined($parent_plugin);

          merge_hashes($parent_plugin, $tmp);
        }

        else {
          # grab the PID
          m/^(\-?\d+)\s/;
          my $pid = $1;
          die "ERROR: Could not parse forked PID from line $_" unless defined($pid);

          # remove the PID
          s/^\-?\d+\s//;
          chomp;

          # decode and thaw "output" from forked process
          push @{$by_pid{$pid}}, thaw(decode_base64($_));
        }
      }

      elsif(/^PRUNED/) {
        s/PRUNED //g;
        chomp;
        $pruned_count++;
        $total_size += $_;
      }

      elsif(/^DEBUG/) {
        print STDERR;
      }

      # something's wrong
      else {
        # kill the other pids
        kill(15, $_) for @pids;
        die("\nERROR: Forked process failed\n$_\n");
      }
    }

    end_progress($config);

    debug("Writing output") unless defined($config->{quiet});

    waitpid($_, 0) for @pids;

    # add the sorted data to the return array
    push @return, @{$by_pid{$_} || []} for @pids;
  }

  # no forking
  else {
    push @return, @{vf_list_to_cons($config, $listref, $regions)};
  }

  if(defined($config->{debug})) {
    eval q{use Devel::Size qw(total_size)};
    my $mem = memory();
    my $tot;
    $tot += $_ for @$mem;

    if($tot > 1000000) {
      $tot = sprintf("%.2fGB", $tot / (1024 * 1024));
    }

    elsif($tot > 1000) {
      $tot = sprintf("%.2fMB", $tot / 1024);
    }

    my $mem_diff = mem_diff($config);
    debug(
      "LINES ", $config->{line_number},
      "\tMEMORY $tot ", (join " ", @$mem),
      "\tDIFF ", (join " ", @$mem_diff),
      "\tCACHE ", total_size($config->{tr_cache}).
      "\tRF ", total_size($config->{rf_cache}),
      "\tVF ", total_size(\%vf_hash),
    );
    #exit(0) if grep {$_ < 0} @$mem_diff;
  }

  return \@return;
}

sub vf_list_to_cons {
  my $config = shift;
  my $listref = shift;
  my $regions = shift;

  # get non-variants
  my @non_variants = grep {$_->{non_variant}} @$listref;

  my %vf_hash = ();
  push @{$vf_hash{$_->{chr}}{int($_->{start} / $config->{chunk_size})}{$_->{start}}}, $_ for grep {!defined($_->{non_variant})} @$listref;

  # check existing VFs
  &check_existing_hash($config, \%vf_hash) if defined($config->{check_existing});# && scalar @$listref > 10;

  # get overlapping SVs
  &check_svs_hash($config, \%vf_hash) if defined($config->{check_svs});

  # if we are forked, we can trim off some stuff
  if(defined($config->{forked})) {
    my $tmp = $config->{cache};
    delete $config->{cache};
    $regions = regions_from_hash($config, \%vf_hash);
    $config->{cache} = $tmp;

    # prune caches
    my $new_count = 0;

    $new_count += prune_cache($config, $config->{tr_cache}, $regions, $config->{loaded_tr});
    $new_count += prune_cache($config, $config->{rf_cache}, $regions, $config->{loaded_rf});

    print PARENT "PRUNED $new_count\n";
  }

  my @return;

  foreach my $chr(sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %vf_hash) {
    my $finished_vfs = whole_genome_fetch($config, $chr, \%vf_hash);

    # non-variants?
    if(scalar @non_variants) {
      push @$finished_vfs, grep {$_->{chr} eq $chr} @non_variants;

      # need to re-sort
      @$finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @$finished_vfs;
    }

    debug("Calculating consequences") unless defined($config->{quiet});

    my $vf_count = scalar @$finished_vfs;
    my $vf_counter = 0;

    while(my $vf = shift @$finished_vfs) {
      progress($config, $vf_counter++, $vf_count) unless $vf_count == 1;

      my $filter_ok = 1;

      # filtered output
      if(defined($config->{filter})) {
        $filter_ok = filter_by_consequence($config, $vf);
        $config->{filter_count} += $filter_ok;
      }

      # skip filtered lines
      next unless $filter_ok;

      # original output
      if(defined($config->{original})) {
        push @return, $vf->{_line};
      }

      # GVF output
      elsif(defined($config->{gvf})) {
        $vf->source("User");

        $config->{gvf_id} ||= 1;

        # get custom annotation
        my $custom_annotation = defined($config->{custom}) ? get_custom_annotation($config, $vf) : {};
        $custom_annotation->{ID} = $config->{gvf_id}++;


        my $tmp = $vf->to_gvf(
          include_consequences => defined($config->{no_consequences}) ? 0 : 1,
          extra_attrs => $custom_annotation,
        );
        push @return, \$tmp;
      }

      # VCF output
      elsif(defined($config->{vcf})) {

        # convert to VCF, otherwise get line
        my $line = $config->{format} eq 'vcf' ? [split /\s+/, $vf->{_line}] : convert_to_vcf($config, $vf);

        if(!defined($line->[7]) || $line->[7] eq '.') {
          $line->[7] = '';
        }

        # get all the lines the normal way
        # and process them into VCF-compatible string
        my $string = 'CSQ=';

        foreach my $line(@{vf_to_consequences($config, $vf)}) {

          # use the field list (can be user-defined by setting --fields)
          for my $col(@{$config->{fields}}) {

            # skip fields already represented in the VCF
            next if $col eq 'Uploaded_variation' or $col eq 'Location' or $col eq 'Extra';

            # search for data in main line hash as well as extra field
            my $data = defined $line->{$col} ? $line->{$col} : $line->{Extra}->{$col};

            # "-" means null for everything except the Allele field (confusing...)
            $data = undef if defined($data) and $data eq '-' and $col ne 'Allele';
            $data =~ s/\,/\&/g if defined $data;
            $string .= defined($data) ? $data : '';
            $string .= '|';
          }

          $string =~ s/\|$//;
          $string .= ',';
        }

        $string =~ s/\,$//;

        if(!defined($config->{no_consequences}) && $string ne 'CSQ=') {
          $line->[7] .= ($line->[7] ? ';' : '').$string;
        }

        # get custom annotation
        if(defined($config->{custom}) && scalar @{$config->{custom}}) {
          my $custom_annotation = get_custom_annotation($config, $vf);
          foreach my $key(keys %{$custom_annotation}) {
            $line->[7] .= ($line->[7] ? ';' : '').$key.'='.$custom_annotation->{$key};
          }
        }

        my $tmp = join "\t", @$line;
        push @return, \$tmp;
      }

      # no consequence output from vep input
      elsif(defined($config->{no_consequences}) && $config->{format} eq 'vep') {

        my $line = [split /\s+/, $vf->{_line}];

        if($line->[13] eq '-') {
          $line->[13] = '';
        }

        # get custom annotation
        if(defined($config->{custom})) {
          my $custom_annotation = get_custom_annotation($config, $vf);
          foreach my $key(keys %{$custom_annotation}) {
            $line->[13] .= ($line->[13] ? ';' : '').$key.'='.$custom_annotation->{$key};
          }
        }

        my $tmp = join "\t", @$line;
        push @return, \$tmp;
      }

      # normal output
      else {
        push @return, @{vf_to_consequences($config, $vf)};
      }

      print PARENT "BUMP\n" if defined($config->{forked});
    }

    end_progress($config) unless scalar @$listref == 1;
  }

  return \@return;
}

# takes a variation feature and returns ready to print consequence information
sub vf_to_consequences {
  my $config = shift;
  my $vf = shift;

  # use a different method for SVs
  return svf_to_consequences($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature');

  my @return = ();

  # method name for consequence terms
  my $term_method = $config->{terms}.'_term';

  # find any co-located existing VFs
  $vf->{existing} ||= find_existing($config, $vf) if defined $config->{check_existing};

  # skip based on frequency checks?
  if(defined($config->{check_frequency}) && defined($vf->{existing}) && $vf->{existing} ne '-' && defined($config->{va})) {
    return [] unless grep {$_} map {check_frequencies($config, $_)} reverse split(/\,/, $vf->{existing});
    $vf->{freqs} = $config->{filtered_freqs};
  }

  # force empty hash into object's transcript_variations if undefined from whole_genome_fetch
  # this will stop the API trying to go off and fill it again
  $vf->{transcript_variations} ||= {} if defined $config->{whole_genome};

  # regulatory stuff
  if(!defined $config->{coding_only} && defined $config->{regulatory}) {

    for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) {

      my $rf = $rfv->regulatory_feature;

      my $base_line = {
        Feature_type => 'RegulatoryFeature',
        Feature => $rf->stable_id,
      };

      if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
        $base_line->{Extra}->{CELL_TYPE} = join ",",
          map {$_.':'.$rf->{cell_types}->{$_}}
          grep {$rf->{cell_types}->{$_}}
          @{$config->{cell_type}};

        $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g;
      }

      # this currently always returns 'RegulatoryFeature', so we ignore it for now
      #$base_line->{Extra}->{REG_FEAT_TYPE} = $rf->feature_type->name;

      for my $rfva (@{ $rfv->get_all_alternate_RegulatoryFeatureVariationAlleles }) {

        my $line = init_line($config, $vf, $base_line);

        $line->{Allele} = $rfva->variation_feature_seq;
        $line->{Consequence} = join ',',
          map { $_->$term_method || $_->SO_term }
          @{ $rfva->get_all_OverlapConsequences };

        $line = run_plugins($rfva, $line, $config);

        push @return, $line;
      }
    }

    for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) {

      my $mf = $mfv->motif_feature;

      # check that the motif has a binding matrix, if not there's not
      # much we can do so don't return anything

      next unless defined $mf->binding_matrix;

      my $matrix = $mf->binding_matrix->description.' '.$mf->display_label;
      $matrix =~ s/\s+/\_/g;

      my $base_line = {
        Feature_type => 'MotifFeature',
        Feature => $mf->binding_matrix->name,
        Extra => {
          MOTIF_NAME => $matrix,
        }
      };

      if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
        $base_line->{Extra}->{CELL_TYPE} = join ",",
          map {$_.':'.$mf->{cell_types}->{$_}}
          grep {$mf->{cell_types}->{$_}}
          @{$config->{cell_type}};

        $base_line->{Extra}->{CELL_TYPE} =~ s/\s+/\_/g;
      }

      for my $mfva (@{ $mfv->get_all_alternate_MotifFeatureVariationAlleles }) {

        my $line = init_line($config, $vf, $base_line);

        $line->{Extra}->{MOTIF_POS} = $mfva->motif_start if defined $mfva->motif_start;
        $line->{Extra}->{HIGH_INF_POS} = ($mfva->in_informative_position ? 'Y' : 'N');

        my $delta = $mfva->motif_score_delta if $mfva->variation_feature_seq =~ /^[ACGT]+$/;

        $line->{Extra}->{MOTIF_SCORE_CHANGE} = sprintf("%.3f", $delta) if defined $delta;

        $line->{Allele} = $mfva->variation_feature_seq;
        $line->{Consequence} = join ',',
          map { $_->$term_method || $_->SO_term }
          @{ $mfva->get_all_OverlapConsequences };

        $line = run_plugins($mfva, $line, $config);

        push @return, $line;
      }
    }
  }

  # get TVs
  my $tvs = $vf->get_all_TranscriptVariations;

  # only most severe
  if(defined($config->{most_severe}) || defined($config->{summary})) {

    my $line = init_line($config, $vf);

    if(defined($config->{summary})) {
      $line->{Consequence} = join ",", @{$vf->consequence_type($config->{terms}) || $vf->consequence_type};
    }
    else {
      $line->{Consequence} = $vf->display_consequence($config->{terms}) || $vf->display_consequence;
    }

    push @return, $line;
  }

  # pass a true argument to get_IntergenicVariation to stop it doing a reference allele check
  # (to stay consistent with the rest of the VEP)
  elsif ((my $iv = $vf->get_IntergenicVariation(1)) && !defined($config->{no_intergenic})) {

    for my $iva (@{ $iv->get_all_alternate_IntergenicVariationAlleles }) {

      my $line = init_line($config, $vf);

      $line->{Allele} = $iva->variation_feature_seq;

      my $cons = $iva->get_all_OverlapConsequences->[0];

      $line->{Consequence} = $cons->$term_method || $cons->SO_term;

      $line = run_plugins($iva, $line, $config);

      push @return, $line;
    }
  }

  # user wants only one consequence per gene
  elsif(defined($config->{per_gene})) {

    # sort the TVA objects into a hash by gene
    my %by_gene;

    foreach my $tv(@$tvs) {
      next if(defined $config->{coding_only} && !($tv->affects_cds));

      my $gene = $tv->transcript->{_gene_stable_id} || $config->{ga}->fetch_by_transcript_stable_id($tv->transcript->stable_id)->stable_id;

      push @{$by_gene{$gene}}, @{$tv->get_all_alternate_TranscriptVariationAlleles};
    }

    foreach my $gene(keys %by_gene) {
      my ($lowest, $lowest_tva);

      # at the moment this means the one that comes out last will be picked
      # if there is more than one TVA with the same rank of consequence
      foreach my $tva(@{$by_gene{$gene}}) {
        foreach my $oc(@{$tva->get_all_OverlapConsequences}) {
          if(!defined($lowest) || $oc->rank < $lowest) {
            $lowest = $oc->rank;
            $lowest_tva = $tva;
          }
        }
      }

      push @return, tva_to_line($config, $lowest_tva);
    }
  }

  else {
    foreach my $tv(@$tvs) {
      next if(defined $config->{coding_only} && !($tv->affects_cds));

      push @return, map {tva_to_line($config, $_)} @{$tv->get_all_alternate_TranscriptVariationAlleles};

      undef $tv->{$_} for keys %$tv;
    }
  }

  return \@return;
}

# get consequences for a structural variation feature
sub svf_to_consequences {
  my $config = shift;
  my $svf = shift;

  my @return = ();

  my $term_method = $config->{terms}.'_term';

  if(defined $config->{whole_genome}) {
    $svf->{transcript_structural_variations} ||= [];
    $svf->{regulation_structural_variations}->{$_} ||= [] for @REG_FEAT_TYPES;
  }

  if ((my $iv = $svf->get_IntergenicStructuralVariation(1)) && !defined($config->{no_intergenic})) {

    for my $iva (@{ $iv->get_all_alternate_IntergenicStructuralVariationAlleles }) {

      my $line = init_line($config, $svf);

      $line->{Allele} = '-';

      my $cons = $iva->get_all_OverlapConsequences->[0];

      $line->{Consequence} = $cons->$term_method || $cons->SO_term;

      $line = run_plugins($iva, $line, $config);

      push @return, $line;
    }
  }

  foreach my $svo(@{$svf->get_all_StructuralVariationOverlaps}) {

    next if $svo->isa('Bio::EnsEMBL::Variation::IntergenicStructuralVariation');

    my $feature = $svo->feature;

    # get feature type
    my $feature_type = (split '::', ref($feature))[-1];

    my $base_line = {
      Feature_type => $feature_type,
      Feature => $feature->stable_id,
      Allele => $svf->class_SO_term,
    };

    if($svo->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariation')) {
      $base_line->{cDNA_position} = format_coords($svo->cdna_start, $svo->cdna_end);
      $base_line->{CDS_position} = format_coords($svo->cds_start, $svo->cds_end);
      $base_line->{Protein_position} = format_coords($svo->translation_start, $svo->translation_end);
    }

    foreach my $svoa(@{$svo->get_all_StructuralVariationOverlapAlleles}) {
      my $line = init_line($config, $svf, $base_line);

      $line->{Consequence} = join ",",
        #map {s/feature/$feature_type/e; $_}
        map {$_->$term_method}
        sort {$a->rank <=> $b->rank}
        @{$svoa->get_all_OverlapConsequences};

      # work out overlap amounts
      my $overlap_start = (sort {$a <=> $b} ($svf->start, $feature->start))[-1];
      my $overlap_end = (sort {$a <=> $b} ($svf->end, $feature->end))[0];
      my $overlap_length = ($overlap_end - $overlap_start) + 1;
      my $overlap_pc = 100 * ($overlap_length / (($feature->end - $feature->start) + 1));

      $line->{Extra}->{OverlapBP} = $overlap_length if $overlap_length > 0;
      $line->{Extra}->{OverlapPC} = sprintf("%.2f", $overlap_pc) if $overlap_pc > 0;

      add_extra_fields($config, $line, $svoa);

      push @return, $line;
    }
  }

  return \@return;
}
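
=head2 Overlap calculation example

The OverlapBP/OverlapPC values computed above are simple interval arithmetic;
for instance, for a structural variant spanning 1,000-2,000 bp overlapping a
feature spanning 1,500-3,000 bp (positions are illustrative):

  # overlap_start = max(1000, 1500) = 1500
  # overlap_end   = min(2000, 3000) = 2000
  # OverlapBP     = 2000 - 1500 + 1 = 501
  # OverlapPC     = 100 * 501 / (3000 - 1500 + 1) = 33.38   (% of the feature covered)

=cut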

# run all of the configured plugins on a VariationFeatureOverlapAllele instance
# and store any results in the provided line hash
sub run_plugins {

  my ($bvfoa, $line_hash, $config) = @_;

  my $skip_line = 0;

  for my $plugin (@{ $config->{plugins} || [] }) {

    # check that this plugin is interested in this type of variation feature

    if ($plugin->check_variant_feature_type(ref $bvfoa->base_variation_feature)) {

      # check that this plugin is interested in this type of feature

      if ($plugin->check_feature_type(ref $bvfoa->feature || 'Intergenic')) {

        eval {
          my $plugin_results = $plugin->run($bvfoa, $line_hash);

          if (defined $plugin_results) {
            if (ref $plugin_results eq 'HASH') {
              for my $key (keys %$plugin_results) {
                $line_hash->{Extra}->{$key} = $plugin_results->{$key};
              }
            }
            else {
              warn "Plugin '".(ref $plugin)."' did not return a hashref, output ignored!\n";
            }
          }
          else {
            # if a plugin returns undef, that means it wants to filter out this line
            $skip_line = 1;
          }
        };
        if ($@) {
          warn "Plugin '".(ref $plugin)."' went wrong: $@";
        }

        # there's no point running any other plugins if we're filtering this line,
        # because the first plugin to skip the line wins, so we might as well last
        # out of the loop now and avoid any unnecessary computation

        last if $skip_line;
      }
    }
  }

  return $skip_line ? undef : $line_hash;
}
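
=head2 Plugin skeleton

run_plugins() expects each configured plugin to respond to
check_variant_feature_type(), check_feature_type() and run(), with run()
returning a hashref of key/value pairs destined for the Extra column (or undef
to filter the line out). A minimal, hypothetical plugin might look like the
sketch below; the class name is an assumption, and the base class shown is the
standard VEP plugin parent rather than part of this module.

  package MyPlugin;
  use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);

  sub feature_types { return ['Transcript']; }

  sub run {
    my ($self, $tva, $line_hash) = @_;
    # for example, report the strand of the affected transcript
    return { MY_STRAND => $tva->transcript->strand };
  }

  1;

=cut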

# turn a TranscriptVariationAllele into a line hash
sub tva_to_line {
  my $config = shift;
  my $tva = shift;

  my $tv = $tva->transcript_variation;
  my $t = $tv->transcript;

  # method name for consequence terms
  my $term_method = $config->{terms}.'_term';

  my $base_line = {
    Feature_type => 'Transcript',
    Feature => (defined $t ? $t->stable_id : undef),
    cDNA_position => format_coords($tv->cdna_start, $tv->cdna_end),
    CDS_position => format_coords($tv->cds_start, $tv->cds_end),
    Protein_position => format_coords($tv->translation_start, $tv->translation_end),
    Allele => $tva->variation_feature_seq,
    Amino_acids => $tva->pep_allele_string,
    Codons => $tva->display_codon_allele_string,
    Consequence => join ",", map {$_->$term_method || $_->SO_term} sort {$a->rank <=> $b->rank} @{$tva->get_all_OverlapConsequences},
  };

  my $line = init_line($config, $tva->variation_feature, $base_line);

  # HGVS
  if(defined $config->{hgvs}) {
    my $hgvs_t = $tva->hgvs_transcript;
    my $hgvs_p = $tva->hgvs_protein;

    $line->{Extra}->{HGVSc} = $hgvs_t if $hgvs_t;
    $line->{Extra}->{HGVSp} = $hgvs_p if $hgvs_p;
  }

  foreach my $tool (qw(SIFT PolyPhen)) {
    my $lc_tool = lc($tool);

    if (my $opt = $config->{$lc_tool}) {
      my $want_pred = $opt =~ /^p/i;
      my $want_score = $opt =~ /^s/i;
      my $want_both = $opt =~ /^b/i;

      if ($want_both) {
        $want_pred = 1;
        $want_score = 1;
      }

      next unless $want_pred || $want_score;

      my $pred_meth = $lc_tool.'_prediction';
      my $score_meth = $lc_tool.'_score';

      my $pred = $tva->$pred_meth;

      if($pred) {

        if ($want_pred) {
          $pred =~ s/\s+/\_/;
          $line->{Extra}->{$tool} = $pred;
        }

        if ($want_score) {
          my $score = $tva->$score_meth;

          if(defined $score) {
            if($want_pred) {
              $line->{Extra}->{$tool} .= "($score)";
            }
            else {
              $line->{Extra}->{$tool} = $score;
            }
          }
        }
      }
    }
  }

  $line = add_extra_fields($config, $line, $tva);

  return $line;
}

sub add_extra_fields {
  my $config = shift;
  my $line = shift;
  my $bvfoa = shift;

  # overlapping SVs
  if(defined $config->{check_svs} && defined $bvfoa->base_variation_feature->{overlapping_svs}) {
    $line->{Extra}->{SV} = $bvfoa->base_variation_feature->{overlapping_svs};
  }

  # add transcript-specific fields
  $line = add_extra_fields_transcript($config, $line, $bvfoa) if $bvfoa->isa('Bio::EnsEMBL::Variation::BaseTranscriptVariationAllele');

  # run plugins
  $line = run_plugins($bvfoa, $line, $config);

  return $line;
}

sub add_extra_fields_transcript {
  my $config = shift;
  my $line = shift;
  my $tva = shift;

  my $tv = $tva->base_variation_feature_overlap;
  my $tr = $tva->transcript;

  # get gene
  my $gene;

  $line->{Gene} = $tr->{_gene_stable_id};

  if(!defined($line->{Gene})) {
    $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id);
    $line->{Gene} = $gene ? $gene->stable_id : '-';
  }

  # exon/intron numbers
  if ($config->{numbers}) {
    $line->{Extra}->{EXON} = $tv->exon_number if defined $tv->exon_number;
    $line->{Extra}->{INTRON} = $tv->intron_number if defined $tv->intron_number;
  }

  if ($config->{domains}) {
    my $feats = $tv->get_overlapping_ProteinFeatures;

    my @strings;

    for my $feat (@$feats) {
      my $label = $feat->analysis->display_label.':'.$feat->hseqname;

      # replace any special characters
      $label =~ s/[\s;=]/_/g;

      push @strings, $label;
    }

    $line->{Extra}->{DOMAINS} = join ',', @strings if @strings;
  }

  # distance to transcript
  if($line->{Consequence} =~ /(up|down)stream/i) {
    $line->{Extra}->{DISTANCE} = $tv->distance_to_transcript;
  }

  # HGNC
  if(defined $config->{hgnc}) {
    my $hgnc;
    $hgnc = $tr->{_gene_hgnc};

    if(!defined($hgnc)) {
      if(!defined($gene)) {
        $gene = $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id);
      }

      my @entries = grep {$_->database eq 'HGNC'} @{$gene->get_all_DBEntries()};
      if(scalar @entries) {
        $hgnc = $entries[0]->display_id;
      }
    }

    $hgnc = undef if defined($hgnc) && $hgnc eq '-';

    $line->{Extra}->{HGNC} = $hgnc if defined($hgnc);
  }

  # CCDS
  if(defined($config->{ccds})) {
    my $ccds = $tr->{_ccds};

    if(!defined($ccds)) {
      my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries};
      $ccds = $entries[0]->display_id if scalar @entries;
    }

    $ccds = undef if defined($ccds) && $ccds eq '-';

    $line->{Extra}->{CCDS} = $ccds if defined($ccds);
  }

  # refseq xref
  if(defined($config->{xref_refseq})) {
    my $refseq = $tr->{_refseq};

    if(!defined($refseq)) {
      my @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries};
      if(scalar @entries) {
        $refseq = join ",", map {$_->display_id."-".$_->database} @entries;
      }
    }

    $refseq = undef if defined($refseq) && $refseq eq '-';

    $line->{Extra}->{RefSeq} = $refseq if defined($refseq);
  }

  # protein ID
  if(defined $config->{protein}) {
    my $protein = $tr->{_protein};

    if(!defined($protein)) {
      $protein = $tr->translation->stable_id if defined($tr->translation);
    }

    $protein = undef if defined($protein) && $protein eq '-';

    $line->{Extra}->{ENSP} = $protein if defined($protein);
  }

  # canonical transcript
  if(defined $config->{canonical}) {
    $line->{Extra}->{CANONICAL} = 'YES' if $tr->is_canonical;
  }

  return $line;
}

# initialize a line hash
sub init_line {
  my $config = shift;
  my $vf = shift;
  my $base_line = shift;

  my $line = {
    Uploaded_variation => $vf->variation_name,
    Location => ($vf->{chr} || $vf->seq_region_name).':'.format_coords($vf->start, $vf->end),
    Existing_variation => $vf->{existing},
    Extra => {},
  };

  # add custom info
  if(defined($config->{custom}) && scalar @{$config->{custom}}) {
    # merge the custom hash with the extra hash
    my $custom = get_custom_annotation($config, $vf);

    for my $key (keys %$custom) {
      $line->{Extra}->{$key} = $custom->{$key};
    }
  }

  # individual?
  $line->{Extra}->{IND} = $vf->{individual} if defined($vf->{individual});

  # frequencies?
  $line->{Extra}->{FREQS} = join ",", @{$vf->{freqs}} if defined($vf->{freqs});

  # gmaf?
  $line->{Extra}->{GMAF} = $vf->{gmaf} if defined($config->{gmaf}) && defined($vf->{gmaf});

  # copy entries from base_line
  if(defined($base_line)) {
    $line->{$_} = $base_line->{$_} for keys %$base_line;
  }

  return $line;
}
|
|
1829
|
|
1830
|
|
1831 # get custom annotation for a single VF
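# (returns a hashref keyed by custom track name, each value a comma-separated list of
#  overlapping feature names, e.g. {conservation => 'feat_1,feat_2'}; the track and
#  feature names here are purely illustrative. Callers typically merge the result
#  straight into the output line, as init_line does:
#    my $custom = get_custom_annotation($config, $vf);
#    $line->{Extra}->{$_} = $custom->{$_} for keys %$custom;
# )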
|
|
1832 sub get_custom_annotation {
|
|
1833 my $config = shift;
|
|
1834 my $vf = shift;
|
|
1835 my $cache = shift;
|
|
1836
|
|
1837 return $vf->{custom} if defined($vf->{custom});
|
|
1838
|
|
1839 my $annotation = {};
|
|
1840
|
|
1841 my $chr = $vf->{chr};
|
|
1842
|
|
1843 if(!defined($cache)) {
|
|
1844 # spoof regions
|
|
1845 my $regions;
|
|
1846 $regions->{$chr} = [$vf->{start}.'-'.$vf->{end}];
|
|
1847 $cache = cache_custom_annotation($config, $regions, $chr);
|
|
1848 }
|
|
1849
|
|
1850 foreach my $custom(@{$config->{custom}}) {
|
|
1851 next unless defined($cache->{$chr}->{$custom->{name}});
|
|
1852
|
|
1853 # exact type must match coords of variant exactly
|
|
1854 if($custom->{type} eq 'exact') {
|
|
1855 foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$vf->{start}}}) {
|
|
1856
|
|
1857 next unless
|
|
1858 $feature->{chr} eq $chr &&
|
|
1859 $feature->{start} eq $vf->{start} &&
|
|
1860 $feature->{end} eq $vf->{end};
|
|
1861
|
|
1862 $annotation->{$custom->{name}} .= $feature->{name}.',';
|
|
1863 }
|
|
1864 }
|
|
1865
|
|
1866 # overlap type only needs to overlap, but we need to search the whole range
|
|
1867 elsif($custom->{type} eq 'overlap') {
|
|
1868 foreach my $pos(keys %{$cache->{$chr}->{$custom->{name}}}) {
|
|
1869 foreach my $feature(values %{$cache->{$chr}->{$custom->{name}}->{$pos}}) {
|
|
1870
|
|
1871 next unless
|
|
1872 $feature->{chr} eq $chr &&
|
|
1873 $feature->{end} >= $vf->{start} &&
|
|
1874 $feature->{start} <= $vf->{end};
|
|
1875
|
|
1876 $annotation->{$custom->{name}} .= $feature->{name}.',';
|
|
1877 }
|
|
1878 }
|
|
1879 }
|
|
1880
|
|
1881 # trim off trailing commas
|
|
1882 $annotation->{$custom->{name}} =~ s/\,$//g if defined($annotation->{$custom->{name}});
|
|
1883 }
|
|
1884
|
|
1885 return $annotation;
|
|
1886 }
|
|
1887
|
|
1888 # decides whether to print a VF based on user defined consequences
|
|
1889 sub filter_by_consequence {
|
|
1890 my $config = shift;
|
|
1891 my $vf = shift;
|
|
1892 my $filters = $config->{filter};
|
|
1893
|
|
1894 # find out if we only have "no"s
|
|
1895 my $only_nos = 0;
|
|
1896 $only_nos = 1 if (sort {$a <=> $b} values %$filters)[-1] == 0;
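# e.g. $filters = {stop_gained => 1} requires at least one matching "yes" term,
# whereas $filters = {intron_variant => 0} contains only "no"s, so $only_nos is set
# and a variant passes as long as none of its consequences are excluded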
|
|
1897
|
|
1898 my ($yes, $no) = (0, 0);
|
|
1899
|
|
1900 # get all consequences across all term types
|
|
1901 my @types = ('SO', 'display');
|
|
1902
|
|
1903 my @cons;
|
|
1904 push @cons, @{$vf->consequence_type($_)} for @types;
|
|
1905
|
|
1906 # add regulatory consequences
|
|
1907 if(defined($config->{regulatory})) {
|
|
1908 foreach my $term_type(@types) {
|
|
1909 my $term_method = $term_type.'_term';
|
|
1910
|
|
1911 for my $rfv (@{ $vf->get_all_RegulatoryFeatureVariations }) {
|
|
1912 for my $rfva(@{$rfv->get_all_alternate_RegulatoryFeatureVariationAlleles}) {
|
|
1913 push @cons, map {$_->$term_method} @{ $rfva->get_all_OverlapConsequences };
|
|
1914 }
|
|
1915 }
|
|
1916 for my $mfv (@{ $vf->get_all_MotifFeatureVariations }) {
|
|
1917 for my $mfva(@{$mfv->get_all_alternate_MotifFeatureVariationAlleles}) {
|
|
1918 push @cons, map {$_->$term_method} @{ $mfva->get_all_OverlapConsequences };
|
|
1919 }
|
|
1920 }
|
|
1921 }
|
|
1922 }
|
|
1923
|
|
1924 foreach my $con(grep {defined($_) && defined($filters->{$_})} @cons) {
|
|
1925 if($filters->{$con} == 1) {
|
|
1926 $yes = 1;
|
|
1927 }
|
|
1928 else {
|
|
1929 $no = 1;
|
|
1930 }
|
|
1931 }
|
|
1932
|
|
1933 # check special case, coding
|
|
1934 if(defined($filters->{coding})) {
|
|
1935 if(grep {$_->affects_cds} @{$vf->get_all_TranscriptVariations}) {
|
|
1936 if($filters->{coding} == 1) {
|
|
1937 $yes = 1;
|
|
1938 }
|
|
1939 else {
|
|
1940 $no = 1;
|
|
1941 }
|
|
1942 }
|
|
1943 }
|
|
1944
|
|
1945 my $ok = 0;
|
|
1946 if($only_nos) {
|
|
1947 $ok = 1 if !$no;
|
|
1948 }
|
|
1949 else {
|
|
1950 $ok = 1 if $yes && !$no;
|
|
1951 }
|
|
1952
|
|
1953 return $ok;
|
|
1954 }
|
|
1955
|
|
1956
|
|
1957 # takes VFs created from input, fixes and checks various things
|
|
1958 sub validate_vf {
|
|
1959 my $config = shift;
|
|
1960 my $vf = shift;
|
|
1961
|
|
1962 # user specified chr skip list
|
|
1963 return 0 if defined($config->{chr}) && !$config->{chr}->{$vf->{chr}};
|
|
1964
|
|
1965 # fix inputs
|
|
1966 $vf->{chr} =~ s/chr//ig unless $vf->{chr} =~ /^chromosome$/i;
|
|
1967 $vf->{chr} = 'MT' if $vf->{chr} eq 'M';
|
|
1968 $vf->{strand} ||= 1;
|
|
1969 $vf->{strand} = ($vf->{strand} =~ /\-/ ? "-1" : "1");
|
|
1970
|
|
1971 # sanity checks
|
|
1972 unless($vf->{start} =~ /^\d+$/ && $vf->{end} =~ /^\d+$/) {
|
|
1973 warn("WARNING: Start ".$vf->{start}." or end ".$vf->{end}." coordinate invalid on line ".$config->{line_number}."\n") unless defined $config->{quiet};
|
|
1974 return 0;
|
|
1975 }
|
|
1976
|
|
1977 # structural variation?
|
|
1978 return validate_svf($config, $vf) if $vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature');
|
|
1979
|
|
1980 # uppercase allele string
|
|
1981 $vf->{allele_string} =~ tr/a-z/A-Z/;
|
|
1982
|
|
1983 unless($vf->{allele_string} =~ /([ACGT-]+\/*)+/) {
|
|
1984 warn("WARNING: Invalid allele string ".$vf->{allele_string}." on line ".$config->{line_number}." or possible parsing error\n") unless defined $config->{quiet};
|
|
1985 return 0;
|
|
1986 }
|
|
1987
|
|
1988 # insertion should have start = end + 1
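# (Ensembl convention: an insertion between bases 100 and 101 is given start=101,
#  end=100 and an allele string such as "-/AG")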
|
|
1989 if($vf->{allele_string} =~ /^\-\// && $vf->{start} != $vf->{end} + 1) {
|
|
1990 warn(
|
|
1991 "WARNING: Alleles look like an insertion (".
|
|
1992 $vf->{allele_string}.
|
|
1993 ") but coordinates are not start = end + 1 (START=".
|
|
1994 $vf->{start}.", END=".$vf->{end}.
|
|
1995 ") on line ".$config->{line_number}."\n"
|
|
1996 ) unless defined($config->{quiet});
|
|
1997 return 0;
|
|
1998 }
|
|
1999
|
|
2000 # check length of reference matches seq length spanned
|
|
2001 my @alleles = split /\//, $vf->{allele_string};
|
|
2002 my $ref_allele = shift @alleles;
|
|
2003 my $tmp_ref_allele = $ref_allele;
|
|
2004 $tmp_ref_allele =~ s/\-//g;
|
|
2005
|
|
2006 #if(($vf->{end} - $vf->{start}) + 1 != length($tmp_ref_allele)) {
|
|
2007 # warn(
|
|
2008 # "WARNING: Length of reference allele (".$ref_allele.
|
|
2009 # " length ".length($tmp_ref_allele).") does not match co-ordinates ".$vf->{start}."-".$vf->{end}.
|
|
2010 # " on line ".$config->{line_number}
|
|
2011 # ) unless defined($config->{quiet});
|
|
2012 # return 0;
|
|
2013 #}
|
|
2014
|
|
2015 # flag as unbalanced
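# an alt allele whose length differs from the reference (e.g. ref "A" vs alt "AGG")
# marks the VF as an indel; whole_genome_fetch_transcript later caches such VFs on
# their overlapping transcripts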
|
|
2016 foreach my $allele(@alleles) {
|
|
2017 $allele =~ s/\-//g;
|
|
2018 $vf->{indel} = 1 unless length($allele) == length($tmp_ref_allele);
|
|
2019 }
|
|
2020
|
|
2021 # check reference allele if requested
|
|
2022 if(defined $config->{check_ref}) {
|
|
2023 my $ok = 0;
|
|
2024 my $slice_ref_allele;
|
|
2025
|
|
2026 # insertion, therefore no ref allele to check
|
|
2027 if($ref_allele eq '-') {
|
|
2028 $ok = 1;
|
|
2029 }
|
|
2030 else {
|
|
2031 my $slice_ref = $vf->{slice}->sub_Slice($vf->{start}, $vf->{end}, $vf->{strand});
|
|
2032
|
|
2033 if(!defined($slice_ref)) {
|
|
2034 warn "WARNING: Could not fetch sub-slice from ".$vf->{start}."-".$vf->{end}."(".$vf->{strand}.") on line ".$config->{line_number} unless defined $config->{quiet};
|
|
2035 }
|
|
2036
|
|
2037 else {
|
|
2038 $slice_ref_allele = $slice_ref->seq;
|
|
2039 $ok = ($slice_ref_allele eq $ref_allele ? 1 : 0);
|
|
2040 }
|
|
2041 }
|
|
2042
|
|
2043 if(!$ok) {
|
|
2044 warn
|
|
2045 "WARNING: Specified reference allele $ref_allele ",
|
|
2046 "does not match Ensembl reference allele",
|
|
2047 ($slice_ref_allele ? " $slice_ref_allele" : ""),
|
|
2048 " on line ".$config->{line_number} unless defined $config->{quiet};
|
|
2049 return 0;
|
|
2050 }
|
|
2051 }
|
|
2052
|
|
2053 return 1;
|
|
2054 }
|
|
2055
|
|
2056
|
|
2057 # validate a structural variation
|
|
2058 sub validate_svf {
|
|
2059 my $config = shift;
|
|
2060 my $svf = shift;
|
|
2061
|
|
2062 return 1;
|
|
2063 }
|
|
2064
|
|
2065
|
|
2066 # takes a hash of VFs and fetches consequences by pre-fetching overlapping transcripts
|
|
2067 # from database and/or cache
|
|
2068 sub whole_genome_fetch {
|
|
2069 my $config = shift;
|
|
2070 my $chr = shift;
|
|
2071 my $vf_hash = shift;
|
|
2072
|
|
2073 my (%vf_done, @finished_vfs, %seen_rfs);
|
|
2074
|
|
2075 if(defined($config->{offline}) && !-e $config->{dir}.'/'.$chr) {
|
|
2076 debug("No cache found for chromsome $chr") unless defined($config->{quiet});
|
|
2077
|
|
2078 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2079 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
|
|
2080 push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}};
|
|
2081 }
|
|
2082 }
|
|
2083
|
|
2084 return \@finished_vfs;
|
|
2085 }
|
|
2086
|
|
2087 my $slice_cache = $config->{slice_cache};
|
|
2088 build_slice_cache($config, $config->{tr_cache}) unless defined($slice_cache->{$chr});
|
|
2089
|
|
2090 debug("Analyzing chromosome $chr") unless defined($config->{quiet});
|
|
2091
|
|
2092 # custom annotations
|
|
2093 whole_genome_fetch_custom($config, $vf_hash, $chr) if defined($config->{custom});
|
|
2094
|
|
2095 # split up normal variations from SVs
|
|
2096 my ($tmp_vf_hash, @svfs);
|
|
2097
|
|
2098 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2099 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
|
|
2100 foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) {
|
|
2101 if($vf->isa('Bio::EnsEMBL::Variation::StructuralVariationFeature')) {
|
|
2102 push @svfs, $vf;
|
|
2103 }
|
|
2104 else {
|
|
2105 push @{$tmp_vf_hash->{$chr}{$chunk}{$pos}}, $vf;
|
|
2106 }
|
|
2107 }
|
|
2108 }
|
|
2109 }
|
|
2110
|
|
2111 $vf_hash = $tmp_vf_hash;
|
|
2112
|
|
2113 # transcript annotations
|
|
2114 whole_genome_fetch_transcript($config, $vf_hash, $chr)
|
|
2115 unless defined($config->{no_consequences});
|
|
2116
|
|
2117 # regulatory annotations
|
|
2118 whole_genome_fetch_reg($config, $vf_hash, $chr)
|
|
2119 if defined($config->{regulatory})
|
|
2120 && !defined($config->{no_consequences});
|
|
2121
|
|
2122 # structural variations
|
|
2123 @finished_vfs = @{whole_genome_fetch_sv($config, \@svfs, $chr)}
|
|
2124 if scalar @svfs;
|
|
2125
|
|
2126 # sort results into @finished_vfs array
|
|
2127 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2128 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
|
|
2129
|
|
2130 # pinch slice from slice cache if we don't already have it
|
|
2131 $_->{slice} ||= $slice_cache->{$chr} for @{$vf_hash->{$chr}{$chunk}{$pos}};
|
|
2132
|
|
2133 if(defined($config->{regulatory})) {
|
|
2134 foreach my $type(@REG_FEAT_TYPES) {
|
|
2135 $_->{regulation_variations}->{$type} ||= [] for @{$vf_hash->{$chr}{$chunk}{$pos}};
|
|
2136 }
|
|
2137 }
|
|
2138
|
|
2139 if(defined($config->{custom})) {
|
|
2140 $_->{custom} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}};
|
|
2141 }
|
|
2142
|
|
2143 $_->{transcript_variations} ||= {} for @{$vf_hash->{$chr}{$chunk}{$pos}};
|
|
2144
|
|
2145 # add to final array
|
|
2146 push @finished_vfs, @{$vf_hash->{$chr}{$chunk}{$pos}};
|
|
2147 }
|
|
2148 }
|
|
2149
|
|
2150 # sort
|
|
2151 @finished_vfs = sort {$a->{start} <=> $b->{start} || $a->{end} <=> $b->{end}} @finished_vfs;
|
|
2152
|
|
2153 # clean hash
|
|
2154 delete $vf_hash->{$chr};
|
|
2155
|
|
2156 return \@finished_vfs;
|
|
2157 }
|
|
2158
|
|
2159 sub whole_genome_fetch_custom {
|
|
2160 my $config = shift;
|
|
2161 my $vf_hash = shift;
|
|
2162 my $chr = shift;
|
|
2163
|
|
2164 return unless scalar @{$config->{custom}};
|
|
2165
|
|
2166 # create regions based on VFs instead of chunks
|
|
2167 my $tmp_regions;
|
|
2168
|
|
2169 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2170 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
|
|
2171 foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) {
|
|
2172 push @{$tmp_regions->{$chr}}, ($vf->{start}-1).'-'.($vf->{end}+1);
|
|
2173 }
|
|
2174 }
|
|
2175 }
|
|
2176
|
|
2177 return unless defined($tmp_regions->{$chr});
|
|
2178
|
|
2179 # cache annotations
|
|
2180 my $annotation_cache = cache_custom_annotation($config, $tmp_regions, $chr);
|
|
2181
|
|
2182 # count and report
|
|
2183 my $total_annotations = 0;
|
|
2184 $total_annotations += scalar keys %{$annotation_cache->{$chr}->{$_}} for keys %{$annotation_cache->{$chr}};
|
|
2185 debug("Retrieved $total_annotations custom annotations (", (join ", ", map {(scalar keys %{$annotation_cache->{$chr}->{$_}}).' '.$_} keys %{$annotation_cache->{$chr}}), ")");
|
|
2186
|
|
2187 # compare annotations to variations in hash
|
|
2188 debug("Analyzing custom annotations") unless defined($config->{quiet});
|
|
2189 my $total = scalar keys %{$vf_hash->{$chr}};
|
|
2190 my $i = 0;
|
|
2191
|
|
2192 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2193 progress($config, $i++, $total);
|
|
2194
|
|
2195 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
|
|
2196 foreach my $vf(@{$vf_hash->{$chr}{$chunk}{$pos}}) {
|
|
2197
|
|
2198 $vf->{custom} = get_custom_annotation($config, $vf, $annotation_cache);
|
|
2199 }
|
|
2200 }
|
|
2201 }
|
|
2202
|
|
2203 end_progress($config);
|
|
2204 }
|
|
2205
|
|
2206 sub whole_genome_fetch_transcript {
|
|
2207 my $config = shift;
|
|
2208 my $vf_hash = shift;
|
|
2209 my $chr = shift;
|
|
2210
|
|
2211 my $tr_cache = $config->{tr_cache};
|
|
2212 my $slice_cache = $config->{slice_cache};
|
|
2213
|
|
2214 my $up_down_size = MAX_DISTANCE_FROM_TRANSCRIPT;
|
|
2215
|
|
2216 # check we have defined regions
|
|
2217 return unless defined($vf_hash->{$chr}) && defined($tr_cache->{$chr});
|
|
2218
|
|
2219 # copy slice from transcript to slice cache
|
|
2220 $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr});
|
|
2221
|
|
2222 debug("Analyzing variants") unless defined($config->{quiet});
|
|
2223
|
|
2224 my $tr_counter = 0;
|
|
2225 my $tr_count = scalar @{$tr_cache->{$chr}};
|
|
2226
|
|
2227 while($tr_counter < $tr_count) {
|
|
2228
|
|
2229 progress($config, $tr_counter, $tr_count);
|
|
2230 print PARENT "BUMP\n" if defined($config->{forked});
|
|
2231
|
|
2232 my $tr = $tr_cache->{$chr}->[$tr_counter++];
|
|
2233
|
|
2234 # do each overlapping VF
|
|
2235 my $s = $tr->start - $up_down_size;
|
|
2236 my $e = $tr->end + $up_down_size;
|
|
2237
|
|
2238 # get the chunks this transcript overlaps
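# e.g. with a chunk_size of 50_000 (value illustrative), a transcript plus flank
# spanning 1_230_000-1_280_000 covers chunks int(1_230_000/50_000)=24 and
# int(1_280_000/50_000)=25; chunks holding no input variants are then dropped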
|
|
2239 my %chunks;
|
|
2240 $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size}));
|
|
2241 map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks;
|
|
2242
|
|
2243 # pointer to previous VF
|
|
2244 # used to tell plugins this is the last variant analysed in this transcript
|
|
2245 my $previous_vf;
|
|
2246
|
|
2247 foreach my $chunk(keys %chunks) {
|
|
2248 foreach my $vf(
|
|
2249 grep {$_->{start} <= $e && $_->{end} >= $s}
|
|
2250 map {@{$vf_hash->{$chr}{$chunk}{$_}}}
|
|
2251 keys %{$vf_hash->{$chr}{$chunk}}
|
|
2252 ) {
|
|
2253 # pinch slice from slice cache if we don't already have it
|
|
2254 $vf->{slice} ||= $slice_cache->{$chr};
|
|
2255
|
|
2256 my $tv = Bio::EnsEMBL::Variation::TranscriptVariation->new(
|
|
2257 -transcript => $tr,
|
|
2258 -variation_feature => $vf,
|
|
2259 -adaptor => $config->{tva},
|
|
2260 -no_ref_check => 1,
|
|
2261 -no_transfer => 1
|
|
2262 );
|
|
2263
|
|
2264 # prefetching stuff here prevents doing loads at the
|
|
2265 # end and makes progress reporting more useful
|
|
2266 $tv->_prefetch_for_vep;
|
|
2267
|
|
2268 $vf->add_TranscriptVariation($tv);
|
|
2269
|
|
2270 # cache VF on the transcript if it is an unbalanced sub
|
|
2271 push @{$tr->{indels}}, $vf if defined($vf->{indel});
|
|
2272
|
|
2273 if(defined($config->{individual})) {
|
|
2274
|
|
2275 # store VF on transcript, weaken reference to avoid circularity
|
|
2276 push @{$tr->{vfs}}, $vf;
|
|
2277 weaken($tr->{vfs}->[-1]);
|
|
2278
|
|
2279 delete $previous_vf->{last_in_transcript}->{$tr->stable_id};
|
|
2280 $vf->{last_in_transcript}->{$tr->stable_id} = 1;
|
|
2281 }
|
|
2282
|
|
2283 $previous_vf = $vf;
|
|
2284 }
|
|
2285 }
|
|
2286 }
|
|
2287
|
|
2288 end_progress($config);
|
|
2289 }
|
|
2290
|
|
2291 sub whole_genome_fetch_reg {
|
|
2292 my $config = shift;
|
|
2293 my $vf_hash = shift;
|
|
2294 my $chr = shift;
|
|
2295
|
|
2296 my $rf_cache = $config->{rf_cache};
|
|
2297 my $slice_cache = $config->{slice_cache};
|
|
2298
|
|
2299 foreach my $type(keys %{$rf_cache->{$chr}}) {
|
|
2300 debug("Analyzing ".$type."s") unless defined($config->{quiet});
|
|
2301
|
|
2302 my $constructor = 'Bio::EnsEMBL::Variation::'.$type.'Variation';
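# e.g. $type 'RegulatoryFeature' gives Bio::EnsEMBL::Variation::RegulatoryFeatureVariation,
# 'MotifFeature' gives Bio::EnsEMBL::Variation::MotifFeatureVariation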
|
|
2303
|
|
2304 my $rf_counter = 0;
|
|
2305 my $rf_count = scalar @{$rf_cache->{$chr}->{$type}};
|
|
2306
|
|
2307 while($rf_counter < $rf_count) {
|
|
2308
|
|
2309 progress($config, $rf_counter, $rf_count);
|
|
2310 print PARENT "BUMP\n" if defined($config->{forked});
|
|
2311
|
|
2312 my $rf = $rf_cache->{$chr}->{$type}->[$rf_counter++];
|
|
2313
|
|
2314 # do each overlapping VF
|
|
2315 my $s = $rf->{start};
|
|
2316 my $e = $rf->{end};
|
|
2317
|
|
2318 # get the chunks this regulatory feature overlaps
|
|
2319 my %chunks;
|
|
2320 $chunks{$_} = 1 for (int($s/$config->{chunk_size})..int($e/$config->{chunk_size}));
|
|
2321 map {delete $chunks{$_} unless defined($vf_hash->{$chr}{$_})} keys %chunks;
|
|
2322
|
|
2323 foreach my $chunk(keys %chunks) {
|
|
2324 foreach my $vf(
|
|
2325 grep {$_->{start} <= $e && $_->{end} >= $s}
|
|
2326 map {@{$vf_hash->{$chr}{$chunk}{$_}}}
|
|
2327 keys %{$vf_hash->{$chr}{$chunk}}
|
|
2328 ) {
|
|
2329 push @{$vf->{regulation_variations}->{$type}}, $constructor->new(
|
|
2330 -variation_feature => $vf,
|
|
2331 -feature => $rf,
|
|
2332 -no_ref_check => 1,
|
|
2333 -no_transfer => 1
|
|
2334 );
|
|
2335 }
|
|
2336 }
|
|
2337 }
|
|
2338
|
|
2339 end_progress($config);
|
|
2340 }
|
|
2341 }
|
|
2342
|
|
2343 sub whole_genome_fetch_sv {
|
|
2344 my $config = shift;
|
|
2345 my $svfs = shift;
|
|
2346 my $chr = shift;
|
|
2347
|
|
2348 my $tr_cache = $config->{tr_cache};
|
|
2349 my $rf_cache = $config->{rf_cache};
|
|
2350 my $slice_cache = $config->{slice_cache};
|
|
2351
|
|
2352 debug("Analyzing structural variations") unless defined($config->{quiet});
|
|
2353
|
|
2354 my($i, $total) = (0, scalar @$svfs);
|
|
2355
|
|
2356 my @finished_vfs;
|
|
2357
|
|
2358 foreach my $svf(@$svfs) {
|
|
2359 progress($config, $i++, $total);
|
|
2360
|
|
2361 my %done_genes = ();
|
|
2362
|
|
2363 if(defined($tr_cache->{$chr})) {
|
|
2364 foreach my $tr(grep {overlap($_->{start} - MAX_DISTANCE_FROM_TRANSCRIPT, $_->{end} + MAX_DISTANCE_FROM_TRANSCRIPT, $svf->{start}, $svf->{end})} @{$tr_cache->{$chr}}) {
|
|
2365 my $svo = Bio::EnsEMBL::Variation::TranscriptStructuralVariation->new(
|
|
2366 -transcript => $tr,
|
|
2367 -structural_variation_feature => $svf,
|
|
2368 -no_transfer => 1
|
|
2369 );
|
|
2370
|
|
2371 $svf->add_TranscriptStructuralVariation($svo);
|
|
2372 }
|
|
2373 }
|
|
2374
|
|
2375 $svf->{transcript_structural_variations} ||= {};
|
|
2376
|
|
2377 # do regulatory features
|
|
2378 if(defined($config->{regulatory}) && defined($rf_cache->{$chr})) {
|
|
2379 foreach my $rf_type(qw/RegulatoryFeature/) {#keys %{$rf_cache->{$chr}}) {
|
|
2380 foreach my $rf(grep {$_->{start} <= $svf->{end} && $_->{end} >= $svf->{start}} @{$rf_cache->{$chr}->{$rf_type}}) {
|
|
2381 my $svo = Bio::EnsEMBL::Variation::StructuralVariationOverlap->new(
|
|
2382 -feature => $rf,
|
|
2383 -structural_variation_feature => $svf,
|
|
2384 -no_transfer => 1
|
|
2385 );
|
|
2386
|
|
2387 push @{$svf->{regulation_structural_variations}->{$rf_type}}, $svo;
|
|
2388 }
|
|
2389
|
|
2390 $svf->{regulation_structural_variations}->{$rf_type} ||= [];
|
|
2391 }
|
|
2392 }
|
|
2393
|
|
2394 # sort them
|
|
2395 #$svf->_sort_svos;
|
|
2396 push @finished_vfs, $svf;
|
|
2397 }
|
|
2398
|
|
2399 end_progress($config);
|
|
2400
|
|
2401 return \@finished_vfs;
|
|
2402 }
|
|
2403
|
|
2404 # retrieves transcripts given region list
|
|
2405 sub fetch_transcripts {
|
|
2406 my $config = shift;
|
|
2407 my $regions = shift;
|
|
2408 my $trim_regions = shift;
|
|
2409
|
|
2410 my $tr_cache = $config->{tr_cache};
|
|
2411 my $slice_cache = $config->{slice_cache};
|
|
2412
|
|
2413 my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0);
|
|
2414
|
|
2415 my %seen_trs;
|
|
2416
|
|
2417 $count_from_mem = 0;
|
|
2418 my $region_count = 0;
|
|
2419 foreach my $chr(keys %{$regions}) {
|
|
2420 $count_from_mem += scalar @{$tr_cache->{$chr}} if defined($tr_cache->{$chr}) && ref($tr_cache->{$chr}) eq 'ARRAY';
|
|
2421 $region_count += scalar @{$regions->{$chr}};
|
|
2422 }
|
|
2423
|
|
2424 my $counter;
|
|
2425
|
|
2426 debug("Reading transcript data from cache and/or database") unless defined($config->{quiet});
|
|
2427
|
|
2428 foreach my $chr(keys %{$regions}) {
|
|
2429 foreach my $region(sort {(split /\-/, $a)[0] <=> (split /\-/, $b)[0]} @{$regions->{$chr}}) {
|
|
2430 progress($config, $counter++, $region_count);
|
|
2431
|
|
2432 # skip regions beyond the end of the chr
|
|
2433 next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length;
|
|
2434
|
|
2435 next if defined($config->{loaded_tr}->{$chr}->{$region});
|
|
2436
|
|
2437 # force quiet so other methods don't mess up the progress bar
|
|
2438 my $quiet = $config->{quiet};
|
|
2439 $config->{quiet} = 1;
|
|
2440
|
|
2441 # try and load cache from disk if using cache
|
|
2442 my $tmp_cache;
|
|
2443 if(defined($config->{cache})) {
|
|
2444 #$tmp_cache = (
|
|
2445 # defined($config->{tabix}) ?
|
|
2446 # load_dumped_transcript_cache_tabix($config, $chr, $region, $trim_regions) :
|
|
2447 # load_dumped_transcript_cache($config, $chr, $region)
|
|
2448 #);
|
|
2449 $tmp_cache = load_dumped_transcript_cache($config, $chr, $region);
|
|
2450 $count_from_cache += scalar @{$tmp_cache->{$chr}} if defined($tmp_cache->{$chr});
|
|
2451 $config->{loaded_tr}->{$chr}->{$region} = 1;
|
|
2452 }
|
|
2453
|
|
2454 # no cache found on disk or not using cache
|
|
2455 if(!defined($tmp_cache->{$chr})) {
|
|
2456
|
|
2457 if(defined($config->{offline})) {
|
|
2458 # restore quiet status
|
|
2459 $config->{quiet} = $quiet;
|
|
2460
|
|
2461 debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet});
|
|
2462 next;
|
|
2463 }
|
|
2464
|
|
2465 # spoof temporary region hash
|
|
2466 my $tmp_hash;
|
|
2467 push @{$tmp_hash->{$chr}}, $region;
|
|
2468
|
|
2469 $tmp_cache = cache_transcripts($config, $tmp_hash);
|
|
2470
|
|
2471 # make it an empty arrayref that gets cached
|
|
2472 # so we don't get confused and reload next time round
|
|
2473 $tmp_cache->{$chr} ||= [];
|
|
2474
|
|
2475 $count_from_db += scalar @{$tmp_cache->{$chr}};
|
|
2476
|
|
2477 # dump to disk if writing to cache
|
|
2478 (defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $region) : dump_transcript_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache});
|
|
2479
|
|
2480 $config->{loaded_tr}->{$chr}->{$region} = 1;
|
|
2481 }
|
|
2482
|
|
2483 # add loaded transcripts to main cache
|
|
2484 if(defined($tmp_cache->{$chr})) {
|
|
2485 while(my $tr = shift @{$tmp_cache->{$chr}}) {
|
|
2486
|
|
2487 # track already added transcripts by dbID
|
|
2488 my $dbID = $tr->dbID;
|
|
2489 if($seen_trs{$dbID}) {
|
|
2490 $count_duplicates++;
|
|
2491 next;
|
|
2492 }
|
|
2493
|
|
2494 # trim out?
|
|
2495 #if(defined($trim_regions) && defined($trim_regions->{$chr})) {
|
|
2496 # my $tmp_count = scalar grep {
|
|
2497 # overlap(
|
|
2498 # (split /\-/, $_)[0], (split /\-/, $_)[1],
|
|
2499 # $tr->{start}, $tr->{end}
|
|
2500 # )
|
|
2501 # } @{$trim_regions->{$chr}};
|
|
2502 #
|
|
2503 # if(!$tmp_count) {
|
|
2504 # $count_trimmed++;
|
|
2505 # next;
|
|
2506 # }
|
|
2507 #}
|
|
2508
|
|
2509 $seen_trs{$dbID} = 1;
|
|
2510
|
|
2511 push @{$tr_cache->{$chr}}, $tr;
|
|
2512 }
|
|
2513 }
|
|
2514
|
|
2515 $tr_cache->{$chr} ||= [];
|
|
2516
|
|
2517 undef $tmp_cache;
|
|
2518
|
|
2519 # restore quiet status
|
|
2520 $config->{quiet} = $quiet;
|
|
2521
|
|
2522 # build slice cache
|
|
2523 $slice_cache = build_slice_cache($config, $tr_cache) unless defined($slice_cache->{$chr});
|
|
2524 }
|
|
2525 }
|
|
2526
|
|
2527 end_progress($config);
|
|
2528
|
|
2529 my $tr_count = 0;
|
|
2530 $tr_count += scalar @{$tr_cache->{$_}} for keys %$tr_cache;
|
|
2531
|
|
2532 debug("Retrieved $tr_count transcripts ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet});
|
|
2533
|
|
2534 return $tr_count;
|
|
2535 }
|
|
2536
|
|
2537 sub fetch_regfeats {
|
|
2538 my $config = shift;
|
|
2539 my $regions = shift;
|
|
2540 my $trim_regions = shift;
|
|
2541
|
|
2542 my $rf_cache = $config->{rf_cache};
|
|
2543 my $slice_cache = $config->{slice_cache};
|
|
2544
|
|
2545 my ($count_from_mem, $count_from_db, $count_from_cache, $count_duplicates, $count_trimmed) = (0, 0, 0, 0, 0);
|
|
2546
|
|
2547 my %seen_rfs;
|
|
2548
|
|
2549 $count_from_mem = 0;
|
|
2550 my $region_count = 0;
|
|
2551
|
|
2552 foreach my $chr(keys %$regions) {
|
|
2553 if(defined($rf_cache->{$chr}) && ref($rf_cache->{$chr}) eq 'HASH') {
|
|
2554 $count_from_mem += scalar @{$rf_cache->{$chr}->{$_}} for keys %{$rf_cache->{$chr}};
|
|
2555 }
|
|
2556 $region_count += scalar @{$regions->{$chr}};
|
|
2557 }
|
|
2558
|
|
2559 my $counter = 0;
|
|
2560
|
|
2561 debug("Reading regulatory data from cache and/or database") unless defined($config->{quiet});
|
|
2562
|
|
2563 foreach my $chr(keys %$regions) {
|
|
2564 foreach my $region(sort {(split /\-/, $a)[0] <=> (split /\-/, $b)[0]} @{$regions->{$chr}}) {
|
|
2565 progress($config, $counter++, $region_count);
|
|
2566
|
|
2567 next if defined($config->{loaded_rf}->{$chr}->{$region});
|
|
2568
|
|
2569 # skip regions beyond the end of the chr
|
|
2570 next if defined($slice_cache->{$chr}) && (split /\-/, $region)[0] > $slice_cache->{$chr}->length;
|
|
2571
|
|
2572 # force quiet so other methods don't mess up the progress bar
|
|
2573 my $quiet = $config->{quiet};
|
|
2574 $config->{quiet} = 1;
|
|
2575
|
|
2576 # try and load cache from disk if using cache
|
|
2577 my $tmp_cache;
|
|
2578 if(defined($config->{cache})) {
|
|
2579 $tmp_cache = load_dumped_reg_feat_cache($config, $chr, $region);
|
|
2580
|
|
2581 #$tmp_cache =
|
|
2582 # defined($config->{tabix}) ?
|
|
2583 # load_dumped_reg_feat_cache_tabix($config, $chr, $region, $trim_regions) :
|
|
2584 # load_dumped_reg_feat_cache($config, $chr, $region);
|
|
2585
|
|
2586
|
|
2587 if(defined($tmp_cache->{$chr})) {
|
|
2588 $count_from_cache += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}};
|
|
2589 }
|
|
2590
|
|
2591 # flag as loaded
|
|
2592 $config->{loaded_rf}->{$chr}->{$region} = 1;
|
|
2593 }
|
|
2594
|
|
2595 # no cache found on disk or not using cache
|
|
2596 if(!defined($tmp_cache->{$chr})) {
|
|
2597
|
|
2598 if(defined($config->{offline})) {
|
|
2599
|
|
2600 # restore quiet status
|
|
2601 $config->{quiet} = $quiet;
|
|
2602
|
|
2603 debug("WARNING: Could not find cache for $chr\:$region") unless defined($config->{quiet});
|
|
2604 next;
|
|
2605 }
|
|
2606
|
|
2607 # spoof temporary region hash
|
|
2608 my $tmp_hash;
|
|
2609 push @{$tmp_hash->{$chr}}, $region;
|
|
2610
|
|
2611 $tmp_cache = cache_reg_feats($config, $tmp_hash);
|
|
2612
|
|
2613 # make it an empty hashref that gets cached
|
|
2614 # so we don't get confused and reload next time round
|
|
2615 $tmp_cache->{$chr} ||= {};
|
|
2616
|
|
2617 $count_from_db += scalar @{$tmp_cache->{$chr}->{$_}} for keys %{$tmp_cache->{$chr}};
|
|
2618
|
|
2619 # dump to disk if writing to cache
|
|
2620 #dump_reg_feat_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache});
|
|
2621 (defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $tmp_cache, $chr, $region) : dump_reg_feat_cache($config, $tmp_cache, $chr, $region)) if defined($config->{write_cache});
|
|
2622
|
|
2623 # restore deleted coord_system adaptor
|
|
2624 foreach my $type(keys %{$tmp_cache->{$chr}}) {
|
|
2625 $_->{slice}->{coord_system}->{adaptor} = $config->{csa} for @{$tmp_cache->{$chr}->{$type}};
|
|
2626 }
|
|
2627
|
|
2628 # flag as loaded
|
|
2629 $config->{loaded_rf}->{$chr}->{$region} = 1;
|
|
2630 }
|
|
2631
|
|
2632 # add loaded reg_feats to main cache
|
|
2633 if(defined($tmp_cache->{$chr})) {
|
|
2634 foreach my $type(keys %{$tmp_cache->{$chr}}) {
|
|
2635 while(my $rf = shift @{$tmp_cache->{$chr}->{$type}}) {
|
|
2636
|
|
2637 # filter on cell type
|
|
2638 if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
|
|
2639 next unless grep {$rf->{cell_types}->{$_}} @{$config->{cell_type}};
|
|
2640 }
|
|
2641
|
|
2642 # trim out?
|
|
2643 #if(defined($trim_regions) && defined($trim_regions->{$chr})) {
|
|
2644 # my $tmp_count = scalar grep {
|
|
2645 # overlap(
|
|
2646 # (split /\-/, $_)[0], (split /\-/, $_)[1],
|
|
2647 # $rf->{start}, $rf->{end}
|
|
2648 # )
|
|
2649 # } @{$trim_regions->{$chr}};
|
|
2650 #
|
|
2651 # if(!$tmp_count) {
|
|
2652 # $count_trimmed++;
|
|
2653 # next;
|
|
2654 # }
|
|
2655 #}
|
|
2656
|
|
2657 # track already added reg_feats by dbID
|
|
2658 my $dbID = $rf->{dbID};
|
|
2659 if($seen_rfs{$dbID}) {
|
|
2660 $count_duplicates++;
|
|
2661 next;
|
|
2662 }
|
|
2663 $seen_rfs{$dbID} = 1;
|
|
2664
|
|
2665 push @{$rf_cache->{$chr}->{$type}}, $rf;
|
|
2666 }
|
|
2667 }
|
|
2668 }
|
|
2669
|
|
2670 undef $tmp_cache;
|
|
2671
|
|
2672 # restore quiet status
|
|
2673 $config->{quiet} = $quiet;
|
|
2674 }
|
|
2675 }
|
|
2676
|
|
2677 end_progress($config);
|
|
2678
|
|
2679 my $rf_count = 0;
|
|
2680
|
|
2681 foreach my $chr(keys %$rf_cache) {
|
|
2682 foreach my $type(keys %{$rf_cache->{$chr}}) {
|
|
2683 $rf_count += scalar @{$rf_cache->{$chr}->{$type}};
|
|
2684 }
|
|
2685 }
|
|
2686
|
|
2687 debug("Retrieved $rf_count regulatory features ($count_from_mem mem, $count_from_cache cached, $count_from_db DB, $count_duplicates duplicates)") unless defined($config->{quiet});
|
|
2688
|
|
2689 return $rf_count;
|
|
2690 }
|
|
2691
|
|
2692 # gets existing VFs for a vf_hash
|
|
2693 sub check_existing_hash {
|
|
2694 my $config = shift;
|
|
2695 my $vf_hash = shift;
|
|
2696 my $variation_cache;
|
|
2697
|
|
2698 # we only care about non-SVs here
|
|
2699 my %new_hash;
|
|
2700
|
|
2701 foreach my $chr(keys %{$vf_hash}) {
|
|
2702 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2703 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
|
|
2704 foreach my $var(grep {$_->isa('Bio::EnsEMBL::Variation::VariationFeature')} @{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
|
|
2705 push @{$new_hash{$chr}->{$chunk}->{$pos}}, $var;
|
|
2706 }
|
|
2707 }
|
|
2708 }
|
|
2709 }
|
|
2710
|
|
2711 $vf_hash = \%new_hash;
|
|
2712
|
|
2713 debug("Checking for existing variations") unless defined($config->{quiet});
|
|
2714
|
|
2715 my ($chunk_count, $counter);
|
|
2716 $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash};
|
|
2717
|
|
2718 foreach my $chr(keys %{$vf_hash}) {
|
|
2719
|
|
2720 my %loaded_regions;
|
|
2721
|
|
2722 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2723 progress($config, $counter++, $chunk_count);
|
|
2724
|
|
2725 # get the VFs for this chunk
|
|
2726 my ($start, $end);
|
|
2727
|
|
2728 # work out start and end using chunk_size
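# e.g. chunk 24 with a chunk_size of 50_000 (value illustrative) spans 1_200_000-1_250_000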
|
|
2729 $start = $config->{chunk_size} * $chunk;
|
|
2730 $end = $config->{chunk_size} * ($chunk + 1);
|
|
2731
|
|
2732 # using cache?
|
|
2733 if(defined($config->{cache})) {
|
|
2734 my $tmp_regions;
|
|
2735 push @{$tmp_regions->{$chr}}, $start.'-'.$end;
|
|
2736
|
|
2737 my $converted_regions = convert_regions($config, $tmp_regions);
|
|
2738
|
|
2739 foreach my $region(@{$converted_regions->{$chr}}) {
|
|
2740
|
|
2741 unless($loaded_regions{$region}) {
|
|
2742 my $tmp_cache = load_dumped_variation_cache($config, $chr, $region);
|
|
2743
|
|
2744 # load from DB if not found in cache
|
|
2745 if(!defined($tmp_cache->{$chr})) {
|
|
2746 if(defined($config->{offline})) {
|
|
2747 debug("WARNING: Could not find variation cache for $chr\:$region") unless defined($config->{quiet});
|
|
2748 next;
|
|
2749 }
|
|
2750
|
|
2751 $tmp_cache->{$chr} = get_variations_in_region($config, $chr, $region);
|
|
2752 dump_variation_cache($config, $tmp_cache, $chr, $region) if defined($config->{write_cache});
|
|
2753 }
|
|
2754
|
|
2755 # merge tmp_cache with the main cache
|
|
2756 foreach my $key(keys %{$tmp_cache->{$chr}}) {
|
|
2757 $variation_cache->{$chr}->{$key} = $tmp_cache->{$chr}->{$key};
|
|
2758 delete $tmp_cache->{$chr}->{$key};
|
|
2759 }
|
|
2760
|
|
2761 # clear memory
|
|
2762 undef $tmp_cache;
|
|
2763
|
|
2764 # record this region as fetched
|
|
2765 $loaded_regions{$region} = 1;
|
|
2766 }
|
|
2767 }
|
|
2768 }
|
|
2769
|
|
2770 # no cache, get all variations in region from DB
|
|
2771 else {
|
|
2772
|
|
2773 my ($min, $max);
|
|
2774
|
|
2775 # we can fetch smaller region when using DB
|
|
2776 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
|
|
2777 foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
|
|
2778 foreach my $coord(qw(start end)) {
|
|
2779 $min = $var->{$coord} if !defined($min) || $var->{$coord} < $min;
|
|
2780 $max = $var->{$coord} if !defined($max) || $var->{$coord} > $max;
|
|
2781 }
|
|
2782 }
|
|
2783 }
|
|
2784
|
|
2785 $variation_cache->{$chr} = get_variations_in_region($config, $chr, $min.'-'.$max);
|
|
2786 }
|
|
2787
|
|
2788 # now compare retrieved vars with vf_hash
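# only existing variants whose failed status (column 1) is <= $config->{failed} are
# kept; columns 6 and 7, when both present, are joined as "col6:col7" into the GMAF
# reported for the variant (their exact roles are inferred from the usage below,
# typically allele:frequency)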
|
|
2789 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
|
|
2790 foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
|
|
2791 my @found;
|
|
2792 my @gmafs;
|
|
2793
|
|
2794 if(defined($variation_cache->{$chr})) {
|
|
2795 if(my $existing_vars = $variation_cache->{$chr}->{$pos}) {
|
|
2796 foreach my $existing_var(grep {$_->[1] <= $config->{failed}} @$existing_vars) {
|
|
2797 unless(is_var_novel($config, $existing_var, $var)) {
|
|
2798 push @found, $existing_var->[0];
|
|
2799 push @gmafs, $existing_var->[6].":".$existing_var->[7] if defined($existing_var->[6]) && defined($existing_var->[7]);
|
|
2800 }
|
|
2801 }
|
|
2802 }
|
|
2803 }
|
|
2804
|
|
2805 $var->{existing} = join ",", @found;
|
|
2806 $var->{existing} ||= '-';
|
|
2807
|
|
2808 $var->{gmaf} = join ",", @gmafs;
|
|
2809 $var->{gmaf} ||= undef;
|
|
2810 }
|
|
2811 }
|
|
2812 }
|
|
2813
|
|
2814 delete $variation_cache->{$chr};
|
|
2815 }
|
|
2816
|
|
2817 end_progress($config);
|
|
2818 }
|
|
2819
|
|
2820 # gets overlapping SVs for a vf_hash
|
|
2821 sub check_svs_hash {
|
|
2822 my $config = shift;
|
|
2823 my $vf_hash = shift;
|
|
2824
|
|
2825 debug("Checking for overlapping structural variations") unless defined($config->{quiet});
|
|
2826
|
|
2827 my ($chunk_count, $counter);
|
|
2828 $chunk_count += scalar keys %{$vf_hash->{$_}} for keys %{$vf_hash};
|
|
2829
|
|
2830 foreach my $chr(keys %{$vf_hash}) {
|
|
2831 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2832
|
|
2833 progress($config, $counter++, $chunk_count);
|
|
2834
|
|
2835 # work out start and end using chunk_size
|
|
2836 my ($start, $end);
|
|
2837 $start = $config->{chunk_size} * $chunk;
|
|
2838 $end = $config->{chunk_size} * ($chunk + 1);
|
|
2839
|
|
2840 # check for structural variations
|
|
2841 if(defined($config->{sa})) {
|
|
2842 my $slice = $config->{sa}->fetch_by_region('chromosome', $chr, $start, $end);
|
|
2843
|
|
2844 if(defined($slice)) {
|
|
2845 my $svs = $config->{svfa}->fetch_all_by_Slice($slice);
|
|
2846
|
|
2847 foreach my $pos(keys %{$vf_hash->{$chr}->{$chunk}}) {
|
|
2848 foreach my $var(@{$vf_hash->{$chr}->{$chunk}->{$pos}}) {
|
|
2849 my $string = join ",",
|
|
2850 map {$_->variation_name}
|
|
2851 grep {$_->seq_region_start <= $var->end && $_->seq_region_end >= $var->start}
|
|
2852 @$svs;
|
|
2853
|
|
2854 $var->{overlapping_svs} = $string if $string;
|
|
2855 }
|
|
2856 }
|
|
2857 }
|
|
2858 }
|
|
2859 }
|
|
2860 }
|
|
2861
|
|
2862 end_progress($config);
|
|
2863 }
|
|
2864
|
|
2865 # gets a slice from the slice adaptor
|
|
2866 sub get_slice {
|
|
2867 my $config = shift;
|
|
2868 my $chr = shift;
|
|
2869 my $otherfeatures = shift;
|
|
2870 $otherfeatures ||= '';
|
|
2871
|
|
2872 return undef unless defined($config->{sa}) && defined($chr);
|
|
2873
|
|
2874 my $slice;
|
|
2875
|
|
2876 # first try to get a chromosome
|
|
2877 eval { $slice = $config->{$otherfeatures.'sa'}->fetch_by_region('chromosome', $chr); };
|
|
2878
|
|
2879 # if failed, try to get any seq region
|
|
2880 if(!defined($slice)) {
|
|
2881 $slice = $config->{$otherfeatures.'sa'}->fetch_by_region(undef, $chr);
|
|
2882 }
|
|
2883
|
|
2884 return $slice;
|
|
2885 }
|
|
2886
|
|
2887
|
|
2888
|
|
2889
|
|
2890 # METHODS THAT DEAL WITH "REGIONS"
|
|
2891 ##################################
|
|
2892
|
|
2893 # gets regions from VF hash
|
|
2894 sub regions_from_hash {
|
|
2895 my $config = shift;
|
|
2896 my $vf_hash = shift;
|
|
2897
|
|
2898 my %include_regions;
|
|
2899
|
|
2900 # if using cache we just want the regions of cache_region_size
|
|
2901 # since that's what we'll get from the cache (or DB if no cache found)
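# e.g. with cache_region_size 1_000_000 and a 5_000 bp up/downstream flank (values
# illustrative), a variant at 25_561_890 expands to 25_556_890-25_566_890 and maps
# to the single cache region "25000001-26000000"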
|
|
2902 if(defined($config->{cache})) {
|
|
2903
|
|
2904 my $region_size = $config->{cache_region_size};
|
|
2905
|
|
2906 foreach my $chr(keys %$vf_hash) {
|
|
2907 $include_regions{$chr} = [];
|
|
2908 my %temp_regions;
|
|
2909
|
|
2910 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2911 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
|
|
2912 my ($s, $e) = ($pos - MAX_DISTANCE_FROM_TRANSCRIPT, $pos + MAX_DISTANCE_FROM_TRANSCRIPT);
|
|
2913
|
|
2914 my $low = int ($s / $region_size);
|
|
2915 my $high = int ($e / $region_size) + 1;
|
|
2916
|
|
2917 for my $i($low..($high - 1)) {
|
|
2918 $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1;
|
|
2919 }
|
|
2920 }
|
|
2921 }
|
|
2922
|
|
2923 @{$include_regions{$chr}} = keys %temp_regions;
|
|
2924 }
|
|
2925 }
|
|
2926
|
|
2927 # if no cache we don't want to fetch more than is necessary, so find the
|
|
2928 # minimum covered region of the variations in the hash
|
|
2929 else {
|
|
2930 foreach my $chr(keys %$vf_hash) {
|
|
2931 $include_regions{$chr} = [];
|
|
2932
|
|
2933 foreach my $chunk(keys %{$vf_hash->{$chr}}) {
|
|
2934 foreach my $pos(keys %{$vf_hash->{$chr}{$chunk}}) {
|
|
2935 add_region($_->start, $_->end, $include_regions{$chr}) for @{$vf_hash->{$chr}{$chunk}{$pos}};
|
|
2936 }
|
|
2937 }
|
|
2938 }
|
|
2939
|
|
2940 # merge regions
|
|
2941 merge_regions(\%include_regions, $config);
|
|
2942 }
|
|
2943
|
|
2944 return \%include_regions;
|
|
2945 }
|
|
2946
|
|
2947 # adds a region to region list, expanding existing one if overlaps
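# e.g. adding a SNP at 100_000 to an empty list (with a 5_000 bp flank, value
# illustrative) pushes "95000-105000"; adding a later variant at 101_000 then just
# widens that entry rather than creating a new one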
|
|
2948 sub add_region {
|
|
2949 my $start = shift;
|
|
2950 my $end = shift;
|
|
2951 my $region_list = shift;
|
|
2952
|
|
2953 # fix end for insertions
|
|
2954 $end = $start if $end < $start;
|
|
2955
|
|
2956 my $added = 0;
|
|
2957 my $i = 0;
|
|
2958
|
|
2959 while ($i < scalar @$region_list) {
|
|
2960 my ($region_start, $region_end) = split /\-/, $region_list->[$i];
|
|
2961
|
|
2962 if($start <= $region_end && $end >= $region_start) {
|
|
2963 my $new_region_start = ($start < $end ? $start : $end) - MAX_DISTANCE_FROM_TRANSCRIPT;
|
|
2964 my $new_region_end = ($start > $end ? $start : $end) + MAX_DISTANCE_FROM_TRANSCRIPT;
|
|
2965
|
|
2966 $new_region_start = 1 if $new_region_start < 1;
|
|
2967
|
|
2968 $region_start = $new_region_start if $new_region_start < $region_start;
|
|
2969 $region_end = $new_region_end if $new_region_end > $region_end;
|
|
2970
|
|
2971 $region_list->[$i] = $region_start.'-'.$region_end;
|
|
2972 $added = 1;
|
|
2973 }
|
|
2974
|
|
2975 $i++;
|
|
2976 }
|
|
2977
|
|
2978 unless($added) {
|
|
2979 my $s = $start - MAX_DISTANCE_FROM_TRANSCRIPT;
|
|
2980 $s = 1 if $s < 1;
|
|
2981
|
|
2982 push @{$region_list}, $s.'-'.($end + MAX_DISTANCE_FROM_TRANSCRIPT);
|
|
2983 }
|
|
2984 }
|
|
2985
|
|
2986 # merges overlapping regions from scans
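# e.g. ["100-200", "150-300", "500-600"] collapses to ["100-300", "500-600"];
# a non-zero $consecutive additionally merges regions lying within that many bp of
# each other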
|
|
2987 sub merge_regions {
|
|
2988 my $include_regions = shift;
|
|
2989 my $config = shift;
|
|
2990 my $consecutive = shift;
|
|
2991 $consecutive ||= 0;
|
|
2992
|
|
2993 # now merge overlapping regions
|
|
2994 foreach my $chr(keys %$include_regions) {
|
|
2995 my $max_index = $#{$include_regions->{$chr}};
|
|
2996 my (@new_regions, %skip);
|
|
2997
|
|
2998 for my $i(0..$max_index) {
|
|
2999 next if $skip{$i};
|
|
3000 my ($s, $e) = split /\-/, $include_regions->{$chr}[$i];
|
|
3001
|
|
3002 for my $j(($i+1)..$max_index) {
|
|
3003 next if $skip{$j};
|
|
3004 my ($ns, $ne) = split /\-/, $include_regions->{$chr}[$j];
|
|
3005
|
|
3006 if($s <= ($ne + $consecutive) && $e >= ($ns - $consecutive)) {
|
|
3007 $s = $ns if $ns < $s;
|
|
3008 $e = $ne if $ne > $e;
|
|
3009
|
|
3010 $skip{$j} = 1;
|
|
3011 }
|
|
3012 }
|
|
3013
|
|
3014 push @new_regions, $s.'-'.$e;
|
|
3015 }
|
|
3016
|
|
3017 # replace original
|
|
3018 $include_regions->{$chr} = \@new_regions;
|
|
3019
|
|
3020 $config->{region_count} += scalar @new_regions;
|
|
3021 }
|
|
3022
|
|
3023 return $include_regions;
|
|
3024 }
|
|
3025
|
|
3026 # converts regions as determined by scan_file to regions loadable from cache
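# e.g. a scanned region "25556890-25566890" becomes the cache-sized bin
# "25000001-26000000", assuming a cache_region_size of 1_000_000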
|
|
3027 sub convert_regions {
|
|
3028 my $config = shift;
|
|
3029 my $regions = shift;
|
|
3030
|
|
3031 return undef unless defined $regions;
|
|
3032
|
|
3033 my $region_size = $config->{cache_region_size};
|
|
3034
|
|
3035 my %new_regions;
|
|
3036
|
|
3037 foreach my $chr(keys %$regions) {
|
|
3038 my %temp_regions;
|
|
3039
|
|
3040 foreach my $region(@{$regions->{$chr}}) {
|
|
3041 my ($s, $e) = split /\-/, $region;
|
|
3042
|
|
3043 my $low = int ($s / $region_size);
|
|
3044 my $high = int ($e / $region_size) + 1;
|
|
3045
|
|
3046 for my $i($low..($high - 1)) {
|
|
3047 $temp_regions{(($i * $region_size) + 1).'-'.(($i + 1) * $region_size)} = 1;
|
|
3048 }
|
|
3049 }
|
|
3050
|
|
3051 @{$new_regions{$chr}} = keys %temp_regions;
|
|
3052 }
|
|
3053
|
|
3054 return \%new_regions;
|
|
3055 }
|
|
3056
|
|
3057
|
|
3058
|
|
3059
|
|
3060
|
|
3061 # CACHE METHODS
|
|
3062 ###############
|
|
3063
|
|
3064 # prunes a cache to get rid of features not in regions in use
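# (per remaining chromosome the union span of its in-use regions is taken and any
#  cached feature lying wholly outside that span is dropped; dropped transcripts also
#  have their transcript/translation circular reference broken so they can be
#  garbage collected)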
|
|
3065 sub prune_cache {
|
|
3066 my $config = shift;
|
|
3067 my $cache = shift;
|
|
3068 my $regions = shift;
|
|
3069 my $loaded = shift;
|
|
3070
|
|
3071 # delete no longer in use chroms
|
|
3072 foreach my $chr(keys %$cache) {
|
|
3073 delete $cache->{$chr} unless defined $regions->{$chr} && scalar @{$regions->{$chr}};
|
|
3074 }
|
|
3075
|
|
3076 my $new_count = 0;
|
|
3077
|
|
3078 foreach my $chr(keys %$cache) {
|
|
3079
|
|
3080 # get total area spanned by regions
|
|
3081 my ($min, $max);
|
|
3082 foreach my $region(@{$regions->{$chr}}) {
|
|
3083 my ($s, $e) = split /\-/, $region;
|
|
3084 $min = $s if !defined($min) or $s < $min;
|
|
3085 $max = $e if !defined($max) or $e > $max;
|
|
3086 }
|
|
3087
|
|
3088 # transcript cache
|
|
3089 if(ref($cache->{$chr}) eq 'ARRAY') {
|
|
3090 $cache->{$chr} = prune_min_max($cache->{$chr}, $min, $max);
|
|
3091 $new_count += scalar @{$cache->{$chr}};
|
|
3092 }
|
|
3093 # regfeat cache
|
|
3094 elsif(ref($cache->{$chr}) eq 'HASH') {
|
|
3095 for(keys %{$cache->{$chr}}) {
|
|
3096 $cache->{$chr}->{$_} = prune_min_max($cache->{$chr}->{$_}, $min, $max);
|
|
3097 $new_count += scalar @{$cache->{$chr}->{$_}};
|
|
3098 }
|
|
3099 }
|
|
3100
|
|
3101 # update loaded regions
|
|
3102 my %have_regions = map {$_ => 1} @{$regions->{$chr}};
|
|
3103
|
|
3104 foreach my $region(keys %{$loaded->{$chr}}) {
|
|
3105 delete $loaded->{$chr}->{$region} unless defined $have_regions{$region};
|
|
3106 }
|
|
3107 }
|
|
3108
|
|
3109 return $new_count;
|
|
3110 }
|
|
3111
|
|
3112 # does the actual pruning
|
|
3113 sub prune_min_max {
|
|
3114 my $array = shift;
|
|
3115 my $min = shift;
|
|
3116 my $max = shift;
|
|
3117
|
|
3118 # splice out features not in area spanned by min/max
|
|
3119 my $i = 0;
|
|
3120 my $f_count = scalar @{$array};
|
|
3121 my @new_cache;
|
|
3122
|
|
3123 while($i < $f_count) {
|
|
3124 my $f = $array->[$i];
|
|
3125
|
|
3126 $i++;
|
|
3127
|
|
3128 if($max - $f->start() > 0 && $f->end - $min > 0) {
|
|
3129 push @new_cache, $f;
|
|
3130 }
|
|
3131
|
|
3132 # do some cleaning for transcripts
|
|
3133 elsif(defined $f->{translation}) {
|
|
3134 delete $f->{translation}->{transcript};
|
|
3135 delete $f->{translation};
|
|
3136 }
|
|
3137 }
|
|
3138
|
|
3139 undef $array;
|
|
3140 return \@new_cache;
|
|
3141 }
|
|
3142
|
|
3143 # get transcripts for slices
|
|
3144 sub cache_transcripts {
|
|
3145 my $config = shift;
|
|
3146 my $include_regions = shift;
|
|
3147
|
|
3148 my $tr_cache;
|
|
3149 my $i;
|
|
3150
|
|
3151 debug("Caching transcripts") unless defined($config->{quiet});
|
|
3152
|
|
3153 foreach my $chr(keys %$include_regions) {
|
|
3154
|
|
3155 my $slice = get_slice($config, $chr);
|
|
3156
|
|
3157 next unless defined $slice;
|
|
3158
|
|
3159 # prefetch some things
|
|
3160 $slice->is_circular;
|
|
3161
|
|
3162 # trim bumf off the slice
|
|
3163 delete $slice->{coord_system}->{adaptor} if defined($config->{write_cache});
|
|
3164
|
|
3165 # no regions?
|
|
3166 if(!scalar @{$include_regions->{$chr}}) {
|
|
3167 my $start = 1;
|
|
3168 my $end = $config->{cache_region_size};
|
|
3169
|
|
3170 while($start < $slice->end) {
|
|
3171 push @{$include_regions->{$chr}}, $start.'-'.$end;
|
|
3172 $start += $config->{cache_region_size};
|
|
3173 $end += $config->{cache_region_size};
|
|
3174 }
|
|
3175 }
|
|
3176
|
|
3177 my $region_count;
|
|
3178
|
|
3179 if(scalar keys %$include_regions == 1) {
|
|
3180 my ($chr) = keys %$include_regions;
|
|
3181 $region_count = scalar @{$include_regions->{$chr}};
|
|
3182 debug("Caching transcripts for chromosome $chr") unless defined($config->{quiet});
|
|
3183 }
|
|
3184
|
|
3185 foreach my $region(@{$include_regions->{$chr}}) {
|
|
3186 progress($config, $i++, $region_count || $config->{region_count});
|
|
3187
|
|
3188 my ($s, $e) = split /\-/, $region;
|
|
3189
|
|
3190 # sanity check start and end
|
|
3191 $s = 1 if $s < 1;
|
|
3192 $e = $slice->end if $e > $slice->end;
|
|
3193
|
|
3194 # get sub-slice
|
|
3195 my $sub_slice = $slice->sub_Slice($s, $e);
|
|
3196
|
|
3197 # for some reason unless seq is called here the sequence becomes Ns later
|
|
3198 $sub_slice->seq;
|
|
3199
|
|
3200 # add transcripts to the cache, via a transfer to the chrom's slice
|
|
3201 if(defined($sub_slice)) {
|
|
3202 foreach my $gene(map {$_->transfer($slice)} @{$sub_slice->get_all_Genes(undef, undef, 1)}) {
|
|
3203 my $gene_stable_id = $gene->stable_id;
|
|
3204 my $canonical_tr_id = $gene->{canonical_transcript_id};
|
|
3205
|
|
3206 my @trs;
|
|
3207
|
|
3208 foreach my $tr(@{$gene->get_all_Transcripts}) {
|
|
3209 $tr->{_gene_stable_id} = $gene_stable_id;
|
|
3210 $tr->{_gene} = $gene;
|
|
3211
|
|
3212 # indicate if canonical
|
|
3213 $tr->{is_canonical} = 1 if defined $canonical_tr_id and $tr->dbID eq $canonical_tr_id;
|
|
3214
|
|
3215 if(defined($config->{prefetch})) {
|
|
3216 prefetch_transcript_data($config, $tr);
|
|
3217 }
|
|
3218
|
|
3219 # CCDS
|
|
3220 elsif(defined($config->{ccds})) {
|
|
3221 my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries};
|
|
3222 $tr->{_ccds} = $entries[0]->display_id if scalar @entries;
|
|
3223 }
|
|
3224
|
|
3225 # strip some unnecessary data from the transcript object
|
|
3226 clean_transcript($tr) if defined($config->{write_cache});
|
|
3227
|
|
3228 push @trs, $tr;
|
|
3229 }
|
|
3230
|
|
3231 # sort the transcripts by translation so we can share sift/polyphen stuff
|
|
3232 # between transcripts and save cache space
|
|
3233 if(defined($config->{write_cache}) && (defined($config->{sift}) || defined($config->{polyphen}))) {
|
|
3234
|
|
3235 my $prev_tr;
|
|
3236
|
|
3237 # sort them by peptide sequence as transcripts with identical peptides
|
|
3238 # will have identical SIFT/PolyPhen prediction strings
|
|
3239 foreach my $tr(sort {$a->{_variation_effect_feature_cache}->{peptide} cmp $b->{_variation_effect_feature_cache}->{peptide}} grep {$_->{_variation_effect_feature_cache}->{peptide}} @trs) {
|
|
3240
|
|
3241 if(
|
|
3242 defined($prev_tr) &&
|
|
3243 $prev_tr->{_variation_effect_feature_cache}->{peptide}
|
|
3244 eq $tr->{_variation_effect_feature_cache}->{peptide}
|
|
3245 ) {
|
|
3246
|
|
3247 foreach my $analysis(qw(sift polyphen)) {
|
|
3248 next unless defined($config->{$analysis});
|
|
3249 $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis} = $prev_tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$analysis};
|
|
3250 }
|
|
3251 }
|
|
3252
|
|
3253 $prev_tr = $tr;
|
|
3254 }
|
|
3255 }
|
|
3256
|
|
3257 # clean the gene
|
|
3258 clean_gene($gene);
|
|
3259
|
|
3260 push @{$tr_cache->{$chr}}, @trs;
|
|
3261 }
|
|
3262 }
|
|
3263 }
|
|
3264 }
|
|
3265
|
|
3266 end_progress($config);
|
|
3267
|
|
3268 return $tr_cache;
|
|
3269 }
|
|
3270
|
|
3271 # gets rid of extra bits of info attached to the transcript that we don't need
|
|
3272 sub clean_transcript {
|
|
3273 my $tr = shift;
|
|
3274
|
|
3275 foreach my $key(qw(display_xref external_db external_display_name external_name external_status created_date status description edits_enabled modified_date dbentries is_current analysis transcript_mapper)) {
|
|
3276 delete $tr->{$key} if defined($tr->{$key});
|
|
3277 }
|
|
3278
|
|
3279 # clean all attributes but miRNA
|
|
3280 if(defined($tr->{attributes})) {
|
|
3281 my @new_atts;
|
|
3282 foreach my $att(@{$tr->{attributes}}) {
|
|
3283 push @new_atts, $att if $att->{code} eq 'miRNA';
|
|
3284 }
|
|
3285 $tr->{attributes} = \@new_atts;
|
|
3286 }
|
|
3287
|
|
3288 # clean the translation
|
|
3289 if(defined($tr->translation)) {
|
|
3290
|
|
3291 # sometimes the translation points to a different transcript?
|
|
3292 $tr->{translation}->{transcript} = $tr;
|
|
3293 weaken($tr->{translation}->{transcript});
|
|
3294
|
|
3295 for my $key(qw(attributes protein_features created_date modified_date)) {
|
|
3296 delete $tr->translation->{$key};
|
|
3297 }
|
|
3298 }
|
|
3299 }
|
|
3300
|
|
3301 # gets rid of extra bits of info attached to genes. At the moment this is almost
|
|
3302 # everything as genes are only used for their locations when looking at
|
|
3303 # structural variations
|
|
3304 sub clean_gene {
|
|
3305 my $gene = shift;
|
|
3306
|
|
3307 # delete almost everything in the gene
|
|
3308 map {delete $gene->{$_}}
|
|
3309 grep {
|
|
3310 $_ ne 'start' &&
|
|
3311 $_ ne 'end' &&
|
|
3312 $_ ne 'strand' &&
|
|
3313 $_ ne 'stable_id'
|
|
3314 }
|
|
3315 keys %{$gene};
|
|
3316 }
|
|
3317
|
|
3318 # build slice cache from transcript cache
|
|
3319 sub build_slice_cache {
|
|
3320 my $config = shift;
|
|
3321 my $tr_cache = shift;
|
|
3322
|
|
3323 my %slice_cache;
|
|
3324
|
|
3325 foreach my $chr(keys %$tr_cache) {
|
|
3326 $slice_cache{$chr} = scalar @{$tr_cache->{$chr}} ? $tr_cache->{$chr}[0]->slice : &get_slice($config, $chr);
|
|
3327
|
|
3328 if(!defined($slice_cache{$chr})) {
|
|
3329 delete $slice_cache{$chr}
|
|
3330 }
|
|
3331
|
|
3332 else {
|
|
3333 # reattach adaptor to the coord system
|
|
3334 $slice_cache{$chr}->{coord_system}->{adaptor} ||= $config->{csa};
|
|
3335 }
|
|
3336 }
|
|
3337
|
|
3338 return \%slice_cache;
|
|
3339 }
|
|
3340
|
|
3341 # pre-fetches per-transcript data
|
|
3342 sub prefetch_transcript_data {
|
|
3343 my $config = shift;
|
|
3344 my $tr = shift;
|
|
3345
|
|
3346 # introns
|
|
3347 my $introns = $tr->get_all_Introns;
|
|
3348
|
|
3349 if(defined($introns)) {
|
|
3350 foreach my $intron(@$introns) {
|
|
3351 foreach my $key(qw(adaptor analysis dbID next prev seqname)) {
|
|
3352 delete $intron->{$key};
|
|
3353 }
|
|
3354 }
|
|
3355 }
|
|
3356
|
|
3357 $tr->{_variation_effect_feature_cache}->{introns} ||= $introns;
|
|
3358
|
|
3359 # translateable_seq, mapper
|
|
3360 $tr->{_variation_effect_feature_cache}->{translateable_seq} ||= $tr->translateable_seq;
|
|
3361 $tr->{_variation_effect_feature_cache}->{mapper} ||= $tr->get_TranscriptMapper;
|
|
3362
|
|
3363 # peptide
|
|
3364 unless ($tr->{_variation_effect_feature_cache}->{peptide}) {
|
|
3365 my $translation = $tr->translate;
|
|
3366 $tr->{_variation_effect_feature_cache}->{peptide} = $translation ? $translation->seq : undef;
|
|
3367 }
|
|
3368
|
|
3369 # protein features
|
|
3370 if(defined($config->{domains}) || defined($config->{write_cache})) {
|
|
3371 my $pfs = $tr->translation ? $tr->translation->get_all_ProteinFeatures : [];
|
|
3372
|
|
3373 # clean them to save cache space
|
|
3374 foreach my $pf(@$pfs) {
|
|
3375
|
|
3376 # remove everything but the coord, analysis and ID fields
|
|
3377 foreach my $key(keys %$pf) {
|
|
3378 delete $pf->{$key} unless
|
|
3379 $key eq 'start' ||
|
|
3380 $key eq 'end' ||
|
|
3381 $key eq 'analysis' ||
|
|
3382 $key eq 'hseqname';
|
|
3383 }
|
|
3384
|
|
3385 # remove everything from the analysis but the display label
|
|
3386 foreach my $key(keys %{$pf->{analysis}}) {
|
|
3387 delete $pf->{analysis}->{$key} unless $key eq '_display_label';
|
|
3388 }
|
|
3389 }
|
|
3390
|
|
3391 $tr->{_variation_effect_feature_cache}->{protein_features} = $pfs;
|
|
3392 }
|
|
3393
|
|
3394 # codon table
|
|
3395 unless ($tr->{_variation_effect_feature_cache}->{codon_table}) {
|
|
3396 # for mitochondrial DNA we need to use a different codon table
|
|
3397 my $attrib = $tr->slice->get_all_Attributes('codon_table')->[0];
|
|
3398
|
|
3399 $tr->{_variation_effect_feature_cache}->{codon_table} = $attrib ? $attrib->value : 1;
|
|
3400 }
|
|
3401
|
|
3402 # sift/polyphen
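# (predictions are fetched by the md5 of the peptide sequence, so transcripts with
#  identical translations retrieve identical prediction sets; cache_transcripts above
#  additionally shares them between such transcripts to save cache space)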
|
|
3403 if(defined($config->{pfpma}) && defined($tr->{_variation_effect_feature_cache}->{peptide})) {
|
|
3404 foreach my $analysis(qw(sift polyphen)) {
|
|
3405 next unless defined($config->{$analysis});
|
|
3406 my $a = $analysis;
|
|
3407 $a .= '_humvar' if $a eq 'polyphen';
|
|
3408 $tr->{_variation_effect_feature_cache}->{protein_function_predictions}->{$a} ||= $config->{pfpma}->fetch_by_analysis_translation_md5($a, md5_hex($tr->{_variation_effect_feature_cache}->{peptide}))
|
|
3409 }
|
|
3410 }
|
|
3411
|
|
3412 # gene
|
|
3413 $tr->{_gene} ||= $config->{ga}->fetch_by_transcript_stable_id($tr->stable_id);
|
|
3414
|
|
3415 # gene HGNC
|
|
3416 if(defined $config->{hgnc}) {
|
|
3417
|
|
3418 # get from gene cache if found already
|
|
3419 if(defined($tr->{_gene}->{_hgnc})) {
|
|
3420 $tr->{_gene_hgnc} = $tr->{_gene}->{_hgnc};
|
|
3421 }
|
|
3422 else {
|
|
3423 my @entries = grep {$_->database eq 'HGNC'} @{$tr->{_gene}->get_all_DBEntries()};
|
|
3424 if(scalar @entries) {
|
|
3425 $tr->{_gene_hgnc} = $entries[0]->display_id;
|
|
3426 }
|
|
3427
|
|
3428 $tr->{_gene_hgnc} ||= '-';
|
|
3429
|
|
3430 # cache it on the gene object too
|
|
3431 $tr->{_gene}->{_hgnc} = $tr->{_gene_hgnc};
|
|
3432 }
|
|
3433 }
|
|
3434
|
|
3435 # CCDS
|
|
3436 my @entries = grep {$_->database eq 'CCDS'} @{$tr->get_all_DBEntries};
|
|
3437 $tr->{_ccds} = $entries[0]->display_id if scalar @entries;
|
|
3438 $tr->{_ccds} ||= '-';
|
|
3439
|
|
3440 # refseq
|
|
3441 @entries = grep {$_->database eq 'RefSeq_mRNA'} @{$tr->get_all_DBEntries};
|
|
3442 if(scalar @entries) {
|
|
3443 $tr->{_refseq} = join ",", map {$_->display_id} @entries;
|
|
3444 }
|
|
3445 else {
|
|
3446 $tr->{_refseq} = '-';
|
|
3447 }
|
|
3448
|
|
3449 # protein stable ID
|
|
3450 $tr->{_protein} = $tr->translation ? $tr->translation->stable_id : '-';
|
|
3451
|
|
3452 return $tr;
|
|
3453 }
|
|
3454
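# builds the on-disk path of a cache file, e.g. chromosome "12", region
# "25000001-26000000" and type "reg" give "<dir>/12/25000001-26000000_reg.gz"
# (or "..._reg_tabix.gz" when tabix is enabled); the values here are illustrative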
|
|
3455 sub get_dump_file_name {
|
|
3456 my $config = shift;
|
|
3457 my $chr = shift;
|
|
3458 my $region = shift;
|
|
3459 my $type = shift;
|
|
3460
|
|
3461 $type ||= 'transcript';
|
|
3462
|
|
3463 if($type eq 'transcript') {
|
|
3464 $type = '';
|
|
3465 }
|
|
3466 else {
|
|
3467 $type = '_'.$type;
|
|
3468 }
|
|
3469
|
|
3470 #my ($s, $e) = split /\-/, $region;
|
|
3471 #my $subdir = int($s / 1e6);
|
|
3472 #
|
|
3473 #my $dir = $config->{dir}.'/'.$chr.'/'.$subdir;
|
|
3474
|
|
3475 my $dir = $config->{dir}.'/'.$chr;
|
|
3476 my $dump_file = $dir.'/'.$region.$type.(defined($config->{tabix}) ? '_tabix' : '').'.gz';
|
|
3477
|
|
3478 # make directory if it doesn't exist
|
|
3479 if(!(-e $dir) && defined($config->{write_cache})) {
|
|
3480 make_path($dir);
|
|
3481 }
|
|
3482
|
|
3483 return $dump_file;
|
|
3484 }

# dumps out transcript cache to file
sub dump_transcript_cache {
    my $config = shift;
    my $tr_cache = shift;
    my $chr = shift;
    my $region = shift;
    
    debug("Dumping cached transcript data") unless defined($config->{quiet});
    
    # clean the slice adaptor before storing
    clean_slice_adaptor($config);
    
    strip_transcript_cache($config, $tr_cache);
    
    $config->{reg}->disconnect_all;
    delete $config->{sa}->{dbc}->{_sql_helper};
    
    my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript');
    
    debug("Writing to $dump_file") unless defined($config->{quiet});
    
    # storable
    open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
    nstore_fd($tr_cache, $fh);
    close $fh;
}

#sub dump_transcript_cache_tabix {
#    my $config = shift;
#    my $tr_cache = shift;
#    my $chr = shift;
#    my $region = shift;
#    
#    debug("Dumping cached transcript data") unless defined($config->{quiet});
#    
#    # clean the slice adaptor before storing
#    clean_slice_adaptor($config);
#    
#    strip_transcript_cache($config, $tr_cache);
#    
#    $config->{reg}->disconnect_all;
#    
#    my $dir = $config->{dir}.'/'.$chr;
#    my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz';
#    
#    # make directory if it doesn't exist
#    if(!(-e $dir)) {
#        make_path($dir);
#    }
#    
#    debug("Writing to $dump_file") unless defined($config->{quiet});
#    
#    use Storable qw(nfreeze);
#    use MIME::Base64 qw(encode_base64);
#    #open NEW, "| bgzip -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
#    #
#    #foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) {
#    #    print NEW join "\t", (
#    #        $chr,
#    #        $tr->start,
#    #        $tr->end,
#    #        encode_base64(freeze($tr), "")
#    #    );
#    #    print NEW "\n";
#    #}
#    #close NEW;
#    #
#    ## tabix it
#    #my $output = `tabix -s 1 -b 2 -e 3 -f $dump_file 2>&1`;
#    #die("ERROR: Failed during tabix indexing\n$output\n") if $output;
#    open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
#    
#    foreach my $tr(sort {$a->start <=> $b->start} @{$tr_cache->{$chr}}) {
#        print NEW join "\t", (
#            $chr,
#            $tr->start,
#            $tr->end,
#            encode_base64(freeze($tr), "")
#        );
#        print NEW "\n";
#    }
#    close NEW;
#}

# loads in dumped transcript cache to memory
sub load_dumped_transcript_cache {
    my $config = shift;
    my $chr = shift;
    my $region = shift;
    
    my $dump_file = get_dump_file_name($config, $chr, $region, 'transcript');
    
    return undef unless -e $dump_file;
    
    debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
    
    open my $fh, $config->{compress}." ".$dump_file." |" or return undef;
    my $tr_cache;
    $tr_cache = fd_retrieve($fh);
    close $fh;
    
    # reattach adaptors
    foreach my $t(@{$tr_cache->{$chr}}) {
        if(defined($t->{translation})) {
            $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor};
            $t->{translation}->{transcript} = $t;
            weaken($t->{translation}->{transcript});
        }
        
        $t->{slice}->{adaptor} = $config->{sa};
    }
    
    return $tr_cache;
}
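
# A minimal round-trip sketch (assumes $config is set up as in build_full_cache;
# the chromosome and region values are placeholders):
#   dump_transcript_cache($config, $tr_cache, '21', '25000001-26000000');
#   my $reloaded = load_dumped_transcript_cache($config, '21', '25000001-26000000');
# The dump is a gzipped Storable image; loading re-attaches slice and
# translation adaptors from the current $config to the thawed transcripts.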

#sub load_dumped_transcript_cache_tabix {
#    my $config = shift;
#    my $chr = shift;
#    my $region = shift;
#    my $trim_regions = shift;
#    
#    my $dir = $config->{dir}.'/'.$chr;
#    my $dump_file = $dir.'/'.($region || "dump").'_tabix.gz';
#    
#    #print STDERR "Reading from $dump_file\n";
#    
#    return undef unless -e $dump_file;
#    
#    debug("Reading cached transcript data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
#    
#    my $tr_cache;
#    
#    use MIME::Base64 qw(decode_base64);
#    use Storable qw(thaw);
#    
#    my ($s, $e) = split /\-/, $region;
#    my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}};
#    my $regions = "";
#    $regions .= " $chr\:$_" for @regions;
#    
#    #print STDERR "tabix $dump_file $regions |\n";
#    #open IN, "tabix $dump_file $regions |";
#    open IN, "gzip -dc $dump_file |";
#    while(<IN>) {
#    
#        #$DB::single = 1;
#        my ($chr, $start, $end, $blob) = split /\t/, $_;
#        next unless grep {overlap($start, $end, (split /\-/, $_))} @regions;
#        my $tr = thaw(decode_base64($blob));
#        push @{$tr_cache->{$chr}}, $tr;
#    }
#    close IN;
#    
#    # reattach adaptors
#    foreach my $t(@{$tr_cache->{$chr}}) {
#        if(defined($t->{translation})) {
#            $t->{translation}->{adaptor} = $config->{tra} if defined $t->{translation}->{adaptor};
#            $t->{translation}->{transcript} = $t;
#            weaken($t->{translation}->{transcript});
#        }
#        
#        $t->{slice}->{adaptor} = $config->{sa};
#    }
#    
#    # add empty array ref so code doesn't try and fetch from DB too
#    $tr_cache->{$chr} ||= [];
#    
#    return $tr_cache;
#}

# strips cache before writing to disk
sub strip_transcript_cache {
    my $config = shift;
    my $cache = shift;
    
    foreach my $chr(keys %$cache) {
        foreach my $tr(@{$cache->{$chr}}) {
            foreach my $exon(@{$tr->{_trans_exon_array}}) {
                delete $exon->{slice}->{adaptor};
                
                for(qw(adaptor created_date modified_date is_current version is_constitutive _seq_cache dbID slice)) {
                    delete $exon->{$_};
                }
            }
            
            delete $tr->{adaptor};
            delete $tr->{slice}->{adaptor};
        }
    }
}

# cleans slice adaptor before storing in cache
sub clean_slice_adaptor {
    my $config = shift;
    
    # clean some stuff off the slice adaptor
    delete $config->{sa}->{asm_exc_cache};
    $config->{sa}->{sr_name_cache} = {};
    $config->{sa}->{sr_id_cache} = {};
    delete $config->{sa}->{db}->{seq_region_cache};
    delete $config->{sa}->{db}->{name_cache};
}


# dump adaptors to cache
sub dump_adaptor_cache {
    my $config = shift;
    
    $config->{reg}->disconnect_all;
    delete $config->{sa}->{dbc}->{_sql_helper};
    
    my $dir = $config->{dir};
    my $dump_file = $dir.'/adaptors.gz';
    
    # make directory if it doesn't exist
    if(!(-e $dir)) {
        make_path($dir);
    }
    
    open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
    nstore_fd($config, $fh);
    close $fh;
}

# load dumped adaptors
sub load_dumped_adaptor_cache {
    my $config = shift;
    
    my $dir = $config->{dir};
    my $dump_file = $dir.'/adaptors.gz';
    
    return undef unless -e $dump_file;
    
    debug("Reading cached adaptor data") unless defined($config->{quiet});
    
    open my $fh, $config->{compress}." ".$dump_file." |" or return undef;
    my $cached_config;
    $cached_config = fd_retrieve($fh);
    close $fh;
    
    $config->{$_} = $cached_config->{$_} for qw(sa ga ta vfa svfa tva pfpma mca csa RegulatoryFeature_adaptor MotifFeature_adaptor);
    
    return 1;
}
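
# Sketch of the intended pairing (the file path is fixed by $config->{dir}):
#   dump_adaptor_cache($config);         # writes <dir>/adaptors.gz via Storable
#   load_dumped_adaptor_cache($config);  # restores sa, ga, ta, vfa, svfa, tva,
#                                        # pfpma, mca, csa and the
#                                        # RegulatoryFeature/MotifFeature adaptors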

# dumps cached variations to disk
sub dump_variation_cache {
    my $config = shift;
    my $v_cache = shift;
    my $chr = shift;
    my $region = shift;
    
    my $dump_file = get_dump_file_name($config, $chr, $region, 'var');
    
    open DUMP, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to variation dump file $dump_file";
    
    foreach my $pos(keys %{$v_cache->{$chr}}) {
        foreach my $v(@{$v_cache->{$chr}->{$pos}}) {
            my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = @$v;
            
            print DUMP join " ", (
                $name,
                $failed == 0 ? '' : $failed,
                $start,
                $end == $start ? '' : $end,
                $as,
                $strand == 1 ? '' : $strand,
                $ma || '',
                defined($maf) ? sprintf("%.2f", $maf) : '',
            );
            print DUMP "\n";
        }
    }
    
    close DUMP;
}
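
# Each cached variant becomes one space-separated line with the fields
#   name failed start end allele_string strand minor_allele maf
# where "default" values (failed = 0, end = start, strand = 1, missing
# minor allele or MAF) are written as empty strings to keep the files small;
# load_dumped_variation_cache below restores those defaults when reading.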

# loads dumped variation cache
sub load_dumped_variation_cache {
    my $config = shift;
    my $chr = shift;
    my $region = shift;
    
    my $dump_file = get_dump_file_name($config, $chr, $region, 'var');
    
    return undef unless -e $dump_file;
    
    open DUMP, $config->{compress}." ".$dump_file." |" or return undef;
    
    my $v_cache;
    
    while(<DUMP>) {
        chomp;
        my ($name, $failed, $start, $end, $as, $strand, $ma, $maf) = split / /, $_;
        $failed ||= 0;
        $end ||= $start;
        $strand ||= 1;
        $ma ||= undef;
        $maf ||= undef;
        
        my @v = ($name, $failed, $start, $end, $as, $strand, $ma, $maf);
        push @{$v_cache->{$chr}->{$start}}, \@v;
    }
    
    close DUMP;
    
    return $v_cache;
}

# caches regulatory features
sub cache_reg_feats {
    my $config = shift;
    my $include_regions = shift;
    
    my $rf_cache;
    my $i;
    
    debug("Caching regulatory features") unless defined($config->{quiet});
    
    foreach my $chr(keys %$include_regions) {
        
        my $slice = get_slice($config, $chr);
        
        next unless defined $slice;
        
        # prefetch some things
        $slice->is_circular;
        
        # no regions?
        if(!scalar @{$include_regions->{$chr}}) {
            my $start = 1;
            my $end = $config->{cache_region_size};
            
            while($start < $slice->end) {
                push @{$include_regions->{$chr}}, $start.'-'.$end;
                $start += $config->{cache_region_size};
                $end += $config->{cache_region_size};
            }
        }
        
        my $region_count;
        
        if(scalar keys %$include_regions == 1) {
            my ($chr) = keys %$include_regions;
            $region_count = scalar @{$include_regions->{$chr}};
            debug("Caching regulatory features for chromosome $chr") unless defined($config->{quiet});
        }
        
        foreach my $region(@{$include_regions->{$chr}}) {
            progress($config, $i++, $region_count || $config->{region_count});
            
            my ($s, $e) = split /\-/, $region;
            
            # sanity check start and end
            $s = 1 if $s < 1;
            $e = $slice->end if $e > $slice->end;
            
            # get sub-slice
            my $sub_slice = $slice->sub_Slice($s, $e);
            next unless defined($sub_slice);
            
            $sub_slice->{coord_system}->{adaptor} = $config->{csa};
            
            foreach my $type(@REG_FEAT_TYPES) {
                my $features = $config->{$type.'_adaptor'}->fetch_all_by_Slice($sub_slice);
                next unless defined($features);
                
                # cell types
                if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
                    foreach my $rf(@$features) {
                        
                        my %cl;
                        
                        # get cell type by fetching all from stable ID
                        if($type eq 'RegulatoryFeature') {
                            %cl = map {
                                $_->feature_set->cell_type->name => $_->feature_type->name
                            } @{$rf->adaptor->fetch_all_by_stable_ID($rf->stable_id)};
                        }
                        
                        # get cell type by fetching regfeats that contain this MotifFeature
                        elsif($type eq 'MotifFeature') {
                            %cl = map {
                                $_->feature_set->cell_type->name => $_->feature_type->name
                            } @{$config->{'RegulatoryFeature_adaptor'}->fetch_all_by_attribute_feature($rf)};
                        }
                        
                        $rf->{cell_types} = \%cl;
                    }
                }
                
                push @{$rf_cache->{$chr}->{$type}},
                    map { clean_reg_feat($_) }
                    map { $_->transfer($slice) }
                    @{$features};
            }
        }
    }
    
    end_progress($config);
    
    return $rf_cache;
}
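
# The returned structure is keyed by chromosome and then by feature type, e.g.
#   $rf_cache->{$chr}->{'RegulatoryFeature'} = [ ...cleaned features... ];
#   $rf_cache->{$chr}->{'MotifFeature'}      = [ ...cleaned features... ];
# (the actual type names come from @REG_FEAT_TYPES); each feature has been
# passed through clean_reg_feat() below so it can be serialised safely.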


# cleans reg feats for caching
sub clean_reg_feat {
    my $rf = shift;
    
    foreach my $key(qw/adaptor binary_string bound_start bound_end attribute_cache feature_type feature_set analysis/) {
        delete $rf->{$key};
    }
    
    if(defined($rf->{binding_matrix})) {
        $rf->{_variation_effect_feature_cache}->{seq} = $rf->seq;
        
        foreach my $key(qw/adaptor feature_type analysis dbID/) {
            delete $rf->{binding_matrix}->{$key};
        }
    }
    
    return $rf;
}


# dumps out reg feat cache to file
sub dump_reg_feat_cache {
    my $config = shift;
    my $rf_cache = shift;
    my $chr = shift;
    my $region = shift;
    
    debug("Dumping cached reg feat data for $chr:$region") unless defined($config->{quiet});
    
    # clean the slice adaptor before storing
    clean_slice_adaptor($config);
    
    $config->{reg}->disconnect_all;
    delete $config->{sa}->{dbc}->{_sql_helper};
    
    foreach my $chr(keys %{$rf_cache}) {
        foreach my $type(keys %{$rf_cache->{$chr}}) {
            delete $_->{slice}->{coord_system}->{adaptor} for @{$rf_cache->{$chr}->{$type}};
        }
    }
    
    my $dump_file = get_dump_file_name($config, $chr, $region, 'reg');
    
    debug("Writing to $dump_file") unless defined($config->{quiet});
    
    # storable
    open my $fh, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
    nstore_fd($rf_cache, $fh);
    close $fh;
}

#sub dump_reg_feat_cache_tabix {
#    my $config = shift;
#    my $rf_cache = shift;
#    my $chr = shift;
#    my $region = shift;
#    
#    debug("Dumping cached reg feat data") unless defined($config->{quiet});
#    
#    # clean the slice adaptor before storing
#    clean_slice_adaptor($config);
#    
#    $config->{reg}->disconnect_all;
#    delete $config->{sa}->{dbc}->{_sql_helper};
#    
#    $config->{reg}->disconnect_all;
#    
#    my $dump_file = get_dump_file_name($config, $chr, $region, 'reg');
#    
#    debug("Writing to $dump_file") unless defined($config->{quiet});
#    
#    use Storable qw(nfreeze);
#    use MIME::Base64 qw(encode_base64);
#    open NEW, "| gzip -9 -c > ".$dump_file or die "ERROR: Could not write to dump file $dump_file";
#    
#    foreach my $type(keys %{$rf_cache->{$chr}}) {
#        foreach my $rf(sort {$a->start <=> $b->start} @{$rf_cache->{$chr}->{$type}}) {
#            print NEW join "\t", (
#                $chr,
#                $rf->start,
#                $rf->end,
#                $type,
#                encode_base64(freeze($rf), "")
#            );
#            print NEW "\n";
#        }
#    }
#    close NEW;
#}

# loads dumped reg feat cache into memory
sub load_dumped_reg_feat_cache {
    my $config = shift;
    my $chr = shift;
    my $region = shift;
    
    my $dump_file = get_dump_file_name($config, $chr, $region, 'reg');
    
    return undef unless -e $dump_file;
    
    debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
    
    open my $fh, $config->{compress}." ".$dump_file." |" or return undef;
    my $rf_cache;
    $rf_cache = fd_retrieve($fh);
    close $fh;
    
    return $rf_cache;
}



#sub load_dumped_reg_feat_cache_tabix {
#    my $config = shift;
#    my $chr = shift;
#    my $region = shift;
#    my $trim_regions = shift;
#    
#    my $dump_file = get_dump_file_name($config, $chr, $region, 'reg');
#    
#    #print STDERR "Reading from $dump_file\n";
#    
#    return undef unless -e $dump_file;
#    
#    debug("Reading cached reg feat data for chromosome $chr".(defined $region ? "\:$region" : "")." from dumped file") unless defined($config->{quiet});
#    
#    my $rf_cache;
#    
#    use MIME::Base64 qw(decode_base64);
#    use Storable qw(thaw);
#    
#    my ($s, $e) = split /\-/, $region;
#    my @regions = grep {overlap($s, $e, (split /\-/, $_))} @{$trim_regions->{$chr}};
#    my $regions = "";
#    $regions .= " $chr\:$_" for @regions;
#    
#    #print STDERR "tabix $dump_file $regions |\n";
#    #open IN, "tabix $dump_file $regions |";
#    open IN, "gzip -dc $dump_file |";
#    while(<IN>) {
#        my ($chr, $start, $end, $type, $blob) = split /\t/, $_;
#        next unless grep {overlap($start, $end, (split /\-/, $_))} @regions;
#        my $rf = thaw(decode_base64($blob));
#        push @{$rf_cache->{$chr}->{$type}}, $rf;
#    }
#    close IN;
#    
#    $rf_cache->{$chr}->{$_} ||= [] for @REG_FEAT_TYPES;
#    
#    return $rf_cache;
#}


# get custom annotation for a region
sub cache_custom_annotation {
    my $config = shift;
    my $include_regions = shift;
    my $chr = shift;
    
    #$include_regions = merge_regions($include_regions, $config, 1);
    
    my $annotation = {};
    
    my $total = scalar @{$config->{custom}} * scalar @{$include_regions->{$chr}};
    my $counter = 0;
    
    my $max_regions_per_tabix = 1000;
    
    debug("Caching custom annotations") unless defined($config->{quiet});
    
    foreach my $custom(@{$config->{custom}}) {
        
        my @regions = @{$include_regions->{$chr}};
        
        while(scalar @regions) {
            my $got_features = 0;
            
            my @tmp_regions = splice @regions, 0, $max_regions_per_tabix;
            
            progress($config, $counter, $total);
            $counter += scalar @tmp_regions;
            
            # some files may have e.g. chr10 instead of 10
            for my $tmp_chr($chr, 'chr'.$chr) {
                
                # bigwig needs to use bigWigToWig utility
                if($custom->{format} eq 'bigwig') {
                    foreach my $region(@tmp_regions) {
                        my ($s, $e) = split /\-/, $region;
                        my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$.'_'.$tmp_chr.'_'.$s.'_'.$e;
                        my $bigwig_file = $custom->{file};
                        my $bigwig_output = `bigWigToWig -chrom=$tmp_chr -start=$s -end=$e $bigwig_file $tmp_file 2>&1`;
                        
                        die "\nERROR: Problem using bigwig file $bigwig_file\n$bigwig_output" if $bigwig_output;
                    }
                    
                    # concatenate all the files together
                    my $string = $config->{tmpdir}.'/vep_tmp_'.$$.'_*';
                    my $tmp_file = $config->{tmpdir}.'/vep_tmp_'.$$;
                    `cat $string > $tmp_file; rm $string`;
                    open CUSTOM, $tmp_file
                        or die "\nERROR: Could not read from temporary WIG file $tmp_file\n";
                }
                
                # otherwise use tabix
                else {
                    # tabix can fetch multiple regions, so construct a string
                    my $region_string = join " ", map {$tmp_chr.':'.$_} @tmp_regions;
                    
                    open CUSTOM, "tabix ".$custom->{file}." $region_string 2>&1 |"
                        or die "\nERROR: Could not open tabix pipe for ".$custom->{file}."\n";
                }
                
                # set an error flag so we don't have to check every line
                my $error_flag = 1;
                
                while(<CUSTOM>) {
                    chomp;
                    
                    # check for errors
                    if($error_flag) {
                        die "\nERROR: Problem using annotation file ".$custom->{file}."\n$_\n" if /invalid pointer|tabix|get_intv/;
                        $error_flag = 0;
                    }
                    
                    my @data = split /\t/, $_;
                    
                    my $feature;
                    
                    if($custom->{format} eq 'bed') {
                        $feature = {
                            chr => $chr,
                            start => $data[1],
                            end => $data[2],
                            name => $data[3],
                        };
                    }
                    
                    elsif($custom->{format} eq 'vcf') {
                        my $tmp_vf = parse_vcf($config, $_)->[0];
                        
                        $feature = {
                            chr => $chr,
                            start => $tmp_vf->{start},
                            end => $tmp_vf->{end},
                            name => $tmp_vf->{variation_name},
                        };
                    }
                    
                    elsif($custom->{format} eq 'gff' || $custom->{format} eq 'gtf') {
                        
                        my $name;
                        
                        # try and get a feature name from the attributes column
                        foreach my $attrib(split /\s*\;\s*/, $data[8]) {
                            my ($key, $value) = split /\=/, $attrib;
                            $name = $value if $key eq 'ID';
                        }
                        
                        $name ||= $data[2]."_".$data[0].":".$data[3]."-".$data[4];
                        
                        $feature = {
                            chr => $chr,
                            start => $data[3],
                            end => $data[4],
                            name => $name,
                        };
                    }
                    
                    elsif($custom->{format} eq 'bigwig') {
                        $feature = {
                            chr => $chr,
                            start => $data[0],
                            end => $data[0],
                            name => $data[1],
                        };
                    }
                    
                    if(defined($feature)) {
                        $got_features = 1;
                        
                        if(!defined($feature->{name}) || $custom->{coords}) {
                            $feature->{name} = $feature->{chr}.":".$feature->{start}."-".$feature->{end};
                        }
                        
                        # add the feature to the cache
                        $annotation->{$chr}->{$custom->{name}}->{$feature->{start}}->{$feature->{name}} = $feature;
                    }
                }
                close CUSTOM;
                
                # unlink temporary wig files
                unlink($config->{tmpdir}.'/vep_tmp_'.$$) if $custom->{format} eq 'bigwig';
                
                # no need to fetch e.g. "chr21" features if just "21" worked
                last if $got_features;
            }
        }
    }
    
    end_progress($config);
    
    return $annotation;
}
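
# The returned hash is nested as
#   $annotation->{$chr}->{ $custom->{name} }->{ $feature_start }->{ $feature_name } = $feature;
# with each $feature holding chr/start/end/name as parsed above. Formats other
# than bed/vcf/gff/gtf/bigwig are silently skipped because $feature stays undef.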

# builds a full cache for this species
sub build_full_cache {
    my $config = shift;
    
    my @slices;
    
    if($config->{build} =~ /all/i) {
        @slices = @{$config->{sa}->fetch_all('toplevel')};
        push @slices, @{$config->{sa}->fetch_all('lrg', undef, 1, undef, 1)} if defined($config->{lrg});
    }
    else {
        foreach my $val(split /\,/, $config->{build}) {
            my @nnn = split /\-/, $val;
            
            foreach my $chr($nnn[0]..$nnn[-1]) {
                my $slice = get_slice($config, $chr);
                push @slices, $slice if defined($slice);
            }
        }
    }
    
    foreach my $slice(@slices) {
        my $chr = $slice->seq_region_name;
        
        # check for features; we don't want a load of effectively empty dirs
        my $dbc = $config->{sa}->db->dbc;
        my $sth = $dbc->prepare("SELECT COUNT(*) FROM transcript WHERE seq_region_id = ?");
        $sth->execute($slice->get_seq_region_id);
        
        my $count;
        $sth->bind_columns(\$count);
        $sth->fetch;
        $sth->finish;
        
        next unless $count > 0;
        
        my $regions;
        
        # for progress
        my $region_count = int($slice->end / $config->{cache_region_size}) + 1;
        my $counter = 0;
        
        # initial region
        my ($start, $end) = (1, $config->{cache_region_size});
        
        debug((defined($config->{rebuild}) ? "Rebuild" : "Creat")."ing cache for chromosome $chr") unless defined($config->{quiet});
        
        while($start < $slice->end) {
            
            progress($config, $counter++, $region_count);
            
            # store quiet status
            my $quiet = $config->{quiet};
            $config->{quiet} = 1;
            
            # spoof regions
            $regions->{$chr} = [$start.'-'.$end];
            
            # store transcripts
            my $tmp_cache = (defined($config->{rebuild}) ? load_dumped_transcript_cache($config, $chr, $start.'-'.$end) : cache_transcripts($config, $regions));
            $tmp_cache->{$chr} ||= [];
            
            #(defined($config->{tabix}) ? dump_transcript_cache_tabix($config, $tmp_cache, $chr, $start.'-'.$end) : dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end));
            dump_transcript_cache($config, $tmp_cache, $chr, $start.'-'.$end);
            undef $tmp_cache;
            
            # store reg feats
            if(defined($config->{regulatory})) {
                my $rf_cache = cache_reg_feats($config, $regions);
                $rf_cache->{$chr} ||= {};
                
                dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end);
                #(defined($config->{tabix}) ? dump_reg_feat_cache_tabix($config, $rf_cache, $chr, $start.'-'.$end) : dump_reg_feat_cache($config, $rf_cache, $chr, $start.'-'.$end));
                undef $rf_cache;
                
                # this gets cleaned off but needs to be there for the next loop
                $slice->{coord_system}->{adaptor} = $config->{csa};
            }
            
            # store variations
            my $variation_cache;
            $variation_cache->{$chr} = get_variations_in_region($config, $chr, $start.'-'.$end);
            $variation_cache->{$chr} ||= {};
            
            dump_variation_cache($config, $variation_cache, $chr, $start.'-'.$end);
            undef $variation_cache;
            
            # restore quiet status
            $config->{quiet} = $quiet;
            
            # increment by cache_region_size to get next region
            $start += $config->{cache_region_size};
            $end += $config->{cache_region_size};
        }
        
        end_progress($config);
        
        undef $regions;
    }
    
    write_cache_info($config);
}
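
# Rough sketch of the on-disk layout this produces (directory, chromosome and
# region values are illustrative; the region size comes from
# $config->{cache_region_size}, and a '_tabix' infix is added when
# $config->{tabix} is set):
#   <dir>/info.txt
#   <dir>/<chr>/1-1000000.gz         transcript cache (gzipped Storable)
#   <dir>/<chr>/1-1000000_reg.gz     reg feat cache (only when $config->{regulatory} is set)
#   <dir>/<chr>/1-1000000_var.gz     variation cache (gzipped plain text)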

# write an info file that defines what is in the cache
sub write_cache_info {
    my $config = shift;
    
    my $info_file = $config->{dir}.'/info.txt';
    
    open OUT, ">>$info_file" or die "ERROR: Could not write to cache info file $info_file\n";
    
    print OUT "# CACHE UPDATED ".get_time()."\n";
    
    foreach my $param(qw(
        host
        port
        user
        build
        regulatory
        sift
        polyphen
    )) {
        print OUT "$param\t".(defined $config->{$param} ? $config->{$param} : '-')."\n";
    }
    
    # cell types
    if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
        my $cta = $config->{RegulatoryFeature_adaptor}->db->get_CellTypeAdaptor();
        print OUT "cell_types\t".(join ",", map {$_->name} @{$cta->fetch_all});
        print OUT "\n";
    }
    
    close OUT;
}

# reads in cache info file
sub read_cache_info {
    my $config = shift;
    
    my $info_file = $config->{dir}.'/info.txt';
    
    open IN, $info_file or return 0;
    
    while(<IN>) {
        next if /^#/;
        chomp;
        my ($param, $value) = split /\t/;
        $config->{'cache_'.$param} = $value unless defined $value && $value eq '-';
    }
    
    close IN;
    
    return 1;
}
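
# info.txt is a simple tab-separated "param<TAB>value" file, with '-' meaning
# unset. As an illustration of the round trip only: a line written by
# write_cache_info such as "build\t21,22" would be read back by
# read_cache_info as $config->{cache_build} = '21,22', while "-" values and
# "# CACHE UPDATED ..." comment lines are ignored on reading.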

# format coords for printing
sub format_coords {
    my ($start, $end) = @_;
    
    if(defined($start)) {
        if(defined($end)) {
            if($start > $end) {
                return $end.'-'.$start;
            }
            elsif($start == $end) {
                return $start;
            }
            else {
                return $start.'-'.$end;
            }
        }
        else {
            return $start.'-?';
        }
    }
    elsif(defined($end)) {
        return '?-'.$end;
    }
    else {
        return '-';
    }
}
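
# Worked examples (derived from the branches above):
#   format_coords(100, 200)   -> '100-200'
#   format_coords(200, 100)   -> '100-200'   (coords are swapped into order)
#   format_coords(150, 150)   -> '150'
#   format_coords(100, undef) -> '100-?'
#   format_coords(undef, 200) -> '?-200'
#   format_coords(undef)      -> '-'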


# METHODS TO FIND CO-LOCATED / EXISTING VARIATIONS
##################################################

# finds an existing VF in the db
sub find_existing {
    my $config = shift;
    my $new_vf = shift;
    
    if(defined($config->{vfa}->db)) {
        
        my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL';
        
        my $sth = $config->{vfa}->db->dbc->prepare(qq{
            SELECT variation_name, IF(fv.variation_id IS NULL, 0, 1), seq_region_start, seq_region_end, allele_string, seq_region_strand, $maf_cols
            FROM variation_feature vf LEFT JOIN failed_variation fv
            ON vf.variation_id = fv.variation_id
            WHERE vf.seq_region_id = ?
            AND vf.seq_region_start = ?
            AND vf.seq_region_end = ?
            ORDER BY vf.source_id ASC
        });
        
        $sth->execute($new_vf->slice->get_seq_region_id, $new_vf->start, $new_vf->end);
        
        my @v;
        for my $i(0..7) {
            $v[$i] = undef;
        }
        
        $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]);
        
        my @found;
        
        while($sth->fetch) {
            push @found, $v[0] unless is_var_novel($config, \@v, $new_vf) || $v[1] > $config->{failed};
        }
        
        $sth->finish();
        
        return (scalar @found ? join ",", @found : undef);
    }
    
    return undef;
}
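
# Both find_existing and get_variations_in_region work with 8-element records:
#   [ name, failed_flag, start, end, allele_string, strand, minor_allele, minor_allele_freq ]
# (the last two come back as NULL/undef on databases without the MAF columns,
# see have_maf_cols below); is_var_novel compares a new VariationFeature
# against such a record.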

# compare a new vf to one from the cache / DB
sub is_var_novel {
    my $config = shift;
    my $existing_var = shift;
    my $new_var = shift;
    
    my $is_novel = 1;
    
    $is_novel = 0 if $existing_var->[2] == $new_var->start && $existing_var->[3] == $new_var->end;
    
    if(defined($config->{check_alleles})) {
        my %existing_alleles;
        
        $existing_alleles{$_} = 1 for split /\//, $existing_var->[4];
        
        my $seen_new = 0;
        foreach my $a(split /\//, ($new_var->allele_string || "")) {
            reverse_comp(\$a) if $new_var->strand ne $existing_var->[5];
            $seen_new = 1 unless defined $existing_alleles{$a};
        }
        
        $is_novel = 1 if $seen_new;
    }
    
    return $is_novel;
}

# check frequencies of existing var against requested params
sub check_frequencies {
    my $config = shift;
    my $var_name = shift;
    
    my $v = $config->{va}->fetch_by_name($var_name);
    
    my $pass = 0;
    
    my $freq_pop = $config->{freq_pop};
    my $freq_freq = $config->{freq_freq};
    my $freq_gt_lt = $config->{freq_gt_lt};
    
    my $freq_pop_name = (split /\_/, $freq_pop)[-1];
    $freq_pop_name = undef if $freq_pop_name =~ /1kg|hap/;
    
    delete $config->{filtered_freqs};
    
    foreach my $a(@{$v->get_all_Alleles}) {
        next unless defined $a->{population} || defined $a->{'_population_id'};
        next unless defined $a->frequency;
        next if $a->frequency > 0.5;
        
        my $pop_name = $a->population->name;
        
        if($freq_pop =~ /1kg/) { next unless $pop_name =~ /^1000.+(low|phase).+/i; }
        if($freq_pop =~ /hap/) { next unless $pop_name =~ /^CSHL-HAPMAP/i; }
        if($freq_pop =~ /any/) { next unless $pop_name =~ /^(CSHL-HAPMAP)|(1000.+(low|phase).+)/i; }
        if(defined $freq_pop_name) { next unless $pop_name =~ /$freq_pop_name/i; }
        
        $pass = 1 if $a->frequency >= $freq_freq and $freq_gt_lt eq 'gt';
        $pass = 1 if $a->frequency <= $freq_freq and $freq_gt_lt eq 'lt';
        
        $pop_name =~ s/\:/\_/g;
        push @{$config->{filtered_freqs}}, $pop_name.':'.$a->frequency;
        
        #warn "Comparing allele ", $a->allele, " ", $a->frequency, " for $var_name in population ", $a->population->name, " PASS $pass";
    }
    
    return 0 if $config->{freq_filter} eq 'exclude' and $pass == 1;
    return 0 if $config->{freq_filter} eq 'include' and $pass == 0;
    return 1;
}

# gets all variations in a region
sub get_variations_in_region {
    my $config = shift;
    my $chr = shift;
    my $region = shift;
    
    my ($start, $end) = split /\-/, $region;
    
    my %variations;
    
    if(defined($config->{vfa}->db)) {
        my $sr_cache = $config->{seq_region_cache};
        
        if(!defined($sr_cache)) {
            $sr_cache = cache_seq_region_ids($config);
            $config->{seq_region_cache} = $sr_cache;
        }
        
        # no seq_region_id?
        return {} unless defined($sr_cache) && defined($sr_cache->{$chr});
        
        my $maf_cols = have_maf_cols($config) ? 'vf.minor_allele, vf.minor_allele_freq' : 'NULL, NULL';
        
        my $sth = $config->{vfa}->db->dbc->prepare(qq{
            SELECT vf.variation_name, IF(fv.variation_id IS NULL, 0, 1), vf.seq_region_start, vf.seq_region_end, vf.allele_string, vf.seq_region_strand, $maf_cols
            FROM variation_feature vf
            LEFT JOIN failed_variation fv ON fv.variation_id = vf.variation_id
            WHERE vf.seq_region_id = ?
            AND vf.seq_region_start >= ?
            AND vf.seq_region_start <= ?
        });
        
        $sth->execute($sr_cache->{$chr}, $start, $end);
        
        my @v;
        for my $i(0..7) {
            $v[$i] = undef;
        }
        
        $sth->bind_columns(\$v[0], \$v[1], \$v[2], \$v[3], \$v[4], \$v[5], \$v[6], \$v[7]);
        
        while($sth->fetch) {
            my @v_copy = @v;
            push @{$variations{$v[2]}}, \@v_copy;
        }
        
        $sth->finish();
    }
    
    return \%variations;
}

sub cache_seq_region_ids {
    my $config = shift;
    
    my (%cache, $chr, $id);
    
    my $sth = $config->{vfa}->db->dbc->prepare(qq{
        SELECT seq_region_id, name FROM seq_region
    });
    
    $sth->execute();
    $sth->bind_columns(\$id, \$chr);
    $cache{$chr} = $id while $sth->fetch();
    $sth->finish;
    
    return \%cache;
}

sub have_maf_cols {
    my $config = shift;
    
    if(!defined($config->{have_maf_cols})) {
        my $sth = $config->{vfa}->db->dbc->prepare(qq{
            DESCRIBE variation_feature
        });
        
        $sth->execute();
        my @cols = map {$_->[0]} @{$sth->fetchall_arrayref};
        $sth->finish();
        
        $config->{have_maf_cols} = 0;
        $config->{have_maf_cols} = 1 if grep {$_ eq 'minor_allele'} @cols;
    }
    
    return $config->{have_maf_cols};
}

sub merge_hashes {
    my ($x, $y) = @_;
    
    foreach my $k (keys %$y) {
        if (!defined($x->{$k})) {
            $x->{$k} = $y->{$k};
        } else {
            if(ref($x->{$k}) eq 'ARRAY') {
                $x->{$k} = merge_arrays($x->{$k}, $y->{$k});
            }
            elsif(ref($x->{$k}) eq 'HASH') {
                $x->{$k} = merge_hashes($x->{$k}, $y->{$k});
            }
            else {
                $x->{$k} = $y->{$k};
            }
        }
    }
    return $x;
}

sub merge_arrays {
    my ($x, $y) = @_;
    
    my %tmp = map {$_ => 1} (@$x, @$y);
    
    return [keys %tmp];
}
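
# Illustrative behaviour of the two helpers above (the hash/array contents are
# made-up examples):
#   merge_hashes({a => [1], b => {x => 1}}, {a => [2], b => {y => 2}, c => 3})
#     -> {a => [1, 2], b => {x => 1, y => 2}, c => 3}
#   merge_arrays([1, 2], [2, 3]) -> the unique union [1, 2, 3]
# Note that merge_arrays de-duplicates via a hash, so element order is not
# preserved, and merge_hashes modifies and returns its first argument.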


# DEBUG AND STATUS METHODS
##########################

# gets time
sub get_time() {
    my @time = localtime(time());
    
    # increment the month (Jan = 0)
    $time[4]++;
    
    # add leading zeroes as required
    for my $i(0..4) {
        $time[$i] = "0".$time[$i] if $time[$i] < 10;
    }
    
    # put the components together in a string
    my $time =
        ($time[5] + 1900)."-".
        $time[4]."-".
        $time[3]." ".
        $time[2].":".
        $time[1].":".
        $time[0];
    
    return $time;
}

# prints debug output with time
sub debug {
    my $text = (@_ ? (join "", @_) : "No message");
    my $time = get_time;
    
    print $time." - ".$text.($text =~ /\n$/ ? "" : "\n");
}

# finds out memory usage
sub memory {
    my @mem;
    
    open IN, "ps -o rss,vsz $$ |";
    while(<IN>) {
        next if $_ =~ /rss/i;
        chomp;
        @mem = split;
    }
    close IN;
    
    return \@mem;
}

sub mem_diff {
    my $config = shift;
    my $mem = memory();
    my @diffs;
    
    if(defined($config->{memory})) {
        for my $i(0..(scalar @{$config->{memory}} - 1)) {
            push @diffs, $mem->[$i] - $config->{memory}->[$i];
        }
    }
    else {
        @diffs = @$mem;
    }
    
    $config->{memory} = $mem;
    
    return \@diffs;
}

# update or initiate progress bar
sub progress {
    my ($config, $i, $total) = @_;
    
    return if defined($config->{quiet}) || defined($config->{no_progress});
    
    my $width = $config->{terminal_width} || 60;
    my $percent = int(($i/$total) * 100);
    my $numblobs = int((($i/$total) * $width) - 2);
    
    # this ensures we're not writing to the terminal too much
    return if(defined($config->{prev_prog})) && $numblobs.'-'.$percent eq $config->{prev_prog};
    $config->{prev_prog} = $numblobs.'-'.$percent;
    
    printf("\r% -${width}s% 1s% 10s", '['.('=' x $numblobs).($numblobs == $width - 2 ? '=' : '>'), ']', "[ " . $percent . "% ]");
}

# end progress bar
sub end_progress {
    my $config = shift;
    return if defined($config->{quiet}) || defined($config->{no_progress});
    progress($config, 1, 1);
    print "\n";
    delete $config->{prev_prog};
}
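
# Typical use, as in the caching subs above (the list and loop body are
# placeholders for illustration only):
#   my $total = scalar @things;
#   my $i = 0;
#   foreach my $thing(@things) {
#       progress($config, $i++, $total);
#       # ... do work ...
#   }
#   end_progress($config);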

1;