comparison dir_plugins/G2P.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
comparison
equal deleted inserted replaced
2:17c98d091710 3:49397129aec0
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 G2P
27
28 =head1 SYNOPSIS
29
30 mv G2P.pm ~/.vep/Plugins
31 ./vep -i variations.vcf --plugin G2P,file=/path/to/G2P.csv.gz
32
33 =head1 DESCRIPTION
34
35 A VEP plugin that uses G2P allelic requirements to assess variants in genes
36 for potential phenotype involvement.
37
38 The plugin has multiple configuration options, though minimally requires only
39 the CSV file of G2P data.
40
41 Options are passed to the plugin as key=value pairs, (defaults in parentheses):
42
43 file : path to G2P data file, as found at http://www.ebi.ac.uk/gene2phenotype/downloads
44
45 af_monoallelic : maximum allele frequency for inclusion for monoallelic genes (0.0001)
46
47 af_biallelic : maximum allele frequency for inclusion for biallelic genes (0.005)
48 all_confidence_levels : set value to 1 to include all confidence levels: confirmed, probable and possible.
49 Default levels are confirmed and probable.
50 af_keys : reference populations used for annotating variant alleles with observed
51 allele frequencies. Allele frequencies are stored in VEP cache files.
52 Default populations are:
53 ESP: AA, EA
54 1000 Genomes: AFR, AMR, EAS, EUR, SAS
55 gnomAD exomes: gnomAD, gnomAD_AFR, gnomAD_AMR, gnomAD_ASJ, gnomAD_EAS, gnomAD_FIN, gnomAD_NFE, gnomAD_OTH, gnomAD_SAS
56 Separate multiple values with '&'
57 af_from_vcf : set value to 1 to include allele frequencies from VCF file.
58 Specifiy the list of reference populations to include with --af_from_vcf_keys
59 af_from_vcf_keys : reference populations used for annotating variant alleles with observed
60 allele frequencies. Allele frequencies are retrieved from VCF files. If
61 af_from_vcf is set to 1 but no populations specified with --af_from_vcf_keys
62 all available reference populations are included.
63 TOPmed: TOPMed
64 UK10K: ALSPAC, TWINSUK
65 gnomAD exomes: gnomADe:AFR, gnomADe:ALL, gnomADe:AMR, gnomADe:ASJ, gnomADe:EAS, gnomADe:FIN, gnomADe:NFE, gnomADe:OTH, gnomADe:SAS
66 gnomAD genomes: gnomADg:AFR, gnomADg:ALL, gnomADg:AMR, gnomADg:ASJ, gnomADg:EAS, gnomADg:FIN, gnomADg:NFE, gnomADg:OTH
67 Separate multiple values with '&'
68 default_af : default frequency of the input variant if no frequency data is
69 found (0). This determines whether such variants are included;
70 the value of 0 forces variants with no frequency data to be
71 included as this is considered equivalent to having a frequency
72 of 0. Set to 1 (or any value higher than af) to exclude them.
73 types : SO consequence types to include. Separate multiple values with '&'
74 (splice_donor_variant,splice_acceptor_variant,stop_gained,
75 frameshift_variant,stop_lost,initiator_codon_variant,
76 inframe_insertion,inframe_deletion,missense_variant,
77 coding_sequence_variant,start_lost,transcript_ablation,
78 transcript_amplification,protein_altering_variant)
79
80 log_dir : write stats to log files in log_dir
81
82 txt_report : write all G2P complete genes and attributes to txt file
83
84 html_report : write all G2P complete genes and attributes to html file
85
86 Example:
87
88 --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_keys=AA&gnomAD_ASJ,types=stop_gained&frameshift_variant
89 --plugin G2P,file=G2P.csv,af_monoallelic=0.05,types=stop_gained&frameshift_variant
90 --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_from_vcf=1
91 --plugin G2P,file=G2P.csv
92
93 =cut
94
95 package G2P;
96
97 use strict;
98 use warnings;
99
100 use Cwd;
101 use Scalar::Util qw(looks_like_number);
102 use FileHandle;
103 use Text::CSV;
104 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
105
106 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
107
108 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
109
110 my %DEFAULTS = (
111
112 # vars must have a frequency <= to this to pass
113 af => 0.001,
114 af_monoallelic => 0.0001,
115 af_biallelic => 0.005,
116
117 af_keys => [qw(AA AFR AMR EA EAS EUR SAS gnomAD gnomAD_AFR gnomAD_AMR gnomAD_ASJ gnomAD_EAS gnomAD_FIN gnomAD_NFE gnomAD_OTH gnomAD_SAS)],
118
119 af_from_vcf_keys => [qw(ALSPAC TOPMed TWINSUK gnomADe:AFR gnomADe:ALL gnomADe:AMR gnomADe:ASJ gnomADe:EAS gnomADe:FIN gnomADe:NFE gnomADe:OTH gnomADe:SAS gnomADg:AFR gnomADg:ALL gnomADg:AMR gnomADg:ASJ gnomADg:EAS gnomADg:FIN gnomADg:NFE gnomADg:OTH)],
120
121 # if no MAF data is found, default to 0
122 # this means absence of MAF data is considered equivalent to MAF=0
123 # set to 1 to do the "opposite", i.e. exclude variants with no MAF data
124 default_af => 0,
125
126 confidence_levels => [qw(confirmed probable)],
127
128 # only include variants with these consequence types
129 # currently not ontology-resolved, exact term matches only
130 types => {map {$_ => 1} qw(splice_donor_variant splice_acceptor_variant stop_gained frameshift_variant stop_lost initiator_codon_variant inframe_insertion inframe_deletion missense_variant coding_sequence_variant start_lost transcript_ablation transcript_amplification protein_altering_variant)},
131
132 );
133
134 my $af_key_2_population_name = {
135 minor_allele_freq => 'global allele frequency (AF) from 1000 Genomes Phase 3 data',
136 AFR => '1000GENOMES:phase_3:AFR',
137 AMR => '1000GENOMES:phase_3:AMR',
138 EAS => '1000GENOMES:phase_3:EAS',
139 EUR => '1000GENOMES:phase_3:EUR',
140 SAS => '1000GENOMES:phase_3:SAS',
141 AA => 'Exome Sequencing Project 6500:African_American',
142 EA => 'Exome Sequencing Project 6500:European_American',
143 gnomAD => 'Genome Aggregation Database:Total',
144 gnomAD_AFR => 'Genome Aggregation Database exomes:African/African American',
145 gnomAD_AMR => 'Genome Aggregation Database exomes:Latino',
146 gnomAD_ASJ => 'Genome Aggregation Database exomes:Ashkenazi Jewish',
147 gnomAD_EAS => 'Genome Aggregation Database exomes:East Asian',
148 gnomAD_FIN => 'Genome Aggregation Database exomes:Finnish',
149 gnomAD_NFE => 'Genome Aggregation Database exomes:Non-Finnish European',
150 gnomAD_OTH => 'Genome Aggregation Database exomes:Other (population not assigned)',
151 gnomAD_SAS => 'Genome Aggregation Database exomes:South Asian',
152 ALSPAC => 'UK10K:ALSPAC cohort',
153 TOPMed => 'Trans-Omics for Precision Medicine (TOPMed) Program',
154 TWINSUK => 'UK10K:TWINSUK cohort',
155 'gnomADe:AFR' => 'Genome Aggregation Database exomes v170228',
156 'gnomADe:ALL' => 'Genome Aggregation Database exomes v170228',
157 'gnomADe:AMR' => 'Genome Aggregation Database exomes v170228',
158 'gnomADe:ASJ' => 'Genome Aggregation Database exomes v170228',
159 'gnomADe:EAS' => 'Genome Aggregation Database exomes v170228',
160 'gnomADe:FIN' => 'Genome Aggregation Database exomes v170228',
161 'gnomADe:NFE' => 'Genome Aggregation Database exomes v170228',
162 'gnomADe:OTH' => 'Genome Aggregation Database exomes v170228',
163 'gnomADe:SAS' => 'Genome Aggregation Database exomes v170228',
164 'gnomADg:AFR' => 'Genome Aggregation Database genomes v170228:African/African American',
165 'gnomADg:ALL' => 'Genome Aggregation Database genomes v170228:All gnomAD genomes individuals',
166 'gnomADg:AMR' => 'Genome Aggregation Database genomes v170228:Latino',
167 'gnomADg:ASJ' => 'Genome Aggregation Database genomes v170228:Ashkenazi Jewish',
168 'gnomADg:EAS' => 'Genome Aggregation Database genomes v170228:East Asian',
169 'gnomADg:FIN' => 'Genome Aggregation Database genomes v170228:Finnish',
170 'gnomADg:NFE' => 'Genome Aggregation Database genomes v170228:Non-Finnish European',
171 'gnomADg:OTH' => 'Genome Aggregation Database genomes v170228:Other (population not assigned)',
172 };
173
174 my $allelic_requirements = {
175 'biallelic' => { af => 0.005, rules => {HET => 2, HOM => 1} },
176 'monoallelic' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
177 'hemizygous' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
178 'x-linked dominant' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
179 'x-linked over-dominance' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
180 };
181
182 my @allelic_requirement_terms = keys %$allelic_requirements;
183
184 my @population_wide = qw(minor_allele_freq AA AFR ALSPAC AMR EA EAS EUR SAS TOPMed TWINSUK gnomAD gnomAD_AFR gnomAD_AMR gnomAD_ASJ gnomAD_EAS gnomAD_FIN gnomAD_NFE gnomAD_OTH gnomAD_SAS gnomADe:AFR gnomADe:ALL gnomADe:AMR gnomADe:ASJ gnomADe:EAS gnomADe:FIN gnomADe:NFE gnomADe:OTH gnomADe:SAS gnomADg:AFR gnomADg:ALL gnomADg:AMR gnomADg:ASJ gnomADg:EAS gnomADg:FIN gnomADg:NFE gnomADg:OTH);
185
186 sub new {
187 my $class = shift;
188
189 my $self = $class->SUPER::new(@_);
190
191 my $supported_af_keys = { map {$_ => 1} @population_wide };
192
193 my $params = $self->params_to_hash();
194 my $file = '';
195
196 # user only supplied file as first param?
197 if (!keys %$params) {
198 $file = $self->params->[0];
199 }
200 else {
201 $file = $params->{file};
202
203 # process types
204 if ($params->{types}) {
205 $params->{types} = {map {$_ => 1} split(/[\;\&\|]/, $params->{types})};
206 }
207
208 # check af
209 foreach my $af (qw/af_monoallelic af_biallelic/) {
210 if($params->{$af}) {
211 die("ERROR: Invalid value for af: ".$params->{$af} . "\n") unless
212 looks_like_number($params->{$af}) && ($params->{$af} >= 0 && $params->{$af} <= 1)
213 }
214 }
215
216 my $assembly = $self->{config}->{assembly};
217 my $af_from_vcf_key_2_collection_id = {
218 ALSPAC => {GRCh37 => 'uk10k_GRCh37', GRCh38 => 'uk10k_GRCh38'},
219 TOPMed => {GRCh37 => 'topmed_GRCh37', GRCh38 => 'topmed_GRCh38'},
220 TWINSUK => {GRCh37 => 'uk10k_GRCh37', GRCh38 => 'uk10k_GRCh38'},
221 'gnomADe:AFR' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
222 'gnomADe:ALL' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
223 'gnomADe:AMR' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
224 'gnomADe:ASJ' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
225 'gnomADe:EAS' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
226 'gnomADe:FIN' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
227 'gnomADe:NFE' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
228 'gnomADe:OTH' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
229 'gnomADe:SAS' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
230 'gnomADg:AFR' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
231 'gnomADg:ALL' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
232 'gnomADg:AMR' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
233 'gnomADg:ASJ' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
234 'gnomADg:EAS' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
235 'gnomADg:FIN' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
236 'gnomADg:NFE' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
237 'gnomADg:OTH' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
238 };
239
240 my @keys = ();
241 my $vcf_collection_ids = {};
242 if ($params->{af_keys}) {
243 push @keys, $params->{af_keys};
244 } else {
245 push @keys, @{$DEFAULTS{af_keys}};
246 }
247 if ($params->{af_from_vcf}) {
248 if ($params->{af_from_vcf_keys}) {
249 push @keys, $params->{af_from_vcf_keys};
250 } else {
251 push @keys, @{$DEFAULTS{af_from_vcf_keys}};
252 }
253 }
254
255 my @af_keys = ();
256 foreach my $af_key_set (@keys) {
257 foreach my $af_key (split(/[\;\&\|]/, $af_key_set)) {
258 die("ERROR: af_key: " . $af_key . " not supported. Check plugin documentation for supported af_keys.\n") unless $supported_af_keys->{$af_key};
259 push @af_keys, $af_key;
260 if ($af_from_vcf_key_2_collection_id->{$af_key}) {
261
262 $vcf_collection_ids->{$af_from_vcf_key_2_collection_id->{$af_key}->{$assembly}} = 1;
263 }
264 }
265 }
266 $params->{af_keys} = \@af_keys;
267 $params->{vcf_collection_ids} = $vcf_collection_ids;
268 }
269
270 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
271 $year += 1900;
272 $mon++;
273 my $stamp = join('_', ($year, $mon, $mday, $hour, $min));
274 my $cwd_dir = getcwd;
275 my $new_log_dir = "$cwd_dir/g2p_log_dir\_$stamp";
276 my $log_dir = $params->{log_dir} || $new_log_dir;
277 if (-d $log_dir) {
278 my @files = <$log_dir/*>;
279 if (scalar @files > 0) {
280 unlink glob "'$log_dir/*.*'";
281 }
282 @files = <$log_dir/*>;
283 if (scalar @files > 0) {
284 mkdir $new_log_dir, 0755;
285 $params->{log_dir} = $new_log_dir;
286 }
287 } else {
288 mkdir $log_dir, 0755;
289 $params->{log_dir} = $log_dir;
290 }
291
292 foreach my $report_type (qw/txt_report html_report/) {
293 if (!$params->{$report_type}) {
294 my $file_type = ($report_type eq 'txt_report') ? 'txt' : 'html';
295 $params->{$report_type} = $cwd_dir . "/$report_type\_$stamp.$file_type";
296 }
297 }
298
299 if ($params->{all_confidence_levels}) {
300 push @{$params->{confidence_levels}}, 'possible', @{$DEFAULTS{confidence_levels}};
301 }
302
303 # copy in default params
304 $params->{$_} //= $DEFAULTS{$_} for keys %DEFAULTS;
305 $self->{user_params} = $params;
306
307 if (!defined($self->{config}->{reg})) {
308 my $reg = 'Bio::EnsEMBL::Registry';
309 $reg->load_registry_from_db(
310 -host => $self->config->{host},
311 -user => $self->config->{user},
312 -port => $self->config->{port},
313 -db_version => $self->config->{db_version},
314 -species => $self->config->{species},
315 -no_cache => $self->config->{no_slice_cache},
316 );
317 $self->{config}->{reg} = $reg;
318 }
319
320 my $va = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'variation');
321 $va->db->use_vcf(1);
322 $va->db->include_failed_variations(1);
323 $self->{config}->{va} = $va;
324 my $pa = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'population');
325 $self->{config}->{pa} = $pa;
326 my $vca = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'VCFCollection');
327 $self->{config}->{vca} = $vca;
328 my $ta = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'core', 'transcript');
329 $self->{config}->{ta} = $ta;
330
331 # read data from file
332 $self->{gene_data} = $self->read_gene_data_from_file($file);
333 $self->synonym_mappings();
334
335 # force some config params
336 $self->{config}->{individual} //= ['all'];
337 $self->{config}->{symbol} = 1;
338
339 $self->{config}->{check_existing} = 1;
340 $self->{config}->{failed} = 1;
341 $self->{config}->{af} = 1;
342 $self->{config}->{af_1kg} = 1;
343 $self->{config}->{af_esp} = 1;
344 $self->{config}->{af_gnomad} = 1;
345 # $self->{config}->{sift} = 'b';
346 # $self->{config}->{polyphen} = 'b';
347 # $self->{config}->{hgvsc} = 1;
348 # $self->{config}->{hgvsp} = 1;
349
350 # tell VEP we have a cache so stuff gets shared/merged between forks
351 $self->{has_cache} = 1;
352 $self->{cache}->{g2p_in_vcf} = {};
353
354 return $self;
355 }
356
357 sub feature_types {
358 return ['Transcript'];
359 }
360
361 sub get_header_info {
362 my $self = shift;
363
364 return {
365 G2P_flag => 'Flags zygosity of valid variants for a G2P gene',
366 G2P_complete => 'Indicates this variant completes the allelic requirements for a G2P gene',
367 G2P_gene_req => 'MONO or BI depending on the context in which this gene has been explored',
368 };
369 }
370
371 sub run {
372 my ($self, $tva, $line) = @_;
373
374 # only interested if we know the zygosity
375 my $zyg = $line->{Extra}->{ZYG} || $line->{ZYG};
376 return {} unless $zyg;
377
378 # only interested in given gene set
379 my $tr = $tva->transcript;
380
381 my $ensembl_gene_id = $tr->{_gene}->stable_id;
382 my $gene_symbol = $tr->{_gene_symbol} || $tr->{_gene_hgnc};
383 return {} unless $gene_symbol;
384 my $gene_data = $self->gene_data($gene_symbol);
385 if (! defined $gene_data) {
386 my $ensembl_gene_id = $tr->{_gene}->stable_id;
387 $gene_data = $self->gene_data($ensembl_gene_id);
388 }
389
390 if (!$self->{cache}->{g2p_in_vcf}->{$gene_symbol}) {
391 $self->write_report('G2P_in_vcf', $gene_symbol);
392 $self->{cache}->{g2p_in_vcf}->{$gene_symbol} = 1;
393 }
394 return {} unless defined $gene_data;
395
396 my @ars = ($gene_data->{'allelic requirement'}) ? @{$gene_data->{'allelic requirement'}} : ();
397 my %seen;
398 @ars = grep { !$seen{$_}++ } @ars;
399
400 return {} unless (@ars && ( grep { exists($allelic_requirements->{$_}) } @ars));
401 # limit by type
402 my @consequence_types = map { $_->SO_term } @{$tva->get_all_OverlapConsequences};
403
404 return {} unless grep {$self->{user_params}->{types}->{$_->SO_term}} @{$tva->get_all_OverlapConsequences};
405
406 # limit by MAF
407 my $threshold = 0;
408 my ($freqs, $existing_variant, $ar_passed) = @{$self->get_freq($tva, \@ars)};
409
410 return {} if (!keys %$ar_passed);
411
412 my $vf = $tva->base_variation_feature;
413 my $allele = $tva->variation_feature_seq;
414 my $start = $vf->{start};
415 my $end = $vf->{end};
416
417 my $individual = $vf->{individual};
418 my $vf_name = $vf->variation_name;
419 if ($vf_name || $vf_name eq '.') {
420 $vf_name = ($vf->{original_chr} || $vf->{chr}) . '_' . $vf->{start} . '_' . ($vf->{allele_string} || $vf->{class_SO_term});
421 }
422 my $allele_string = $vf->{allele_string};
423 my @alleles = split('/', $allele_string);
424 my $ref = $alleles[0];
425 my $seq_region_name = $vf->{chr};
426
427 my $params = $self->{user_params};
428 my $refseq = $tr->{_refseq} || 'NA';
429 my $tr_stable_id = $tr->stable_id;
430 my $hgvs_t = $tva->hgvs_transcript || 'NA';
431 my $hgvs_p = $tva->hgvs_protein || 'NA';
432
433 my ($clin_sig, $novel, $failed, $frequencies, $existing_name) = ('NA', 'yes', 'NA', 'NA', 'NA');
434 if ($existing_variant) {
435 $clin_sig = $existing_variant->{clin_sig} || 'NA';
436 $failed = ($existing_variant->{failed}) ? 'yes' : 'no';
437 $existing_name = $existing_variant->{variation_name} || 'NA';
438 $novel = 'no';
439 }
440
441 my $pph_score = (defined $tva->polyphen_score) ? $tva->polyphen_score : 'NA';
442 my $pph_pred = (defined $tva->polyphen_prediction) ? $tva->polyphen_prediction : 'NA';
443 my $sift_score = (defined $tva->sift_score) ? $tva->sift_score : 'NA';
444 my $sift_pred = (defined $tva->sift_prediction) ? $tva->sift_prediction : 'NA';
445
446 if (scalar keys %$freqs > 0) {
447 $frequencies = join(',', map {"$_=$freqs->{$_}"} keys %$freqs);
448 }
449
450 my $ar = join(',', sort keys %$ar_passed);
451 my $ar_in_g2pdb = join(',', sort @ars);
452 my $g2p_data = {
453 'zyg' => $zyg,
454 'allele_requirement' => $ar,
455 'ar_in_g2pdb' => $ar_in_g2pdb,
456 'frequencies' => $frequencies,
457 'consequence_types' => join(',', @consequence_types),
458 'refseq' => $refseq,
459 'failed' => $failed,
460 'clin_sig' => $clin_sig,
461 'novel' => $novel,
462 'existing_name' => $existing_name,
463 'hgvs_t' => $hgvs_t,
464 'hgvs_p' => $hgvs_p,
465 'vf_location' => "$seq_region_name:$start-$end $ref/$allele",
466 'sift_score' => "$sift_score",
467 'sift_prediction' => $sift_pred,
468 'polyphen_score' => "$pph_score",
469 'polyphen_prediction' => $pph_pred,
470 };
471
472 my %return = (
473 G2P_flag => $zyg
474 );
475
476
477 $self->write_report('G2P_flag', $gene_symbol, $tr_stable_id, $individual, $vf_name, $g2p_data);
478
479 $self->write_report('G2P_complete', $gene_symbol, $tr_stable_id, $individual, $vf_name, $ar, $zyg);
480
481 my $cache = $self->{cache}->{$individual}->{$tr->stable_id} ||= {};
482
483 delete $cache->{$vf_name} if exists($cache->{$vf_name});
484
485 # biallelic genes require >=1 hom or >=2 hets
486
487 my $gene_reqs = {};
488
489 foreach my $ar (keys %$ar_passed) {
490 if($ar eq 'biallelic') {
491 # homozygous, report complete
492 if(uc($zyg) eq 'HOM') {
493 $return{G2P_complete} = 1;
494 $gene_reqs->{BI} = 1;
495 }
496 # heterozygous
497 # we need to cache that we've observed one
498 elsif(uc($zyg) eq 'HET') {
499 if(scalar keys %$cache) {
500 $return{G2P_complete} = 1;
501 }
502 $cache->{$vf_name} = 1;
503 }
504 }
505 # monoallelic genes require only one allele
506 elsif($ar eq 'monoallelic' || $ar eq 'x-linked dominant' || $ar eq 'hemizygous' || $ar eq 'x-linked over-dominance') {
507 $return{G2P_complete} = 1;
508 $gene_reqs->{MONO} = 1;
509 }
510 else {
511 return {};
512 }
513 }
514 if ($return{G2P_complete}) {
515 $return{G2P_gene_req} = join(',', sort keys %$gene_reqs);
516 }
517
518 return \%return;
519 }
520
521 # read G2P CSV dump
522 # as from http://www.ebi.ac.uk/gene2phenotype/downloads
523 sub read_gene_data_from_file {
524 my $self = shift;
525 my $file = shift;
526 my $delimiter = shift;
527 my (@headers, %gene_data);
528
529 my $assembly = $self->{config}->{assembly};
530 die("ERROR: No file specified or could not read from file ".($file || '')."\n") unless $file && -e $file;
531
532 my @confidence_levels = @{$self->{user_params}->{confidence_levels}}, "\n";
533
534 # determine file type
535 my $file_type;
536 my $fh = FileHandle->new($file, 'r');
537 while (<$fh>) {
538 chomp;
539 if (/Model_Of_Inheritance/) {
540 $file_type = 'panelapp';
541 } elsif (/"allelic requirement"/) {
542 $file_type = 'g2p';
543 } else {
544 $file_type = 'unknown';
545 }
546 last;
547 }
548 $fh->close();
549 if ($file_type eq 'unknown') {
550 if ($file =~ /gz$/) {
551 die("ERROR: G2P plugin can only read uncompressed data");
552 } else {
553 die("ERROR: Could not recognize input file format. Format must be one of panelapp, g2p or custom. Check website for details: https://www.ebi.ac.uk/gene2phenotype/g2p_vep_plugin");
554 }
555 }
556
557 if ($file_type eq 'panelapp') {
558 my @headers = ();
559 my $csv = Text::CSV->new ({ sep_char => "\t" });
560 open my $fh, "<:encoding(utf8)", "$file" or die "$file: $!";
561 while ( my $row = $csv->getline( $fh ) ) {
562 unless (@headers) {
563 @headers = @$row;
564 } else {
565 my %tmp = map {$headers[$_] => $row->[$_]} (0..$#headers);
566 my $gene_symbol = $tmp{"Gene Entity Symbol"};
567 my $ensembl_gene_id = "";
568 if ($assembly eq 'GRCh37') {
569 $ensembl_gene_id = $tmp{"EnsemblId(GRch37)"};
570 } else { # GRCh38
571 $ensembl_gene_id = $tmp{"EnsemblId(GRch38)"};
572 }
573 if ($ensembl_gene_id) {
574 my @ars = ();
575 my $allelic_requirement_panel_app = $tmp{"Model_Of_Inheritance"};
576 if ($allelic_requirement_panel_app =~ m/MONOALLELIC|BOTH/) {
577 push @ars, 'monoallelic';
578 } elsif ($allelic_requirement_panel_app =~ m/BIALLELIC|BOTH/) {
579 push @ars, 'biallelic';
580 } elsif ($allelic_requirement_panel_app eq 'X-LINKED: hemizygous mutation in males, biallelic mutations in females') {
581 push @ars, 'hemizygous';
582 } elsif ($allelic_requirement_panel_all eq 'X-LINKED: hemizygous mutation in males, monoallelic mutations in females may cause disease (may be less severe, later onset than males)') {
583 push @ars, 'x-linked dominant';
584 } else {
585 $self->write_report('log', "no allelelic_requirement for $ensembl_gene_id");
586 }
587 foreach my $ar (@ars) {
588 push @{$gene_data{$ensembl_gene_id}->{"allelic requirement"}}, $ar;
589 }
590 } else {
591 $self->write_report('log', "no ensembl gene id for $gene_symbol");
592 }
593 }
594 }
595 $csv->eof or $csv->error_diag();
596 close $fh;
597 }
598
599 if ($file_type eq 'g2p') {
600 # this regexp allows for nested ",", e.g.
601 # item,description
602 # cheese,"salty,delicious"
603 my $re = qr/(?: "\( ( [^()""]* ) \)" | \( ( [^()]* ) \) | " ( [^"]* ) " | ( [^,]* ) ) , \s* /x;
604
605 my $fh = FileHandle->new($file, 'r');
606
607 while(<$fh>) {
608 chomp;
609 $_ =~ s/\R//g;
610 my @split = grep defined, "$_," =~ /$re/g;
611 unless(@headers) {
612 if ($file_type eq 'g2p') {
613 @headers = map {s/\"//g; $_} @split;
614 } else {
615 @headers = @split;
616 }
617 }
618 else {
619 my %tmp = map {$headers[$_] => $split[$_]} (0..$#split);
620 die("ERROR: Gene symbol column not found\n$_\n") unless $tmp{"gene symbol"};
621 my $confidence_value = $tmp{"DDD category"};
622 next if (!grep{$_ eq $confidence_value} @confidence_levels);
623 my $gene_symbol = $tmp{"gene symbol"};
624 $gene_data{$gene_symbol}->{"prev symbols"} = $tmp{"prev symbols"};
625 push @{$gene_data{$gene_symbol}->{"allelic requirement"}}, $tmp{"allelic requirement"} if ($tmp{"allelic requirement"});
626 $self->write_report('G2P_list', $tmp{"gene symbol"}, $tmp{"DDD category"});
627 }
628 }
629 $fh->close;
630 }
631 return \%gene_data;
632 }
633
634 # return either whole gene data hash or one gene's data
635 # this should allow updates to this plugin to e.g. query a REST server, for example
636 sub gene_data {
637 my ($self, $gene_symbol) = @_;
638 my $gene_data = $self->{gene_data}->{$gene_symbol};
639 if (!$gene_data) {
640 my $prev_gene_symbol = $self->{prev_symbol_mappings}->{$gene_symbol};
641 return $prev_gene_symbol ? $self->{gene_data}->{$prev_gene_symbol} : undef;
642 }
643 return $gene_data;
644 }
645
646 sub synonym_mappings {
647 my $self = shift;
648 my $gene_data = $self->{gene_data};
649 my $synonym_mappings = {};
650 foreach my $gene_symbol (keys %$gene_data) {
651 my $prev_symbols = $gene_data->{$gene_symbol}->{'prev symbols'};
652 if ($prev_symbols) {
653 foreach my $prev_symbol (split(';', $prev_symbols)) {
654 $synonym_mappings->{$prev_symbol} = $gene_symbol;
655 }
656 }
657 }
658 $self->{prev_symbol_mappings} = $synonym_mappings;
659 }
660
661 sub get_freq {
662 my $self = shift;
663 my $tva = shift;
664 my $ars = shift;
665 my $allele = $tva->variation_feature_seq;
666 my $vf = $tva->base_variation_feature;
667 reverse_comp(\$allele) if $vf->{strand} < 0;
668 my $vf_name = $vf->variation_name;
669 if ($vf_name || $vf_name eq '.') {
670 $vf_name = ($vf->{original_chr} || $vf->{chr}) . '_' . $vf->{start} . '_' . ($vf->{allele_string} || $vf->{class_SO_term});
671 }
672 my $cache = $self->{cache}->{$vf_name}->{_g2p_freqs} ||= {};
673
674 if (exists $cache->{$allele}->{failed}) {
675 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}];
676 }
677
678 if (exists $cache->{$allele}->{freq}) {
679 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}];
680 }
681
682 if (!$vf->{existing}) {
683 my $failed_ars = {};
684 my $freqs = {};
685 my $passed = $self->frequencies_from_VCF($freqs, $vf, $allele, $ars, $failed_ars);
686 if (!$passed) {
687 $cache->{$allele}->{failed} = 1;
688 return [{}, {}, {}];
689 } else {
690 $cache->{$allele}->{freq} = $freqs;
691 $cache->{$allele}->{ex_variant} = undef;
692 # if we get to here return all allelic requirements that passed threshold filtering
693 my $passed_ar = {};
694 foreach my $ar (@$ars) {
695 if (!$failed_ars->{$ar}) {
696 $passed_ar->{$ar} = 1;
697 }
698 }
699 $cache->{$allele}->{passed_ar} = $passed_ar;
700 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}];
701 }
702 }
703
704 my @existing_variants = @{$vf->{existing}};
705 # favour dbSNP variants
706 my @dbSNP_variants = grep {$_->{variation_name} =~ /^rs/} @existing_variants;
707 if (@dbSNP_variants) {
708 @existing_variants = @dbSNP_variants;
709 }
710 foreach my $ex (@existing_variants) {
711 my $existing_allele_string = $ex->{allele_string};
712 my $variation_name = $ex->{variation_name};
713 my $freqs = {};
714 my $failed_ars = {};
715 foreach my $af_key (@{$self->{user_params}->{af_keys}}) {
716 my $freq = $self->{user_params}->{default_af};
717 if ($af_key eq 'minor_allele_freq') {
718 if (defined $ex->{minor_allele_freq}) {
719 if (($ex->{minor_allele} || '') eq $allele ) {
720 $freq = $ex->{minor_allele_freq};
721 } else {
722 $freq = $self->correct_frequency($tva, $existing_allele_string, $ex->{minor_allele}, $ex->{minor_allele_freq}, $allele, $variation_name, $af_key, $vf_name) || $freq;
723 }
724 }
725 }
726 else {
727 my @pairs = split(',', $ex->{$af_key} || '');
728 my $found = 0;
729 if (scalar @pairs == 0) {
730 $found = 1; # no allele frequency for this population/af_key available
731 }
732 foreach my $pair (@pairs) {
733 my ($a, $f) = split(':', $pair);
734 if(($a || '') eq $allele && defined($f)) {
735 $freq = $f;
736 $found = 1;
737 }
738 }
739 if (!$found) {
740 $freq = $self->correct_frequency($tva, $existing_allele_string, undef, undef, $allele, $variation_name, $af_key, $vf_name) || $freq;
741 }
742 }
743 if (!$self->continue_af_annotation($ars, $failed_ars, $freq)) {
744 # cache failed results
745 $cache->{$allele}->{failed} = 1;
746 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}];
747 }
748 $freqs->{$af_key} = $freq if ($freq);
749 }
750 if ($self->{user_params}->{af_from_vcf}) {
751 my $passed = $self->frequencies_from_VCF($freqs, $vf, $allele, $ars, $failed_ars);
752 if (!$passed) {
753 $cache->{$allele}->{failed} = 1;
754 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}];
755 }
756 }
757 $cache->{$allele}->{freq} = $freqs;
758 $cache->{$allele}->{ex_variant} = $ex;
759
760 # if we get to here return all allelic requirements that passed threshold filtering
761 my $passed_ar = {};
762 foreach my $ar (@$ars) {
763 if (!$failed_ars->{$ar}) {
764 $passed_ar->{$ar} = 1;
765 }
766 }
767 $cache->{$allele}->{passed_ar} = $passed_ar;
768
769 }
770
771 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}];
772 }
773
774 sub correct_frequency {
775 my ($self, $tva, $allele_string, $minor_allele, $af, $allele, $variation_name, $af_key, $vf_name) = @_;
776
777 my @existing_alleles = split('/', $allele_string);
778 if (!grep( /^$allele$/, @existing_alleles)) {
779 return 0.0;
780 }
781
782 if ($af_key eq 'minor_allele_freq' && (scalar @existing_alleles == 2)) {
783 my $existing_ref_allele = $existing_alleles[0];
784 my $existing_alt_allele = $existing_alleles[1];
785 if ( ($minor_allele eq $existing_ref_allele && ($allele eq $existing_alt_allele)) ||
786 ($minor_allele eq $existing_alt_allele && ($allele eq $existing_ref_allele)) ) {
787 return (1.0 - $af);
788 }
789 } else {
790 my $va = $self->{config}->{va};
791 my $pa = $self->{config}->{pa};
792 my $variation = $va->fetch_by_name($variation_name);
793 my $af_key = $self->{user_params}->{af_keys};
794 my $population_name = $af_key_2_population_name->{$af_key};
795 if ($population_name) {
796 my $population = $self->{config}->{$population_name};
797 if (!$population) {
798 $population = $pa->fetch_by_name($population_name);
799 $self->{config}->{$population_name} = $population;
800 }
801 foreach (@{$variation->get_all_Alleles($population)}) {
802 if ($_->allele eq $allele) {
803 return $_->frequency;
804 }
805 }
806 }
807 }
808 return 0.0;
809 }
810
811 sub frequencies_from_VCF {
812 my $self = shift;
813 my $freqs = shift;
814 my $vf = shift;
815 my $vf_allele = shift;
816 my $ars = shift;
817 my $failed_ars = shift;
818 my $vca = $self->{config}->{vca};
819 my $collections = $vca->fetch_all;
820 foreach my $vc (@$collections) {
821 next if (! $self->{user_params}->{vcf_collection_ids}->{$vc->id});
822 my $alleles = $vc->get_all_Alleles_by_VariationFeature($vf);
823 foreach my $allele (@$alleles) {
824 if ($allele->allele eq $vf_allele) {
825 my $af_key = $allele->population->name;
826 my $freq = $allele->frequency;
827 return 0 if (!$self->continue_af_annotation($ars, $failed_ars, $freq));
828 $freqs->{$af_key} = $freq;
829 }
830 }
831 }
832 return 1;
833 }
834
835 sub continue_af_annotation {
836 my $self = shift;
837 my $ars = shift;
838 my $failed_ars = shift;
839 my $freq = shift;
840 foreach my $ar (@$ars) {
841 if (!$failed_ars->{$ar}) {
842 if (defined $allelic_requirements->{$ar}) {
843 my $threshold = $allelic_requirements->{$ar}->{af};
844 if ($freq > $threshold) {
845 $failed_ars->{$ar} = 1;
846 }
847 }
848 }
849 }
850 return (scalar @$ars != scalar keys %$failed_ars);
851 }
852
853
854
855 sub write_report {
856 my $self = shift;
857 my $flag = shift;
858 my $log_dir = $self->{user_params}->{log_dir};
859 my $log_file = "$log_dir/$$.txt";
860 open(my $fh, '>>', $log_file) or die "Could not open file '$flag $log_file' $!";
861 if ($flag eq 'G2P_list') {
862 my ($gene_symbol, $DDD_category) = @_;
863 $DDD_category ||= 'Not assigned';
864 print $fh "$flag\t$gene_symbol\t$DDD_category\n";
865 } elsif ($flag eq 'G2P_in_vcf') {
866 my $gene_symbol = shift;
867 print $fh "$flag\t$gene_symbol\n";
868 } elsif ($flag eq 'G2P_complete') {
869 print $fh join("\t", $flag, @_), "\n";
870 } elsif ($flag eq 'log') {
871 print $fh join("\t", $flag, @_), "\n";
872 } else {
873 my ($gene_symbol, $tr_stable_id, $individual, $vf_name, $data) = @_;
874 $data = join(';', map {"$_=$data->{$_}"} sort keys %$data);
875 print $fh join("\t", $flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $data), "\n";
876 }
877 close $fh;
878 }
879
880 sub finish {
881 my $self = shift;
882 $self->generate_report;
883 }
884
885 sub generate_report {
886 my $self = shift;
887 my $result_summary = $self->parse_log_files;
888 my $chart_txt_data = $self->chart_and_txt_data($result_summary);
889 my $chart_data = $chart_txt_data->{chart_data};
890 my $txt_data = $chart_txt_data->{txt_data};
891 my $canonical_transcripts = $chart_txt_data->{canonical_transcripts};
892 $self->write_txt_output($txt_data);
893 $self->write_charts($result_summary, $chart_data, $canonical_transcripts);
894 }
895
896 sub write_txt_output {
897 my $self = shift;
898 my $txt_output_data = shift;
899 my $txt_output_file = $self->{user_params}->{txt_report};
900 my $fh_txt = FileHandle->new($txt_output_file, 'w');
901 foreach my $individual (sort keys %$txt_output_data) {
902 foreach my $gene_symbol (keys %{$txt_output_data->{$individual}}) {
903 foreach my $ar (keys %{$txt_output_data->{$individual}->{$gene_symbol}}) {
904 foreach my $tr_stable_id (keys %{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}}) {
905 my $is_canonical = $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{is_canonical};
906 my $canonical_tag = ($is_canonical) ? 'is_canonical' : 'not_canonical';
907 my $req = $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{REQ};
908 my $variants = join(';', @{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{variants}});
909 print $fh_txt join("\t", $individual, $gene_symbol, $tr_stable_id, $canonical_tag, "OBS=$ar", "REQ=$req", $variants), "\n";
910 }
911 }
912 }
913 }
914 $fh_txt->close();
915 }
916
917 sub write_charts {
918 my $self = shift;
919 my $result_summary = shift;
920 my $chart_data = shift;
921 my $canonical_transcripts = shift;
922
923 my $count_g2p_genes = keys %{$result_summary->{g2p_list}};
924 my $count_in_vcf_file = keys %{$result_summary->{in_vcf_file}};
925 my $count_complete_genes = scalar keys %{$result_summary->{complete_genes}};
926
927 my @charts = ();
928 my @frequencies_header = ();
929
930 foreach my $short_name (sort @{$self->{user_params}->{af_keys}}) {
931 my $text = $af_key_2_population_name->{$short_name};
932 push @frequencies_header, "<a style=\"cursor: pointer\" data-placement=\"top\" data-toggle=\"tooltip\" data-container=\"body\" title=\"$text\">$short_name</a>";
933 }
934
935 my $count = 1;
936 my @new_header = (
937 'Variant location and alleles (REF/ALT)',
938 'Variant name',
939 'Existing name',
940 'Zygosity',
941 'All allelic requirements from G2P DB',
942 'Consequence types',
943 'ClinVar annotation',
944 'SIFT',
945 'PolyPhen',
946 'Novel variant',
947 'Has been failed by Ensembl',
948 @frequencies_header,
949 'HGVS transcript',
950 'HGVS protein',
951 'RefSeq IDs',
952 );
953
954 my $html_output_file = $self->{user_params}->{html_report};
955 my $fh_out = FileHandle->new($html_output_file, 'w');
956 print $fh_out stats_html_head(\@charts);
957 print $fh_out "<div class='main_content container'>";
958
959
960 print $fh_out "<h1>G2P report</h1>";
961 print $fh_out "<p>Input and output files:</p>";
962
963 print $fh_out "<dl class='dl-horizontal'>";
964 print $fh_out "<dt>G2P list</dt>";
965 print $fh_out "<dd>" . $self->{user_params}->{file} . "</dd>";
966 print $fh_out "<dt>Log directory</dt>";
967 print $fh_out "<dd>" . $self->{user_params}->{log_dir} . "</dd>";
968 print $fh_out "<dt>HTML report</dt>";
969 print $fh_out "<dd>" . $self->{user_params}->{html_report} . "</dd>";
970 print $fh_out "<dt>TXT report</dt>";
971 print $fh_out "<dd>" . $self->{user_params}->{txt_report} . "</dd>";
972 print $fh_out "</dl>";
973
974 print $fh_out "<p>Counts:</p>";
975 print $fh_out "<dl class='dl-horizontal text-overflow'>";
976 print $fh_out "<dt>$count_g2p_genes</dt>";
977 print $fh_out "<dd>G2P genes</dd>";
978 print $fh_out "<dt>$count_in_vcf_file</dt>";
979 print $fh_out "<dd>G2P genes in input VCF file</dd>";
980 print $fh_out "<dt>$count_complete_genes</dt>";
981 print $fh_out "<dd>G2P complete genes in input VCF file</dd>";
982 print $fh_out "</dl>";
983
984
985 print $fh_out "<h1>Summary of G2P complete genes per individual</h1>";
986 print $fh_out "<p>G2P complete gene: A sufficient number of variant hits for the observed allelic requirement in at least one of the gene's transcripts. Variants are filtered by frequency.</p>";
987 print $fh_out "<p>Frequency thresholds and number of required variant hits for each allelic requirement:</p>";
988
989 print $fh_out "<table class='table table-bordered'>";
990 print $fh_out "<thead>";
991 print $fh_out "<tr><th>Allelic requirement</th><th>Frequency threshold for filtering</th><th>Variant counts by zygosity</th></tr>";
992 print $fh_out "</thead>";
993 print $fh_out "<tbody>";
994 foreach my $ar (sort keys %$allelic_requirements) {
995 my $af = $allelic_requirements->{$ar}->{af};
996 my $rules = $allelic_requirements->{$ar}->{rules};
997 my $rule = join(' OR ', map {"$_ >= $rules->{$_}"} keys %$rules);
998 print $fh_out "<tr><td>$ar</td><td>$af</td><td>$rule</td></tr>";
999 }
1000 print $fh_out "</tbody>";
1001 print $fh_out "</table>";
1002
1003 my $switch =<<SHTML;
1004 <form>
1005 <div class="checkbox">
1006 <label>
1007 <input class="target" type="checkbox"> Show only canonical transcript
1008 </label>
1009 </div>
1010 </form>
1011 SHTML
1012
1013 print $fh_out $switch;
1014
1015 foreach my $individual (sort keys %$chart_data) {
1016 foreach my $gene_symbol (keys %{$chart_data->{$individual}}) {
1017 foreach my $ar (keys %{$chart_data->{$individual}->{$gene_symbol}}) {
1018 print $fh_out "<ul>\n";
1019 foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_symbol}->{$ar}}) {
1020 my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical';
1021 print $fh_out "<li><a class=\"$class\" href=\"#$individual\_$gene_symbol\_$ar\_$transcript_stable_id\">" . "$individual &gt; $gene_symbol &gt; $ar &gt; $transcript_stable_id" . "</a> </li>\n";
1022 }
1023 print $fh_out "</ul>\n";
1024 }
1025 }
1026 }
1027
1028 foreach my $individual (sort keys %$chart_data) {
1029 foreach my $gene_symbol (keys %{$chart_data->{$individual}}) {
1030 foreach my $ar (keys %{$chart_data->{$individual}->{$gene_symbol}}) {
1031 foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_symbol}->{$ar}}) {
1032 my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical';
1033 print $fh_out "<div class=\"$class\">";
1034 my $name = "$individual\_$gene_symbol\_$ar\_$transcript_stable_id";
1035 my $title = "$individual &gt; $gene_symbol &gt; $ar &gt; $transcript_stable_id";
1036 print $fh_out "<h3><a name=\"$name\"></a>$title <a title=\"Back to Top\" data-toggle=\"tooltip\" href='#top'><span class=\"glyphicon glyphicon-arrow-up\" aria-hidden=\"true\"></span></a></h3>\n";
1037 print $fh_out "<div class=\"table-responsive\" style=\"width:100%\">\n";
1038 print $fh_out "<TABLE class=\"table table-bordered table-condensed\" style=\"margin-left: 2em\">";
1039 print $fh_out "<thead>\n";
1040 print $fh_out "<tr>" . join('', map {"<th>$_</th>"} @new_header) . "</tr>\n";
1041 print $fh_out "</thead>\n";
1042 print $fh_out "<tbody>\n";
1043 foreach my $vf_data (@{$chart_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}) {
1044 my $data_row = $vf_data->[0];
1045 my @tds = ();
1046 foreach my $cell (@$data_row) {
1047 my $value = $cell->[0];
1048 my $class = $cell->[1];
1049 if ($class) {
1050 push @tds, "<td class=\"$class\">$value</td>";
1051 } else {
1052 push @tds, "<td>$value</td>";
1053 }
1054 }
1055 print $fh_out "<tr>", join('', @tds), "</tr>\n";
1056 }
1057 print $fh_out "</tbody>\n";
1058 print $fh_out "</TABLE>\n";
1059 print $fh_out "</div>\n";
1060 print $fh_out "</div>\n";
1061 }
1062 }
1063 }
1064 }
1065 print $fh_out stats_html_tail();
1066 }
1067
1068 sub chart_and_txt_data {
1069 my $self = shift;
1070 my $result_summary = shift;
1071 my $individuals = $result_summary->{individuals};
1072 my $complete_genes = $result_summary->{complete_genes};
1073 my $acting_ars = $result_summary->{acting_ars};
1074 my $new_order = $result_summary->{new_order};
1075
1076 # my @frequencies_header = sort keys $af_key_2_population_name;
1077 my @frequencies_header = sort @{$self->{user_params}->{af_keys}};
1078
1079 my $assembly = $self->{config}->{assembly};
1080 my $transcripts = {};
1081 my $canonical_transcripts = {};
1082 my $transcript_adaptor = $self->{config}->{ta};
1083 my $chart_data = {};
1084 my $txt_output_data = {};
1085
1086 my $prediction2bgcolor = {
1087 'probably damaging' => 'danger',
1088 'deleterious' => 'danger',
1089 'possibly damaging' => 'warning',
1090 'unknown' => 'warning',
1091 'benign' => 'success',
1092 'tolerated' => 'success',
1093 };
1094
1095 foreach my $individual (sort keys %$new_order) {
1096 foreach my $gene_symbol (keys %{$new_order->{$individual}}) {
1097 foreach my $ar (keys %{$new_order->{$individual}->{$gene_symbol}}) {
1098 foreach my $transcript_stable_id (keys %{$new_order->{$individual}->{$gene_symbol}->{$ar}}) {
1099 foreach my $vf_name (keys %{$new_order->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}) {
1100 my $data = $individuals->{$individual}->{$gene_symbol}->{$vf_name}->{$transcript_stable_id};
1101
1102 my $hash = {};
1103 foreach my $pair (split/;/, $data) {
1104 my ($key, $value) = split('=', $pair, 2);
1105 $value ||= '';
1106 $hash->{$key} = $value;
1107 }
1108 my $vf_location = $hash->{vf_location};
1109 my $existing_name = $hash->{existing_name};
1110 if ($existing_name ne 'NA') {
1111 $existing_name = "<a href=\"http://$assembly.ensembl.org/Homo_sapiens/Variation/Explore?v=$existing_name\">$existing_name</a>";
1112 }
1113 my $refseq = $hash->{refseq};
1114 my $failed = $hash->{failed};
1115 my $clin_sign = $hash->{clin_sig};
1116 my $novel = $hash->{novel};
1117 my $hgvs_t = $hash->{hgvs_t};
1118 my $hgvs_p = $hash->{hgvs_p};
1119 my $allelic_requirement = $hash->{allele_requirement};
1120 my $observed_allelic_requirement = $hash->{ar_in_g2pdb};
1121 my $consequence_types = $hash->{consequence_types};
1122 my $zygosity = $hash->{zyg};
1123 my $sift_score = $hash->{sift_score} || '0.0';
1124 my $sift_prediction = $hash->{sift_prediction};
1125 my $sift = 'NA';
1126 my $sift_class = '';
1127 if ($sift_prediction ne 'NA') {
1128 $sift = "$sift_prediction(" . "$sift_score)";
1129 $sift_class = $prediction2bgcolor->{$sift_prediction};
1130 }
1131 my $polyphen_score = $hash->{polyphen_score} || '0.0';
1132 my $polyphen_prediction = $hash->{polyphen_prediction};
1133 my $polyphen = 'NA';
1134 my $polyphen_class = '';
1135 if ($polyphen_prediction ne 'NA') {
1136 $polyphen = "$polyphen_prediction($polyphen_score)";
1137 $polyphen_class = $prediction2bgcolor->{$polyphen_prediction};
1138 }
1139
1140 my %frequencies_hash = ();
1141 if ($hash->{frequencies} ne 'NA') {
1142 %frequencies_hash = split /[,=]/, $hash->{frequencies};
1143 }
1144 my @frequencies = ();
1145 my @txt_output_frequencies = ();
1146 foreach my $population (@frequencies_header) {
1147 my $frequency = $frequencies_hash{$population} || '';
1148 push @frequencies, ["$frequency"];
1149 if ($frequency) {
1150 push @txt_output_frequencies, "$population=$frequency";
1151 }
1152 }
1153 my $is_canonical = 0;
1154 if ($hash->{is_canonical}) {
1155 $is_canonical = ($hash->{is_canonical} eq 'yes') ? 1 : 0;
1156 } else {
1157 if ($transcripts->{$transcript_stable_id}) {
1158 $is_canonical = 1 if ($canonical_transcripts->{$transcript_stable_id});
1159 } else {
1160 my $transcript = $transcript_adaptor->fetch_by_stable_id($transcript_stable_id);
1161 if ($transcript) {
1162 $is_canonical = $transcript->is_canonical();
1163 $transcripts->{$transcript_stable_id} = 1;
1164 $canonical_transcripts->{$transcript_stable_id} = 1 if ($is_canonical);
1165 }
1166 }
1167 }
1168 my ($location, $alleles) = split(' ', $vf_location);
1169 $location =~ s/\-/:/;
1170 $alleles =~ s/\//:/;
1171
1172 push @{$chart_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}, [[
1173 [$vf_location],
1174 [$vf_name],
1175 [$existing_name],
1176 [$zygosity],
1177 [$observed_allelic_requirement],
1178 [$consequence_types],
1179 [$clin_sign],
1180 [$sift, $sift_class],
1181 [$polyphen, $polyphen_class],
1182 [$novel],
1183 [$failed],
1184 @frequencies,
1185 [$hgvs_t],
1186 [$hgvs_p],
1187 [$refseq]
1188 ], $is_canonical];
1189
1190 my $txt_output_variant = "$location:$alleles:$zygosity:$consequence_types:SIFT=$sift:PolyPhen=$polyphen";
1191 if (@txt_output_frequencies) {
1192 $txt_output_variant .= ':' . join(',', @txt_output_frequencies);
1193 }
1194 $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{is_canonical} = $is_canonical;
1195 $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{REQ} = $observed_allelic_requirement;
1196 push @{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{variants}}, $txt_output_variant;
1197 }
1198 }
1199 }
1200 }
1201 }
1202 return {txt_data => $txt_output_data, chart_data => $chart_data, canonical_transcripts => $canonical_transcripts};
1203 }
1204
1205 sub parse_log_files {
1206 my $self = shift;
1207
1208 my $log_dir = $self->{user_params}->{log_dir};
1209 my @files = <$log_dir/*>;
1210
1211 my $genes = {};
1212 my $individuals = {};
1213 my $complete_genes = {};
1214 my $g2p_list = {};
1215 my $in_vcf_file = {};
1216 my $cache = {};
1217 my $acting_ars = {};
1218
1219 my $new_order = {};
1220
1221 foreach my $file (@files) {
1222 my $fh = FileHandle->new($file, 'r');
1223 while (<$fh>) {
1224 chomp;
1225 if (/^G2P_list/) {
1226 my ($flag, $gene_symbol, $DDD_category) = split/\t/;
1227 $g2p_list->{$gene_symbol} = 1;
1228 } elsif (/^G2P_in_vcf/) {
1229 my ($flag, $gene_symbol) = split/\t/;
1230 $in_vcf_file->{$gene_symbol} = 1;
1231 } elsif (/^G2P_complete/) {
1232 my ($flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $ars, $zyg) = split/\t/;
1233 foreach my $ar (split(',', $ars)) {
1234 if ($ar eq 'biallelic') {
1235 # homozygous, report complete
1236 if (uc($zyg) eq 'HOM') {
1237 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1;
1238 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1;
1239 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1;
1240 }
1241 # heterozygous
1242 # we need to cache that we've observed one
1243 elsif (uc($zyg) eq 'HET') {
1244 if (scalar keys %{$cache->{$individual}->{$tr_stable_id}} >= 1) {
1245 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1;
1246 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1;
1247 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1;
1248 # add first observed het variant to the list
1249 foreach my $vf (keys %{$cache->{$individual}->{$tr_stable_id}}) {
1250 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf} = 1;
1251 }
1252 }
1253 $cache->{$individual}->{$tr_stable_id}->{$vf_name}++;
1254 }
1255 }
1256 # monoallelic genes require only one allele
1257 elsif ($ar eq 'monoallelic' || $ar eq 'x-linked dominant' || $ar eq 'hemizygous' || $ar eq 'x-linked over-dominance') {
1258 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1;
1259 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1;
1260 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1;
1261 }
1262 }
1263 } elsif (/^G2P_flag/) {
1264 my ($flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $g2p_data) = split/\t/;
1265 $genes->{$gene_symbol}->{"$individual\t$vf_name"}->{$tr_stable_id} = $g2p_data;
1266 $individuals->{$individual}->{$gene_symbol}->{$vf_name}->{$tr_stable_id} = $g2p_data;
1267 } else {
1268
1269 }
1270 }
1271 $fh->close();
1272 }
1273 return {
1274 genes => $genes,
1275 individuals => $individuals,
1276 complete_genes => $complete_genes,
1277 g2p_list => $g2p_list,
1278 in_vcf_file => $in_vcf_file,
1279 acting_ars => $acting_ars,
1280 new_order => $new_order,
1281 };
1282 }
1283
1284
1285 sub stats_html_head {
1286 my $charts = shift;
1287
1288 my $html =<<SHTML;
1289 <html>
1290 <head>
1291 <title>VEP summary</title>
1292 <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
1293 <style>
1294 a.inactive {
1295 color: grey;
1296 pointer-events:none;
1297 }
1298 </style>
1299 </head>
1300 <body>
1301 SHTML
1302 return $html;
1303 }
1304
1305 sub stats_html_tail {
1306 my $script =<<SHTML;
1307 <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
1308 <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>
1309 <script type="text/javascript" src="http://www.google.com/jsapi"></script>
1310 <script>
1311 \$( "input[type=checkbox]" ).on( "click", function(){
1312 if (\$('.target').is(':checked')) {
1313 \$( "div.not_canonical" ).hide();
1314 \$("a.not_canonical").addClass("inactive");
1315 } else {
1316 \$( "div.not_canonical" ).show();
1317 \$("a.not_canonical").removeClass("inactive");
1318 }
1319 } );
1320 \$(document).ready(function(){
1321 \$('[data-toggle="tooltip"]').tooltip();
1322 });
1323 </script>
1324 SHTML
1325 return "\n</div>\n$script\n</body>\n</html>\n";
1326 }
1327
1328 sub sort_keys {
1329 my $data = shift;
1330 my $sort = shift;
1331 print $data, "\n";
1332 my @keys;
1333
1334 # sort data
1335 if(defined($sort)) {
1336 if($sort eq 'chr') {
1337 @keys = sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %{$data};
1338 }
1339 elsif($sort eq 'value') {
1340 @keys = sort {$data->{$a} <=> $data->{$b}} keys %{$data};
1341 }
1342 elsif(ref($sort) eq 'HASH') {
1343 @keys = sort {$sort->{$a} <=> $sort->{$b}} keys %{$data};
1344 }
1345 }
1346 else {
1347 @keys = keys %{$data};
1348 }
1349
1350 return \@keys;
1351 }
1352
1353 1;