|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 G2P
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv G2P.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variations.vcf --plugin G2P,file=/path/to/G2P.csv.gz
|
|
|
32
|
|
|
33 =head1 DESCRIPTION
|
|
|
34
|
|
|
35 A VEP plugin that uses G2P allelic requirements to assess variants in genes
|
|
|
36 for potential phenotype involvement.
|
|
|
37
|
|
|
38 The plugin has multiple configuration options, though minimally requires only
|
|
|
39 the CSV file of G2P data.
|
|
|
40
|
|
|
41 Options are passed to the plugin as key=value pairs, (defaults in parentheses):
|
|
|
42
|
|
|
43 file : path to G2P data file, as found at http://www.ebi.ac.uk/gene2phenotype/downloads
|
|
|
44
|
|
|
45 af_monoallelic : maximum allele frequency for inclusion for monoallelic genes (0.0001)
|
|
|
46
|
|
|
47 af_biallelic : maximum allele frequency for inclusion for biallelic genes (0.005)
|
|
|
48 all_confidence_levels : set value to 1 to include all confidence levels: confirmed, probable and possible.
|
|
|
49 Default levels are confirmed and probable.
|
|
|
50 af_keys : reference populations used for annotating variant alleles with observed
|
|
|
51 allele frequencies. Allele frequencies are stored in VEP cache files.
|
|
|
52 Default populations are:
|
|
|
53 ESP: AA, EA
|
|
|
54 1000 Genomes: AFR, AMR, EAS, EUR, SAS
|
|
|
55 gnomAD exomes: gnomAD, gnomAD_AFR, gnomAD_AMR, gnomAD_ASJ, gnomAD_EAS, gnomAD_FIN, gnomAD_NFE, gnomAD_OTH, gnomAD_SAS
|
|
|
56 Separate multiple values with '&'
|
|
|
57 af_from_vcf : set value to 1 to include allele frequencies from VCF file.
|
|
|
58 Specifiy the list of reference populations to include with --af_from_vcf_keys
|
|
|
59 af_from_vcf_keys : reference populations used for annotating variant alleles with observed
|
|
|
60 allele frequencies. Allele frequencies are retrieved from VCF files. If
|
|
|
61 af_from_vcf is set to 1 but no populations specified with --af_from_vcf_keys
|
|
|
62 all available reference populations are included.
|
|
|
63 TOPmed: TOPMed
|
|
|
64 UK10K: ALSPAC, TWINSUK
|
|
|
65 gnomAD exomes: gnomADe:AFR, gnomADe:ALL, gnomADe:AMR, gnomADe:ASJ, gnomADe:EAS, gnomADe:FIN, gnomADe:NFE, gnomADe:OTH, gnomADe:SAS
|
|
|
66 gnomAD genomes: gnomADg:AFR, gnomADg:ALL, gnomADg:AMR, gnomADg:ASJ, gnomADg:EAS, gnomADg:FIN, gnomADg:NFE, gnomADg:OTH
|
|
|
67 Separate multiple values with '&'
|
|
|
68 default_af : default frequency of the input variant if no frequency data is
|
|
|
69 found (0). This determines whether such variants are included;
|
|
|
70 the value of 0 forces variants with no frequency data to be
|
|
|
71 included as this is considered equivalent to having a frequency
|
|
|
72 of 0. Set to 1 (or any value higher than af) to exclude them.
|
|
|
73 types : SO consequence types to include. Separate multiple values with '&'
|
|
|
74 (splice_donor_variant,splice_acceptor_variant,stop_gained,
|
|
|
75 frameshift_variant,stop_lost,initiator_codon_variant,
|
|
|
76 inframe_insertion,inframe_deletion,missense_variant,
|
|
|
77 coding_sequence_variant,start_lost,transcript_ablation,
|
|
|
78 transcript_amplification,protein_altering_variant)
|
|
|
79
|
|
|
80 log_dir : write stats to log files in log_dir
|
|
|
81
|
|
|
82 txt_report : write all G2P complete genes and attributes to txt file
|
|
|
83
|
|
|
84 html_report : write all G2P complete genes and attributes to html file
|
|
|
85
|
|
|
86 Example:
|
|
|
87
|
|
|
88 --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_keys=AA&gnomAD_ASJ,types=stop_gained&frameshift_variant
|
|
|
89 --plugin G2P,file=G2P.csv,af_monoallelic=0.05,types=stop_gained&frameshift_variant
|
|
|
90 --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_from_vcf=1
|
|
|
91 --plugin G2P,file=G2P.csv
|
|
|
92
|
|
|
93 =cut
|
|
|
94
|
|
|
95 package G2P;
|
|
|
96
|
|
|
97 use strict;
|
|
|
98 use warnings;
|
|
|
99
|
|
|
100 use Cwd;
|
|
|
101 use Scalar::Util qw(looks_like_number);
|
|
|
102 use FileHandle;
|
|
|
103 use Text::CSV;
|
|
|
104 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
|
105
|
|
|
106 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
|
|
|
107
|
|
|
108 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
109
|
|
|
110 my %DEFAULTS = (
|
|
|
111
|
|
|
112 # vars must have a frequency <= to this to pass
|
|
|
113 af => 0.001,
|
|
|
114 af_monoallelic => 0.0001,
|
|
|
115 af_biallelic => 0.005,
|
|
|
116
|
|
|
117 af_keys => [qw(AA AFR AMR EA EAS EUR SAS gnomAD gnomAD_AFR gnomAD_AMR gnomAD_ASJ gnomAD_EAS gnomAD_FIN gnomAD_NFE gnomAD_OTH gnomAD_SAS)],
|
|
|
118
|
|
|
119 af_from_vcf_keys => [qw(ALSPAC TOPMed TWINSUK gnomADe:AFR gnomADe:ALL gnomADe:AMR gnomADe:ASJ gnomADe:EAS gnomADe:FIN gnomADe:NFE gnomADe:OTH gnomADe:SAS gnomADg:AFR gnomADg:ALL gnomADg:AMR gnomADg:ASJ gnomADg:EAS gnomADg:FIN gnomADg:NFE gnomADg:OTH)],
|
|
|
120
|
|
|
121 # if no MAF data is found, default to 0
|
|
|
122 # this means absence of MAF data is considered equivalent to MAF=0
|
|
|
123 # set to 1 to do the "opposite", i.e. exclude variants with no MAF data
|
|
|
124 default_af => 0,
|
|
|
125
|
|
|
126 confidence_levels => [qw(confirmed probable)],
|
|
|
127
|
|
|
128 # only include variants with these consequence types
|
|
|
129 # currently not ontology-resolved, exact term matches only
|
|
|
130 types => {map {$_ => 1} qw(splice_donor_variant splice_acceptor_variant stop_gained frameshift_variant stop_lost initiator_codon_variant inframe_insertion inframe_deletion missense_variant coding_sequence_variant start_lost transcript_ablation transcript_amplification protein_altering_variant)},
|
|
|
131
|
|
|
132 );
|
|
|
133
|
|
|
134 my $af_key_2_population_name = {
|
|
|
135 minor_allele_freq => 'global allele frequency (AF) from 1000 Genomes Phase 3 data',
|
|
|
136 AFR => '1000GENOMES:phase_3:AFR',
|
|
|
137 AMR => '1000GENOMES:phase_3:AMR',
|
|
|
138 EAS => '1000GENOMES:phase_3:EAS',
|
|
|
139 EUR => '1000GENOMES:phase_3:EUR',
|
|
|
140 SAS => '1000GENOMES:phase_3:SAS',
|
|
|
141 AA => 'Exome Sequencing Project 6500:African_American',
|
|
|
142 EA => 'Exome Sequencing Project 6500:European_American',
|
|
|
143 gnomAD => 'Genome Aggregation Database:Total',
|
|
|
144 gnomAD_AFR => 'Genome Aggregation Database exomes:African/African American',
|
|
|
145 gnomAD_AMR => 'Genome Aggregation Database exomes:Latino',
|
|
|
146 gnomAD_ASJ => 'Genome Aggregation Database exomes:Ashkenazi Jewish',
|
|
|
147 gnomAD_EAS => 'Genome Aggregation Database exomes:East Asian',
|
|
|
148 gnomAD_FIN => 'Genome Aggregation Database exomes:Finnish',
|
|
|
149 gnomAD_NFE => 'Genome Aggregation Database exomes:Non-Finnish European',
|
|
|
150 gnomAD_OTH => 'Genome Aggregation Database exomes:Other (population not assigned)',
|
|
|
151 gnomAD_SAS => 'Genome Aggregation Database exomes:South Asian',
|
|
|
152 ALSPAC => 'UK10K:ALSPAC cohort',
|
|
|
153 TOPMed => 'Trans-Omics for Precision Medicine (TOPMed) Program',
|
|
|
154 TWINSUK => 'UK10K:TWINSUK cohort',
|
|
|
155 'gnomADe:AFR' => 'Genome Aggregation Database exomes v170228',
|
|
|
156 'gnomADe:ALL' => 'Genome Aggregation Database exomes v170228',
|
|
|
157 'gnomADe:AMR' => 'Genome Aggregation Database exomes v170228',
|
|
|
158 'gnomADe:ASJ' => 'Genome Aggregation Database exomes v170228',
|
|
|
159 'gnomADe:EAS' => 'Genome Aggregation Database exomes v170228',
|
|
|
160 'gnomADe:FIN' => 'Genome Aggregation Database exomes v170228',
|
|
|
161 'gnomADe:NFE' => 'Genome Aggregation Database exomes v170228',
|
|
|
162 'gnomADe:OTH' => 'Genome Aggregation Database exomes v170228',
|
|
|
163 'gnomADe:SAS' => 'Genome Aggregation Database exomes v170228',
|
|
|
164 'gnomADg:AFR' => 'Genome Aggregation Database genomes v170228:African/African American',
|
|
|
165 'gnomADg:ALL' => 'Genome Aggregation Database genomes v170228:All gnomAD genomes individuals',
|
|
|
166 'gnomADg:AMR' => 'Genome Aggregation Database genomes v170228:Latino',
|
|
|
167 'gnomADg:ASJ' => 'Genome Aggregation Database genomes v170228:Ashkenazi Jewish',
|
|
|
168 'gnomADg:EAS' => 'Genome Aggregation Database genomes v170228:East Asian',
|
|
|
169 'gnomADg:FIN' => 'Genome Aggregation Database genomes v170228:Finnish',
|
|
|
170 'gnomADg:NFE' => 'Genome Aggregation Database genomes v170228:Non-Finnish European',
|
|
|
171 'gnomADg:OTH' => 'Genome Aggregation Database genomes v170228:Other (population not assigned)',
|
|
|
172 };
|
|
|
173
|
|
|
174 my $allelic_requirements = {
|
|
|
175 'biallelic' => { af => 0.005, rules => {HET => 2, HOM => 1} },
|
|
|
176 'monoallelic' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
|
|
|
177 'hemizygous' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
|
|
|
178 'x-linked dominant' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
|
|
|
179 'x-linked over-dominance' => { af => 0.0001, rules => {HET => 1, HOM => 1} },
|
|
|
180 };
|
|
|
181
|
|
|
182 my @allelic_requirement_terms = keys %$allelic_requirements;
|
|
|
183
|
|
|
184 my @population_wide = qw(minor_allele_freq AA AFR ALSPAC AMR EA EAS EUR SAS TOPMed TWINSUK gnomAD gnomAD_AFR gnomAD_AMR gnomAD_ASJ gnomAD_EAS gnomAD_FIN gnomAD_NFE gnomAD_OTH gnomAD_SAS gnomADe:AFR gnomADe:ALL gnomADe:AMR gnomADe:ASJ gnomADe:EAS gnomADe:FIN gnomADe:NFE gnomADe:OTH gnomADe:SAS gnomADg:AFR gnomADg:ALL gnomADg:AMR gnomADg:ASJ gnomADg:EAS gnomADg:FIN gnomADg:NFE gnomADg:OTH);
|
|
|
185
|
|
|
186 sub new {
|
|
|
187 my $class = shift;
|
|
|
188
|
|
|
189 my $self = $class->SUPER::new(@_);
|
|
|
190
|
|
|
191 my $supported_af_keys = { map {$_ => 1} @population_wide };
|
|
|
192
|
|
|
193 my $params = $self->params_to_hash();
|
|
|
194 my $file = '';
|
|
|
195
|
|
|
196 # user only supplied file as first param?
|
|
|
197 if (!keys %$params) {
|
|
|
198 $file = $self->params->[0];
|
|
|
199 }
|
|
|
200 else {
|
|
|
201 $file = $params->{file};
|
|
|
202
|
|
|
203 # process types
|
|
|
204 if ($params->{types}) {
|
|
|
205 $params->{types} = {map {$_ => 1} split(/[\;\&\|]/, $params->{types})};
|
|
|
206 }
|
|
|
207
|
|
|
208 # check af
|
|
|
209 foreach my $af (qw/af_monoallelic af_biallelic/) {
|
|
|
210 if($params->{$af}) {
|
|
|
211 die("ERROR: Invalid value for af: ".$params->{$af} . "\n") unless
|
|
|
212 looks_like_number($params->{$af}) && ($params->{$af} >= 0 && $params->{$af} <= 1)
|
|
|
213 }
|
|
|
214 }
|
|
|
215
|
|
|
216 my $assembly = $self->{config}->{assembly};
|
|
|
217 my $af_from_vcf_key_2_collection_id = {
|
|
|
218 ALSPAC => {GRCh37 => 'uk10k_GRCh37', GRCh38 => 'uk10k_GRCh38'},
|
|
|
219 TOPMed => {GRCh37 => 'topmed_GRCh37', GRCh38 => 'topmed_GRCh38'},
|
|
|
220 TWINSUK => {GRCh37 => 'uk10k_GRCh37', GRCh38 => 'uk10k_GRCh38'},
|
|
|
221 'gnomADe:AFR' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
|
|
|
222 'gnomADe:ALL' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
|
|
|
223 'gnomADe:AMR' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
|
|
|
224 'gnomADe:ASJ' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
|
|
|
225 'gnomADe:EAS' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
|
|
|
226 'gnomADe:FIN' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
|
|
|
227 'gnomADe:NFE' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
|
|
|
228 'gnomADe:OTH' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
|
|
|
229 'gnomADe:SAS' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'},
|
|
|
230 'gnomADg:AFR' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
|
|
|
231 'gnomADg:ALL' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
|
|
|
232 'gnomADg:AMR' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
|
|
|
233 'gnomADg:ASJ' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
|
|
|
234 'gnomADg:EAS' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
|
|
|
235 'gnomADg:FIN' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
|
|
|
236 'gnomADg:NFE' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
|
|
|
237 'gnomADg:OTH' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'},
|
|
|
238 };
|
|
|
239
|
|
|
240 my @keys = ();
|
|
|
241 my $vcf_collection_ids = {};
|
|
|
242 if ($params->{af_keys}) {
|
|
|
243 push @keys, $params->{af_keys};
|
|
|
244 } else {
|
|
|
245 push @keys, @{$DEFAULTS{af_keys}};
|
|
|
246 }
|
|
|
247 if ($params->{af_from_vcf}) {
|
|
|
248 if ($params->{af_from_vcf_keys}) {
|
|
|
249 push @keys, $params->{af_from_vcf_keys};
|
|
|
250 } else {
|
|
|
251 push @keys, @{$DEFAULTS{af_from_vcf_keys}};
|
|
|
252 }
|
|
|
253 }
|
|
|
254
|
|
|
255 my @af_keys = ();
|
|
|
256 foreach my $af_key_set (@keys) {
|
|
|
257 foreach my $af_key (split(/[\;\&\|]/, $af_key_set)) {
|
|
|
258 die("ERROR: af_key: " . $af_key . " not supported. Check plugin documentation for supported af_keys.\n") unless $supported_af_keys->{$af_key};
|
|
|
259 push @af_keys, $af_key;
|
|
|
260 if ($af_from_vcf_key_2_collection_id->{$af_key}) {
|
|
|
261
|
|
|
262 $vcf_collection_ids->{$af_from_vcf_key_2_collection_id->{$af_key}->{$assembly}} = 1;
|
|
|
263 }
|
|
|
264 }
|
|
|
265 }
|
|
|
266 $params->{af_keys} = \@af_keys;
|
|
|
267 $params->{vcf_collection_ids} = $vcf_collection_ids;
|
|
|
268 }
|
|
|
269
|
|
|
270 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
|
|
|
271 $year += 1900;
|
|
|
272 $mon++;
|
|
|
273 my $stamp = join('_', ($year, $mon, $mday, $hour, $min));
|
|
|
274 my $cwd_dir = getcwd;
|
|
|
275 my $new_log_dir = "$cwd_dir/g2p_log_dir\_$stamp";
|
|
|
276 my $log_dir = $params->{log_dir} || $new_log_dir;
|
|
|
277 if (-d $log_dir) {
|
|
|
278 my @files = <$log_dir/*>;
|
|
|
279 if (scalar @files > 0) {
|
|
|
280 unlink glob "'$log_dir/*.*'";
|
|
|
281 }
|
|
|
282 @files = <$log_dir/*>;
|
|
|
283 if (scalar @files > 0) {
|
|
|
284 mkdir $new_log_dir, 0755;
|
|
|
285 $params->{log_dir} = $new_log_dir;
|
|
|
286 }
|
|
|
287 } else {
|
|
|
288 mkdir $log_dir, 0755;
|
|
|
289 $params->{log_dir} = $log_dir;
|
|
|
290 }
|
|
|
291
|
|
|
292 foreach my $report_type (qw/txt_report html_report/) {
|
|
|
293 if (!$params->{$report_type}) {
|
|
|
294 my $file_type = ($report_type eq 'txt_report') ? 'txt' : 'html';
|
|
|
295 $params->{$report_type} = $cwd_dir . "/$report_type\_$stamp.$file_type";
|
|
|
296 }
|
|
|
297 }
|
|
|
298
|
|
|
299 if ($params->{all_confidence_levels}) {
|
|
|
300 push @{$params->{confidence_levels}}, 'possible', @{$DEFAULTS{confidence_levels}};
|
|
|
301 }
|
|
|
302
|
|
|
303 # copy in default params
|
|
|
304 $params->{$_} //= $DEFAULTS{$_} for keys %DEFAULTS;
|
|
|
305 $self->{user_params} = $params;
|
|
|
306
|
|
|
307 if (!defined($self->{config}->{reg})) {
|
|
|
308 my $reg = 'Bio::EnsEMBL::Registry';
|
|
|
309 $reg->load_registry_from_db(
|
|
|
310 -host => $self->config->{host},
|
|
|
311 -user => $self->config->{user},
|
|
|
312 -port => $self->config->{port},
|
|
|
313 -db_version => $self->config->{db_version},
|
|
|
314 -species => $self->config->{species},
|
|
|
315 -no_cache => $self->config->{no_slice_cache},
|
|
|
316 );
|
|
|
317 $self->{config}->{reg} = $reg;
|
|
|
318 }
|
|
|
319
|
|
|
320 my $va = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'variation');
|
|
|
321 $va->db->use_vcf(1);
|
|
|
322 $va->db->include_failed_variations(1);
|
|
|
323 $self->{config}->{va} = $va;
|
|
|
324 my $pa = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'population');
|
|
|
325 $self->{config}->{pa} = $pa;
|
|
|
326 my $vca = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'VCFCollection');
|
|
|
327 $self->{config}->{vca} = $vca;
|
|
|
328 my $ta = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'core', 'transcript');
|
|
|
329 $self->{config}->{ta} = $ta;
|
|
|
330
|
|
|
331 # read data from file
|
|
|
332 $self->{gene_data} = $self->read_gene_data_from_file($file);
|
|
|
333 $self->synonym_mappings();
|
|
|
334
|
|
|
335 # force some config params
|
|
|
336 $self->{config}->{individual} //= ['all'];
|
|
|
337 $self->{config}->{symbol} = 1;
|
|
|
338
|
|
|
339 $self->{config}->{check_existing} = 1;
|
|
|
340 $self->{config}->{failed} = 1;
|
|
|
341 $self->{config}->{af} = 1;
|
|
|
342 $self->{config}->{af_1kg} = 1;
|
|
|
343 $self->{config}->{af_esp} = 1;
|
|
|
344 $self->{config}->{af_gnomad} = 1;
|
|
|
345 # $self->{config}->{sift} = 'b';
|
|
|
346 # $self->{config}->{polyphen} = 'b';
|
|
|
347 # $self->{config}->{hgvsc} = 1;
|
|
|
348 # $self->{config}->{hgvsp} = 1;
|
|
|
349
|
|
|
350 # tell VEP we have a cache so stuff gets shared/merged between forks
|
|
|
351 $self->{has_cache} = 1;
|
|
|
352 $self->{cache}->{g2p_in_vcf} = {};
|
|
|
353
|
|
|
354 return $self;
|
|
|
355 }
|
|
|
356
|
|
|
357 sub feature_types {
|
|
|
358 return ['Transcript'];
|
|
|
359 }
|
|
|
360
|
|
|
361 sub get_header_info {
|
|
|
362 my $self = shift;
|
|
|
363
|
|
|
364 return {
|
|
|
365 G2P_flag => 'Flags zygosity of valid variants for a G2P gene',
|
|
|
366 G2P_complete => 'Indicates this variant completes the allelic requirements for a G2P gene',
|
|
|
367 G2P_gene_req => 'MONO or BI depending on the context in which this gene has been explored',
|
|
|
368 };
|
|
|
369 }
|
|
|
370
|
|
|
371 sub run {
|
|
|
372 my ($self, $tva, $line) = @_;
|
|
|
373
|
|
|
374 # only interested if we know the zygosity
|
|
|
375 my $zyg = $line->{Extra}->{ZYG} || $line->{ZYG};
|
|
|
376 return {} unless $zyg;
|
|
|
377
|
|
|
378 # only interested in given gene set
|
|
|
379 my $tr = $tva->transcript;
|
|
|
380
|
|
|
381 my $ensembl_gene_id = $tr->{_gene}->stable_id;
|
|
|
382 my $gene_symbol = $tr->{_gene_symbol} || $tr->{_gene_hgnc};
|
|
|
383 return {} unless $gene_symbol;
|
|
|
384 my $gene_data = $self->gene_data($gene_symbol);
|
|
|
385 if (! defined $gene_data) {
|
|
|
386 my $ensembl_gene_id = $tr->{_gene}->stable_id;
|
|
|
387 $gene_data = $self->gene_data($ensembl_gene_id);
|
|
|
388 }
|
|
|
389
|
|
|
390 if (!$self->{cache}->{g2p_in_vcf}->{$gene_symbol}) {
|
|
|
391 $self->write_report('G2P_in_vcf', $gene_symbol);
|
|
|
392 $self->{cache}->{g2p_in_vcf}->{$gene_symbol} = 1;
|
|
|
393 }
|
|
|
394 return {} unless defined $gene_data;
|
|
|
395
|
|
|
396 my @ars = ($gene_data->{'allelic requirement'}) ? @{$gene_data->{'allelic requirement'}} : ();
|
|
|
397 my %seen;
|
|
|
398 @ars = grep { !$seen{$_}++ } @ars;
|
|
|
399
|
|
|
400 return {} unless (@ars && ( grep { exists($allelic_requirements->{$_}) } @ars));
|
|
|
401 # limit by type
|
|
|
402 my @consequence_types = map { $_->SO_term } @{$tva->get_all_OverlapConsequences};
|
|
|
403
|
|
|
404 return {} unless grep {$self->{user_params}->{types}->{$_->SO_term}} @{$tva->get_all_OverlapConsequences};
|
|
|
405
|
|
|
406 # limit by MAF
|
|
|
407 my $threshold = 0;
|
|
|
408 my ($freqs, $existing_variant, $ar_passed) = @{$self->get_freq($tva, \@ars)};
|
|
|
409
|
|
|
410 return {} if (!keys %$ar_passed);
|
|
|
411
|
|
|
412 my $vf = $tva->base_variation_feature;
|
|
|
413 my $allele = $tva->variation_feature_seq;
|
|
|
414 my $start = $vf->{start};
|
|
|
415 my $end = $vf->{end};
|
|
|
416
|
|
|
417 my $individual = $vf->{individual};
|
|
|
418 my $vf_name = $vf->variation_name;
|
|
|
419 if ($vf_name || $vf_name eq '.') {
|
|
|
420 $vf_name = ($vf->{original_chr} || $vf->{chr}) . '_' . $vf->{start} . '_' . ($vf->{allele_string} || $vf->{class_SO_term});
|
|
|
421 }
|
|
|
422 my $allele_string = $vf->{allele_string};
|
|
|
423 my @alleles = split('/', $allele_string);
|
|
|
424 my $ref = $alleles[0];
|
|
|
425 my $seq_region_name = $vf->{chr};
|
|
|
426
|
|
|
427 my $params = $self->{user_params};
|
|
|
428 my $refseq = $tr->{_refseq} || 'NA';
|
|
|
429 my $tr_stable_id = $tr->stable_id;
|
|
|
430 my $hgvs_t = $tva->hgvs_transcript || 'NA';
|
|
|
431 my $hgvs_p = $tva->hgvs_protein || 'NA';
|
|
|
432
|
|
|
433 my ($clin_sig, $novel, $failed, $frequencies, $existing_name) = ('NA', 'yes', 'NA', 'NA', 'NA');
|
|
|
434 if ($existing_variant) {
|
|
|
435 $clin_sig = $existing_variant->{clin_sig} || 'NA';
|
|
|
436 $failed = ($existing_variant->{failed}) ? 'yes' : 'no';
|
|
|
437 $existing_name = $existing_variant->{variation_name} || 'NA';
|
|
|
438 $novel = 'no';
|
|
|
439 }
|
|
|
440
|
|
|
441 my $pph_score = (defined $tva->polyphen_score) ? $tva->polyphen_score : 'NA';
|
|
|
442 my $pph_pred = (defined $tva->polyphen_prediction) ? $tva->polyphen_prediction : 'NA';
|
|
|
443 my $sift_score = (defined $tva->sift_score) ? $tva->sift_score : 'NA';
|
|
|
444 my $sift_pred = (defined $tva->sift_prediction) ? $tva->sift_prediction : 'NA';
|
|
|
445
|
|
|
446 if (scalar keys %$freqs > 0) {
|
|
|
447 $frequencies = join(',', map {"$_=$freqs->{$_}"} keys %$freqs);
|
|
|
448 }
|
|
|
449
|
|
|
450 my $ar = join(',', sort keys %$ar_passed);
|
|
|
451 my $ar_in_g2pdb = join(',', sort @ars);
|
|
|
452 my $g2p_data = {
|
|
|
453 'zyg' => $zyg,
|
|
|
454 'allele_requirement' => $ar,
|
|
|
455 'ar_in_g2pdb' => $ar_in_g2pdb,
|
|
|
456 'frequencies' => $frequencies,
|
|
|
457 'consequence_types' => join(',', @consequence_types),
|
|
|
458 'refseq' => $refseq,
|
|
|
459 'failed' => $failed,
|
|
|
460 'clin_sig' => $clin_sig,
|
|
|
461 'novel' => $novel,
|
|
|
462 'existing_name' => $existing_name,
|
|
|
463 'hgvs_t' => $hgvs_t,
|
|
|
464 'hgvs_p' => $hgvs_p,
|
|
|
465 'vf_location' => "$seq_region_name:$start-$end $ref/$allele",
|
|
|
466 'sift_score' => "$sift_score",
|
|
|
467 'sift_prediction' => $sift_pred,
|
|
|
468 'polyphen_score' => "$pph_score",
|
|
|
469 'polyphen_prediction' => $pph_pred,
|
|
|
470 };
|
|
|
471
|
|
|
472 my %return = (
|
|
|
473 G2P_flag => $zyg
|
|
|
474 );
|
|
|
475
|
|
|
476
|
|
|
477 $self->write_report('G2P_flag', $gene_symbol, $tr_stable_id, $individual, $vf_name, $g2p_data);
|
|
|
478
|
|
|
479 $self->write_report('G2P_complete', $gene_symbol, $tr_stable_id, $individual, $vf_name, $ar, $zyg);
|
|
|
480
|
|
|
481 my $cache = $self->{cache}->{$individual}->{$tr->stable_id} ||= {};
|
|
|
482
|
|
|
483 delete $cache->{$vf_name} if exists($cache->{$vf_name});
|
|
|
484
|
|
|
485 # biallelic genes require >=1 hom or >=2 hets
|
|
|
486
|
|
|
487 my $gene_reqs = {};
|
|
|
488
|
|
|
489 foreach my $ar (keys %$ar_passed) {
|
|
|
490 if($ar eq 'biallelic') {
|
|
|
491 # homozygous, report complete
|
|
|
492 if(uc($zyg) eq 'HOM') {
|
|
|
493 $return{G2P_complete} = 1;
|
|
|
494 $gene_reqs->{BI} = 1;
|
|
|
495 }
|
|
|
496 # heterozygous
|
|
|
497 # we need to cache that we've observed one
|
|
|
498 elsif(uc($zyg) eq 'HET') {
|
|
|
499 if(scalar keys %$cache) {
|
|
|
500 $return{G2P_complete} = 1;
|
|
|
501 }
|
|
|
502 $cache->{$vf_name} = 1;
|
|
|
503 }
|
|
|
504 }
|
|
|
505 # monoallelic genes require only one allele
|
|
|
506 elsif($ar eq 'monoallelic' || $ar eq 'x-linked dominant' || $ar eq 'hemizygous' || $ar eq 'x-linked over-dominance') {
|
|
|
507 $return{G2P_complete} = 1;
|
|
|
508 $gene_reqs->{MONO} = 1;
|
|
|
509 }
|
|
|
510 else {
|
|
|
511 return {};
|
|
|
512 }
|
|
|
513 }
|
|
|
514 if ($return{G2P_complete}) {
|
|
|
515 $return{G2P_gene_req} = join(',', sort keys %$gene_reqs);
|
|
|
516 }
|
|
|
517
|
|
|
518 return \%return;
|
|
|
519 }
|
|
|
520
|
|
|
521 # read G2P CSV dump
|
|
|
522 # as from http://www.ebi.ac.uk/gene2phenotype/downloads
|
|
|
523 sub read_gene_data_from_file {
|
|
|
524 my $self = shift;
|
|
|
525 my $file = shift;
|
|
|
526 my $delimiter = shift;
|
|
|
527 my (@headers, %gene_data);
|
|
|
528
|
|
|
529 my $assembly = $self->{config}->{assembly};
|
|
|
530 die("ERROR: No file specified or could not read from file ".($file || '')."\n") unless $file && -e $file;
|
|
|
531
|
|
|
532 my @confidence_levels = @{$self->{user_params}->{confidence_levels}}, "\n";
|
|
|
533
|
|
|
534 # determine file type
|
|
|
535 my $file_type;
|
|
|
536 my $fh = FileHandle->new($file, 'r');
|
|
|
537 while (<$fh>) {
|
|
|
538 chomp;
|
|
|
539 if (/Model_Of_Inheritance/) {
|
|
|
540 $file_type = 'panelapp';
|
|
|
541 } elsif (/"allelic requirement"/) {
|
|
|
542 $file_type = 'g2p';
|
|
|
543 } else {
|
|
|
544 $file_type = 'unknown';
|
|
|
545 }
|
|
|
546 last;
|
|
|
547 }
|
|
|
548 $fh->close();
|
|
|
549 if ($file_type eq 'unknown') {
|
|
|
550 if ($file =~ /gz$/) {
|
|
|
551 die("ERROR: G2P plugin can only read uncompressed data");
|
|
|
552 } else {
|
|
|
553 die("ERROR: Could not recognize input file format. Format must be one of panelapp, g2p or custom. Check website for details: https://www.ebi.ac.uk/gene2phenotype/g2p_vep_plugin");
|
|
|
554 }
|
|
|
555 }
|
|
|
556
|
|
|
557 if ($file_type eq 'panelapp') {
|
|
|
558 my @headers = ();
|
|
|
559 my $csv = Text::CSV->new ({ sep_char => "\t" });
|
|
|
560 open my $fh, "<:encoding(utf8)", "$file" or die "$file: $!";
|
|
|
561 while ( my $row = $csv->getline( $fh ) ) {
|
|
|
562 unless (@headers) {
|
|
|
563 @headers = @$row;
|
|
|
564 } else {
|
|
|
565 my %tmp = map {$headers[$_] => $row->[$_]} (0..$#headers);
|
|
|
566 my $gene_symbol = $tmp{"Gene Entity Symbol"};
|
|
|
567 my $ensembl_gene_id = "";
|
|
|
568 if ($assembly eq 'GRCh37') {
|
|
|
569 $ensembl_gene_id = $tmp{"EnsemblId(GRch37)"};
|
|
|
570 } else { # GRCh38
|
|
|
571 $ensembl_gene_id = $tmp{"EnsemblId(GRch38)"};
|
|
|
572 }
|
|
|
573 if ($ensembl_gene_id) {
|
|
|
574 my @ars = ();
|
|
|
575 my $allelic_requirement_panel_app = $tmp{"Model_Of_Inheritance"};
|
|
|
576 if ($allelic_requirement_panel_app =~ m/MONOALLELIC|BOTH/) {
|
|
|
577 push @ars, 'monoallelic';
|
|
|
578 } elsif ($allelic_requirement_panel_app =~ m/BIALLELIC|BOTH/) {
|
|
|
579 push @ars, 'biallelic';
|
|
|
580 } elsif ($allelic_requirement_panel_app eq 'X-LINKED: hemizygous mutation in males, biallelic mutations in females') {
|
|
|
581 push @ars, 'hemizygous';
|
|
|
582 } elsif ($allelic_requirement_panel_all eq 'X-LINKED: hemizygous mutation in males, monoallelic mutations in females may cause disease (may be less severe, later onset than males)') {
|
|
|
583 push @ars, 'x-linked dominant';
|
|
|
584 } else {
|
|
|
585 $self->write_report('log', "no allelelic_requirement for $ensembl_gene_id");
|
|
|
586 }
|
|
|
587 foreach my $ar (@ars) {
|
|
|
588 push @{$gene_data{$ensembl_gene_id}->{"allelic requirement"}}, $ar;
|
|
|
589 }
|
|
|
590 } else {
|
|
|
591 $self->write_report('log', "no ensembl gene id for $gene_symbol");
|
|
|
592 }
|
|
|
593 }
|
|
|
594 }
|
|
|
595 $csv->eof or $csv->error_diag();
|
|
|
596 close $fh;
|
|
|
597 }
|
|
|
598
|
|
|
599 if ($file_type eq 'g2p') {
|
|
|
600 # this regexp allows for nested ",", e.g.
|
|
|
601 # item,description
|
|
|
602 # cheese,"salty,delicious"
|
|
|
603 my $re = qr/(?: "\( ( [^()""]* ) \)" | \( ( [^()]* ) \) | " ( [^"]* ) " | ( [^,]* ) ) , \s* /x;
|
|
|
604
|
|
|
605 my $fh = FileHandle->new($file, 'r');
|
|
|
606
|
|
|
607 while(<$fh>) {
|
|
|
608 chomp;
|
|
|
609 $_ =~ s/\R//g;
|
|
|
610 my @split = grep defined, "$_," =~ /$re/g;
|
|
|
611 unless(@headers) {
|
|
|
612 if ($file_type eq 'g2p') {
|
|
|
613 @headers = map {s/\"//g; $_} @split;
|
|
|
614 } else {
|
|
|
615 @headers = @split;
|
|
|
616 }
|
|
|
617 }
|
|
|
618 else {
|
|
|
619 my %tmp = map {$headers[$_] => $split[$_]} (0..$#split);
|
|
|
620 die("ERROR: Gene symbol column not found\n$_\n") unless $tmp{"gene symbol"};
|
|
|
621 my $confidence_value = $tmp{"DDD category"};
|
|
|
622 next if (!grep{$_ eq $confidence_value} @confidence_levels);
|
|
|
623 my $gene_symbol = $tmp{"gene symbol"};
|
|
|
624 $gene_data{$gene_symbol}->{"prev symbols"} = $tmp{"prev symbols"};
|
|
|
625 push @{$gene_data{$gene_symbol}->{"allelic requirement"}}, $tmp{"allelic requirement"} if ($tmp{"allelic requirement"});
|
|
|
626 $self->write_report('G2P_list', $tmp{"gene symbol"}, $tmp{"DDD category"});
|
|
|
627 }
|
|
|
628 }
|
|
|
629 $fh->close;
|
|
|
630 }
|
|
|
631 return \%gene_data;
|
|
|
632 }
|
|
|
633
|
|
|
634 # return either whole gene data hash or one gene's data
|
|
|
635 # this should allow updates to this plugin to e.g. query a REST server, for example
|
|
|
636 sub gene_data {
|
|
|
637 my ($self, $gene_symbol) = @_;
|
|
|
638 my $gene_data = $self->{gene_data}->{$gene_symbol};
|
|
|
639 if (!$gene_data) {
|
|
|
640 my $prev_gene_symbol = $self->{prev_symbol_mappings}->{$gene_symbol};
|
|
|
641 return $prev_gene_symbol ? $self->{gene_data}->{$prev_gene_symbol} : undef;
|
|
|
642 }
|
|
|
643 return $gene_data;
|
|
|
644 }
|
|
|
645
|
|
|
646 sub synonym_mappings {
|
|
|
647 my $self = shift;
|
|
|
648 my $gene_data = $self->{gene_data};
|
|
|
649 my $synonym_mappings = {};
|
|
|
650 foreach my $gene_symbol (keys %$gene_data) {
|
|
|
651 my $prev_symbols = $gene_data->{$gene_symbol}->{'prev symbols'};
|
|
|
652 if ($prev_symbols) {
|
|
|
653 foreach my $prev_symbol (split(';', $prev_symbols)) {
|
|
|
654 $synonym_mappings->{$prev_symbol} = $gene_symbol;
|
|
|
655 }
|
|
|
656 }
|
|
|
657 }
|
|
|
658 $self->{prev_symbol_mappings} = $synonym_mappings;
|
|
|
659 }
|
|
|
660
|
|
|
661 sub get_freq {
|
|
|
662 my $self = shift;
|
|
|
663 my $tva = shift;
|
|
|
664 my $ars = shift;
|
|
|
665 my $allele = $tva->variation_feature_seq;
|
|
|
666 my $vf = $tva->base_variation_feature;
|
|
|
667 reverse_comp(\$allele) if $vf->{strand} < 0;
|
|
|
668 my $vf_name = $vf->variation_name;
|
|
|
669 if ($vf_name || $vf_name eq '.') {
|
|
|
670 $vf_name = ($vf->{original_chr} || $vf->{chr}) . '_' . $vf->{start} . '_' . ($vf->{allele_string} || $vf->{class_SO_term});
|
|
|
671 }
|
|
|
672 my $cache = $self->{cache}->{$vf_name}->{_g2p_freqs} ||= {};
|
|
|
673
|
|
|
674 if (exists $cache->{$allele}->{failed}) {
|
|
|
675 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}];
|
|
|
676 }
|
|
|
677
|
|
|
678 if (exists $cache->{$allele}->{freq}) {
|
|
|
679 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}];
|
|
|
680 }
|
|
|
681
|
|
|
682 if (!$vf->{existing}) {
|
|
|
683 my $failed_ars = {};
|
|
|
684 my $freqs = {};
|
|
|
685 my $passed = $self->frequencies_from_VCF($freqs, $vf, $allele, $ars, $failed_ars);
|
|
|
686 if (!$passed) {
|
|
|
687 $cache->{$allele}->{failed} = 1;
|
|
|
688 return [{}, {}, {}];
|
|
|
689 } else {
|
|
|
690 $cache->{$allele}->{freq} = $freqs;
|
|
|
691 $cache->{$allele}->{ex_variant} = undef;
|
|
|
692 # if we get to here return all allelic requirements that passed threshold filtering
|
|
|
693 my $passed_ar = {};
|
|
|
694 foreach my $ar (@$ars) {
|
|
|
695 if (!$failed_ars->{$ar}) {
|
|
|
696 $passed_ar->{$ar} = 1;
|
|
|
697 }
|
|
|
698 }
|
|
|
699 $cache->{$allele}->{passed_ar} = $passed_ar;
|
|
|
700 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}];
|
|
|
701 }
|
|
|
702 }
|
|
|
703
|
|
|
704 my @existing_variants = @{$vf->{existing}};
|
|
|
705 # favour dbSNP variants
|
|
|
706 my @dbSNP_variants = grep {$_->{variation_name} =~ /^rs/} @existing_variants;
|
|
|
707 if (@dbSNP_variants) {
|
|
|
708 @existing_variants = @dbSNP_variants;
|
|
|
709 }
|
|
|
710 foreach my $ex (@existing_variants) {
|
|
|
711 my $existing_allele_string = $ex->{allele_string};
|
|
|
712 my $variation_name = $ex->{variation_name};
|
|
|
713 my $freqs = {};
|
|
|
714 my $failed_ars = {};
|
|
|
715 foreach my $af_key (@{$self->{user_params}->{af_keys}}) {
|
|
|
716 my $freq = $self->{user_params}->{default_af};
|
|
|
717 if ($af_key eq 'minor_allele_freq') {
|
|
|
718 if (defined $ex->{minor_allele_freq}) {
|
|
|
719 if (($ex->{minor_allele} || '') eq $allele ) {
|
|
|
720 $freq = $ex->{minor_allele_freq};
|
|
|
721 } else {
|
|
|
722 $freq = $self->correct_frequency($tva, $existing_allele_string, $ex->{minor_allele}, $ex->{minor_allele_freq}, $allele, $variation_name, $af_key, $vf_name) || $freq;
|
|
|
723 }
|
|
|
724 }
|
|
|
725 }
|
|
|
726 else {
|
|
|
727 my @pairs = split(',', $ex->{$af_key} || '');
|
|
|
728 my $found = 0;
|
|
|
729 if (scalar @pairs == 0) {
|
|
|
730 $found = 1; # no allele frequency for this population/af_key available
|
|
|
731 }
|
|
|
732 foreach my $pair (@pairs) {
|
|
|
733 my ($a, $f) = split(':', $pair);
|
|
|
734 if(($a || '') eq $allele && defined($f)) {
|
|
|
735 $freq = $f;
|
|
|
736 $found = 1;
|
|
|
737 }
|
|
|
738 }
|
|
|
739 if (!$found) {
|
|
|
740 $freq = $self->correct_frequency($tva, $existing_allele_string, undef, undef, $allele, $variation_name, $af_key, $vf_name) || $freq;
|
|
|
741 }
|
|
|
742 }
|
|
|
743 if (!$self->continue_af_annotation($ars, $failed_ars, $freq)) {
|
|
|
744 # cache failed results
|
|
|
745 $cache->{$allele}->{failed} = 1;
|
|
|
746 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}];
|
|
|
747 }
|
|
|
748 $freqs->{$af_key} = $freq if ($freq);
|
|
|
749 }
|
|
|
750 if ($self->{user_params}->{af_from_vcf}) {
|
|
|
751 my $passed = $self->frequencies_from_VCF($freqs, $vf, $allele, $ars, $failed_ars);
|
|
|
752 if (!$passed) {
|
|
|
753 $cache->{$allele}->{failed} = 1;
|
|
|
754 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}];
|
|
|
755 }
|
|
|
756 }
|
|
|
757 $cache->{$allele}->{freq} = $freqs;
|
|
|
758 $cache->{$allele}->{ex_variant} = $ex;
|
|
|
759
|
|
|
760 # if we get to here return all allelic requirements that passed threshold filtering
|
|
|
761 my $passed_ar = {};
|
|
|
762 foreach my $ar (@$ars) {
|
|
|
763 if (!$failed_ars->{$ar}) {
|
|
|
764 $passed_ar->{$ar} = 1;
|
|
|
765 }
|
|
|
766 }
|
|
|
767 $cache->{$allele}->{passed_ar} = $passed_ar;
|
|
|
768
|
|
|
769 }
|
|
|
770
|
|
|
771 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}];
|
|
|
772 }
|
|
|
773
|
|
|
774 sub correct_frequency {
|
|
|
775 my ($self, $tva, $allele_string, $minor_allele, $af, $allele, $variation_name, $af_key, $vf_name) = @_;
|
|
|
776
|
|
|
777 my @existing_alleles = split('/', $allele_string);
|
|
|
778 if (!grep( /^$allele$/, @existing_alleles)) {
|
|
|
779 return 0.0;
|
|
|
780 }
|
|
|
781
|
|
|
782 if ($af_key eq 'minor_allele_freq' && (scalar @existing_alleles == 2)) {
|
|
|
783 my $existing_ref_allele = $existing_alleles[0];
|
|
|
784 my $existing_alt_allele = $existing_alleles[1];
|
|
|
785 if ( ($minor_allele eq $existing_ref_allele && ($allele eq $existing_alt_allele)) ||
|
|
|
786 ($minor_allele eq $existing_alt_allele && ($allele eq $existing_ref_allele)) ) {
|
|
|
787 return (1.0 - $af);
|
|
|
788 }
|
|
|
789 } else {
|
|
|
790 my $va = $self->{config}->{va};
|
|
|
791 my $pa = $self->{config}->{pa};
|
|
|
792 my $variation = $va->fetch_by_name($variation_name);
|
|
|
793 my $af_key = $self->{user_params}->{af_keys};
|
|
|
794 my $population_name = $af_key_2_population_name->{$af_key};
|
|
|
795 if ($population_name) {
|
|
|
796 my $population = $self->{config}->{$population_name};
|
|
|
797 if (!$population) {
|
|
|
798 $population = $pa->fetch_by_name($population_name);
|
|
|
799 $self->{config}->{$population_name} = $population;
|
|
|
800 }
|
|
|
801 foreach (@{$variation->get_all_Alleles($population)}) {
|
|
|
802 if ($_->allele eq $allele) {
|
|
|
803 return $_->frequency;
|
|
|
804 }
|
|
|
805 }
|
|
|
806 }
|
|
|
807 }
|
|
|
808 return 0.0;
|
|
|
809 }
|
|
|
810
|
|
|
811 sub frequencies_from_VCF {
|
|
|
812 my $self = shift;
|
|
|
813 my $freqs = shift;
|
|
|
814 my $vf = shift;
|
|
|
815 my $vf_allele = shift;
|
|
|
816 my $ars = shift;
|
|
|
817 my $failed_ars = shift;
|
|
|
818 my $vca = $self->{config}->{vca};
|
|
|
819 my $collections = $vca->fetch_all;
|
|
|
820 foreach my $vc (@$collections) {
|
|
|
821 next if (! $self->{user_params}->{vcf_collection_ids}->{$vc->id});
|
|
|
822 my $alleles = $vc->get_all_Alleles_by_VariationFeature($vf);
|
|
|
823 foreach my $allele (@$alleles) {
|
|
|
824 if ($allele->allele eq $vf_allele) {
|
|
|
825 my $af_key = $allele->population->name;
|
|
|
826 my $freq = $allele->frequency;
|
|
|
827 return 0 if (!$self->continue_af_annotation($ars, $failed_ars, $freq));
|
|
|
828 $freqs->{$af_key} = $freq;
|
|
|
829 }
|
|
|
830 }
|
|
|
831 }
|
|
|
832 return 1;
|
|
|
833 }
|
|
|
834
|
|
|
835 sub continue_af_annotation {
|
|
|
836 my $self = shift;
|
|
|
837 my $ars = shift;
|
|
|
838 my $failed_ars = shift;
|
|
|
839 my $freq = shift;
|
|
|
840 foreach my $ar (@$ars) {
|
|
|
841 if (!$failed_ars->{$ar}) {
|
|
|
842 if (defined $allelic_requirements->{$ar}) {
|
|
|
843 my $threshold = $allelic_requirements->{$ar}->{af};
|
|
|
844 if ($freq > $threshold) {
|
|
|
845 $failed_ars->{$ar} = 1;
|
|
|
846 }
|
|
|
847 }
|
|
|
848 }
|
|
|
849 }
|
|
|
850 return (scalar @$ars != scalar keys %$failed_ars);
|
|
|
851 }
|
|
|
852
|
|
|
853
|
|
|
854
|
|
|
855 sub write_report {
|
|
|
856 my $self = shift;
|
|
|
857 my $flag = shift;
|
|
|
858 my $log_dir = $self->{user_params}->{log_dir};
|
|
|
859 my $log_file = "$log_dir/$$.txt";
|
|
|
860 open(my $fh, '>>', $log_file) or die "Could not open file '$flag $log_file' $!";
|
|
|
861 if ($flag eq 'G2P_list') {
|
|
|
862 my ($gene_symbol, $DDD_category) = @_;
|
|
|
863 $DDD_category ||= 'Not assigned';
|
|
|
864 print $fh "$flag\t$gene_symbol\t$DDD_category\n";
|
|
|
865 } elsif ($flag eq 'G2P_in_vcf') {
|
|
|
866 my $gene_symbol = shift;
|
|
|
867 print $fh "$flag\t$gene_symbol\n";
|
|
|
868 } elsif ($flag eq 'G2P_complete') {
|
|
|
869 print $fh join("\t", $flag, @_), "\n";
|
|
|
870 } elsif ($flag eq 'log') {
|
|
|
871 print $fh join("\t", $flag, @_), "\n";
|
|
|
872 } else {
|
|
|
873 my ($gene_symbol, $tr_stable_id, $individual, $vf_name, $data) = @_;
|
|
|
874 $data = join(';', map {"$_=$data->{$_}"} sort keys %$data);
|
|
|
875 print $fh join("\t", $flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $data), "\n";
|
|
|
876 }
|
|
|
877 close $fh;
|
|
|
878 }
|
|
|
879
|
|
|
880 sub finish {
|
|
|
881 my $self = shift;
|
|
|
882 $self->generate_report;
|
|
|
883 }
|
|
|
884
|
|
|
885 sub generate_report {
|
|
|
886 my $self = shift;
|
|
|
887 my $result_summary = $self->parse_log_files;
|
|
|
888 my $chart_txt_data = $self->chart_and_txt_data($result_summary);
|
|
|
889 my $chart_data = $chart_txt_data->{chart_data};
|
|
|
890 my $txt_data = $chart_txt_data->{txt_data};
|
|
|
891 my $canonical_transcripts = $chart_txt_data->{canonical_transcripts};
|
|
|
892 $self->write_txt_output($txt_data);
|
|
|
893 $self->write_charts($result_summary, $chart_data, $canonical_transcripts);
|
|
|
894 }
|
|
|
895
|
|
|
896 sub write_txt_output {
|
|
|
897 my $self = shift;
|
|
|
898 my $txt_output_data = shift;
|
|
|
899 my $txt_output_file = $self->{user_params}->{txt_report};
|
|
|
900 my $fh_txt = FileHandle->new($txt_output_file, 'w');
|
|
|
901 foreach my $individual (sort keys %$txt_output_data) {
|
|
|
902 foreach my $gene_symbol (keys %{$txt_output_data->{$individual}}) {
|
|
|
903 foreach my $ar (keys %{$txt_output_data->{$individual}->{$gene_symbol}}) {
|
|
|
904 foreach my $tr_stable_id (keys %{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}}) {
|
|
|
905 my $is_canonical = $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{is_canonical};
|
|
|
906 my $canonical_tag = ($is_canonical) ? 'is_canonical' : 'not_canonical';
|
|
|
907 my $req = $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{REQ};
|
|
|
908 my $variants = join(';', @{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{variants}});
|
|
|
909 print $fh_txt join("\t", $individual, $gene_symbol, $tr_stable_id, $canonical_tag, "OBS=$ar", "REQ=$req", $variants), "\n";
|
|
|
910 }
|
|
|
911 }
|
|
|
912 }
|
|
|
913 }
|
|
|
914 $fh_txt->close();
|
|
|
915 }
|
|
|
916
|
|
|
917 sub write_charts {
|
|
|
918 my $self = shift;
|
|
|
919 my $result_summary = shift;
|
|
|
920 my $chart_data = shift;
|
|
|
921 my $canonical_transcripts = shift;
|
|
|
922
|
|
|
923 my $count_g2p_genes = keys %{$result_summary->{g2p_list}};
|
|
|
924 my $count_in_vcf_file = keys %{$result_summary->{in_vcf_file}};
|
|
|
925 my $count_complete_genes = scalar keys %{$result_summary->{complete_genes}};
|
|
|
926
|
|
|
927 my @charts = ();
|
|
|
928 my @frequencies_header = ();
|
|
|
929
|
|
|
930 foreach my $short_name (sort @{$self->{user_params}->{af_keys}}) {
|
|
|
931 my $text = $af_key_2_population_name->{$short_name};
|
|
|
932 push @frequencies_header, "<a style=\"cursor: pointer\" data-placement=\"top\" data-toggle=\"tooltip\" data-container=\"body\" title=\"$text\">$short_name</a>";
|
|
|
933 }
|
|
|
934
|
|
|
935 my $count = 1;
|
|
|
936 my @new_header = (
|
|
|
937 'Variant location and alleles (REF/ALT)',
|
|
|
938 'Variant name',
|
|
|
939 'Existing name',
|
|
|
940 'Zygosity',
|
|
|
941 'All allelic requirements from G2P DB',
|
|
|
942 'Consequence types',
|
|
|
943 'ClinVar annotation',
|
|
|
944 'SIFT',
|
|
|
945 'PolyPhen',
|
|
|
946 'Novel variant',
|
|
|
947 'Has been failed by Ensembl',
|
|
|
948 @frequencies_header,
|
|
|
949 'HGVS transcript',
|
|
|
950 'HGVS protein',
|
|
|
951 'RefSeq IDs',
|
|
|
952 );
|
|
|
953
|
|
|
954 my $html_output_file = $self->{user_params}->{html_report};
|
|
|
955 my $fh_out = FileHandle->new($html_output_file, 'w');
|
|
|
956 print $fh_out stats_html_head(\@charts);
|
|
|
957 print $fh_out "<div class='main_content container'>";
|
|
|
958
|
|
|
959
|
|
|
960 print $fh_out "<h1>G2P report</h1>";
|
|
|
961 print $fh_out "<p>Input and output files:</p>";
|
|
|
962
|
|
|
963 print $fh_out "<dl class='dl-horizontal'>";
|
|
|
964 print $fh_out "<dt>G2P list</dt>";
|
|
|
965 print $fh_out "<dd>" . $self->{user_params}->{file} . "</dd>";
|
|
|
966 print $fh_out "<dt>Log directory</dt>";
|
|
|
967 print $fh_out "<dd>" . $self->{user_params}->{log_dir} . "</dd>";
|
|
|
968 print $fh_out "<dt>HTML report</dt>";
|
|
|
969 print $fh_out "<dd>" . $self->{user_params}->{html_report} . "</dd>";
|
|
|
970 print $fh_out "<dt>TXT report</dt>";
|
|
|
971 print $fh_out "<dd>" . $self->{user_params}->{txt_report} . "</dd>";
|
|
|
972 print $fh_out "</dl>";
|
|
|
973
|
|
|
974 print $fh_out "<p>Counts:</p>";
|
|
|
975 print $fh_out "<dl class='dl-horizontal text-overflow'>";
|
|
|
976 print $fh_out "<dt>$count_g2p_genes</dt>";
|
|
|
977 print $fh_out "<dd>G2P genes</dd>";
|
|
|
978 print $fh_out "<dt>$count_in_vcf_file</dt>";
|
|
|
979 print $fh_out "<dd>G2P genes in input VCF file</dd>";
|
|
|
980 print $fh_out "<dt>$count_complete_genes</dt>";
|
|
|
981 print $fh_out "<dd>G2P complete genes in input VCF file</dd>";
|
|
|
982 print $fh_out "</dl>";
|
|
|
983
|
|
|
984
|
|
|
985 print $fh_out "<h1>Summary of G2P complete genes per individual</h1>";
|
|
|
986 print $fh_out "<p>G2P complete gene: A sufficient number of variant hits for the observed allelic requirement in at least one of the gene's transcripts. Variants are filtered by frequency.</p>";
|
|
|
987 print $fh_out "<p>Frequency thresholds and number of required variant hits for each allelic requirement:</p>";
|
|
|
988
|
|
|
989 print $fh_out "<table class='table table-bordered'>";
|
|
|
990 print $fh_out "<thead>";
|
|
|
991 print $fh_out "<tr><th>Allelic requirement</th><th>Frequency threshold for filtering</th><th>Variant counts by zygosity</th></tr>";
|
|
|
992 print $fh_out "</thead>";
|
|
|
993 print $fh_out "<tbody>";
|
|
|
994 foreach my $ar (sort keys %$allelic_requirements) {
|
|
|
995 my $af = $allelic_requirements->{$ar}->{af};
|
|
|
996 my $rules = $allelic_requirements->{$ar}->{rules};
|
|
|
997 my $rule = join(' OR ', map {"$_ >= $rules->{$_}"} keys %$rules);
|
|
|
998 print $fh_out "<tr><td>$ar</td><td>$af</td><td>$rule</td></tr>";
|
|
|
999 }
|
|
|
1000 print $fh_out "</tbody>";
|
|
|
1001 print $fh_out "</table>";
|
|
|
1002
|
|
|
1003 my $switch =<<SHTML;
|
|
|
1004 <form>
|
|
|
1005 <div class="checkbox">
|
|
|
1006 <label>
|
|
|
1007 <input class="target" type="checkbox"> Show only canonical transcript
|
|
|
1008 </label>
|
|
|
1009 </div>
|
|
|
1010 </form>
|
|
|
1011 SHTML
|
|
|
1012
|
|
|
1013 print $fh_out $switch;
|
|
|
1014
|
|
|
1015 foreach my $individual (sort keys %$chart_data) {
|
|
|
1016 foreach my $gene_symbol (keys %{$chart_data->{$individual}}) {
|
|
|
1017 foreach my $ar (keys %{$chart_data->{$individual}->{$gene_symbol}}) {
|
|
|
1018 print $fh_out "<ul>\n";
|
|
|
1019 foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_symbol}->{$ar}}) {
|
|
|
1020 my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical';
|
|
|
1021 print $fh_out "<li><a class=\"$class\" href=\"#$individual\_$gene_symbol\_$ar\_$transcript_stable_id\">" . "$individual > $gene_symbol > $ar > $transcript_stable_id" . "</a> </li>\n";
|
|
|
1022 }
|
|
|
1023 print $fh_out "</ul>\n";
|
|
|
1024 }
|
|
|
1025 }
|
|
|
1026 }
|
|
|
1027
|
|
|
1028 foreach my $individual (sort keys %$chart_data) {
|
|
|
1029 foreach my $gene_symbol (keys %{$chart_data->{$individual}}) {
|
|
|
1030 foreach my $ar (keys %{$chart_data->{$individual}->{$gene_symbol}}) {
|
|
|
1031 foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_symbol}->{$ar}}) {
|
|
|
1032 my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical';
|
|
|
1033 print $fh_out "<div class=\"$class\">";
|
|
|
1034 my $name = "$individual\_$gene_symbol\_$ar\_$transcript_stable_id";
|
|
|
1035 my $title = "$individual > $gene_symbol > $ar > $transcript_stable_id";
|
|
|
1036 print $fh_out "<h3><a name=\"$name\"></a>$title <a title=\"Back to Top\" data-toggle=\"tooltip\" href='#top'><span class=\"glyphicon glyphicon-arrow-up\" aria-hidden=\"true\"></span></a></h3>\n";
|
|
|
1037 print $fh_out "<div class=\"table-responsive\" style=\"width:100%\">\n";
|
|
|
1038 print $fh_out "<TABLE class=\"table table-bordered table-condensed\" style=\"margin-left: 2em\">";
|
|
|
1039 print $fh_out "<thead>\n";
|
|
|
1040 print $fh_out "<tr>" . join('', map {"<th>$_</th>"} @new_header) . "</tr>\n";
|
|
|
1041 print $fh_out "</thead>\n";
|
|
|
1042 print $fh_out "<tbody>\n";
|
|
|
1043 foreach my $vf_data (@{$chart_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}) {
|
|
|
1044 my $data_row = $vf_data->[0];
|
|
|
1045 my @tds = ();
|
|
|
1046 foreach my $cell (@$data_row) {
|
|
|
1047 my $value = $cell->[0];
|
|
|
1048 my $class = $cell->[1];
|
|
|
1049 if ($class) {
|
|
|
1050 push @tds, "<td class=\"$class\">$value</td>";
|
|
|
1051 } else {
|
|
|
1052 push @tds, "<td>$value</td>";
|
|
|
1053 }
|
|
|
1054 }
|
|
|
1055 print $fh_out "<tr>", join('', @tds), "</tr>\n";
|
|
|
1056 }
|
|
|
1057 print $fh_out "</tbody>\n";
|
|
|
1058 print $fh_out "</TABLE>\n";
|
|
|
1059 print $fh_out "</div>\n";
|
|
|
1060 print $fh_out "</div>\n";
|
|
|
1061 }
|
|
|
1062 }
|
|
|
1063 }
|
|
|
1064 }
|
|
|
1065 print $fh_out stats_html_tail();
|
|
|
1066 }
|
|
|
1067
|
|
|
1068 sub chart_and_txt_data {
|
|
|
1069 my $self = shift;
|
|
|
1070 my $result_summary = shift;
|
|
|
1071 my $individuals = $result_summary->{individuals};
|
|
|
1072 my $complete_genes = $result_summary->{complete_genes};
|
|
|
1073 my $acting_ars = $result_summary->{acting_ars};
|
|
|
1074 my $new_order = $result_summary->{new_order};
|
|
|
1075
|
|
|
1076 # my @frequencies_header = sort keys $af_key_2_population_name;
|
|
|
1077 my @frequencies_header = sort @{$self->{user_params}->{af_keys}};
|
|
|
1078
|
|
|
1079 my $assembly = $self->{config}->{assembly};
|
|
|
1080 my $transcripts = {};
|
|
|
1081 my $canonical_transcripts = {};
|
|
|
1082 my $transcript_adaptor = $self->{config}->{ta};
|
|
|
1083 my $chart_data = {};
|
|
|
1084 my $txt_output_data = {};
|
|
|
1085
|
|
|
1086 my $prediction2bgcolor = {
|
|
|
1087 'probably damaging' => 'danger',
|
|
|
1088 'deleterious' => 'danger',
|
|
|
1089 'possibly damaging' => 'warning',
|
|
|
1090 'unknown' => 'warning',
|
|
|
1091 'benign' => 'success',
|
|
|
1092 'tolerated' => 'success',
|
|
|
1093 };
|
|
|
1094
|
|
|
1095 foreach my $individual (sort keys %$new_order) {
|
|
|
1096 foreach my $gene_symbol (keys %{$new_order->{$individual}}) {
|
|
|
1097 foreach my $ar (keys %{$new_order->{$individual}->{$gene_symbol}}) {
|
|
|
1098 foreach my $transcript_stable_id (keys %{$new_order->{$individual}->{$gene_symbol}->{$ar}}) {
|
|
|
1099 foreach my $vf_name (keys %{$new_order->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}) {
|
|
|
1100 my $data = $individuals->{$individual}->{$gene_symbol}->{$vf_name}->{$transcript_stable_id};
|
|
|
1101
|
|
|
1102 my $hash = {};
|
|
|
1103 foreach my $pair (split/;/, $data) {
|
|
|
1104 my ($key, $value) = split('=', $pair, 2);
|
|
|
1105 $value ||= '';
|
|
|
1106 $hash->{$key} = $value;
|
|
|
1107 }
|
|
|
1108 my $vf_location = $hash->{vf_location};
|
|
|
1109 my $existing_name = $hash->{existing_name};
|
|
|
1110 if ($existing_name ne 'NA') {
|
|
|
1111 $existing_name = "<a href=\"http://$assembly.ensembl.org/Homo_sapiens/Variation/Explore?v=$existing_name\">$existing_name</a>";
|
|
|
1112 }
|
|
|
1113 my $refseq = $hash->{refseq};
|
|
|
1114 my $failed = $hash->{failed};
|
|
|
1115 my $clin_sign = $hash->{clin_sig};
|
|
|
1116 my $novel = $hash->{novel};
|
|
|
1117 my $hgvs_t = $hash->{hgvs_t};
|
|
|
1118 my $hgvs_p = $hash->{hgvs_p};
|
|
|
1119 my $allelic_requirement = $hash->{allele_requirement};
|
|
|
1120 my $observed_allelic_requirement = $hash->{ar_in_g2pdb};
|
|
|
1121 my $consequence_types = $hash->{consequence_types};
|
|
|
1122 my $zygosity = $hash->{zyg};
|
|
|
1123 my $sift_score = $hash->{sift_score} || '0.0';
|
|
|
1124 my $sift_prediction = $hash->{sift_prediction};
|
|
|
1125 my $sift = 'NA';
|
|
|
1126 my $sift_class = '';
|
|
|
1127 if ($sift_prediction ne 'NA') {
|
|
|
1128 $sift = "$sift_prediction(" . "$sift_score)";
|
|
|
1129 $sift_class = $prediction2bgcolor->{$sift_prediction};
|
|
|
1130 }
|
|
|
1131 my $polyphen_score = $hash->{polyphen_score} || '0.0';
|
|
|
1132 my $polyphen_prediction = $hash->{polyphen_prediction};
|
|
|
1133 my $polyphen = 'NA';
|
|
|
1134 my $polyphen_class = '';
|
|
|
1135 if ($polyphen_prediction ne 'NA') {
|
|
|
1136 $polyphen = "$polyphen_prediction($polyphen_score)";
|
|
|
1137 $polyphen_class = $prediction2bgcolor->{$polyphen_prediction};
|
|
|
1138 }
|
|
|
1139
|
|
|
1140 my %frequencies_hash = ();
|
|
|
1141 if ($hash->{frequencies} ne 'NA') {
|
|
|
1142 %frequencies_hash = split /[,=]/, $hash->{frequencies};
|
|
|
1143 }
|
|
|
1144 my @frequencies = ();
|
|
|
1145 my @txt_output_frequencies = ();
|
|
|
1146 foreach my $population (@frequencies_header) {
|
|
|
1147 my $frequency = $frequencies_hash{$population} || '';
|
|
|
1148 push @frequencies, ["$frequency"];
|
|
|
1149 if ($frequency) {
|
|
|
1150 push @txt_output_frequencies, "$population=$frequency";
|
|
|
1151 }
|
|
|
1152 }
|
|
|
1153 my $is_canonical = 0;
|
|
|
1154 if ($hash->{is_canonical}) {
|
|
|
1155 $is_canonical = ($hash->{is_canonical} eq 'yes') ? 1 : 0;
|
|
|
1156 } else {
|
|
|
1157 if ($transcripts->{$transcript_stable_id}) {
|
|
|
1158 $is_canonical = 1 if ($canonical_transcripts->{$transcript_stable_id});
|
|
|
1159 } else {
|
|
|
1160 my $transcript = $transcript_adaptor->fetch_by_stable_id($transcript_stable_id);
|
|
|
1161 if ($transcript) {
|
|
|
1162 $is_canonical = $transcript->is_canonical();
|
|
|
1163 $transcripts->{$transcript_stable_id} = 1;
|
|
|
1164 $canonical_transcripts->{$transcript_stable_id} = 1 if ($is_canonical);
|
|
|
1165 }
|
|
|
1166 }
|
|
|
1167 }
|
|
|
1168 my ($location, $alleles) = split(' ', $vf_location);
|
|
|
1169 $location =~ s/\-/:/;
|
|
|
1170 $alleles =~ s/\//:/;
|
|
|
1171
|
|
|
1172 push @{$chart_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}, [[
|
|
|
1173 [$vf_location],
|
|
|
1174 [$vf_name],
|
|
|
1175 [$existing_name],
|
|
|
1176 [$zygosity],
|
|
|
1177 [$observed_allelic_requirement],
|
|
|
1178 [$consequence_types],
|
|
|
1179 [$clin_sign],
|
|
|
1180 [$sift, $sift_class],
|
|
|
1181 [$polyphen, $polyphen_class],
|
|
|
1182 [$novel],
|
|
|
1183 [$failed],
|
|
|
1184 @frequencies,
|
|
|
1185 [$hgvs_t],
|
|
|
1186 [$hgvs_p],
|
|
|
1187 [$refseq]
|
|
|
1188 ], $is_canonical];
|
|
|
1189
|
|
|
1190 my $txt_output_variant = "$location:$alleles:$zygosity:$consequence_types:SIFT=$sift:PolyPhen=$polyphen";
|
|
|
1191 if (@txt_output_frequencies) {
|
|
|
1192 $txt_output_variant .= ':' . join(',', @txt_output_frequencies);
|
|
|
1193 }
|
|
|
1194 $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{is_canonical} = $is_canonical;
|
|
|
1195 $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{REQ} = $observed_allelic_requirement;
|
|
|
1196 push @{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{variants}}, $txt_output_variant;
|
|
|
1197 }
|
|
|
1198 }
|
|
|
1199 }
|
|
|
1200 }
|
|
|
1201 }
|
|
|
1202 return {txt_data => $txt_output_data, chart_data => $chart_data, canonical_transcripts => $canonical_transcripts};
|
|
|
1203 }
|
|
|
1204
|
|
|
1205 sub parse_log_files {
|
|
|
1206 my $self = shift;
|
|
|
1207
|
|
|
1208 my $log_dir = $self->{user_params}->{log_dir};
|
|
|
1209 my @files = <$log_dir/*>;
|
|
|
1210
|
|
|
1211 my $genes = {};
|
|
|
1212 my $individuals = {};
|
|
|
1213 my $complete_genes = {};
|
|
|
1214 my $g2p_list = {};
|
|
|
1215 my $in_vcf_file = {};
|
|
|
1216 my $cache = {};
|
|
|
1217 my $acting_ars = {};
|
|
|
1218
|
|
|
1219 my $new_order = {};
|
|
|
1220
|
|
|
1221 foreach my $file (@files) {
|
|
|
1222 my $fh = FileHandle->new($file, 'r');
|
|
|
1223 while (<$fh>) {
|
|
|
1224 chomp;
|
|
|
1225 if (/^G2P_list/) {
|
|
|
1226 my ($flag, $gene_symbol, $DDD_category) = split/\t/;
|
|
|
1227 $g2p_list->{$gene_symbol} = 1;
|
|
|
1228 } elsif (/^G2P_in_vcf/) {
|
|
|
1229 my ($flag, $gene_symbol) = split/\t/;
|
|
|
1230 $in_vcf_file->{$gene_symbol} = 1;
|
|
|
1231 } elsif (/^G2P_complete/) {
|
|
|
1232 my ($flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $ars, $zyg) = split/\t/;
|
|
|
1233 foreach my $ar (split(',', $ars)) {
|
|
|
1234 if ($ar eq 'biallelic') {
|
|
|
1235 # homozygous, report complete
|
|
|
1236 if (uc($zyg) eq 'HOM') {
|
|
|
1237 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1;
|
|
|
1238 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1;
|
|
|
1239 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1;
|
|
|
1240 }
|
|
|
1241 # heterozygous
|
|
|
1242 # we need to cache that we've observed one
|
|
|
1243 elsif (uc($zyg) eq 'HET') {
|
|
|
1244 if (scalar keys %{$cache->{$individual}->{$tr_stable_id}} >= 1) {
|
|
|
1245 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1;
|
|
|
1246 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1;
|
|
|
1247 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1;
|
|
|
1248 # add first observed het variant to the list
|
|
|
1249 foreach my $vf (keys %{$cache->{$individual}->{$tr_stable_id}}) {
|
|
|
1250 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf} = 1;
|
|
|
1251 }
|
|
|
1252 }
|
|
|
1253 $cache->{$individual}->{$tr_stable_id}->{$vf_name}++;
|
|
|
1254 }
|
|
|
1255 }
|
|
|
1256 # monoallelic genes require only one allele
|
|
|
1257 elsif ($ar eq 'monoallelic' || $ar eq 'x-linked dominant' || $ar eq 'hemizygous' || $ar eq 'x-linked over-dominance') {
|
|
|
1258 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1;
|
|
|
1259 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1;
|
|
|
1260 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1;
|
|
|
1261 }
|
|
|
1262 }
|
|
|
1263 } elsif (/^G2P_flag/) {
|
|
|
1264 my ($flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $g2p_data) = split/\t/;
|
|
|
1265 $genes->{$gene_symbol}->{"$individual\t$vf_name"}->{$tr_stable_id} = $g2p_data;
|
|
|
1266 $individuals->{$individual}->{$gene_symbol}->{$vf_name}->{$tr_stable_id} = $g2p_data;
|
|
|
1267 } else {
|
|
|
1268
|
|
|
1269 }
|
|
|
1270 }
|
|
|
1271 $fh->close();
|
|
|
1272 }
|
|
|
1273 return {
|
|
|
1274 genes => $genes,
|
|
|
1275 individuals => $individuals,
|
|
|
1276 complete_genes => $complete_genes,
|
|
|
1277 g2p_list => $g2p_list,
|
|
|
1278 in_vcf_file => $in_vcf_file,
|
|
|
1279 acting_ars => $acting_ars,
|
|
|
1280 new_order => $new_order,
|
|
|
1281 };
|
|
|
1282 }
|
|
|
1283
|
|
|
1284
|
|
|
1285 sub stats_html_head {
|
|
|
1286 my $charts = shift;
|
|
|
1287
|
|
|
1288 my $html =<<SHTML;
|
|
|
1289 <html>
|
|
|
1290 <head>
|
|
|
1291 <title>VEP summary</title>
|
|
|
1292 <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
|
|
|
1293 <style>
|
|
|
1294 a.inactive {
|
|
|
1295 color: grey;
|
|
|
1296 pointer-events:none;
|
|
|
1297 }
|
|
|
1298 </style>
|
|
|
1299 </head>
|
|
|
1300 <body>
|
|
|
1301 SHTML
|
|
|
1302 return $html;
|
|
|
1303 }
|
|
|
1304
|
|
|
1305 sub stats_html_tail {
|
|
|
1306 my $script =<<SHTML;
|
|
|
1307 <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
|
|
|
1308 <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>
|
|
|
1309 <script type="text/javascript" src="http://www.google.com/jsapi"></script>
|
|
|
1310 <script>
|
|
|
1311 \$( "input[type=checkbox]" ).on( "click", function(){
|
|
|
1312 if (\$('.target').is(':checked')) {
|
|
|
1313 \$( "div.not_canonical" ).hide();
|
|
|
1314 \$("a.not_canonical").addClass("inactive");
|
|
|
1315 } else {
|
|
|
1316 \$( "div.not_canonical" ).show();
|
|
|
1317 \$("a.not_canonical").removeClass("inactive");
|
|
|
1318 }
|
|
|
1319 } );
|
|
|
1320 \$(document).ready(function(){
|
|
|
1321 \$('[data-toggle="tooltip"]').tooltip();
|
|
|
1322 });
|
|
|
1323 </script>
|
|
|
1324 SHTML
|
|
|
1325 return "\n</div>\n$script\n</body>\n</html>\n";
|
|
|
1326 }
|
|
|
1327
|
|
|
1328 sub sort_keys {
|
|
|
1329 my $data = shift;
|
|
|
1330 my $sort = shift;
|
|
|
1331 print $data, "\n";
|
|
|
1332 my @keys;
|
|
|
1333
|
|
|
1334 # sort data
|
|
|
1335 if(defined($sort)) {
|
|
|
1336 if($sort eq 'chr') {
|
|
|
1337 @keys = sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %{$data};
|
|
|
1338 }
|
|
|
1339 elsif($sort eq 'value') {
|
|
|
1340 @keys = sort {$data->{$a} <=> $data->{$b}} keys %{$data};
|
|
|
1341 }
|
|
|
1342 elsif(ref($sort) eq 'HASH') {
|
|
|
1343 @keys = sort {$sort->{$a} <=> $sort->{$b}} keys %{$data};
|
|
|
1344 }
|
|
|
1345 }
|
|
|
1346 else {
|
|
|
1347 @keys = keys %{$data};
|
|
|
1348 }
|
|
|
1349
|
|
|
1350 return \@keys;
|
|
|
1351 }
|
|
|
1352
|
|
|
1353 1;
|