Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/G2P.pm @ 0:e545d0a25ffe draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:17:17 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e545d0a25ffe |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | |
| 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | |
| 5 | |
| 6 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 you may not use this file except in compliance with the License. | |
| 8 You may obtain a copy of the License at | |
| 9 | |
| 10 http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 | |
| 12 Unless required by applicable law or agreed to in writing, software | |
| 13 distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 See the License for the specific language governing permissions and | |
| 16 limitations under the License. | |
| 17 | |
| 18 =head1 CONTACT | |
| 19 | |
| 20 Ensembl <http://www.ensembl.org/info/about/contact/index.html> | |
| 21 | |
| 22 =cut | |
| 23 | |
| 24 =head1 NAME | |
| 25 | |
| 26 G2P | |
| 27 | |
| 28 =head1 SYNOPSIS | |
| 29 | |
| 30 mv G2P.pm ~/.vep/Plugins | |
| 31 ./vep -i variations.vcf --plugin G2P,file=/path/to/G2P.csv.gz | |
| 32 | |
| 33 =head1 DESCRIPTION | |
| 34 | |
| 35 A VEP plugin that uses G2P allelic requirements to assess variants in genes | |
| 36 for potential phenotype involvement. | |
| 37 | |
| 38 The plugin has multiple configuration options, though minimally requires only | |
| 39 the CSV file of G2P data. | |
| 40 | |
| 41 Options are passed to the plugin as key=value pairs, (defaults in parentheses): | |
| 42 | |
| 43 file : path to G2P data file, as found at http://www.ebi.ac.uk/gene2phenotype/downloads | |
| 44 | |
| 45 af_monoallelic : maximum allele frequency for inclusion for monoallelic genes (0.0001) | |
| 46 | |
| 47 af_biallelic : maximum allele frequency for inclusion for biallelic genes (0.005) | |
| 48 all_confidence_levels : set value to 1 to include all confidence levels: confirmed, probable and possible. | |
| 49 Default levels are confirmed and probable. | |
| 50 af_keys : reference populations used for annotating variant alleles with observed | |
| 51 allele frequencies. Allele frequencies are stored in VEP cache files. | |
| 52 Default populations are: | |
| 53 ESP: AA, EA | |
| 54 1000 Genomes: AFR, AMR, EAS, EUR, SAS | |
| 55 gnomAD exomes: gnomAD, gnomAD_AFR, gnomAD_AMR, gnomAD_ASJ, gnomAD_EAS, gnomAD_FIN, gnomAD_NFE, gnomAD_OTH, gnomAD_SAS | |
| 56 Separate multiple values with '&' | |
| 57 af_from_vcf : set value to 1 to include allele frequencies from VCF file. | |
| 58 Specifiy the list of reference populations to include with --af_from_vcf_keys | |
| 59 af_from_vcf_keys : reference populations used for annotating variant alleles with observed | |
| 60 allele frequencies. Allele frequencies are retrieved from VCF files. If | |
| 61 af_from_vcf is set to 1 but no populations specified with --af_from_vcf_keys | |
| 62 all available reference populations are included. | |
| 63 TOPmed: TOPMed | |
| 64 UK10K: ALSPAC, TWINSUK | |
| 65 gnomAD exomes: gnomADe:AFR, gnomADe:ALL, gnomADe:AMR, gnomADe:ASJ, gnomADe:EAS, gnomADe:FIN, gnomADe:NFE, gnomADe:OTH, gnomADe:SAS | |
| 66 gnomAD genomes: gnomADg:AFR, gnomADg:ALL, gnomADg:AMR, gnomADg:ASJ, gnomADg:EAS, gnomADg:FIN, gnomADg:NFE, gnomADg:OTH | |
| 67 Separate multiple values with '&' | |
| 68 default_af : default frequency of the input variant if no frequency data is | |
| 69 found (0). This determines whether such variants are included; | |
| 70 the value of 0 forces variants with no frequency data to be | |
| 71 included as this is considered equivalent to having a frequency | |
| 72 of 0. Set to 1 (or any value higher than af) to exclude them. | |
| 73 types : SO consequence types to include. Separate multiple values with '&' | |
| 74 (splice_donor_variant,splice_acceptor_variant,stop_gained, | |
| 75 frameshift_variant,stop_lost,initiator_codon_variant, | |
| 76 inframe_insertion,inframe_deletion,missense_variant, | |
| 77 coding_sequence_variant,start_lost,transcript_ablation, | |
| 78 transcript_amplification,protein_altering_variant) | |
| 79 | |
| 80 log_dir : write stats to log files in log_dir | |
| 81 | |
| 82 txt_report : write all G2P complete genes and attributes to txt file | |
| 83 | |
| 84 html_report : write all G2P complete genes and attributes to html file | |
| 85 | |
| 86 Example: | |
| 87 | |
| 88 --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_keys=AA&gnomAD_ASJ,types=stop_gained&frameshift_variant | |
| 89 --plugin G2P,file=G2P.csv,af_monoallelic=0.05,types=stop_gained&frameshift_variant | |
| 90 --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_from_vcf=1 | |
| 91 --plugin G2P,file=G2P.csv | |
| 92 | |
| 93 =cut | |
| 94 | |
| 95 package G2P; | |
| 96 | |
| 97 use strict; | |
| 98 use warnings; | |
| 99 | |
| 100 use Cwd; | |
| 101 use Scalar::Util qw(looks_like_number); | |
| 102 use FileHandle; | |
| 103 use Text::CSV; | |
| 104 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
| 105 | |
| 106 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | |
| 107 | |
| 108 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | |
| 109 | |
| 110 my %DEFAULTS = ( | |
| 111 | |
| 112 # vars must have a frequency <= to this to pass | |
| 113 af => 0.001, | |
| 114 af_monoallelic => 0.0001, | |
| 115 af_biallelic => 0.005, | |
| 116 | |
| 117 af_keys => [qw(AA AFR AMR EA EAS EUR SAS gnomAD gnomAD_AFR gnomAD_AMR gnomAD_ASJ gnomAD_EAS gnomAD_FIN gnomAD_NFE gnomAD_OTH gnomAD_SAS)], | |
| 118 | |
| 119 af_from_vcf_keys => [qw(ALSPAC TOPMed TWINSUK gnomADe:AFR gnomADe:ALL gnomADe:AMR gnomADe:ASJ gnomADe:EAS gnomADe:FIN gnomADe:NFE gnomADe:OTH gnomADe:SAS gnomADg:AFR gnomADg:ALL gnomADg:AMR gnomADg:ASJ gnomADg:EAS gnomADg:FIN gnomADg:NFE gnomADg:OTH)], | |
| 120 | |
| 121 # if no MAF data is found, default to 0 | |
| 122 # this means absence of MAF data is considered equivalent to MAF=0 | |
| 123 # set to 1 to do the "opposite", i.e. exclude variants with no MAF data | |
| 124 default_af => 0, | |
| 125 | |
| 126 confidence_levels => [qw(confirmed probable)], | |
| 127 | |
| 128 # only include variants with these consequence types | |
| 129 # currently not ontology-resolved, exact term matches only | |
| 130 types => {map {$_ => 1} qw(splice_donor_variant splice_acceptor_variant stop_gained frameshift_variant stop_lost initiator_codon_variant inframe_insertion inframe_deletion missense_variant coding_sequence_variant start_lost transcript_ablation transcript_amplification protein_altering_variant)}, | |
| 131 | |
| 132 ); | |
| 133 | |
| 134 my $af_key_2_population_name = { | |
| 135 minor_allele_freq => 'global allele frequency (AF) from 1000 Genomes Phase 3 data', | |
| 136 AFR => '1000GENOMES:phase_3:AFR', | |
| 137 AMR => '1000GENOMES:phase_3:AMR', | |
| 138 EAS => '1000GENOMES:phase_3:EAS', | |
| 139 EUR => '1000GENOMES:phase_3:EUR', | |
| 140 SAS => '1000GENOMES:phase_3:SAS', | |
| 141 AA => 'Exome Sequencing Project 6500:African_American', | |
| 142 EA => 'Exome Sequencing Project 6500:European_American', | |
| 143 gnomAD => 'Genome Aggregation Database:Total', | |
| 144 gnomAD_AFR => 'Genome Aggregation Database exomes:African/African American', | |
| 145 gnomAD_AMR => 'Genome Aggregation Database exomes:Latino', | |
| 146 gnomAD_ASJ => 'Genome Aggregation Database exomes:Ashkenazi Jewish', | |
| 147 gnomAD_EAS => 'Genome Aggregation Database exomes:East Asian', | |
| 148 gnomAD_FIN => 'Genome Aggregation Database exomes:Finnish', | |
| 149 gnomAD_NFE => 'Genome Aggregation Database exomes:Non-Finnish European', | |
| 150 gnomAD_OTH => 'Genome Aggregation Database exomes:Other (population not assigned)', | |
| 151 gnomAD_SAS => 'Genome Aggregation Database exomes:South Asian', | |
| 152 ALSPAC => 'UK10K:ALSPAC cohort', | |
| 153 TOPMed => 'Trans-Omics for Precision Medicine (TOPMed) Program', | |
| 154 TWINSUK => 'UK10K:TWINSUK cohort', | |
| 155 'gnomADe:AFR' => 'Genome Aggregation Database exomes v170228', | |
| 156 'gnomADe:ALL' => 'Genome Aggregation Database exomes v170228', | |
| 157 'gnomADe:AMR' => 'Genome Aggregation Database exomes v170228', | |
| 158 'gnomADe:ASJ' => 'Genome Aggregation Database exomes v170228', | |
| 159 'gnomADe:EAS' => 'Genome Aggregation Database exomes v170228', | |
| 160 'gnomADe:FIN' => 'Genome Aggregation Database exomes v170228', | |
| 161 'gnomADe:NFE' => 'Genome Aggregation Database exomes v170228', | |
| 162 'gnomADe:OTH' => 'Genome Aggregation Database exomes v170228', | |
| 163 'gnomADe:SAS' => 'Genome Aggregation Database exomes v170228', | |
| 164 'gnomADg:AFR' => 'Genome Aggregation Database genomes v170228:African/African American', | |
| 165 'gnomADg:ALL' => 'Genome Aggregation Database genomes v170228:All gnomAD genomes individuals', | |
| 166 'gnomADg:AMR' => 'Genome Aggregation Database genomes v170228:Latino', | |
| 167 'gnomADg:ASJ' => 'Genome Aggregation Database genomes v170228:Ashkenazi Jewish', | |
| 168 'gnomADg:EAS' => 'Genome Aggregation Database genomes v170228:East Asian', | |
| 169 'gnomADg:FIN' => 'Genome Aggregation Database genomes v170228:Finnish', | |
| 170 'gnomADg:NFE' => 'Genome Aggregation Database genomes v170228:Non-Finnish European', | |
| 171 'gnomADg:OTH' => 'Genome Aggregation Database genomes v170228:Other (population not assigned)', | |
| 172 }; | |
| 173 | |
| 174 my $allelic_requirements = { | |
| 175 'biallelic' => { af => 0.005, rules => {HET => 2, HOM => 1} }, | |
| 176 'monoallelic' => { af => 0.0001, rules => {HET => 1, HOM => 1} }, | |
| 177 'hemizygous' => { af => 0.0001, rules => {HET => 1, HOM => 1} }, | |
| 178 'x-linked dominant' => { af => 0.0001, rules => {HET => 1, HOM => 1} }, | |
| 179 'x-linked over-dominance' => { af => 0.0001, rules => {HET => 1, HOM => 1} }, | |
| 180 }; | |
| 181 | |
| 182 my @allelic_requirement_terms = keys %$allelic_requirements; | |
| 183 | |
| 184 my @population_wide = qw(minor_allele_freq AA AFR ALSPAC AMR EA EAS EUR SAS TOPMed TWINSUK gnomAD gnomAD_AFR gnomAD_AMR gnomAD_ASJ gnomAD_EAS gnomAD_FIN gnomAD_NFE gnomAD_OTH gnomAD_SAS gnomADe:AFR gnomADe:ALL gnomADe:AMR gnomADe:ASJ gnomADe:EAS gnomADe:FIN gnomADe:NFE gnomADe:OTH gnomADe:SAS gnomADg:AFR gnomADg:ALL gnomADg:AMR gnomADg:ASJ gnomADg:EAS gnomADg:FIN gnomADg:NFE gnomADg:OTH); | |
| 185 | |
| 186 sub new { | |
| 187 my $class = shift; | |
| 188 | |
| 189 my $self = $class->SUPER::new(@_); | |
| 190 | |
| 191 my $supported_af_keys = { map {$_ => 1} @population_wide }; | |
| 192 | |
| 193 my $params = $self->params_to_hash(); | |
| 194 my $file = ''; | |
| 195 | |
| 196 # user only supplied file as first param? | |
| 197 if (!keys %$params) { | |
| 198 $file = $self->params->[0]; | |
| 199 } | |
| 200 else { | |
| 201 $file = $params->{file}; | |
| 202 | |
| 203 # process types | |
| 204 if ($params->{types}) { | |
| 205 $params->{types} = {map {$_ => 1} split(/[\;\&\|]/, $params->{types})}; | |
| 206 } | |
| 207 | |
| 208 # check af | |
| 209 foreach my $af (qw/af_monoallelic af_biallelic/) { | |
| 210 if($params->{$af}) { | |
| 211 die("ERROR: Invalid value for af: ".$params->{$af} . "\n") unless | |
| 212 looks_like_number($params->{$af}) && ($params->{$af} >= 0 && $params->{$af} <= 1) | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 my $assembly = $self->{config}->{assembly}; | |
| 217 my $af_from_vcf_key_2_collection_id = { | |
| 218 ALSPAC => {GRCh37 => 'uk10k_GRCh37', GRCh38 => 'uk10k_GRCh38'}, | |
| 219 TOPMed => {GRCh37 => 'topmed_GRCh37', GRCh38 => 'topmed_GRCh38'}, | |
| 220 TWINSUK => {GRCh37 => 'uk10k_GRCh37', GRCh38 => 'uk10k_GRCh38'}, | |
| 221 'gnomADe:AFR' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | |
| 222 'gnomADe:ALL' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | |
| 223 'gnomADe:AMR' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | |
| 224 'gnomADe:ASJ' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | |
| 225 'gnomADe:EAS' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | |
| 226 'gnomADe:FIN' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | |
| 227 'gnomADe:NFE' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | |
| 228 'gnomADe:OTH' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | |
| 229 'gnomADe:SAS' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | |
| 230 'gnomADg:AFR' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | |
| 231 'gnomADg:ALL' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | |
| 232 'gnomADg:AMR' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | |
| 233 'gnomADg:ASJ' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | |
| 234 'gnomADg:EAS' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | |
| 235 'gnomADg:FIN' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | |
| 236 'gnomADg:NFE' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | |
| 237 'gnomADg:OTH' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | |
| 238 }; | |
| 239 | |
| 240 my @keys = (); | |
| 241 my $vcf_collection_ids = {}; | |
| 242 if ($params->{af_keys}) { | |
| 243 push @keys, $params->{af_keys}; | |
| 244 } else { | |
| 245 push @keys, @{$DEFAULTS{af_keys}}; | |
| 246 } | |
| 247 if ($params->{af_from_vcf}) { | |
| 248 if ($params->{af_from_vcf_keys}) { | |
| 249 push @keys, $params->{af_from_vcf_keys}; | |
| 250 } else { | |
| 251 push @keys, @{$DEFAULTS{af_from_vcf_keys}}; | |
| 252 } | |
| 253 } | |
| 254 | |
| 255 my @af_keys = (); | |
| 256 foreach my $af_key_set (@keys) { | |
| 257 foreach my $af_key (split(/[\;\&\|]/, $af_key_set)) { | |
| 258 die("ERROR: af_key: " . $af_key . " not supported. Check plugin documentation for supported af_keys.\n") unless $supported_af_keys->{$af_key}; | |
| 259 push @af_keys, $af_key; | |
| 260 if ($af_from_vcf_key_2_collection_id->{$af_key}) { | |
| 261 | |
| 262 $vcf_collection_ids->{$af_from_vcf_key_2_collection_id->{$af_key}->{$assembly}} = 1; | |
| 263 } | |
| 264 } | |
| 265 } | |
| 266 $params->{af_keys} = \@af_keys; | |
| 267 $params->{vcf_collection_ids} = $vcf_collection_ids; | |
| 268 } | |
| 269 | |
| 270 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time); | |
| 271 $year += 1900; | |
| 272 $mon++; | |
| 273 my $stamp = join('_', ($year, $mon, $mday, $hour, $min)); | |
| 274 my $cwd_dir = getcwd; | |
| 275 my $new_log_dir = "$cwd_dir/g2p_log_dir\_$stamp"; | |
| 276 my $log_dir = $params->{log_dir} || $new_log_dir; | |
| 277 if (-d $log_dir) { | |
| 278 my @files = <$log_dir/*>; | |
| 279 if (scalar @files > 0) { | |
| 280 unlink glob "'$log_dir/*.*'"; | |
| 281 } | |
| 282 @files = <$log_dir/*>; | |
| 283 if (scalar @files > 0) { | |
| 284 mkdir $new_log_dir, 0755; | |
| 285 $params->{log_dir} = $new_log_dir; | |
| 286 } | |
| 287 } else { | |
| 288 mkdir $log_dir, 0755; | |
| 289 $params->{log_dir} = $log_dir; | |
| 290 } | |
| 291 | |
| 292 foreach my $report_type (qw/txt_report html_report/) { | |
| 293 if (!$params->{$report_type}) { | |
| 294 my $file_type = ($report_type eq 'txt_report') ? 'txt' : 'html'; | |
| 295 $params->{$report_type} = $cwd_dir . "/$report_type\_$stamp.$file_type"; | |
| 296 } | |
| 297 } | |
| 298 | |
| 299 if ($params->{all_confidence_levels}) { | |
| 300 push @{$params->{confidence_levels}}, 'possible', @{$DEFAULTS{confidence_levels}}; | |
| 301 } | |
| 302 | |
| 303 # copy in default params | |
| 304 $params->{$_} //= $DEFAULTS{$_} for keys %DEFAULTS; | |
| 305 $self->{user_params} = $params; | |
| 306 | |
| 307 if (!defined($self->{config}->{reg})) { | |
| 308 my $reg = 'Bio::EnsEMBL::Registry'; | |
| 309 $reg->load_registry_from_db( | |
| 310 -host => $self->config->{host}, | |
| 311 -user => $self->config->{user}, | |
| 312 -port => $self->config->{port}, | |
| 313 -db_version => $self->config->{db_version}, | |
| 314 -species => $self->config->{species}, | |
| 315 -no_cache => $self->config->{no_slice_cache}, | |
| 316 ); | |
| 317 $self->{config}->{reg} = $reg; | |
| 318 } | |
| 319 | |
| 320 my $va = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'variation'); | |
| 321 $va->db->use_vcf(1); | |
| 322 $va->db->include_failed_variations(1); | |
| 323 $self->{config}->{va} = $va; | |
| 324 my $pa = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'population'); | |
| 325 $self->{config}->{pa} = $pa; | |
| 326 my $vca = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'VCFCollection'); | |
| 327 $self->{config}->{vca} = $vca; | |
| 328 my $ta = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'core', 'transcript'); | |
| 329 $self->{config}->{ta} = $ta; | |
| 330 | |
| 331 # read data from file | |
| 332 $self->{gene_data} = $self->read_gene_data_from_file($file); | |
| 333 $self->synonym_mappings(); | |
| 334 | |
| 335 # force some config params | |
| 336 $self->{config}->{individual} //= ['all']; | |
| 337 $self->{config}->{symbol} = 1; | |
| 338 | |
| 339 $self->{config}->{check_existing} = 1; | |
| 340 $self->{config}->{failed} = 1; | |
| 341 $self->{config}->{af} = 1; | |
| 342 $self->{config}->{af_1kg} = 1; | |
| 343 $self->{config}->{af_esp} = 1; | |
| 344 $self->{config}->{af_gnomad} = 1; | |
| 345 # $self->{config}->{sift} = 'b'; | |
| 346 # $self->{config}->{polyphen} = 'b'; | |
| 347 # $self->{config}->{hgvsc} = 1; | |
| 348 # $self->{config}->{hgvsp} = 1; | |
| 349 | |
| 350 # tell VEP we have a cache so stuff gets shared/merged between forks | |
| 351 $self->{has_cache} = 1; | |
| 352 $self->{cache}->{g2p_in_vcf} = {}; | |
| 353 | |
| 354 return $self; | |
| 355 } | |
| 356 | |
| 357 sub feature_types { | |
| 358 return ['Transcript']; | |
| 359 } | |
| 360 | |
| 361 sub get_header_info { | |
| 362 my $self = shift; | |
| 363 | |
| 364 return { | |
| 365 G2P_flag => 'Flags zygosity of valid variants for a G2P gene', | |
| 366 G2P_complete => 'Indicates this variant completes the allelic requirements for a G2P gene', | |
| 367 G2P_gene_req => 'MONO or BI depending on the context in which this gene has been explored', | |
| 368 }; | |
| 369 } | |
| 370 | |
| 371 sub run { | |
| 372 my ($self, $tva, $line) = @_; | |
| 373 | |
| 374 # only interested if we know the zygosity | |
| 375 my $zyg = $line->{Extra}->{ZYG} || $line->{ZYG}; | |
| 376 return {} unless $zyg; | |
| 377 | |
| 378 # only interested in given gene set | |
| 379 my $tr = $tva->transcript; | |
| 380 | |
| 381 my $ensembl_gene_id = $tr->{_gene}->stable_id; | |
| 382 my $gene_symbol = $tr->{_gene_symbol} || $tr->{_gene_hgnc}; | |
| 383 return {} unless $gene_symbol; | |
| 384 my $gene_data = $self->gene_data($gene_symbol); | |
| 385 if (! defined $gene_data) { | |
| 386 my $ensembl_gene_id = $tr->{_gene}->stable_id; | |
| 387 $gene_data = $self->gene_data($ensembl_gene_id); | |
| 388 } | |
| 389 | |
| 390 if (!$self->{cache}->{g2p_in_vcf}->{$gene_symbol}) { | |
| 391 $self->write_report('G2P_in_vcf', $gene_symbol); | |
| 392 $self->{cache}->{g2p_in_vcf}->{$gene_symbol} = 1; | |
| 393 } | |
| 394 return {} unless defined $gene_data; | |
| 395 | |
| 396 my @ars = ($gene_data->{'allelic requirement'}) ? @{$gene_data->{'allelic requirement'}} : (); | |
| 397 my %seen; | |
| 398 @ars = grep { !$seen{$_}++ } @ars; | |
| 399 | |
| 400 return {} unless (@ars && ( grep { exists($allelic_requirements->{$_}) } @ars)); | |
| 401 # limit by type | |
| 402 my @consequence_types = map { $_->SO_term } @{$tva->get_all_OverlapConsequences}; | |
| 403 | |
| 404 return {} unless grep {$self->{user_params}->{types}->{$_->SO_term}} @{$tva->get_all_OverlapConsequences}; | |
| 405 | |
| 406 # limit by MAF | |
| 407 my $threshold = 0; | |
| 408 my ($freqs, $existing_variant, $ar_passed) = @{$self->get_freq($tva, \@ars)}; | |
| 409 | |
| 410 return {} if (!keys %$ar_passed); | |
| 411 | |
| 412 my $vf = $tva->base_variation_feature; | |
| 413 my $allele = $tva->variation_feature_seq; | |
| 414 my $start = $vf->{start}; | |
| 415 my $end = $vf->{end}; | |
| 416 | |
| 417 my $individual = $vf->{individual}; | |
| 418 my $vf_name = $vf->variation_name; | |
| 419 if ($vf_name || $vf_name eq '.') { | |
| 420 $vf_name = ($vf->{original_chr} || $vf->{chr}) . '_' . $vf->{start} . '_' . ($vf->{allele_string} || $vf->{class_SO_term}); | |
| 421 } | |
| 422 my $allele_string = $vf->{allele_string}; | |
| 423 my @alleles = split('/', $allele_string); | |
| 424 my $ref = $alleles[0]; | |
| 425 my $seq_region_name = $vf->{chr}; | |
| 426 | |
| 427 my $params = $self->{user_params}; | |
| 428 my $refseq = $tr->{_refseq} || 'NA'; | |
| 429 my $tr_stable_id = $tr->stable_id; | |
| 430 my $hgvs_t = $tva->hgvs_transcript || 'NA'; | |
| 431 my $hgvs_p = $tva->hgvs_protein || 'NA'; | |
| 432 | |
| 433 my ($clin_sig, $novel, $failed, $frequencies, $existing_name) = ('NA', 'yes', 'NA', 'NA', 'NA'); | |
| 434 if ($existing_variant) { | |
| 435 $clin_sig = $existing_variant->{clin_sig} || 'NA'; | |
| 436 $failed = ($existing_variant->{failed}) ? 'yes' : 'no'; | |
| 437 $existing_name = $existing_variant->{variation_name} || 'NA'; | |
| 438 $novel = 'no'; | |
| 439 } | |
| 440 | |
| 441 my $pph_score = (defined $tva->polyphen_score) ? $tva->polyphen_score : 'NA'; | |
| 442 my $pph_pred = (defined $tva->polyphen_prediction) ? $tva->polyphen_prediction : 'NA'; | |
| 443 my $sift_score = (defined $tva->sift_score) ? $tva->sift_score : 'NA'; | |
| 444 my $sift_pred = (defined $tva->sift_prediction) ? $tva->sift_prediction : 'NA'; | |
| 445 | |
| 446 if (scalar keys %$freqs > 0) { | |
| 447 $frequencies = join(',', map {"$_=$freqs->{$_}"} keys %$freqs); | |
| 448 } | |
| 449 | |
| 450 my $ar = join(',', sort keys %$ar_passed); | |
| 451 my $ar_in_g2pdb = join(',', sort @ars); | |
| 452 my $g2p_data = { | |
| 453 'zyg' => $zyg, | |
| 454 'allele_requirement' => $ar, | |
| 455 'ar_in_g2pdb' => $ar_in_g2pdb, | |
| 456 'frequencies' => $frequencies, | |
| 457 'consequence_types' => join(',', @consequence_types), | |
| 458 'refseq' => $refseq, | |
| 459 'failed' => $failed, | |
| 460 'clin_sig' => $clin_sig, | |
| 461 'novel' => $novel, | |
| 462 'existing_name' => $existing_name, | |
| 463 'hgvs_t' => $hgvs_t, | |
| 464 'hgvs_p' => $hgvs_p, | |
| 465 'vf_location' => "$seq_region_name:$start-$end $ref/$allele", | |
| 466 'sift_score' => "$sift_score", | |
| 467 'sift_prediction' => $sift_pred, | |
| 468 'polyphen_score' => "$pph_score", | |
| 469 'polyphen_prediction' => $pph_pred, | |
| 470 }; | |
| 471 | |
| 472 my %return = ( | |
| 473 G2P_flag => $zyg | |
| 474 ); | |
| 475 | |
| 476 | |
| 477 $self->write_report('G2P_flag', $gene_symbol, $tr_stable_id, $individual, $vf_name, $g2p_data); | |
| 478 | |
| 479 $self->write_report('G2P_complete', $gene_symbol, $tr_stable_id, $individual, $vf_name, $ar, $zyg); | |
| 480 | |
| 481 my $cache = $self->{cache}->{$individual}->{$tr->stable_id} ||= {}; | |
| 482 | |
| 483 delete $cache->{$vf_name} if exists($cache->{$vf_name}); | |
| 484 | |
| 485 # biallelic genes require >=1 hom or >=2 hets | |
| 486 | |
| 487 my $gene_reqs = {}; | |
| 488 | |
| 489 foreach my $ar (keys %$ar_passed) { | |
| 490 if($ar eq 'biallelic') { | |
| 491 # homozygous, report complete | |
| 492 if(uc($zyg) eq 'HOM') { | |
| 493 $return{G2P_complete} = 1; | |
| 494 $gene_reqs->{BI} = 1; | |
| 495 } | |
| 496 # heterozygous | |
| 497 # we need to cache that we've observed one | |
| 498 elsif(uc($zyg) eq 'HET') { | |
| 499 if(scalar keys %$cache) { | |
| 500 $return{G2P_complete} = 1; | |
| 501 } | |
| 502 $cache->{$vf_name} = 1; | |
| 503 } | |
| 504 } | |
| 505 # monoallelic genes require only one allele | |
| 506 elsif($ar eq 'monoallelic' || $ar eq 'x-linked dominant' || $ar eq 'hemizygous' || $ar eq 'x-linked over-dominance') { | |
| 507 $return{G2P_complete} = 1; | |
| 508 $gene_reqs->{MONO} = 1; | |
| 509 } | |
| 510 else { | |
| 511 return {}; | |
| 512 } | |
| 513 } | |
| 514 if ($return{G2P_complete}) { | |
| 515 $return{G2P_gene_req} = join(',', sort keys %$gene_reqs); | |
| 516 } | |
| 517 | |
| 518 return \%return; | |
| 519 } | |
| 520 | |
| 521 # read G2P CSV dump | |
| 522 # as from http://www.ebi.ac.uk/gene2phenotype/downloads | |
| 523 sub read_gene_data_from_file { | |
| 524 my $self = shift; | |
| 525 my $file = shift; | |
| 526 my $delimiter = shift; | |
| 527 my (@headers, %gene_data); | |
| 528 | |
| 529 my $assembly = $self->{config}->{assembly}; | |
| 530 die("ERROR: No file specified or could not read from file ".($file || '')."\n") unless $file && -e $file; | |
| 531 | |
| 532 my @confidence_levels = @{$self->{user_params}->{confidence_levels}}, "\n"; | |
| 533 | |
| 534 # determine file type | |
| 535 my $file_type; | |
| 536 my $fh = FileHandle->new($file, 'r'); | |
| 537 while (<$fh>) { | |
| 538 chomp; | |
| 539 if (/Model_Of_Inheritance/) { | |
| 540 $file_type = 'panelapp'; | |
| 541 } elsif (/"allelic requirement"/) { | |
| 542 $file_type = 'g2p'; | |
| 543 } else { | |
| 544 $file_type = 'unknown'; | |
| 545 } | |
| 546 last; | |
| 547 } | |
| 548 $fh->close(); | |
| 549 if ($file_type eq 'unknown') { | |
| 550 if ($file =~ /gz$/) { | |
| 551 die("ERROR: G2P plugin can only read uncompressed data"); | |
| 552 } else { | |
| 553 die("ERROR: Could not recognize input file format. Format must be one of panelapp, g2p or custom. Check website for details: https://www.ebi.ac.uk/gene2phenotype/g2p_vep_plugin"); | |
| 554 } | |
| 555 } | |
| 556 | |
| 557 if ($file_type eq 'panelapp') { | |
| 558 my @headers = (); | |
| 559 my $csv = Text::CSV->new ({ sep_char => "\t" }); | |
| 560 open my $fh, "<:encoding(utf8)", "$file" or die "$file: $!"; | |
| 561 while ( my $row = $csv->getline( $fh ) ) { | |
| 562 unless (@headers) { | |
| 563 @headers = @$row; | |
| 564 } else { | |
| 565 my %tmp = map {$headers[$_] => $row->[$_]} (0..$#headers); | |
| 566 my $gene_symbol = $tmp{"Gene Entity Symbol"}; | |
| 567 my $ensembl_gene_id = ""; | |
| 568 if ($assembly eq 'GRCh37') { | |
| 569 $ensembl_gene_id = $tmp{"EnsemblId(GRch37)"}; | |
| 570 } else { # GRCh38 | |
| 571 $ensembl_gene_id = $tmp{"EnsemblId(GRch38)"}; | |
| 572 } | |
| 573 if ($ensembl_gene_id) { | |
| 574 my @ars = (); | |
| 575 my $allelic_requirement_panel_app = $tmp{"Model_Of_Inheritance"}; | |
| 576 if ($allelic_requirement_panel_app =~ m/MONOALLELIC|BOTH/) { | |
| 577 push @ars, 'monoallelic'; | |
| 578 } elsif ($allelic_requirement_panel_app =~ m/BIALLELIC|BOTH/) { | |
| 579 push @ars, 'biallelic'; | |
| 580 } elsif ($allelic_requirement_panel_app eq 'X-LINKED: hemizygous mutation in males, biallelic mutations in females') { | |
| 581 push @ars, 'hemizygous'; | |
| 582 } elsif ($allelic_requirement_panel_all eq 'X-LINKED: hemizygous mutation in males, monoallelic mutations in females may cause disease (may be less severe, later onset than males)') { | |
| 583 push @ars, 'x-linked dominant'; | |
| 584 } else { | |
| 585 $self->write_report('log', "no allelelic_requirement for $ensembl_gene_id"); | |
| 586 } | |
| 587 foreach my $ar (@ars) { | |
| 588 push @{$gene_data{$ensembl_gene_id}->{"allelic requirement"}}, $ar; | |
| 589 } | |
| 590 } else { | |
| 591 $self->write_report('log', "no ensembl gene id for $gene_symbol"); | |
| 592 } | |
| 593 } | |
| 594 } | |
| 595 $csv->eof or $csv->error_diag(); | |
| 596 close $fh; | |
| 597 } | |
| 598 | |
| 599 if ($file_type eq 'g2p') { | |
| 600 # this regexp allows for nested ",", e.g. | |
| 601 # item,description | |
| 602 # cheese,"salty,delicious" | |
| 603 my $re = qr/(?: "\( ( [^()""]* ) \)" | \( ( [^()]* ) \) | " ( [^"]* ) " | ( [^,]* ) ) , \s* /x; | |
| 604 | |
| 605 my $fh = FileHandle->new($file, 'r'); | |
| 606 | |
| 607 while(<$fh>) { | |
| 608 chomp; | |
| 609 $_ =~ s/\R//g; | |
| 610 my @split = grep defined, "$_," =~ /$re/g; | |
| 611 unless(@headers) { | |
| 612 if ($file_type eq 'g2p') { | |
| 613 @headers = map {s/\"//g; $_} @split; | |
| 614 } else { | |
| 615 @headers = @split; | |
| 616 } | |
| 617 } | |
| 618 else { | |
| 619 my %tmp = map {$headers[$_] => $split[$_]} (0..$#split); | |
| 620 die("ERROR: Gene symbol column not found\n$_\n") unless $tmp{"gene symbol"}; | |
| 621 my $confidence_value = $tmp{"DDD category"}; | |
| 622 next if (!grep{$_ eq $confidence_value} @confidence_levels); | |
| 623 my $gene_symbol = $tmp{"gene symbol"}; | |
| 624 $gene_data{$gene_symbol}->{"prev symbols"} = $tmp{"prev symbols"}; | |
| 625 push @{$gene_data{$gene_symbol}->{"allelic requirement"}}, $tmp{"allelic requirement"} if ($tmp{"allelic requirement"}); | |
| 626 $self->write_report('G2P_list', $tmp{"gene symbol"}, $tmp{"DDD category"}); | |
| 627 } | |
| 628 } | |
| 629 $fh->close; | |
| 630 } | |
| 631 return \%gene_data; | |
| 632 } | |
| 633 | |
| 634 # return either whole gene data hash or one gene's data | |
| 635 # this should allow updates to this plugin to e.g. query a REST server, for example | |
| 636 sub gene_data { | |
| 637 my ($self, $gene_symbol) = @_; | |
| 638 my $gene_data = $self->{gene_data}->{$gene_symbol}; | |
| 639 if (!$gene_data) { | |
| 640 my $prev_gene_symbol = $self->{prev_symbol_mappings}->{$gene_symbol}; | |
| 641 return $prev_gene_symbol ? $self->{gene_data}->{$prev_gene_symbol} : undef; | |
| 642 } | |
| 643 return $gene_data; | |
| 644 } | |
| 645 | |
| 646 sub synonym_mappings { | |
| 647 my $self = shift; | |
| 648 my $gene_data = $self->{gene_data}; | |
| 649 my $synonym_mappings = {}; | |
| 650 foreach my $gene_symbol (keys %$gene_data) { | |
| 651 my $prev_symbols = $gene_data->{$gene_symbol}->{'prev symbols'}; | |
| 652 if ($prev_symbols) { | |
| 653 foreach my $prev_symbol (split(';', $prev_symbols)) { | |
| 654 $synonym_mappings->{$prev_symbol} = $gene_symbol; | |
| 655 } | |
| 656 } | |
| 657 } | |
| 658 $self->{prev_symbol_mappings} = $synonym_mappings; | |
| 659 } | |
| 660 | |
| 661 sub get_freq { | |
| 662 my $self = shift; | |
| 663 my $tva = shift; | |
| 664 my $ars = shift; | |
| 665 my $allele = $tva->variation_feature_seq; | |
| 666 my $vf = $tva->base_variation_feature; | |
| 667 reverse_comp(\$allele) if $vf->{strand} < 0; | |
| 668 my $vf_name = $vf->variation_name; | |
| 669 if ($vf_name || $vf_name eq '.') { | |
| 670 $vf_name = ($vf->{original_chr} || $vf->{chr}) . '_' . $vf->{start} . '_' . ($vf->{allele_string} || $vf->{class_SO_term}); | |
| 671 } | |
| 672 my $cache = $self->{cache}->{$vf_name}->{_g2p_freqs} ||= {}; | |
| 673 | |
| 674 if (exists $cache->{$allele}->{failed}) { | |
| 675 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}]; | |
| 676 } | |
| 677 | |
| 678 if (exists $cache->{$allele}->{freq}) { | |
| 679 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}]; | |
| 680 } | |
| 681 | |
| 682 if (!$vf->{existing}) { | |
| 683 my $failed_ars = {}; | |
| 684 my $freqs = {}; | |
| 685 my $passed = $self->frequencies_from_VCF($freqs, $vf, $allele, $ars, $failed_ars); | |
| 686 if (!$passed) { | |
| 687 $cache->{$allele}->{failed} = 1; | |
| 688 return [{}, {}, {}]; | |
| 689 } else { | |
| 690 $cache->{$allele}->{freq} = $freqs; | |
| 691 $cache->{$allele}->{ex_variant} = undef; | |
| 692 # if we get to here return all allelic requirements that passed threshold filtering | |
| 693 my $passed_ar = {}; | |
| 694 foreach my $ar (@$ars) { | |
| 695 if (!$failed_ars->{$ar}) { | |
| 696 $passed_ar->{$ar} = 1; | |
| 697 } | |
| 698 } | |
| 699 $cache->{$allele}->{passed_ar} = $passed_ar; | |
| 700 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}]; | |
| 701 } | |
| 702 } | |
| 703 | |
| 704 my @existing_variants = @{$vf->{existing}}; | |
| 705 # favour dbSNP variants | |
| 706 my @dbSNP_variants = grep {$_->{variation_name} =~ /^rs/} @existing_variants; | |
| 707 if (@dbSNP_variants) { | |
| 708 @existing_variants = @dbSNP_variants; | |
| 709 } | |
| 710 foreach my $ex (@existing_variants) { | |
| 711 my $existing_allele_string = $ex->{allele_string}; | |
| 712 my $variation_name = $ex->{variation_name}; | |
| 713 my $freqs = {}; | |
| 714 my $failed_ars = {}; | |
| 715 foreach my $af_key (@{$self->{user_params}->{af_keys}}) { | |
| 716 my $freq = $self->{user_params}->{default_af}; | |
| 717 if ($af_key eq 'minor_allele_freq') { | |
| 718 if (defined $ex->{minor_allele_freq}) { | |
| 719 if (($ex->{minor_allele} || '') eq $allele ) { | |
| 720 $freq = $ex->{minor_allele_freq}; | |
| 721 } else { | |
| 722 $freq = $self->correct_frequency($tva, $existing_allele_string, $ex->{minor_allele}, $ex->{minor_allele_freq}, $allele, $variation_name, $af_key, $vf_name) || $freq; | |
| 723 } | |
| 724 } | |
| 725 } | |
| 726 else { | |
| 727 my @pairs = split(',', $ex->{$af_key} || ''); | |
| 728 my $found = 0; | |
| 729 if (scalar @pairs == 0) { | |
| 730 $found = 1; # no allele frequency for this population/af_key available | |
| 731 } | |
| 732 foreach my $pair (@pairs) { | |
| 733 my ($a, $f) = split(':', $pair); | |
| 734 if(($a || '') eq $allele && defined($f)) { | |
| 735 $freq = $f; | |
| 736 $found = 1; | |
| 737 } | |
| 738 } | |
| 739 if (!$found) { | |
| 740 $freq = $self->correct_frequency($tva, $existing_allele_string, undef, undef, $allele, $variation_name, $af_key, $vf_name) || $freq; | |
| 741 } | |
| 742 } | |
| 743 if (!$self->continue_af_annotation($ars, $failed_ars, $freq)) { | |
| 744 # cache failed results | |
| 745 $cache->{$allele}->{failed} = 1; | |
| 746 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}]; | |
| 747 } | |
| 748 $freqs->{$af_key} = $freq if ($freq); | |
| 749 } | |
| 750 if ($self->{user_params}->{af_from_vcf}) { | |
| 751 my $passed = $self->frequencies_from_VCF($freqs, $vf, $allele, $ars, $failed_ars); | |
| 752 if (!$passed) { | |
| 753 $cache->{$allele}->{failed} = 1; | |
| 754 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}]; | |
| 755 } | |
| 756 } | |
| 757 $cache->{$allele}->{freq} = $freqs; | |
| 758 $cache->{$allele}->{ex_variant} = $ex; | |
| 759 | |
| 760 # if we get to here return all allelic requirements that passed threshold filtering | |
| 761 my $passed_ar = {}; | |
| 762 foreach my $ar (@$ars) { | |
| 763 if (!$failed_ars->{$ar}) { | |
| 764 $passed_ar->{$ar} = 1; | |
| 765 } | |
| 766 } | |
| 767 $cache->{$allele}->{passed_ar} = $passed_ar; | |
| 768 | |
| 769 } | |
| 770 | |
| 771 return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}]; | |
| 772 } | |
| 773 | |
| 774 sub correct_frequency { | |
| 775 my ($self, $tva, $allele_string, $minor_allele, $af, $allele, $variation_name, $af_key, $vf_name) = @_; | |
| 776 | |
| 777 my @existing_alleles = split('/', $allele_string); | |
| 778 if (!grep( /^$allele$/, @existing_alleles)) { | |
| 779 return 0.0; | |
| 780 } | |
| 781 | |
| 782 if ($af_key eq 'minor_allele_freq' && (scalar @existing_alleles == 2)) { | |
| 783 my $existing_ref_allele = $existing_alleles[0]; | |
| 784 my $existing_alt_allele = $existing_alleles[1]; | |
| 785 if ( ($minor_allele eq $existing_ref_allele && ($allele eq $existing_alt_allele)) || | |
| 786 ($minor_allele eq $existing_alt_allele && ($allele eq $existing_ref_allele)) ) { | |
| 787 return (1.0 - $af); | |
| 788 } | |
| 789 } else { | |
| 790 my $va = $self->{config}->{va}; | |
| 791 my $pa = $self->{config}->{pa}; | |
| 792 my $variation = $va->fetch_by_name($variation_name); | |
| 793 my $af_key = $self->{user_params}->{af_keys}; | |
| 794 my $population_name = $af_key_2_population_name->{$af_key}; | |
| 795 if ($population_name) { | |
| 796 my $population = $self->{config}->{$population_name}; | |
| 797 if (!$population) { | |
| 798 $population = $pa->fetch_by_name($population_name); | |
| 799 $self->{config}->{$population_name} = $population; | |
| 800 } | |
| 801 foreach (@{$variation->get_all_Alleles($population)}) { | |
| 802 if ($_->allele eq $allele) { | |
| 803 return $_->frequency; | |
| 804 } | |
| 805 } | |
| 806 } | |
| 807 } | |
| 808 return 0.0; | |
| 809 } | |
| 810 | |
| 811 sub frequencies_from_VCF { | |
| 812 my $self = shift; | |
| 813 my $freqs = shift; | |
| 814 my $vf = shift; | |
| 815 my $vf_allele = shift; | |
| 816 my $ars = shift; | |
| 817 my $failed_ars = shift; | |
| 818 my $vca = $self->{config}->{vca}; | |
| 819 my $collections = $vca->fetch_all; | |
| 820 foreach my $vc (@$collections) { | |
| 821 next if (! $self->{user_params}->{vcf_collection_ids}->{$vc->id}); | |
| 822 my $alleles = $vc->get_all_Alleles_by_VariationFeature($vf); | |
| 823 foreach my $allele (@$alleles) { | |
| 824 if ($allele->allele eq $vf_allele) { | |
| 825 my $af_key = $allele->population->name; | |
| 826 my $freq = $allele->frequency; | |
| 827 return 0 if (!$self->continue_af_annotation($ars, $failed_ars, $freq)); | |
| 828 $freqs->{$af_key} = $freq; | |
| 829 } | |
| 830 } | |
| 831 } | |
| 832 return 1; | |
| 833 } | |
| 834 | |
| 835 sub continue_af_annotation { | |
| 836 my $self = shift; | |
| 837 my $ars = shift; | |
| 838 my $failed_ars = shift; | |
| 839 my $freq = shift; | |
| 840 foreach my $ar (@$ars) { | |
| 841 if (!$failed_ars->{$ar}) { | |
| 842 if (defined $allelic_requirements->{$ar}) { | |
| 843 my $threshold = $allelic_requirements->{$ar}->{af}; | |
| 844 if ($freq > $threshold) { | |
| 845 $failed_ars->{$ar} = 1; | |
| 846 } | |
| 847 } | |
| 848 } | |
| 849 } | |
| 850 return (scalar @$ars != scalar keys %$failed_ars); | |
| 851 } | |
| 852 | |
| 853 | |
| 854 | |
| 855 sub write_report { | |
| 856 my $self = shift; | |
| 857 my $flag = shift; | |
| 858 my $log_dir = $self->{user_params}->{log_dir}; | |
| 859 my $log_file = "$log_dir/$$.txt"; | |
| 860 open(my $fh, '>>', $log_file) or die "Could not open file '$flag $log_file' $!"; | |
| 861 if ($flag eq 'G2P_list') { | |
| 862 my ($gene_symbol, $DDD_category) = @_; | |
| 863 $DDD_category ||= 'Not assigned'; | |
| 864 print $fh "$flag\t$gene_symbol\t$DDD_category\n"; | |
| 865 } elsif ($flag eq 'G2P_in_vcf') { | |
| 866 my $gene_symbol = shift; | |
| 867 print $fh "$flag\t$gene_symbol\n"; | |
| 868 } elsif ($flag eq 'G2P_complete') { | |
| 869 print $fh join("\t", $flag, @_), "\n"; | |
| 870 } elsif ($flag eq 'log') { | |
| 871 print $fh join("\t", $flag, @_), "\n"; | |
| 872 } else { | |
| 873 my ($gene_symbol, $tr_stable_id, $individual, $vf_name, $data) = @_; | |
| 874 $data = join(';', map {"$_=$data->{$_}"} sort keys %$data); | |
| 875 print $fh join("\t", $flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $data), "\n"; | |
| 876 } | |
| 877 close $fh; | |
| 878 } | |
| 879 | |
| 880 sub finish { | |
| 881 my $self = shift; | |
| 882 $self->generate_report; | |
| 883 } | |
| 884 | |
| 885 sub generate_report { | |
| 886 my $self = shift; | |
| 887 my $result_summary = $self->parse_log_files; | |
| 888 my $chart_txt_data = $self->chart_and_txt_data($result_summary); | |
| 889 my $chart_data = $chart_txt_data->{chart_data}; | |
| 890 my $txt_data = $chart_txt_data->{txt_data}; | |
| 891 my $canonical_transcripts = $chart_txt_data->{canonical_transcripts}; | |
| 892 $self->write_txt_output($txt_data); | |
| 893 $self->write_charts($result_summary, $chart_data, $canonical_transcripts); | |
| 894 } | |
| 895 | |
| 896 sub write_txt_output { | |
| 897 my $self = shift; | |
| 898 my $txt_output_data = shift; | |
| 899 my $txt_output_file = $self->{user_params}->{txt_report}; | |
| 900 my $fh_txt = FileHandle->new($txt_output_file, 'w'); | |
| 901 foreach my $individual (sort keys %$txt_output_data) { | |
| 902 foreach my $gene_symbol (keys %{$txt_output_data->{$individual}}) { | |
| 903 foreach my $ar (keys %{$txt_output_data->{$individual}->{$gene_symbol}}) { | |
| 904 foreach my $tr_stable_id (keys %{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}}) { | |
| 905 my $is_canonical = $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{is_canonical}; | |
| 906 my $canonical_tag = ($is_canonical) ? 'is_canonical' : 'not_canonical'; | |
| 907 my $req = $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{REQ}; | |
| 908 my $variants = join(';', @{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{variants}}); | |
| 909 print $fh_txt join("\t", $individual, $gene_symbol, $tr_stable_id, $canonical_tag, "OBS=$ar", "REQ=$req", $variants), "\n"; | |
| 910 } | |
| 911 } | |
| 912 } | |
| 913 } | |
| 914 $fh_txt->close(); | |
| 915 } | |
| 916 | |
| 917 sub write_charts { | |
| 918 my $self = shift; | |
| 919 my $result_summary = shift; | |
| 920 my $chart_data = shift; | |
| 921 my $canonical_transcripts = shift; | |
| 922 | |
| 923 my $count_g2p_genes = keys %{$result_summary->{g2p_list}}; | |
| 924 my $count_in_vcf_file = keys %{$result_summary->{in_vcf_file}}; | |
| 925 my $count_complete_genes = scalar keys %{$result_summary->{complete_genes}}; | |
| 926 | |
| 927 my @charts = (); | |
| 928 my @frequencies_header = (); | |
| 929 | |
| 930 foreach my $short_name (sort @{$self->{user_params}->{af_keys}}) { | |
| 931 my $text = $af_key_2_population_name->{$short_name}; | |
| 932 push @frequencies_header, "<a style=\"cursor: pointer\" data-placement=\"top\" data-toggle=\"tooltip\" data-container=\"body\" title=\"$text\">$short_name</a>"; | |
| 933 } | |
| 934 | |
| 935 my $count = 1; | |
| 936 my @new_header = ( | |
| 937 'Variant location and alleles (REF/ALT)', | |
| 938 'Variant name', | |
| 939 'Existing name', | |
| 940 'Zygosity', | |
| 941 'All allelic requirements from G2P DB', | |
| 942 'Consequence types', | |
| 943 'ClinVar annotation', | |
| 944 'SIFT', | |
| 945 'PolyPhen', | |
| 946 'Novel variant', | |
| 947 'Has been failed by Ensembl', | |
| 948 @frequencies_header, | |
| 949 'HGVS transcript', | |
| 950 'HGVS protein', | |
| 951 'RefSeq IDs', | |
| 952 ); | |
| 953 | |
| 954 my $html_output_file = $self->{user_params}->{html_report}; | |
| 955 my $fh_out = FileHandle->new($html_output_file, 'w'); | |
| 956 print $fh_out stats_html_head(\@charts); | |
| 957 print $fh_out "<div class='main_content container'>"; | |
| 958 | |
| 959 | |
| 960 print $fh_out "<h1>G2P report</h1>"; | |
| 961 print $fh_out "<p>Input and output files:</p>"; | |
| 962 | |
| 963 print $fh_out "<dl class='dl-horizontal'>"; | |
| 964 print $fh_out "<dt>G2P list</dt>"; | |
| 965 print $fh_out "<dd>" . $self->{user_params}->{file} . "</dd>"; | |
| 966 print $fh_out "<dt>Log directory</dt>"; | |
| 967 print $fh_out "<dd>" . $self->{user_params}->{log_dir} . "</dd>"; | |
| 968 print $fh_out "<dt>HTML report</dt>"; | |
| 969 print $fh_out "<dd>" . $self->{user_params}->{html_report} . "</dd>"; | |
| 970 print $fh_out "<dt>TXT report</dt>"; | |
| 971 print $fh_out "<dd>" . $self->{user_params}->{txt_report} . "</dd>"; | |
| 972 print $fh_out "</dl>"; | |
| 973 | |
| 974 print $fh_out "<p>Counts:</p>"; | |
| 975 print $fh_out "<dl class='dl-horizontal text-overflow'>"; | |
| 976 print $fh_out "<dt>$count_g2p_genes</dt>"; | |
| 977 print $fh_out "<dd>G2P genes</dd>"; | |
| 978 print $fh_out "<dt>$count_in_vcf_file</dt>"; | |
| 979 print $fh_out "<dd>G2P genes in input VCF file</dd>"; | |
| 980 print $fh_out "<dt>$count_complete_genes</dt>"; | |
| 981 print $fh_out "<dd>G2P complete genes in input VCF file</dd>"; | |
| 982 print $fh_out "</dl>"; | |
| 983 | |
| 984 | |
| 985 print $fh_out "<h1>Summary of G2P complete genes per individual</h1>"; | |
| 986 print $fh_out "<p>G2P complete gene: A sufficient number of variant hits for the observed allelic requirement in at least one of the gene's transcripts. Variants are filtered by frequency.</p>"; | |
| 987 print $fh_out "<p>Frequency thresholds and number of required variant hits for each allelic requirement:</p>"; | |
| 988 | |
| 989 print $fh_out "<table class='table table-bordered'>"; | |
| 990 print $fh_out "<thead>"; | |
| 991 print $fh_out "<tr><th>Allelic requirement</th><th>Frequency threshold for filtering</th><th>Variant counts by zygosity</th></tr>"; | |
| 992 print $fh_out "</thead>"; | |
| 993 print $fh_out "<tbody>"; | |
| 994 foreach my $ar (sort keys %$allelic_requirements) { | |
| 995 my $af = $allelic_requirements->{$ar}->{af}; | |
| 996 my $rules = $allelic_requirements->{$ar}->{rules}; | |
| 997 my $rule = join(' OR ', map {"$_ >= $rules->{$_}"} keys %$rules); | |
| 998 print $fh_out "<tr><td>$ar</td><td>$af</td><td>$rule</td></tr>"; | |
| 999 } | |
| 1000 print $fh_out "</tbody>"; | |
| 1001 print $fh_out "</table>"; | |
| 1002 | |
| 1003 my $switch =<<SHTML; | |
| 1004 <form> | |
| 1005 <div class="checkbox"> | |
| 1006 <label> | |
| 1007 <input class="target" type="checkbox"> Show only canonical transcript | |
| 1008 </label> | |
| 1009 </div> | |
| 1010 </form> | |
| 1011 SHTML | |
| 1012 | |
| 1013 print $fh_out $switch; | |
| 1014 | |
| 1015 foreach my $individual (sort keys %$chart_data) { | |
| 1016 foreach my $gene_symbol (keys %{$chart_data->{$individual}}) { | |
| 1017 foreach my $ar (keys %{$chart_data->{$individual}->{$gene_symbol}}) { | |
| 1018 print $fh_out "<ul>\n"; | |
| 1019 foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_symbol}->{$ar}}) { | |
| 1020 my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical'; | |
| 1021 print $fh_out "<li><a class=\"$class\" href=\"#$individual\_$gene_symbol\_$ar\_$transcript_stable_id\">" . "$individual > $gene_symbol > $ar > $transcript_stable_id" . "</a> </li>\n"; | |
| 1022 } | |
| 1023 print $fh_out "</ul>\n"; | |
| 1024 } | |
| 1025 } | |
| 1026 } | |
| 1027 | |
| 1028 foreach my $individual (sort keys %$chart_data) { | |
| 1029 foreach my $gene_symbol (keys %{$chart_data->{$individual}}) { | |
| 1030 foreach my $ar (keys %{$chart_data->{$individual}->{$gene_symbol}}) { | |
| 1031 foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_symbol}->{$ar}}) { | |
| 1032 my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical'; | |
| 1033 print $fh_out "<div class=\"$class\">"; | |
| 1034 my $name = "$individual\_$gene_symbol\_$ar\_$transcript_stable_id"; | |
| 1035 my $title = "$individual > $gene_symbol > $ar > $transcript_stable_id"; | |
| 1036 print $fh_out "<h3><a name=\"$name\"></a>$title <a title=\"Back to Top\" data-toggle=\"tooltip\" href='#top'><span class=\"glyphicon glyphicon-arrow-up\" aria-hidden=\"true\"></span></a></h3>\n"; | |
| 1037 print $fh_out "<div class=\"table-responsive\" style=\"width:100%\">\n"; | |
| 1038 print $fh_out "<TABLE class=\"table table-bordered table-condensed\" style=\"margin-left: 2em\">"; | |
| 1039 print $fh_out "<thead>\n"; | |
| 1040 print $fh_out "<tr>" . join('', map {"<th>$_</th>"} @new_header) . "</tr>\n"; | |
| 1041 print $fh_out "</thead>\n"; | |
| 1042 print $fh_out "<tbody>\n"; | |
| 1043 foreach my $vf_data (@{$chart_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}) { | |
| 1044 my $data_row = $vf_data->[0]; | |
| 1045 my @tds = (); | |
| 1046 foreach my $cell (@$data_row) { | |
| 1047 my $value = $cell->[0]; | |
| 1048 my $class = $cell->[1]; | |
| 1049 if ($class) { | |
| 1050 push @tds, "<td class=\"$class\">$value</td>"; | |
| 1051 } else { | |
| 1052 push @tds, "<td>$value</td>"; | |
| 1053 } | |
| 1054 } | |
| 1055 print $fh_out "<tr>", join('', @tds), "</tr>\n"; | |
| 1056 } | |
| 1057 print $fh_out "</tbody>\n"; | |
| 1058 print $fh_out "</TABLE>\n"; | |
| 1059 print $fh_out "</div>\n"; | |
| 1060 print $fh_out "</div>\n"; | |
| 1061 } | |
| 1062 } | |
| 1063 } | |
| 1064 } | |
| 1065 print $fh_out stats_html_tail(); | |
| 1066 } | |
| 1067 | |
| 1068 sub chart_and_txt_data { | |
| 1069 my $self = shift; | |
| 1070 my $result_summary = shift; | |
| 1071 my $individuals = $result_summary->{individuals}; | |
| 1072 my $complete_genes = $result_summary->{complete_genes}; | |
| 1073 my $acting_ars = $result_summary->{acting_ars}; | |
| 1074 my $new_order = $result_summary->{new_order}; | |
| 1075 | |
| 1076 # my @frequencies_header = sort keys $af_key_2_population_name; | |
| 1077 my @frequencies_header = sort @{$self->{user_params}->{af_keys}}; | |
| 1078 | |
| 1079 my $assembly = $self->{config}->{assembly}; | |
| 1080 my $transcripts = {}; | |
| 1081 my $canonical_transcripts = {}; | |
| 1082 my $transcript_adaptor = $self->{config}->{ta}; | |
| 1083 my $chart_data = {}; | |
| 1084 my $txt_output_data = {}; | |
| 1085 | |
| 1086 my $prediction2bgcolor = { | |
| 1087 'probably damaging' => 'danger', | |
| 1088 'deleterious' => 'danger', | |
| 1089 'possibly damaging' => 'warning', | |
| 1090 'unknown' => 'warning', | |
| 1091 'benign' => 'success', | |
| 1092 'tolerated' => 'success', | |
| 1093 }; | |
| 1094 | |
| 1095 foreach my $individual (sort keys %$new_order) { | |
| 1096 foreach my $gene_symbol (keys %{$new_order->{$individual}}) { | |
| 1097 foreach my $ar (keys %{$new_order->{$individual}->{$gene_symbol}}) { | |
| 1098 foreach my $transcript_stable_id (keys %{$new_order->{$individual}->{$gene_symbol}->{$ar}}) { | |
| 1099 foreach my $vf_name (keys %{$new_order->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}) { | |
| 1100 my $data = $individuals->{$individual}->{$gene_symbol}->{$vf_name}->{$transcript_stable_id}; | |
| 1101 | |
| 1102 my $hash = {}; | |
| 1103 foreach my $pair (split/;/, $data) { | |
| 1104 my ($key, $value) = split('=', $pair, 2); | |
| 1105 $value ||= ''; | |
| 1106 $hash->{$key} = $value; | |
| 1107 } | |
| 1108 my $vf_location = $hash->{vf_location}; | |
| 1109 my $existing_name = $hash->{existing_name}; | |
| 1110 if ($existing_name ne 'NA') { | |
| 1111 $existing_name = "<a href=\"http://$assembly.ensembl.org/Homo_sapiens/Variation/Explore?v=$existing_name\">$existing_name</a>"; | |
| 1112 } | |
| 1113 my $refseq = $hash->{refseq}; | |
| 1114 my $failed = $hash->{failed}; | |
| 1115 my $clin_sign = $hash->{clin_sig}; | |
| 1116 my $novel = $hash->{novel}; | |
| 1117 my $hgvs_t = $hash->{hgvs_t}; | |
| 1118 my $hgvs_p = $hash->{hgvs_p}; | |
| 1119 my $allelic_requirement = $hash->{allele_requirement}; | |
| 1120 my $observed_allelic_requirement = $hash->{ar_in_g2pdb}; | |
| 1121 my $consequence_types = $hash->{consequence_types}; | |
| 1122 my $zygosity = $hash->{zyg}; | |
| 1123 my $sift_score = $hash->{sift_score} || '0.0'; | |
| 1124 my $sift_prediction = $hash->{sift_prediction}; | |
| 1125 my $sift = 'NA'; | |
| 1126 my $sift_class = ''; | |
| 1127 if ($sift_prediction ne 'NA') { | |
| 1128 $sift = "$sift_prediction(" . "$sift_score)"; | |
| 1129 $sift_class = $prediction2bgcolor->{$sift_prediction}; | |
| 1130 } | |
| 1131 my $polyphen_score = $hash->{polyphen_score} || '0.0'; | |
| 1132 my $polyphen_prediction = $hash->{polyphen_prediction}; | |
| 1133 my $polyphen = 'NA'; | |
| 1134 my $polyphen_class = ''; | |
| 1135 if ($polyphen_prediction ne 'NA') { | |
| 1136 $polyphen = "$polyphen_prediction($polyphen_score)"; | |
| 1137 $polyphen_class = $prediction2bgcolor->{$polyphen_prediction}; | |
| 1138 } | |
| 1139 | |
| 1140 my %frequencies_hash = (); | |
| 1141 if ($hash->{frequencies} ne 'NA') { | |
| 1142 %frequencies_hash = split /[,=]/, $hash->{frequencies}; | |
| 1143 } | |
| 1144 my @frequencies = (); | |
| 1145 my @txt_output_frequencies = (); | |
| 1146 foreach my $population (@frequencies_header) { | |
| 1147 my $frequency = $frequencies_hash{$population} || ''; | |
| 1148 push @frequencies, ["$frequency"]; | |
| 1149 if ($frequency) { | |
| 1150 push @txt_output_frequencies, "$population=$frequency"; | |
| 1151 } | |
| 1152 } | |
| 1153 my $is_canonical = 0; | |
| 1154 if ($hash->{is_canonical}) { | |
| 1155 $is_canonical = ($hash->{is_canonical} eq 'yes') ? 1 : 0; | |
| 1156 } else { | |
| 1157 if ($transcripts->{$transcript_stable_id}) { | |
| 1158 $is_canonical = 1 if ($canonical_transcripts->{$transcript_stable_id}); | |
| 1159 } else { | |
| 1160 my $transcript = $transcript_adaptor->fetch_by_stable_id($transcript_stable_id); | |
| 1161 if ($transcript) { | |
| 1162 $is_canonical = $transcript->is_canonical(); | |
| 1163 $transcripts->{$transcript_stable_id} = 1; | |
| 1164 $canonical_transcripts->{$transcript_stable_id} = 1 if ($is_canonical); | |
| 1165 } | |
| 1166 } | |
| 1167 } | |
| 1168 my ($location, $alleles) = split(' ', $vf_location); | |
| 1169 $location =~ s/\-/:/; | |
| 1170 $alleles =~ s/\//:/; | |
| 1171 | |
| 1172 push @{$chart_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}, [[ | |
| 1173 [$vf_location], | |
| 1174 [$vf_name], | |
| 1175 [$existing_name], | |
| 1176 [$zygosity], | |
| 1177 [$observed_allelic_requirement], | |
| 1178 [$consequence_types], | |
| 1179 [$clin_sign], | |
| 1180 [$sift, $sift_class], | |
| 1181 [$polyphen, $polyphen_class], | |
| 1182 [$novel], | |
| 1183 [$failed], | |
| 1184 @frequencies, | |
| 1185 [$hgvs_t], | |
| 1186 [$hgvs_p], | |
| 1187 [$refseq] | |
| 1188 ], $is_canonical]; | |
| 1189 | |
| 1190 my $txt_output_variant = "$location:$alleles:$zygosity:$consequence_types:SIFT=$sift:PolyPhen=$polyphen"; | |
| 1191 if (@txt_output_frequencies) { | |
| 1192 $txt_output_variant .= ':' . join(',', @txt_output_frequencies); | |
| 1193 } | |
| 1194 $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{is_canonical} = $is_canonical; | |
| 1195 $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{REQ} = $observed_allelic_requirement; | |
| 1196 push @{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{variants}}, $txt_output_variant; | |
| 1197 } | |
| 1198 } | |
| 1199 } | |
| 1200 } | |
| 1201 } | |
| 1202 return {txt_data => $txt_output_data, chart_data => $chart_data, canonical_transcripts => $canonical_transcripts}; | |
| 1203 } | |
| 1204 | |
| 1205 sub parse_log_files { | |
| 1206 my $self = shift; | |
| 1207 | |
| 1208 my $log_dir = $self->{user_params}->{log_dir}; | |
| 1209 my @files = <$log_dir/*>; | |
| 1210 | |
| 1211 my $genes = {}; | |
| 1212 my $individuals = {}; | |
| 1213 my $complete_genes = {}; | |
| 1214 my $g2p_list = {}; | |
| 1215 my $in_vcf_file = {}; | |
| 1216 my $cache = {}; | |
| 1217 my $acting_ars = {}; | |
| 1218 | |
| 1219 my $new_order = {}; | |
| 1220 | |
| 1221 foreach my $file (@files) { | |
| 1222 my $fh = FileHandle->new($file, 'r'); | |
| 1223 while (<$fh>) { | |
| 1224 chomp; | |
| 1225 if (/^G2P_list/) { | |
| 1226 my ($flag, $gene_symbol, $DDD_category) = split/\t/; | |
| 1227 $g2p_list->{$gene_symbol} = 1; | |
| 1228 } elsif (/^G2P_in_vcf/) { | |
| 1229 my ($flag, $gene_symbol) = split/\t/; | |
| 1230 $in_vcf_file->{$gene_symbol} = 1; | |
| 1231 } elsif (/^G2P_complete/) { | |
| 1232 my ($flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $ars, $zyg) = split/\t/; | |
| 1233 foreach my $ar (split(',', $ars)) { | |
| 1234 if ($ar eq 'biallelic') { | |
| 1235 # homozygous, report complete | |
| 1236 if (uc($zyg) eq 'HOM') { | |
| 1237 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1; | |
| 1238 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1; | |
| 1239 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1; | |
| 1240 } | |
| 1241 # heterozygous | |
| 1242 # we need to cache that we've observed one | |
| 1243 elsif (uc($zyg) eq 'HET') { | |
| 1244 if (scalar keys %{$cache->{$individual}->{$tr_stable_id}} >= 1) { | |
| 1245 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1; | |
| 1246 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1; | |
| 1247 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1; | |
| 1248 # add first observed het variant to the list | |
| 1249 foreach my $vf (keys %{$cache->{$individual}->{$tr_stable_id}}) { | |
| 1250 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf} = 1; | |
| 1251 } | |
| 1252 } | |
| 1253 $cache->{$individual}->{$tr_stable_id}->{$vf_name}++; | |
| 1254 } | |
| 1255 } | |
| 1256 # monoallelic genes require only one allele | |
| 1257 elsif ($ar eq 'monoallelic' || $ar eq 'x-linked dominant' || $ar eq 'hemizygous' || $ar eq 'x-linked over-dominance') { | |
| 1258 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1; | |
| 1259 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1; | |
| 1260 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1; | |
| 1261 } | |
| 1262 } | |
| 1263 } elsif (/^G2P_flag/) { | |
| 1264 my ($flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $g2p_data) = split/\t/; | |
| 1265 $genes->{$gene_symbol}->{"$individual\t$vf_name"}->{$tr_stable_id} = $g2p_data; | |
| 1266 $individuals->{$individual}->{$gene_symbol}->{$vf_name}->{$tr_stable_id} = $g2p_data; | |
| 1267 } else { | |
| 1268 | |
| 1269 } | |
| 1270 } | |
| 1271 $fh->close(); | |
| 1272 } | |
| 1273 return { | |
| 1274 genes => $genes, | |
| 1275 individuals => $individuals, | |
| 1276 complete_genes => $complete_genes, | |
| 1277 g2p_list => $g2p_list, | |
| 1278 in_vcf_file => $in_vcf_file, | |
| 1279 acting_ars => $acting_ars, | |
| 1280 new_order => $new_order, | |
| 1281 }; | |
| 1282 } | |
| 1283 | |
| 1284 | |
| 1285 sub stats_html_head { | |
| 1286 my $charts = shift; | |
| 1287 | |
| 1288 my $html =<<SHTML; | |
| 1289 <html> | |
| 1290 <head> | |
| 1291 <title>VEP summary</title> | |
| 1292 <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous"> | |
| 1293 <style> | |
| 1294 a.inactive { | |
| 1295 color: grey; | |
| 1296 pointer-events:none; | |
| 1297 } | |
| 1298 </style> | |
| 1299 </head> | |
| 1300 <body> | |
| 1301 SHTML | |
| 1302 return $html; | |
| 1303 } | |
| 1304 | |
| 1305 sub stats_html_tail { | |
| 1306 my $script =<<SHTML; | |
| 1307 <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script> | |
| 1308 <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script> | |
| 1309 <script type="text/javascript" src="http://www.google.com/jsapi"></script> | |
| 1310 <script> | |
| 1311 \$( "input[type=checkbox]" ).on( "click", function(){ | |
| 1312 if (\$('.target').is(':checked')) { | |
| 1313 \$( "div.not_canonical" ).hide(); | |
| 1314 \$("a.not_canonical").addClass("inactive"); | |
| 1315 } else { | |
| 1316 \$( "div.not_canonical" ).show(); | |
| 1317 \$("a.not_canonical").removeClass("inactive"); | |
| 1318 } | |
| 1319 } ); | |
| 1320 \$(document).ready(function(){ | |
| 1321 \$('[data-toggle="tooltip"]').tooltip(); | |
| 1322 }); | |
| 1323 </script> | |
| 1324 SHTML | |
| 1325 return "\n</div>\n$script\n</body>\n</html>\n"; | |
| 1326 } | |
| 1327 | |
| 1328 sub sort_keys { | |
| 1329 my $data = shift; | |
| 1330 my $sort = shift; | |
| 1331 print $data, "\n"; | |
| 1332 my @keys; | |
| 1333 | |
| 1334 # sort data | |
| 1335 if(defined($sort)) { | |
| 1336 if($sort eq 'chr') { | |
| 1337 @keys = sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %{$data}; | |
| 1338 } | |
| 1339 elsif($sort eq 'value') { | |
| 1340 @keys = sort {$data->{$a} <=> $data->{$b}} keys %{$data}; | |
| 1341 } | |
| 1342 elsif(ref($sort) eq 'HASH') { | |
| 1343 @keys = sort {$sort->{$a} <=> $sort->{$b}} keys %{$data}; | |
| 1344 } | |
| 1345 } | |
| 1346 else { | |
| 1347 @keys = keys %{$data}; | |
| 1348 } | |
| 1349 | |
| 1350 return \@keys; | |
| 1351 } | |
| 1352 | |
| 1353 1; |
