| 0 | 1 =head1 LICENSE | 
|  | 2 | 
|  | 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | 
|  | 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | 
|  | 5 | 
|  | 6 Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 7 you may not use this file except in compliance with the License. | 
|  | 8 You may obtain a copy of the License at | 
|  | 9 | 
|  | 10      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 11 | 
|  | 12 Unless required by applicable law or agreed to in writing, software | 
|  | 13 distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 15 See the License for the specific language governing permissions and | 
|  | 16 limitations under the License. | 
|  | 17 | 
|  | 18 =head1 CONTACT | 
|  | 19 | 
|  | 20  Ensembl <http://www.ensembl.org/info/about/contact/index.html> | 
|  | 21 | 
|  | 22 =cut | 
|  | 23 | 
|  | 24 =head1 NAME | 
|  | 25 | 
|  | 26  G2P | 
|  | 27 | 
|  | 28 =head1 SYNOPSIS | 
|  | 29 | 
|  | 30  mv G2P.pm ~/.vep/Plugins | 
|  | 31  ./vep -i variations.vcf --plugin G2P,file=/path/to/G2P.csv.gz | 
|  | 32 | 
|  | 33 =head1 DESCRIPTION | 
|  | 34 | 
|  | 35  A VEP plugin that uses G2P allelic requirements to assess variants in genes | 
|  | 36  for potential phenotype involvement. | 
|  | 37 | 
|  | 38  The plugin has multiple configuration options, though minimally requires only | 
|  | 39  the CSV file of G2P data. | 
|  | 40 | 
|  | 41  Options are passed to the plugin as key=value pairs, (defaults in parentheses): | 
|  | 42 | 
|  | 43  file                  : path to G2P data file, as found at http://www.ebi.ac.uk/gene2phenotype/downloads | 
|  | 44 | 
|  | 45  af_monoallelic        : maximum allele frequency for inclusion for monoallelic genes (0.0001) | 
|  | 46 | 
|  | 47  af_biallelic          : maximum allele frequency for inclusion for biallelic genes (0.005) | 
|  | 48  all_confidence_levels : set value to 1 to include all confidence levels: confirmed, probable and possible. | 
|  | 49                          Default levels are confirmed and probable. | 
|  | 50  af_keys               : reference populations used for annotating variant alleles with observed | 
|  | 51                          allele frequencies. Allele frequencies are stored in VEP cache files. | 
|  | 52                          Default populations are: | 
|  | 53                          ESP: AA, EA | 
|  | 54                          1000 Genomes: AFR, AMR, EAS, EUR, SAS | 
|  | 55                          gnomAD exomes: gnomAD, gnomAD_AFR, gnomAD_AMR, gnomAD_ASJ, gnomAD_EAS, gnomAD_FIN, gnomAD_NFE, gnomAD_OTH, gnomAD_SAS | 
|  | 56                          Separate multiple values with '&' | 
|  | 57  af_from_vcf           : set value to 1 to include allele frequencies from VCF file. | 
|  | 58                          Specifiy the list of reference populations to include with --af_from_vcf_keys | 
|  | 59  af_from_vcf_keys      : reference populations used for annotating variant alleles with observed | 
|  | 60                          allele frequencies. Allele frequencies are retrieved from VCF files. If | 
|  | 61                          af_from_vcf is set to 1 but no populations specified with --af_from_vcf_keys | 
|  | 62                          all available reference populations are included. | 
|  | 63                          TOPmed: TOPMed | 
|  | 64                          UK10K: ALSPAC, TWINSUK | 
|  | 65                          gnomAD exomes: gnomADe:AFR, gnomADe:ALL, gnomADe:AMR, gnomADe:ASJ, gnomADe:EAS, gnomADe:FIN, gnomADe:NFE, gnomADe:OTH, gnomADe:SAS | 
|  | 66                          gnomAD genomes: gnomADg:AFR, gnomADg:ALL, gnomADg:AMR, gnomADg:ASJ, gnomADg:EAS, gnomADg:FIN, gnomADg:NFE, gnomADg:OTH | 
|  | 67                          Separate multiple values with '&' | 
|  | 68  default_af            : default frequency of the input variant if no frequency data is | 
|  | 69                          found (0). This determines whether such variants are included; | 
|  | 70                          the value of 0 forces variants with no frequency data to be | 
|  | 71                          included as this is considered equivalent to having a frequency | 
|  | 72                          of 0. Set to 1 (or any value higher than af) to exclude them. | 
|  | 73  types                 : SO consequence types to include. Separate multiple values with '&' | 
|  | 74                          (splice_donor_variant,splice_acceptor_variant,stop_gained, | 
|  | 75                          frameshift_variant,stop_lost,initiator_codon_variant, | 
|  | 76                          inframe_insertion,inframe_deletion,missense_variant, | 
|  | 77                          coding_sequence_variant,start_lost,transcript_ablation, | 
|  | 78                          transcript_amplification,protein_altering_variant) | 
|  | 79 | 
|  | 80   log_dir              : write stats to log files in log_dir | 
|  | 81 | 
|  | 82   txt_report           : write all G2P complete genes and attributes to txt file | 
|  | 83 | 
|  | 84   html_report          : write all G2P complete genes and attributes to html file | 
|  | 85 | 
|  | 86  Example: | 
|  | 87 | 
|  | 88  --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_keys=AA&gnomAD_ASJ,types=stop_gained&frameshift_variant | 
|  | 89  --plugin G2P,file=G2P.csv,af_monoallelic=0.05,types=stop_gained&frameshift_variant | 
|  | 90  --plugin G2P,file=G2P.csv,af_monoallelic=0.05,af_from_vcf=1 | 
|  | 91  --plugin G2P,file=G2P.csv | 
|  | 92 | 
|  | 93 =cut | 
|  | 94 | 
|  | 95 package G2P; | 
|  | 96 | 
|  | 97 use strict; | 
|  | 98 use warnings; | 
|  | 99 | 
|  | 100 use Cwd; | 
|  | 101 use Scalar::Util qw(looks_like_number); | 
|  | 102 use FileHandle; | 
|  | 103 use Text::CSV; | 
|  | 104 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | 
|  | 105 | 
|  | 106 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | 
|  | 107 | 
|  | 108 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | 
|  | 109 | 
|  | 110 my %DEFAULTS = ( | 
|  | 111 | 
|  | 112   # vars must have a frequency <= to this to pass | 
|  | 113   af => 0.001, | 
|  | 114   af_monoallelic => 0.0001, | 
|  | 115   af_biallelic => 0.005, | 
|  | 116 | 
|  | 117   af_keys => [qw(AA AFR AMR EA EAS EUR SAS gnomAD gnomAD_AFR gnomAD_AMR gnomAD_ASJ gnomAD_EAS gnomAD_FIN gnomAD_NFE gnomAD_OTH gnomAD_SAS)], | 
|  | 118 | 
|  | 119   af_from_vcf_keys => [qw(ALSPAC TOPMed TWINSUK gnomADe:AFR gnomADe:ALL gnomADe:AMR gnomADe:ASJ gnomADe:EAS gnomADe:FIN gnomADe:NFE gnomADe:OTH gnomADe:SAS gnomADg:AFR gnomADg:ALL gnomADg:AMR gnomADg:ASJ gnomADg:EAS gnomADg:FIN gnomADg:NFE gnomADg:OTH)], | 
|  | 120 | 
|  | 121   # if no MAF data is found, default to 0 | 
|  | 122   # this means absence of MAF data is considered equivalent to MAF=0 | 
|  | 123   # set to 1 to do the "opposite", i.e. exclude variants with no MAF data | 
|  | 124   default_af => 0, | 
|  | 125 | 
|  | 126   confidence_levels => [qw(confirmed probable)], | 
|  | 127 | 
|  | 128   # only include variants with these consequence types | 
|  | 129   # currently not ontology-resolved, exact term matches only | 
|  | 130   types => {map {$_ => 1} qw(splice_donor_variant splice_acceptor_variant stop_gained frameshift_variant stop_lost initiator_codon_variant inframe_insertion inframe_deletion missense_variant coding_sequence_variant start_lost transcript_ablation transcript_amplification protein_altering_variant)}, | 
|  | 131 | 
|  | 132 ); | 
|  | 133 | 
|  | 134 my $af_key_2_population_name = { | 
|  | 135   minor_allele_freq => 'global allele frequency (AF) from 1000 Genomes Phase 3 data', | 
|  | 136   AFR => '1000GENOMES:phase_3:AFR', | 
|  | 137   AMR => '1000GENOMES:phase_3:AMR', | 
|  | 138   EAS => '1000GENOMES:phase_3:EAS', | 
|  | 139   EUR => '1000GENOMES:phase_3:EUR', | 
|  | 140   SAS => '1000GENOMES:phase_3:SAS', | 
|  | 141   AA => 'Exome Sequencing Project 6500:African_American', | 
|  | 142   EA => 'Exome Sequencing Project 6500:European_American', | 
|  | 143   gnomAD => 'Genome Aggregation Database:Total', | 
|  | 144   gnomAD_AFR => 'Genome Aggregation Database exomes:African/African American', | 
|  | 145   gnomAD_AMR => 'Genome Aggregation Database exomes:Latino', | 
|  | 146   gnomAD_ASJ => 'Genome Aggregation Database exomes:Ashkenazi Jewish', | 
|  | 147   gnomAD_EAS => 'Genome Aggregation Database exomes:East Asian', | 
|  | 148   gnomAD_FIN => 'Genome Aggregation Database exomes:Finnish', | 
|  | 149   gnomAD_NFE => 'Genome Aggregation Database exomes:Non-Finnish European', | 
|  | 150   gnomAD_OTH => 'Genome Aggregation Database exomes:Other (population not assigned)', | 
|  | 151   gnomAD_SAS => 'Genome Aggregation Database exomes:South Asian', | 
|  | 152   ALSPAC => 'UK10K:ALSPAC cohort', | 
|  | 153   TOPMed => 'Trans-Omics for Precision Medicine (TOPMed) Program', | 
|  | 154   TWINSUK => 'UK10K:TWINSUK cohort', | 
|  | 155   'gnomADe:AFR' => 'Genome Aggregation Database exomes v170228', | 
|  | 156   'gnomADe:ALL' => 'Genome Aggregation Database exomes v170228', | 
|  | 157   'gnomADe:AMR' => 'Genome Aggregation Database exomes v170228', | 
|  | 158   'gnomADe:ASJ' => 'Genome Aggregation Database exomes v170228', | 
|  | 159   'gnomADe:EAS' => 'Genome Aggregation Database exomes v170228', | 
|  | 160   'gnomADe:FIN' => 'Genome Aggregation Database exomes v170228', | 
|  | 161   'gnomADe:NFE' => 'Genome Aggregation Database exomes v170228', | 
|  | 162   'gnomADe:OTH' => 'Genome Aggregation Database exomes v170228', | 
|  | 163   'gnomADe:SAS' => 'Genome Aggregation Database exomes v170228', | 
|  | 164   'gnomADg:AFR' => 'Genome Aggregation Database genomes v170228:African/African American', | 
|  | 165   'gnomADg:ALL' => 'Genome Aggregation Database genomes v170228:All gnomAD genomes individuals', | 
|  | 166   'gnomADg:AMR' => 'Genome Aggregation Database genomes v170228:Latino', | 
|  | 167   'gnomADg:ASJ' => 'Genome Aggregation Database genomes v170228:Ashkenazi Jewish', | 
|  | 168   'gnomADg:EAS' => 'Genome Aggregation Database genomes v170228:East Asian', | 
|  | 169   'gnomADg:FIN' => 'Genome Aggregation Database genomes v170228:Finnish', | 
|  | 170   'gnomADg:NFE' => 'Genome Aggregation Database genomes v170228:Non-Finnish European', | 
|  | 171   'gnomADg:OTH' => 'Genome Aggregation Database genomes v170228:Other (population not assigned)', | 
|  | 172 }; | 
|  | 173 | 
|  | 174 my $allelic_requirements = { | 
|  | 175   'biallelic' => { af => 0.005, rules => {HET => 2, HOM => 1} }, | 
|  | 176   'monoallelic' => { af => 0.0001, rules => {HET => 1, HOM => 1} }, | 
|  | 177   'hemizygous' => { af => 0.0001, rules => {HET => 1, HOM => 1} }, | 
|  | 178   'x-linked dominant' => { af => 0.0001, rules => {HET => 1, HOM => 1} }, | 
|  | 179   'x-linked over-dominance' => { af => 0.0001, rules => {HET => 1, HOM => 1} }, | 
|  | 180 }; | 
|  | 181 | 
|  | 182 my @allelic_requirement_terms = keys %$allelic_requirements; | 
|  | 183 | 
|  | 184 my @population_wide = qw(minor_allele_freq AA AFR ALSPAC AMR EA EAS EUR SAS TOPMed TWINSUK gnomAD gnomAD_AFR gnomAD_AMR gnomAD_ASJ gnomAD_EAS gnomAD_FIN gnomAD_NFE gnomAD_OTH gnomAD_SAS gnomADe:AFR gnomADe:ALL gnomADe:AMR gnomADe:ASJ gnomADe:EAS gnomADe:FIN gnomADe:NFE gnomADe:OTH gnomADe:SAS gnomADg:AFR gnomADg:ALL gnomADg:AMR gnomADg:ASJ gnomADg:EAS gnomADg:FIN gnomADg:NFE gnomADg:OTH); | 
|  | 185 | 
|  | 186 sub new { | 
|  | 187   my $class = shift; | 
|  | 188 | 
|  | 189   my $self = $class->SUPER::new(@_); | 
|  | 190 | 
|  | 191   my $supported_af_keys = { map {$_ => 1} @population_wide }; | 
|  | 192 | 
|  | 193   my $params = $self->params_to_hash(); | 
|  | 194   my $file = ''; | 
|  | 195 | 
|  | 196   # user only supplied file as first param? | 
|  | 197   if (!keys %$params) { | 
|  | 198     $file = $self->params->[0]; | 
|  | 199   } | 
|  | 200   else { | 
|  | 201     $file = $params->{file}; | 
|  | 202 | 
|  | 203     # process types | 
|  | 204     if ($params->{types}) { | 
|  | 205       $params->{types} = {map {$_ => 1} split(/[\;\&\|]/, $params->{types})}; | 
|  | 206     } | 
|  | 207 | 
|  | 208     # check af | 
|  | 209     foreach my $af (qw/af_monoallelic af_biallelic/) { | 
|  | 210       if($params->{$af}) { | 
|  | 211         die("ERROR: Invalid value for af: ".$params->{$af} . "\n") unless | 
|  | 212           looks_like_number($params->{$af}) && ($params->{$af} >= 0 && $params->{$af} <= 1) | 
|  | 213       } | 
|  | 214     } | 
|  | 215 | 
|  | 216     my $assembly =  $self->{config}->{assembly}; | 
|  | 217     my $af_from_vcf_key_2_collection_id = { | 
|  | 218       ALSPAC => {GRCh37 => 'uk10k_GRCh37', GRCh38 => 'uk10k_GRCh38'}, | 
|  | 219       TOPMed => {GRCh37 => 'topmed_GRCh37', GRCh38 => 'topmed_GRCh38'}, | 
|  | 220       TWINSUK =>  {GRCh37 => 'uk10k_GRCh37', GRCh38 => 'uk10k_GRCh38'}, | 
|  | 221       'gnomADe:AFR' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | 
|  | 222       'gnomADe:ALL' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | 
|  | 223       'gnomADe:AMR' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | 
|  | 224       'gnomADe:ASJ' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | 
|  | 225       'gnomADe:EAS' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | 
|  | 226       'gnomADe:FIN' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | 
|  | 227       'gnomADe:NFE' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | 
|  | 228       'gnomADe:OTH' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | 
|  | 229       'gnomADe:SAS' => {GRCh37 => 'gnomADe_GRCh37', GRCh38 => 'gnomADe_GRCh38'}, | 
|  | 230       'gnomADg:AFR' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | 
|  | 231       'gnomADg:ALL' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | 
|  | 232       'gnomADg:AMR' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | 
|  | 233       'gnomADg:ASJ' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | 
|  | 234       'gnomADg:EAS' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | 
|  | 235       'gnomADg:FIN' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | 
|  | 236       'gnomADg:NFE' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | 
|  | 237       'gnomADg:OTH' => {GRCh37 => 'gnomADg_GRCh37', GRCh38 => 'gnomADg_GRCh38'}, | 
|  | 238     }; | 
|  | 239 | 
|  | 240     my @keys = (); | 
|  | 241     my $vcf_collection_ids = {}; | 
|  | 242     if ($params->{af_keys}) { | 
|  | 243       push @keys, $params->{af_keys}; | 
|  | 244     } else { | 
|  | 245       push @keys, @{$DEFAULTS{af_keys}}; | 
|  | 246     } | 
|  | 247     if ($params->{af_from_vcf}) { | 
|  | 248       if ($params->{af_from_vcf_keys}) { | 
|  | 249         push @keys, $params->{af_from_vcf_keys}; | 
|  | 250       } else { | 
|  | 251         push @keys, @{$DEFAULTS{af_from_vcf_keys}}; | 
|  | 252       } | 
|  | 253     } | 
|  | 254 | 
|  | 255     my @af_keys = (); | 
|  | 256     foreach my $af_key_set (@keys) { | 
|  | 257       foreach my $af_key (split(/[\;\&\|]/, $af_key_set)) { | 
|  | 258         die("ERROR: af_key: " . $af_key . " not supported. Check plugin documentation for supported af_keys.\n") unless $supported_af_keys->{$af_key}; | 
|  | 259         push @af_keys, $af_key; | 
|  | 260         if ($af_from_vcf_key_2_collection_id->{$af_key}) { | 
|  | 261 | 
|  | 262           $vcf_collection_ids->{$af_from_vcf_key_2_collection_id->{$af_key}->{$assembly}} = 1; | 
|  | 263         } | 
|  | 264       } | 
|  | 265     } | 
|  | 266     $params->{af_keys} = \@af_keys; | 
|  | 267     $params->{vcf_collection_ids} = $vcf_collection_ids; | 
|  | 268   } | 
|  | 269 | 
|  | 270   my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time); | 
|  | 271   $year += 1900; | 
|  | 272   $mon++; | 
|  | 273   my $stamp = join('_', ($year, $mon, $mday, $hour, $min)); | 
|  | 274   my $cwd_dir = getcwd; | 
|  | 275   my $new_log_dir = "$cwd_dir/g2p_log_dir\_$stamp"; | 
|  | 276   my $log_dir = $params->{log_dir} || $new_log_dir; | 
|  | 277   if (-d $log_dir) { | 
|  | 278     my @files = <$log_dir/*>; | 
|  | 279     if (scalar @files > 0) { | 
|  | 280       unlink glob "'$log_dir/*.*'"; | 
|  | 281     } | 
|  | 282     @files = <$log_dir/*>; | 
|  | 283     if (scalar @files > 0) { | 
|  | 284       mkdir $new_log_dir, 0755; | 
|  | 285       $params->{log_dir} = $new_log_dir; | 
|  | 286     } | 
|  | 287   } else { | 
|  | 288     mkdir $log_dir, 0755; | 
|  | 289     $params->{log_dir} = $log_dir; | 
|  | 290   } | 
|  | 291 | 
|  | 292   foreach my $report_type (qw/txt_report html_report/) { | 
|  | 293     if (!$params->{$report_type}) { | 
|  | 294       my $file_type = ($report_type eq 'txt_report') ? 'txt' : 'html'; | 
|  | 295       $params->{$report_type} = $cwd_dir . "/$report_type\_$stamp.$file_type"; | 
|  | 296     } | 
|  | 297   } | 
|  | 298 | 
|  | 299   if ($params->{all_confidence_levels}) { | 
|  | 300     push @{$params->{confidence_levels}}, 'possible', @{$DEFAULTS{confidence_levels}}; | 
|  | 301   } | 
|  | 302 | 
|  | 303   # copy in default params | 
|  | 304   $params->{$_} //= $DEFAULTS{$_} for keys %DEFAULTS; | 
|  | 305   $self->{user_params} = $params; | 
|  | 306 | 
|  | 307   if (!defined($self->{config}->{reg})) { | 
|  | 308     my $reg = 'Bio::EnsEMBL::Registry'; | 
|  | 309     $reg->load_registry_from_db( | 
|  | 310       -host       => $self->config->{host}, | 
|  | 311       -user       => $self->config->{user}, | 
|  | 312       -port       => $self->config->{port}, | 
|  | 313       -db_version => $self->config->{db_version}, | 
|  | 314       -species    => $self->config->{species}, | 
|  | 315       -no_cache   => $self->config->{no_slice_cache}, | 
|  | 316     ); | 
|  | 317     $self->{config}->{reg} = $reg; | 
|  | 318   } | 
|  | 319 | 
|  | 320   my $va = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'variation'); | 
|  | 321   $va->db->use_vcf(1); | 
|  | 322   $va->db->include_failed_variations(1); | 
|  | 323   $self->{config}->{va} = $va; | 
|  | 324   my $pa = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'population'); | 
|  | 325   $self->{config}->{pa} = $pa; | 
|  | 326   my $vca = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'variation', 'VCFCollection'); | 
|  | 327   $self->{config}->{vca} = $vca; | 
|  | 328   my $ta = $self->{config}->{reg}->get_adaptor($self->{config}->{species}, 'core', 'transcript'); | 
|  | 329   $self->{config}->{ta} = $ta; | 
|  | 330 | 
|  | 331   # read data from file | 
|  | 332   $self->{gene_data} = $self->read_gene_data_from_file($file); | 
|  | 333   $self->synonym_mappings(); | 
|  | 334 | 
|  | 335   # force some config params | 
|  | 336   $self->{config}->{individual} //= ['all']; | 
|  | 337   $self->{config}->{symbol} = 1; | 
|  | 338 | 
|  | 339   $self->{config}->{check_existing} = 1; | 
|  | 340   $self->{config}->{failed} = 1; | 
|  | 341   $self->{config}->{af} = 1; | 
|  | 342   $self->{config}->{af_1kg} = 1; | 
|  | 343   $self->{config}->{af_esp} = 1; | 
|  | 344   $self->{config}->{af_gnomad} = 1; | 
|  | 345 #  $self->{config}->{sift} = 'b'; | 
|  | 346 #  $self->{config}->{polyphen} = 'b'; | 
|  | 347 #  $self->{config}->{hgvsc} = 1; | 
|  | 348 #  $self->{config}->{hgvsp} = 1; | 
|  | 349 | 
|  | 350   # tell VEP we have a cache so stuff gets shared/merged between forks | 
|  | 351   $self->{has_cache} = 1; | 
|  | 352   $self->{cache}->{g2p_in_vcf} = {}; | 
|  | 353 | 
|  | 354   return $self; | 
|  | 355 } | 
|  | 356 | 
|  | 357 sub feature_types { | 
|  | 358   return ['Transcript']; | 
|  | 359 } | 
|  | 360 | 
|  | 361 sub get_header_info { | 
|  | 362   my $self = shift; | 
|  | 363 | 
|  | 364   return { | 
|  | 365     G2P_flag => 'Flags zygosity of valid variants for a G2P gene', | 
|  | 366     G2P_complete => 'Indicates this variant completes the allelic requirements for a G2P gene', | 
|  | 367     G2P_gene_req => 'MONO or BI depending on the context in which this gene has been explored', | 
|  | 368   }; | 
|  | 369 } | 
|  | 370 | 
|  | 371 sub run { | 
|  | 372   my ($self, $tva, $line) = @_; | 
|  | 373 | 
|  | 374   # only interested if we know the zygosity | 
|  | 375   my $zyg = $line->{Extra}->{ZYG} || $line->{ZYG}; | 
|  | 376   return {} unless $zyg; | 
|  | 377 | 
|  | 378   # only interested in given gene set | 
|  | 379   my $tr = $tva->transcript; | 
|  | 380 | 
|  | 381   my $ensembl_gene_id = $tr->{_gene}->stable_id; | 
|  | 382   my $gene_symbol = $tr->{_gene_symbol} || $tr->{_gene_hgnc}; | 
|  | 383   return {} unless $gene_symbol; | 
|  | 384   my $gene_data = $self->gene_data($gene_symbol); | 
|  | 385   if (! defined $gene_data) { | 
|  | 386     my $ensembl_gene_id = $tr->{_gene}->stable_id; | 
|  | 387     $gene_data = $self->gene_data($ensembl_gene_id); | 
|  | 388   } | 
|  | 389 | 
|  | 390   if (!$self->{cache}->{g2p_in_vcf}->{$gene_symbol}) { | 
|  | 391     $self->write_report('G2P_in_vcf', $gene_symbol); | 
|  | 392     $self->{cache}->{g2p_in_vcf}->{$gene_symbol} = 1; | 
|  | 393   } | 
|  | 394   return {} unless defined $gene_data; | 
|  | 395 | 
|  | 396   my @ars = ($gene_data->{'allelic requirement'}) ? @{$gene_data->{'allelic requirement'}} : (); | 
|  | 397   my %seen; | 
|  | 398   @ars = grep { !$seen{$_}++ } @ars; | 
|  | 399 | 
|  | 400   return {} unless (@ars && ( grep { exists($allelic_requirements->{$_}) } @ars)); | 
|  | 401   # limit by type | 
|  | 402   my @consequence_types = map { $_->SO_term } @{$tva->get_all_OverlapConsequences}; | 
|  | 403 | 
|  | 404   return {} unless grep {$self->{user_params}->{types}->{$_->SO_term}} @{$tva->get_all_OverlapConsequences}; | 
|  | 405 | 
|  | 406   # limit by MAF | 
|  | 407   my $threshold = 0; | 
|  | 408   my ($freqs, $existing_variant, $ar_passed) = @{$self->get_freq($tva, \@ars)}; | 
|  | 409 | 
|  | 410   return {} if (!keys %$ar_passed); | 
|  | 411 | 
|  | 412   my $vf = $tva->base_variation_feature; | 
|  | 413   my $allele = $tva->variation_feature_seq; | 
|  | 414   my $start = $vf->{start}; | 
|  | 415   my $end = $vf->{end}; | 
|  | 416 | 
|  | 417   my $individual = $vf->{individual}; | 
|  | 418   my $vf_name = $vf->variation_name; | 
|  | 419   if ($vf_name || $vf_name eq '.') { | 
|  | 420     $vf_name = ($vf->{original_chr} || $vf->{chr}) . '_' . $vf->{start} . '_' . ($vf->{allele_string} || $vf->{class_SO_term}); | 
|  | 421   } | 
|  | 422   my $allele_string = $vf->{allele_string}; | 
|  | 423   my @alleles = split('/', $allele_string); | 
|  | 424   my $ref = $alleles[0]; | 
|  | 425   my $seq_region_name = $vf->{chr}; | 
|  | 426 | 
|  | 427   my $params = $self->{user_params}; | 
|  | 428   my $refseq = $tr->{_refseq} || 'NA'; | 
|  | 429   my $tr_stable_id = $tr->stable_id; | 
|  | 430   my $hgvs_t = $tva->hgvs_transcript || 'NA'; | 
|  | 431   my $hgvs_p = $tva->hgvs_protein || 'NA'; | 
|  | 432 | 
|  | 433   my ($clin_sig, $novel, $failed, $frequencies, $existing_name) = ('NA', 'yes', 'NA', 'NA', 'NA'); | 
|  | 434   if ($existing_variant) { | 
|  | 435     $clin_sig = $existing_variant->{clin_sig} || 'NA'; | 
|  | 436     $failed = ($existing_variant->{failed}) ? 'yes' : 'no'; | 
|  | 437     $existing_name = $existing_variant->{variation_name} || 'NA'; | 
|  | 438     $novel = 'no'; | 
|  | 439   } | 
|  | 440 | 
|  | 441   my $pph_score   = (defined $tva->polyphen_score) ? $tva->polyphen_score : 'NA'; | 
|  | 442   my $pph_pred    = (defined $tva->polyphen_prediction) ? $tva->polyphen_prediction : 'NA'; | 
|  | 443   my $sift_score  = (defined $tva->sift_score) ? $tva->sift_score : 'NA'; | 
|  | 444   my $sift_pred   = (defined $tva->sift_prediction) ? $tva->sift_prediction : 'NA'; | 
|  | 445 | 
|  | 446   if (scalar keys %$freqs > 0) { | 
|  | 447     $frequencies = join(',', map {"$_=$freqs->{$_}"} keys %$freqs); | 
|  | 448   } | 
|  | 449 | 
|  | 450   my $ar = join(',', sort keys %$ar_passed); | 
|  | 451   my $ar_in_g2pdb = join(',', sort @ars); | 
|  | 452   my $g2p_data = { | 
|  | 453     'zyg' => $zyg, | 
|  | 454     'allele_requirement' => $ar, | 
|  | 455     'ar_in_g2pdb' => $ar_in_g2pdb, | 
|  | 456     'frequencies' => $frequencies, | 
|  | 457     'consequence_types' => join(',', @consequence_types), | 
|  | 458     'refseq' => $refseq, | 
|  | 459     'failed' => $failed, | 
|  | 460     'clin_sig' => $clin_sig, | 
|  | 461     'novel' => $novel, | 
|  | 462     'existing_name' => $existing_name, | 
|  | 463     'hgvs_t' => $hgvs_t, | 
|  | 464     'hgvs_p' => $hgvs_p, | 
|  | 465     'vf_location' => "$seq_region_name:$start-$end $ref/$allele", | 
|  | 466     'sift_score' => "$sift_score", | 
|  | 467     'sift_prediction' => $sift_pred, | 
|  | 468     'polyphen_score' => "$pph_score", | 
|  | 469     'polyphen_prediction' => $pph_pred, | 
|  | 470   }; | 
|  | 471 | 
|  | 472   my %return = ( | 
|  | 473     G2P_flag => $zyg | 
|  | 474   ); | 
|  | 475 | 
|  | 476 | 
|  | 477   $self->write_report('G2P_flag', $gene_symbol, $tr_stable_id, $individual, $vf_name, $g2p_data); | 
|  | 478 | 
|  | 479   $self->write_report('G2P_complete', $gene_symbol, $tr_stable_id, $individual, $vf_name, $ar, $zyg); | 
|  | 480 | 
|  | 481   my $cache = $self->{cache}->{$individual}->{$tr->stable_id} ||= {}; | 
|  | 482 | 
|  | 483   delete $cache->{$vf_name} if exists($cache->{$vf_name}); | 
|  | 484 | 
|  | 485   # biallelic genes require >=1 hom or >=2 hets | 
|  | 486 | 
|  | 487   my $gene_reqs = {}; | 
|  | 488 | 
|  | 489   foreach my $ar (keys %$ar_passed) { | 
|  | 490     if($ar eq 'biallelic') { | 
|  | 491       # homozygous, report complete | 
|  | 492       if(uc($zyg) eq 'HOM') { | 
|  | 493         $return{G2P_complete} = 1; | 
|  | 494         $gene_reqs->{BI} = 1; | 
|  | 495       } | 
|  | 496       # heterozygous | 
|  | 497       # we need to cache that we've observed one | 
|  | 498       elsif(uc($zyg) eq 'HET') { | 
|  | 499         if(scalar keys %$cache) { | 
|  | 500           $return{G2P_complete} = 1; | 
|  | 501         } | 
|  | 502         $cache->{$vf_name} = 1; | 
|  | 503       } | 
|  | 504     } | 
|  | 505     # monoallelic genes require only one allele | 
|  | 506     elsif($ar eq 'monoallelic' || $ar eq 'x-linked dominant' || $ar eq 'hemizygous' || $ar eq 'x-linked over-dominance') { | 
|  | 507       $return{G2P_complete} = 1; | 
|  | 508       $gene_reqs->{MONO} = 1; | 
|  | 509     } | 
|  | 510     else { | 
|  | 511       return {}; | 
|  | 512     } | 
|  | 513   } | 
|  | 514   if ($return{G2P_complete}) { | 
|  | 515     $return{G2P_gene_req} = join(',', sort keys %$gene_reqs); | 
|  | 516   } | 
|  | 517 | 
|  | 518   return \%return; | 
|  | 519 } | 
|  | 520 | 
|  | 521 # read G2P CSV dump | 
|  | 522 # as from http://www.ebi.ac.uk/gene2phenotype/downloads | 
|  | 523 sub read_gene_data_from_file { | 
|  | 524   my $self = shift; | 
|  | 525   my $file = shift; | 
|  | 526   my $delimiter = shift; | 
|  | 527   my (@headers, %gene_data); | 
|  | 528 | 
|  | 529   my $assembly =  $self->{config}->{assembly}; | 
|  | 530   die("ERROR: No file specified or could not read from file ".($file || '')."\n") unless $file && -e $file; | 
|  | 531 | 
|  | 532   my @confidence_levels = @{$self->{user_params}->{confidence_levels}}, "\n"; | 
|  | 533 | 
|  | 534   # determine file type | 
|  | 535   my $file_type; | 
|  | 536   my $fh = FileHandle->new($file, 'r'); | 
|  | 537   while (<$fh>) { | 
|  | 538     chomp; | 
|  | 539       if (/Model_Of_Inheritance/) { | 
|  | 540         $file_type = 'panelapp'; | 
|  | 541       } elsif (/"allelic requirement"/) { | 
|  | 542         $file_type = 'g2p'; | 
|  | 543       } else { | 
|  | 544         $file_type = 'unknown'; | 
|  | 545       } | 
|  | 546       last; | 
|  | 547   } | 
|  | 548   $fh->close(); | 
|  | 549   if ($file_type eq 'unknown') { | 
|  | 550     if ($file =~ /gz$/) { | 
|  | 551       die("ERROR: G2P plugin can only read uncompressed data"); | 
|  | 552     } else { | 
|  | 553       die("ERROR: Could not recognize input file format. Format must be one of panelapp, g2p or custom. Check website for details: https://www.ebi.ac.uk/gene2phenotype/g2p_vep_plugin"); | 
|  | 554     } | 
|  | 555   } | 
|  | 556 | 
|  | 557   if ($file_type eq 'panelapp') { | 
|  | 558     my @headers = (); | 
|  | 559     my $csv = Text::CSV->new ({ sep_char => "\t" }); | 
|  | 560     open my $fh, "<:encoding(utf8)", "$file" or die "$file: $!"; | 
|  | 561     while ( my $row = $csv->getline( $fh ) ) { | 
|  | 562       unless (@headers) { | 
|  | 563         @headers = @$row; | 
|  | 564       } else { | 
|  | 565         my %tmp = map {$headers[$_] => $row->[$_]} (0..$#headers); | 
|  | 566         my $gene_symbol = $tmp{"Gene Entity Symbol"}; | 
|  | 567         my $ensembl_gene_id = ""; | 
|  | 568         if ($assembly eq 'GRCh37') { | 
|  | 569           $ensembl_gene_id = $tmp{"EnsemblId(GRch37)"}; | 
|  | 570         } else { # GRCh38 | 
|  | 571           $ensembl_gene_id = $tmp{"EnsemblId(GRch38)"}; | 
|  | 572         } | 
|  | 573         if ($ensembl_gene_id) { | 
|  | 574           my @ars = (); | 
|  | 575           my $allelic_requirement_panel_app = $tmp{"Model_Of_Inheritance"}; | 
|  | 576           if ($allelic_requirement_panel_app =~ m/MONOALLELIC|BOTH/) { | 
|  | 577             push @ars, 'monoallelic'; | 
|  | 578           } elsif ($allelic_requirement_panel_app =~ m/BIALLELIC|BOTH/) { | 
|  | 579             push @ars, 'biallelic'; | 
|  | 580           } elsif ($allelic_requirement_panel_app eq 'X-LINKED: hemizygous mutation in males, biallelic mutations in females') { | 
|  | 581             push @ars, 'hemizygous'; | 
|  | 582           } elsif ($allelic_requirement_panel_all eq 'X-LINKED: hemizygous mutation in males, monoallelic mutations in females may cause disease (may be less severe, later onset than males)') { | 
|  | 583             push @ars, 'x-linked dominant'; | 
|  | 584           } else { | 
|  | 585             $self->write_report('log', "no allelelic_requirement for $ensembl_gene_id"); | 
|  | 586           } | 
|  | 587           foreach my $ar (@ars) { | 
|  | 588             push @{$gene_data{$ensembl_gene_id}->{"allelic requirement"}}, $ar; | 
|  | 589           } | 
|  | 590         } else { | 
|  | 591           $self->write_report('log', "no ensembl gene id for $gene_symbol"); | 
|  | 592         } | 
|  | 593       } | 
|  | 594     } | 
|  | 595     $csv->eof or $csv->error_diag(); | 
|  | 596     close $fh; | 
|  | 597   } | 
|  | 598 | 
|  | 599   if ($file_type eq 'g2p') { | 
|  | 600     # this regexp allows for nested ",", e.g. | 
|  | 601     # item,description | 
|  | 602     # cheese,"salty,delicious" | 
|  | 603     my $re = qr/(?: "\( ( [^()""]* ) \)" |  \( ( [^()]* ) \) |  " ( [^"]* ) " |  ( [^,]* ) ) , \s* /x; | 
|  | 604 | 
|  | 605     my $fh = FileHandle->new($file, 'r'); | 
|  | 606 | 
|  | 607     while(<$fh>) { | 
|  | 608       chomp; | 
|  | 609       $_ =~ s/\R//g; | 
|  | 610       my @split = grep defined, "$_," =~ /$re/g; | 
|  | 611       unless(@headers) { | 
|  | 612         if ($file_type eq 'g2p') { | 
|  | 613           @headers = map {s/\"//g; $_} @split; | 
|  | 614         } else { | 
|  | 615           @headers = @split; | 
|  | 616         } | 
|  | 617       } | 
|  | 618       else { | 
|  | 619         my %tmp = map {$headers[$_] => $split[$_]} (0..$#split); | 
|  | 620         die("ERROR: Gene symbol column not found\n$_\n") unless $tmp{"gene symbol"}; | 
|  | 621         my $confidence_value = $tmp{"DDD category"}; | 
|  | 622         next if (!grep{$_ eq $confidence_value} @confidence_levels); | 
|  | 623         my $gene_symbol = $tmp{"gene symbol"}; | 
|  | 624         $gene_data{$gene_symbol}->{"prev symbols"} = $tmp{"prev symbols"}; | 
|  | 625         push @{$gene_data{$gene_symbol}->{"allelic requirement"}}, $tmp{"allelic requirement"} if ($tmp{"allelic requirement"}); | 
|  | 626         $self->write_report('G2P_list', $tmp{"gene symbol"}, $tmp{"DDD category"}); | 
|  | 627       } | 
|  | 628     } | 
|  | 629     $fh->close; | 
|  | 630   } | 
|  | 631   return \%gene_data; | 
|  | 632 } | 
|  | 633 | 
|  | 634 # return either whole gene data hash or one gene's data | 
|  | 635 # this should allow updates to this plugin to e.g. query a REST server, for example | 
|  | 636 sub gene_data { | 
|  | 637   my ($self, $gene_symbol) = @_; | 
|  | 638   my $gene_data = $self->{gene_data}->{$gene_symbol}; | 
|  | 639   if (!$gene_data) { | 
|  | 640     my $prev_gene_symbol = $self->{prev_symbol_mappings}->{$gene_symbol}; | 
|  | 641     return $prev_gene_symbol ? $self->{gene_data}->{$prev_gene_symbol} : undef; | 
|  | 642   } | 
|  | 643   return $gene_data; | 
|  | 644 } | 
|  | 645 | 
|  | 646 sub synonym_mappings { | 
|  | 647   my $self = shift; | 
|  | 648   my $gene_data = $self->{gene_data}; | 
|  | 649   my $synonym_mappings = {}; | 
|  | 650   foreach my $gene_symbol (keys %$gene_data) { | 
|  | 651     my $prev_symbols = $gene_data->{$gene_symbol}->{'prev symbols'}; | 
|  | 652     if ($prev_symbols) { | 
|  | 653       foreach my $prev_symbol (split(';', $prev_symbols)) { | 
|  | 654         $synonym_mappings->{$prev_symbol} = $gene_symbol; | 
|  | 655       } | 
|  | 656     } | 
|  | 657   } | 
|  | 658   $self->{prev_symbol_mappings} = $synonym_mappings; | 
|  | 659 } | 
|  | 660 | 
|  | 661 sub get_freq { | 
|  | 662   my $self = shift; | 
|  | 663   my $tva = shift; | 
|  | 664   my $ars = shift; | 
|  | 665   my $allele = $tva->variation_feature_seq; | 
|  | 666   my $vf     = $tva->base_variation_feature; | 
|  | 667   reverse_comp(\$allele) if $vf->{strand} < 0; | 
|  | 668   my $vf_name = $vf->variation_name; | 
|  | 669   if ($vf_name || $vf_name eq '.') { | 
|  | 670     $vf_name = ($vf->{original_chr} || $vf->{chr}) . '_' . $vf->{start} . '_' . ($vf->{allele_string} || $vf->{class_SO_term}); | 
|  | 671   } | 
|  | 672   my $cache = $self->{cache}->{$vf_name}->{_g2p_freqs} ||= {}; | 
|  | 673 | 
|  | 674   if (exists $cache->{$allele}->{failed}) { | 
|  | 675     return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}]; | 
|  | 676   } | 
|  | 677 | 
|  | 678   if (exists $cache->{$allele}->{freq}) { | 
|  | 679     return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}]; | 
|  | 680   } | 
|  | 681 | 
|  | 682   if (!$vf->{existing}) { | 
|  | 683     my $failed_ars = {}; | 
|  | 684     my $freqs = {}; | 
|  | 685     my $passed = $self->frequencies_from_VCF($freqs, $vf, $allele, $ars, $failed_ars); | 
|  | 686     if (!$passed) { | 
|  | 687       $cache->{$allele}->{failed} = 1; | 
|  | 688       return [{}, {}, {}]; | 
|  | 689     } else { | 
|  | 690      $cache->{$allele}->{freq} = $freqs; | 
|  | 691      $cache->{$allele}->{ex_variant} = undef; | 
|  | 692      # if we get to here return all allelic requirements that passed threshold filtering | 
|  | 693      my $passed_ar = {}; | 
|  | 694      foreach my $ar (@$ars) { | 
|  | 695       if (!$failed_ars->{$ar}) { | 
|  | 696         $passed_ar->{$ar} = 1; | 
|  | 697       } | 
|  | 698      } | 
|  | 699      $cache->{$allele}->{passed_ar} = $passed_ar; | 
|  | 700      return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}]; | 
|  | 701     } | 
|  | 702   } | 
|  | 703 | 
|  | 704   my @existing_variants = @{$vf->{existing}}; | 
|  | 705   # favour dbSNP variants | 
|  | 706   my @dbSNP_variants = grep {$_->{variation_name} =~ /^rs/} @existing_variants; | 
|  | 707   if (@dbSNP_variants) { | 
|  | 708     @existing_variants = @dbSNP_variants; | 
|  | 709   } | 
|  | 710   foreach my $ex (@existing_variants) { | 
|  | 711     my $existing_allele_string = $ex->{allele_string}; | 
|  | 712     my $variation_name = $ex->{variation_name}; | 
|  | 713     my $freqs = {}; | 
|  | 714     my $failed_ars = {}; | 
|  | 715     foreach my $af_key (@{$self->{user_params}->{af_keys}}) { | 
|  | 716       my $freq = $self->{user_params}->{default_af}; | 
|  | 717       if ($af_key eq 'minor_allele_freq') { | 
|  | 718         if (defined $ex->{minor_allele_freq}) { | 
|  | 719           if (($ex->{minor_allele} || '') eq $allele ) { | 
|  | 720             $freq = $ex->{minor_allele_freq}; | 
|  | 721           } else { | 
|  | 722             $freq = $self->correct_frequency($tva, $existing_allele_string, $ex->{minor_allele}, $ex->{minor_allele_freq}, $allele, $variation_name, $af_key, $vf_name) || $freq; | 
|  | 723           } | 
|  | 724         } | 
|  | 725       } | 
|  | 726       else { | 
|  | 727         my @pairs = split(',', $ex->{$af_key} || ''); | 
|  | 728         my $found = 0; | 
|  | 729         if (scalar @pairs == 0) { | 
|  | 730           $found = 1; # no allele frequency for this population/af_key available | 
|  | 731         } | 
|  | 732         foreach my $pair (@pairs) { | 
|  | 733           my ($a, $f) = split(':', $pair); | 
|  | 734           if(($a || '') eq $allele && defined($f)) { | 
|  | 735             $freq = $f; | 
|  | 736             $found = 1; | 
|  | 737           } | 
|  | 738         } | 
|  | 739         if (!$found) { | 
|  | 740           $freq = $self->correct_frequency($tva, $existing_allele_string, undef, undef, $allele, $variation_name, $af_key, $vf_name) || $freq; | 
|  | 741         } | 
|  | 742       } | 
|  | 743       if (!$self->continue_af_annotation($ars, $failed_ars, $freq)) { | 
|  | 744         # cache failed results | 
|  | 745         $cache->{$allele}->{failed} = 1; | 
|  | 746         return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}]; | 
|  | 747       } | 
|  | 748       $freqs->{$af_key} = $freq if ($freq); | 
|  | 749     } | 
|  | 750     if ($self->{user_params}->{af_from_vcf}) { | 
|  | 751       my $passed = $self->frequencies_from_VCF($freqs, $vf, $allele, $ars, $failed_ars); | 
|  | 752       if (!$passed) { | 
|  | 753         $cache->{$allele}->{failed} = 1; | 
|  | 754         return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, {}]; | 
|  | 755       } | 
|  | 756     } | 
|  | 757     $cache->{$allele}->{freq} = $freqs; | 
|  | 758     $cache->{$allele}->{ex_variant} = $ex; | 
|  | 759 | 
|  | 760     # if we get to here return all allelic requirements that passed threshold filtering | 
|  | 761     my $passed_ar = {}; | 
|  | 762     foreach my $ar (@$ars) { | 
|  | 763       if (!$failed_ars->{$ar}) { | 
|  | 764         $passed_ar->{$ar} = 1; | 
|  | 765       } | 
|  | 766     } | 
|  | 767     $cache->{$allele}->{passed_ar} = $passed_ar; | 
|  | 768 | 
|  | 769   } | 
|  | 770 | 
|  | 771   return [$cache->{$allele}->{freq}, $cache->{$allele}->{ex_variant}, $cache->{$allele}->{passed_ar}]; | 
|  | 772 } | 
|  | 773 | 
|  | 774 sub correct_frequency { | 
|  | 775   my ($self, $tva, $allele_string, $minor_allele, $af, $allele, $variation_name, $af_key, $vf_name) = @_; | 
|  | 776 | 
|  | 777   my @existing_alleles = split('/', $allele_string); | 
|  | 778   if (!grep( /^$allele$/, @existing_alleles)) { | 
|  | 779     return 0.0; | 
|  | 780   } | 
|  | 781 | 
|  | 782   if ($af_key eq 'minor_allele_freq' && (scalar @existing_alleles == 2)) { | 
|  | 783     my $existing_ref_allele = $existing_alleles[0]; | 
|  | 784     my $existing_alt_allele = $existing_alleles[1]; | 
|  | 785     if ( ($minor_allele eq $existing_ref_allele && ($allele eq $existing_alt_allele)) || | 
|  | 786          ($minor_allele eq $existing_alt_allele && ($allele eq $existing_ref_allele)) ) { | 
|  | 787       return (1.0 - $af); | 
|  | 788     } | 
|  | 789   } else { | 
|  | 790     my $va = $self->{config}->{va}; | 
|  | 791     my $pa = $self->{config}->{pa}; | 
|  | 792     my $variation = $va->fetch_by_name($variation_name); | 
|  | 793     my $af_key = $self->{user_params}->{af_keys}; | 
|  | 794     my $population_name = $af_key_2_population_name->{$af_key}; | 
|  | 795     if ($population_name) { | 
|  | 796       my $population = $self->{config}->{$population_name}; | 
|  | 797       if (!$population) { | 
|  | 798         $population = $pa->fetch_by_name($population_name); | 
|  | 799         $self->{config}->{$population_name} = $population; | 
|  | 800       } | 
|  | 801       foreach (@{$variation->get_all_Alleles($population)}) { | 
|  | 802         if ($_->allele eq $allele) { | 
|  | 803           return $_->frequency; | 
|  | 804         } | 
|  | 805       } | 
|  | 806     } | 
|  | 807   } | 
|  | 808   return 0.0; | 
|  | 809 } | 
|  | 810 | 
|  | 811 sub frequencies_from_VCF { | 
|  | 812   my $self = shift; | 
|  | 813   my $freqs = shift; | 
|  | 814   my $vf = shift; | 
|  | 815   my $vf_allele = shift; | 
|  | 816   my $ars = shift; | 
|  | 817   my $failed_ars = shift; | 
|  | 818   my $vca = $self->{config}->{vca}; | 
|  | 819   my $collections = $vca->fetch_all; | 
|  | 820   foreach my $vc (@$collections) { | 
|  | 821     next if (! $self->{user_params}->{vcf_collection_ids}->{$vc->id}); | 
|  | 822     my $alleles = $vc->get_all_Alleles_by_VariationFeature($vf); | 
|  | 823     foreach my $allele (@$alleles) { | 
|  | 824       if ($allele->allele eq $vf_allele) { | 
|  | 825         my $af_key = $allele->population->name; | 
|  | 826         my $freq = $allele->frequency; | 
|  | 827         return 0 if (!$self->continue_af_annotation($ars, $failed_ars, $freq)); | 
|  | 828         $freqs->{$af_key} = $freq; | 
|  | 829       } | 
|  | 830     } | 
|  | 831   } | 
|  | 832   return 1; | 
|  | 833 } | 
|  | 834 | 
|  | 835 sub continue_af_annotation { | 
|  | 836   my $self = shift; | 
|  | 837   my $ars = shift; | 
|  | 838   my $failed_ars = shift; | 
|  | 839   my $freq = shift; | 
|  | 840   foreach my $ar (@$ars) { | 
|  | 841     if (!$failed_ars->{$ar})  { | 
|  | 842       if (defined $allelic_requirements->{$ar}) { | 
|  | 843         my $threshold = $allelic_requirements->{$ar}->{af}; | 
|  | 844         if ($freq > $threshold) { | 
|  | 845           $failed_ars->{$ar} = 1; | 
|  | 846         } | 
|  | 847       } | 
|  | 848     } | 
|  | 849   } | 
|  | 850   return (scalar @$ars != scalar keys %$failed_ars); | 
|  | 851 } | 
|  | 852 | 
|  | 853 | 
|  | 854 | 
|  | 855 sub write_report { | 
|  | 856   my $self = shift; | 
|  | 857   my $flag = shift; | 
|  | 858   my $log_dir = $self->{user_params}->{log_dir}; | 
|  | 859   my $log_file = "$log_dir/$$.txt"; | 
|  | 860   open(my $fh, '>>', $log_file) or die "Could not open file '$flag $log_file' $!"; | 
|  | 861   if ($flag eq 'G2P_list') { | 
|  | 862     my ($gene_symbol, $DDD_category) = @_; | 
|  | 863     $DDD_category ||= 'Not assigned'; | 
|  | 864     print $fh "$flag\t$gene_symbol\t$DDD_category\n"; | 
|  | 865   } elsif ($flag eq 'G2P_in_vcf') { | 
|  | 866     my $gene_symbol = shift; | 
|  | 867     print $fh "$flag\t$gene_symbol\n"; | 
|  | 868   } elsif ($flag eq 'G2P_complete') { | 
|  | 869     print $fh join("\t", $flag, @_), "\n"; | 
|  | 870   } elsif ($flag eq 'log') { | 
|  | 871     print $fh join("\t", $flag, @_), "\n"; | 
|  | 872   } else { | 
|  | 873     my ($gene_symbol, $tr_stable_id, $individual, $vf_name, $data) = @_; | 
|  | 874     $data = join(';', map {"$_=$data->{$_}"} sort keys %$data); | 
|  | 875     print $fh join("\t", $flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $data), "\n"; | 
|  | 876   } | 
|  | 877   close $fh; | 
|  | 878 } | 
|  | 879 | 
|  | 880 sub finish { | 
|  | 881   my $self = shift; | 
|  | 882   $self->generate_report; | 
|  | 883 } | 
|  | 884 | 
|  | 885 sub generate_report { | 
|  | 886   my $self = shift; | 
|  | 887   my $result_summary = $self->parse_log_files; | 
|  | 888   my $chart_txt_data = $self->chart_and_txt_data($result_summary); | 
|  | 889   my $chart_data = $chart_txt_data->{chart_data}; | 
|  | 890   my $txt_data = $chart_txt_data->{txt_data}; | 
|  | 891   my $canonical_transcripts = $chart_txt_data->{canonical_transcripts}; | 
|  | 892   $self->write_txt_output($txt_data); | 
|  | 893   $self->write_charts($result_summary, $chart_data, $canonical_transcripts); | 
|  | 894 } | 
|  | 895 | 
|  | 896 sub write_txt_output { | 
|  | 897   my $self = shift; | 
|  | 898   my $txt_output_data = shift; | 
|  | 899   my $txt_output_file = $self->{user_params}->{txt_report}; | 
|  | 900   my $fh_txt = FileHandle->new($txt_output_file, 'w'); | 
|  | 901   foreach my $individual (sort keys %$txt_output_data) { | 
|  | 902     foreach my $gene_symbol (keys %{$txt_output_data->{$individual}}) { | 
|  | 903       foreach my $ar (keys %{$txt_output_data->{$individual}->{$gene_symbol}}) { | 
|  | 904         foreach my $tr_stable_id (keys %{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}}) { | 
|  | 905           my $is_canonical = $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{is_canonical}; | 
|  | 906           my $canonical_tag = ($is_canonical) ? 'is_canonical' : 'not_canonical'; | 
|  | 907           my $req =  $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{REQ}; | 
|  | 908           my $variants = join(';', @{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{variants}}); | 
|  | 909           print $fh_txt join("\t", $individual, $gene_symbol, $tr_stable_id, $canonical_tag, "OBS=$ar", "REQ=$req", $variants), "\n"; | 
|  | 910         } | 
|  | 911       } | 
|  | 912     } | 
|  | 913   } | 
|  | 914   $fh_txt->close(); | 
|  | 915 } | 
|  | 916 | 
|  | 917 sub write_charts { | 
|  | 918   my $self = shift; | 
|  | 919   my $result_summary = shift; | 
|  | 920   my $chart_data = shift; | 
|  | 921   my $canonical_transcripts = shift; | 
|  | 922 | 
|  | 923   my $count_g2p_genes = keys %{$result_summary->{g2p_list}}; | 
|  | 924   my $count_in_vcf_file = keys %{$result_summary->{in_vcf_file}}; | 
|  | 925   my $count_complete_genes = scalar keys %{$result_summary->{complete_genes}}; | 
|  | 926 | 
|  | 927   my @charts = (); | 
|  | 928   my @frequencies_header = (); | 
|  | 929 | 
|  | 930   foreach my $short_name (sort @{$self->{user_params}->{af_keys}}) { | 
|  | 931     my $text = $af_key_2_population_name->{$short_name}; | 
|  | 932     push @frequencies_header, "<a style=\"cursor: pointer\" data-placement=\"top\" data-toggle=\"tooltip\" data-container=\"body\" title=\"$text\">$short_name</a>"; | 
|  | 933   } | 
|  | 934 | 
|  | 935   my $count = 1; | 
|  | 936   my @new_header = ( | 
|  | 937     'Variant location and alleles (REF/ALT)', | 
|  | 938     'Variant name', | 
|  | 939     'Existing name', | 
|  | 940     'Zygosity', | 
|  | 941     'All allelic requirements from G2P DB', | 
|  | 942     'Consequence types', | 
|  | 943     'ClinVar annotation', | 
|  | 944     'SIFT', | 
|  | 945     'PolyPhen', | 
|  | 946     'Novel variant', | 
|  | 947     'Has been failed by Ensembl', | 
|  | 948     @frequencies_header, | 
|  | 949     'HGVS transcript', | 
|  | 950     'HGVS protein', | 
|  | 951     'RefSeq IDs', | 
|  | 952   ); | 
|  | 953 | 
|  | 954   my $html_output_file = $self->{user_params}->{html_report}; | 
|  | 955   my $fh_out = FileHandle->new($html_output_file, 'w'); | 
|  | 956   print $fh_out stats_html_head(\@charts); | 
|  | 957   print $fh_out "<div class='main_content container'>"; | 
|  | 958 | 
|  | 959 | 
|  | 960   print $fh_out "<h1>G2P report</h1>"; | 
|  | 961   print $fh_out "<p>Input and output files:</p>"; | 
|  | 962 | 
|  | 963   print $fh_out "<dl class='dl-horizontal'>"; | 
|  | 964   print $fh_out "<dt>G2P list</dt>"; | 
|  | 965   print $fh_out "<dd>" . $self->{user_params}->{file} .  "</dd>"; | 
|  | 966   print $fh_out "<dt>Log directory</dt>"; | 
|  | 967   print $fh_out "<dd>" . $self->{user_params}->{log_dir} .  "</dd>"; | 
|  | 968   print $fh_out "<dt>HTML report</dt>"; | 
|  | 969   print $fh_out "<dd>" . $self->{user_params}->{html_report} .  "</dd>"; | 
|  | 970   print $fh_out "<dt>TXT report</dt>"; | 
|  | 971   print $fh_out "<dd>" . $self->{user_params}->{txt_report} .  "</dd>"; | 
|  | 972   print $fh_out "</dl>"; | 
|  | 973 | 
|  | 974   print $fh_out "<p>Counts:</p>"; | 
|  | 975   print $fh_out "<dl class='dl-horizontal text-overflow'>"; | 
|  | 976   print $fh_out "<dt>$count_g2p_genes</dt>"; | 
|  | 977   print $fh_out "<dd>G2P genes</dd>"; | 
|  | 978   print $fh_out "<dt>$count_in_vcf_file</dt>"; | 
|  | 979   print $fh_out "<dd>G2P genes in input VCF file</dd>"; | 
|  | 980   print $fh_out "<dt>$count_complete_genes</dt>"; | 
|  | 981   print $fh_out "<dd>G2P complete genes in input VCF file</dd>"; | 
|  | 982   print $fh_out "</dl>"; | 
|  | 983 | 
|  | 984 | 
|  | 985   print $fh_out "<h1>Summary of G2P complete genes per individual</h1>"; | 
|  | 986   print $fh_out "<p>G2P complete gene: A sufficient number of variant hits for the observed allelic requirement in at least one of the gene's transcripts. Variants are filtered by frequency.</p>"; | 
|  | 987   print $fh_out "<p>Frequency thresholds and number of required variant hits for each allelic requirement:</p>"; | 
|  | 988 | 
|  | 989   print $fh_out "<table class='table table-bordered'>"; | 
|  | 990   print $fh_out "<thead>"; | 
|  | 991   print $fh_out "<tr><th>Allelic requirement</th><th>Frequency threshold for filtering</th><th>Variant counts by zygosity</th></tr>"; | 
|  | 992   print $fh_out "</thead>"; | 
|  | 993   print $fh_out "<tbody>"; | 
|  | 994   foreach my $ar (sort keys %$allelic_requirements) { | 
|  | 995     my $af = $allelic_requirements->{$ar}->{af}; | 
|  | 996     my $rules =  $allelic_requirements->{$ar}->{rules}; | 
|  | 997     my $rule = join(' OR ', map {"$_ >= $rules->{$_}"} keys %$rules); | 
|  | 998     print $fh_out "<tr><td>$ar</td><td>$af</td><td>$rule</td></tr>"; | 
|  | 999   } | 
|  | 1000   print $fh_out "</tbody>"; | 
|  | 1001   print $fh_out "</table>"; | 
|  | 1002 | 
|  | 1003 my $switch =<<SHTML; | 
|  | 1004 <form> | 
|  | 1005 <div class="checkbox"> | 
|  | 1006   <label> | 
|  | 1007     <input class="target" type="checkbox"> Show only canonical transcript | 
|  | 1008   </label> | 
|  | 1009 </div> | 
|  | 1010 </form> | 
|  | 1011 SHTML | 
|  | 1012 | 
|  | 1013   print $fh_out $switch; | 
|  | 1014 | 
|  | 1015   foreach my $individual (sort keys %$chart_data) { | 
|  | 1016     foreach my $gene_symbol (keys %{$chart_data->{$individual}}) { | 
|  | 1017       foreach my $ar (keys %{$chart_data->{$individual}->{$gene_symbol}}) { | 
|  | 1018         print $fh_out "<ul>\n"; | 
|  | 1019         foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_symbol}->{$ar}}) { | 
|  | 1020           my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical'; | 
|  | 1021           print $fh_out "<li><a class=\"$class\" href=\"#$individual\_$gene_symbol\_$ar\_$transcript_stable_id\">" . "$individual > $gene_symbol > $ar > $transcript_stable_id" . "</a> </li>\n"; | 
|  | 1022         } | 
|  | 1023         print $fh_out "</ul>\n"; | 
|  | 1024       } | 
|  | 1025     } | 
|  | 1026   } | 
|  | 1027 | 
|  | 1028   foreach my $individual (sort keys %$chart_data) { | 
|  | 1029     foreach my $gene_symbol (keys %{$chart_data->{$individual}}) { | 
|  | 1030       foreach my $ar (keys %{$chart_data->{$individual}->{$gene_symbol}}) { | 
|  | 1031         foreach my $transcript_stable_id (keys %{$chart_data->{$individual}->{$gene_symbol}->{$ar}}) { | 
|  | 1032           my $class = ($canonical_transcripts->{$transcript_stable_id}) ? 'is_canonical' : 'not_canonical'; | 
|  | 1033           print $fh_out "<div class=\"$class\">"; | 
|  | 1034           my $name = "$individual\_$gene_symbol\_$ar\_$transcript_stable_id"; | 
|  | 1035           my $title = "$individual > $gene_symbol > $ar > $transcript_stable_id"; | 
|  | 1036           print $fh_out "<h3><a name=\"$name\"></a>$title <a title=\"Back to Top\" data-toggle=\"tooltip\" href='#top'><span class=\"glyphicon glyphicon-arrow-up\" aria-hidden=\"true\"></span></a></h3>\n"; | 
|  | 1037           print $fh_out "<div class=\"table-responsive\" style=\"width:100%\">\n"; | 
|  | 1038           print $fh_out "<TABLE  class=\"table table-bordered table-condensed\" style=\"margin-left: 2em\">"; | 
|  | 1039           print $fh_out "<thead>\n"; | 
|  | 1040           print $fh_out "<tr>" . join('', map {"<th>$_</th>"} @new_header) . "</tr>\n"; | 
|  | 1041           print $fh_out "</thead>\n"; | 
|  | 1042           print $fh_out "<tbody>\n"; | 
|  | 1043           foreach my $vf_data (@{$chart_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}) { | 
|  | 1044             my $data_row = $vf_data->[0]; | 
|  | 1045             my @tds = (); | 
|  | 1046             foreach my $cell (@$data_row) { | 
|  | 1047               my $value = $cell->[0]; | 
|  | 1048               my $class = $cell->[1]; | 
|  | 1049               if ($class) { | 
|  | 1050                 push @tds, "<td class=\"$class\">$value</td>"; | 
|  | 1051               } else { | 
|  | 1052                 push @tds, "<td>$value</td>"; | 
|  | 1053               } | 
|  | 1054             } | 
|  | 1055             print $fh_out "<tr>", join('', @tds), "</tr>\n"; | 
|  | 1056           } | 
|  | 1057           print $fh_out "</tbody>\n"; | 
|  | 1058           print $fh_out "</TABLE>\n"; | 
|  | 1059           print $fh_out "</div>\n"; | 
|  | 1060           print $fh_out "</div>\n"; | 
|  | 1061         } | 
|  | 1062       } | 
|  | 1063     } | 
|  | 1064   } | 
|  | 1065   print $fh_out stats_html_tail(); | 
|  | 1066 } | 
|  | 1067 | 
|  | 1068 sub chart_and_txt_data { | 
|  | 1069   my $self = shift; | 
|  | 1070   my $result_summary = shift; | 
|  | 1071   my $individuals = $result_summary->{individuals}; | 
|  | 1072   my $complete_genes = $result_summary->{complete_genes}; | 
|  | 1073   my $acting_ars = $result_summary->{acting_ars}; | 
|  | 1074   my $new_order = $result_summary->{new_order}; | 
|  | 1075 | 
|  | 1076 #  my @frequencies_header = sort keys $af_key_2_population_name; | 
|  | 1077   my @frequencies_header = sort @{$self->{user_params}->{af_keys}}; | 
|  | 1078 | 
|  | 1079   my $assembly = $self->{config}->{assembly}; | 
|  | 1080   my $transcripts = {}; | 
|  | 1081   my $canonical_transcripts = {}; | 
|  | 1082   my $transcript_adaptor = $self->{config}->{ta}; | 
|  | 1083   my $chart_data = {}; | 
|  | 1084   my $txt_output_data = {}; | 
|  | 1085 | 
|  | 1086   my $prediction2bgcolor = { | 
|  | 1087     'probably damaging' => 'danger', | 
|  | 1088     'deleterious' => 'danger', | 
|  | 1089     'possibly damaging' => 'warning', | 
|  | 1090     'unknown'  => 'warning', | 
|  | 1091     'benign' => 'success', | 
|  | 1092     'tolerated' => 'success', | 
|  | 1093   }; | 
|  | 1094 | 
|  | 1095   foreach my $individual (sort keys %$new_order) { | 
|  | 1096     foreach my $gene_symbol (keys %{$new_order->{$individual}}) { | 
|  | 1097       foreach my $ar (keys %{$new_order->{$individual}->{$gene_symbol}}) { | 
|  | 1098         foreach my $transcript_stable_id (keys %{$new_order->{$individual}->{$gene_symbol}->{$ar}}) { | 
|  | 1099           foreach my $vf_name (keys %{$new_order->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}) { | 
|  | 1100             my $data = $individuals->{$individual}->{$gene_symbol}->{$vf_name}->{$transcript_stable_id}; | 
|  | 1101 | 
|  | 1102             my $hash = {}; | 
|  | 1103             foreach my $pair (split/;/, $data) { | 
|  | 1104               my ($key, $value) = split('=', $pair, 2); | 
|  | 1105               $value ||= ''; | 
|  | 1106               $hash->{$key} = $value; | 
|  | 1107             } | 
|  | 1108             my $vf_location = $hash->{vf_location}; | 
|  | 1109             my $existing_name = $hash->{existing_name}; | 
|  | 1110             if ($existing_name ne 'NA') { | 
|  | 1111               $existing_name = "<a href=\"http://$assembly.ensembl.org/Homo_sapiens/Variation/Explore?v=$existing_name\">$existing_name</a>"; | 
|  | 1112             } | 
|  | 1113             my $refseq = $hash->{refseq}; | 
|  | 1114             my $failed = $hash->{failed}; | 
|  | 1115             my $clin_sign = $hash->{clin_sig}; | 
|  | 1116             my $novel = $hash->{novel}; | 
|  | 1117             my $hgvs_t = $hash->{hgvs_t}; | 
|  | 1118             my $hgvs_p = $hash->{hgvs_p}; | 
|  | 1119             my $allelic_requirement = $hash->{allele_requirement}; | 
|  | 1120             my $observed_allelic_requirement = $hash->{ar_in_g2pdb}; | 
|  | 1121             my $consequence_types = $hash->{consequence_types}; | 
|  | 1122             my $zygosity = $hash->{zyg}; | 
|  | 1123             my $sift_score = $hash->{sift_score} || '0.0'; | 
|  | 1124             my $sift_prediction = $hash->{sift_prediction}; | 
|  | 1125             my $sift = 'NA'; | 
|  | 1126             my $sift_class = ''; | 
|  | 1127             if ($sift_prediction ne 'NA') { | 
|  | 1128               $sift = "$sift_prediction(" . "$sift_score)"; | 
|  | 1129               $sift_class = $prediction2bgcolor->{$sift_prediction}; | 
|  | 1130             } | 
|  | 1131             my $polyphen_score = $hash->{polyphen_score} || '0.0'; | 
|  | 1132             my $polyphen_prediction = $hash->{polyphen_prediction}; | 
|  | 1133             my $polyphen = 'NA'; | 
|  | 1134             my $polyphen_class = ''; | 
|  | 1135             if ($polyphen_prediction ne 'NA') { | 
|  | 1136               $polyphen = "$polyphen_prediction($polyphen_score)"; | 
|  | 1137               $polyphen_class =  $prediction2bgcolor->{$polyphen_prediction}; | 
|  | 1138             } | 
|  | 1139 | 
|  | 1140             my %frequencies_hash = (); | 
|  | 1141             if ($hash->{frequencies} ne 'NA') { | 
|  | 1142               %frequencies_hash = split /[,=]/, $hash->{frequencies}; | 
|  | 1143             } | 
|  | 1144             my @frequencies = (); | 
|  | 1145             my @txt_output_frequencies = (); | 
|  | 1146             foreach my $population (@frequencies_header) { | 
|  | 1147               my $frequency = $frequencies_hash{$population} || ''; | 
|  | 1148               push @frequencies, ["$frequency"]; | 
|  | 1149               if ($frequency) { | 
|  | 1150                 push @txt_output_frequencies, "$population=$frequency"; | 
|  | 1151               } | 
|  | 1152             } | 
|  | 1153             my $is_canonical = 0; | 
|  | 1154             if ($hash->{is_canonical}) { | 
|  | 1155               $is_canonical = ($hash->{is_canonical} eq 'yes') ? 1 : 0; | 
|  | 1156             } else { | 
|  | 1157               if ($transcripts->{$transcript_stable_id}) { | 
|  | 1158                 $is_canonical = 1 if ($canonical_transcripts->{$transcript_stable_id}); | 
|  | 1159               } else { | 
|  | 1160                 my $transcript = $transcript_adaptor->fetch_by_stable_id($transcript_stable_id); | 
|  | 1161                 if ($transcript) { | 
|  | 1162                   $is_canonical = $transcript->is_canonical(); | 
|  | 1163                   $transcripts->{$transcript_stable_id} = 1; | 
|  | 1164                   $canonical_transcripts->{$transcript_stable_id} = 1 if ($is_canonical); | 
|  | 1165                 } | 
|  | 1166               } | 
|  | 1167             } | 
|  | 1168             my ($location, $alleles) = split(' ', $vf_location); | 
|  | 1169             $location =~ s/\-/:/; | 
|  | 1170             $alleles =~ s/\//:/; | 
|  | 1171 | 
|  | 1172             push @{$chart_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}}, [[ | 
|  | 1173               [$vf_location], | 
|  | 1174               [$vf_name], | 
|  | 1175               [$existing_name], | 
|  | 1176               [$zygosity], | 
|  | 1177               [$observed_allelic_requirement], | 
|  | 1178               [$consequence_types], | 
|  | 1179               [$clin_sign], | 
|  | 1180               [$sift, $sift_class], | 
|  | 1181               [$polyphen, $polyphen_class], | 
|  | 1182               [$novel], | 
|  | 1183               [$failed], | 
|  | 1184               @frequencies, | 
|  | 1185               [$hgvs_t], | 
|  | 1186               [$hgvs_p], | 
|  | 1187               [$refseq] | 
|  | 1188             ], $is_canonical]; | 
|  | 1189 | 
|  | 1190             my $txt_output_variant = "$location:$alleles:$zygosity:$consequence_types:SIFT=$sift:PolyPhen=$polyphen"; | 
|  | 1191             if (@txt_output_frequencies) { | 
|  | 1192               $txt_output_variant .= ':' . join(',', @txt_output_frequencies); | 
|  | 1193             } | 
|  | 1194             $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{is_canonical} = $is_canonical; | 
|  | 1195             $txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{REQ} = $observed_allelic_requirement; | 
|  | 1196             push @{$txt_output_data->{$individual}->{$gene_symbol}->{$ar}->{$transcript_stable_id}->{variants}}, $txt_output_variant; | 
|  | 1197           } | 
|  | 1198         } | 
|  | 1199       } | 
|  | 1200     } | 
|  | 1201   } | 
|  | 1202   return {txt_data => $txt_output_data, chart_data => $chart_data, canonical_transcripts => $canonical_transcripts}; | 
|  | 1203 } | 
|  | 1204 | 
|  | 1205 sub parse_log_files { | 
|  | 1206   my $self = shift; | 
|  | 1207 | 
|  | 1208   my $log_dir = $self->{user_params}->{log_dir}; | 
|  | 1209   my @files = <$log_dir/*>; | 
|  | 1210 | 
|  | 1211   my $genes = {}; | 
|  | 1212   my $individuals = {}; | 
|  | 1213   my $complete_genes = {}; | 
|  | 1214   my $g2p_list = {}; | 
|  | 1215   my $in_vcf_file = {}; | 
|  | 1216   my $cache = {}; | 
|  | 1217   my $acting_ars = {}; | 
|  | 1218 | 
|  | 1219   my $new_order = {}; | 
|  | 1220 | 
|  | 1221   foreach my $file (@files) { | 
|  | 1222     my $fh = FileHandle->new($file, 'r'); | 
|  | 1223     while (<$fh>) { | 
|  | 1224       chomp; | 
|  | 1225       if (/^G2P_list/) { | 
|  | 1226         my ($flag, $gene_symbol, $DDD_category) = split/\t/; | 
|  | 1227         $g2p_list->{$gene_symbol} = 1; | 
|  | 1228       } elsif (/^G2P_in_vcf/) { | 
|  | 1229         my ($flag, $gene_symbol) = split/\t/; | 
|  | 1230         $in_vcf_file->{$gene_symbol} = 1; | 
|  | 1231       } elsif (/^G2P_complete/) { | 
|  | 1232         my ($flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $ars, $zyg) = split/\t/; | 
|  | 1233         foreach my $ar (split(',', $ars)) { | 
|  | 1234           if ($ar eq 'biallelic') { | 
|  | 1235             # homozygous, report complete | 
|  | 1236             if (uc($zyg) eq 'HOM') { | 
|  | 1237               $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1; | 
|  | 1238               $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1; | 
|  | 1239               $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1; | 
|  | 1240             } | 
|  | 1241             # heterozygous | 
|  | 1242             # we need to cache that we've observed one | 
|  | 1243             elsif (uc($zyg) eq 'HET') { | 
|  | 1244               if (scalar keys %{$cache->{$individual}->{$tr_stable_id}} >= 1) { | 
|  | 1245                 $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1; | 
|  | 1246                 $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1; | 
|  | 1247                 $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1; | 
|  | 1248                 # add first observed het variant to the list | 
|  | 1249                 foreach my $vf (keys %{$cache->{$individual}->{$tr_stable_id}}) { | 
|  | 1250                   $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf} = 1; | 
|  | 1251                 } | 
|  | 1252               } | 
|  | 1253               $cache->{$individual}->{$tr_stable_id}->{$vf_name}++; | 
|  | 1254             } | 
|  | 1255           } | 
|  | 1256           # monoallelic genes require only one allele | 
|  | 1257           elsif ($ar eq 'monoallelic' || $ar eq 'x-linked dominant' || $ar eq 'hemizygous' || $ar eq 'x-linked over-dominance') { | 
|  | 1258             $complete_genes->{$gene_symbol}->{$individual}->{$tr_stable_id} = 1; | 
|  | 1259             $acting_ars->{$gene_symbol}->{$individual}->{$ar} = 1; | 
|  | 1260             $new_order->{$individual}->{$gene_symbol}->{$ar}->{$tr_stable_id}->{$vf_name} = 1; | 
|  | 1261           } | 
|  | 1262         } | 
|  | 1263       } elsif (/^G2P_flag/) { | 
|  | 1264         my ($flag, $gene_symbol, $tr_stable_id, $individual, $vf_name, $g2p_data) = split/\t/; | 
|  | 1265         $genes->{$gene_symbol}->{"$individual\t$vf_name"}->{$tr_stable_id} = $g2p_data; | 
|  | 1266         $individuals->{$individual}->{$gene_symbol}->{$vf_name}->{$tr_stable_id} = $g2p_data; | 
|  | 1267       } else { | 
|  | 1268 | 
|  | 1269       } | 
|  | 1270     } | 
|  | 1271     $fh->close(); | 
|  | 1272   } | 
|  | 1273   return { | 
|  | 1274     genes => $genes, | 
|  | 1275     individuals => $individuals, | 
|  | 1276     complete_genes => $complete_genes, | 
|  | 1277     g2p_list => $g2p_list, | 
|  | 1278     in_vcf_file => $in_vcf_file, | 
|  | 1279     acting_ars => $acting_ars, | 
|  | 1280     new_order => $new_order, | 
|  | 1281   }; | 
|  | 1282 } | 
|  | 1283 | 
|  | 1284 | 
|  | 1285 sub stats_html_head { | 
|  | 1286     my $charts = shift; | 
|  | 1287 | 
|  | 1288     my $html =<<SHTML; | 
|  | 1289 <html> | 
|  | 1290 <head> | 
|  | 1291   <title>VEP summary</title> | 
|  | 1292   <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous"> | 
|  | 1293   <style> | 
|  | 1294     a.inactive { | 
|  | 1295       color: grey; | 
|  | 1296       pointer-events:none; | 
|  | 1297     } | 
|  | 1298   </style> | 
|  | 1299 </head> | 
|  | 1300 <body> | 
|  | 1301 SHTML | 
|  | 1302   return $html; | 
|  | 1303 } | 
|  | 1304 | 
|  | 1305 sub stats_html_tail { | 
|  | 1306   my $script =<<SHTML; | 
|  | 1307   <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script> | 
|  | 1308   <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script> | 
|  | 1309   <script type="text/javascript" src="http://www.google.com/jsapi"></script> | 
|  | 1310   <script> | 
|  | 1311     \$( "input[type=checkbox]" ).on( "click", function(){ | 
|  | 1312       if (\$('.target').is(':checked')) { | 
|  | 1313         \$( "div.not_canonical" ).hide(); | 
|  | 1314         \$("a.not_canonical").addClass("inactive"); | 
|  | 1315       } else { | 
|  | 1316         \$( "div.not_canonical" ).show(); | 
|  | 1317         \$("a.not_canonical").removeClass("inactive"); | 
|  | 1318       } | 
|  | 1319     } ); | 
|  | 1320   \$(document).ready(function(){ | 
|  | 1321     \$('[data-toggle="tooltip"]').tooltip(); | 
|  | 1322   }); | 
|  | 1323   </script> | 
|  | 1324 SHTML | 
|  | 1325   return "\n</div>\n$script\n</body>\n</html>\n"; | 
|  | 1326 } | 
|  | 1327 | 
|  | 1328 sub sort_keys { | 
|  | 1329   my $data = shift; | 
|  | 1330   my $sort = shift; | 
|  | 1331   print $data, "\n"; | 
|  | 1332   my @keys; | 
|  | 1333 | 
|  | 1334   # sort data | 
|  | 1335   if(defined($sort)) { | 
|  | 1336     if($sort eq 'chr') { | 
|  | 1337       @keys = sort {($a !~ /^\d+$/ || $b !~ /^\d+/) ? $a cmp $b : $a <=> $b} keys %{$data}; | 
|  | 1338     } | 
|  | 1339     elsif($sort eq 'value') { | 
|  | 1340       @keys = sort {$data->{$a} <=> $data->{$b}} keys %{$data}; | 
|  | 1341     } | 
|  | 1342     elsif(ref($sort) eq 'HASH') { | 
|  | 1343       @keys = sort {$sort->{$a} <=> $sort->{$b}} keys %{$data}; | 
|  | 1344     } | 
|  | 1345   } | 
|  | 1346   else { | 
|  | 1347     @keys = keys %{$data}; | 
|  | 1348   } | 
|  | 1349 | 
|  | 1350   return \@keys; | 
|  | 1351 } | 
|  | 1352 | 
|  | 1353 1; |