| 0 | 1 =head1 LICENSE | 
|  | 2 | 
|  | 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | 
|  | 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | 
|  | 5 | 
|  | 6 Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 7 you may not use this file except in compliance with the License. | 
|  | 8 You may obtain a copy of the License at | 
|  | 9 | 
|  | 10      http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 11 | 
|  | 12 Unless required by applicable law or agreed to in writing, software | 
|  | 13 distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 15 See the License for the specific language governing permissions and | 
|  | 16 limitations under the License. | 
|  | 17 | 
|  | 18 =head1 CONTACT | 
|  | 19 | 
|  | 20  Ensembl <http://www.ensembl.org/info/about/contact/index.html> | 
|  | 21 | 
|  | 22 =cut | 
|  | 23 | 
|  | 24 =head1 NAME | 
|  | 25 | 
|  | 26  dbNSFP | 
|  | 27 | 
|  | 28 =head1 SYNOPSIS | 
|  | 29 | 
|  | 30  mv dbNSFP.pm ~/.vep/Plugins | 
|  | 31  ./vep -i variations.vcf --plugin dbNSFP,/path/to/dbNSFP.gz,col1,col2 | 
|  | 32 | 
|  | 33 =head1 DESCRIPTION | 
|  | 34 | 
|  | 35  A VEP plugin that retrieves data for missense variants from a tabix-indexed | 
|  | 36  dbNSFP file. | 
|  | 37 | 
|  | 38  Please cite the dbNSFP publication alongside the VEP if you use this resource: | 
|  | 39  http://www.ncbi.nlm.nih.gov/pubmed/21520341 | 
|  | 40 | 
|  | 41  You must have the Bio::DB::HTS module or the tabix utility must be installed | 
|  | 42  in your path to use this plugin. The dbNSFP data file can be downloaded from | 
|  | 43  https://sites.google.com/site/jpopgen/dbNSFP. | 
|  | 44 | 
|  | 45  Release 3.5a of dbNSFP uses GRCh38/hg38 coordinates and GRCh37/hg19 | 
|  | 46  coordinates. | 
|  | 47  To use plugin with GRCh37/hg19 data: | 
|  | 48  > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip | 
|  | 49  > unzip dbNSFPv3.5a.zip | 
|  | 50  > head -n1 dbNSFP3.5a_variant.chr1 > h | 
|  | 51  > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | awk '$8 != "."' | sort -k8,8 -k9,9n - | cat h - | bgzip -c > dbNSFP_hg19.gz | 
|  | 52  > tabix -s 8 -b 9 -e 9 dbNSFP_hg19.gz | 
|  | 53 | 
|  | 54  To use plugin with GRCh38/hg38 data: | 
|  | 55  > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip | 
|  | 56  > unzip dbNSFPv3.5a.zip | 
|  | 57  > head -n1 dbNSFP3.5a_variant.chr1 > h | 
|  | 58  > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | sort -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP.gz | 
|  | 59  > tabix -s 1 -b 2 -e 2 dbNSFP.gz | 
|  | 60 | 
|  | 61  When running the plugin you must list at least one column to retrieve from the | 
|  | 62  dbNSFP file, specified as parameters to the plugin e.g. | 
|  | 63 | 
|  | 64  --plugin dbNSFP,/path/to/dbNSFP.gz,LRT_score,GERP++_RS | 
|  | 65 | 
|  | 66  You may include all columns with ALL; this fetches a large amount of data per | 
|  | 67  variant!: | 
|  | 68 | 
|  | 69  --plugin dbNSFP,/path/to/dbNSFP.gz,ALL | 
|  | 70 | 
|  | 71  Tabix also allows the data file to be hosted on a remote server. This plugin is | 
|  | 72  fully compatible with such a setup - simply use the URL of the remote file: | 
|  | 73 | 
|  | 74  --plugin dbNSFP,http://my.files.com/dbNSFP.gz,col1,col2 | 
|  | 75 | 
|  | 76  The plugin replaces occurrences of ';' with ',' and '|' with '&'. However, some | 
|  | 77  data field columns, e.g. Interpro_domain, use the replacement characters. We | 
|  | 78  added a file with replacement logic for customising the required replacement | 
|  | 79  of ';' and '|' in dbNSFP data columns. In addition to the default replacements | 
|  | 80  (; to , and | to &) users can add customised replacements. Users can either modify | 
|  | 81  the file dbNSFP_replacement_logic in the VEP_plugins directory or provide their own | 
|  | 82  file as second argument when calling the plugin: | 
|  | 83 | 
|  | 84  --plugin dbNSFP,/path/to/dbNSFP.gz,/path/to/dbNSFP_replacement_logic,LRT_score,GERP++_RS | 
|  | 85 | 
|  | 86  Note that transcript sequences referred to in dbNSFP may be out of sync with | 
|  | 87  those in the latest release of Ensembl; this may lead to discrepancies with | 
|  | 88  scores retrieved from other sources. | 
|  | 89 | 
|  | 90  If the dbNSFP README file is found in the same directory as the data file, | 
|  | 91  column descriptions will be read from this and incorporated into the VEP output | 
|  | 92  file header. | 
|  | 93 | 
|  | 94 =cut | 
|  | 95 | 
|  | 96 package dbNSFP; | 
|  | 97 | 
|  | 98 use strict; | 
|  | 99 use warnings; | 
|  | 100 | 
|  | 101 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | 
|  | 102 | 
|  | 103 use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin; | 
|  | 104 | 
|  | 105 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); | 
|  | 106 | 
|  | 107 my %INCLUDE_SO = map {$_ => 1} qw(missense_variant stop_lost stop_gained start_lost); | 
|  | 108 | 
|  | 109 sub new { | 
|  | 110   my $class = shift; | 
|  | 111 | 
|  | 112   my $self = $class->SUPER::new(@_); | 
|  | 113 | 
|  | 114   $self->expand_left(0); | 
|  | 115   $self->expand_right(0); | 
|  | 116 | 
|  | 117   # get dbNSFP file | 
|  | 118   my $file = $self->params->[0]; | 
|  | 119   $self->add_file($file); | 
|  | 120 | 
|  | 121   # get headers | 
|  | 122   open HEAD, "tabix -fh $file 1:1-1 2>&1 | "; | 
|  | 123   while(<HEAD>) { | 
|  | 124     next unless /^\#/; | 
|  | 125     chomp; | 
|  | 126     $self->{headers} = [split]; | 
|  | 127   } | 
|  | 128   close HEAD; | 
|  | 129 | 
|  | 130   die "ERROR: Could not read headers from $file\n" unless defined($self->{headers}) && scalar @{$self->{headers}}; | 
|  | 131 | 
|  | 132   # check alt and Ensembl_transcriptid headers | 
|  | 133   foreach my $h(qw(alt Ensembl_transcriptid)) { | 
|  | 134     die "ERROR: Could not find required column $h in $file\n" unless grep {$_ eq $h} @{$self->{headers}}; | 
|  | 135   } | 
|  | 136 | 
|  | 137   my $i = 1; | 
|  | 138   # check if 2nd argument is a file that specifies replacement logic | 
|  | 139   # read replacement logic | 
|  | 140   my $replacement_file = $self->params->[$i]; | 
|  | 141   if (defined $replacement_file && -e $replacement_file) { | 
|  | 142     $self->add_replacement_logic($replacement_file); | 
|  | 143     $i++; | 
|  | 144   } else { | 
|  | 145     $self->add_replacement_logic(); | 
|  | 146   } | 
|  | 147 | 
|  | 148   # get required columns | 
|  | 149   while(defined($self->params->[$i])) { | 
|  | 150     my $col = $self->params->[$i]; | 
|  | 151     if($col eq 'ALL') { | 
|  | 152       $self->{cols} = {map {$_ => 1} @{$self->{headers}}}; | 
|  | 153       last; | 
|  | 154     } | 
|  | 155     die "ERROR: Column $col not found in header for file $file. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless grep {$_ eq $col} @{$self->{headers}}; | 
|  | 156 | 
|  | 157     $self->{cols}->{$self->params->[$i]} = 1; | 
|  | 158     $i++; | 
|  | 159   } | 
|  | 160 | 
|  | 161   die "ERROR: No columns selected to fetch. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless defined($self->{cols}) && scalar keys %{$self->{cols}}; | 
|  | 162 | 
|  | 163   return $self; | 
|  | 164 } | 
|  | 165 | 
|  | 166 sub feature_types { | 
|  | 167   return ['Transcript']; | 
|  | 168 } | 
|  | 169 | 
|  | 170 sub get_header_info { | 
|  | 171   my $self = shift; | 
|  | 172 | 
|  | 173   if(!exists($self->{_header_info})) { | 
|  | 174 | 
|  | 175     # look for readme | 
|  | 176     my $file_dir = $self->files->[0]; | 
|  | 177 | 
|  | 178     my %rm_descs; | 
|  | 179 | 
|  | 180     # won't work for remote | 
|  | 181     if($file_dir !~ /tp\:\/\//) { | 
|  | 182 | 
|  | 183       # get just dir | 
|  | 184       $file_dir =~ s/\/[^\/]+$/\//; | 
|  | 185 | 
|  | 186       if(opendir DIR, $file_dir) { | 
|  | 187         my ($readme_file) = grep {/dbnsfp.*readme\.txt/i} readdir DIR; | 
|  | 188         closedir DIR; | 
|  | 189 | 
|  | 190         if(open RM, $file_dir.$readme_file) { | 
|  | 191           my ($col, $reading); | 
|  | 192 | 
|  | 193           # parse dbNSFP readme | 
|  | 194           # relevant lines look like: | 
|  | 195           # | 
|  | 196           # 1   column1_name: description blah blah | 
|  | 197           #     blah blah blah | 
|  | 198           # 2   column2_name: description blah blah | 
|  | 199           #     blah blah blah | 
|  | 200 | 
|  | 201           while(<RM>) { | 
|  | 202             chomp; | 
|  | 203             s/\r$//g; | 
|  | 204 | 
|  | 205             if(/^\d+\s/) { | 
|  | 206               $reading = 1; | 
|  | 207 | 
|  | 208               m/^\d+\s+(.+?)\:\s+(.+)/; | 
|  | 209               $col = $1; | 
|  | 210 | 
|  | 211               $rm_descs{$col} = '(from dbNSFP) '.$2 if $col && $2; | 
|  | 212             } | 
|  | 213             elsif($reading && /\w/) { | 
|  | 214               s/^\s+//; | 
|  | 215               $rm_descs{$col} .= ' '.$_; | 
|  | 216             } | 
|  | 217             else { | 
|  | 218               $reading = 0; | 
|  | 219             } | 
|  | 220           } | 
|  | 221 | 
|  | 222           close RM; | 
|  | 223 | 
|  | 224           # remove multiple spaces | 
|  | 225           $rm_descs{$_} =~ s/\s+/ /g for keys %rm_descs; | 
|  | 226         } | 
|  | 227       } | 
|  | 228     } | 
|  | 229 | 
|  | 230     $self->{_header_info} = {map {$_ => $rm_descs{$_} || ($_.' from dbNSFP file')} keys %{$self->{cols}}}; | 
|  | 231   } | 
|  | 232 | 
|  | 233   return $self->{_header_info}; | 
|  | 234 } | 
|  | 235 | 
|  | 236 sub run { | 
|  | 237   my ($self, $tva) = @_; | 
|  | 238 | 
|  | 239   # only for missense variants | 
|  | 240   return {} unless grep {$INCLUDE_SO{$_->SO_term}} @{$tva->get_all_OverlapConsequences}; | 
|  | 241 | 
|  | 242   my $vf = $tva->variation_feature; | 
|  | 243 | 
|  | 244   return {} unless $vf->{start} eq $vf->{end}; | 
|  | 245 | 
|  | 246   # get allele, reverse comp if needed | 
|  | 247   my $allele = $tva->variation_feature_seq; | 
|  | 248   reverse_comp(\$allele) if $vf->{strand} < 0; | 
|  | 249 | 
|  | 250   return {} unless $allele =~ /^[ACGT]$/; | 
|  | 251 | 
|  | 252   # get transcript stable ID | 
|  | 253   my $tr_id = $tva->transcript->stable_id; | 
|  | 254 | 
|  | 255   my $data; | 
|  | 256   my $pos; | 
|  | 257 | 
|  | 258   my $assembly = $self->{config}->{assembly}; | 
|  | 259   my $chr = ($vf->{chr} =~ /MT/i) ? 'M' : $vf->{chr}; | 
|  | 260   foreach my $tmp_data(@{$self->get_data($chr, $vf->{start} - 1, $vf->{end})}) { | 
|  | 261     # compare allele and transcript | 
|  | 262     if ($assembly eq 'GRCh37') { | 
|  | 263       if (exists $tmp_data->{'pos(1-coor)'}) { | 
|  | 264         # for dbNSFP version 2.9.1 | 
|  | 265         $pos = $tmp_data->{'pos(1-coor)'} | 
|  | 266       } elsif (exists $tmp_data->{'hg19_pos(1-based)'}) { | 
|  | 267         # for dbNSFP version 3.5c indexed for hg19/(=GRCh37) | 
|  | 268         $pos =  $tmp_data->{'hg19_pos(1-based)'} | 
|  | 269       } else { | 
|  | 270         die "dbNSFP file does not contain required columns (pos(1-coor) for version 2.9.1 or hg19_pos(1-based) for version 3.5c) to use with GRCh37"; | 
|  | 271       } | 
|  | 272     } else { | 
|  | 273       if (exists $tmp_data->{'pos(1-based)'}) { | 
|  | 274         $pos = $tmp_data->{'pos(1-based)'} | 
|  | 275       } else { | 
|  | 276         die "dbNSFP file does not contain required column pos(1-based) to use with GRCh38"; | 
|  | 277       } | 
|  | 278     } | 
|  | 279 | 
|  | 280     next unless | 
|  | 281       $pos == $vf->{start} && | 
|  | 282       defined($tmp_data->{alt}) && | 
|  | 283       $tmp_data->{alt} eq $allele; | 
|  | 284 | 
|  | 285     # make a clean copy as we're going to edit it | 
|  | 286     %$data = %$tmp_data; | 
|  | 287 | 
|  | 288     # convert data with multiple transcript values | 
|  | 289     # if($data->{Ensembl_transcriptid} =~ m/\;/) { | 
|  | 290 | 
|  | 291     #   # find the "index" of this transcript | 
|  | 292     #   my @tr_ids = split(';', $data->{Ensembl_transcriptid}); | 
|  | 293     #   my $tr_index; | 
|  | 294 | 
|  | 295     #   for my $i(0..$#tr_ids) { | 
|  | 296     #     $tr_index = $i; | 
|  | 297     #     last if $tr_ids[$tr_index] =~ /^$tr_id(\.\d+)?$/; | 
|  | 298     #   } | 
|  | 299 | 
|  | 300     #   next unless defined($tr_index); | 
|  | 301 | 
|  | 302     #   # now alter other fields | 
|  | 303     #   foreach my $key(keys %$data) { | 
|  | 304     #     if($data->{$key} =~ m/\;/) { | 
|  | 305     #       my @split = split(';', $data->{$key}); | 
|  | 306     #       die("ERROR: Transcript index out of range") if $tr_index > $#split; | 
|  | 307     #       $data->{$key} = $split[$tr_index]; | 
|  | 308     #     } | 
|  | 309     #   } | 
|  | 310     # } | 
|  | 311     last; | 
|  | 312   } | 
|  | 313 | 
|  | 314   return {} unless scalar keys %$data; | 
|  | 315 | 
|  | 316   # get required data | 
|  | 317   my @from = @{$self->{replacement}->{default}->{from}}; | 
|  | 318   my @to = @{$self->{replacement}->{default}->{to}}; | 
|  | 319 | 
|  | 320   my %return; | 
|  | 321   foreach my $colname (keys %$data) { | 
|  | 322     next if(!defined($self->{cols}->{$colname})); | 
|  | 323     next if($data->{$colname} eq '.'); | 
|  | 324 | 
|  | 325     my @from = @{$self->{replacement}->{default}->{from}}; | 
|  | 326     my @to   = @{$self->{replacement}->{default}->{to}}; | 
|  | 327     @from    = @{$self->{replacement}->{$colname}->{from}} if (defined $self->{replacement}->{$colname}); | 
|  | 328     @to      = @{$self->{replacement}->{$colname}->{to}} if (defined $self->{replacement}->{$colname}); | 
|  | 329     for my $i (0 .. $#from) { | 
|  | 330       $data->{$colname} =~ s/\Q$from[$i]\E/$to[$i]/g; | 
|  | 331     } | 
|  | 332     $return{$colname} = $data->{$colname}; | 
|  | 333   } | 
|  | 334 | 
|  | 335   return \%return; | 
|  | 336 } | 
|  | 337 | 
|  | 338 sub parse_data { | 
|  | 339   my ($self, $line) = @_; | 
|  | 340 | 
|  | 341   $line =~ s/\r$//g; | 
|  | 342 | 
|  | 343   my @split = split /\t/, $line; | 
|  | 344 | 
|  | 345   # parse data into hash of col names and values | 
|  | 346   my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1)); | 
|  | 347 | 
|  | 348   return \%data; | 
|  | 349 } | 
|  | 350 | 
|  | 351 sub get_start { | 
|  | 352   return $_[1]->{'pos(1-based)'}; | 
|  | 353 } | 
|  | 354 | 
|  | 355 sub get_end { | 
|  | 356   return $_[1]->{'pos(1-based)'}; | 
|  | 357 } | 
|  | 358 | 
|  | 359 sub add_replacement_logic { | 
|  | 360   my $self = shift; | 
|  | 361   my $file = shift; | 
|  | 362   $file ||= 'dbNSFP_replacement_logic'; | 
|  | 363   if (! -e $file) { | 
|  | 364     $self->{replacement}->{default}->{from} = [';', '|']; | 
|  | 365     $self->{replacement}->{default}->{to} = [',', '&']; | 
|  | 366   } else { | 
|  | 367     open FILE, $file; | 
|  | 368     while(<FILE>) { | 
|  | 369       chomp; | 
|  | 370       next if /^colname/; | 
|  | 371       my ($colname, $from, $to) = split/\s+/; | 
|  | 372       die ("ERROR: 3 values separated by whitespace are required: colname from to.") if(!($colname && $from && $to)); | 
|  | 373       push @{$self->{replacement}->{$colname}->{from}}, $from; | 
|  | 374       push @{$self->{replacement}->{$colname}->{to}}, $to; | 
|  | 375     } | 
|  | 376     close FILE; | 
|  | 377     die("ERROR: No default replacement logic has been specified.\n") if (!defined $self->{replacement}->{default}); | 
|  | 378   } | 
|  | 379 } | 
|  | 380 | 
|  | 381 1; | 
|  | 382 | 
|  | 383 |