Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/dbNSFP.pm @ 0:e545d0a25ffe draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:17:17 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e545d0a25ffe |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | |
| 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | |
| 5 | |
| 6 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 you may not use this file except in compliance with the License. | |
| 8 You may obtain a copy of the License at | |
| 9 | |
| 10 http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 | |
| 12 Unless required by applicable law or agreed to in writing, software | |
| 13 distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 See the License for the specific language governing permissions and | |
| 16 limitations under the License. | |
| 17 | |
| 18 =head1 CONTACT | |
| 19 | |
| 20 Ensembl <http://www.ensembl.org/info/about/contact/index.html> | |
| 21 | |
| 22 =cut | |
| 23 | |
| 24 =head1 NAME | |
| 25 | |
| 26 dbNSFP | |
| 27 | |
| 28 =head1 SYNOPSIS | |
| 29 | |
| 30 mv dbNSFP.pm ~/.vep/Plugins | |
| 31 ./vep -i variations.vcf --plugin dbNSFP,/path/to/dbNSFP.gz,col1,col2 | |
| 32 | |
| 33 =head1 DESCRIPTION | |
| 34 | |
| 35 A VEP plugin that retrieves data for missense variants from a tabix-indexed | |
| 36 dbNSFP file. | |
| 37 | |
| 38 Please cite the dbNSFP publication alongside the VEP if you use this resource: | |
| 39 http://www.ncbi.nlm.nih.gov/pubmed/21520341 | |
| 40 | |
| 41 You must have the Bio::DB::HTS module or the tabix utility must be installed | |
| 42 in your path to use this plugin. The dbNSFP data file can be downloaded from | |
| 43 https://sites.google.com/site/jpopgen/dbNSFP. | |
| 44 | |
| 45 Release 3.5a of dbNSFP uses GRCh38/hg38 coordinates and GRCh37/hg19 | |
| 46 coordinates. | |
| 47 To use plugin with GRCh37/hg19 data: | |
| 48 > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip | |
| 49 > unzip dbNSFPv3.5a.zip | |
| 50 > head -n1 dbNSFP3.5a_variant.chr1 > h | |
| 51 > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | awk '$8 != "."' | sort -k8,8 -k9,9n - | cat h - | bgzip -c > dbNSFP_hg19.gz | |
| 52 > tabix -s 8 -b 9 -e 9 dbNSFP_hg19.gz | |
| 53 | |
| 54 To use plugin with GRCh38/hg38 data: | |
| 55 > wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv3.5a.zip | |
| 56 > unzip dbNSFPv3.5a.zip | |
| 57 > head -n1 dbNSFP3.5a_variant.chr1 > h | |
| 58 > cat dbNSFP3.5a_variant.chr* | grep -v ^#chr | sort -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP.gz | |
| 59 > tabix -s 1 -b 2 -e 2 dbNSFP.gz | |
| 60 | |
| 61 When running the plugin you must list at least one column to retrieve from the | |
| 62 dbNSFP file, specified as parameters to the plugin e.g. | |
| 63 | |
| 64 --plugin dbNSFP,/path/to/dbNSFP.gz,LRT_score,GERP++_RS | |
| 65 | |
| 66 You may include all columns with ALL; this fetches a large amount of data per | |
| 67 variant!: | |
| 68 | |
| 69 --plugin dbNSFP,/path/to/dbNSFP.gz,ALL | |
| 70 | |
| 71 Tabix also allows the data file to be hosted on a remote server. This plugin is | |
| 72 fully compatible with such a setup - simply use the URL of the remote file: | |
| 73 | |
| 74 --plugin dbNSFP,http://my.files.com/dbNSFP.gz,col1,col2 | |
| 75 | |
| 76 The plugin replaces occurrences of ';' with ',' and '|' with '&'. However, some | |
| 77 data field columns, e.g. Interpro_domain, use the replacement characters. We | |
| 78 added a file with replacement logic for customising the required replacement | |
| 79 of ';' and '|' in dbNSFP data columns. In addition to the default replacements | |
| 80 (; to , and | to &) users can add customised replacements. Users can either modify | |
| 81 the file dbNSFP_replacement_logic in the VEP_plugins directory or provide their own | |
| 82 file as second argument when calling the plugin: | |
| 83 | |
| 84 --plugin dbNSFP,/path/to/dbNSFP.gz,/path/to/dbNSFP_replacement_logic,LRT_score,GERP++_RS | |
| 85 | |
| 86 Note that transcript sequences referred to in dbNSFP may be out of sync with | |
| 87 those in the latest release of Ensembl; this may lead to discrepancies with | |
| 88 scores retrieved from other sources. | |
| 89 | |
| 90 If the dbNSFP README file is found in the same directory as the data file, | |
| 91 column descriptions will be read from this and incorporated into the VEP output | |
| 92 file header. | |
| 93 | |
| 94 =cut | |
| 95 | |
| 96 package dbNSFP; | |
| 97 | |
| 98 use strict; | |
| 99 use warnings; | |
| 100 | |
| 101 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
| 102 | |
| 103 use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin; | |
| 104 | |
| 105 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); | |
| 106 | |
| 107 my %INCLUDE_SO = map {$_ => 1} qw(missense_variant stop_lost stop_gained start_lost); | |
| 108 | |
| 109 sub new { | |
| 110 my $class = shift; | |
| 111 | |
| 112 my $self = $class->SUPER::new(@_); | |
| 113 | |
| 114 $self->expand_left(0); | |
| 115 $self->expand_right(0); | |
| 116 | |
| 117 # get dbNSFP file | |
| 118 my $file = $self->params->[0]; | |
| 119 $self->add_file($file); | |
| 120 | |
| 121 # get headers | |
| 122 open HEAD, "tabix -fh $file 1:1-1 2>&1 | "; | |
| 123 while(<HEAD>) { | |
| 124 next unless /^\#/; | |
| 125 chomp; | |
| 126 $self->{headers} = [split]; | |
| 127 } | |
| 128 close HEAD; | |
| 129 | |
| 130 die "ERROR: Could not read headers from $file\n" unless defined($self->{headers}) && scalar @{$self->{headers}}; | |
| 131 | |
| 132 # check alt and Ensembl_transcriptid headers | |
| 133 foreach my $h(qw(alt Ensembl_transcriptid)) { | |
| 134 die "ERROR: Could not find required column $h in $file\n" unless grep {$_ eq $h} @{$self->{headers}}; | |
| 135 } | |
| 136 | |
| 137 my $i = 1; | |
| 138 # check if 2nd argument is a file that specifies replacement logic | |
| 139 # read replacement logic | |
| 140 my $replacement_file = $self->params->[$i]; | |
| 141 if (defined $replacement_file && -e $replacement_file) { | |
| 142 $self->add_replacement_logic($replacement_file); | |
| 143 $i++; | |
| 144 } else { | |
| 145 $self->add_replacement_logic(); | |
| 146 } | |
| 147 | |
| 148 # get required columns | |
| 149 while(defined($self->params->[$i])) { | |
| 150 my $col = $self->params->[$i]; | |
| 151 if($col eq 'ALL') { | |
| 152 $self->{cols} = {map {$_ => 1} @{$self->{headers}}}; | |
| 153 last; | |
| 154 } | |
| 155 die "ERROR: Column $col not found in header for file $file. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless grep {$_ eq $col} @{$self->{headers}}; | |
| 156 | |
| 157 $self->{cols}->{$self->params->[$i]} = 1; | |
| 158 $i++; | |
| 159 } | |
| 160 | |
| 161 die "ERROR: No columns selected to fetch. Available columns are:\n".join(",", @{$self->{headers}})."\n" unless defined($self->{cols}) && scalar keys %{$self->{cols}}; | |
| 162 | |
| 163 return $self; | |
| 164 } | |
| 165 | |
| 166 sub feature_types { | |
| 167 return ['Transcript']; | |
| 168 } | |
| 169 | |
| 170 sub get_header_info { | |
| 171 my $self = shift; | |
| 172 | |
| 173 if(!exists($self->{_header_info})) { | |
| 174 | |
| 175 # look for readme | |
| 176 my $file_dir = $self->files->[0]; | |
| 177 | |
| 178 my %rm_descs; | |
| 179 | |
| 180 # won't work for remote | |
| 181 if($file_dir !~ /tp\:\/\//) { | |
| 182 | |
| 183 # get just dir | |
| 184 $file_dir =~ s/\/[^\/]+$/\//; | |
| 185 | |
| 186 if(opendir DIR, $file_dir) { | |
| 187 my ($readme_file) = grep {/dbnsfp.*readme\.txt/i} readdir DIR; | |
| 188 closedir DIR; | |
| 189 | |
| 190 if(open RM, $file_dir.$readme_file) { | |
| 191 my ($col, $reading); | |
| 192 | |
| 193 # parse dbNSFP readme | |
| 194 # relevant lines look like: | |
| 195 # | |
| 196 # 1 column1_name: description blah blah | |
| 197 # blah blah blah | |
| 198 # 2 column2_name: description blah blah | |
| 199 # blah blah blah | |
| 200 | |
| 201 while(<RM>) { | |
| 202 chomp; | |
| 203 s/\r$//g; | |
| 204 | |
| 205 if(/^\d+\s/) { | |
| 206 $reading = 1; | |
| 207 | |
| 208 m/^\d+\s+(.+?)\:\s+(.+)/; | |
| 209 $col = $1; | |
| 210 | |
| 211 $rm_descs{$col} = '(from dbNSFP) '.$2 if $col && $2; | |
| 212 } | |
| 213 elsif($reading && /\w/) { | |
| 214 s/^\s+//; | |
| 215 $rm_descs{$col} .= ' '.$_; | |
| 216 } | |
| 217 else { | |
| 218 $reading = 0; | |
| 219 } | |
| 220 } | |
| 221 | |
| 222 close RM; | |
| 223 | |
| 224 # remove multiple spaces | |
| 225 $rm_descs{$_} =~ s/\s+/ /g for keys %rm_descs; | |
| 226 } | |
| 227 } | |
| 228 } | |
| 229 | |
| 230 $self->{_header_info} = {map {$_ => $rm_descs{$_} || ($_.' from dbNSFP file')} keys %{$self->{cols}}}; | |
| 231 } | |
| 232 | |
| 233 return $self->{_header_info}; | |
| 234 } | |
| 235 | |
| 236 sub run { | |
| 237 my ($self, $tva) = @_; | |
| 238 | |
| 239 # only for missense variants | |
| 240 return {} unless grep {$INCLUDE_SO{$_->SO_term}} @{$tva->get_all_OverlapConsequences}; | |
| 241 | |
| 242 my $vf = $tva->variation_feature; | |
| 243 | |
| 244 return {} unless $vf->{start} eq $vf->{end}; | |
| 245 | |
| 246 # get allele, reverse comp if needed | |
| 247 my $allele = $tva->variation_feature_seq; | |
| 248 reverse_comp(\$allele) if $vf->{strand} < 0; | |
| 249 | |
| 250 return {} unless $allele =~ /^[ACGT]$/; | |
| 251 | |
| 252 # get transcript stable ID | |
| 253 my $tr_id = $tva->transcript->stable_id; | |
| 254 | |
| 255 my $data; | |
| 256 my $pos; | |
| 257 | |
| 258 my $assembly = $self->{config}->{assembly}; | |
| 259 my $chr = ($vf->{chr} =~ /MT/i) ? 'M' : $vf->{chr}; | |
| 260 foreach my $tmp_data(@{$self->get_data($chr, $vf->{start} - 1, $vf->{end})}) { | |
| 261 # compare allele and transcript | |
| 262 if ($assembly eq 'GRCh37') { | |
| 263 if (exists $tmp_data->{'pos(1-coor)'}) { | |
| 264 # for dbNSFP version 2.9.1 | |
| 265 $pos = $tmp_data->{'pos(1-coor)'} | |
| 266 } elsif (exists $tmp_data->{'hg19_pos(1-based)'}) { | |
| 267 # for dbNSFP version 3.5c indexed for hg19/(=GRCh37) | |
| 268 $pos = $tmp_data->{'hg19_pos(1-based)'} | |
| 269 } else { | |
| 270 die "dbNSFP file does not contain required columns (pos(1-coor) for version 2.9.1 or hg19_pos(1-based) for version 3.5c) to use with GRCh37"; | |
| 271 } | |
| 272 } else { | |
| 273 if (exists $tmp_data->{'pos(1-based)'}) { | |
| 274 $pos = $tmp_data->{'pos(1-based)'} | |
| 275 } else { | |
| 276 die "dbNSFP file does not contain required column pos(1-based) to use with GRCh38"; | |
| 277 } | |
| 278 } | |
| 279 | |
| 280 next unless | |
| 281 $pos == $vf->{start} && | |
| 282 defined($tmp_data->{alt}) && | |
| 283 $tmp_data->{alt} eq $allele; | |
| 284 | |
| 285 # make a clean copy as we're going to edit it | |
| 286 %$data = %$tmp_data; | |
| 287 | |
| 288 # convert data with multiple transcript values | |
| 289 # if($data->{Ensembl_transcriptid} =~ m/\;/) { | |
| 290 | |
| 291 # # find the "index" of this transcript | |
| 292 # my @tr_ids = split(';', $data->{Ensembl_transcriptid}); | |
| 293 # my $tr_index; | |
| 294 | |
| 295 # for my $i(0..$#tr_ids) { | |
| 296 # $tr_index = $i; | |
| 297 # last if $tr_ids[$tr_index] =~ /^$tr_id(\.\d+)?$/; | |
| 298 # } | |
| 299 | |
| 300 # next unless defined($tr_index); | |
| 301 | |
| 302 # # now alter other fields | |
| 303 # foreach my $key(keys %$data) { | |
| 304 # if($data->{$key} =~ m/\;/) { | |
| 305 # my @split = split(';', $data->{$key}); | |
| 306 # die("ERROR: Transcript index out of range") if $tr_index > $#split; | |
| 307 # $data->{$key} = $split[$tr_index]; | |
| 308 # } | |
| 309 # } | |
| 310 # } | |
| 311 last; | |
| 312 } | |
| 313 | |
| 314 return {} unless scalar keys %$data; | |
| 315 | |
| 316 # get required data | |
| 317 my @from = @{$self->{replacement}->{default}->{from}}; | |
| 318 my @to = @{$self->{replacement}->{default}->{to}}; | |
| 319 | |
| 320 my %return; | |
| 321 foreach my $colname (keys %$data) { | |
| 322 next if(!defined($self->{cols}->{$colname})); | |
| 323 next if($data->{$colname} eq '.'); | |
| 324 | |
| 325 my @from = @{$self->{replacement}->{default}->{from}}; | |
| 326 my @to = @{$self->{replacement}->{default}->{to}}; | |
| 327 @from = @{$self->{replacement}->{$colname}->{from}} if (defined $self->{replacement}->{$colname}); | |
| 328 @to = @{$self->{replacement}->{$colname}->{to}} if (defined $self->{replacement}->{$colname}); | |
| 329 for my $i (0 .. $#from) { | |
| 330 $data->{$colname} =~ s/\Q$from[$i]\E/$to[$i]/g; | |
| 331 } | |
| 332 $return{$colname} = $data->{$colname}; | |
| 333 } | |
| 334 | |
| 335 return \%return; | |
| 336 } | |
| 337 | |
| 338 sub parse_data { | |
| 339 my ($self, $line) = @_; | |
| 340 | |
| 341 $line =~ s/\r$//g; | |
| 342 | |
| 343 my @split = split /\t/, $line; | |
| 344 | |
| 345 # parse data into hash of col names and values | |
| 346 my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1)); | |
| 347 | |
| 348 return \%data; | |
| 349 } | |
| 350 | |
| 351 sub get_start { | |
| 352 return $_[1]->{'pos(1-based)'}; | |
| 353 } | |
| 354 | |
| 355 sub get_end { | |
| 356 return $_[1]->{'pos(1-based)'}; | |
| 357 } | |
| 358 | |
| 359 sub add_replacement_logic { | |
| 360 my $self = shift; | |
| 361 my $file = shift; | |
| 362 $file ||= 'dbNSFP_replacement_logic'; | |
| 363 if (! -e $file) { | |
| 364 $self->{replacement}->{default}->{from} = [';', '|']; | |
| 365 $self->{replacement}->{default}->{to} = [',', '&']; | |
| 366 } else { | |
| 367 open FILE, $file; | |
| 368 while(<FILE>) { | |
| 369 chomp; | |
| 370 next if /^colname/; | |
| 371 my ($colname, $from, $to) = split/\s+/; | |
| 372 die ("ERROR: 3 values separated by whitespace are required: colname from to.") if(!($colname && $from && $to)); | |
| 373 push @{$self->{replacement}->{$colname}->{from}}, $from; | |
| 374 push @{$self->{replacement}->{$colname}->{to}}, $to; | |
| 375 } | |
| 376 close FILE; | |
| 377 die("ERROR: No default replacement logic has been specified.\n") if (!defined $self->{replacement}->{default}); | |
| 378 } | |
| 379 } | |
| 380 | |
| 381 1; | |
| 382 | |
| 383 |
