Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/GeneSplicer.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
comparison
equal
deleted
inserted
replaced
| 2:17c98d091710 | 3:49397129aec0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | |
| 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | |
| 5 | |
| 6 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 you may not use this file except in compliance with the License. | |
| 8 You may obtain a copy of the License at | |
| 9 | |
| 10 http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 | |
| 12 Unless required by applicable law or agreed to in writing, software | |
| 13 distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 See the License for the specific language governing permissions and | |
| 16 limitations under the License. | |
| 17 | |
| 18 =head1 CONTACT | |
| 19 | |
| 20 Ensembl <http://www.ensembl.org/info/about/contact/index.html> | |
| 21 | |
| 22 =cut | |
| 23 | |
| 24 =head1 NAME | |
| 25 | |
| 26 GeneSplicer | |
| 27 | |
| 28 =head1 SYNOPSIS | |
| 29 | |
| 30 mv GeneSplicer.pm ~/.vep/Plugins | |
| 31 ./vep -i variants.vcf --plugin GeneSplicer,[path_to_genesplicer_bin],[path_to_training_dir],[option1=value],[option2=value] | |
| 32 | |
| 33 =head1 DESCRIPTION | |
| 34 | |
| 35 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that | |
| 36 runs GeneSplicer (https://ccb.jhu.edu/software/genesplicer/) to get | |
| 37 splice site predictions. | |
| 38 | |
| 39 It evaluates a tract of sequence either side of and including the | |
| 40 variant, both in reference and alternate states. The amount of | |
| 41 sequence included either side defaults to 100bp, but can be modified | |
| 42 by passing e.g. "context=50" as a parameter to the plugin. | |
| 43 | |
| 44 Any predicted splicing regions that overlap the variant are reported | |
| 45 in the output with one of four states: no_change, diff, gain, loss | |
| 46 | |
| 47 There follows a "/"-separated string consisting of the following data: | |
| 48 | |
| 49 1) type (donor, acceptor) | |
| 50 2) coordinates (start-end) | |
| 51 3) confidence (Low, Medium, High) | |
| 52 4) score | |
| 53 | |
| 54 Example: loss/acceptor/727006-727007/High/16.231924 | |
| 55 | |
| 56 If multiple sites are predicted, their reports are separated by ",". | |
| 57 | |
| 58 For diff, the confidence and score for both the reference and alternate | |
| 59 sequences is reported as REF-ALT. | |
| 60 | |
| 61 Example: diff/donor/621915-621914/Medium-Medium/7.020731-6.988368 | |
| 62 | |
| 63 Several parameters can be modified by passing them to the plugin string: | |
| 64 | |
| 65 context : change the amount of sequence added either side of | |
| 66 the variant (default: 100bp) | |
| 67 tmpdir : change the temporary directory used (default: /tmp) | |
| 68 cache_size : change how many sequences' scores are cached in memory | |
| 69 (default: 50) | |
| 70 | |
| 71 Example: --plugin GeneSplicer,$GS/bin/linux/genesplicer,$GS/human,context=200,tmpdir=/mytmp | |
| 72 | |
| 73 On some systems the binaries provided will not execute, but can be compiled from source: | |
| 74 | |
| 75 cd $GS/sources | |
| 76 make | |
| 77 cd - | |
| 78 ./vep [options] --plugin GeneSplicer,$GS/sources/genesplicer,$GS/human | |
| 79 | |
| 80 On Mac OSX the make step is known to fail; the genesplicer.cpp file requires modification: | |
| 81 | |
| 82 cd $GS/sources | |
| 83 perl -pi -e "s/^main /int main /" genesplicer.cpp | |
| 84 make | |
| 85 | |
| 86 | |
| 87 =cut | |
| 88 | |
| 89 package GeneSplicer; | |
| 90 | |
| 91 use strict; | |
| 92 use warnings; | |
| 93 | |
| 94 use Digest::MD5 qw(md5_hex); | |
| 95 | |
| 96 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
| 97 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); | |
| 98 | |
| 99 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | |
| 100 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | |
| 101 | |
| 102 our %DEFAULTS = ( | |
| 103 context => 100, | |
| 104 tmpdir => '/tmp', | |
| 105 cache_size => 50, | |
| 106 ); | |
| 107 | |
| 108 sub new { | |
| 109 my $class = shift; | |
| 110 | |
| 111 my $self = $class->SUPER::new(@_); | |
| 112 | |
| 113 # we need sequence, so no offline mode unless we have FASTA | |
| 114 die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta}; | |
| 115 | |
| 116 my $params = $self->params; | |
| 117 | |
| 118 my $bin = shift @$params; | |
| 119 die("ERROR: genesplicer binary not specified\n") unless $bin; | |
| 120 die("ERROR: genesplicer binary not found\n") unless -e $bin; | |
| 121 my $test = `$bin 2>&1`; | |
| 122 die("ERROR: failed to run genesplicer binary:\n$test\n") unless $test =~ /^USAGE/; | |
| 123 $self->{_bin} = $bin; | |
| 124 | |
| 125 my $training_dir = shift @$params; | |
| 126 die("ERROR: training directory not specified\n") unless $training_dir; | |
| 127 die("ERROR: training directory not found\n") unless -d $training_dir; | |
| 128 $self->{_training_dir} = $training_dir; | |
| 129 | |
| 130 # defaults | |
| 131 $self->{'_param_'.$_} = $DEFAULTS{$_} for keys %DEFAULTS; | |
| 132 | |
| 133 # REST API passes 1 as first param | |
| 134 shift @$params if $params->[0] && $params->[0] eq '1'; | |
| 135 | |
| 136 # set/override with user params | |
| 137 foreach my $param(@$params) { | |
| 138 my ($key, $val) = split('=', $param); | |
| 139 die("ERROR: Failed to parse parameter $param\n") unless defined($key) && defined($val); | |
| 140 | |
| 141 $self->{'_param_'.$key} = $val; | |
| 142 } | |
| 143 | |
| 144 return $self; | |
| 145 } | |
| 146 | |
| 147 sub feature_types { | |
| 148 return ['Transcript']; | |
| 149 } | |
| 150 | |
| 151 sub get_header_info { | |
| 152 return { | |
| 153 GeneSplicer => "GeneSplicer predictions" | |
| 154 }; | |
| 155 } | |
| 156 | |
| 157 sub run { | |
| 158 my ($self, $tva) = @_; | |
| 159 | |
| 160 my $vf = $tva->variation_feature; | |
| 161 | |
| 162 # get up and downstream sequences | |
| 163 my $up_seq = $vf->{slice}->sub_Slice( | |
| 164 $vf->{start} - $self->{'_param_context'}, | |
| 165 $vf->{start} - 1, | |
| 166 $vf->strand | |
| 167 )->seq; | |
| 168 | |
| 169 my $down_seq = $vf->{slice}->sub_Slice( | |
| 170 $vf->{end} + 1, | |
| 171 $vf->{end} + $self->{'_param_context'}, | |
| 172 $vf->strand | |
| 173 )->seq; | |
| 174 | |
| 175 # create ref seq by grabbing reference TVA | |
| 176 my $ref_seq = join("", | |
| 177 $up_seq, | |
| 178 $tva->transcript_variation->get_reference_TranscriptVariationAllele->variation_feature_seq, | |
| 179 $down_seq | |
| 180 ); | |
| 181 | |
| 182 return {} unless $ref_seq =~ /^[ACGT]+$/; | |
| 183 | |
| 184 # create alt seq | |
| 185 my $alt_allele = $tva->variation_feature_seq; | |
| 186 $alt_allele =~ s/\-//g; | |
| 187 my $alt_seq = $up_seq.$alt_allele.$down_seq; | |
| 188 | |
| 189 | |
| 190 return {} unless $alt_seq =~ /^[ACGT]+$/; | |
| 191 | |
| 192 # reverse comp if strands differ | |
| 193 if($tva->transcript->strand != $vf->strand) { | |
| 194 reverse_comp(\$ref_seq); | |
| 195 reverse_comp(\$alt_seq); | |
| 196 } | |
| 197 | |
| 198 # get results | |
| 199 my $ref_results = $self->results_from_cache($ref_seq) || $self->results_from_seq($ref_seq); | |
| 200 my $alt_results = $self->results_from_cache($alt_seq) || $self->results_from_seq($alt_seq); | |
| 201 | |
| 202 # compare results both ways | |
| 203 my $diff_ref_to_alt = $self->compare_results($ref_results, $alt_results); | |
| 204 my $diff_alt_to_ref = $self->compare_results($alt_results, $ref_results); | |
| 205 | |
| 206 # get VF pos relative to tested sequence | |
| 207 my ($vf_start, $vf_end) = ($self->{'_param_context'} + 1, $self->{'_param_context'} + (($vf->{end} - $vf->{start}) + 1)); | |
| 208 | |
| 209 # get overlapping losses and gains | |
| 210 # and map to chromosome coords | |
| 211 my @losses = | |
| 212 map {$_->{gl} = 'loss'; $_} | |
| 213 @{$diff_ref_to_alt->{lost}}; | |
| 214 | |
| 215 my @gains = | |
| 216 map {$_->{gl} = 'gain'; $_} | |
| 217 @{$diff_alt_to_ref->{lost}}; | |
| 218 | |
| 219 my @diffs = | |
| 220 map {$_->{gl} = 'diff'; $_} | |
| 221 @{$diff_ref_to_alt->{diff}}; | |
| 222 | |
| 223 my $return = join(',', | |
| 224 map { | |
| 225 join('/', | |
| 226 $_->[0]->{gl}, | |
| 227 $_->[0]->{type}, | |
| 228 $_->[1]->{end5}.'-'.$_->[1]->{end3}, | |
| 229 $_->[0]->{confidence}, | |
| 230 $_->[0]->{score} | |
| 231 ) | |
| 232 } | |
| 233 map {[$_, $self->map_ss_coords($_, $vf)]} | |
| 234 grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} | |
| 235 (@losses, @gains, @diffs) | |
| 236 ); | |
| 237 | |
| 238 # probably of interest to report splice sites were found | |
| 239 # but no difference between ref and alt | |
| 240 if(!$return && grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results) { | |
| 241 $return = join(',', | |
| 242 map { | |
| 243 join('/', | |
| 244 'no_change', | |
| 245 $_->[0]->{type}, | |
| 246 $_->[1]->{end5}.'-'.$_->[1]->{end3}, | |
| 247 $_->[0]->{confidence}, | |
| 248 $_->[0]->{score} | |
| 249 ) | |
| 250 } | |
| 251 map {[$_, $self->map_ss_coords($_, $vf)]} | |
| 252 grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results | |
| 253 ); | |
| 254 } | |
| 255 | |
| 256 return $return ? { GeneSplicer => $return } : {}; | |
| 257 } | |
| 258 | |
| 259 sub results_from_seq { | |
| 260 my $self = shift; | |
| 261 my $seq = shift; | |
| 262 | |
| 263 # write seqs to file | |
| 264 my $seq_file = $self->{'_param_tmpdir'}."/genesplicer_$$.fa"; | |
| 265 open SEQ, ">$seq_file" or die("ERROR: Could not write to temporary sequence file $seq_file\n"); | |
| 266 print SEQ ">SEQ\n$seq\n"; | |
| 267 close SEQ; | |
| 268 | |
| 269 my $result_file = $self->{'_param_tmpdir'}."/genesplicer_$$.results"; | |
| 270 | |
| 271 my $cmd = sprintf( | |
| 272 '%s %s %s -f %s', | |
| 273 $self->{'_bin'}, | |
| 274 $seq_file, | |
| 275 $self->{'_training_dir'}, | |
| 276 $result_file | |
| 277 ); | |
| 278 | |
| 279 my $output = `$cmd 2>&1`; | |
| 280 unlink($seq_file); | |
| 281 | |
| 282 return [] unless -e $result_file; | |
| 283 | |
| 284 open RES, $result_file; | |
| 285 my @results; | |
| 286 | |
| 287 while(<RES>) { | |
| 288 chomp; | |
| 289 my ($end5, $end3, $score, $confidence, $type) = split; | |
| 290 | |
| 291 push @results, { | |
| 292 end5 => $end5, | |
| 293 end3 => $end3, | |
| 294 score => $score, | |
| 295 confidence => $confidence, | |
| 296 type => $type | |
| 297 }; | |
| 298 } | |
| 299 close RES; | |
| 300 | |
| 301 unlink($result_file); | |
| 302 | |
| 303 push @{$self->{cache}}, { hex => md5_hex($seq), results => \@results}; | |
| 304 shift @{$self->{cache}} while scalar @{$self->{cache}} > $self->{_param_cache_size}; | |
| 305 | |
| 306 return \@results; | |
| 307 } | |
| 308 | |
| 309 sub results_from_cache { | |
| 310 my $self = shift; | |
| 311 my $seq = shift; | |
| 312 | |
| 313 my ($results) = map {$_->{results}} grep {$_->{hex} eq md5_hex($seq)} @{$self->{cache} || []}; | |
| 314 | |
| 315 return $results; | |
| 316 } | |
| 317 | |
| 318 sub compare_results { | |
| 319 my $self = shift; | |
| 320 my $a = shift; | |
| 321 my $b = shift; | |
| 322 | |
| 323 my (@diff, @lost); | |
| 324 | |
| 325 foreach my $res_a(@$a) { | |
| 326 my @match = grep { | |
| 327 $_->{end5} == $res_a->{end5} && | |
| 328 $_->{end3} == $res_a->{end3} && | |
| 329 $_->{type} eq $res_a->{type} | |
| 330 } @$b; | |
| 331 | |
| 332 # result not found in b | |
| 333 if(!@match) { | |
| 334 push @lost, $res_a; | |
| 335 } | |
| 336 | |
| 337 # >1 result found | |
| 338 elsif(scalar @match > 1) { | |
| 339 warn("WARNING: Found two matches?\n"); | |
| 340 } | |
| 341 | |
| 342 # 1 match | |
| 343 elsif($match[0]->{score} != $res_a->{score}) { | |
| 344 my %diff = %$res_a; | |
| 345 $diff{score} .= '-'.$match[0]->{score}; | |
| 346 $diff{confidence} .= '-'.$match[0]->{confidence}; | |
| 347 push @diff, \%diff; | |
| 348 } | |
| 349 } | |
| 350 | |
| 351 return { diff => \@diff, lost => \@lost}; | |
| 352 } | |
| 353 | |
| 354 sub map_ss_coords { | |
| 355 my $self = shift; | |
| 356 my $res = shift; | |
| 357 my $vf = shift; | |
| 358 | |
| 359 my $return = {}; | |
| 360 | |
| 361 foreach my $coord(qw(end5 end3)) { | |
| 362 $return->{$coord} = (($res->{$coord} - $self->{'_param_context'}) + $vf->{start}) - 1; | |
| 363 } | |
| 364 | |
| 365 return $return; | |
| 366 } | |
| 367 | |
| 368 1; | |
| 369 |
