|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 GeneSplicer
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv GeneSplicer.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variants.vcf --plugin GeneSplicer,[path_to_genesplicer_bin],[path_to_training_dir],[option1=value],[option2=value]
|
|
|
32
|
|
|
33 =head1 DESCRIPTION
|
|
|
34
|
|
|
35 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
|
|
|
36 runs GeneSplicer (https://ccb.jhu.edu/software/genesplicer/) to get
|
|
|
37 splice site predictions.
|
|
|
38
|
|
|
39 It evaluates a tract of sequence either side of and including the
|
|
|
40 variant, both in reference and alternate states. The amount of
|
|
|
41 sequence included either side defaults to 100bp, but can be modified
|
|
|
42 by passing e.g. "context=50" as a parameter to the plugin.
|
|
|
43
|
|
|
44 Any predicted splicing regions that overlap the variant are reported
|
|
|
45 in the output with one of four states: no_change, diff, gain, loss
|
|
|
46
|
|
|
47 There follows a "/"-separated string consisting of the following data:
|
|
|
48
|
|
|
49 1) type (donor, acceptor)
|
|
|
50 2) coordinates (start-end)
|
|
|
51 3) confidence (Low, Medium, High)
|
|
|
52 4) score
|
|
|
53
|
|
|
54 Example: loss/acceptor/727006-727007/High/16.231924
|
|
|
55
|
|
|
56 If multiple sites are predicted, their reports are separated by ",".
|
|
|
57
|
|
|
58 For diff, the confidence and score for both the reference and alternate
|
|
|
59 sequences is reported as REF-ALT.
|
|
|
60
|
|
|
61 Example: diff/donor/621915-621914/Medium-Medium/7.020731-6.988368
|
|
|
62
|
|
|
63 Several parameters can be modified by passing them to the plugin string:
|
|
|
64
|
|
|
65 context : change the amount of sequence added either side of
|
|
|
66 the variant (default: 100bp)
|
|
|
67 tmpdir : change the temporary directory used (default: /tmp)
|
|
|
68 cache_size : change how many sequences' scores are cached in memory
|
|
|
69 (default: 50)
|
|
|
70
|
|
|
71 Example: --plugin GeneSplicer,$GS/bin/linux/genesplicer,$GS/human,context=200,tmpdir=/mytmp
|
|
|
72
|
|
|
73 On some systems the binaries provided will not execute, but can be compiled from source:
|
|
|
74
|
|
|
75 cd $GS/sources
|
|
|
76 make
|
|
|
77 cd -
|
|
|
78 ./vep [options] --plugin GeneSplicer,$GS/sources/genesplicer,$GS/human
|
|
|
79
|
|
|
80 On Mac OSX the make step is known to fail; the genesplicer.cpp file requires modification:
|
|
|
81
|
|
|
82 cd $GS/sources
|
|
|
83 perl -pi -e "s/^main /int main /" genesplicer.cpp
|
|
|
84 make
|
|
|
85
|
|
|
86
|
|
|
87 =cut
|
|
|
88
|
|
|
89 package GeneSplicer;
|
|
|
90
|
|
|
91 use strict;
|
|
|
92 use warnings;
|
|
|
93
|
|
|
94 use Digest::MD5 qw(md5_hex);
|
|
|
95
|
|
|
96 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
|
97 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
|
|
|
98
|
|
|
99 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
|
|
|
100 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
101
|
|
|
102 our %DEFAULTS = (
|
|
|
103 context => 100,
|
|
|
104 tmpdir => '/tmp',
|
|
|
105 cache_size => 50,
|
|
|
106 );
|
|
|
107
|
|
|
108 sub new {
|
|
|
109 my $class = shift;
|
|
|
110
|
|
|
111 my $self = $class->SUPER::new(@_);
|
|
|
112
|
|
|
113 # we need sequence, so no offline mode unless we have FASTA
|
|
|
114 die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta};
|
|
|
115
|
|
|
116 my $params = $self->params;
|
|
|
117
|
|
|
118 my $bin = shift @$params;
|
|
|
119 die("ERROR: genesplicer binary not specified\n") unless $bin;
|
|
|
120 die("ERROR: genesplicer binary not found\n") unless -e $bin;
|
|
|
121 my $test = `$bin 2>&1`;
|
|
|
122 die("ERROR: failed to run genesplicer binary:\n$test\n") unless $test =~ /^USAGE/;
|
|
|
123 $self->{_bin} = $bin;
|
|
|
124
|
|
|
125 my $training_dir = shift @$params;
|
|
|
126 die("ERROR: training directory not specified\n") unless $training_dir;
|
|
|
127 die("ERROR: training directory not found\n") unless -d $training_dir;
|
|
|
128 $self->{_training_dir} = $training_dir;
|
|
|
129
|
|
|
130 # defaults
|
|
|
131 $self->{'_param_'.$_} = $DEFAULTS{$_} for keys %DEFAULTS;
|
|
|
132
|
|
|
133 # REST API passes 1 as first param
|
|
|
134 shift @$params if $params->[0] && $params->[0] eq '1';
|
|
|
135
|
|
|
136 # set/override with user params
|
|
|
137 foreach my $param(@$params) {
|
|
|
138 my ($key, $val) = split('=', $param);
|
|
|
139 die("ERROR: Failed to parse parameter $param\n") unless defined($key) && defined($val);
|
|
|
140
|
|
|
141 $self->{'_param_'.$key} = $val;
|
|
|
142 }
|
|
|
143
|
|
|
144 return $self;
|
|
|
145 }
|
|
|
146
|
|
|
147 sub feature_types {
|
|
|
148 return ['Transcript'];
|
|
|
149 }
|
|
|
150
|
|
|
151 sub get_header_info {
|
|
|
152 return {
|
|
|
153 GeneSplicer => "GeneSplicer predictions"
|
|
|
154 };
|
|
|
155 }
|
|
|
156
|
|
|
157 sub run {
|
|
|
158 my ($self, $tva) = @_;
|
|
|
159
|
|
|
160 my $vf = $tva->variation_feature;
|
|
|
161
|
|
|
162 # get up and downstream sequences
|
|
|
163 my $up_seq = $vf->{slice}->sub_Slice(
|
|
|
164 $vf->{start} - $self->{'_param_context'},
|
|
|
165 $vf->{start} - 1,
|
|
|
166 $vf->strand
|
|
|
167 )->seq;
|
|
|
168
|
|
|
169 my $down_seq = $vf->{slice}->sub_Slice(
|
|
|
170 $vf->{end} + 1,
|
|
|
171 $vf->{end} + $self->{'_param_context'},
|
|
|
172 $vf->strand
|
|
|
173 )->seq;
|
|
|
174
|
|
|
175 # create ref seq by grabbing reference TVA
|
|
|
176 my $ref_seq = join("",
|
|
|
177 $up_seq,
|
|
|
178 $tva->transcript_variation->get_reference_TranscriptVariationAllele->variation_feature_seq,
|
|
|
179 $down_seq
|
|
|
180 );
|
|
|
181
|
|
|
182 return {} unless $ref_seq =~ /^[ACGT]+$/;
|
|
|
183
|
|
|
184 # create alt seq
|
|
|
185 my $alt_allele = $tva->variation_feature_seq;
|
|
|
186 $alt_allele =~ s/\-//g;
|
|
|
187 my $alt_seq = $up_seq.$alt_allele.$down_seq;
|
|
|
188
|
|
|
189
|
|
|
190 return {} unless $alt_seq =~ /^[ACGT]+$/;
|
|
|
191
|
|
|
192 # reverse comp if strands differ
|
|
|
193 if($tva->transcript->strand != $vf->strand) {
|
|
|
194 reverse_comp(\$ref_seq);
|
|
|
195 reverse_comp(\$alt_seq);
|
|
|
196 }
|
|
|
197
|
|
|
198 # get results
|
|
|
199 my $ref_results = $self->results_from_cache($ref_seq) || $self->results_from_seq($ref_seq);
|
|
|
200 my $alt_results = $self->results_from_cache($alt_seq) || $self->results_from_seq($alt_seq);
|
|
|
201
|
|
|
202 # compare results both ways
|
|
|
203 my $diff_ref_to_alt = $self->compare_results($ref_results, $alt_results);
|
|
|
204 my $diff_alt_to_ref = $self->compare_results($alt_results, $ref_results);
|
|
|
205
|
|
|
206 # get VF pos relative to tested sequence
|
|
|
207 my ($vf_start, $vf_end) = ($self->{'_param_context'} + 1, $self->{'_param_context'} + (($vf->{end} - $vf->{start}) + 1));
|
|
|
208
|
|
|
209 # get overlapping losses and gains
|
|
|
210 # and map to chromosome coords
|
|
|
211 my @losses =
|
|
|
212 map {$_->{gl} = 'loss'; $_}
|
|
|
213 @{$diff_ref_to_alt->{lost}};
|
|
|
214
|
|
|
215 my @gains =
|
|
|
216 map {$_->{gl} = 'gain'; $_}
|
|
|
217 @{$diff_alt_to_ref->{lost}};
|
|
|
218
|
|
|
219 my @diffs =
|
|
|
220 map {$_->{gl} = 'diff'; $_}
|
|
|
221 @{$diff_ref_to_alt->{diff}};
|
|
|
222
|
|
|
223 my $return = join(',',
|
|
|
224 map {
|
|
|
225 join('/',
|
|
|
226 $_->[0]->{gl},
|
|
|
227 $_->[0]->{type},
|
|
|
228 $_->[1]->{end5}.'-'.$_->[1]->{end3},
|
|
|
229 $_->[0]->{confidence},
|
|
|
230 $_->[0]->{score}
|
|
|
231 )
|
|
|
232 }
|
|
|
233 map {[$_, $self->map_ss_coords($_, $vf)]}
|
|
|
234 grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})}
|
|
|
235 (@losses, @gains, @diffs)
|
|
|
236 );
|
|
|
237
|
|
|
238 # probably of interest to report splice sites were found
|
|
|
239 # but no difference between ref and alt
|
|
|
240 if(!$return && grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results) {
|
|
|
241 $return = join(',',
|
|
|
242 map {
|
|
|
243 join('/',
|
|
|
244 'no_change',
|
|
|
245 $_->[0]->{type},
|
|
|
246 $_->[1]->{end5}.'-'.$_->[1]->{end3},
|
|
|
247 $_->[0]->{confidence},
|
|
|
248 $_->[0]->{score}
|
|
|
249 )
|
|
|
250 }
|
|
|
251 map {[$_, $self->map_ss_coords($_, $vf)]}
|
|
|
252 grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results
|
|
|
253 );
|
|
|
254 }
|
|
|
255
|
|
|
256 return $return ? { GeneSplicer => $return } : {};
|
|
|
257 }
|
|
|
258
|
|
|
259 sub results_from_seq {
|
|
|
260 my $self = shift;
|
|
|
261 my $seq = shift;
|
|
|
262
|
|
|
263 # write seqs to file
|
|
|
264 my $seq_file = $self->{'_param_tmpdir'}."/genesplicer_$$.fa";
|
|
|
265 open SEQ, ">$seq_file" or die("ERROR: Could not write to temporary sequence file $seq_file\n");
|
|
|
266 print SEQ ">SEQ\n$seq\n";
|
|
|
267 close SEQ;
|
|
|
268
|
|
|
269 my $result_file = $self->{'_param_tmpdir'}."/genesplicer_$$.results";
|
|
|
270
|
|
|
271 my $cmd = sprintf(
|
|
|
272 '%s %s %s -f %s',
|
|
|
273 $self->{'_bin'},
|
|
|
274 $seq_file,
|
|
|
275 $self->{'_training_dir'},
|
|
|
276 $result_file
|
|
|
277 );
|
|
|
278
|
|
|
279 my $output = `$cmd 2>&1`;
|
|
|
280 unlink($seq_file);
|
|
|
281
|
|
|
282 return [] unless -e $result_file;
|
|
|
283
|
|
|
284 open RES, $result_file;
|
|
|
285 my @results;
|
|
|
286
|
|
|
287 while(<RES>) {
|
|
|
288 chomp;
|
|
|
289 my ($end5, $end3, $score, $confidence, $type) = split;
|
|
|
290
|
|
|
291 push @results, {
|
|
|
292 end5 => $end5,
|
|
|
293 end3 => $end3,
|
|
|
294 score => $score,
|
|
|
295 confidence => $confidence,
|
|
|
296 type => $type
|
|
|
297 };
|
|
|
298 }
|
|
|
299 close RES;
|
|
|
300
|
|
|
301 unlink($result_file);
|
|
|
302
|
|
|
303 push @{$self->{cache}}, { hex => md5_hex($seq), results => \@results};
|
|
|
304 shift @{$self->{cache}} while scalar @{$self->{cache}} > $self->{_param_cache_size};
|
|
|
305
|
|
|
306 return \@results;
|
|
|
307 }
|
|
|
308
|
|
|
309 sub results_from_cache {
|
|
|
310 my $self = shift;
|
|
|
311 my $seq = shift;
|
|
|
312
|
|
|
313 my ($results) = map {$_->{results}} grep {$_->{hex} eq md5_hex($seq)} @{$self->{cache} || []};
|
|
|
314
|
|
|
315 return $results;
|
|
|
316 }
|
|
|
317
|
|
|
318 sub compare_results {
|
|
|
319 my $self = shift;
|
|
|
320 my $a = shift;
|
|
|
321 my $b = shift;
|
|
|
322
|
|
|
323 my (@diff, @lost);
|
|
|
324
|
|
|
325 foreach my $res_a(@$a) {
|
|
|
326 my @match = grep {
|
|
|
327 $_->{end5} == $res_a->{end5} &&
|
|
|
328 $_->{end3} == $res_a->{end3} &&
|
|
|
329 $_->{type} eq $res_a->{type}
|
|
|
330 } @$b;
|
|
|
331
|
|
|
332 # result not found in b
|
|
|
333 if(!@match) {
|
|
|
334 push @lost, $res_a;
|
|
|
335 }
|
|
|
336
|
|
|
337 # >1 result found
|
|
|
338 elsif(scalar @match > 1) {
|
|
|
339 warn("WARNING: Found two matches?\n");
|
|
|
340 }
|
|
|
341
|
|
|
342 # 1 match
|
|
|
343 elsif($match[0]->{score} != $res_a->{score}) {
|
|
|
344 my %diff = %$res_a;
|
|
|
345 $diff{score} .= '-'.$match[0]->{score};
|
|
|
346 $diff{confidence} .= '-'.$match[0]->{confidence};
|
|
|
347 push @diff, \%diff;
|
|
|
348 }
|
|
|
349 }
|
|
|
350
|
|
|
351 return { diff => \@diff, lost => \@lost};
|
|
|
352 }
|
|
|
353
|
|
|
354 sub map_ss_coords {
|
|
|
355 my $self = shift;
|
|
|
356 my $res = shift;
|
|
|
357 my $vf = shift;
|
|
|
358
|
|
|
359 my $return = {};
|
|
|
360
|
|
|
361 foreach my $coord(qw(end5 end3)) {
|
|
|
362 $return->{$coord} = (($res->{$coord} - $self->{'_param_context'}) + $vf->{start}) - 1;
|
|
|
363 }
|
|
|
364
|
|
|
365 return $return;
|
|
|
366 }
|
|
|
367
|
|
|
368 1;
|
|
|
369
|