comparison dir_plugins/GeneSplicer.pm @ 0:e545d0a25ffe draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:17:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e545d0a25ffe
1 =head1 LICENSE
2
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 =head1 CONTACT
19
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
21
22 =cut
23
24 =head1 NAME
25
26 GeneSplicer
27
28 =head1 SYNOPSIS
29
30 mv GeneSplicer.pm ~/.vep/Plugins
31 ./vep -i variants.vcf --plugin GeneSplicer,[path_to_genesplicer_bin],[path_to_training_dir],[option1=value],[option2=value]
32
33 =head1 DESCRIPTION
34
35 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
36 runs GeneSplicer (https://ccb.jhu.edu/software/genesplicer/) to get
37 splice site predictions.
38
39 It evaluates a tract of sequence either side of and including the
40 variant, both in reference and alternate states. The amount of
41 sequence included either side defaults to 100bp, but can be modified
42 by passing e.g. "context=50" as a parameter to the plugin.
43
44 Any predicted splicing regions that overlap the variant are reported
45 in the output with one of four states: no_change, diff, gain, loss
46
47 There follows a "/"-separated string consisting of the following data:
48
49 1) type (donor, acceptor)
50 2) coordinates (start-end)
51 3) confidence (Low, Medium, High)
52 4) score
53
54 Example: loss/acceptor/727006-727007/High/16.231924
55
56 If multiple sites are predicted, their reports are separated by ",".
57
58 For diff, the confidence and score for both the reference and alternate
59 sequences is reported as REF-ALT.
60
61 Example: diff/donor/621915-621914/Medium-Medium/7.020731-6.988368
62
63 Several parameters can be modified by passing them to the plugin string:
64
65 context : change the amount of sequence added either side of
66 the variant (default: 100bp)
67 tmpdir : change the temporary directory used (default: /tmp)
68 cache_size : change how many sequences' scores are cached in memory
69 (default: 50)
70
71 Example: --plugin GeneSplicer,$GS/bin/linux/genesplicer,$GS/human,context=200,tmpdir=/mytmp
72
73 On some systems the binaries provided will not execute, but can be compiled from source:
74
75 cd $GS/sources
76 make
77 cd -
78 ./vep [options] --plugin GeneSplicer,$GS/sources/genesplicer,$GS/human
79
80 On Mac OSX the make step is known to fail; the genesplicer.cpp file requires modification:
81
82 cd $GS/sources
83 perl -pi -e "s/^main /int main /" genesplicer.cpp
84 make
85
86
87 =cut
88
89 package GeneSplicer;
90
91 use strict;
92 use warnings;
93
94 use Digest::MD5 qw(md5_hex);
95
96 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
97 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
98
99 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
100 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
101
102 our %DEFAULTS = (
103 context => 100,
104 tmpdir => '/tmp',
105 cache_size => 50,
106 );
107
108 sub new {
109 my $class = shift;
110
111 my $self = $class->SUPER::new(@_);
112
113 # we need sequence, so no offline mode unless we have FASTA
114 die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta};
115
116 my $params = $self->params;
117
118 my $bin = shift @$params;
119 die("ERROR: genesplicer binary not specified\n") unless $bin;
120 die("ERROR: genesplicer binary not found\n") unless -e $bin;
121 my $test = `$bin 2>&1`;
122 die("ERROR: failed to run genesplicer binary:\n$test\n") unless $test =~ /^USAGE/;
123 $self->{_bin} = $bin;
124
125 my $training_dir = shift @$params;
126 die("ERROR: training directory not specified\n") unless $training_dir;
127 die("ERROR: training directory not found\n") unless -d $training_dir;
128 $self->{_training_dir} = $training_dir;
129
130 # defaults
131 $self->{'_param_'.$_} = $DEFAULTS{$_} for keys %DEFAULTS;
132
133 # REST API passes 1 as first param
134 shift @$params if $params->[0] && $params->[0] eq '1';
135
136 # set/override with user params
137 foreach my $param(@$params) {
138 my ($key, $val) = split('=', $param);
139 die("ERROR: Failed to parse parameter $param\n") unless defined($key) && defined($val);
140
141 $self->{'_param_'.$key} = $val;
142 }
143
144 return $self;
145 }
146
147 sub feature_types {
148 return ['Transcript'];
149 }
150
151 sub get_header_info {
152 return {
153 GeneSplicer => "GeneSplicer predictions"
154 };
155 }
156
157 sub run {
158 my ($self, $tva) = @_;
159
160 my $vf = $tva->variation_feature;
161
162 # get up and downstream sequences
163 my $up_seq = $vf->{slice}->sub_Slice(
164 $vf->{start} - $self->{'_param_context'},
165 $vf->{start} - 1,
166 $vf->strand
167 )->seq;
168
169 my $down_seq = $vf->{slice}->sub_Slice(
170 $vf->{end} + 1,
171 $vf->{end} + $self->{'_param_context'},
172 $vf->strand
173 )->seq;
174
175 # create ref seq by grabbing reference TVA
176 my $ref_seq = join("",
177 $up_seq,
178 $tva->transcript_variation->get_reference_TranscriptVariationAllele->variation_feature_seq,
179 $down_seq
180 );
181
182 return {} unless $ref_seq =~ /^[ACGT]+$/;
183
184 # create alt seq
185 my $alt_allele = $tva->variation_feature_seq;
186 $alt_allele =~ s/\-//g;
187 my $alt_seq = $up_seq.$alt_allele.$down_seq;
188
189
190 return {} unless $alt_seq =~ /^[ACGT]+$/;
191
192 # reverse comp if strands differ
193 if($tva->transcript->strand != $vf->strand) {
194 reverse_comp(\$ref_seq);
195 reverse_comp(\$alt_seq);
196 }
197
198 # get results
199 my $ref_results = $self->results_from_cache($ref_seq) || $self->results_from_seq($ref_seq);
200 my $alt_results = $self->results_from_cache($alt_seq) || $self->results_from_seq($alt_seq);
201
202 # compare results both ways
203 my $diff_ref_to_alt = $self->compare_results($ref_results, $alt_results);
204 my $diff_alt_to_ref = $self->compare_results($alt_results, $ref_results);
205
206 # get VF pos relative to tested sequence
207 my ($vf_start, $vf_end) = ($self->{'_param_context'} + 1, $self->{'_param_context'} + (($vf->{end} - $vf->{start}) + 1));
208
209 # get overlapping losses and gains
210 # and map to chromosome coords
211 my @losses =
212 map {$_->{gl} = 'loss'; $_}
213 @{$diff_ref_to_alt->{lost}};
214
215 my @gains =
216 map {$_->{gl} = 'gain'; $_}
217 @{$diff_alt_to_ref->{lost}};
218
219 my @diffs =
220 map {$_->{gl} = 'diff'; $_}
221 @{$diff_ref_to_alt->{diff}};
222
223 my $return = join(',',
224 map {
225 join('/',
226 $_->[0]->{gl},
227 $_->[0]->{type},
228 $_->[1]->{end5}.'-'.$_->[1]->{end3},
229 $_->[0]->{confidence},
230 $_->[0]->{score}
231 )
232 }
233 map {[$_, $self->map_ss_coords($_, $vf)]}
234 grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})}
235 (@losses, @gains, @diffs)
236 );
237
238 # probably of interest to report splice sites were found
239 # but no difference between ref and alt
240 if(!$return && grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results) {
241 $return = join(',',
242 map {
243 join('/',
244 'no_change',
245 $_->[0]->{type},
246 $_->[1]->{end5}.'-'.$_->[1]->{end3},
247 $_->[0]->{confidence},
248 $_->[0]->{score}
249 )
250 }
251 map {[$_, $self->map_ss_coords($_, $vf)]}
252 grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results
253 );
254 }
255
256 return $return ? { GeneSplicer => $return } : {};
257 }
258
259 sub results_from_seq {
260 my $self = shift;
261 my $seq = shift;
262
263 # write seqs to file
264 my $seq_file = $self->{'_param_tmpdir'}."/genesplicer_$$.fa";
265 open SEQ, ">$seq_file" or die("ERROR: Could not write to temporary sequence file $seq_file\n");
266 print SEQ ">SEQ\n$seq\n";
267 close SEQ;
268
269 my $result_file = $self->{'_param_tmpdir'}."/genesplicer_$$.results";
270
271 my $cmd = sprintf(
272 '%s %s %s -f %s',
273 $self->{'_bin'},
274 $seq_file,
275 $self->{'_training_dir'},
276 $result_file
277 );
278
279 my $output = `$cmd 2>&1`;
280 unlink($seq_file);
281
282 return [] unless -e $result_file;
283
284 open RES, $result_file;
285 my @results;
286
287 while(<RES>) {
288 chomp;
289 my ($end5, $end3, $score, $confidence, $type) = split;
290
291 push @results, {
292 end5 => $end5,
293 end3 => $end3,
294 score => $score,
295 confidence => $confidence,
296 type => $type
297 };
298 }
299 close RES;
300
301 unlink($result_file);
302
303 push @{$self->{cache}}, { hex => md5_hex($seq), results => \@results};
304 shift @{$self->{cache}} while scalar @{$self->{cache}} > $self->{_param_cache_size};
305
306 return \@results;
307 }
308
309 sub results_from_cache {
310 my $self = shift;
311 my $seq = shift;
312
313 my ($results) = map {$_->{results}} grep {$_->{hex} eq md5_hex($seq)} @{$self->{cache} || []};
314
315 return $results;
316 }
317
318 sub compare_results {
319 my $self = shift;
320 my $a = shift;
321 my $b = shift;
322
323 my (@diff, @lost);
324
325 foreach my $res_a(@$a) {
326 my @match = grep {
327 $_->{end5} == $res_a->{end5} &&
328 $_->{end3} == $res_a->{end3} &&
329 $_->{type} eq $res_a->{type}
330 } @$b;
331
332 # result not found in b
333 if(!@match) {
334 push @lost, $res_a;
335 }
336
337 # >1 result found
338 elsif(scalar @match > 1) {
339 warn("WARNING: Found two matches?\n");
340 }
341
342 # 1 match
343 elsif($match[0]->{score} != $res_a->{score}) {
344 my %diff = %$res_a;
345 $diff{score} .= '-'.$match[0]->{score};
346 $diff{confidence} .= '-'.$match[0]->{confidence};
347 push @diff, \%diff;
348 }
349 }
350
351 return { diff => \@diff, lost => \@lost};
352 }
353
354 sub map_ss_coords {
355 my $self = shift;
356 my $res = shift;
357 my $vf = shift;
358
359 my $return = {};
360
361 foreach my $coord(qw(end5 end3)) {
362 $return->{$coord} = (($res->{$coord} - $self->{'_param_context'}) + $vf->{start}) - 1;
363 }
364
365 return $return;
366 }
367
368 1;
369