annotate dir_plugins/GeneSplicer.pm @ 10:f594c6bed58f draft default tip

Uploaded
author dvanzessen
date Tue, 21 Apr 2020 11:40:19 +0000
parents e545d0a25ffe
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
1 =head1 LICENSE
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
2
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
5
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
6 Licensed under the Apache License, Version 2.0 (the "License");
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
7 you may not use this file except in compliance with the License.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
8 You may obtain a copy of the License at
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
9
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
10 http://www.apache.org/licenses/LICENSE-2.0
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
11
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
12 Unless required by applicable law or agreed to in writing, software
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
13 distributed under the License is distributed on an "AS IS" BASIS,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
15 See the License for the specific language governing permissions and
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
16 limitations under the License.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
17
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
18 =head1 CONTACT
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
19
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
21
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
22 =cut
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
23
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
24 =head1 NAME
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
25
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
26 GeneSplicer
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
27
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
28 =head1 SYNOPSIS
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
29
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
30 mv GeneSplicer.pm ~/.vep/Plugins
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
31 ./vep -i variants.vcf --plugin GeneSplicer,[path_to_genesplicer_bin],[path_to_training_dir],[option1=value],[option2=value]
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
32
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
33 =head1 DESCRIPTION
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
34
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
35 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
36 runs GeneSplicer (https://ccb.jhu.edu/software/genesplicer/) to get
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
37 splice site predictions.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
38
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
39 It evaluates a tract of sequence either side of and including the
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
40 variant, both in reference and alternate states. The amount of
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
41 sequence included either side defaults to 100bp, but can be modified
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
42 by passing e.g. "context=50" as a parameter to the plugin.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
43
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
44 Any predicted splicing regions that overlap the variant are reported
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
45 in the output with one of four states: no_change, diff, gain, loss
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
46
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
47 There follows a "/"-separated string consisting of the following data:
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
48
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
49 1) type (donor, acceptor)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
50 2) coordinates (start-end)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
51 3) confidence (Low, Medium, High)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
52 4) score
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
53
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
54 Example: loss/acceptor/727006-727007/High/16.231924
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
55
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
56 If multiple sites are predicted, their reports are separated by ",".
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
57
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
58 For diff, the confidence and score for both the reference and alternate
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
59 sequences is reported as REF-ALT.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
60
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
61 Example: diff/donor/621915-621914/Medium-Medium/7.020731-6.988368
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
62
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
63 Several parameters can be modified by passing them to the plugin string:
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
64
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
65 context : change the amount of sequence added either side of
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
66 the variant (default: 100bp)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
67 tmpdir : change the temporary directory used (default: /tmp)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
68 cache_size : change how many sequences' scores are cached in memory
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
69 (default: 50)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
70
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
71 Example: --plugin GeneSplicer,$GS/bin/linux/genesplicer,$GS/human,context=200,tmpdir=/mytmp
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
72
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
73 On some systems the binaries provided will not execute, but can be compiled from source:
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
74
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
75 cd $GS/sources
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
76 make
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
77 cd -
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
78 ./vep [options] --plugin GeneSplicer,$GS/sources/genesplicer,$GS/human
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
79
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
80 On Mac OSX the make step is known to fail; the genesplicer.cpp file requires modification:
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
81
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
82 cd $GS/sources
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
83 perl -pi -e "s/^main /int main /" genesplicer.cpp
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
84 make
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
85
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
86
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
87 =cut
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
88
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
89 package GeneSplicer;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
90
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
91 use strict;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
92 use warnings;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
93
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
94 use Digest::MD5 qw(md5_hex);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
95
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
96 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
97 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
98
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
99 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
100 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
101
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
102 our %DEFAULTS = (
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
103 context => 100,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
104 tmpdir => '/tmp',
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
105 cache_size => 50,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
106 );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
107
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
108 sub new {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
109 my $class = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
110
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
111 my $self = $class->SUPER::new(@_);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
112
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
113 # we need sequence, so no offline mode unless we have FASTA
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
114 die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
115
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
116 my $params = $self->params;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
117
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
118 my $bin = shift @$params;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
119 die("ERROR: genesplicer binary not specified\n") unless $bin;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
120 die("ERROR: genesplicer binary not found\n") unless -e $bin;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
121 my $test = `$bin 2>&1`;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
122 die("ERROR: failed to run genesplicer binary:\n$test\n") unless $test =~ /^USAGE/;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
123 $self->{_bin} = $bin;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
124
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
125 my $training_dir = shift @$params;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
126 die("ERROR: training directory not specified\n") unless $training_dir;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
127 die("ERROR: training directory not found\n") unless -d $training_dir;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
128 $self->{_training_dir} = $training_dir;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
129
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
130 # defaults
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
131 $self->{'_param_'.$_} = $DEFAULTS{$_} for keys %DEFAULTS;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
132
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
133 # REST API passes 1 as first param
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
134 shift @$params if $params->[0] && $params->[0] eq '1';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
135
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
136 # set/override with user params
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
137 foreach my $param(@$params) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
138 my ($key, $val) = split('=', $param);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
139 die("ERROR: Failed to parse parameter $param\n") unless defined($key) && defined($val);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
140
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
141 $self->{'_param_'.$key} = $val;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
142 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
143
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
144 return $self;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
145 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
146
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
147 sub feature_types {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
148 return ['Transcript'];
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
149 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
150
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
151 sub get_header_info {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
152 return {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
153 GeneSplicer => "GeneSplicer predictions"
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
154 };
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
155 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
156
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
157 sub run {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
158 my ($self, $tva) = @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
159
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
160 my $vf = $tva->variation_feature;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
161
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
162 # get up and downstream sequences
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
163 my $up_seq = $vf->{slice}->sub_Slice(
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
164 $vf->{start} - $self->{'_param_context'},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
165 $vf->{start} - 1,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
166 $vf->strand
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
167 )->seq;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
168
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
169 my $down_seq = $vf->{slice}->sub_Slice(
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
170 $vf->{end} + 1,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
171 $vf->{end} + $self->{'_param_context'},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
172 $vf->strand
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
173 )->seq;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
174
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
175 # create ref seq by grabbing reference TVA
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
176 my $ref_seq = join("",
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
177 $up_seq,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
178 $tva->transcript_variation->get_reference_TranscriptVariationAllele->variation_feature_seq,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
179 $down_seq
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
180 );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
181
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
182 return {} unless $ref_seq =~ /^[ACGT]+$/;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
183
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
184 # create alt seq
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
185 my $alt_allele = $tva->variation_feature_seq;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
186 $alt_allele =~ s/\-//g;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
187 my $alt_seq = $up_seq.$alt_allele.$down_seq;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
188
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
189
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
190 return {} unless $alt_seq =~ /^[ACGT]+$/;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
191
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
192 # reverse comp if strands differ
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
193 if($tva->transcript->strand != $vf->strand) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
194 reverse_comp(\$ref_seq);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
195 reverse_comp(\$alt_seq);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
196 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
197
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
198 # get results
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
199 my $ref_results = $self->results_from_cache($ref_seq) || $self->results_from_seq($ref_seq);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
200 my $alt_results = $self->results_from_cache($alt_seq) || $self->results_from_seq($alt_seq);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
201
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
202 # compare results both ways
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
203 my $diff_ref_to_alt = $self->compare_results($ref_results, $alt_results);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
204 my $diff_alt_to_ref = $self->compare_results($alt_results, $ref_results);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
205
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
206 # get VF pos relative to tested sequence
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
207 my ($vf_start, $vf_end) = ($self->{'_param_context'} + 1, $self->{'_param_context'} + (($vf->{end} - $vf->{start}) + 1));
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
208
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
209 # get overlapping losses and gains
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
210 # and map to chromosome coords
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
211 my @losses =
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
212 map {$_->{gl} = 'loss'; $_}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
213 @{$diff_ref_to_alt->{lost}};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
214
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
215 my @gains =
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
216 map {$_->{gl} = 'gain'; $_}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
217 @{$diff_alt_to_ref->{lost}};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
218
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
219 my @diffs =
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
220 map {$_->{gl} = 'diff'; $_}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
221 @{$diff_ref_to_alt->{diff}};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
222
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
223 my $return = join(',',
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
224 map {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
225 join('/',
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
226 $_->[0]->{gl},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
227 $_->[0]->{type},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
228 $_->[1]->{end5}.'-'.$_->[1]->{end3},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
229 $_->[0]->{confidence},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
230 $_->[0]->{score}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
231 )
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
232 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
233 map {[$_, $self->map_ss_coords($_, $vf)]}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
234 grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
235 (@losses, @gains, @diffs)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
236 );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
237
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
238 # probably of interest to report splice sites were found
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
239 # but no difference between ref and alt
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
240 if(!$return && grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
241 $return = join(',',
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
242 map {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
243 join('/',
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
244 'no_change',
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
245 $_->[0]->{type},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
246 $_->[1]->{end5}.'-'.$_->[1]->{end3},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
247 $_->[0]->{confidence},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
248 $_->[0]->{score}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
249 )
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
250 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
251 map {[$_, $self->map_ss_coords($_, $vf)]}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
252 grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
253 );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
254 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
255
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
256 return $return ? { GeneSplicer => $return } : {};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
257 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
258
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
259 sub results_from_seq {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
260 my $self = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
261 my $seq = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
262
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
263 # write seqs to file
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
264 my $seq_file = $self->{'_param_tmpdir'}."/genesplicer_$$.fa";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
265 open SEQ, ">$seq_file" or die("ERROR: Could not write to temporary sequence file $seq_file\n");
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
266 print SEQ ">SEQ\n$seq\n";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
267 close SEQ;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
268
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
269 my $result_file = $self->{'_param_tmpdir'}."/genesplicer_$$.results";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
270
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
271 my $cmd = sprintf(
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
272 '%s %s %s -f %s',
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
273 $self->{'_bin'},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
274 $seq_file,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
275 $self->{'_training_dir'},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
276 $result_file
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
277 );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
278
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
279 my $output = `$cmd 2>&1`;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
280 unlink($seq_file);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
281
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
282 return [] unless -e $result_file;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
283
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
284 open RES, $result_file;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
285 my @results;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
286
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
287 while(<RES>) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
288 chomp;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
289 my ($end5, $end3, $score, $confidence, $type) = split;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
290
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
291 push @results, {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
292 end5 => $end5,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
293 end3 => $end3,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
294 score => $score,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
295 confidence => $confidence,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
296 type => $type
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
297 };
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
298 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
299 close RES;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
300
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
301 unlink($result_file);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
302
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
303 push @{$self->{cache}}, { hex => md5_hex($seq), results => \@results};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
304 shift @{$self->{cache}} while scalar @{$self->{cache}} > $self->{_param_cache_size};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
305
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
306 return \@results;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
307 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
308
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
309 sub results_from_cache {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
310 my $self = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
311 my $seq = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
312
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
313 my ($results) = map {$_->{results}} grep {$_->{hex} eq md5_hex($seq)} @{$self->{cache} || []};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
314
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
315 return $results;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
316 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
317
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
318 sub compare_results {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
319 my $self = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
320 my $a = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
321 my $b = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
322
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
323 my (@diff, @lost);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
324
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
325 foreach my $res_a(@$a) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
326 my @match = grep {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
327 $_->{end5} == $res_a->{end5} &&
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
328 $_->{end3} == $res_a->{end3} &&
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
329 $_->{type} eq $res_a->{type}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
330 } @$b;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
331
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
332 # result not found in b
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
333 if(!@match) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
334 push @lost, $res_a;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
335 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
336
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
337 # >1 result found
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
338 elsif(scalar @match > 1) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
339 warn("WARNING: Found two matches?\n");
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
340 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
341
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
342 # 1 match
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
343 elsif($match[0]->{score} != $res_a->{score}) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
344 my %diff = %$res_a;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
345 $diff{score} .= '-'.$match[0]->{score};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
346 $diff{confidence} .= '-'.$match[0]->{confidence};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
347 push @diff, \%diff;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
348 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
349 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
350
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
351 return { diff => \@diff, lost => \@lost};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
352 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
353
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
354 sub map_ss_coords {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
355 my $self = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
356 my $res = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
357 my $vf = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
358
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
359 my $return = {};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
360
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
361 foreach my $coord(qw(end5 end3)) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
362 $return->{$coord} = (($res->{$coord} - $self->{'_param_context'}) + $vf->{start}) - 1;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
363 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
364
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
365 return $return;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
366 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
367
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
368 1;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
369