|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 Conservation
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv Conservation.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variations.vcf --plugin Conservation,GERP_CONSERVATION_SCORE,mammals
|
|
|
32
|
|
|
33 =head1 DESCRIPTION
|
|
|
34
|
|
|
35 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
|
|
|
36 retrieves a conservation score from the Ensembl Compara databases
|
|
|
37 for variant positions. You can specify the method link type and
|
|
|
38 species sets as command line options, the default is to fetch GERP
|
|
|
39 scores from the EPO 35 way mammalian alignment (please refer to the
|
|
|
40 Compara documentation for more details of available analyses).
|
|
|
41
|
|
|
42 If a variant affects multiple nucleotides the average score for the
|
|
|
43 position will be returned, and for insertions the average score of
|
|
|
44 the 2 flanking bases will be returned.
|
|
|
45
|
|
|
46 The plugin requires the ensembl-compara API module to be installed;
|
|
|
47 see http://www.ensembl.org/info/docs/api/index.html
|
|
|
48
|
|
|
49 =cut
|
|
|
50
|
|
|
51 package Conservation;
|
|
|
52
|
|
|
53 use strict;
|
|
|
54 use warnings;
|
|
|
55
|
|
|
56 use Bio::EnsEMBL::Registry;
|
|
|
57 use Bio::EnsEMBL::Slice;
|
|
|
58
|
|
|
59 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
60
|
|
|
61 sub version {
|
|
|
62 return '2.3';
|
|
|
63 }
|
|
|
64
|
|
|
65 sub feature_types {
|
|
|
66 return ['Feature','Intergenic'];
|
|
|
67 }
|
|
|
68
|
|
|
69 sub get_header_info {
|
|
|
70
|
|
|
71 my $self = shift;
|
|
|
72
|
|
|
73 return {
|
|
|
74 Conservation => "The conservation score for this site (method_link_type=\"".
|
|
|
75 $self->{method_link_type}."\", species_set=\"".$self->{species_set}."\")",
|
|
|
76 };
|
|
|
77 }
|
|
|
78
|
|
|
79 sub new {
|
|
|
80 my $class = shift;
|
|
|
81
|
|
|
82 my $self = $class->SUPER::new(@_);
|
|
|
83
|
|
|
84 my $params = $self->params;
|
|
|
85
|
|
|
86 # REST API passes 1 as first param
|
|
|
87 shift @$params if $params->[0] && $params->[0] eq '1';
|
|
|
88
|
|
|
89 $self->{method_link_type} = $params->[0] || 'GERP_CONSERVATION_SCORE';
|
|
|
90 $self->{species_set} = $params->[1] || 'mammals';
|
|
|
91
|
|
|
92 my $config = $self->{config};
|
|
|
93 my $reg = 'Bio::EnsEMBL::Registry';
|
|
|
94
|
|
|
95 # reconnect to DB without species param
|
|
|
96 if($config->{host}) {
|
|
|
97 $reg->load_registry_from_db(
|
|
|
98 -host => $config->{host},
|
|
|
99 -user => $config->{user},
|
|
|
100 -pass => $config->{password},
|
|
|
101 -port => $config->{port},
|
|
|
102 -db_version => $config->{db_version},
|
|
|
103 -no_cache => $config->{no_slice_cache},
|
|
|
104 );
|
|
|
105 }
|
|
|
106
|
|
|
107 my $mlss_adap = $config->{mlssa} ||= $reg->get_adaptor('Multi', 'compara', 'MethodLinkSpeciesSet')
|
|
|
108 or die "Failed to connect to compara database\n";
|
|
|
109
|
|
|
110 $self->{mlss} = $mlss_adap->fetch_by_method_link_type_species_set_name($self->{method_link_type}, $self->{species_set})
|
|
|
111 or die "Failed to fetch MLSS for ".$self->{method_link_type}." and ".$self->{species_set}."\n";
|
|
|
112
|
|
|
113 $self->{cs_adap} = $config->{cosa} ||= $reg->get_adaptor('Multi', 'compara', 'ConservationScore')
|
|
|
114 or die "Failed to fetch conservation adaptor\n";
|
|
|
115
|
|
|
116 return $self;
|
|
|
117 }
|
|
|
118
|
|
|
119 sub run {
|
|
|
120 my ($self, $bvfoa) = @_;
|
|
|
121
|
|
|
122 my $bvf = $bvfoa->base_variation_feature;
|
|
|
123
|
|
|
124 # we cache the score on the BaseVariationFeature so we don't have to
|
|
|
125 # fetch it multiple times if this variant overlaps multiple Features
|
|
|
126
|
|
|
127 unless (exists $bvf->{_conservation_score}) {
|
|
|
128
|
|
|
129 my $slice;
|
|
|
130
|
|
|
131 my $true_snp = 0;
|
|
|
132
|
|
|
133 if ($bvf->{end} >= $bvf->{start}) {
|
|
|
134
|
|
|
135 if ($bvf->{start} == $bvf->{end}) {
|
|
|
136
|
|
|
137 # work around a bug in the compara API that means you can't fetch
|
|
|
138 # conservation scores for 1bp slices by creating a 2bp slice for
|
|
|
139 # SNPs and then ignoring the score returned for the second position
|
|
|
140
|
|
|
141 my $s = $bvf->slice;
|
|
|
142
|
|
|
143 $slice = Bio::EnsEMBL::Slice->new(
|
|
|
144 -seq_region_name => $s->seq_region_name,
|
|
|
145 -seq_region_length => $s->seq_region_length,
|
|
|
146 -coord_system => $s->coord_system,
|
|
|
147 -start => $bvf->{start},
|
|
|
148 -end => $bvf->{end} + 1,
|
|
|
149 -strand => $bvf->{strand},
|
|
|
150 -adaptor => $s->adaptor
|
|
|
151 );
|
|
|
152
|
|
|
153 $true_snp = 1;
|
|
|
154 }
|
|
|
155 else {
|
|
|
156
|
|
|
157 # otherwise, just get a slice that covers our variant feature
|
|
|
158
|
|
|
159 $slice = $bvf->feature_Slice;
|
|
|
160 }
|
|
|
161 }
|
|
|
162 else {
|
|
|
163
|
|
|
164 # this is an insertion, we return the average score of the flanking
|
|
|
165 # bases, so we create a 2bp slice around the insertion site
|
|
|
166
|
|
|
167 my $s = $bvf->slice;
|
|
|
168
|
|
|
169 $slice = Bio::EnsEMBL::Slice->new(
|
|
|
170 -seq_region_name => $s->seq_region_name,
|
|
|
171 -seq_region_length => $s->seq_region_length,
|
|
|
172 -coord_system => $s->coord_system,
|
|
|
173 -start => $bvf->{end},
|
|
|
174 -end => $bvf->{start},
|
|
|
175 -strand => $bvf->{strand},
|
|
|
176 -adaptor => $s->adaptor
|
|
|
177 );
|
|
|
178 }
|
|
|
179
|
|
|
180 my $scores = $self->{cs_adap}->fetch_all_by_MethodLinkSpeciesSet_Slice(
|
|
|
181 $self->{mlss}, # our MLSS for the conservation metric and the set of species
|
|
|
182 $slice, # our slice
|
|
|
183 ($slice->end - $slice->start + 1), # the number of scores we want back (one for each base)
|
|
|
184 );
|
|
|
185
|
|
|
186 if (@$scores > 0) {
|
|
|
187
|
|
|
188 # we use the simple average of the diff_scores as the overall score
|
|
|
189
|
|
|
190 pop @$scores if $true_snp; # get rid of our spurious second score for SNPs
|
|
|
191
|
|
|
192 if (@$scores > 0) {
|
|
|
193
|
|
|
194 my $tot_score = 0;
|
|
|
195
|
|
|
196 $tot_score += $_->diff_score for @$scores;
|
|
|
197
|
|
|
198 $tot_score /= @$scores;
|
|
|
199
|
|
|
200 $bvf->{_conservation_score} = sprintf "%.3f", $tot_score;
|
|
|
201 }
|
|
|
202 else {
|
|
|
203 $bvf->{_conservation_score} = undef;
|
|
|
204 }
|
|
|
205 }
|
|
|
206 else {
|
|
|
207 $bvf->{_conservation_score} = undef;
|
|
|
208 }
|
|
|
209 }
|
|
|
210
|
|
|
211 if (defined $bvf->{_conservation_score}) {
|
|
|
212 return {
|
|
|
213 Conservation => $bvf->{_conservation_score}
|
|
|
214 };
|
|
|
215 }
|
|
|
216 else {
|
|
|
217 return {};
|
|
|
218 }
|
|
|
219 }
|
|
|
220
|
|
|
221 1;
|
|
|
222
|