annotate dir_plugins/Conservation.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
1 =head1 LICENSE
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
2
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
5
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
6 Licensed under the Apache License, Version 2.0 (the "License");
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
7 you may not use this file except in compliance with the License.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
8 You may obtain a copy of the License at
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
9
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
10 http://www.apache.org/licenses/LICENSE-2.0
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
11
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
12 Unless required by applicable law or agreed to in writing, software
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
13 distributed under the License is distributed on an "AS IS" BASIS,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
15 See the License for the specific language governing permissions and
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
16 limitations under the License.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
17
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
18 =head1 CONTACT
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
19
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
21
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
22 =cut
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
23
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
24 =head1 NAME
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
25
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
26 Conservation
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
27
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
28 =head1 SYNOPSIS
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
29
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
30 mv Conservation.pm ~/.vep/Plugins
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
31 ./vep -i variations.vcf --plugin Conservation,GERP_CONSERVATION_SCORE,mammals
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
32
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
33 =head1 DESCRIPTION
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
34
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
35 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
36 retrieves a conservation score from the Ensembl Compara databases
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
37 for variant positions. You can specify the method link type and
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
38 species sets as command line options, the default is to fetch GERP
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
39 scores from the EPO 35 way mammalian alignment (please refer to the
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
40 Compara documentation for more details of available analyses).
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
41
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
42 If a variant affects multiple nucleotides the average score for the
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
43 position will be returned, and for insertions the average score of
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
44 the 2 flanking bases will be returned.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
45
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
46 The plugin requires the ensembl-compara API module to be installed;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
47 see http://www.ensembl.org/info/docs/api/index.html
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
48
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
49 =cut
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
50
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
51 package Conservation;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
52
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
53 use strict;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
54 use warnings;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
55
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
56 use Bio::EnsEMBL::Registry;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
57 use Bio::EnsEMBL::Slice;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
58
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
59 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
60
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
61 sub version {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
62 return '2.3';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
63 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
64
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
65 sub feature_types {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
66 return ['Feature','Intergenic'];
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
67 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
68
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
69 sub get_header_info {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
70
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
71 my $self = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
72
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
73 return {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
74 Conservation => "The conservation score for this site (method_link_type=\"".
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
75 $self->{method_link_type}."\", species_set=\"".$self->{species_set}."\")",
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
76 };
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
77 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
78
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
79 sub new {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
80 my $class = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
81
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
82 my $self = $class->SUPER::new(@_);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
83
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
84 my $params = $self->params;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
85
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
86 # REST API passes 1 as first param
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
87 shift @$params if $params->[0] && $params->[0] eq '1';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
88
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
89 $self->{method_link_type} = $params->[0] || 'GERP_CONSERVATION_SCORE';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
90 $self->{species_set} = $params->[1] || 'mammals';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
91
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
92 my $config = $self->{config};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
93 my $reg = 'Bio::EnsEMBL::Registry';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
94
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
95 # reconnect to DB without species param
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
96 if($config->{host}) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
97 $reg->load_registry_from_db(
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
98 -host => $config->{host},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
99 -user => $config->{user},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
100 -pass => $config->{password},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
101 -port => $config->{port},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
102 -db_version => $config->{db_version},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
103 -no_cache => $config->{no_slice_cache},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
104 );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
105 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
106
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
107 my $mlss_adap = $config->{mlssa} ||= $reg->get_adaptor('Multi', 'compara', 'MethodLinkSpeciesSet')
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
108 or die "Failed to connect to compara database\n";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
109
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
110 $self->{mlss} = $mlss_adap->fetch_by_method_link_type_species_set_name($self->{method_link_type}, $self->{species_set})
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
111 or die "Failed to fetch MLSS for ".$self->{method_link_type}." and ".$self->{species_set}."\n";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
112
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
113 $self->{cs_adap} = $config->{cosa} ||= $reg->get_adaptor('Multi', 'compara', 'ConservationScore')
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
114 or die "Failed to fetch conservation adaptor\n";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
115
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
116 return $self;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
117 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
118
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
119 sub run {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
120 my ($self, $bvfoa) = @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
121
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
122 my $bvf = $bvfoa->base_variation_feature;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
123
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
124 # we cache the score on the BaseVariationFeature so we don't have to
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
125 # fetch it multiple times if this variant overlaps multiple Features
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
126
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
127 unless (exists $bvf->{_conservation_score}) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
128
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
129 my $slice;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
130
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
131 my $true_snp = 0;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
132
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
133 if ($bvf->{end} >= $bvf->{start}) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
134
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
135 if ($bvf->{start} == $bvf->{end}) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
136
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
137 # work around a bug in the compara API that means you can't fetch
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
138 # conservation scores for 1bp slices by creating a 2bp slice for
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
139 # SNPs and then ignoring the score returned for the second position
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
140
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
141 my $s = $bvf->slice;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
142
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
143 $slice = Bio::EnsEMBL::Slice->new(
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
144 -seq_region_name => $s->seq_region_name,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
145 -seq_region_length => $s->seq_region_length,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
146 -coord_system => $s->coord_system,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
147 -start => $bvf->{start},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
148 -end => $bvf->{end} + 1,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
149 -strand => $bvf->{strand},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
150 -adaptor => $s->adaptor
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
151 );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
152
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
153 $true_snp = 1;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
154 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
155 else {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
156
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
157 # otherwise, just get a slice that covers our variant feature
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
158
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
159 $slice = $bvf->feature_Slice;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
160 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
161 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
162 else {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
163
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
164 # this is an insertion, we return the average score of the flanking
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
165 # bases, so we create a 2bp slice around the insertion site
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
166
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
167 my $s = $bvf->slice;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
168
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
169 $slice = Bio::EnsEMBL::Slice->new(
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
170 -seq_region_name => $s->seq_region_name,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
171 -seq_region_length => $s->seq_region_length,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
172 -coord_system => $s->coord_system,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
173 -start => $bvf->{end},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
174 -end => $bvf->{start},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
175 -strand => $bvf->{strand},
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
176 -adaptor => $s->adaptor
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
177 );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
178 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
179
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
180 my $scores = $self->{cs_adap}->fetch_all_by_MethodLinkSpeciesSet_Slice(
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
181 $self->{mlss}, # our MLSS for the conservation metric and the set of species
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
182 $slice, # our slice
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
183 ($slice->end - $slice->start + 1), # the number of scores we want back (one for each base)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
184 );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
185
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
186 if (@$scores > 0) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
187
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
188 # we use the simple average of the diff_scores as the overall score
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
189
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
190 pop @$scores if $true_snp; # get rid of our spurious second score for SNPs
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
191
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
192 if (@$scores > 0) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
193
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
194 my $tot_score = 0;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
195
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
196 $tot_score += $_->diff_score for @$scores;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
197
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
198 $tot_score /= @$scores;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
199
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
200 $bvf->{_conservation_score} = sprintf "%.3f", $tot_score;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
201 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
202 else {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
203 $bvf->{_conservation_score} = undef;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
204 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
205 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
206 else {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
207 $bvf->{_conservation_score} = undef;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
208 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
209 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
210
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
211 if (defined $bvf->{_conservation_score}) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
212 return {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
213 Conservation => $bvf->{_conservation_score}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
214 };
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
215 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
216 else {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
217 return {};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
218 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
219 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
220
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
221 1;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
222