|
0
|
1 =head1 NAME
|
|
|
2
|
|
|
3 SubsetVCF
|
|
|
4
|
|
|
5 =head1 DESCRIPTION
|
|
|
6
|
|
|
7 A VEP plugin to retrieve overlapping records from a given VCF file.
|
|
|
8 Values for POS, ID, and ALT, are retrieved as well as values for any requested
|
|
|
9 INFO field. Additionally, the allele number of the matching ALT is returned.
|
|
|
10
|
|
|
11 Though similar to using '--custom', this plugin returns all ALTs for a given
|
|
|
12 POS, as well as all associated INFO values.
|
|
|
13
|
|
|
14 By default, only VCF records with a filter value of "PASS" are returned,
|
|
|
15 however this behaviour can be changed via the 'filter' option.
|
|
|
16
|
|
|
17 Parameters:
|
|
|
18 name: short name added used as a prefix (required)
|
|
|
19 file: path to tabix-index vcf file (required)
|
|
|
20 filter: only consider variants marked as 'PASS', 1 or 0 (default, 1)
|
|
|
21 fields: info fields to be returned (default, not used)
|
|
|
22 '%' can delimit multiple fields
|
|
|
23 '*' can be used as a wildcard
|
|
|
24
|
|
|
25 Returns:
|
|
|
26 <name>_POS: POS field from VCF
|
|
|
27 <name>_REF: REF field from VCF (minimised)
|
|
|
28 <name>_ALT: ALT field from VCF (minimised)
|
|
|
29 <name>_alt_index: Index of matching variant (zero-based)
|
|
|
30 <name>_<field>: List of requested info values
|
|
|
31
|
|
|
32 =head1 SYNOPSIS
|
|
|
33
|
|
|
34 ./vep -i variations.vcf --plugin SubsetVCF,file=filepath.vcf.gz,name=myvfc,fields=AC*%AN*
|
|
|
35
|
|
|
36 =head1 CONTACT
|
|
|
37
|
|
|
38 Joseph A. Prinz <jp102@duke.edu>
|
|
|
39
|
|
|
40 =head1 LICENSE
|
|
|
41
|
|
|
42 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
43 you may not use this file except in compliance with the License.
|
|
|
44 You may obtain a copy of the License at
|
|
|
45 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
46 Unless required by applicable law or agreed to in writing, software
|
|
|
47 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
48 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
49 See the License for the specific language governing permissions and
|
|
|
50 limitations under the License.
|
|
|
51
|
|
|
52 =cut
|
|
|
53
|
|
|
54 package SubsetVCF;
|
|
|
55
|
|
|
56 use strict;
|
|
|
57 use warnings;
|
|
|
58
|
|
|
59 use Storable qw(dclone);
|
|
|
60 use Data::Dumper;
|
|
|
61
|
|
|
62 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
|
63 use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles);
|
|
|
64 use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line get_all_consequences);
|
|
|
65 use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
|
|
|
66 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
|
|
|
67
|
|
|
68 sub simple_vf {
|
|
|
69 my ($vf, $params) = @_;
|
|
|
70 my @alleles = split /\//, $vf->{allele_string};
|
|
|
71 my $ref = shift @alleles;
|
|
|
72
|
|
|
73 my @line;
|
|
|
74 if (ref $vf->{_line} ne "ARRAY") {
|
|
|
75 @line = split /\t/, $vf->{_line};
|
|
|
76 } else {
|
|
|
77 @line = @{$vf->{_line}};
|
|
|
78 }
|
|
|
79
|
|
|
80 # Use a particular allele if requested
|
|
|
81 @alleles = ($params->{allele}) if $params->{allele};
|
|
|
82
|
|
|
83 # Reverse comp if needed
|
|
|
84 if ($vf->{strand} < 0) {
|
|
|
85 @alleles = map { reverse_comp($_) } @alleles;
|
|
|
86 $ref = reverse_comp($ref);
|
|
|
87 $vf->{strand} = 1;
|
|
|
88 }
|
|
|
89
|
|
|
90 # Return values
|
|
|
91 my $ret = {
|
|
|
92 chr => $vf->{chr},
|
|
|
93 pos => $vf->{start},
|
|
|
94 start => $vf->{start},
|
|
|
95 end => $vf->{end},
|
|
|
96 strand => $vf->{strand},
|
|
|
97 alts => [@alleles],
|
|
|
98 line => [@line],
|
|
|
99 ref => $ref};
|
|
|
100
|
|
|
101 # If filter is true, only return $ret if filter eq "PASS"
|
|
|
102 return $params->{filter} && $line[6] ne "PASS" ? {} : $ret;
|
|
|
103 }
|
|
|
104
|
|
|
105 sub parse_info {
|
|
|
106 my ($line, $valid_fields) = @_;
|
|
|
107 my %ret;
|
|
|
108 for my $dat (split /;/, $line->[7]) {
|
|
|
109 my ($field, $val) = split /=/, $dat;
|
|
|
110 if (grep { $field eq $_ } @$valid_fields) {
|
|
|
111 $ret{$field} = [split /,/, $val];
|
|
|
112 }
|
|
|
113 }
|
|
|
114 return \%ret;
|
|
|
115 }
|
|
|
116
|
|
|
117 sub new {
|
|
|
118 my $class = shift;
|
|
|
119 my $self = $class->SUPER::new(@_);
|
|
|
120
|
|
|
121 $self->expand_left(0);
|
|
|
122 $self->expand_right(0);
|
|
|
123 $self->get_user_params();
|
|
|
124
|
|
|
125 # Get params and ensure a minumum number of parameters
|
|
|
126 my $params = $self->params_to_hash();
|
|
|
127 die "ERROR: no value for 'file' specified" if !$params->{file};
|
|
|
128 die "ERROR: no value for 'name' specified" if !$params->{name};
|
|
|
129
|
|
|
130 # Defaults
|
|
|
131 $params->{filter} = 1 if !$params->{filter};
|
|
|
132
|
|
|
133 # Add file via parameter hash
|
|
|
134 $self->add_file($params->{file});
|
|
|
135 $self->{filter} = $params->{filter};
|
|
|
136 $self->{name} = $params->{name};
|
|
|
137
|
|
|
138 if ($params->{fields}) {
|
|
|
139 # Mung filter to turn AC*%AN* into AC[^,]+|AN[^,]+
|
|
|
140 $params->{fields} =~ s/%/|/g;
|
|
|
141 $params->{fields} =~ s/\*/[^,]*/g;
|
|
|
142
|
|
|
143 # Get input file headers
|
|
|
144 my %fields;
|
|
|
145 my $info_regex = "^##INFO=<ID=($params->{fields}),.*Description=\"([^\"]+).*";
|
|
|
146 open HEAD, "tabix -fh $params->{file} 1:1-1 2>&1 | ";
|
|
|
147 while(my $line = <HEAD>) {
|
|
|
148 next unless $line =~ $info_regex;
|
|
|
149 $fields{$1} = $2;
|
|
|
150 }
|
|
|
151 die "Could not find any valid info fields" if not %fields;
|
|
|
152 $self->{fields} = \%fields;
|
|
|
153 $self->{valid_fields} = [keys %fields];
|
|
|
154 }
|
|
|
155 return $self;
|
|
|
156 }
|
|
|
157
|
|
|
158 sub feature_types {
|
|
|
159 return ['Feature', 'Intergenic'];
|
|
|
160 }
|
|
|
161
|
|
|
162 sub get_header_info {
|
|
|
163 my $self = shift;
|
|
|
164 my %ret;
|
|
|
165
|
|
|
166 # Add fields if requested
|
|
|
167 if ($self->{fields}) {
|
|
|
168 while (my ($field, $desc) = each %{$self->{fields}}) {
|
|
|
169 $ret{"$self->{name}_$field"} = $desc;
|
|
|
170 }
|
|
|
171 }
|
|
|
172
|
|
|
173 $ret{"$self->{name}_ID"} = "Original ID";
|
|
|
174 $ret{"$self->{name}_POS"} = "Original POS";
|
|
|
175 $ret{"$self->{name}_REF"} = "Original refrance allele";
|
|
|
176 $ret{"$self->{name}_ALT"} = "Original alternatives as they appear in the VCF file";
|
|
|
177 $ret{"$self->{name}_alt_index"} = "Index of matching alternative (zero-based)";
|
|
|
178 return \%ret;
|
|
|
179 }
|
|
|
180
|
|
|
181 sub parse_data {
|
|
|
182 my ($self, $line) = @_;
|
|
|
183 my ($vf) = @{parse_line({format => 'vcf', minimal => 1}, $line)};
|
|
|
184 return simple_vf($vf, {filter => $self->{filter}});
|
|
|
185 }
|
|
|
186
|
|
|
187 sub run {
|
|
|
188 my ($self, $tva) = @_;
|
|
|
189 my $vf = simple_vf($tva->variation_feature, {allele => $tva->{variation_feature_seq}});
|
|
|
190
|
|
|
191 # Zero-indexing start for tabix and adding 1 to end for VEP indels
|
|
|
192 my @data = @{$self->get_data($vf->{chr}, ($vf->{start} - 1), ($vf->{end} + 1))};
|
|
|
193
|
|
|
194 my (%ret, $found_vf, @matches);
|
|
|
195 for my $dat (@data) {
|
|
|
196 next unless %$dat;
|
|
|
197 @matches = @{get_matched_variant_alleles($vf, $dat)};
|
|
|
198 if (@matches) {
|
|
|
199 $found_vf = dclone $dat;
|
|
|
200 last;
|
|
|
201 }
|
|
|
202 }
|
|
|
203
|
|
|
204 if (@matches) {
|
|
|
205 # Return the index of matching found alleles
|
|
|
206 my @found_alts = map { $_->{b_index} } @matches;
|
|
|
207
|
|
|
208 # Parse info fields if needed
|
|
|
209 if ($self->{fields}) {
|
|
|
210 my %found_fields = %{parse_info($found_vf->{line}, $self->{valid_fields})};
|
|
|
211 while (my ($field, $val) = each %found_fields) {
|
|
|
212 $ret{"$self->{name}_$field"} = [@$val];
|
|
|
213 }
|
|
|
214 }
|
|
|
215
|
|
|
216 $ret{"$self->{name}_ID"} = $found_vf->{line}->[2];
|
|
|
217 $ret{"$self->{name}_POS"} = $found_vf->{pos};
|
|
|
218 $ret{"$self->{name}_POS"} = $found_vf->{pos};
|
|
|
219 $ret{"$self->{name}_REF"} = $found_vf->{ref};
|
|
|
220 $ret{"$self->{name}_ALT"} = $found_vf->{alts};
|
|
|
221 $ret{"$self->{name}_alt_index"} = [@found_alts];
|
|
|
222 }
|
|
|
223 return \%ret;
|
|
|
224 }
|
|
|
225
|
|
|
226 1;
|