comparison dir_plugins/SubsetVCF.pm @ 0:e545d0a25ffe draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:17:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e545d0a25ffe
1 =head1 NAME
2
3 SubsetVCF
4
5 =head1 DESCRIPTION
6
7 A VEP plugin to retrieve overlapping records from a given VCF file.
8 Values for POS, ID, and ALT, are retrieved as well as values for any requested
9 INFO field. Additionally, the allele number of the matching ALT is returned.
10
11 Though similar to using '--custom', this plugin returns all ALTs for a given
12 POS, as well as all associated INFO values.
13
14 By default, only VCF records with a filter value of "PASS" are returned,
15 however this behaviour can be changed via the 'filter' option.
16
17 Parameters:
18 name: short name added used as a prefix (required)
19 file: path to tabix-index vcf file (required)
20 filter: only consider variants marked as 'PASS', 1 or 0 (default, 1)
21 fields: info fields to be returned (default, not used)
22 '%' can delimit multiple fields
23 '*' can be used as a wildcard
24
25 Returns:
26 <name>_POS: POS field from VCF
27 <name>_REF: REF field from VCF (minimised)
28 <name>_ALT: ALT field from VCF (minimised)
29 <name>_alt_index: Index of matching variant (zero-based)
30 <name>_<field>: List of requested info values
31
32 =head1 SYNOPSIS
33
34 ./vep -i variations.vcf --plugin SubsetVCF,file=filepath.vcf.gz,name=myvfc,fields=AC*%AN*
35
36 =head1 CONTACT
37
38 Joseph A. Prinz <jp102@duke.edu>
39
40 =head1 LICENSE
41
42 Licensed under the Apache License, Version 2.0 (the "License");
43 you may not use this file except in compliance with the License.
44 You may obtain a copy of the License at
45 http://www.apache.org/licenses/LICENSE-2.0
46 Unless required by applicable law or agreed to in writing, software
47 distributed under the License is distributed on an "AS IS" BASIS,
48 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
49 See the License for the specific language governing permissions and
50 limitations under the License.
51
52 =cut
53
54 package SubsetVCF;
55
56 use strict;
57 use warnings;
58
59 use Storable qw(dclone);
60 use Data::Dumper;
61
62 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
63 use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles);
64 use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line get_all_consequences);
65 use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
66 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
67
68 sub simple_vf {
69 my ($vf, $params) = @_;
70 my @alleles = split /\//, $vf->{allele_string};
71 my $ref = shift @alleles;
72
73 my @line;
74 if (ref $vf->{_line} ne "ARRAY") {
75 @line = split /\t/, $vf->{_line};
76 } else {
77 @line = @{$vf->{_line}};
78 }
79
80 # Use a particular allele if requested
81 @alleles = ($params->{allele}) if $params->{allele};
82
83 # Reverse comp if needed
84 if ($vf->{strand} < 0) {
85 @alleles = map { reverse_comp($_) } @alleles;
86 $ref = reverse_comp($ref);
87 $vf->{strand} = 1;
88 }
89
90 # Return values
91 my $ret = {
92 chr => $vf->{chr},
93 pos => $vf->{start},
94 start => $vf->{start},
95 end => $vf->{end},
96 strand => $vf->{strand},
97 alts => [@alleles],
98 line => [@line],
99 ref => $ref};
100
101 # If filter is true, only return $ret if filter eq "PASS"
102 return $params->{filter} && $line[6] ne "PASS" ? {} : $ret;
103 }
104
105 sub parse_info {
106 my ($line, $valid_fields) = @_;
107 my %ret;
108 for my $dat (split /;/, $line->[7]) {
109 my ($field, $val) = split /=/, $dat;
110 if (grep { $field eq $_ } @$valid_fields) {
111 $ret{$field} = [split /,/, $val];
112 }
113 }
114 return \%ret;
115 }
116
117 sub new {
118 my $class = shift;
119 my $self = $class->SUPER::new(@_);
120
121 $self->expand_left(0);
122 $self->expand_right(0);
123 $self->get_user_params();
124
125 # Get params and ensure a minumum number of parameters
126 my $params = $self->params_to_hash();
127 die "ERROR: no value for 'file' specified" if !$params->{file};
128 die "ERROR: no value for 'name' specified" if !$params->{name};
129
130 # Defaults
131 $params->{filter} = 1 if !$params->{filter};
132
133 # Add file via parameter hash
134 $self->add_file($params->{file});
135 $self->{filter} = $params->{filter};
136 $self->{name} = $params->{name};
137
138 if ($params->{fields}) {
139 # Mung filter to turn AC*%AN* into AC[^,]+|AN[^,]+
140 $params->{fields} =~ s/%/|/g;
141 $params->{fields} =~ s/\*/[^,]*/g;
142
143 # Get input file headers
144 my %fields;
145 my $info_regex = "^##INFO=<ID=($params->{fields}),.*Description=\"([^\"]+).*";
146 open HEAD, "tabix -fh $params->{file} 1:1-1 2>&1 | ";
147 while(my $line = <HEAD>) {
148 next unless $line =~ $info_regex;
149 $fields{$1} = $2;
150 }
151 die "Could not find any valid info fields" if not %fields;
152 $self->{fields} = \%fields;
153 $self->{valid_fields} = [keys %fields];
154 }
155 return $self;
156 }
157
158 sub feature_types {
159 return ['Feature', 'Intergenic'];
160 }
161
162 sub get_header_info {
163 my $self = shift;
164 my %ret;
165
166 # Add fields if requested
167 if ($self->{fields}) {
168 while (my ($field, $desc) = each %{$self->{fields}}) {
169 $ret{"$self->{name}_$field"} = $desc;
170 }
171 }
172
173 $ret{"$self->{name}_ID"} = "Original ID";
174 $ret{"$self->{name}_POS"} = "Original POS";
175 $ret{"$self->{name}_REF"} = "Original refrance allele";
176 $ret{"$self->{name}_ALT"} = "Original alternatives as they appear in the VCF file";
177 $ret{"$self->{name}_alt_index"} = "Index of matching alternative (zero-based)";
178 return \%ret;
179 }
180
181 sub parse_data {
182 my ($self, $line) = @_;
183 my ($vf) = @{parse_line({format => 'vcf', minimal => 1}, $line)};
184 return simple_vf($vf, {filter => $self->{filter}});
185 }
186
187 sub run {
188 my ($self, $tva) = @_;
189 my $vf = simple_vf($tva->variation_feature, {allele => $tva->{variation_feature_seq}});
190
191 # Zero-indexing start for tabix and adding 1 to end for VEP indels
192 my @data = @{$self->get_data($vf->{chr}, ($vf->{start} - 1), ($vf->{end} + 1))};
193
194 my (%ret, $found_vf, @matches);
195 for my $dat (@data) {
196 next unless %$dat;
197 @matches = @{get_matched_variant_alleles($vf, $dat)};
198 if (@matches) {
199 $found_vf = dclone $dat;
200 last;
201 }
202 }
203
204 if (@matches) {
205 # Return the index of matching found alleles
206 my @found_alts = map { $_->{b_index} } @matches;
207
208 # Parse info fields if needed
209 if ($self->{fields}) {
210 my %found_fields = %{parse_info($found_vf->{line}, $self->{valid_fields})};
211 while (my ($field, $val) = each %found_fields) {
212 $ret{"$self->{name}_$field"} = [@$val];
213 }
214 }
215
216 $ret{"$self->{name}_ID"} = $found_vf->{line}->[2];
217 $ret{"$self->{name}_POS"} = $found_vf->{pos};
218 $ret{"$self->{name}_POS"} = $found_vf->{pos};
219 $ret{"$self->{name}_REF"} = $found_vf->{ref};
220 $ret{"$self->{name}_ALT"} = $found_vf->{alts};
221 $ret{"$self->{name}_alt_index"} = [@found_alts];
222 }
223 return \%ret;
224 }
225
226 1;