Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/SubsetVCF.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
comparison
equal
deleted
inserted
replaced
| 2:17c98d091710 | 3:49397129aec0 |
|---|---|
| 1 =head1 NAME | |
| 2 | |
| 3 SubsetVCF | |
| 4 | |
| 5 =head1 DESCRIPTION | |
| 6 | |
| 7 A VEP plugin to retrieve overlapping records from a given VCF file. | |
| 8 Values for POS, ID, and ALT, are retrieved as well as values for any requested | |
| 9 INFO field. Additionally, the allele number of the matching ALT is returned. | |
| 10 | |
| 11 Though similar to using '--custom', this plugin returns all ALTs for a given | |
| 12 POS, as well as all associated INFO values. | |
| 13 | |
| 14 By default, only VCF records with a filter value of "PASS" are returned, | |
| 15 however this behaviour can be changed via the 'filter' option. | |
| 16 | |
| 17 Parameters: | |
| 18 name: short name added used as a prefix (required) | |
| 19 file: path to tabix-index vcf file (required) | |
| 20 filter: only consider variants marked as 'PASS', 1 or 0 (default, 1) | |
| 21 fields: info fields to be returned (default, not used) | |
| 22 '%' can delimit multiple fields | |
| 23 '*' can be used as a wildcard | |
| 24 | |
| 25 Returns: | |
| 26 <name>_POS: POS field from VCF | |
| 27 <name>_REF: REF field from VCF (minimised) | |
| 28 <name>_ALT: ALT field from VCF (minimised) | |
| 29 <name>_alt_index: Index of matching variant (zero-based) | |
| 30 <name>_<field>: List of requested info values | |
| 31 | |
| 32 =head1 SYNOPSIS | |
| 33 | |
| 34 ./vep -i variations.vcf --plugin SubsetVCF,file=filepath.vcf.gz,name=myvfc,fields=AC*%AN* | |
| 35 | |
| 36 =head1 CONTACT | |
| 37 | |
| 38 Joseph A. Prinz <jp102@duke.edu> | |
| 39 | |
| 40 =head1 LICENSE | |
| 41 | |
| 42 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 43 you may not use this file except in compliance with the License. | |
| 44 You may obtain a copy of the License at | |
| 45 http://www.apache.org/licenses/LICENSE-2.0 | |
| 46 Unless required by applicable law or agreed to in writing, software | |
| 47 distributed under the License is distributed on an "AS IS" BASIS, | |
| 48 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 49 See the License for the specific language governing permissions and | |
| 50 limitations under the License. | |
| 51 | |
| 52 =cut | |
| 53 | |
| 54 package SubsetVCF; | |
| 55 | |
| 56 use strict; | |
| 57 use warnings; | |
| 58 | |
| 59 use Storable qw(dclone); | |
| 60 use Data::Dumper; | |
| 61 | |
| 62 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
| 63 use Bio::EnsEMBL::Variation::Utils::Sequence qw(get_matched_variant_alleles); | |
| 64 use Bio::EnsEMBL::Variation::Utils::VEP qw(parse_line get_all_consequences); | |
| 65 use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin; | |
| 66 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); | |
| 67 | |
| 68 sub simple_vf { | |
| 69 my ($vf, $params) = @_; | |
| 70 my @alleles = split /\//, $vf->{allele_string}; | |
| 71 my $ref = shift @alleles; | |
| 72 | |
| 73 my @line; | |
| 74 if (ref $vf->{_line} ne "ARRAY") { | |
| 75 @line = split /\t/, $vf->{_line}; | |
| 76 } else { | |
| 77 @line = @{$vf->{_line}}; | |
| 78 } | |
| 79 | |
| 80 # Use a particular allele if requested | |
| 81 @alleles = ($params->{allele}) if $params->{allele}; | |
| 82 | |
| 83 # Reverse comp if needed | |
| 84 if ($vf->{strand} < 0) { | |
| 85 @alleles = map { reverse_comp($_) } @alleles; | |
| 86 $ref = reverse_comp($ref); | |
| 87 $vf->{strand} = 1; | |
| 88 } | |
| 89 | |
| 90 # Return values | |
| 91 my $ret = { | |
| 92 chr => $vf->{chr}, | |
| 93 pos => $vf->{start}, | |
| 94 start => $vf->{start}, | |
| 95 end => $vf->{end}, | |
| 96 strand => $vf->{strand}, | |
| 97 alts => [@alleles], | |
| 98 line => [@line], | |
| 99 ref => $ref}; | |
| 100 | |
| 101 # If filter is true, only return $ret if filter eq "PASS" | |
| 102 return $params->{filter} && $line[6] ne "PASS" ? {} : $ret; | |
| 103 } | |
| 104 | |
| 105 sub parse_info { | |
| 106 my ($line, $valid_fields) = @_; | |
| 107 my %ret; | |
| 108 for my $dat (split /;/, $line->[7]) { | |
| 109 my ($field, $val) = split /=/, $dat; | |
| 110 if (grep { $field eq $_ } @$valid_fields) { | |
| 111 $ret{$field} = [split /,/, $val]; | |
| 112 } | |
| 113 } | |
| 114 return \%ret; | |
| 115 } | |
| 116 | |
| 117 sub new { | |
| 118 my $class = shift; | |
| 119 my $self = $class->SUPER::new(@_); | |
| 120 | |
| 121 $self->expand_left(0); | |
| 122 $self->expand_right(0); | |
| 123 $self->get_user_params(); | |
| 124 | |
| 125 # Get params and ensure a minumum number of parameters | |
| 126 my $params = $self->params_to_hash(); | |
| 127 die "ERROR: no value for 'file' specified" if !$params->{file}; | |
| 128 die "ERROR: no value for 'name' specified" if !$params->{name}; | |
| 129 | |
| 130 # Defaults | |
| 131 $params->{filter} = 1 if !$params->{filter}; | |
| 132 | |
| 133 # Add file via parameter hash | |
| 134 $self->add_file($params->{file}); | |
| 135 $self->{filter} = $params->{filter}; | |
| 136 $self->{name} = $params->{name}; | |
| 137 | |
| 138 if ($params->{fields}) { | |
| 139 # Mung filter to turn AC*%AN* into AC[^,]+|AN[^,]+ | |
| 140 $params->{fields} =~ s/%/|/g; | |
| 141 $params->{fields} =~ s/\*/[^,]*/g; | |
| 142 | |
| 143 # Get input file headers | |
| 144 my %fields; | |
| 145 my $info_regex = "^##INFO=<ID=($params->{fields}),.*Description=\"([^\"]+).*"; | |
| 146 open HEAD, "tabix -fh $params->{file} 1:1-1 2>&1 | "; | |
| 147 while(my $line = <HEAD>) { | |
| 148 next unless $line =~ $info_regex; | |
| 149 $fields{$1} = $2; | |
| 150 } | |
| 151 die "Could not find any valid info fields" if not %fields; | |
| 152 $self->{fields} = \%fields; | |
| 153 $self->{valid_fields} = [keys %fields]; | |
| 154 } | |
| 155 return $self; | |
| 156 } | |
| 157 | |
| 158 sub feature_types { | |
| 159 return ['Feature', 'Intergenic']; | |
| 160 } | |
| 161 | |
| 162 sub get_header_info { | |
| 163 my $self = shift; | |
| 164 my %ret; | |
| 165 | |
| 166 # Add fields if requested | |
| 167 if ($self->{fields}) { | |
| 168 while (my ($field, $desc) = each %{$self->{fields}}) { | |
| 169 $ret{"$self->{name}_$field"} = $desc; | |
| 170 } | |
| 171 } | |
| 172 | |
| 173 $ret{"$self->{name}_ID"} = "Original ID"; | |
| 174 $ret{"$self->{name}_POS"} = "Original POS"; | |
| 175 $ret{"$self->{name}_REF"} = "Original refrance allele"; | |
| 176 $ret{"$self->{name}_ALT"} = "Original alternatives as they appear in the VCF file"; | |
| 177 $ret{"$self->{name}_alt_index"} = "Index of matching alternative (zero-based)"; | |
| 178 return \%ret; | |
| 179 } | |
| 180 | |
| 181 sub parse_data { | |
| 182 my ($self, $line) = @_; | |
| 183 my ($vf) = @{parse_line({format => 'vcf', minimal => 1}, $line)}; | |
| 184 return simple_vf($vf, {filter => $self->{filter}}); | |
| 185 } | |
| 186 | |
| 187 sub run { | |
| 188 my ($self, $tva) = @_; | |
| 189 my $vf = simple_vf($tva->variation_feature, {allele => $tva->{variation_feature_seq}}); | |
| 190 | |
| 191 # Zero-indexing start for tabix and adding 1 to end for VEP indels | |
| 192 my @data = @{$self->get_data($vf->{chr}, ($vf->{start} - 1), ($vf->{end} + 1))}; | |
| 193 | |
| 194 my (%ret, $found_vf, @matches); | |
| 195 for my $dat (@data) { | |
| 196 next unless %$dat; | |
| 197 @matches = @{get_matched_variant_alleles($vf, $dat)}; | |
| 198 if (@matches) { | |
| 199 $found_vf = dclone $dat; | |
| 200 last; | |
| 201 } | |
| 202 } | |
| 203 | |
| 204 if (@matches) { | |
| 205 # Return the index of matching found alleles | |
| 206 my @found_alts = map { $_->{b_index} } @matches; | |
| 207 | |
| 208 # Parse info fields if needed | |
| 209 if ($self->{fields}) { | |
| 210 my %found_fields = %{parse_info($found_vf->{line}, $self->{valid_fields})}; | |
| 211 while (my ($field, $val) = each %found_fields) { | |
| 212 $ret{"$self->{name}_$field"} = [@$val]; | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 $ret{"$self->{name}_ID"} = $found_vf->{line}->[2]; | |
| 217 $ret{"$self->{name}_POS"} = $found_vf->{pos}; | |
| 218 $ret{"$self->{name}_POS"} = $found_vf->{pos}; | |
| 219 $ret{"$self->{name}_REF"} = $found_vf->{ref}; | |
| 220 $ret{"$self->{name}_ALT"} = $found_vf->{alts}; | |
| 221 $ret{"$self->{name}_alt_index"} = [@found_alts]; | |
| 222 } | |
| 223 return \%ret; | |
| 224 } | |
| 225 | |
| 226 1; |
