comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/dbSNP.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Variation::Utils::dbSNP
24
25 =head1 SYNOPSIS
26
27 use Bio::EnsEMBL::Variation::Utils::dbSNP qw(decode_bitfield);
28
29 my $hashref = decode_bitfield('050160000a01050512110101');
30
31 print "variant is precious\n" if $hashref->{precious};
32
33 =head1 DESCRIPTION
34
35 This module provides a single subroutine decode_bitfield which decodes
36 a dbSNP bitfield from their VCF files into a hash reference with values
37 for each value specified in the field.
38
39 The encoding is taken from the following NCBI document:
40
41 ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf
42
43 =cut
44
45 package Bio::EnsEMBL::Variation::Utils::dbSNP;
46
47 use strict;
48 use warnings;
49
50 use base qw(Exporter);
51
52 our @EXPORT_OK = qw(decode_bitfield);
53
54 use constant ENCODING_VERSION => 5;
55
56 # an example string, with the fields and offsets
57
58 # F0 F1 F2 F3 F4 F5 F6 F7 F8 F9
59 # 05 0160 000a 01 05 05 12 11 01 01
60 # 0 2 4 6 8 10 12 14 16 18 20 22
61
62 # offsets into the string for each field
63
64 my %offsets = (
65 F0 => 0,
66 F1_1 => 2,
67 F1_2 => 4,
68 F2_1 => 6,
69 F2_2 => 8,
70 F3 => 10,
71 F4 => 12,
72 F5 => 14,
73 F6 => 16,
74 F7 => 18,
75 F8 => 20,
76 F9 => 22,
77 );
78
79 # a hash mapping the values encoded in each field to the bits used encode them
80 # if multiple bits are used (e.g. for version) then the values should be a
81 # listref of all the bits, this will be used to construct a bit mask to pick
82 # out the necessary information
83
84 my %fields= (
85
86 F0 => {
87 version => [3,2,1],
88 },
89
90 F1_1 => {
91 trace_archive => 8,
92 assembly_archive => 7,
93 entrez_geo => 6,
94 probe_db => 5,
95 entrez_gene => 4,
96 entrez_sts => 3,
97 has_structure => 2,
98 submitter_link_out => 1,
99 },
100
101 F1_2 => {
102 clinical => 7,
103 precious => 6,
104 provisional_tpa => 5,
105 pubmed => 4,
106 sra => 3,
107 organism_db_link => 2,
108 mgc_clone => 1,
109 },
110
111 F2_1 => {
112 utr_3 => 8,
113 utr_5 => 7,
114 acceptor_ss => 6,
115 donor_ss => 5,
116 intron => 4,
117 region_3 => 3,
118 region_5 => 2,
119 in_gene => 1,
120 },
121
122 F2_2 => {
123 stop_loss => 6,
124 frameshift => 5,
125 missense => 4,
126 stop_gain => 3,
127 has_ref => 2,
128 has_syn => 1,
129 },
130
131 F3 => {
132 has_other_snp => 5,
133 has_assembly_conflict => 4,
134 is_assembly_specific => 3,
135 weight => [1,2],
136 },
137
138 F4 => {
139 is_mutation => 4,
140 is_validated => 3,
141 maf_all_pops => 2,
142 maf_some_pops => 1,
143 },
144
145 F5 => {
146 marker_high_density => 3,
147 in_haplotype_tagging_set => 2,
148 genotypes_available => 1,
149 },
150
151 F6 => {
152 tgp_2010_production => 7,
153 tgp_validated => 6,
154 tgp_2010_pilot => 5,
155 tgp_2009_pilot => 4,
156 hm_phase_3_genotyped => 3,
157 hm_phase_2_genotyped => 2,
158 hm_phase_1_genotyped => 1,
159 },
160
161 F7 => {
162 has_mesh => 8,
163 clinical_assay => 7,
164 has_tf => 6,
165 lsdb => 5,
166 dbgap_significant => 4,
167 dbgap_lod_score => 3,
168 third_party_annot => 2,
169 omim => 1,
170 },
171
172 F8 => {
173 var_class => [4,3,2,1],
174 },
175
176 F9 => {
177 is_suspect => 7,
178 is_somatic => 6,
179 contig_allele_not_present => 5,
180 withdrawn => 4,
181 cluster_no_overlap => 3,
182 strain_specific => 2,
183 genotype_conflict => 1,
184 },
185 );
186
187 # a lookup table for the variation class
188
189 my %var_class = (
190 0b0001 => 'snp',
191 0b0010 => 'dips',
192 0b0011 => 'heterozygous',
193 0b0100 => 'microsatellite',
194 0b0101 => 'named',
195 0b0110 => 'no_variation',
196 0b0111 => 'mixed',
197 0b1000 => 'multi_base',
198 );
199
200 =head2 decode_bitfield
201
202 Arg[1] : string $bitfield
203 Example : my $hashref = decode_bitfield('050160000a01050512110101');
204 Description : Decodes a dbSNP bitfield string which encodes various attributes of a variation
205 Returntype : A hash reference with a key for each attribute set in the field, if the field
206 is boolean (e.g. precious, suspect etc.) then the value should be treated as a
207 true or false value, otherwise (e.g. var_class, weight) the value is the actual
208 value of the attribute
209
210 =cut
211
212 sub decode_bitfield {
213
214 my $bitfield = shift;
215
216 my %res;
217
218 for my $field (keys %fields) {
219
220 for my $value (keys %{ $fields{$field} }) {
221
222 my $bits = $fields{$field}->{$value};
223
224 # if bits isn't an array, put the single bit into an array
225 $bits = [$bits] unless ref $bits eq 'ARRAY';
226
227 # OR together all the bits to give us our mask
228 my $mask;
229
230 for my $bit (@$bits) {
231 $mask |= 2**($bit-1);
232 }
233
234 # extract the relevant characters from the bitfield string,
235 # convert them to an integer, and apply our mask
236 $res{$value} = hex(substr($bitfield, $offsets{$field}, 2)) & $mask;
237
238 # check that the version matches what we expect
239 if ($value eq 'version' && $res{$value} != ENCODING_VERSION) {
240 warn "Version field does not match the expected version (".$res{$value}." vs ".ENCODING_VERSION.")";
241 return undef;
242 }
243
244 # lookup the class description
245 $res{$value} = $var_class{$res{$value}} if $value eq 'var_class';
246
247 # get rid of anything set to 0
248 delete $res{$value} unless $res{$value};
249 }
250 }
251
252 return \%res;
253 }
254
255 1;
256