0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Variation::Utils::dbSNP
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 use Bio::EnsEMBL::Variation::Utils::dbSNP qw(decode_bitfield);
|
|
28
|
|
29 my $hashref = decode_bitfield('050160000a01050512110101');
|
|
30
|
|
31 print "variant is precious\n" if $hashref->{precious};
|
|
32
|
|
33 =head1 DESCRIPTION
|
|
34
|
|
35 This module provides a single subroutine decode_bitfield which decodes
|
|
36 a dbSNP bitfield from their VCF files into a hash reference with values
|
|
37 for each value specified in the field.
|
|
38
|
|
39 The encoding is taken from the following NCBI document:
|
|
40
|
|
41 ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf
|
|
42
|
|
43 =cut
|
|
44
|
|
45 package Bio::EnsEMBL::Variation::Utils::dbSNP;
|
|
46
|
|
47 use strict;
|
|
48 use warnings;
|
|
49
|
|
50 use base qw(Exporter);
|
|
51
|
|
52 our @EXPORT_OK = qw(decode_bitfield);
|
|
53
|
|
54 use constant ENCODING_VERSION => 5;
|
|
55
|
|
56 # an example string, with the fields and offsets
|
|
57
|
|
58 # F0 F1 F2 F3 F4 F5 F6 F7 F8 F9
|
|
59 # 05 0160 000a 01 05 05 12 11 01 01
|
|
60 # 0 2 4 6 8 10 12 14 16 18 20 22
|
|
61
|
|
62 # offsets into the string for each field
|
|
63
|
|
64 my %offsets = (
|
|
65 F0 => 0,
|
|
66 F1_1 => 2,
|
|
67 F1_2 => 4,
|
|
68 F2_1 => 6,
|
|
69 F2_2 => 8,
|
|
70 F3 => 10,
|
|
71 F4 => 12,
|
|
72 F5 => 14,
|
|
73 F6 => 16,
|
|
74 F7 => 18,
|
|
75 F8 => 20,
|
|
76 F9 => 22,
|
|
77 );
|
|
78
|
|
79 # a hash mapping the values encoded in each field to the bits used encode them
|
|
80 # if multiple bits are used (e.g. for version) then the values should be a
|
|
81 # listref of all the bits, this will be used to construct a bit mask to pick
|
|
82 # out the necessary information
|
|
83
|
|
84 my %fields= (
|
|
85
|
|
86 F0 => {
|
|
87 version => [3,2,1],
|
|
88 },
|
|
89
|
|
90 F1_1 => {
|
|
91 trace_archive => 8,
|
|
92 assembly_archive => 7,
|
|
93 entrez_geo => 6,
|
|
94 probe_db => 5,
|
|
95 entrez_gene => 4,
|
|
96 entrez_sts => 3,
|
|
97 has_structure => 2,
|
|
98 submitter_link_out => 1,
|
|
99 },
|
|
100
|
|
101 F1_2 => {
|
|
102 clinical => 7,
|
|
103 precious => 6,
|
|
104 provisional_tpa => 5,
|
|
105 pubmed => 4,
|
|
106 sra => 3,
|
|
107 organism_db_link => 2,
|
|
108 mgc_clone => 1,
|
|
109 },
|
|
110
|
|
111 F2_1 => {
|
|
112 utr_3 => 8,
|
|
113 utr_5 => 7,
|
|
114 acceptor_ss => 6,
|
|
115 donor_ss => 5,
|
|
116 intron => 4,
|
|
117 region_3 => 3,
|
|
118 region_5 => 2,
|
|
119 in_gene => 1,
|
|
120 },
|
|
121
|
|
122 F2_2 => {
|
|
123 stop_loss => 6,
|
|
124 frameshift => 5,
|
|
125 missense => 4,
|
|
126 stop_gain => 3,
|
|
127 has_ref => 2,
|
|
128 has_syn => 1,
|
|
129 },
|
|
130
|
|
131 F3 => {
|
|
132 has_other_snp => 5,
|
|
133 has_assembly_conflict => 4,
|
|
134 is_assembly_specific => 3,
|
|
135 weight => [1,2],
|
|
136 },
|
|
137
|
|
138 F4 => {
|
|
139 is_mutation => 4,
|
|
140 is_validated => 3,
|
|
141 maf_all_pops => 2,
|
|
142 maf_some_pops => 1,
|
|
143 },
|
|
144
|
|
145 F5 => {
|
|
146 marker_high_density => 3,
|
|
147 in_haplotype_tagging_set => 2,
|
|
148 genotypes_available => 1,
|
|
149 },
|
|
150
|
|
151 F6 => {
|
|
152 tgp_2010_production => 7,
|
|
153 tgp_validated => 6,
|
|
154 tgp_2010_pilot => 5,
|
|
155 tgp_2009_pilot => 4,
|
|
156 hm_phase_3_genotyped => 3,
|
|
157 hm_phase_2_genotyped => 2,
|
|
158 hm_phase_1_genotyped => 1,
|
|
159 },
|
|
160
|
|
161 F7 => {
|
|
162 has_mesh => 8,
|
|
163 clinical_assay => 7,
|
|
164 has_tf => 6,
|
|
165 lsdb => 5,
|
|
166 dbgap_significant => 4,
|
|
167 dbgap_lod_score => 3,
|
|
168 third_party_annot => 2,
|
|
169 omim => 1,
|
|
170 },
|
|
171
|
|
172 F8 => {
|
|
173 var_class => [4,3,2,1],
|
|
174 },
|
|
175
|
|
176 F9 => {
|
|
177 is_suspect => 7,
|
|
178 is_somatic => 6,
|
|
179 contig_allele_not_present => 5,
|
|
180 withdrawn => 4,
|
|
181 cluster_no_overlap => 3,
|
|
182 strain_specific => 2,
|
|
183 genotype_conflict => 1,
|
|
184 },
|
|
185 );
|
|
186
|
|
187 # a lookup table for the variation class
|
|
188
|
|
189 my %var_class = (
|
|
190 0b0001 => 'snp',
|
|
191 0b0010 => 'dips',
|
|
192 0b0011 => 'heterozygous',
|
|
193 0b0100 => 'microsatellite',
|
|
194 0b0101 => 'named',
|
|
195 0b0110 => 'no_variation',
|
|
196 0b0111 => 'mixed',
|
|
197 0b1000 => 'multi_base',
|
|
198 );
|
|
199
|
|
200 =head2 decode_bitfield
|
|
201
|
|
202 Arg[1] : string $bitfield
|
|
203 Example : my $hashref = decode_bitfield('050160000a01050512110101');
|
|
204 Description : Decodes a dbSNP bitfield string which encodes various attributes of a variation
|
|
205 Returntype : A hash reference with a key for each attribute set in the field, if the field
|
|
206 is boolean (e.g. precious, suspect etc.) then the value should be treated as a
|
|
207 true or false value, otherwise (e.g. var_class, weight) the value is the actual
|
|
208 value of the attribute
|
|
209
|
|
210 =cut
|
|
211
|
|
212 sub decode_bitfield {
|
|
213
|
|
214 my $bitfield = shift;
|
|
215
|
|
216 my %res;
|
|
217
|
|
218 for my $field (keys %fields) {
|
|
219
|
|
220 for my $value (keys %{ $fields{$field} }) {
|
|
221
|
|
222 my $bits = $fields{$field}->{$value};
|
|
223
|
|
224 # if bits isn't an array, put the single bit into an array
|
|
225 $bits = [$bits] unless ref $bits eq 'ARRAY';
|
|
226
|
|
227 # OR together all the bits to give us our mask
|
|
228 my $mask;
|
|
229
|
|
230 for my $bit (@$bits) {
|
|
231 $mask |= 2**($bit-1);
|
|
232 }
|
|
233
|
|
234 # extract the relevant characters from the bitfield string,
|
|
235 # convert them to an integer, and apply our mask
|
|
236 $res{$value} = hex(substr($bitfield, $offsets{$field}, 2)) & $mask;
|
|
237
|
|
238 # check that the version matches what we expect
|
|
239 if ($value eq 'version' && $res{$value} != ENCODING_VERSION) {
|
|
240 warn "Version field does not match the expected version (".$res{$value}." vs ".ENCODING_VERSION.")";
|
|
241 return undef;
|
|
242 }
|
|
243
|
|
244 # lookup the class description
|
|
245 $res{$value} = $var_class{$res{$value}} if $value eq 'var_class';
|
|
246
|
|
247 # get rid of anything set to 0
|
|
248 delete $res{$value} unless $res{$value};
|
|
249 }
|
|
250 }
|
|
251
|
|
252 return \%res;
|
|
253 }
|
|
254
|
|
255 1;
|
|
256
|