Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/dbSNP.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Variation::Utils::dbSNP | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 use Bio::EnsEMBL::Variation::Utils::dbSNP qw(decode_bitfield); | |
28 | |
29 my $hashref = decode_bitfield('050160000a01050512110101'); | |
30 | |
31 print "variant is precious\n" if $hashref->{precious}; | |
32 | |
33 =head1 DESCRIPTION | |
34 | |
35 This module provides a single subroutine decode_bitfield which decodes | |
36 a dbSNP bitfield from their VCF files into a hash reference with values | |
37 for each value specified in the field. | |
38 | |
39 The encoding is taken from the following NCBI document: | |
40 | |
41 ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf | |
42 | |
43 =cut | |
44 | |
45 package Bio::EnsEMBL::Variation::Utils::dbSNP; | |
46 | |
47 use strict; | |
48 use warnings; | |
49 | |
50 use base qw(Exporter); | |
51 | |
52 our @EXPORT_OK = qw(decode_bitfield); | |
53 | |
54 use constant ENCODING_VERSION => 5; | |
55 | |
56 # an example string, with the fields and offsets | |
57 | |
58 # F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 | |
59 # 05 0160 000a 01 05 05 12 11 01 01 | |
60 # 0 2 4 6 8 10 12 14 16 18 20 22 | |
61 | |
62 # offsets into the string for each field | |
63 | |
64 my %offsets = ( | |
65 F0 => 0, | |
66 F1_1 => 2, | |
67 F1_2 => 4, | |
68 F2_1 => 6, | |
69 F2_2 => 8, | |
70 F3 => 10, | |
71 F4 => 12, | |
72 F5 => 14, | |
73 F6 => 16, | |
74 F7 => 18, | |
75 F8 => 20, | |
76 F9 => 22, | |
77 ); | |
78 | |
79 # a hash mapping the values encoded in each field to the bits used encode them | |
80 # if multiple bits are used (e.g. for version) then the values should be a | |
81 # listref of all the bits, this will be used to construct a bit mask to pick | |
82 # out the necessary information | |
83 | |
84 my %fields= ( | |
85 | |
86 F0 => { | |
87 version => [3,2,1], | |
88 }, | |
89 | |
90 F1_1 => { | |
91 trace_archive => 8, | |
92 assembly_archive => 7, | |
93 entrez_geo => 6, | |
94 probe_db => 5, | |
95 entrez_gene => 4, | |
96 entrez_sts => 3, | |
97 has_structure => 2, | |
98 submitter_link_out => 1, | |
99 }, | |
100 | |
101 F1_2 => { | |
102 clinical => 7, | |
103 precious => 6, | |
104 provisional_tpa => 5, | |
105 pubmed => 4, | |
106 sra => 3, | |
107 organism_db_link => 2, | |
108 mgc_clone => 1, | |
109 }, | |
110 | |
111 F2_1 => { | |
112 utr_3 => 8, | |
113 utr_5 => 7, | |
114 acceptor_ss => 6, | |
115 donor_ss => 5, | |
116 intron => 4, | |
117 region_3 => 3, | |
118 region_5 => 2, | |
119 in_gene => 1, | |
120 }, | |
121 | |
122 F2_2 => { | |
123 stop_loss => 6, | |
124 frameshift => 5, | |
125 missense => 4, | |
126 stop_gain => 3, | |
127 has_ref => 2, | |
128 has_syn => 1, | |
129 }, | |
130 | |
131 F3 => { | |
132 has_other_snp => 5, | |
133 has_assembly_conflict => 4, | |
134 is_assembly_specific => 3, | |
135 weight => [1,2], | |
136 }, | |
137 | |
138 F4 => { | |
139 is_mutation => 4, | |
140 is_validated => 3, | |
141 maf_all_pops => 2, | |
142 maf_some_pops => 1, | |
143 }, | |
144 | |
145 F5 => { | |
146 marker_high_density => 3, | |
147 in_haplotype_tagging_set => 2, | |
148 genotypes_available => 1, | |
149 }, | |
150 | |
151 F6 => { | |
152 tgp_2010_production => 7, | |
153 tgp_validated => 6, | |
154 tgp_2010_pilot => 5, | |
155 tgp_2009_pilot => 4, | |
156 hm_phase_3_genotyped => 3, | |
157 hm_phase_2_genotyped => 2, | |
158 hm_phase_1_genotyped => 1, | |
159 }, | |
160 | |
161 F7 => { | |
162 has_mesh => 8, | |
163 clinical_assay => 7, | |
164 has_tf => 6, | |
165 lsdb => 5, | |
166 dbgap_significant => 4, | |
167 dbgap_lod_score => 3, | |
168 third_party_annot => 2, | |
169 omim => 1, | |
170 }, | |
171 | |
172 F8 => { | |
173 var_class => [4,3,2,1], | |
174 }, | |
175 | |
176 F9 => { | |
177 is_suspect => 7, | |
178 is_somatic => 6, | |
179 contig_allele_not_present => 5, | |
180 withdrawn => 4, | |
181 cluster_no_overlap => 3, | |
182 strain_specific => 2, | |
183 genotype_conflict => 1, | |
184 }, | |
185 ); | |
186 | |
187 # a lookup table for the variation class | |
188 | |
189 my %var_class = ( | |
190 0b0001 => 'snp', | |
191 0b0010 => 'dips', | |
192 0b0011 => 'heterozygous', | |
193 0b0100 => 'microsatellite', | |
194 0b0101 => 'named', | |
195 0b0110 => 'no_variation', | |
196 0b0111 => 'mixed', | |
197 0b1000 => 'multi_base', | |
198 ); | |
199 | |
200 =head2 decode_bitfield | |
201 | |
202 Arg[1] : string $bitfield | |
203 Example : my $hashref = decode_bitfield('050160000a01050512110101'); | |
204 Description : Decodes a dbSNP bitfield string which encodes various attributes of a variation | |
205 Returntype : A hash reference with a key for each attribute set in the field, if the field | |
206 is boolean (e.g. precious, suspect etc.) then the value should be treated as a | |
207 true or false value, otherwise (e.g. var_class, weight) the value is the actual | |
208 value of the attribute | |
209 | |
210 =cut | |
211 | |
212 sub decode_bitfield { | |
213 | |
214 my $bitfield = shift; | |
215 | |
216 my %res; | |
217 | |
218 for my $field (keys %fields) { | |
219 | |
220 for my $value (keys %{ $fields{$field} }) { | |
221 | |
222 my $bits = $fields{$field}->{$value}; | |
223 | |
224 # if bits isn't an array, put the single bit into an array | |
225 $bits = [$bits] unless ref $bits eq 'ARRAY'; | |
226 | |
227 # OR together all the bits to give us our mask | |
228 my $mask; | |
229 | |
230 for my $bit (@$bits) { | |
231 $mask |= 2**($bit-1); | |
232 } | |
233 | |
234 # extract the relevant characters from the bitfield string, | |
235 # convert them to an integer, and apply our mask | |
236 $res{$value} = hex(substr($bitfield, $offsets{$field}, 2)) & $mask; | |
237 | |
238 # check that the version matches what we expect | |
239 if ($value eq 'version' && $res{$value} != ENCODING_VERSION) { | |
240 warn "Version field does not match the expected version (".$res{$value}." vs ".ENCODING_VERSION.")"; | |
241 return undef; | |
242 } | |
243 | |
244 # lookup the class description | |
245 $res{$value} = $var_class{$res{$value}} if $value eq 'var_class'; | |
246 | |
247 # get rid of anything set to 0 | |
248 delete $res{$value} unless $res{$value}; | |
249 } | |
250 } | |
251 | |
252 return \%res; | |
253 } | |
254 | |
255 1; | |
256 |