Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/dbSNP.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/dbSNP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,256 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::dbSNP + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::Utils::dbSNP qw(decode_bitfield); + + my $hashref = decode_bitfield('050160000a01050512110101'); + + print "variant is precious\n" if $hashref->{precious}; + +=head1 DESCRIPTION + +This module provides a single subroutine decode_bitfield which decodes +a dbSNP bitfield from their VCF files into a hash reference with values +for each value specified in the field. + +The encoding is taken from the following NCBI document: + +ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf + +=cut + +package Bio::EnsEMBL::Variation::Utils::dbSNP; + +use strict; +use warnings; + +use base qw(Exporter); + +our @EXPORT_OK = qw(decode_bitfield); + +use constant ENCODING_VERSION => 5; + +# an example string, with the fields and offsets + +# F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 +# 05 0160 000a 01 05 05 12 11 01 01 +# 0 2 4 6 8 10 12 14 16 18 20 22 + +# offsets into the string for each field + +my %offsets = ( + F0 => 0, + F1_1 => 2, + F1_2 => 4, + F2_1 => 6, + F2_2 => 8, + F3 => 10, + F4 => 12, + F5 => 14, + F6 => 16, + F7 => 18, + F8 => 20, + F9 => 22, +); + +# a hash mapping the values encoded in each field to the bits used encode them +# if multiple bits are used (e.g. for version) then the values should be a +# listref of all the bits, this will be used to construct a bit mask to pick +# out the necessary information + +my %fields= ( + + F0 => { + version => [3,2,1], + }, + + F1_1 => { + trace_archive => 8, + assembly_archive => 7, + entrez_geo => 6, + probe_db => 5, + entrez_gene => 4, + entrez_sts => 3, + has_structure => 2, + submitter_link_out => 1, + }, + + F1_2 => { + clinical => 7, + precious => 6, + provisional_tpa => 5, + pubmed => 4, + sra => 3, + organism_db_link => 2, + mgc_clone => 1, + }, + + F2_1 => { + utr_3 => 8, + utr_5 => 7, + acceptor_ss => 6, + donor_ss => 5, + intron => 4, + region_3 => 3, + region_5 => 2, + in_gene => 1, + }, + + F2_2 => { + stop_loss => 6, + frameshift => 5, + missense => 4, + stop_gain => 3, + has_ref => 2, + has_syn => 1, + }, + + F3 => { + has_other_snp => 5, + has_assembly_conflict => 4, + is_assembly_specific => 3, + weight => [1,2], + }, + + F4 => { + is_mutation => 4, + is_validated => 3, + maf_all_pops => 2, + maf_some_pops => 1, + }, + + F5 => { + marker_high_density => 3, + in_haplotype_tagging_set => 2, + genotypes_available => 1, + }, + + F6 => { + tgp_2010_production => 7, + tgp_validated => 6, + tgp_2010_pilot => 5, + tgp_2009_pilot => 4, + hm_phase_3_genotyped => 3, + hm_phase_2_genotyped => 2, + hm_phase_1_genotyped => 1, + }, + + F7 => { + has_mesh => 8, + clinical_assay => 7, + has_tf => 6, + lsdb => 5, + dbgap_significant => 4, + dbgap_lod_score => 3, + third_party_annot => 2, + omim => 1, + }, + + F8 => { + var_class => [4,3,2,1], + }, + + F9 => { + is_suspect => 7, + is_somatic => 6, + contig_allele_not_present => 5, + withdrawn => 4, + cluster_no_overlap => 3, + strain_specific => 2, + genotype_conflict => 1, + }, +); + +# a lookup table for the variation class + +my %var_class = ( + 0b0001 => 'snp', + 0b0010 => 'dips', + 0b0011 => 'heterozygous', + 0b0100 => 'microsatellite', + 0b0101 => 'named', + 0b0110 => 'no_variation', + 0b0111 => 'mixed', + 0b1000 => 'multi_base', +); + +=head2 decode_bitfield + + Arg[1] : string $bitfield + Example : my $hashref = decode_bitfield('050160000a01050512110101'); + Description : Decodes a dbSNP bitfield string which encodes various attributes of a variation + Returntype : A hash reference with a key for each attribute set in the field, if the field + is boolean (e.g. precious, suspect etc.) then the value should be treated as a + true or false value, otherwise (e.g. var_class, weight) the value is the actual + value of the attribute + +=cut + +sub decode_bitfield { + + my $bitfield = shift; + + my %res; + + for my $field (keys %fields) { + + for my $value (keys %{ $fields{$field} }) { + + my $bits = $fields{$field}->{$value}; + + # if bits isn't an array, put the single bit into an array + $bits = [$bits] unless ref $bits eq 'ARRAY'; + + # OR together all the bits to give us our mask + my $mask; + + for my $bit (@$bits) { + $mask |= 2**($bit-1); + } + + # extract the relevant characters from the bitfield string, + # convert them to an integer, and apply our mask + $res{$value} = hex(substr($bitfield, $offsets{$field}, 2)) & $mask; + + # check that the version matches what we expect + if ($value eq 'version' && $res{$value} != ENCODING_VERSION) { + warn "Version field does not match the expected version (".$res{$value}." vs ".ENCODING_VERSION.")"; + return undef; + } + + # lookup the class description + $res{$value} = $var_class{$res{$value}} if $value eq 'var_class'; + + # get rid of anything set to 0 + delete $res{$value} unless $res{$value}; + } + } + + return \%res; +} + +1; +