Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Sequence.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Sequence.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,882 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +# EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence +# +# + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::Sequence - Utility functions for sequences + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code variation_class); + + my $alleles = 'A|C'; + + print "my alleles = $alleles\n"; + + my $ambig_code = ambiguity_code($alleles); + + print "my ambiguity code is $ambig_code\n"; + + print "my SNP class is = variation_class($alleles)"; + + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::Variation::Utils::Sequence; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::Constants qw(:SO_class_terms); +use Bio::EnsEMBL::Utils::Scalar qw(wrap_array); +use Exporter; + +use vars qw(@ISA @EXPORT_OK); + +@ISA = qw(Exporter); + +@EXPORT_OK = qw( + &ambiguity_code + &variation_class + &unambiguity_code + &sequence_with_ambiguity + &hgvs_variant_notation + &format_hgvs_string + &SO_variation_class + &align_seqs + &strain_ambiguity_code + &get_all_validation_states + &get_validation_code + &add_validation_state +); + +# List of validation states. Order must match that of set in database +our @VALIDATION_STATES = qw(cluster freq submitter doublehit hapmap 1000Genome failed precious); + +=head2 ambiguity_code + + Arg[1] : string $alleles + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code) + my $alleles = 'A|C'; + my $ambig_code = ambiguity_code($alleles); + print "the ambiguity code for $alleles is: ",$ambig_code; + Description : returns the ambiguity code for a SNP allele + ReturnType : String + The ambiguity code + Exceptions : None + Caller : Variation, VariationFeature + +=cut + +sub ambiguity_code { + my $alleles = shift; + my %duplicates; #hash containing all alleles to remove duplicates + + foreach my $a(split /[\|\/\\]/, $alleles) { + # convert Ns + my @a = ($a eq 'N' ? qw(A C G T) : ($a)); + map {$duplicates{$_}++} @a; + } + $alleles = uc( join '', sort keys %duplicates ); + #my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y +#GT K C C A A T T G G - - -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-' + my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y GT K C C A A T T G G - -); + return $ambig{$alleles}; +} + +=head2 strain_ambiguity_code + + Arg[1] : string $alleles (separated by "/", "\" or "|") + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(strain_ambiguity_code) + my $alleles = 'A|C'; + my $ambig_code = strain_ambiguity_code($alleles); + print "the ambiguity code for $alleles is: ",$ambig_code; + Description : returns the ambiguity code for a strain genotype + ReturnType : String + Exceptions : None + Caller : AlleleFeatureAdaptor + +=cut + +sub strain_ambiguity_code { + my $alleles = shift; + + # return normal ambiguity code for a SNP + return ambiguity_code($alleles) if($alleles =~ /^[ACGT][\|\/\\][ACGT]$/); + + # get alleles + my ($a1, $a2) = split /[\|\/\\]/, $alleles; + + # pad + if(length($a1) > length($a2)) { + $a2 .= '-' x (length($a1) - length($a2)); + } + else { + $a1 .= '-' x (length($a2) - length($a1)); + } + + # build ambiguity code base by base + my $ambig = ''; + + for my $i(0..(length($a1) - 1)) { + my $b1 = substr($a1, $i, 1); + my $b2 = substr($a2, $i, 1); + + # -/- = - + if($b1 eq '-' && $b2 eq '-') { + $ambig .= '-'; + } + + # G/- = g + elsif($b1 eq '-') { + $ambig .= lc($b2); + } + + # -/G = g + elsif($b2 eq '-') { + $ambig .= lc($b1); + } + + # A/G = R + else { + $ambig .= ambiguity_code($b1.'|'.$b2); + } + } + + return $ambig; +} + +=head2 unambiguity_code + + Arg[1] : string $alleles + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code) + my $ambiguity_code = 'M'; + my $alleles = unambiguity_code($ambiguity_code); + print "the alleles for ambiguity code $ambiguity_code is: ",$alleles; + Description : returns the alleles for an ambiguity code + ReturnType : String + The Alleles, alphabetically sorted and in capital + Exceptions : None + Caller : Variation, VariationFeature + +=cut + +sub unambiguity_code { + my $ambiguity_code = shift; + + #my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K +#GT C CC A AA T TT G GG - -- -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-' + my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K GT C CC A AA T TT G GG - --); + return $unambig{$ambiguity_code}; +} + + +=head2 variation_class + + Arg[1] : string $alleles + Arg[2] : boolean $is_somatic - flag that this variation is somatic + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (variation_class) + my $alleles = 'A|C'; + my $variation_class = variation_class($alleles); + print "the variation class for the alleles $alleles is: ",$variation_class; + Description : return the class of the alleles according to dbSNP classification(SNP,indel,mixed,substitution...) + ReturnType : String. The class of the alleles + Exceptions : none + Caller : Variation, VariationFeature + +=cut + +sub variation_class{ + + my ($alleles, $is_somatic) = @_; + + my $class; + + if ($alleles =~ /^[ACGTN]([\|\\\/][ACGTN])+$/i) { + $class = 'snp'; + } + elsif (($alleles eq 'cnv') || ($alleles eq 'CNV')) { + $class = 'cnv'; + } + elsif ($alleles =~ /CNV\_PROBE/i) { + $class = 'cnv probe'; + } + elsif ($alleles =~ /HGMD\_MUTATION/i) { + $class = 'hgmd_mutation'; + } + else { + my @alleles = split /[\|\/\\]/, $alleles; + + if (@alleles == 1) { + #(HETEROZYGOUS) 1 allele + $class = 'het'; + } + elsif(@alleles == 2) { + if ((($alleles[0] =~ tr/ACTGN//)== length($alleles[0]) && ($alleles[1] =~ tr/-//) == 1) || + (($alleles[0] =~ tr/-//) == 1 && ($alleles[1] =~ tr/ACTGN//) == length($alleles[1])) ){ + #A/- 2 alleles + $class = 'in-del' + } + elsif (($alleles[0] =~ /LARGE|INS|DEL/) || ($alleles[1] =~ /LARGE|INS|DEL/)){ + #(LARGEDELETION) 2 alleles + $class = 'named' + } + elsif (($alleles[0] =~ tr/ACTG//) > 1 || ($alleles[1] =~ tr/ACTG//) > 1){ + #AA/GC 2 alleles + $class = 'substitution' + } + else { + warning("not possible to determine class for @alleles"); + $class = ''; + } + } + elsif (@alleles > 2) { + + if ($alleles[0] =~ /\d+/) { + #(CA)14/15/16/17 > 2 alleles, all of them contain the number of repetitions of the allele + $class = 'microsat' + } + + elsif ((grep {/-/} @alleles) > 0) { + #-/A/T/TTA > 2 alleles + $class = 'mixed' + } + else { + # warning("not possible to determine class of alleles " . @alleles); + $class = ''; + } + } + else{ + warning("no alleles available "); + $class = ''; + } + } + + if ($is_somatic) { + if ($class eq '') { + # for undetermined classes just call it somatic + $class = 'somatic'; + } + else { + # somatic mutations aren't polymorphisms, so change SNPs to SNVs + $class = 'snv' if $class eq 'snp'; + + # and prefix the class with 'somatic' + $class = 'somatic_'.$class; + } + } + + return $class; +} + +=head2 SO_variation_class + + Arg[1] : string $alleles - a slash ()'/') separated list of alleles, the first allele is + assumed to be the reference unless the $ref_correct argument is false + Arg[2] : boolean $ref_correct - flags that the first allele is not known to be the + reference sequence (so we can't call insertions or deletions and have to + resort to 'indel') + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (SO_variation_class) + my $alleles = 'A/C'; + my $SO_term = SO_variation_class($alleles); + print "the SO term for the alleles $alleles is: ",$SO_term; + Description : return the SO term for the class of the alleles + ReturnType : String. The SO term for the class of the alleles + Exceptions : none + Caller : Variation, VariationFeature + +=cut + +sub SO_variation_class { + + my $alleles = shift; + my $ref_correct = shift; + + $ref_correct = 1 unless defined $ref_correct; + + my $allele_class = '[A-Z]'; + + # default to sequence_alteration + my $class = SO_TERM_SEQUENCE_ALTERATION; + + if ($alleles =~ /^$allele_class(\/$allele_class)+$/) { + # A/T, A/T/G + $class = SO_TERM_SNV; + } + elsif ($alleles =~ /^$allele_class+(\/$allele_class+)+$/) { + # AA/TTT + $class = SO_TERM_SUBSTITUTION; + } + elsif ($alleles =~ /\)\d+/) { + # (CAG)8/(CAG)9 + $class = SO_TERM_TANDEM_REPEAT; + } + else { + my @alleles = split /\//, $alleles; + + if (@alleles > 1) { + + my $ref = shift @alleles; + + if ($ref eq '-') { + + if (@alleles == 1 && $alleles[0] =~ /DEL/) { + # -/(LARGEDELETION) (rather oddly!) + $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; + } + + unless (grep { $_ !~ /^$allele_class+$|INS/ } @alleles) { + # -/ATT, -/(LARGEINSERTION) + $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL; + } + + # else must be mixed insertion and deletion, so just called sequence_alteration + } + elsif ($ref =~ /^$allele_class+$/) { + unless (grep { $_ !~ /-|DEL/ } @alleles) { + # A/-, A/(LARGEDELETION) + $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; + } + } + elsif ($ref =~ /DEL/) { + unless (grep { $_ !~ /-/ } @alleles) { + # (LARGEDELETION)/-, (2345 BP DELETION)/- + $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; + } + } + } + elsif (@alleles == 1) { + if ($alleles[0] =~ /INS/) { + # (LARGEINSERTION) + $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL; + } + elsif($alleles[0] =~ /DEL/) { + # (308 BP DELETION) + $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; + } + } + } + + return $class; +} + +=head2 sequence_with_ambiguity + + Arg[1] : Bio::EnsEMBL::DBSQL::DBAdaptor $dbCore + Arg[2] : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor $dbVar + Arg[3] : string $chr + Arg[4] : int $start + Arg[5] : int $end + Arg[6] : int $strand + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (sequence_with_ambiguity) + my $slice = sequence_with_ambiguity($dbCore,$dbVar,1,100,200); + print "the sequence with ambiguity code for your region is: ",$slice->seq() + Description : given a region, returns a Bio::EnsEMBL::Slice object with + the sequence set with ambiguity codes + ReturnType : Bio::EnsEMBL::Slice object + Exceptions : none + Caller : general + +=cut + +sub sequence_with_ambiguity{ + my ($dbCore,$dbVar,$chr,$start,$end,$strand) = @_; + + my $slice; + if (ref($dbCore) ne 'Bio::EnsEMBL::DBSQL::DBAdaptor'){ + warning('You need to provide a Bio::EnsEMBL::DBSQL::DBAdaptor as a first argument'); + return $slice; + } + if (ref($dbVar) ne 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor'){ + warning('You need to provide a Bio::EnsEMBL::Variation::DBSQL::DBAdaptor object as second argument'); + return $slice; + } + my $slice_adaptor = $dbCore->get_SliceAdaptor(); + my $vf_adaptor = $dbVar->get_VariationFeatureAdaptor; + $slice = $slice_adaptor->fetch_by_region('chromosome',$chr,$start,$end,$strand); #get the slice + my $seq = $slice->seq; + foreach my $vf (@{$vf_adaptor->fetch_all_by_Slice($slice)}){ + substr($seq,$vf->start-1,1,$vf->ambig_code); + } + $slice->{'seq'} = $seq; + return $slice; +} + +=head2 hgvs_variant_notation + + Arg[1] : string $alt_allele + Arg[2] : string $ref_sequence + Arg[3] : int $ref_start + Arg[4] : int $ref_end + Arg[5] : int $display_start (optional) + Arg[6] : int $display_end (optional) + + Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (hgvs_variant_notation) + my $alt_allele = 'A'; + my $ref_sequence = 'CCGTGATGTGC'; + my $ref_start = 4; + my $ref_end = 4; + my $ref_name = 'test_seq'; + my $ref_type = 'g'; + my $notation = hgvs_variant_notation($alt_allele,$ref_sequence,$ref_start,$ref_end); + print "HGVS notation of your variant: $ref_name\:$ref_type\." . $notation->{'hgvs'}; + + Description : Given an allele, a reference sequence and position of variant, returns a reference to a hash containing metadata and a + string with HGVS notation of the variant. Returns undef if reference and variant alleles are identical. + The optional display_start and display_end, if specified, will be used in the notation instead of the ref_start and ref_end. + This can be useful, e.g. if we want coordinates relative to chromosome but don't want to pass the entire chromosome sequence + into the subroutine. + The data fields in the returned hash are: + 'start' -> Displayed start position of variant + 'end' -> Displayed end position of variant + 'ref' -> Reference allele + 'alt' -> Alternative allele + 'type' -> The variant class, e.g. ins, inv, >, delins + 'hgvs' -> A string with HGVS notation + ReturnType : reference to a hash + Exceptions : If the length of the interval to be displayed is different from the length of the reference allele + Caller : general + +=cut +sub hgvs_variant_notation { + my $alt_allele = shift; + my $ref_sequence = shift; + my $ref_start = shift; + my $ref_end = shift; + my $display_start = shift; + my $display_end = shift; + + # If display_start and display_end were not specified, use ref_start and ref_end + $display_start ||= $ref_start; + $display_end ||= $ref_end; + + #ÊThrow an exception if the lengths of the display interval and reference interval are different + throw("The coordinate interval for display is of different length than for the reference allele") if (($display_end - $display_start) != ($ref_end - $ref_start)); + + # Length of the reference allele. Negative lengths make no sense + my $ref_length = ($ref_end - $ref_start + 1); + if ($ref_length < 0) { + $ref_length = 0; + } + + # Remove any gap characters in the alt allele + $alt_allele =~ s/\-//g; + + # Length of alternative allele + my $alt_length = length($alt_allele); + + # Get the reference allele + my $ref_allele = substr($ref_sequence,($ref_start-1),$ref_length); + + # Check that the alleles are different, otherwise return undef + return undef unless ($ref_allele ne $alt_allele); + + # Store the notation in a hash that will be returned + my %notation; + $notation{'start'} = $display_start; + $notation{'end'} = $display_end; + $notation{'ref'} = $ref_allele; + $notation{'alt'} = $alt_allele; + + # The simplest case is a deletion + if (!$alt_length) { + $notation{'type'} = 'del'; + + # Return the notation + return \%notation; + } + + # Another case is if the allele lengths are equal + if ($ref_length == $alt_length) { + + # If length is 1 it's a single substitution + if ($ref_length == 1) { + $notation{'type'} = '>'; + return \%notation; + } + + # Check if it's an inversion + my $rev_ref = $ref_allele; + reverse_comp(\$rev_ref); + if ($alt_allele eq $rev_ref) { + $notation{'type'} = 'inv'; + return \%notation; + } + + $notation{'type'} = 'delins'; + + return \%notation; + } + + # If this is an insertion, we should check if the preceeding reference nucleotides match the insertion. In that case it should be annotated as a multiplication. + if (!$ref_length) { + + # Get the same number of nucleotides preceding the insertion as the length of the insertion + my $prev_str = substr($ref_sequence,($ref_end-$alt_length),$alt_length); + + # If they match, this is a duplication + if ($prev_str eq $alt_allele) { + + $notation{'start'} = ($display_end - $alt_length + 1); + $notation{'type'} = 'dup'; + $notation{'ref'} = $prev_str; + # Return the notation + return \%notation; + } + + # If they didn't match it's a plain insertion + $notation{'start'} = $display_end; + $notation{'end'} = $display_start; + $notation{'type'} = 'ins'; + + return \%notation; + } + + # Otherwise, the reference and allele are of different lengths. By default, this is a delins but + # we need to check if the alt allele is a multiplication of the reference + # Check if the length of the alt allele is a multiple of the reference allele + if ($alt_length%$ref_length == 0) { + my $multiple = ($alt_length / $ref_length); + if ($alt_allele eq ($ref_allele x $multiple)) { + if ($multiple == 2) { + $notation{'type'} = 'dup'; + } + else { + $notation{'type'} = '[' . $multiple . ']'; + } + return \%notation; + } + } + + # Else, it's gotta be a delins + $notation{'type'} = 'delins'; + + return \%notation; +} + + +=head2 format_hgvs_string + + Arg[1] : string reference sequence name + Arg[2] : string strand + Arg[3] : hash of hgvs information + Example : + Description : Creates HGVS formatted string from input hash + ReturnType : string in HGVS format + Exceptions : + Caller : + +=cut + +sub format_hgvs_string{ + ##### generic formatting routine for genomic and coding HGVS names + + my $hgvs_notation = shift; + + ### all start with refseq name & numbering type + $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . "."; + + my $coordinates; + #### if single base event, list position only once + if($hgvs_notation->{'start'} eq $hgvs_notation->{'end'}){ + $coordinates = $hgvs_notation->{'start'}; + } + else{ + $coordinates = $hgvs_notation->{'start'} . "_" . $hgvs_notation->{'end'}; + } + + ##### format rest of string according to type + + if($hgvs_notation->{'type'} eq 'del' || $hgvs_notation->{'type'} eq 'inv' || $hgvs_notation->{'type'} eq 'dup'){ + ### inversion of reference bases => list ref not alt + ### deletion of reference bases => list ref lost + ### duplication of reference bases (eg ref = GAAA alt = GAAAGAAA) => list duplicated ref (dupGAAA) + $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'}; + } + + elsif( $hgvs_notation->{'type'} eq '>'){ + ### substitution - list both alleles + $hgvs_notation->{'hgvs'} .= $hgvs_notation->{'start'} . $hgvs_notation->{'ref'} . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'}; + } + + elsif( $hgvs_notation->{'type'} eq 'delins'){ + $hgvs_notation->{'hgvs'} .= $coordinates . 'del' . $hgvs_notation->{'ref'} . 'ins' . $hgvs_notation->{'alt'}; + } + + elsif($hgvs_notation->{'type'} eq 'ins'){ + ## reference not listed + $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'}; + } + + elsif($hgvs_notation->{'type'} =~ /\[\d+\]/){ + #### insertion described by string and number + $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'}; + } + + else{ + warn "PROBLEM with generic HGVS formatter - type = ". $hgvs_notation->{'type'} ."\n"; + } + + return $hgvs_notation->{'hgvs'}; + +} + +=head2 align_seqs + + Arg[1] : string $seq1 + Arg[2] : string $seq2 + Example : my $aligned_seqs = align_seqs($seq1, $seq2); + Description : Does a simple NW align of two sequence strings. Best used on + short (<1000bp) sequences, otherwise runtime will be long + ReturnType : arrayref to a pair of strings + Exceptions : none + Caller : web flanking sequence display + +=cut + +sub align_seqs { + my $seq1 = shift; + my $seq2 = shift; + + # align parameters + my $match = 10; + my $mismatch = -10; + my $gep = -10; + + # split sequences into arrays + my @split1 = split //, $seq1; + my @split2 = split //, $seq2; + + # evaluate substitutions + my $len1 = length($seq1); + my $len2 = length($seq2); + + my (@smat, @tb); + + for (my $i=0; $i<=$len1; $i++) { + $smat[$i][0] = $i * $gep; + $tb[$i][0] = 1; + } + for (my $j=0; $j<=$len2; $j++) { + $smat[0][$j] = $j * $gep; + $tb[0][$j] = -1; + } + + my ($s, $sub, $del, $ins); + + for (my $i=1; $i<=$len1; $i++) { + for (my $j=1; $j<=$len2; $j++) { + + # calculate score + if($split1[$i-1] eq $split2[$j-1]) { + $s = $match; + } + else { + $s = $mismatch; + } + + $sub = $smat[$i-1][$j-1] + $s; + $del = $smat[$i][$j-1] + $gep; + $ins = $smat[$i-1][$j] + $gep; + + if($sub > $del && $sub > $ins) { + $smat[$i][$j] = $sub; + $tb[$i][$j] = 0; + } + elsif($del > $ins) { + $smat[$i][$j] = $del; + $tb[$i][$j] = -1; + } + else { + $smat[$i][$j] = $ins; + $tb[$i][$j] = 1; + } + } + } + + + my $i = $len1; + my $j = $len2; + my $aln_len = 0; + my (@aln1, @aln2); + + while(!($i == 0 && $j == 0)) { + if($tb[$i][$j] == 0) { + $aln1[$aln_len] = $split1[--$i]; + $aln2[$aln_len] = $split2[--$j]; + } + elsif($tb[$i][$j] == -1) { + $aln1[$aln_len] = '-'; + $aln2[$aln_len] = $split2[--$j]; + } + elsif($tb[$i][$j] == 1) { + $aln1[$aln_len] = $split1[--$i]; + $aln2[$aln_len] = '-'; + } + + $aln_len++; + } + + return [(join "", reverse @aln1), (join "", reverse @aln2)]; +} + + +=head2 array_to_bitval + + Arg[1] : arrayref $arr + Arg[2] : arrayref $ref + Example : my $bitval = array_to_bitval(['hapmap','precious'],['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']); + Description : Takes a reference to an array as input and return a bit value representing the + combination of elements from a reference array. c.f. the SET datatype in MySQL + ReturnType : bitvalue that represents the combination of elements in the reference array specified in the given array + Exceptions : none + Caller : get_validation_code + +=cut + +sub array_to_bitval { + my $arr = shift; + my $ref = shift; + + #ÊEnsure that we have array references + $arr = wrap_array($arr); + $ref = wrap_array($ref); + + #ÊTurn the reference array into a hash, the values will correspond to 2 raised to the power of the position in the array + my $i=0; + my %ref_hash = map {lc($_) => $i++;} @{$ref}; + + #ÊSet the bitval + my $bitval = 0; + foreach my $a (@{$arr}) { + + my $pos = $ref_hash{lc($a)}; + if (defined($pos)) { + $bitval |= 2**$pos; + } + # Warn if the element is not present in the reference array + else { + warning("$a is not a recognised element. Recognised elements are: " . join(",",@{$ref})); + } + } + + return $bitval; +} + +=head2 bitval_to_array + + Arg [1] : int $bitval + Arg [2] : arrayref $ref + Example : my $arr = bitval_to_array(6,['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']); + : print join(",",@{$arr}); #ÊWill print 'freq,submitter' + Description: Returns an array with the combination of elements from the reference array specified by the supplied bitvalue. + c.f. the SET datatype in MySQL + Returntype : reference to list of strings + Exceptions : none + Caller : get_all_validation_states + +=cut + +sub bitval_to_array { + my $bitval = shift || 0; + my $ref = shift; + + #ÊEnsure that we have array references + $ref = wrap_array($ref); + + # convert the bit value into an ordered array + my @arr; + for (my $i = 0; $i < @{$ref}; $i++) { + push(@arr,$ref->[$i]) if ((1 << $i) & $bitval); + } + + return \@arr; +} + + +=head2 add_validation_state + + Arg [1] : string $state + Example : add_validation_state('cluster'); + Description: Adds a validation state to this variation. + Returntype : none + Exceptions : warning if validation state is not a recognised type + Caller : general + Status : At Risk + +=cut + +sub add_validation_state { + my $obj = shift; + my $state = shift; + + #ÊGet the bitvalue for the new state + my $newbit = get_validation_code($state) || 0; + + #ÊBit-add it to the current validation_code + my $oldbit = $obj->{'validation_code'} || 0; + $newbit |= $oldbit; + + # Set the validation_code + $obj->{'validation_code'} = $newbit; + + return; +} + +=head2 get_all_validation_states + + Arg [1] : int $bitval + Example : my @vstates = @{get_all_validation_states($var->{'validation_code'})}; + Description: Retrieves all validation states for a specified bit value. + Returntype : reference to list of strings + Exceptions : none + Caller : general + +=cut + +sub get_all_validation_states { + return bitval_to_array(shift,\@VALIDATION_STATES); +} + +=head2 get_validation_code + + Arg [1] : arrayref $validation_status + Example : $var->{'validation_code'} = get_validation_code(['submitter','precious']); + Description: Retrieves the bit value for a combination of validation statuses. + Returntype : int + Exceptions : none + Caller : Variation::new + +=cut + +sub get_validation_code { + return array_to_bitval(shift,\@VALIDATION_STATES); +} + +1;