ensembl: variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Sequence.pm comparison

comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Sequence.pm @ 0:1f6dce3d34e0

Uploaded

author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1f6dce3d34e0
+=head1 LICENSE
+Copyright (c) 1999-2012 The European Bioinformatics Institute and
+Genome Research Limited.  All rights reserved.
+This software is distributed under a modified Apache license.
+For license details, please see
+http://www.ensembl.org/info/about/code_licence.html
+=head1 CONTACT
+Please email comments or questions to the public Ensembl
+developers list at <dev@ensembl.org>.
+Questions may also be sent to the Ensembl help desk at
+<helpdesk@ensembl.org>.
+=cut
+# EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence
+#
+#
+=head1 NAME
+Bio::EnsEMBL::Variation::Utils::Sequence - Utility functions for sequences
+=head1 SYNOPSIS
+use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code variation_class);
+my $alleles = 'A|C';
+print "my alleles = $alleles\n";
+my $ambig_code = ambiguity_code($alleles);
+print "my ambiguity code is $ambig_code\n";
+print "my SNP class is = variation_class($alleles)";
+=head1 METHODS
+=cut
+use strict;
+use warnings;
+package Bio::EnsEMBL::Variation::Utils::Sequence;
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
+use Bio::EnsEMBL::Variation::Utils::Constants qw(:SO_class_terms);
+use Bio::EnsEMBL::Utils::Scalar qw(wrap_array);
+use Exporter;
+use vars qw(@ISA @EXPORT_OK);
+@ISA = qw(Exporter);
+@EXPORT_OK = qw(
+&ambiguity_code
+&variation_class
+&unambiguity_code
+&sequence_with_ambiguity
+&hgvs_variant_notation
+&format_hgvs_string
+&SO_variation_class
+&align_seqs
+&strain_ambiguity_code
+&get_all_validation_states
+&get_validation_code
+&add_validation_state
+);
+# List of validation states. Order must match that of set in database
+our @VALIDATION_STATES = qw(cluster freq submitter doublehit hapmap 1000Genome failed precious);
+=head2 ambiguity_code
+Arg[1]      : string $alleles
+Example     :  use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code)
+my $alleles = 'A|C';
+my $ambig_code = ambiguity_code($alleles);
+print "the ambiguity code for $alleles is: ",$ambig_code;
+Description : returns the ambiguity code for a SNP allele
+ReturnType  : String
+The ambiguity code
+Exceptions  : None
+Caller      : Variation, VariationFeature
+=cut
+sub ambiguity_code {
+my $alleles = shift;
+my %duplicates; #hash containing all alleles to remove duplicates
+	foreach my $a(split /[\|\/\\]/, $alleles) {
+		# convert Ns
+		my @a = ($a eq 'N' ? qw(A C G T) : ($a));
+		map {$duplicates{$_}++} @a;
+	}
+$alleles = uc( join '', sort keys %duplicates );
+#my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y
+#GT K C C A A T T G G - - -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-'
+	my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y GT K C C A A T T G G - -);
+return $ambig{$alleles};
+}
+=head2 strain_ambiguity_code
+Arg[1]      : string $alleles (separated by "/", "\" or "|")
+Example     :  use Bio::EnsEMBL::Variation::Utils::Sequence qw(strain_ambiguity_code)
+my $alleles = 'A|C';
+my $ambig_code = strain_ambiguity_code($alleles);
+print "the ambiguity code for $alleles is: ",$ambig_code;
+Description : returns the ambiguity code for a strain genotype
+ReturnType  : String
+Exceptions  : None
+Caller      : AlleleFeatureAdaptor
+=cut
+sub strain_ambiguity_code {
+my $alleles = shift;
+	# return normal ambiguity code for a SNP
+	return ambiguity_code($alleles) if($alleles =~ /^[ACGT][\|\/\\][ACGT]$/);
+	# get alleles
+	my ($a1, $a2) = split /[\|\/\\]/, $alleles;
+	# pad
+	if(length($a1) > length($a2)) {
+		$a2 .= '-' x (length($a1) - length($a2));
+	}
+	else {
+		$a1 .= '-' x (length($a2) - length($a1));
+	}
+	# build ambiguity code base by base
+	my $ambig = '';
+	for my $i(0..(length($a1) - 1)) {
+		my $b1 = substr($a1, $i, 1);
+		my $b2 = substr($a2, $i, 1);
+		# -/- = -
+		if($b1 eq '-' && $b2 eq '-') {
+			$ambig .= '-';
+		}
+		# G/- = g
+		elsif($b1 eq '-') {
+			$ambig .= lc($b2);
+		}
+		# -/G = g
+		elsif($b2 eq '-') {
+			$ambig .= lc($b1);
+		}
+		# A/G = R
+		else {
+			$ambig .= ambiguity_code($b1.'|'.$b2);
+		}
+	}
+	return $ambig;
+}
+=head2 unambiguity_code
+Arg[1]      : string $alleles
+Example     :  use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code)
+my $ambiguity_code = 'M';
+my $alleles = unambiguity_code($ambiguity_code);
+print "the alleles for ambiguity code $ambiguity_code is: ",$alleles;
+Description : returns the alleles for an ambiguity code
+ReturnType  : String
+The Alleles, alphabetically sorted and in capital
+Exceptions  : None
+Caller      : Variation, VariationFeature
+=cut
+sub unambiguity_code {
+my $ambiguity_code = shift;
+#my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K
+#GT C CC A AA T TT G GG - -- -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-'
+	my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K GT C CC A AA T TT G GG - --);
+return $unambig{$ambiguity_code};
+}
+=head2 variation_class
+Arg[1]      : string $alleles
+Arg[2]      : boolean $is_somatic - flag that this variation is somatic
+Example     : use Bio::EnsEMBL::Variation::Utils::Sequence qw (variation_class)
+my $alleles = 'A|C';
+my $variation_class = variation_class($alleles);
+print "the variation class for the alleles $alleles is: ",$variation_class;
+Description : return the class of the alleles according to dbSNP classification(SNP,indel,mixed,substitution...)
+ReturnType  : String. The class of the alleles
+Exceptions  : none
+Caller      : Variation, VariationFeature
+=cut
+sub variation_class{
+my ($alleles, $is_somatic) = @_;
+my $class;
+if ($alleles =~ /^[ACGTN]([\|\\\/][ACGTN])+$/i) {
+$class = 'snp';
+}
+elsif (($alleles eq 'cnv') || ($alleles eq 'CNV')) {
+$class = 'cnv';
+}
+elsif ($alleles =~ /CNV\_PROBE/i) {
+$class = 'cnv probe';
+}
+elsif ($alleles =~ /HGMD\_MUTATION/i) {
+$class = 'hgmd_mutation';
+}
+else {
+my @alleles = split /[\|\/\\]/, $alleles;
+if (@alleles == 1) {
+#(HETEROZYGOUS) 1 allele
+	        $class =  'het';
+}
+elsif(@alleles == 2) {
+	       if ((($alleles[0] =~ tr/ACTGN//)== length($alleles[0]) && ($alleles[1] =~ tr/-//) == 1) ||
+	           (($alleles[0] =~ tr/-//) == 1 && ($alleles[1] =~ tr/ACTGN//) == length($alleles[1])) ){
+	           #A/- 2 alleles
+	           $class =  'in-del'
+	       }
+	       elsif (($alleles[0] =~ /LARGE|INS|DEL/) || ($alleles[1] =~ /LARGE|INS|DEL/)){
+	           #(LARGEDELETION) 2 alleles
+	           $class = 'named'
+	       }
+	       elsif (($alleles[0] =~ tr/ACTG//) > 1 || ($alleles[1] =~ tr/ACTG//) > 1){
+	           #AA/GC 2 alleles
+	           $class = 'substitution'
+	       }
+	       else {
+	           warning("not possible to determine class for  @alleles");
+	           $class = '';
+	       }
+}
+elsif (@alleles > 2) {
+	       if ($alleles[0] =~ /\d+/) {
+	           #(CA)14/15/16/17 > 2 alleles, all of them contain the number of repetitions of the allele
+	           $class = 'microsat'
+	       }
+	       elsif ((grep {/-/} @alleles) > 0) {
+	           #-/A/T/TTA > 2 alleles
+	           $class = 'mixed'
+	       }
+	       else {
+	           #  warning("not possible to determine class of alleles " . @alleles);
+	           $class = '';
+	       }
+}
+else{
+	       warning("no alleles available ");
+	       $class = '';
+}
+}
+if ($is_somatic) {
+if ($class eq '') {
+# for undetermined classes just call it somatic
+$class = 'somatic';
+}
+else {
+# somatic mutations aren't polymorphisms, so change SNPs to SNVs
+$class = 'snv' if $class eq 'snp';
+# and prefix the class with 'somatic'
+$class = 'somatic_'.$class;
+}
+}
+return $class;
+}
+=head2 SO_variation_class
+Arg[1]      : string $alleles - a slash ()'/') separated list of alleles, the first allele is
+assumed to be the reference unless the $ref_correct argument is false
+Arg[2]      : boolean $ref_correct - flags that the first allele is not known to be the
+reference sequence (so we can't call insertions or deletions and have to
+resort to 'indel')
+Example     : use Bio::EnsEMBL::Variation::Utils::Sequence qw (SO_variation_class)
+my $alleles = 'A/C';
+my $SO_term = SO_variation_class($alleles);
+print "the SO term for the alleles $alleles is: ",$SO_term;
+Description : return the SO term for the class of the alleles
+ReturnType  : String. The SO term for the class of the alleles
+Exceptions  : none
+Caller      : Variation, VariationFeature
+=cut
+sub SO_variation_class {
+my $alleles     = shift;
+my $ref_correct = shift;
+$ref_correct = 1 unless defined $ref_correct;
+my $allele_class = '[A-Z]';
+# default to sequence_alteration
+my $class = SO_TERM_SEQUENCE_ALTERATION;
+if ($alleles =~ /^$allele_class(\/$allele_class)+$/) {
+# A/T, A/T/G
+$class = SO_TERM_SNV;
+}
+elsif ($alleles =~ /^$allele_class+(\/$allele_class+)+$/) {
+# AA/TTT
+$class = SO_TERM_SUBSTITUTION;
+}
+elsif ($alleles =~ /\)\d+/) {
+# (CAG)8/(CAG)9
+$class = SO_TERM_TANDEM_REPEAT;
+}
+else {
+my @alleles = split /\//, $alleles;
+if (@alleles > 1) {
+my $ref = shift @alleles;
+if ($ref eq '-') {
+if (@alleles == 1 && $alleles[0] =~ /DEL/) {
+# -/(LARGEDELETION) (rather oddly!)
+$class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
+}
+unless (grep { $_ !~ /^$allele_class+$|INS/ } @alleles) {
+# -/ATT, -/(LARGEINSERTION)
+$class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL;
+}
+# else must be mixed insertion and deletion, so just called sequence_alteration
+}
+elsif ($ref =~ /^$allele_class+$/) {
+unless (grep { $_ !~ /-|DEL/ } @alleles) {
+# A/-, A/(LARGEDELETION)
+$class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
+}
+}
+elsif ($ref =~ /DEL/) {
+unless (grep { $_ !~ /-/ } @alleles) {
+# (LARGEDELETION)/-, (2345 BP DELETION)/-
+$class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
+}
+}
+}
+elsif (@alleles == 1) {
+if ($alleles[0] =~ /INS/) {
+# (LARGEINSERTION)
+$class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL;
+}
+elsif($alleles[0] =~ /DEL/) {
+# (308 BP DELETION)
+$class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
+}
+}
+}
+return $class;
+}
+=head2 sequence_with_ambiguity
+Arg[1]      : Bio::EnsEMBL::DBSQL::DBAdaptor $dbCore
+Arg[2]      : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor $dbVar
+Arg[3]      : string $chr
+Arg[4]      : int $start
+Arg[5]      : int $end
+Arg[6]      : int $strand
+Example     : use Bio::EnsEMBL::Variation::Utils::Sequence qw (sequence_with_ambiguity)
+my $slice = sequence_with_ambiguity($dbCore,$dbVar,1,100,200);
+print "the sequence with ambiguity code for your region is: ",$slice->seq()
+Description : given a region, returns a Bio::EnsEMBL::Slice object with
+the sequence set with ambiguity codes
+ReturnType  : Bio::EnsEMBL::Slice object
+Exceptions  : none
+Caller      : general
+=cut
+sub sequence_with_ambiguity{
+my ($dbCore,$dbVar,$chr,$start,$end,$strand) = @_;
+my $slice;
+if (ref($dbCore) ne 'Bio::EnsEMBL::DBSQL::DBAdaptor'){
+	warning('You need to provide a Bio::EnsEMBL::DBSQL::DBAdaptor as a first argument');
+	return $slice;
+}
+if (ref($dbVar) ne 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor'){
+	warning('You need to provide a Bio::EnsEMBL::Variation::DBSQL::DBAdaptor object as second argument');
+	return $slice;
+}
+my $slice_adaptor = $dbCore->get_SliceAdaptor();
+my $vf_adaptor = $dbVar->get_VariationFeatureAdaptor;
+$slice = $slice_adaptor->fetch_by_region('chromosome',$chr,$start,$end,$strand); #get the slice
+my $seq = $slice->seq;
+foreach my $vf (@{$vf_adaptor->fetch_all_by_Slice($slice)}){
+	substr($seq,$vf->start-1,1,$vf->ambig_code);
+}
+$slice->{'seq'} = $seq;
+return $slice;
+}
+=head2 hgvs_variant_notation
+Arg[1]      : string $alt_allele
+Arg[2]      : string $ref_sequence
+Arg[3]      : int $ref_start
+Arg[4]      : int $ref_end
+Arg[5]      : int $display_start (optional)
+Arg[6]      : int $display_end (optional)
+Example     : use Bio::EnsEMBL::Variation::Utils::Sequence qw (hgvs_variant_notation)
+		my $alt_allele = 'A';
+		my $ref_sequence = 'CCGTGATGTGC';
+		my $ref_start = 4;
+		my $ref_end = 4;
+		my $ref_name = 'test_seq';
+		my $ref_type = 'g';
+		my $notation = hgvs_variant_notation($alt_allele,$ref_sequence,$ref_start,$ref_end);
+print "HGVS notation of your variant: $ref_name\:$ref_type\." . $notation->{'hgvs'};
+Description : Given an allele, a reference sequence and position of variant, returns a reference to a hash containing metadata and a
+		string with HGVS notation of the variant. Returns undef if reference and variant alleles are identical.
+		The optional display_start and display_end, if specified, will be used in the notation instead of the ref_start and ref_end.
+		This can be useful, e.g. if we want coordinates relative to chromosome but don't want to pass the entire chromosome sequence
+		into the subroutine.
+		The data fields in the returned hash are:
+		'start'	-> Displayed start position of variant
+		'end' -> Displayed end position of variant
+		'ref' -> Reference allele
+		'alt' -> Alternative allele
+		'type' -> The variant class, e.g. ins, inv, >, delins
+		'hgvs' -> A string with HGVS notation
+ReturnType  : reference to a hash
+Exceptions  : If the length of the interval to be displayed is different from the length of the reference allele
+Caller      : general
+=cut
+sub hgvs_variant_notation {
+my $alt_allele = shift;
+my $ref_sequence = shift;
+my $ref_start = shift;
+my $ref_end = shift;
+my $display_start = shift;
+my $display_end = shift;
+# If display_start and display_end were not specified, use ref_start and ref_end
+$display_start ||= $ref_start;
+$display_end ||= $ref_end;
+#�Throw an exception if the lengths of the display interval and reference interval are different
+throw("The coordinate interval for display is of different length than for the reference allele") if (($display_end - $display_start) != ($ref_end - $ref_start));
+# Length of the reference allele. Negative lengths make no sense
+my $ref_length = ($ref_end - $ref_start + 1);
+if ($ref_length < 0) {
+	$ref_length = 0;
+}
+# Remove any gap characters in the alt allele
+$alt_allele =~ s/\-//g;
+# Length of alternative allele
+my $alt_length = length($alt_allele);
+# Get the reference allele
+my $ref_allele = substr($ref_sequence,($ref_start-1),$ref_length);
+# Check that the alleles are different, otherwise return undef
+return undef unless ($ref_allele ne $alt_allele);
+# Store the notation in a hash that will be returned
+my %notation;
+$notation{'start'} = $display_start;
+$notation{'end'} = $display_end;
+$notation{'ref'} = $ref_allele;
+$notation{'alt'} = $alt_allele;
+# The simplest case is a deletion
+if (!$alt_length) {
+	$notation{'type'} = 'del';
+# Return the notation
+return \%notation;
+}
+# Another case is if the allele lengths are equal
+if ($ref_length == $alt_length) {
+	# If length is 1 it's a single substitution
+if ($ref_length == 1) {
+	    $notation{'type'} = '>';
+	    return \%notation;
+}
+	# Check if it's an inversion
+my $rev_ref = $ref_allele;
+reverse_comp(\$rev_ref);
+if ($alt_allele eq $rev_ref) {
+	    $notation{'type'} = 'inv';
+	    return \%notation;
+}
+	$notation{'type'} = 'delins';
+return \%notation;
+}
+# If this is an insertion, we should check if the preceeding reference nucleotides match the insertion. In that case it should be annotated as a multiplication.
+if (!$ref_length) {
+# Get the same number of nucleotides preceding the insertion as the length of the insertion
+my $prev_str = substr($ref_sequence,($ref_end-$alt_length),$alt_length);
+# If they match, this is a duplication
+if ($prev_str eq $alt_allele) {
+	    $notation{'start'} = ($display_end - $alt_length + 1);
+	    $notation{'type'} = 'dup';
+	    $notation{'ref'} = $prev_str;
+# Return the notation
+	    return \%notation;
+}
+# If they didn't match it's a plain insertion
+	$notation{'start'} = $display_end;
+	$notation{'end'} = $display_start;
+	$notation{'type'} = 'ins';
+return \%notation;
+}
+# Otherwise, the reference and allele are of different lengths. By default, this is a delins but
+# we need to check if the alt allele is a multiplication of the reference
+# Check if the length of the alt allele is a multiple of the reference allele
+if ($alt_length%$ref_length == 0) {
+my $multiple = ($alt_length / $ref_length);
+if ($alt_allele eq ($ref_allele x $multiple)) {
+if ($multiple == 2) {
+		$notation{'type'} = 'dup';
+}
+else {
+		$notation{'type'} = '[' . $multiple . ']';
+}
+	    return \%notation;
+}
+}
+# Else, it's gotta be a delins
+$notation{'type'} = 'delins';
+return \%notation;
+}
+=head2 format_hgvs_string
+Arg[1]      : string reference sequence name
+Arg[2]      : string strand
+Arg[3]      : hash of hgvs information
+Example     :
+Description : Creates HGVS formatted string from input hash
+ReturnType  : string in HGVS format
+Exceptions  :
+Caller      :
+=cut
+sub format_hgvs_string{
+##### generic formatting routine for genomic and coding HGVS names
+my $hgvs_notation = shift;
+### all start with refseq name & numbering type
+$hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . ".";
+my $coordinates;
+#### if single base event, list position only once
+if($hgvs_notation->{'start'} eq $hgvs_notation->{'end'}){
+	$coordinates =  $hgvs_notation->{'start'};
+}
+else{
+	$coordinates = $hgvs_notation->{'start'} . "_" . $hgvs_notation->{'end'};
+}
+##### format rest of string according to type
+if($hgvs_notation->{'type'} eq 'del' ||  $hgvs_notation->{'type'} eq 'inv' || $hgvs_notation->{'type'} eq 'dup'){
+	### inversion of reference bases => list ref not alt
+	### deletion  of reference bases => list ref lost
+	### duplication  of reference bases (eg ref = GAAA alt = GAAAGAAA) => list duplicated ref (dupGAAA)
+	$hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'};
+}
+elsif( $hgvs_notation->{'type'} eq '>'){
+	### substitution - list both alleles
+	$hgvs_notation->{'hgvs'} .= $hgvs_notation->{'start'} . $hgvs_notation->{'ref'} . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'};
+}
+elsif( $hgvs_notation->{'type'} eq 'delins'){
+	$hgvs_notation->{'hgvs'} .= $coordinates . 'del' . $hgvs_notation->{'ref'} . 'ins' . $hgvs_notation->{'alt'};
+}
+elsif($hgvs_notation->{'type'} eq 'ins'){
+	## reference not listed
+	$hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'};
+}
+elsif($hgvs_notation->{'type'} =~ /\[\d+\]/){
+	#### insertion described by string and number
+	$hgvs_notation->{'hgvs'} .= $coordinates  . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'};
+}
+else{
+	warn "PROBLEM with generic HGVS formatter - type = ". $hgvs_notation->{'type'} ."\n";
+}
+return $hgvs_notation->{'hgvs'};
+}
+=head2 align_seqs
+Arg[1]      : string $seq1
+Arg[2]      : string $seq2
+Example     : my $aligned_seqs = align_seqs($seq1, $seq2);
+Description : Does a simple NW align of two sequence strings. Best used on
+short (<1000bp) sequences, otherwise runtime will be long
+ReturnType  : arrayref to a pair of strings
+Exceptions  : none
+Caller      : web flanking sequence display
+=cut
+sub align_seqs {
+	my $seq1 = shift;
+	my $seq2 = shift;
+	# align parameters
+	my $match    = 10;
+	my $mismatch = -10;
+	my $gep      = -10;
+	# split sequences into arrays
+	my @split1 = split //, $seq1;
+	my @split2 = split //, $seq2;
+	# evaluate substitutions
+	my $len1 = length($seq1);
+	my $len2 = length($seq2);
+	my (@smat, @tb);
+	for (my $i=0; $i<=$len1; $i++) {
+		$smat[$i][0] = $i * $gep;
+		$tb[$i][0] = 1;
+	}
+	for (my $j=0; $j<=$len2; $j++) {
+		$smat[0][$j] = $j * $gep;
+		$tb[0][$j] = -1;
+	}
+	my ($s, $sub, $del, $ins);
+	for (my $i=1; $i<=$len1; $i++) {
+		for (my $j=1; $j<=$len2; $j++)	{
+			# calculate score
+			if($split1[$i-1] eq $split2[$j-1]) {
+				$s = $match;
+			}
+			else {
+				$s = $mismatch;
+			}
+			$sub = $smat[$i-1][$j-1] + $s;
+			$del = $smat[$i][$j-1] + $gep;
+			$ins = $smat[$i-1][$j] + $gep;
+			if($sub > $del && $sub > $ins) {
+				$smat[$i][$j] = $sub;
+				$tb[$i][$j] = 0;
+			}
+			elsif($del > $ins) {
+				$smat[$i][$j] = $del;
+				$tb[$i][$j] = -1;
+			}
+			else {
+				$smat[$i][$j] = $ins;
+				$tb[$i][$j] = 1;
+			}
+		}
+	}
+	my $i = $len1;
+	my $j = $len2;
+	my $aln_len = 0;
+	my (@aln1, @aln2);
+	while(!($i == 0 && $j == 0)) {
+		if($tb[$i][$j] == 0) {
+			$aln1[$aln_len] = $split1[--$i];
+			$aln2[$aln_len] = $split2[--$j];
+		}
+		elsif($tb[$i][$j] == -1) {
+			$aln1[$aln_len] = '-';
+			$aln2[$aln_len] = $split2[--$j];
+		}
+		elsif($tb[$i][$j] == 1) {
+			$aln1[$aln_len] = $split1[--$i];
+			$aln2[$aln_len] = '-';
+		}
+		$aln_len++;
+	}
+	return [(join "", reverse @aln1), (join "", reverse @aln2)];
+}
+=head2 array_to_bitval
+Arg[1]      : arrayref $arr
+Arg[2]      : arrayref $ref
+Example     : my $bitval = array_to_bitval(['hapmap','precious'],['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']);
+Description : Takes a reference to an array as input and return a bit value representing the
+combination of elements from a reference array. c.f. the SET datatype in MySQL
+ReturnType  : bitvalue that represents the combination of elements in the reference array specified in the given array
+Exceptions  : none
+Caller      : get_validation_code
+=cut
+sub array_to_bitval {
+my $arr = shift;
+my $ref = shift;
+#�Ensure that we have array references
+$arr = wrap_array($arr);
+$ref = wrap_array($ref);
+#�Turn the reference array into a hash, the values will correspond to 2 raised to the power of the position in the array
+my $i=0;
+my %ref_hash = map {lc($_) => $i++;} @{$ref};
+#�Set the bitval
+my $bitval = 0;
+foreach my $a (@{$arr}) {
+my $pos = $ref_hash{lc($a)};
+if (defined($pos)) {
+$bitval |= 2**$pos;
+}
+# Warn if the element is not present in the reference array
+else {
+warning("$a is not a recognised element. Recognised elements are: " . join(",",@{$ref}));
+}
+}
+return $bitval;
+}
+=head2 bitval_to_array
+Arg [1]    : int $bitval
+Arg [2]    : arrayref $ref
+Example    : my $arr = bitval_to_array(6,['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']);
+: print join(",",@{$arr}); #�Will print 'freq,submitter'
+Description: Returns an array with the combination of elements from the reference array specified by the supplied bitvalue.
+c.f. the SET datatype in MySQL
+Returntype : reference to list of strings
+Exceptions : none
+Caller     : get_all_validation_states
+=cut
+sub bitval_to_array {
+my $bitval = shift || 0;
+my $ref = shift;
+#�Ensure that we have array references
+$ref = wrap_array($ref);
+# convert the bit value into an ordered array
+my @arr;
+for (my $i = 0; $i < @{$ref}; $i++) {
+push(@arr,$ref->[$i]) if ((1 << $i) & $bitval);
+}
+return \@arr;
+}
+=head2 add_validation_state
+Arg [1]    : string $state
+Example    : add_validation_state('cluster');
+Description: Adds a validation state to this variation.
+Returntype : none
+Exceptions : warning if validation state is not a recognised type
+Caller     : general
+Status     : At Risk
+=cut
+sub add_validation_state {
+my $obj = shift;
+my $state = shift;
+#�Get the bitvalue for the new state
+my $newbit = get_validation_code($state) || 0;
+#�Bit-add it to the current validation_code
+my $oldbit = $obj->{'validation_code'} || 0;
+$newbit |= $oldbit;
+# Set the validation_code
+$obj->{'validation_code'} = $newbit;
+return;
+}
+=head2 get_all_validation_states
+Arg [1]    : int $bitval
+Example    : my @vstates = @{get_all_validation_states($var->{'validation_code'})};
+Description: Retrieves all validation states for a specified bit value.
+Returntype : reference to list of strings
+Exceptions : none
+Caller     : general
+=cut
+sub get_all_validation_states {
+return bitval_to_array(shift,\@VALIDATION_STATES);
+}
+=head2 get_validation_code
+Arg [1]    : arrayref $validation_status
+Example    : $var->{'validation_code'} = get_validation_code(['submitter','precious']);
+Description: Retrieves the bit value for a combination of validation statuses.
+Returntype : int
+Exceptions : none
+Caller     : Variation::new
+=cut
+sub get_validation_code {
+return array_to_bitval(shift,\@VALIDATION_STATES);
+}
+1;

Mercurial > repos > mahtabm > ensembl

comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Sequence.pm @ 0:1f6dce3d34e0