comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Sequence.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 # EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence
22 #
23 #
24
25 =head1 NAME
26
27 Bio::EnsEMBL::Variation::Utils::Sequence - Utility functions for sequences
28
29 =head1 SYNOPSIS
30
31 use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code variation_class);
32
33 my $alleles = 'A|C';
34
35 print "my alleles = $alleles\n";
36
37 my $ambig_code = ambiguity_code($alleles);
38
39 print "my ambiguity code is $ambig_code\n";
40
41 print "my SNP class is = variation_class($alleles)";
42
43
44 =head1 METHODS
45
46 =cut
47
48
49 use strict;
50 use warnings;
51
52 package Bio::EnsEMBL::Variation::Utils::Sequence;
53
54 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
55 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
56 use Bio::EnsEMBL::Variation::Utils::Constants qw(:SO_class_terms);
57 use Bio::EnsEMBL::Utils::Scalar qw(wrap_array);
58 use Exporter;
59
60 use vars qw(@ISA @EXPORT_OK);
61
62 @ISA = qw(Exporter);
63
64 @EXPORT_OK = qw(
65 &ambiguity_code
66 &variation_class
67 &unambiguity_code
68 &sequence_with_ambiguity
69 &hgvs_variant_notation
70 &format_hgvs_string
71 &SO_variation_class
72 &align_seqs
73 &strain_ambiguity_code
74 &get_all_validation_states
75 &get_validation_code
76 &add_validation_state
77 );
78
79 # List of validation states. Order must match that of set in database
80 our @VALIDATION_STATES = qw(cluster freq submitter doublehit hapmap 1000Genome failed precious);
81
82 =head2 ambiguity_code
83
84 Arg[1] : string $alleles
85 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code)
86 my $alleles = 'A|C';
87 my $ambig_code = ambiguity_code($alleles);
88 print "the ambiguity code for $alleles is: ",$ambig_code;
89 Description : returns the ambiguity code for a SNP allele
90 ReturnType : String
91 The ambiguity code
92 Exceptions : None
93 Caller : Variation, VariationFeature
94
95 =cut
96
97 sub ambiguity_code {
98 my $alleles = shift;
99 my %duplicates; #hash containing all alleles to remove duplicates
100
101 foreach my $a(split /[\|\/\\]/, $alleles) {
102 # convert Ns
103 my @a = ($a eq 'N' ? qw(A C G T) : ($a));
104 map {$duplicates{$_}++} @a;
105 }
106 $alleles = uc( join '', sort keys %duplicates );
107 #my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y
108 #GT K C C A A T T G G - - -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-'
109 my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y GT K C C A A T T G G - -);
110 return $ambig{$alleles};
111 }
112
113 =head2 strain_ambiguity_code
114
115 Arg[1] : string $alleles (separated by "/", "\" or "|")
116 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(strain_ambiguity_code)
117 my $alleles = 'A|C';
118 my $ambig_code = strain_ambiguity_code($alleles);
119 print "the ambiguity code for $alleles is: ",$ambig_code;
120 Description : returns the ambiguity code for a strain genotype
121 ReturnType : String
122 Exceptions : None
123 Caller : AlleleFeatureAdaptor
124
125 =cut
126
127 sub strain_ambiguity_code {
128 my $alleles = shift;
129
130 # return normal ambiguity code for a SNP
131 return ambiguity_code($alleles) if($alleles =~ /^[ACGT][\|\/\\][ACGT]$/);
132
133 # get alleles
134 my ($a1, $a2) = split /[\|\/\\]/, $alleles;
135
136 # pad
137 if(length($a1) > length($a2)) {
138 $a2 .= '-' x (length($a1) - length($a2));
139 }
140 else {
141 $a1 .= '-' x (length($a2) - length($a1));
142 }
143
144 # build ambiguity code base by base
145 my $ambig = '';
146
147 for my $i(0..(length($a1) - 1)) {
148 my $b1 = substr($a1, $i, 1);
149 my $b2 = substr($a2, $i, 1);
150
151 # -/- = -
152 if($b1 eq '-' && $b2 eq '-') {
153 $ambig .= '-';
154 }
155
156 # G/- = g
157 elsif($b1 eq '-') {
158 $ambig .= lc($b2);
159 }
160
161 # -/G = g
162 elsif($b2 eq '-') {
163 $ambig .= lc($b1);
164 }
165
166 # A/G = R
167 else {
168 $ambig .= ambiguity_code($b1.'|'.$b2);
169 }
170 }
171
172 return $ambig;
173 }
174
175 =head2 unambiguity_code
176
177 Arg[1] : string $alleles
178 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code)
179 my $ambiguity_code = 'M';
180 my $alleles = unambiguity_code($ambiguity_code);
181 print "the alleles for ambiguity code $ambiguity_code is: ",$alleles;
182 Description : returns the alleles for an ambiguity code
183 ReturnType : String
184 The Alleles, alphabetically sorted and in capital
185 Exceptions : None
186 Caller : Variation, VariationFeature
187
188 =cut
189
190 sub unambiguity_code {
191 my $ambiguity_code = shift;
192
193 #my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K
194 #GT C CC A AA T TT G GG - -- -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-'
195 my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K GT C CC A AA T TT G GG - --);
196 return $unambig{$ambiguity_code};
197 }
198
199
200 =head2 variation_class
201
202 Arg[1] : string $alleles
203 Arg[2] : boolean $is_somatic - flag that this variation is somatic
204 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (variation_class)
205 my $alleles = 'A|C';
206 my $variation_class = variation_class($alleles);
207 print "the variation class for the alleles $alleles is: ",$variation_class;
208 Description : return the class of the alleles according to dbSNP classification(SNP,indel,mixed,substitution...)
209 ReturnType : String. The class of the alleles
210 Exceptions : none
211 Caller : Variation, VariationFeature
212
213 =cut
214
215 sub variation_class{
216
217 my ($alleles, $is_somatic) = @_;
218
219 my $class;
220
221 if ($alleles =~ /^[ACGTN]([\|\\\/][ACGTN])+$/i) {
222 $class = 'snp';
223 }
224 elsif (($alleles eq 'cnv') || ($alleles eq 'CNV')) {
225 $class = 'cnv';
226 }
227 elsif ($alleles =~ /CNV\_PROBE/i) {
228 $class = 'cnv probe';
229 }
230 elsif ($alleles =~ /HGMD\_MUTATION/i) {
231 $class = 'hgmd_mutation';
232 }
233 else {
234 my @alleles = split /[\|\/\\]/, $alleles;
235
236 if (@alleles == 1) {
237 #(HETEROZYGOUS) 1 allele
238 $class = 'het';
239 }
240 elsif(@alleles == 2) {
241 if ((($alleles[0] =~ tr/ACTGN//)== length($alleles[0]) && ($alleles[1] =~ tr/-//) == 1) ||
242 (($alleles[0] =~ tr/-//) == 1 && ($alleles[1] =~ tr/ACTGN//) == length($alleles[1])) ){
243 #A/- 2 alleles
244 $class = 'in-del'
245 }
246 elsif (($alleles[0] =~ /LARGE|INS|DEL/) || ($alleles[1] =~ /LARGE|INS|DEL/)){
247 #(LARGEDELETION) 2 alleles
248 $class = 'named'
249 }
250 elsif (($alleles[0] =~ tr/ACTG//) > 1 || ($alleles[1] =~ tr/ACTG//) > 1){
251 #AA/GC 2 alleles
252 $class = 'substitution'
253 }
254 else {
255 warning("not possible to determine class for @alleles");
256 $class = '';
257 }
258 }
259 elsif (@alleles > 2) {
260
261 if ($alleles[0] =~ /\d+/) {
262 #(CA)14/15/16/17 > 2 alleles, all of them contain the number of repetitions of the allele
263 $class = 'microsat'
264 }
265
266 elsif ((grep {/-/} @alleles) > 0) {
267 #-/A/T/TTA > 2 alleles
268 $class = 'mixed'
269 }
270 else {
271 # warning("not possible to determine class of alleles " . @alleles);
272 $class = '';
273 }
274 }
275 else{
276 warning("no alleles available ");
277 $class = '';
278 }
279 }
280
281 if ($is_somatic) {
282 if ($class eq '') {
283 # for undetermined classes just call it somatic
284 $class = 'somatic';
285 }
286 else {
287 # somatic mutations aren't polymorphisms, so change SNPs to SNVs
288 $class = 'snv' if $class eq 'snp';
289
290 # and prefix the class with 'somatic'
291 $class = 'somatic_'.$class;
292 }
293 }
294
295 return $class;
296 }
297
298 =head2 SO_variation_class
299
300 Arg[1] : string $alleles - a slash ()'/') separated list of alleles, the first allele is
301 assumed to be the reference unless the $ref_correct argument is false
302 Arg[2] : boolean $ref_correct - flags that the first allele is not known to be the
303 reference sequence (so we can't call insertions or deletions and have to
304 resort to 'indel')
305 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (SO_variation_class)
306 my $alleles = 'A/C';
307 my $SO_term = SO_variation_class($alleles);
308 print "the SO term for the alleles $alleles is: ",$SO_term;
309 Description : return the SO term for the class of the alleles
310 ReturnType : String. The SO term for the class of the alleles
311 Exceptions : none
312 Caller : Variation, VariationFeature
313
314 =cut
315
316 sub SO_variation_class {
317
318 my $alleles = shift;
319 my $ref_correct = shift;
320
321 $ref_correct = 1 unless defined $ref_correct;
322
323 my $allele_class = '[A-Z]';
324
325 # default to sequence_alteration
326 my $class = SO_TERM_SEQUENCE_ALTERATION;
327
328 if ($alleles =~ /^$allele_class(\/$allele_class)+$/) {
329 # A/T, A/T/G
330 $class = SO_TERM_SNV;
331 }
332 elsif ($alleles =~ /^$allele_class+(\/$allele_class+)+$/) {
333 # AA/TTT
334 $class = SO_TERM_SUBSTITUTION;
335 }
336 elsif ($alleles =~ /\)\d+/) {
337 # (CAG)8/(CAG)9
338 $class = SO_TERM_TANDEM_REPEAT;
339 }
340 else {
341 my @alleles = split /\//, $alleles;
342
343 if (@alleles > 1) {
344
345 my $ref = shift @alleles;
346
347 if ($ref eq '-') {
348
349 if (@alleles == 1 && $alleles[0] =~ /DEL/) {
350 # -/(LARGEDELETION) (rather oddly!)
351 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
352 }
353
354 unless (grep { $_ !~ /^$allele_class+$|INS/ } @alleles) {
355 # -/ATT, -/(LARGEINSERTION)
356 $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL;
357 }
358
359 # else must be mixed insertion and deletion, so just called sequence_alteration
360 }
361 elsif ($ref =~ /^$allele_class+$/) {
362 unless (grep { $_ !~ /-|DEL/ } @alleles) {
363 # A/-, A/(LARGEDELETION)
364 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
365 }
366 }
367 elsif ($ref =~ /DEL/) {
368 unless (grep { $_ !~ /-/ } @alleles) {
369 # (LARGEDELETION)/-, (2345 BP DELETION)/-
370 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
371 }
372 }
373 }
374 elsif (@alleles == 1) {
375 if ($alleles[0] =~ /INS/) {
376 # (LARGEINSERTION)
377 $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL;
378 }
379 elsif($alleles[0] =~ /DEL/) {
380 # (308 BP DELETION)
381 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
382 }
383 }
384 }
385
386 return $class;
387 }
388
389 =head2 sequence_with_ambiguity
390
391 Arg[1] : Bio::EnsEMBL::DBSQL::DBAdaptor $dbCore
392 Arg[2] : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor $dbVar
393 Arg[3] : string $chr
394 Arg[4] : int $start
395 Arg[5] : int $end
396 Arg[6] : int $strand
397 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (sequence_with_ambiguity)
398 my $slice = sequence_with_ambiguity($dbCore,$dbVar,1,100,200);
399 print "the sequence with ambiguity code for your region is: ",$slice->seq()
400 Description : given a region, returns a Bio::EnsEMBL::Slice object with
401 the sequence set with ambiguity codes
402 ReturnType : Bio::EnsEMBL::Slice object
403 Exceptions : none
404 Caller : general
405
406 =cut
407
408 sub sequence_with_ambiguity{
409 my ($dbCore,$dbVar,$chr,$start,$end,$strand) = @_;
410
411 my $slice;
412 if (ref($dbCore) ne 'Bio::EnsEMBL::DBSQL::DBAdaptor'){
413 warning('You need to provide a Bio::EnsEMBL::DBSQL::DBAdaptor as a first argument');
414 return $slice;
415 }
416 if (ref($dbVar) ne 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor'){
417 warning('You need to provide a Bio::EnsEMBL::Variation::DBSQL::DBAdaptor object as second argument');
418 return $slice;
419 }
420 my $slice_adaptor = $dbCore->get_SliceAdaptor();
421 my $vf_adaptor = $dbVar->get_VariationFeatureAdaptor;
422 $slice = $slice_adaptor->fetch_by_region('chromosome',$chr,$start,$end,$strand); #get the slice
423 my $seq = $slice->seq;
424 foreach my $vf (@{$vf_adaptor->fetch_all_by_Slice($slice)}){
425 substr($seq,$vf->start-1,1,$vf->ambig_code);
426 }
427 $slice->{'seq'} = $seq;
428 return $slice;
429 }
430
431 =head2 hgvs_variant_notation
432
433 Arg[1] : string $alt_allele
434 Arg[2] : string $ref_sequence
435 Arg[3] : int $ref_start
436 Arg[4] : int $ref_end
437 Arg[5] : int $display_start (optional)
438 Arg[6] : int $display_end (optional)
439
440 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (hgvs_variant_notation)
441 my $alt_allele = 'A';
442 my $ref_sequence = 'CCGTGATGTGC';
443 my $ref_start = 4;
444 my $ref_end = 4;
445 my $ref_name = 'test_seq';
446 my $ref_type = 'g';
447 my $notation = hgvs_variant_notation($alt_allele,$ref_sequence,$ref_start,$ref_end);
448 print "HGVS notation of your variant: $ref_name\:$ref_type\." . $notation->{'hgvs'};
449
450 Description : Given an allele, a reference sequence and position of variant, returns a reference to a hash containing metadata and a
451 string with HGVS notation of the variant. Returns undef if reference and variant alleles are identical.
452 The optional display_start and display_end, if specified, will be used in the notation instead of the ref_start and ref_end.
453 This can be useful, e.g. if we want coordinates relative to chromosome but don't want to pass the entire chromosome sequence
454 into the subroutine.
455 The data fields in the returned hash are:
456 'start' -> Displayed start position of variant
457 'end' -> Displayed end position of variant
458 'ref' -> Reference allele
459 'alt' -> Alternative allele
460 'type' -> The variant class, e.g. ins, inv, >, delins
461 'hgvs' -> A string with HGVS notation
462 ReturnType : reference to a hash
463 Exceptions : If the length of the interval to be displayed is different from the length of the reference allele
464 Caller : general
465
466 =cut
467 sub hgvs_variant_notation {
468 my $alt_allele = shift;
469 my $ref_sequence = shift;
470 my $ref_start = shift;
471 my $ref_end = shift;
472 my $display_start = shift;
473 my $display_end = shift;
474
475 # If display_start and display_end were not specified, use ref_start and ref_end
476 $display_start ||= $ref_start;
477 $display_end ||= $ref_end;
478
479 #ÊThrow an exception if the lengths of the display interval and reference interval are different
480 throw("The coordinate interval for display is of different length than for the reference allele") if (($display_end - $display_start) != ($ref_end - $ref_start));
481
482 # Length of the reference allele. Negative lengths make no sense
483 my $ref_length = ($ref_end - $ref_start + 1);
484 if ($ref_length < 0) {
485 $ref_length = 0;
486 }
487
488 # Remove any gap characters in the alt allele
489 $alt_allele =~ s/\-//g;
490
491 # Length of alternative allele
492 my $alt_length = length($alt_allele);
493
494 # Get the reference allele
495 my $ref_allele = substr($ref_sequence,($ref_start-1),$ref_length);
496
497 # Check that the alleles are different, otherwise return undef
498 return undef unless ($ref_allele ne $alt_allele);
499
500 # Store the notation in a hash that will be returned
501 my %notation;
502 $notation{'start'} = $display_start;
503 $notation{'end'} = $display_end;
504 $notation{'ref'} = $ref_allele;
505 $notation{'alt'} = $alt_allele;
506
507 # The simplest case is a deletion
508 if (!$alt_length) {
509 $notation{'type'} = 'del';
510
511 # Return the notation
512 return \%notation;
513 }
514
515 # Another case is if the allele lengths are equal
516 if ($ref_length == $alt_length) {
517
518 # If length is 1 it's a single substitution
519 if ($ref_length == 1) {
520 $notation{'type'} = '>';
521 return \%notation;
522 }
523
524 # Check if it's an inversion
525 my $rev_ref = $ref_allele;
526 reverse_comp(\$rev_ref);
527 if ($alt_allele eq $rev_ref) {
528 $notation{'type'} = 'inv';
529 return \%notation;
530 }
531
532 $notation{'type'} = 'delins';
533
534 return \%notation;
535 }
536
537 # If this is an insertion, we should check if the preceeding reference nucleotides match the insertion. In that case it should be annotated as a multiplication.
538 if (!$ref_length) {
539
540 # Get the same number of nucleotides preceding the insertion as the length of the insertion
541 my $prev_str = substr($ref_sequence,($ref_end-$alt_length),$alt_length);
542
543 # If they match, this is a duplication
544 if ($prev_str eq $alt_allele) {
545
546 $notation{'start'} = ($display_end - $alt_length + 1);
547 $notation{'type'} = 'dup';
548 $notation{'ref'} = $prev_str;
549 # Return the notation
550 return \%notation;
551 }
552
553 # If they didn't match it's a plain insertion
554 $notation{'start'} = $display_end;
555 $notation{'end'} = $display_start;
556 $notation{'type'} = 'ins';
557
558 return \%notation;
559 }
560
561 # Otherwise, the reference and allele are of different lengths. By default, this is a delins but
562 # we need to check if the alt allele is a multiplication of the reference
563 # Check if the length of the alt allele is a multiple of the reference allele
564 if ($alt_length%$ref_length == 0) {
565 my $multiple = ($alt_length / $ref_length);
566 if ($alt_allele eq ($ref_allele x $multiple)) {
567 if ($multiple == 2) {
568 $notation{'type'} = 'dup';
569 }
570 else {
571 $notation{'type'} = '[' . $multiple . ']';
572 }
573 return \%notation;
574 }
575 }
576
577 # Else, it's gotta be a delins
578 $notation{'type'} = 'delins';
579
580 return \%notation;
581 }
582
583
584 =head2 format_hgvs_string
585
586 Arg[1] : string reference sequence name
587 Arg[2] : string strand
588 Arg[3] : hash of hgvs information
589 Example :
590 Description : Creates HGVS formatted string from input hash
591 ReturnType : string in HGVS format
592 Exceptions :
593 Caller :
594
595 =cut
596
597 sub format_hgvs_string{
598 ##### generic formatting routine for genomic and coding HGVS names
599
600 my $hgvs_notation = shift;
601
602 ### all start with refseq name & numbering type
603 $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . ".";
604
605 my $coordinates;
606 #### if single base event, list position only once
607 if($hgvs_notation->{'start'} eq $hgvs_notation->{'end'}){
608 $coordinates = $hgvs_notation->{'start'};
609 }
610 else{
611 $coordinates = $hgvs_notation->{'start'} . "_" . $hgvs_notation->{'end'};
612 }
613
614 ##### format rest of string according to type
615
616 if($hgvs_notation->{'type'} eq 'del' || $hgvs_notation->{'type'} eq 'inv' || $hgvs_notation->{'type'} eq 'dup'){
617 ### inversion of reference bases => list ref not alt
618 ### deletion of reference bases => list ref lost
619 ### duplication of reference bases (eg ref = GAAA alt = GAAAGAAA) => list duplicated ref (dupGAAA)
620 $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'};
621 }
622
623 elsif( $hgvs_notation->{'type'} eq '>'){
624 ### substitution - list both alleles
625 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{'start'} . $hgvs_notation->{'ref'} . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'};
626 }
627
628 elsif( $hgvs_notation->{'type'} eq 'delins'){
629 $hgvs_notation->{'hgvs'} .= $coordinates . 'del' . $hgvs_notation->{'ref'} . 'ins' . $hgvs_notation->{'alt'};
630 }
631
632 elsif($hgvs_notation->{'type'} eq 'ins'){
633 ## reference not listed
634 $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'};
635 }
636
637 elsif($hgvs_notation->{'type'} =~ /\[\d+\]/){
638 #### insertion described by string and number
639 $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'};
640 }
641
642 else{
643 warn "PROBLEM with generic HGVS formatter - type = ". $hgvs_notation->{'type'} ."\n";
644 }
645
646 return $hgvs_notation->{'hgvs'};
647
648 }
649
650 =head2 align_seqs
651
652 Arg[1] : string $seq1
653 Arg[2] : string $seq2
654 Example : my $aligned_seqs = align_seqs($seq1, $seq2);
655 Description : Does a simple NW align of two sequence strings. Best used on
656 short (<1000bp) sequences, otherwise runtime will be long
657 ReturnType : arrayref to a pair of strings
658 Exceptions : none
659 Caller : web flanking sequence display
660
661 =cut
662
663 sub align_seqs {
664 my $seq1 = shift;
665 my $seq2 = shift;
666
667 # align parameters
668 my $match = 10;
669 my $mismatch = -10;
670 my $gep = -10;
671
672 # split sequences into arrays
673 my @split1 = split //, $seq1;
674 my @split2 = split //, $seq2;
675
676 # evaluate substitutions
677 my $len1 = length($seq1);
678 my $len2 = length($seq2);
679
680 my (@smat, @tb);
681
682 for (my $i=0; $i<=$len1; $i++) {
683 $smat[$i][0] = $i * $gep;
684 $tb[$i][0] = 1;
685 }
686 for (my $j=0; $j<=$len2; $j++) {
687 $smat[0][$j] = $j * $gep;
688 $tb[0][$j] = -1;
689 }
690
691 my ($s, $sub, $del, $ins);
692
693 for (my $i=1; $i<=$len1; $i++) {
694 for (my $j=1; $j<=$len2; $j++) {
695
696 # calculate score
697 if($split1[$i-1] eq $split2[$j-1]) {
698 $s = $match;
699 }
700 else {
701 $s = $mismatch;
702 }
703
704 $sub = $smat[$i-1][$j-1] + $s;
705 $del = $smat[$i][$j-1] + $gep;
706 $ins = $smat[$i-1][$j] + $gep;
707
708 if($sub > $del && $sub > $ins) {
709 $smat[$i][$j] = $sub;
710 $tb[$i][$j] = 0;
711 }
712 elsif($del > $ins) {
713 $smat[$i][$j] = $del;
714 $tb[$i][$j] = -1;
715 }
716 else {
717 $smat[$i][$j] = $ins;
718 $tb[$i][$j] = 1;
719 }
720 }
721 }
722
723
724 my $i = $len1;
725 my $j = $len2;
726 my $aln_len = 0;
727 my (@aln1, @aln2);
728
729 while(!($i == 0 && $j == 0)) {
730 if($tb[$i][$j] == 0) {
731 $aln1[$aln_len] = $split1[--$i];
732 $aln2[$aln_len] = $split2[--$j];
733 }
734 elsif($tb[$i][$j] == -1) {
735 $aln1[$aln_len] = '-';
736 $aln2[$aln_len] = $split2[--$j];
737 }
738 elsif($tb[$i][$j] == 1) {
739 $aln1[$aln_len] = $split1[--$i];
740 $aln2[$aln_len] = '-';
741 }
742
743 $aln_len++;
744 }
745
746 return [(join "", reverse @aln1), (join "", reverse @aln2)];
747 }
748
749
750 =head2 array_to_bitval
751
752 Arg[1] : arrayref $arr
753 Arg[2] : arrayref $ref
754 Example : my $bitval = array_to_bitval(['hapmap','precious'],['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']);
755 Description : Takes a reference to an array as input and return a bit value representing the
756 combination of elements from a reference array. c.f. the SET datatype in MySQL
757 ReturnType : bitvalue that represents the combination of elements in the reference array specified in the given array
758 Exceptions : none
759 Caller : get_validation_code
760
761 =cut
762
763 sub array_to_bitval {
764 my $arr = shift;
765 my $ref = shift;
766
767 #ÊEnsure that we have array references
768 $arr = wrap_array($arr);
769 $ref = wrap_array($ref);
770
771 #ÊTurn the reference array into a hash, the values will correspond to 2 raised to the power of the position in the array
772 my $i=0;
773 my %ref_hash = map {lc($_) => $i++;} @{$ref};
774
775 #ÊSet the bitval
776 my $bitval = 0;
777 foreach my $a (@{$arr}) {
778
779 my $pos = $ref_hash{lc($a)};
780 if (defined($pos)) {
781 $bitval |= 2**$pos;
782 }
783 # Warn if the element is not present in the reference array
784 else {
785 warning("$a is not a recognised element. Recognised elements are: " . join(",",@{$ref}));
786 }
787 }
788
789 return $bitval;
790 }
791
792 =head2 bitval_to_array
793
794 Arg [1] : int $bitval
795 Arg [2] : arrayref $ref
796 Example : my $arr = bitval_to_array(6,['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']);
797 : print join(",",@{$arr}); #ÊWill print 'freq,submitter'
798 Description: Returns an array with the combination of elements from the reference array specified by the supplied bitvalue.
799 c.f. the SET datatype in MySQL
800 Returntype : reference to list of strings
801 Exceptions : none
802 Caller : get_all_validation_states
803
804 =cut
805
806 sub bitval_to_array {
807 my $bitval = shift || 0;
808 my $ref = shift;
809
810 #ÊEnsure that we have array references
811 $ref = wrap_array($ref);
812
813 # convert the bit value into an ordered array
814 my @arr;
815 for (my $i = 0; $i < @{$ref}; $i++) {
816 push(@arr,$ref->[$i]) if ((1 << $i) & $bitval);
817 }
818
819 return \@arr;
820 }
821
822
823 =head2 add_validation_state
824
825 Arg [1] : string $state
826 Example : add_validation_state('cluster');
827 Description: Adds a validation state to this variation.
828 Returntype : none
829 Exceptions : warning if validation state is not a recognised type
830 Caller : general
831 Status : At Risk
832
833 =cut
834
835 sub add_validation_state {
836 my $obj = shift;
837 my $state = shift;
838
839 #ÊGet the bitvalue for the new state
840 my $newbit = get_validation_code($state) || 0;
841
842 #ÊBit-add it to the current validation_code
843 my $oldbit = $obj->{'validation_code'} || 0;
844 $newbit |= $oldbit;
845
846 # Set the validation_code
847 $obj->{'validation_code'} = $newbit;
848
849 return;
850 }
851
852 =head2 get_all_validation_states
853
854 Arg [1] : int $bitval
855 Example : my @vstates = @{get_all_validation_states($var->{'validation_code'})};
856 Description: Retrieves all validation states for a specified bit value.
857 Returntype : reference to list of strings
858 Exceptions : none
859 Caller : general
860
861 =cut
862
863 sub get_all_validation_states {
864 return bitval_to_array(shift,\@VALIDATION_STATES);
865 }
866
867 =head2 get_validation_code
868
869 Arg [1] : arrayref $validation_status
870 Example : $var->{'validation_code'} = get_validation_code(['submitter','precious']);
871 Description: Retrieves the bit value for a combination of validation statuses.
872 Returntype : int
873 Exceptions : none
874 Caller : Variation::new
875
876 =cut
877
878 sub get_validation_code {
879 return array_to_bitval(shift,\@VALIDATION_STATES);
880 }
881
882 1;