Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/Sequence.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 4 Genome Research Limited. All rights reserved. | |
| 5 | |
| 6 This software is distributed under a modified Apache license. | |
| 7 For license details, please see | |
| 8 | |
| 9 http://www.ensembl.org/info/about/code_licence.html | |
| 10 | |
| 11 =head1 CONTACT | |
| 12 | |
| 13 Please email comments or questions to the public Ensembl | |
| 14 developers list at <dev@ensembl.org>. | |
| 15 | |
| 16 Questions may also be sent to the Ensembl help desk at | |
| 17 <helpdesk@ensembl.org>. | |
| 18 | |
| 19 =cut | |
| 20 | |
| 21 # EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence | |
| 22 # | |
| 23 # | |
| 24 | |
| 25 =head1 NAME | |
| 26 | |
| 27 Bio::EnsEMBL::Variation::Utils::Sequence - Utility functions for sequences | |
| 28 | |
| 29 =head1 SYNOPSIS | |
| 30 | |
| 31 use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code variation_class); | |
| 32 | |
| 33 my $alleles = 'A|C'; | |
| 34 | |
| 35 print "my alleles = $alleles\n"; | |
| 36 | |
| 37 my $ambig_code = ambiguity_code($alleles); | |
| 38 | |
| 39 print "my ambiguity code is $ambig_code\n"; | |
| 40 | |
| 41 print "my SNP class is = variation_class($alleles)"; | |
| 42 | |
| 43 | |
| 44 =head1 METHODS | |
| 45 | |
| 46 =cut | |
| 47 | |
| 48 | |
| 49 use strict; | |
| 50 use warnings; | |
| 51 | |
| 52 package Bio::EnsEMBL::Variation::Utils::Sequence; | |
| 53 | |
| 54 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
| 55 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
| 56 use Bio::EnsEMBL::Variation::Utils::Constants qw(:SO_class_terms); | |
| 57 use Bio::EnsEMBL::Utils::Scalar qw(wrap_array); | |
| 58 use Exporter; | |
| 59 | |
| 60 use vars qw(@ISA @EXPORT_OK); | |
| 61 | |
| 62 @ISA = qw(Exporter); | |
| 63 | |
| 64 @EXPORT_OK = qw( | |
| 65 &ambiguity_code | |
| 66 &variation_class | |
| 67 &unambiguity_code | |
| 68 &sequence_with_ambiguity | |
| 69 &hgvs_variant_notation | |
| 70 &format_hgvs_string | |
| 71 &SO_variation_class | |
| 72 &align_seqs | |
| 73 &strain_ambiguity_code | |
| 74 &get_all_validation_states | |
| 75 &get_validation_code | |
| 76 &add_validation_state | |
| 77 ); | |
| 78 | |
| 79 # List of validation states. Order must match that of set in database | |
| 80 our @VALIDATION_STATES = qw(cluster freq submitter doublehit hapmap 1000Genome failed precious); | |
| 81 | |
| 82 =head2 ambiguity_code | |
| 83 | |
| 84 Arg[1] : string $alleles | |
| 85 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code) | |
| 86 my $alleles = 'A|C'; | |
| 87 my $ambig_code = ambiguity_code($alleles); | |
| 88 print "the ambiguity code for $alleles is: ",$ambig_code; | |
| 89 Description : returns the ambiguity code for a SNP allele | |
| 90 ReturnType : String | |
| 91 The ambiguity code | |
| 92 Exceptions : None | |
| 93 Caller : Variation, VariationFeature | |
| 94 | |
| 95 =cut | |
| 96 | |
| 97 sub ambiguity_code { | |
| 98 my $alleles = shift; | |
| 99 my %duplicates; #hash containing all alleles to remove duplicates | |
| 100 | |
| 101 foreach my $a(split /[\|\/\\]/, $alleles) { | |
| 102 # convert Ns | |
| 103 my @a = ($a eq 'N' ? qw(A C G T) : ($a)); | |
| 104 map {$duplicates{$_}++} @a; | |
| 105 } | |
| 106 $alleles = uc( join '', sort keys %duplicates ); | |
| 107 #my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y | |
| 108 #GT K C C A A T T G G - - -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-' | |
| 109 my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y GT K C C A A T T G G - -); | |
| 110 return $ambig{$alleles}; | |
| 111 } | |
| 112 | |
| 113 =head2 strain_ambiguity_code | |
| 114 | |
| 115 Arg[1] : string $alleles (separated by "/", "\" or "|") | |
| 116 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(strain_ambiguity_code) | |
| 117 my $alleles = 'A|C'; | |
| 118 my $ambig_code = strain_ambiguity_code($alleles); | |
| 119 print "the ambiguity code for $alleles is: ",$ambig_code; | |
| 120 Description : returns the ambiguity code for a strain genotype | |
| 121 ReturnType : String | |
| 122 Exceptions : None | |
| 123 Caller : AlleleFeatureAdaptor | |
| 124 | |
| 125 =cut | |
| 126 | |
| 127 sub strain_ambiguity_code { | |
| 128 my $alleles = shift; | |
| 129 | |
| 130 # return normal ambiguity code for a SNP | |
| 131 return ambiguity_code($alleles) if($alleles =~ /^[ACGT][\|\/\\][ACGT]$/); | |
| 132 | |
| 133 # get alleles | |
| 134 my ($a1, $a2) = split /[\|\/\\]/, $alleles; | |
| 135 | |
| 136 # pad | |
| 137 if(length($a1) > length($a2)) { | |
| 138 $a2 .= '-' x (length($a1) - length($a2)); | |
| 139 } | |
| 140 else { | |
| 141 $a1 .= '-' x (length($a2) - length($a1)); | |
| 142 } | |
| 143 | |
| 144 # build ambiguity code base by base | |
| 145 my $ambig = ''; | |
| 146 | |
| 147 for my $i(0..(length($a1) - 1)) { | |
| 148 my $b1 = substr($a1, $i, 1); | |
| 149 my $b2 = substr($a2, $i, 1); | |
| 150 | |
| 151 # -/- = - | |
| 152 if($b1 eq '-' && $b2 eq '-') { | |
| 153 $ambig .= '-'; | |
| 154 } | |
| 155 | |
| 156 # G/- = g | |
| 157 elsif($b1 eq '-') { | |
| 158 $ambig .= lc($b2); | |
| 159 } | |
| 160 | |
| 161 # -/G = g | |
| 162 elsif($b2 eq '-') { | |
| 163 $ambig .= lc($b1); | |
| 164 } | |
| 165 | |
| 166 # A/G = R | |
| 167 else { | |
| 168 $ambig .= ambiguity_code($b1.'|'.$b2); | |
| 169 } | |
| 170 } | |
| 171 | |
| 172 return $ambig; | |
| 173 } | |
| 174 | |
| 175 =head2 unambiguity_code | |
| 176 | |
| 177 Arg[1] : string $alleles | |
| 178 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code) | |
| 179 my $ambiguity_code = 'M'; | |
| 180 my $alleles = unambiguity_code($ambiguity_code); | |
| 181 print "the alleles for ambiguity code $ambiguity_code is: ",$alleles; | |
| 182 Description : returns the alleles for an ambiguity code | |
| 183 ReturnType : String | |
| 184 The Alleles, alphabetically sorted and in capital | |
| 185 Exceptions : None | |
| 186 Caller : Variation, VariationFeature | |
| 187 | |
| 188 =cut | |
| 189 | |
| 190 sub unambiguity_code { | |
| 191 my $ambiguity_code = shift; | |
| 192 | |
| 193 #my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K | |
| 194 #GT C CC A AA T TT G GG - -- -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-' | |
| 195 my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K GT C CC A AA T TT G GG - --); | |
| 196 return $unambig{$ambiguity_code}; | |
| 197 } | |
| 198 | |
| 199 | |
| 200 =head2 variation_class | |
| 201 | |
| 202 Arg[1] : string $alleles | |
| 203 Arg[2] : boolean $is_somatic - flag that this variation is somatic | |
| 204 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (variation_class) | |
| 205 my $alleles = 'A|C'; | |
| 206 my $variation_class = variation_class($alleles); | |
| 207 print "the variation class for the alleles $alleles is: ",$variation_class; | |
| 208 Description : return the class of the alleles according to dbSNP classification(SNP,indel,mixed,substitution...) | |
| 209 ReturnType : String. The class of the alleles | |
| 210 Exceptions : none | |
| 211 Caller : Variation, VariationFeature | |
| 212 | |
| 213 =cut | |
| 214 | |
| 215 sub variation_class{ | |
| 216 | |
| 217 my ($alleles, $is_somatic) = @_; | |
| 218 | |
| 219 my $class; | |
| 220 | |
| 221 if ($alleles =~ /^[ACGTN]([\|\\\/][ACGTN])+$/i) { | |
| 222 $class = 'snp'; | |
| 223 } | |
| 224 elsif (($alleles eq 'cnv') || ($alleles eq 'CNV')) { | |
| 225 $class = 'cnv'; | |
| 226 } | |
| 227 elsif ($alleles =~ /CNV\_PROBE/i) { | |
| 228 $class = 'cnv probe'; | |
| 229 } | |
| 230 elsif ($alleles =~ /HGMD\_MUTATION/i) { | |
| 231 $class = 'hgmd_mutation'; | |
| 232 } | |
| 233 else { | |
| 234 my @alleles = split /[\|\/\\]/, $alleles; | |
| 235 | |
| 236 if (@alleles == 1) { | |
| 237 #(HETEROZYGOUS) 1 allele | |
| 238 $class = 'het'; | |
| 239 } | |
| 240 elsif(@alleles == 2) { | |
| 241 if ((($alleles[0] =~ tr/ACTGN//)== length($alleles[0]) && ($alleles[1] =~ tr/-//) == 1) || | |
| 242 (($alleles[0] =~ tr/-//) == 1 && ($alleles[1] =~ tr/ACTGN//) == length($alleles[1])) ){ | |
| 243 #A/- 2 alleles | |
| 244 $class = 'in-del' | |
| 245 } | |
| 246 elsif (($alleles[0] =~ /LARGE|INS|DEL/) || ($alleles[1] =~ /LARGE|INS|DEL/)){ | |
| 247 #(LARGEDELETION) 2 alleles | |
| 248 $class = 'named' | |
| 249 } | |
| 250 elsif (($alleles[0] =~ tr/ACTG//) > 1 || ($alleles[1] =~ tr/ACTG//) > 1){ | |
| 251 #AA/GC 2 alleles | |
| 252 $class = 'substitution' | |
| 253 } | |
| 254 else { | |
| 255 warning("not possible to determine class for @alleles"); | |
| 256 $class = ''; | |
| 257 } | |
| 258 } | |
| 259 elsif (@alleles > 2) { | |
| 260 | |
| 261 if ($alleles[0] =~ /\d+/) { | |
| 262 #(CA)14/15/16/17 > 2 alleles, all of them contain the number of repetitions of the allele | |
| 263 $class = 'microsat' | |
| 264 } | |
| 265 | |
| 266 elsif ((grep {/-/} @alleles) > 0) { | |
| 267 #-/A/T/TTA > 2 alleles | |
| 268 $class = 'mixed' | |
| 269 } | |
| 270 else { | |
| 271 # warning("not possible to determine class of alleles " . @alleles); | |
| 272 $class = ''; | |
| 273 } | |
| 274 } | |
| 275 else{ | |
| 276 warning("no alleles available "); | |
| 277 $class = ''; | |
| 278 } | |
| 279 } | |
| 280 | |
| 281 if ($is_somatic) { | |
| 282 if ($class eq '') { | |
| 283 # for undetermined classes just call it somatic | |
| 284 $class = 'somatic'; | |
| 285 } | |
| 286 else { | |
| 287 # somatic mutations aren't polymorphisms, so change SNPs to SNVs | |
| 288 $class = 'snv' if $class eq 'snp'; | |
| 289 | |
| 290 # and prefix the class with 'somatic' | |
| 291 $class = 'somatic_'.$class; | |
| 292 } | |
| 293 } | |
| 294 | |
| 295 return $class; | |
| 296 } | |
| 297 | |
| 298 =head2 SO_variation_class | |
| 299 | |
| 300 Arg[1] : string $alleles - a slash ()'/') separated list of alleles, the first allele is | |
| 301 assumed to be the reference unless the $ref_correct argument is false | |
| 302 Arg[2] : boolean $ref_correct - flags that the first allele is not known to be the | |
| 303 reference sequence (so we can't call insertions or deletions and have to | |
| 304 resort to 'indel') | |
| 305 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (SO_variation_class) | |
| 306 my $alleles = 'A/C'; | |
| 307 my $SO_term = SO_variation_class($alleles); | |
| 308 print "the SO term for the alleles $alleles is: ",$SO_term; | |
| 309 Description : return the SO term for the class of the alleles | |
| 310 ReturnType : String. The SO term for the class of the alleles | |
| 311 Exceptions : none | |
| 312 Caller : Variation, VariationFeature | |
| 313 | |
| 314 =cut | |
| 315 | |
| 316 sub SO_variation_class { | |
| 317 | |
| 318 my $alleles = shift; | |
| 319 my $ref_correct = shift; | |
| 320 | |
| 321 $ref_correct = 1 unless defined $ref_correct; | |
| 322 | |
| 323 my $allele_class = '[A-Z]'; | |
| 324 | |
| 325 # default to sequence_alteration | |
| 326 my $class = SO_TERM_SEQUENCE_ALTERATION; | |
| 327 | |
| 328 if ($alleles =~ /^$allele_class(\/$allele_class)+$/) { | |
| 329 # A/T, A/T/G | |
| 330 $class = SO_TERM_SNV; | |
| 331 } | |
| 332 elsif ($alleles =~ /^$allele_class+(\/$allele_class+)+$/) { | |
| 333 # AA/TTT | |
| 334 $class = SO_TERM_SUBSTITUTION; | |
| 335 } | |
| 336 elsif ($alleles =~ /\)\d+/) { | |
| 337 # (CAG)8/(CAG)9 | |
| 338 $class = SO_TERM_TANDEM_REPEAT; | |
| 339 } | |
| 340 else { | |
| 341 my @alleles = split /\//, $alleles; | |
| 342 | |
| 343 if (@alleles > 1) { | |
| 344 | |
| 345 my $ref = shift @alleles; | |
| 346 | |
| 347 if ($ref eq '-') { | |
| 348 | |
| 349 if (@alleles == 1 && $alleles[0] =~ /DEL/) { | |
| 350 # -/(LARGEDELETION) (rather oddly!) | |
| 351 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; | |
| 352 } | |
| 353 | |
| 354 unless (grep { $_ !~ /^$allele_class+$|INS/ } @alleles) { | |
| 355 # -/ATT, -/(LARGEINSERTION) | |
| 356 $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL; | |
| 357 } | |
| 358 | |
| 359 # else must be mixed insertion and deletion, so just called sequence_alteration | |
| 360 } | |
| 361 elsif ($ref =~ /^$allele_class+$/) { | |
| 362 unless (grep { $_ !~ /-|DEL/ } @alleles) { | |
| 363 # A/-, A/(LARGEDELETION) | |
| 364 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; | |
| 365 } | |
| 366 } | |
| 367 elsif ($ref =~ /DEL/) { | |
| 368 unless (grep { $_ !~ /-/ } @alleles) { | |
| 369 # (LARGEDELETION)/-, (2345 BP DELETION)/- | |
| 370 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; | |
| 371 } | |
| 372 } | |
| 373 } | |
| 374 elsif (@alleles == 1) { | |
| 375 if ($alleles[0] =~ /INS/) { | |
| 376 # (LARGEINSERTION) | |
| 377 $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL; | |
| 378 } | |
| 379 elsif($alleles[0] =~ /DEL/) { | |
| 380 # (308 BP DELETION) | |
| 381 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL; | |
| 382 } | |
| 383 } | |
| 384 } | |
| 385 | |
| 386 return $class; | |
| 387 } | |
| 388 | |
| 389 =head2 sequence_with_ambiguity | |
| 390 | |
| 391 Arg[1] : Bio::EnsEMBL::DBSQL::DBAdaptor $dbCore | |
| 392 Arg[2] : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor $dbVar | |
| 393 Arg[3] : string $chr | |
| 394 Arg[4] : int $start | |
| 395 Arg[5] : int $end | |
| 396 Arg[6] : int $strand | |
| 397 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (sequence_with_ambiguity) | |
| 398 my $slice = sequence_with_ambiguity($dbCore,$dbVar,1,100,200); | |
| 399 print "the sequence with ambiguity code for your region is: ",$slice->seq() | |
| 400 Description : given a region, returns a Bio::EnsEMBL::Slice object with | |
| 401 the sequence set with ambiguity codes | |
| 402 ReturnType : Bio::EnsEMBL::Slice object | |
| 403 Exceptions : none | |
| 404 Caller : general | |
| 405 | |
| 406 =cut | |
| 407 | |
| 408 sub sequence_with_ambiguity{ | |
| 409 my ($dbCore,$dbVar,$chr,$start,$end,$strand) = @_; | |
| 410 | |
| 411 my $slice; | |
| 412 if (ref($dbCore) ne 'Bio::EnsEMBL::DBSQL::DBAdaptor'){ | |
| 413 warning('You need to provide a Bio::EnsEMBL::DBSQL::DBAdaptor as a first argument'); | |
| 414 return $slice; | |
| 415 } | |
| 416 if (ref($dbVar) ne 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor'){ | |
| 417 warning('You need to provide a Bio::EnsEMBL::Variation::DBSQL::DBAdaptor object as second argument'); | |
| 418 return $slice; | |
| 419 } | |
| 420 my $slice_adaptor = $dbCore->get_SliceAdaptor(); | |
| 421 my $vf_adaptor = $dbVar->get_VariationFeatureAdaptor; | |
| 422 $slice = $slice_adaptor->fetch_by_region('chromosome',$chr,$start,$end,$strand); #get the slice | |
| 423 my $seq = $slice->seq; | |
| 424 foreach my $vf (@{$vf_adaptor->fetch_all_by_Slice($slice)}){ | |
| 425 substr($seq,$vf->start-1,1,$vf->ambig_code); | |
| 426 } | |
| 427 $slice->{'seq'} = $seq; | |
| 428 return $slice; | |
| 429 } | |
| 430 | |
| 431 =head2 hgvs_variant_notation | |
| 432 | |
| 433 Arg[1] : string $alt_allele | |
| 434 Arg[2] : string $ref_sequence | |
| 435 Arg[3] : int $ref_start | |
| 436 Arg[4] : int $ref_end | |
| 437 Arg[5] : int $display_start (optional) | |
| 438 Arg[6] : int $display_end (optional) | |
| 439 | |
| 440 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (hgvs_variant_notation) | |
| 441 my $alt_allele = 'A'; | |
| 442 my $ref_sequence = 'CCGTGATGTGC'; | |
| 443 my $ref_start = 4; | |
| 444 my $ref_end = 4; | |
| 445 my $ref_name = 'test_seq'; | |
| 446 my $ref_type = 'g'; | |
| 447 my $notation = hgvs_variant_notation($alt_allele,$ref_sequence,$ref_start,$ref_end); | |
| 448 print "HGVS notation of your variant: $ref_name\:$ref_type\." . $notation->{'hgvs'}; | |
| 449 | |
| 450 Description : Given an allele, a reference sequence and position of variant, returns a reference to a hash containing metadata and a | |
| 451 string with HGVS notation of the variant. Returns undef if reference and variant alleles are identical. | |
| 452 The optional display_start and display_end, if specified, will be used in the notation instead of the ref_start and ref_end. | |
| 453 This can be useful, e.g. if we want coordinates relative to chromosome but don't want to pass the entire chromosome sequence | |
| 454 into the subroutine. | |
| 455 The data fields in the returned hash are: | |
| 456 'start' -> Displayed start position of variant | |
| 457 'end' -> Displayed end position of variant | |
| 458 'ref' -> Reference allele | |
| 459 'alt' -> Alternative allele | |
| 460 'type' -> The variant class, e.g. ins, inv, >, delins | |
| 461 'hgvs' -> A string with HGVS notation | |
| 462 ReturnType : reference to a hash | |
| 463 Exceptions : If the length of the interval to be displayed is different from the length of the reference allele | |
| 464 Caller : general | |
| 465 | |
| 466 =cut | |
| 467 sub hgvs_variant_notation { | |
| 468 my $alt_allele = shift; | |
| 469 my $ref_sequence = shift; | |
| 470 my $ref_start = shift; | |
| 471 my $ref_end = shift; | |
| 472 my $display_start = shift; | |
| 473 my $display_end = shift; | |
| 474 | |
| 475 # If display_start and display_end were not specified, use ref_start and ref_end | |
| 476 $display_start ||= $ref_start; | |
| 477 $display_end ||= $ref_end; | |
| 478 | |
| 479 #ÊThrow an exception if the lengths of the display interval and reference interval are different | |
| 480 throw("The coordinate interval for display is of different length than for the reference allele") if (($display_end - $display_start) != ($ref_end - $ref_start)); | |
| 481 | |
| 482 # Length of the reference allele. Negative lengths make no sense | |
| 483 my $ref_length = ($ref_end - $ref_start + 1); | |
| 484 if ($ref_length < 0) { | |
| 485 $ref_length = 0; | |
| 486 } | |
| 487 | |
| 488 # Remove any gap characters in the alt allele | |
| 489 $alt_allele =~ s/\-//g; | |
| 490 | |
| 491 # Length of alternative allele | |
| 492 my $alt_length = length($alt_allele); | |
| 493 | |
| 494 # Get the reference allele | |
| 495 my $ref_allele = substr($ref_sequence,($ref_start-1),$ref_length); | |
| 496 | |
| 497 # Check that the alleles are different, otherwise return undef | |
| 498 return undef unless ($ref_allele ne $alt_allele); | |
| 499 | |
| 500 # Store the notation in a hash that will be returned | |
| 501 my %notation; | |
| 502 $notation{'start'} = $display_start; | |
| 503 $notation{'end'} = $display_end; | |
| 504 $notation{'ref'} = $ref_allele; | |
| 505 $notation{'alt'} = $alt_allele; | |
| 506 | |
| 507 # The simplest case is a deletion | |
| 508 if (!$alt_length) { | |
| 509 $notation{'type'} = 'del'; | |
| 510 | |
| 511 # Return the notation | |
| 512 return \%notation; | |
| 513 } | |
| 514 | |
| 515 # Another case is if the allele lengths are equal | |
| 516 if ($ref_length == $alt_length) { | |
| 517 | |
| 518 # If length is 1 it's a single substitution | |
| 519 if ($ref_length == 1) { | |
| 520 $notation{'type'} = '>'; | |
| 521 return \%notation; | |
| 522 } | |
| 523 | |
| 524 # Check if it's an inversion | |
| 525 my $rev_ref = $ref_allele; | |
| 526 reverse_comp(\$rev_ref); | |
| 527 if ($alt_allele eq $rev_ref) { | |
| 528 $notation{'type'} = 'inv'; | |
| 529 return \%notation; | |
| 530 } | |
| 531 | |
| 532 $notation{'type'} = 'delins'; | |
| 533 | |
| 534 return \%notation; | |
| 535 } | |
| 536 | |
| 537 # If this is an insertion, we should check if the preceeding reference nucleotides match the insertion. In that case it should be annotated as a multiplication. | |
| 538 if (!$ref_length) { | |
| 539 | |
| 540 # Get the same number of nucleotides preceding the insertion as the length of the insertion | |
| 541 my $prev_str = substr($ref_sequence,($ref_end-$alt_length),$alt_length); | |
| 542 | |
| 543 # If they match, this is a duplication | |
| 544 if ($prev_str eq $alt_allele) { | |
| 545 | |
| 546 $notation{'start'} = ($display_end - $alt_length + 1); | |
| 547 $notation{'type'} = 'dup'; | |
| 548 $notation{'ref'} = $prev_str; | |
| 549 # Return the notation | |
| 550 return \%notation; | |
| 551 } | |
| 552 | |
| 553 # If they didn't match it's a plain insertion | |
| 554 $notation{'start'} = $display_end; | |
| 555 $notation{'end'} = $display_start; | |
| 556 $notation{'type'} = 'ins'; | |
| 557 | |
| 558 return \%notation; | |
| 559 } | |
| 560 | |
| 561 # Otherwise, the reference and allele are of different lengths. By default, this is a delins but | |
| 562 # we need to check if the alt allele is a multiplication of the reference | |
| 563 # Check if the length of the alt allele is a multiple of the reference allele | |
| 564 if ($alt_length%$ref_length == 0) { | |
| 565 my $multiple = ($alt_length / $ref_length); | |
| 566 if ($alt_allele eq ($ref_allele x $multiple)) { | |
| 567 if ($multiple == 2) { | |
| 568 $notation{'type'} = 'dup'; | |
| 569 } | |
| 570 else { | |
| 571 $notation{'type'} = '[' . $multiple . ']'; | |
| 572 } | |
| 573 return \%notation; | |
| 574 } | |
| 575 } | |
| 576 | |
| 577 # Else, it's gotta be a delins | |
| 578 $notation{'type'} = 'delins'; | |
| 579 | |
| 580 return \%notation; | |
| 581 } | |
| 582 | |
| 583 | |
| 584 =head2 format_hgvs_string | |
| 585 | |
| 586 Arg[1] : string reference sequence name | |
| 587 Arg[2] : string strand | |
| 588 Arg[3] : hash of hgvs information | |
| 589 Example : | |
| 590 Description : Creates HGVS formatted string from input hash | |
| 591 ReturnType : string in HGVS format | |
| 592 Exceptions : | |
| 593 Caller : | |
| 594 | |
| 595 =cut | |
| 596 | |
| 597 sub format_hgvs_string{ | |
| 598 ##### generic formatting routine for genomic and coding HGVS names | |
| 599 | |
| 600 my $hgvs_notation = shift; | |
| 601 | |
| 602 ### all start with refseq name & numbering type | |
| 603 $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . "."; | |
| 604 | |
| 605 my $coordinates; | |
| 606 #### if single base event, list position only once | |
| 607 if($hgvs_notation->{'start'} eq $hgvs_notation->{'end'}){ | |
| 608 $coordinates = $hgvs_notation->{'start'}; | |
| 609 } | |
| 610 else{ | |
| 611 $coordinates = $hgvs_notation->{'start'} . "_" . $hgvs_notation->{'end'}; | |
| 612 } | |
| 613 | |
| 614 ##### format rest of string according to type | |
| 615 | |
| 616 if($hgvs_notation->{'type'} eq 'del' || $hgvs_notation->{'type'} eq 'inv' || $hgvs_notation->{'type'} eq 'dup'){ | |
| 617 ### inversion of reference bases => list ref not alt | |
| 618 ### deletion of reference bases => list ref lost | |
| 619 ### duplication of reference bases (eg ref = GAAA alt = GAAAGAAA) => list duplicated ref (dupGAAA) | |
| 620 $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'}; | |
| 621 } | |
| 622 | |
| 623 elsif( $hgvs_notation->{'type'} eq '>'){ | |
| 624 ### substitution - list both alleles | |
| 625 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{'start'} . $hgvs_notation->{'ref'} . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'}; | |
| 626 } | |
| 627 | |
| 628 elsif( $hgvs_notation->{'type'} eq 'delins'){ | |
| 629 $hgvs_notation->{'hgvs'} .= $coordinates . 'del' . $hgvs_notation->{'ref'} . 'ins' . $hgvs_notation->{'alt'}; | |
| 630 } | |
| 631 | |
| 632 elsif($hgvs_notation->{'type'} eq 'ins'){ | |
| 633 ## reference not listed | |
| 634 $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'}; | |
| 635 } | |
| 636 | |
| 637 elsif($hgvs_notation->{'type'} =~ /\[\d+\]/){ | |
| 638 #### insertion described by string and number | |
| 639 $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'}; | |
| 640 } | |
| 641 | |
| 642 else{ | |
| 643 warn "PROBLEM with generic HGVS formatter - type = ". $hgvs_notation->{'type'} ."\n"; | |
| 644 } | |
| 645 | |
| 646 return $hgvs_notation->{'hgvs'}; | |
| 647 | |
| 648 } | |
| 649 | |
| 650 =head2 align_seqs | |
| 651 | |
| 652 Arg[1] : string $seq1 | |
| 653 Arg[2] : string $seq2 | |
| 654 Example : my $aligned_seqs = align_seqs($seq1, $seq2); | |
| 655 Description : Does a simple NW align of two sequence strings. Best used on | |
| 656 short (<1000bp) sequences, otherwise runtime will be long | |
| 657 ReturnType : arrayref to a pair of strings | |
| 658 Exceptions : none | |
| 659 Caller : web flanking sequence display | |
| 660 | |
| 661 =cut | |
| 662 | |
| 663 sub align_seqs { | |
| 664 my $seq1 = shift; | |
| 665 my $seq2 = shift; | |
| 666 | |
| 667 # align parameters | |
| 668 my $match = 10; | |
| 669 my $mismatch = -10; | |
| 670 my $gep = -10; | |
| 671 | |
| 672 # split sequences into arrays | |
| 673 my @split1 = split //, $seq1; | |
| 674 my @split2 = split //, $seq2; | |
| 675 | |
| 676 # evaluate substitutions | |
| 677 my $len1 = length($seq1); | |
| 678 my $len2 = length($seq2); | |
| 679 | |
| 680 my (@smat, @tb); | |
| 681 | |
| 682 for (my $i=0; $i<=$len1; $i++) { | |
| 683 $smat[$i][0] = $i * $gep; | |
| 684 $tb[$i][0] = 1; | |
| 685 } | |
| 686 for (my $j=0; $j<=$len2; $j++) { | |
| 687 $smat[0][$j] = $j * $gep; | |
| 688 $tb[0][$j] = -1; | |
| 689 } | |
| 690 | |
| 691 my ($s, $sub, $del, $ins); | |
| 692 | |
| 693 for (my $i=1; $i<=$len1; $i++) { | |
| 694 for (my $j=1; $j<=$len2; $j++) { | |
| 695 | |
| 696 # calculate score | |
| 697 if($split1[$i-1] eq $split2[$j-1]) { | |
| 698 $s = $match; | |
| 699 } | |
| 700 else { | |
| 701 $s = $mismatch; | |
| 702 } | |
| 703 | |
| 704 $sub = $smat[$i-1][$j-1] + $s; | |
| 705 $del = $smat[$i][$j-1] + $gep; | |
| 706 $ins = $smat[$i-1][$j] + $gep; | |
| 707 | |
| 708 if($sub > $del && $sub > $ins) { | |
| 709 $smat[$i][$j] = $sub; | |
| 710 $tb[$i][$j] = 0; | |
| 711 } | |
| 712 elsif($del > $ins) { | |
| 713 $smat[$i][$j] = $del; | |
| 714 $tb[$i][$j] = -1; | |
| 715 } | |
| 716 else { | |
| 717 $smat[$i][$j] = $ins; | |
| 718 $tb[$i][$j] = 1; | |
| 719 } | |
| 720 } | |
| 721 } | |
| 722 | |
| 723 | |
| 724 my $i = $len1; | |
| 725 my $j = $len2; | |
| 726 my $aln_len = 0; | |
| 727 my (@aln1, @aln2); | |
| 728 | |
| 729 while(!($i == 0 && $j == 0)) { | |
| 730 if($tb[$i][$j] == 0) { | |
| 731 $aln1[$aln_len] = $split1[--$i]; | |
| 732 $aln2[$aln_len] = $split2[--$j]; | |
| 733 } | |
| 734 elsif($tb[$i][$j] == -1) { | |
| 735 $aln1[$aln_len] = '-'; | |
| 736 $aln2[$aln_len] = $split2[--$j]; | |
| 737 } | |
| 738 elsif($tb[$i][$j] == 1) { | |
| 739 $aln1[$aln_len] = $split1[--$i]; | |
| 740 $aln2[$aln_len] = '-'; | |
| 741 } | |
| 742 | |
| 743 $aln_len++; | |
| 744 } | |
| 745 | |
| 746 return [(join "", reverse @aln1), (join "", reverse @aln2)]; | |
| 747 } | |
| 748 | |
| 749 | |
| 750 =head2 array_to_bitval | |
| 751 | |
| 752 Arg[1] : arrayref $arr | |
| 753 Arg[2] : arrayref $ref | |
| 754 Example : my $bitval = array_to_bitval(['hapmap','precious'],['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']); | |
| 755 Description : Takes a reference to an array as input and return a bit value representing the | |
| 756 combination of elements from a reference array. c.f. the SET datatype in MySQL | |
| 757 ReturnType : bitvalue that represents the combination of elements in the reference array specified in the given array | |
| 758 Exceptions : none | |
| 759 Caller : get_validation_code | |
| 760 | |
| 761 =cut | |
| 762 | |
| 763 sub array_to_bitval { | |
| 764 my $arr = shift; | |
| 765 my $ref = shift; | |
| 766 | |
| 767 #ÊEnsure that we have array references | |
| 768 $arr = wrap_array($arr); | |
| 769 $ref = wrap_array($ref); | |
| 770 | |
| 771 #ÊTurn the reference array into a hash, the values will correspond to 2 raised to the power of the position in the array | |
| 772 my $i=0; | |
| 773 my %ref_hash = map {lc($_) => $i++;} @{$ref}; | |
| 774 | |
| 775 #ÊSet the bitval | |
| 776 my $bitval = 0; | |
| 777 foreach my $a (@{$arr}) { | |
| 778 | |
| 779 my $pos = $ref_hash{lc($a)}; | |
| 780 if (defined($pos)) { | |
| 781 $bitval |= 2**$pos; | |
| 782 } | |
| 783 # Warn if the element is not present in the reference array | |
| 784 else { | |
| 785 warning("$a is not a recognised element. Recognised elements are: " . join(",",@{$ref})); | |
| 786 } | |
| 787 } | |
| 788 | |
| 789 return $bitval; | |
| 790 } | |
| 791 | |
| 792 =head2 bitval_to_array | |
| 793 | |
| 794 Arg [1] : int $bitval | |
| 795 Arg [2] : arrayref $ref | |
| 796 Example : my $arr = bitval_to_array(6,['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']); | |
| 797 : print join(",",@{$arr}); #ÊWill print 'freq,submitter' | |
| 798 Description: Returns an array with the combination of elements from the reference array specified by the supplied bitvalue. | |
| 799 c.f. the SET datatype in MySQL | |
| 800 Returntype : reference to list of strings | |
| 801 Exceptions : none | |
| 802 Caller : get_all_validation_states | |
| 803 | |
| 804 =cut | |
| 805 | |
| 806 sub bitval_to_array { | |
| 807 my $bitval = shift || 0; | |
| 808 my $ref = shift; | |
| 809 | |
| 810 #ÊEnsure that we have array references | |
| 811 $ref = wrap_array($ref); | |
| 812 | |
| 813 # convert the bit value into an ordered array | |
| 814 my @arr; | |
| 815 for (my $i = 0; $i < @{$ref}; $i++) { | |
| 816 push(@arr,$ref->[$i]) if ((1 << $i) & $bitval); | |
| 817 } | |
| 818 | |
| 819 return \@arr; | |
| 820 } | |
| 821 | |
| 822 | |
| 823 =head2 add_validation_state | |
| 824 | |
| 825 Arg [1] : string $state | |
| 826 Example : add_validation_state('cluster'); | |
| 827 Description: Adds a validation state to this variation. | |
| 828 Returntype : none | |
| 829 Exceptions : warning if validation state is not a recognised type | |
| 830 Caller : general | |
| 831 Status : At Risk | |
| 832 | |
| 833 =cut | |
| 834 | |
| 835 sub add_validation_state { | |
| 836 my $obj = shift; | |
| 837 my $state = shift; | |
| 838 | |
| 839 #ÊGet the bitvalue for the new state | |
| 840 my $newbit = get_validation_code($state) || 0; | |
| 841 | |
| 842 #ÊBit-add it to the current validation_code | |
| 843 my $oldbit = $obj->{'validation_code'} || 0; | |
| 844 $newbit |= $oldbit; | |
| 845 | |
| 846 # Set the validation_code | |
| 847 $obj->{'validation_code'} = $newbit; | |
| 848 | |
| 849 return; | |
| 850 } | |
| 851 | |
| 852 =head2 get_all_validation_states | |
| 853 | |
| 854 Arg [1] : int $bitval | |
| 855 Example : my @vstates = @{get_all_validation_states($var->{'validation_code'})}; | |
| 856 Description: Retrieves all validation states for a specified bit value. | |
| 857 Returntype : reference to list of strings | |
| 858 Exceptions : none | |
| 859 Caller : general | |
| 860 | |
| 861 =cut | |
| 862 | |
| 863 sub get_all_validation_states { | |
| 864 return bitval_to_array(shift,\@VALIDATION_STATES); | |
| 865 } | |
| 866 | |
| 867 =head2 get_validation_code | |
| 868 | |
| 869 Arg [1] : arrayref $validation_status | |
| 870 Example : $var->{'validation_code'} = get_validation_code(['submitter','precious']); | |
| 871 Description: Retrieves the bit value for a combination of validation statuses. | |
| 872 Returntype : int | |
| 873 Exceptions : none | |
| 874 Caller : Variation::new | |
| 875 | |
| 876 =cut | |
| 877 | |
| 878 sub get_validation_code { | |
| 879 return array_to_bitval(shift,\@VALIDATION_STATES); | |
| 880 } | |
| 881 | |
| 882 1; |
