0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 # EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence
|
|
22 #
|
|
23 #
|
|
24
|
|
25 =head1 NAME
|
|
26
|
|
27 Bio::EnsEMBL::Variation::Utils::Sequence - Utility functions for sequences
|
|
28
|
|
29 =head1 SYNOPSIS
|
|
30
|
|
31 use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code variation_class);
|
|
32
|
|
33 my $alleles = 'A|C';
|
|
34
|
|
35 print "my alleles = $alleles\n";
|
|
36
|
|
37 my $ambig_code = ambiguity_code($alleles);
|
|
38
|
|
39 print "my ambiguity code is $ambig_code\n";
|
|
40
|
|
41 print "my SNP class is = variation_class($alleles)";
|
|
42
|
|
43
|
|
44 =head1 METHODS
|
|
45
|
|
46 =cut
|
|
47
|
|
48
|
|
49 use strict;
|
|
50 use warnings;
|
|
51
|
|
52 package Bio::EnsEMBL::Variation::Utils::Sequence;
|
|
53
|
|
54 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
55 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
|
|
56 use Bio::EnsEMBL::Variation::Utils::Constants qw(:SO_class_terms);
|
|
57 use Bio::EnsEMBL::Utils::Scalar qw(wrap_array);
|
|
58 use Exporter;
|
|
59
|
|
60 use vars qw(@ISA @EXPORT_OK);
|
|
61
|
|
62 @ISA = qw(Exporter);
|
|
63
|
|
64 @EXPORT_OK = qw(
|
|
65 &ambiguity_code
|
|
66 &variation_class
|
|
67 &unambiguity_code
|
|
68 &sequence_with_ambiguity
|
|
69 &hgvs_variant_notation
|
|
70 &format_hgvs_string
|
|
71 &SO_variation_class
|
|
72 &align_seqs
|
|
73 &strain_ambiguity_code
|
|
74 &get_all_validation_states
|
|
75 &get_validation_code
|
|
76 &add_validation_state
|
|
77 );
|
|
78
|
|
79 # List of validation states. Order must match that of set in database
|
|
80 our @VALIDATION_STATES = qw(cluster freq submitter doublehit hapmap 1000Genome failed precious);
|
|
81
|
|
82 =head2 ambiguity_code
|
|
83
|
|
84 Arg[1] : string $alleles
|
|
85 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code)
|
|
86 my $alleles = 'A|C';
|
|
87 my $ambig_code = ambiguity_code($alleles);
|
|
88 print "the ambiguity code for $alleles is: ",$ambig_code;
|
|
89 Description : returns the ambiguity code for a SNP allele
|
|
90 ReturnType : String
|
|
91 The ambiguity code
|
|
92 Exceptions : None
|
|
93 Caller : Variation, VariationFeature
|
|
94
|
|
95 =cut
|
|
96
|
|
97 sub ambiguity_code {
|
|
98 my $alleles = shift;
|
|
99 my %duplicates; #hash containing all alleles to remove duplicates
|
|
100
|
|
101 foreach my $a(split /[\|\/\\]/, $alleles) {
|
|
102 # convert Ns
|
|
103 my @a = ($a eq 'N' ? qw(A C G T) : ($a));
|
|
104 map {$duplicates{$_}++} @a;
|
|
105 }
|
|
106 $alleles = uc( join '', sort keys %duplicates );
|
|
107 #my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y
|
|
108 #GT K C C A A T T G G - - -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-'
|
|
109 my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y GT K C C A A T T G G - -);
|
|
110 return $ambig{$alleles};
|
|
111 }
|
|
112
|
|
113 =head2 strain_ambiguity_code
|
|
114
|
|
115 Arg[1] : string $alleles (separated by "/", "\" or "|")
|
|
116 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(strain_ambiguity_code)
|
|
117 my $alleles = 'A|C';
|
|
118 my $ambig_code = strain_ambiguity_code($alleles);
|
|
119 print "the ambiguity code for $alleles is: ",$ambig_code;
|
|
120 Description : returns the ambiguity code for a strain genotype
|
|
121 ReturnType : String
|
|
122 Exceptions : None
|
|
123 Caller : AlleleFeatureAdaptor
|
|
124
|
|
125 =cut
|
|
126
|
|
127 sub strain_ambiguity_code {
|
|
128 my $alleles = shift;
|
|
129
|
|
130 # return normal ambiguity code for a SNP
|
|
131 return ambiguity_code($alleles) if($alleles =~ /^[ACGT][\|\/\\][ACGT]$/);
|
|
132
|
|
133 # get alleles
|
|
134 my ($a1, $a2) = split /[\|\/\\]/, $alleles;
|
|
135
|
|
136 # pad
|
|
137 if(length($a1) > length($a2)) {
|
|
138 $a2 .= '-' x (length($a1) - length($a2));
|
|
139 }
|
|
140 else {
|
|
141 $a1 .= '-' x (length($a2) - length($a1));
|
|
142 }
|
|
143
|
|
144 # build ambiguity code base by base
|
|
145 my $ambig = '';
|
|
146
|
|
147 for my $i(0..(length($a1) - 1)) {
|
|
148 my $b1 = substr($a1, $i, 1);
|
|
149 my $b2 = substr($a2, $i, 1);
|
|
150
|
|
151 # -/- = -
|
|
152 if($b1 eq '-' && $b2 eq '-') {
|
|
153 $ambig .= '-';
|
|
154 }
|
|
155
|
|
156 # G/- = g
|
|
157 elsif($b1 eq '-') {
|
|
158 $ambig .= lc($b2);
|
|
159 }
|
|
160
|
|
161 # -/G = g
|
|
162 elsif($b2 eq '-') {
|
|
163 $ambig .= lc($b1);
|
|
164 }
|
|
165
|
|
166 # A/G = R
|
|
167 else {
|
|
168 $ambig .= ambiguity_code($b1.'|'.$b2);
|
|
169 }
|
|
170 }
|
|
171
|
|
172 return $ambig;
|
|
173 }
|
|
174
|
|
175 =head2 unambiguity_code
|
|
176
|
|
177 Arg[1] : string $alleles
|
|
178 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code)
|
|
179 my $ambiguity_code = 'M';
|
|
180 my $alleles = unambiguity_code($ambiguity_code);
|
|
181 print "the alleles for ambiguity code $ambiguity_code is: ",$alleles;
|
|
182 Description : returns the alleles for an ambiguity code
|
|
183 ReturnType : String
|
|
184 The Alleles, alphabetically sorted and in capital
|
|
185 Exceptions : None
|
|
186 Caller : Variation, VariationFeature
|
|
187
|
|
188 =cut
|
|
189
|
|
190 sub unambiguity_code {
|
|
191 my $ambiguity_code = shift;
|
|
192
|
|
193 #my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K
|
|
194 #GT C CC A AA T TT G GG - -- -A -A -C -C -G -G -T -T A- A- C- C- G- G- T- T-); #for now just make e.g. 'A-' -> 'A-'
|
|
195 my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K GT C CC A AA T TT G GG - --);
|
|
196 return $unambig{$ambiguity_code};
|
|
197 }
|
|
198
|
|
199
|
|
200 =head2 variation_class
|
|
201
|
|
202 Arg[1] : string $alleles
|
|
203 Arg[2] : boolean $is_somatic - flag that this variation is somatic
|
|
204 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (variation_class)
|
|
205 my $alleles = 'A|C';
|
|
206 my $variation_class = variation_class($alleles);
|
|
207 print "the variation class for the alleles $alleles is: ",$variation_class;
|
|
208 Description : return the class of the alleles according to dbSNP classification(SNP,indel,mixed,substitution...)
|
|
209 ReturnType : String. The class of the alleles
|
|
210 Exceptions : none
|
|
211 Caller : Variation, VariationFeature
|
|
212
|
|
213 =cut
|
|
214
|
|
215 sub variation_class{
|
|
216
|
|
217 my ($alleles, $is_somatic) = @_;
|
|
218
|
|
219 my $class;
|
|
220
|
|
221 if ($alleles =~ /^[ACGTN]([\|\\\/][ACGTN])+$/i) {
|
|
222 $class = 'snp';
|
|
223 }
|
|
224 elsif (($alleles eq 'cnv') || ($alleles eq 'CNV')) {
|
|
225 $class = 'cnv';
|
|
226 }
|
|
227 elsif ($alleles =~ /CNV\_PROBE/i) {
|
|
228 $class = 'cnv probe';
|
|
229 }
|
|
230 elsif ($alleles =~ /HGMD\_MUTATION/i) {
|
|
231 $class = 'hgmd_mutation';
|
|
232 }
|
|
233 else {
|
|
234 my @alleles = split /[\|\/\\]/, $alleles;
|
|
235
|
|
236 if (@alleles == 1) {
|
|
237 #(HETEROZYGOUS) 1 allele
|
|
238 $class = 'het';
|
|
239 }
|
|
240 elsif(@alleles == 2) {
|
|
241 if ((($alleles[0] =~ tr/ACTGN//)== length($alleles[0]) && ($alleles[1] =~ tr/-//) == 1) ||
|
|
242 (($alleles[0] =~ tr/-//) == 1 && ($alleles[1] =~ tr/ACTGN//) == length($alleles[1])) ){
|
|
243 #A/- 2 alleles
|
|
244 $class = 'in-del'
|
|
245 }
|
|
246 elsif (($alleles[0] =~ /LARGE|INS|DEL/) || ($alleles[1] =~ /LARGE|INS|DEL/)){
|
|
247 #(LARGEDELETION) 2 alleles
|
|
248 $class = 'named'
|
|
249 }
|
|
250 elsif (($alleles[0] =~ tr/ACTG//) > 1 || ($alleles[1] =~ tr/ACTG//) > 1){
|
|
251 #AA/GC 2 alleles
|
|
252 $class = 'substitution'
|
|
253 }
|
|
254 else {
|
|
255 warning("not possible to determine class for @alleles");
|
|
256 $class = '';
|
|
257 }
|
|
258 }
|
|
259 elsif (@alleles > 2) {
|
|
260
|
|
261 if ($alleles[0] =~ /\d+/) {
|
|
262 #(CA)14/15/16/17 > 2 alleles, all of them contain the number of repetitions of the allele
|
|
263 $class = 'microsat'
|
|
264 }
|
|
265
|
|
266 elsif ((grep {/-/} @alleles) > 0) {
|
|
267 #-/A/T/TTA > 2 alleles
|
|
268 $class = 'mixed'
|
|
269 }
|
|
270 else {
|
|
271 # warning("not possible to determine class of alleles " . @alleles);
|
|
272 $class = '';
|
|
273 }
|
|
274 }
|
|
275 else{
|
|
276 warning("no alleles available ");
|
|
277 $class = '';
|
|
278 }
|
|
279 }
|
|
280
|
|
281 if ($is_somatic) {
|
|
282 if ($class eq '') {
|
|
283 # for undetermined classes just call it somatic
|
|
284 $class = 'somatic';
|
|
285 }
|
|
286 else {
|
|
287 # somatic mutations aren't polymorphisms, so change SNPs to SNVs
|
|
288 $class = 'snv' if $class eq 'snp';
|
|
289
|
|
290 # and prefix the class with 'somatic'
|
|
291 $class = 'somatic_'.$class;
|
|
292 }
|
|
293 }
|
|
294
|
|
295 return $class;
|
|
296 }
|
|
297
|
|
298 =head2 SO_variation_class
|
|
299
|
|
300 Arg[1] : string $alleles - a slash ()'/') separated list of alleles, the first allele is
|
|
301 assumed to be the reference unless the $ref_correct argument is false
|
|
302 Arg[2] : boolean $ref_correct - flags that the first allele is not known to be the
|
|
303 reference sequence (so we can't call insertions or deletions and have to
|
|
304 resort to 'indel')
|
|
305 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (SO_variation_class)
|
|
306 my $alleles = 'A/C';
|
|
307 my $SO_term = SO_variation_class($alleles);
|
|
308 print "the SO term for the alleles $alleles is: ",$SO_term;
|
|
309 Description : return the SO term for the class of the alleles
|
|
310 ReturnType : String. The SO term for the class of the alleles
|
|
311 Exceptions : none
|
|
312 Caller : Variation, VariationFeature
|
|
313
|
|
314 =cut
|
|
315
|
|
316 sub SO_variation_class {
|
|
317
|
|
318 my $alleles = shift;
|
|
319 my $ref_correct = shift;
|
|
320
|
|
321 $ref_correct = 1 unless defined $ref_correct;
|
|
322
|
|
323 my $allele_class = '[A-Z]';
|
|
324
|
|
325 # default to sequence_alteration
|
|
326 my $class = SO_TERM_SEQUENCE_ALTERATION;
|
|
327
|
|
328 if ($alleles =~ /^$allele_class(\/$allele_class)+$/) {
|
|
329 # A/T, A/T/G
|
|
330 $class = SO_TERM_SNV;
|
|
331 }
|
|
332 elsif ($alleles =~ /^$allele_class+(\/$allele_class+)+$/) {
|
|
333 # AA/TTT
|
|
334 $class = SO_TERM_SUBSTITUTION;
|
|
335 }
|
|
336 elsif ($alleles =~ /\)\d+/) {
|
|
337 # (CAG)8/(CAG)9
|
|
338 $class = SO_TERM_TANDEM_REPEAT;
|
|
339 }
|
|
340 else {
|
|
341 my @alleles = split /\//, $alleles;
|
|
342
|
|
343 if (@alleles > 1) {
|
|
344
|
|
345 my $ref = shift @alleles;
|
|
346
|
|
347 if ($ref eq '-') {
|
|
348
|
|
349 if (@alleles == 1 && $alleles[0] =~ /DEL/) {
|
|
350 # -/(LARGEDELETION) (rather oddly!)
|
|
351 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
|
|
352 }
|
|
353
|
|
354 unless (grep { $_ !~ /^$allele_class+$|INS/ } @alleles) {
|
|
355 # -/ATT, -/(LARGEINSERTION)
|
|
356 $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL;
|
|
357 }
|
|
358
|
|
359 # else must be mixed insertion and deletion, so just called sequence_alteration
|
|
360 }
|
|
361 elsif ($ref =~ /^$allele_class+$/) {
|
|
362 unless (grep { $_ !~ /-|DEL/ } @alleles) {
|
|
363 # A/-, A/(LARGEDELETION)
|
|
364 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
|
|
365 }
|
|
366 }
|
|
367 elsif ($ref =~ /DEL/) {
|
|
368 unless (grep { $_ !~ /-/ } @alleles) {
|
|
369 # (LARGEDELETION)/-, (2345 BP DELETION)/-
|
|
370 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
|
|
371 }
|
|
372 }
|
|
373 }
|
|
374 elsif (@alleles == 1) {
|
|
375 if ($alleles[0] =~ /INS/) {
|
|
376 # (LARGEINSERTION)
|
|
377 $class = $ref_correct ? SO_TERM_INSERTION : SO_TERM_INDEL;
|
|
378 }
|
|
379 elsif($alleles[0] =~ /DEL/) {
|
|
380 # (308 BP DELETION)
|
|
381 $class = $ref_correct ? SO_TERM_DELETION : SO_TERM_INDEL;
|
|
382 }
|
|
383 }
|
|
384 }
|
|
385
|
|
386 return $class;
|
|
387 }
|
|
388
|
|
389 =head2 sequence_with_ambiguity
|
|
390
|
|
391 Arg[1] : Bio::EnsEMBL::DBSQL::DBAdaptor $dbCore
|
|
392 Arg[2] : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor $dbVar
|
|
393 Arg[3] : string $chr
|
|
394 Arg[4] : int $start
|
|
395 Arg[5] : int $end
|
|
396 Arg[6] : int $strand
|
|
397 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (sequence_with_ambiguity)
|
|
398 my $slice = sequence_with_ambiguity($dbCore,$dbVar,1,100,200);
|
|
399 print "the sequence with ambiguity code for your region is: ",$slice->seq()
|
|
400 Description : given a region, returns a Bio::EnsEMBL::Slice object with
|
|
401 the sequence set with ambiguity codes
|
|
402 ReturnType : Bio::EnsEMBL::Slice object
|
|
403 Exceptions : none
|
|
404 Caller : general
|
|
405
|
|
406 =cut
|
|
407
|
|
408 sub sequence_with_ambiguity{
|
|
409 my ($dbCore,$dbVar,$chr,$start,$end,$strand) = @_;
|
|
410
|
|
411 my $slice;
|
|
412 if (ref($dbCore) ne 'Bio::EnsEMBL::DBSQL::DBAdaptor'){
|
|
413 warning('You need to provide a Bio::EnsEMBL::DBSQL::DBAdaptor as a first argument');
|
|
414 return $slice;
|
|
415 }
|
|
416 if (ref($dbVar) ne 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor'){
|
|
417 warning('You need to provide a Bio::EnsEMBL::Variation::DBSQL::DBAdaptor object as second argument');
|
|
418 return $slice;
|
|
419 }
|
|
420 my $slice_adaptor = $dbCore->get_SliceAdaptor();
|
|
421 my $vf_adaptor = $dbVar->get_VariationFeatureAdaptor;
|
|
422 $slice = $slice_adaptor->fetch_by_region('chromosome',$chr,$start,$end,$strand); #get the slice
|
|
423 my $seq = $slice->seq;
|
|
424 foreach my $vf (@{$vf_adaptor->fetch_all_by_Slice($slice)}){
|
|
425 substr($seq,$vf->start-1,1,$vf->ambig_code);
|
|
426 }
|
|
427 $slice->{'seq'} = $seq;
|
|
428 return $slice;
|
|
429 }
|
|
430
|
|
431 =head2 hgvs_variant_notation
|
|
432
|
|
433 Arg[1] : string $alt_allele
|
|
434 Arg[2] : string $ref_sequence
|
|
435 Arg[3] : int $ref_start
|
|
436 Arg[4] : int $ref_end
|
|
437 Arg[5] : int $display_start (optional)
|
|
438 Arg[6] : int $display_end (optional)
|
|
439
|
|
440 Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (hgvs_variant_notation)
|
|
441 my $alt_allele = 'A';
|
|
442 my $ref_sequence = 'CCGTGATGTGC';
|
|
443 my $ref_start = 4;
|
|
444 my $ref_end = 4;
|
|
445 my $ref_name = 'test_seq';
|
|
446 my $ref_type = 'g';
|
|
447 my $notation = hgvs_variant_notation($alt_allele,$ref_sequence,$ref_start,$ref_end);
|
|
448 print "HGVS notation of your variant: $ref_name\:$ref_type\." . $notation->{'hgvs'};
|
|
449
|
|
450 Description : Given an allele, a reference sequence and position of variant, returns a reference to a hash containing metadata and a
|
|
451 string with HGVS notation of the variant. Returns undef if reference and variant alleles are identical.
|
|
452 The optional display_start and display_end, if specified, will be used in the notation instead of the ref_start and ref_end.
|
|
453 This can be useful, e.g. if we want coordinates relative to chromosome but don't want to pass the entire chromosome sequence
|
|
454 into the subroutine.
|
|
455 The data fields in the returned hash are:
|
|
456 'start' -> Displayed start position of variant
|
|
457 'end' -> Displayed end position of variant
|
|
458 'ref' -> Reference allele
|
|
459 'alt' -> Alternative allele
|
|
460 'type' -> The variant class, e.g. ins, inv, >, delins
|
|
461 'hgvs' -> A string with HGVS notation
|
|
462 ReturnType : reference to a hash
|
|
463 Exceptions : If the length of the interval to be displayed is different from the length of the reference allele
|
|
464 Caller : general
|
|
465
|
|
466 =cut
|
|
467 sub hgvs_variant_notation {
|
|
468 my $alt_allele = shift;
|
|
469 my $ref_sequence = shift;
|
|
470 my $ref_start = shift;
|
|
471 my $ref_end = shift;
|
|
472 my $display_start = shift;
|
|
473 my $display_end = shift;
|
|
474
|
|
475 # If display_start and display_end were not specified, use ref_start and ref_end
|
|
476 $display_start ||= $ref_start;
|
|
477 $display_end ||= $ref_end;
|
|
478
|
|
479 #ÊThrow an exception if the lengths of the display interval and reference interval are different
|
|
480 throw("The coordinate interval for display is of different length than for the reference allele") if (($display_end - $display_start) != ($ref_end - $ref_start));
|
|
481
|
|
482 # Length of the reference allele. Negative lengths make no sense
|
|
483 my $ref_length = ($ref_end - $ref_start + 1);
|
|
484 if ($ref_length < 0) {
|
|
485 $ref_length = 0;
|
|
486 }
|
|
487
|
|
488 # Remove any gap characters in the alt allele
|
|
489 $alt_allele =~ s/\-//g;
|
|
490
|
|
491 # Length of alternative allele
|
|
492 my $alt_length = length($alt_allele);
|
|
493
|
|
494 # Get the reference allele
|
|
495 my $ref_allele = substr($ref_sequence,($ref_start-1),$ref_length);
|
|
496
|
|
497 # Check that the alleles are different, otherwise return undef
|
|
498 return undef unless ($ref_allele ne $alt_allele);
|
|
499
|
|
500 # Store the notation in a hash that will be returned
|
|
501 my %notation;
|
|
502 $notation{'start'} = $display_start;
|
|
503 $notation{'end'} = $display_end;
|
|
504 $notation{'ref'} = $ref_allele;
|
|
505 $notation{'alt'} = $alt_allele;
|
|
506
|
|
507 # The simplest case is a deletion
|
|
508 if (!$alt_length) {
|
|
509 $notation{'type'} = 'del';
|
|
510
|
|
511 # Return the notation
|
|
512 return \%notation;
|
|
513 }
|
|
514
|
|
515 # Another case is if the allele lengths are equal
|
|
516 if ($ref_length == $alt_length) {
|
|
517
|
|
518 # If length is 1 it's a single substitution
|
|
519 if ($ref_length == 1) {
|
|
520 $notation{'type'} = '>';
|
|
521 return \%notation;
|
|
522 }
|
|
523
|
|
524 # Check if it's an inversion
|
|
525 my $rev_ref = $ref_allele;
|
|
526 reverse_comp(\$rev_ref);
|
|
527 if ($alt_allele eq $rev_ref) {
|
|
528 $notation{'type'} = 'inv';
|
|
529 return \%notation;
|
|
530 }
|
|
531
|
|
532 $notation{'type'} = 'delins';
|
|
533
|
|
534 return \%notation;
|
|
535 }
|
|
536
|
|
537 # If this is an insertion, we should check if the preceeding reference nucleotides match the insertion. In that case it should be annotated as a multiplication.
|
|
538 if (!$ref_length) {
|
|
539
|
|
540 # Get the same number of nucleotides preceding the insertion as the length of the insertion
|
|
541 my $prev_str = substr($ref_sequence,($ref_end-$alt_length),$alt_length);
|
|
542
|
|
543 # If they match, this is a duplication
|
|
544 if ($prev_str eq $alt_allele) {
|
|
545
|
|
546 $notation{'start'} = ($display_end - $alt_length + 1);
|
|
547 $notation{'type'} = 'dup';
|
|
548 $notation{'ref'} = $prev_str;
|
|
549 # Return the notation
|
|
550 return \%notation;
|
|
551 }
|
|
552
|
|
553 # If they didn't match it's a plain insertion
|
|
554 $notation{'start'} = $display_end;
|
|
555 $notation{'end'} = $display_start;
|
|
556 $notation{'type'} = 'ins';
|
|
557
|
|
558 return \%notation;
|
|
559 }
|
|
560
|
|
561 # Otherwise, the reference and allele are of different lengths. By default, this is a delins but
|
|
562 # we need to check if the alt allele is a multiplication of the reference
|
|
563 # Check if the length of the alt allele is a multiple of the reference allele
|
|
564 if ($alt_length%$ref_length == 0) {
|
|
565 my $multiple = ($alt_length / $ref_length);
|
|
566 if ($alt_allele eq ($ref_allele x $multiple)) {
|
|
567 if ($multiple == 2) {
|
|
568 $notation{'type'} = 'dup';
|
|
569 }
|
|
570 else {
|
|
571 $notation{'type'} = '[' . $multiple . ']';
|
|
572 }
|
|
573 return \%notation;
|
|
574 }
|
|
575 }
|
|
576
|
|
577 # Else, it's gotta be a delins
|
|
578 $notation{'type'} = 'delins';
|
|
579
|
|
580 return \%notation;
|
|
581 }
|
|
582
|
|
583
|
|
584 =head2 format_hgvs_string
|
|
585
|
|
586 Arg[1] : string reference sequence name
|
|
587 Arg[2] : string strand
|
|
588 Arg[3] : hash of hgvs information
|
|
589 Example :
|
|
590 Description : Creates HGVS formatted string from input hash
|
|
591 ReturnType : string in HGVS format
|
|
592 Exceptions :
|
|
593 Caller :
|
|
594
|
|
595 =cut
|
|
596
|
|
597 sub format_hgvs_string{
|
|
598 ##### generic formatting routine for genomic and coding HGVS names
|
|
599
|
|
600 my $hgvs_notation = shift;
|
|
601
|
|
602 ### all start with refseq name & numbering type
|
|
603 $hgvs_notation->{'hgvs'} = $hgvs_notation->{'ref_name'} . ":" . $hgvs_notation->{'numbering'} . ".";
|
|
604
|
|
605 my $coordinates;
|
|
606 #### if single base event, list position only once
|
|
607 if($hgvs_notation->{'start'} eq $hgvs_notation->{'end'}){
|
|
608 $coordinates = $hgvs_notation->{'start'};
|
|
609 }
|
|
610 else{
|
|
611 $coordinates = $hgvs_notation->{'start'} . "_" . $hgvs_notation->{'end'};
|
|
612 }
|
|
613
|
|
614 ##### format rest of string according to type
|
|
615
|
|
616 if($hgvs_notation->{'type'} eq 'del' || $hgvs_notation->{'type'} eq 'inv' || $hgvs_notation->{'type'} eq 'dup'){
|
|
617 ### inversion of reference bases => list ref not alt
|
|
618 ### deletion of reference bases => list ref lost
|
|
619 ### duplication of reference bases (eg ref = GAAA alt = GAAAGAAA) => list duplicated ref (dupGAAA)
|
|
620 $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'};
|
|
621 }
|
|
622
|
|
623 elsif( $hgvs_notation->{'type'} eq '>'){
|
|
624 ### substitution - list both alleles
|
|
625 $hgvs_notation->{'hgvs'} .= $hgvs_notation->{'start'} . $hgvs_notation->{'ref'} . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'};
|
|
626 }
|
|
627
|
|
628 elsif( $hgvs_notation->{'type'} eq 'delins'){
|
|
629 $hgvs_notation->{'hgvs'} .= $coordinates . 'del' . $hgvs_notation->{'ref'} . 'ins' . $hgvs_notation->{'alt'};
|
|
630 }
|
|
631
|
|
632 elsif($hgvs_notation->{'type'} eq 'ins'){
|
|
633 ## reference not listed
|
|
634 $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'alt'};
|
|
635 }
|
|
636
|
|
637 elsif($hgvs_notation->{'type'} =~ /\[\d+\]/){
|
|
638 #### insertion described by string and number
|
|
639 $hgvs_notation->{'hgvs'} .= $coordinates . $hgvs_notation->{'type'} . $hgvs_notation->{'ref'};
|
|
640 }
|
|
641
|
|
642 else{
|
|
643 warn "PROBLEM with generic HGVS formatter - type = ". $hgvs_notation->{'type'} ."\n";
|
|
644 }
|
|
645
|
|
646 return $hgvs_notation->{'hgvs'};
|
|
647
|
|
648 }
|
|
649
|
|
650 =head2 align_seqs
|
|
651
|
|
652 Arg[1] : string $seq1
|
|
653 Arg[2] : string $seq2
|
|
654 Example : my $aligned_seqs = align_seqs($seq1, $seq2);
|
|
655 Description : Does a simple NW align of two sequence strings. Best used on
|
|
656 short (<1000bp) sequences, otherwise runtime will be long
|
|
657 ReturnType : arrayref to a pair of strings
|
|
658 Exceptions : none
|
|
659 Caller : web flanking sequence display
|
|
660
|
|
661 =cut
|
|
662
|
|
663 sub align_seqs {
|
|
664 my $seq1 = shift;
|
|
665 my $seq2 = shift;
|
|
666
|
|
667 # align parameters
|
|
668 my $match = 10;
|
|
669 my $mismatch = -10;
|
|
670 my $gep = -10;
|
|
671
|
|
672 # split sequences into arrays
|
|
673 my @split1 = split //, $seq1;
|
|
674 my @split2 = split //, $seq2;
|
|
675
|
|
676 # evaluate substitutions
|
|
677 my $len1 = length($seq1);
|
|
678 my $len2 = length($seq2);
|
|
679
|
|
680 my (@smat, @tb);
|
|
681
|
|
682 for (my $i=0; $i<=$len1; $i++) {
|
|
683 $smat[$i][0] = $i * $gep;
|
|
684 $tb[$i][0] = 1;
|
|
685 }
|
|
686 for (my $j=0; $j<=$len2; $j++) {
|
|
687 $smat[0][$j] = $j * $gep;
|
|
688 $tb[0][$j] = -1;
|
|
689 }
|
|
690
|
|
691 my ($s, $sub, $del, $ins);
|
|
692
|
|
693 for (my $i=1; $i<=$len1; $i++) {
|
|
694 for (my $j=1; $j<=$len2; $j++) {
|
|
695
|
|
696 # calculate score
|
|
697 if($split1[$i-1] eq $split2[$j-1]) {
|
|
698 $s = $match;
|
|
699 }
|
|
700 else {
|
|
701 $s = $mismatch;
|
|
702 }
|
|
703
|
|
704 $sub = $smat[$i-1][$j-1] + $s;
|
|
705 $del = $smat[$i][$j-1] + $gep;
|
|
706 $ins = $smat[$i-1][$j] + $gep;
|
|
707
|
|
708 if($sub > $del && $sub > $ins) {
|
|
709 $smat[$i][$j] = $sub;
|
|
710 $tb[$i][$j] = 0;
|
|
711 }
|
|
712 elsif($del > $ins) {
|
|
713 $smat[$i][$j] = $del;
|
|
714 $tb[$i][$j] = -1;
|
|
715 }
|
|
716 else {
|
|
717 $smat[$i][$j] = $ins;
|
|
718 $tb[$i][$j] = 1;
|
|
719 }
|
|
720 }
|
|
721 }
|
|
722
|
|
723
|
|
724 my $i = $len1;
|
|
725 my $j = $len2;
|
|
726 my $aln_len = 0;
|
|
727 my (@aln1, @aln2);
|
|
728
|
|
729 while(!($i == 0 && $j == 0)) {
|
|
730 if($tb[$i][$j] == 0) {
|
|
731 $aln1[$aln_len] = $split1[--$i];
|
|
732 $aln2[$aln_len] = $split2[--$j];
|
|
733 }
|
|
734 elsif($tb[$i][$j] == -1) {
|
|
735 $aln1[$aln_len] = '-';
|
|
736 $aln2[$aln_len] = $split2[--$j];
|
|
737 }
|
|
738 elsif($tb[$i][$j] == 1) {
|
|
739 $aln1[$aln_len] = $split1[--$i];
|
|
740 $aln2[$aln_len] = '-';
|
|
741 }
|
|
742
|
|
743 $aln_len++;
|
|
744 }
|
|
745
|
|
746 return [(join "", reverse @aln1), (join "", reverse @aln2)];
|
|
747 }
|
|
748
|
|
749
|
|
750 =head2 array_to_bitval
|
|
751
|
|
752 Arg[1] : arrayref $arr
|
|
753 Arg[2] : arrayref $ref
|
|
754 Example : my $bitval = array_to_bitval(['hapmap','precious'],['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']);
|
|
755 Description : Takes a reference to an array as input and return a bit value representing the
|
|
756 combination of elements from a reference array. c.f. the SET datatype in MySQL
|
|
757 ReturnType : bitvalue that represents the combination of elements in the reference array specified in the given array
|
|
758 Exceptions : none
|
|
759 Caller : get_validation_code
|
|
760
|
|
761 =cut
|
|
762
|
|
763 sub array_to_bitval {
|
|
764 my $arr = shift;
|
|
765 my $ref = shift;
|
|
766
|
|
767 #ÊEnsure that we have array references
|
|
768 $arr = wrap_array($arr);
|
|
769 $ref = wrap_array($ref);
|
|
770
|
|
771 #ÊTurn the reference array into a hash, the values will correspond to 2 raised to the power of the position in the array
|
|
772 my $i=0;
|
|
773 my %ref_hash = map {lc($_) => $i++;} @{$ref};
|
|
774
|
|
775 #ÊSet the bitval
|
|
776 my $bitval = 0;
|
|
777 foreach my $a (@{$arr}) {
|
|
778
|
|
779 my $pos = $ref_hash{lc($a)};
|
|
780 if (defined($pos)) {
|
|
781 $bitval |= 2**$pos;
|
|
782 }
|
|
783 # Warn if the element is not present in the reference array
|
|
784 else {
|
|
785 warning("$a is not a recognised element. Recognised elements are: " . join(",",@{$ref}));
|
|
786 }
|
|
787 }
|
|
788
|
|
789 return $bitval;
|
|
790 }
|
|
791
|
|
792 =head2 bitval_to_array
|
|
793
|
|
794 Arg [1] : int $bitval
|
|
795 Arg [2] : arrayref $ref
|
|
796 Example : my $arr = bitval_to_array(6,['cluster','freq','submitter','doublehit','hapmap','1000Genome','failed','precious']);
|
|
797 : print join(",",@{$arr}); #ÊWill print 'freq,submitter'
|
|
798 Description: Returns an array with the combination of elements from the reference array specified by the supplied bitvalue.
|
|
799 c.f. the SET datatype in MySQL
|
|
800 Returntype : reference to list of strings
|
|
801 Exceptions : none
|
|
802 Caller : get_all_validation_states
|
|
803
|
|
804 =cut
|
|
805
|
|
806 sub bitval_to_array {
|
|
807 my $bitval = shift || 0;
|
|
808 my $ref = shift;
|
|
809
|
|
810 #ÊEnsure that we have array references
|
|
811 $ref = wrap_array($ref);
|
|
812
|
|
813 # convert the bit value into an ordered array
|
|
814 my @arr;
|
|
815 for (my $i = 0; $i < @{$ref}; $i++) {
|
|
816 push(@arr,$ref->[$i]) if ((1 << $i) & $bitval);
|
|
817 }
|
|
818
|
|
819 return \@arr;
|
|
820 }
|
|
821
|
|
822
|
|
823 =head2 add_validation_state
|
|
824
|
|
825 Arg [1] : string $state
|
|
826 Example : add_validation_state('cluster');
|
|
827 Description: Adds a validation state to this variation.
|
|
828 Returntype : none
|
|
829 Exceptions : warning if validation state is not a recognised type
|
|
830 Caller : general
|
|
831 Status : At Risk
|
|
832
|
|
833 =cut
|
|
834
|
|
835 sub add_validation_state {
|
|
836 my $obj = shift;
|
|
837 my $state = shift;
|
|
838
|
|
839 #ÊGet the bitvalue for the new state
|
|
840 my $newbit = get_validation_code($state) || 0;
|
|
841
|
|
842 #ÊBit-add it to the current validation_code
|
|
843 my $oldbit = $obj->{'validation_code'} || 0;
|
|
844 $newbit |= $oldbit;
|
|
845
|
|
846 # Set the validation_code
|
|
847 $obj->{'validation_code'} = $newbit;
|
|
848
|
|
849 return;
|
|
850 }
|
|
851
|
|
852 =head2 get_all_validation_states
|
|
853
|
|
854 Arg [1] : int $bitval
|
|
855 Example : my @vstates = @{get_all_validation_states($var->{'validation_code'})};
|
|
856 Description: Retrieves all validation states for a specified bit value.
|
|
857 Returntype : reference to list of strings
|
|
858 Exceptions : none
|
|
859 Caller : general
|
|
860
|
|
861 =cut
|
|
862
|
|
863 sub get_all_validation_states {
|
|
864 return bitval_to_array(shift,\@VALIDATION_STATES);
|
|
865 }
|
|
866
|
|
867 =head2 get_validation_code
|
|
868
|
|
869 Arg [1] : arrayref $validation_status
|
|
870 Example : $var->{'validation_code'} = get_validation_code(['submitter','precious']);
|
|
871 Description: Retrieves the bit value for a combination of validation statuses.
|
|
872 Returntype : int
|
|
873 Exceptions : none
|
|
874 Caller : Variation::new
|
|
875
|
|
876 =cut
|
|
877
|
|
878 sub get_validation_code {
|
|
879 return array_to_bitval(shift,\@VALIDATION_STATES);
|
|
880 }
|
|
881
|
|
882 1;
|