comparison variant_effect_predictor/Bio/Align/DNAStatistics.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: DNAStatistics.pm,v 1.4 2002/10/22 07:45:10 lapp Exp $
2 #
3 # BioPerl module for Bio::Align::DNAStatistics
4 #
5 # Cared for by Jason Stajich <jason@bioperl.org>
6 #
7 # Copyright Jason Stajich
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
16
17 =head1 SYNOPSIS
18
19 use Bio::Align::DNAStatistics;
20 use Bio::AlignIO;
21
22 my $stats = new Bio::Align::PairwiseStatistics;
23 my $alignin = new Bio::AlignIO(-format => 'emboss',
24 -file => 't/data/insulin.water');
25 my $jc = $stats->distance($aln, 'Jukes-Cantor');
26 foreach my $r ( @$jc ) {
27 print "\t";
28 foreach my $r ( @$d ) {
29 print "$r\t";
30 }
31 print "\n";
32 }
33
34 =head1 DESCRIPTION
35
36 This object contains routines for calculating various statistics and
37 distances for DNA alignments. The routines are not well tested and do
38 contain errors at this point. Work is underway to correct them, but
39 do not expect this code to give you the right answer currently! Use
40 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
41 distances.
42
43 =head1 FEEDBACK
44
45 =head2 Mailing Lists
46
47 User feedback is an integral part of the evolution of this and other
48 Bioperl modules. Send your comments and suggestions preferably to
49 the Bioperl mailing list. Your participation is much appreciated.
50
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/MailList.shtml - About the mailing lists
53
54 =head2 Reporting Bugs
55
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 of the bugs and their resolution. Bug reports can be submitted via
58 email or the web:
59
60 bioperl-bugs@bioperl.org
61 http://bugzilla.bioperl.org/
62
63 =head1 AUTHOR - Jason Stajich
64
65 Email jason@bioperl.org
66
67 Describe contact details here
68
69 =head1 CONTRIBUTORS
70
71 Additional contributors names and emails here
72
73 =head1 APPENDIX
74
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
77
78 =cut
79
80
81 # Let the code begin...
82
83
84 package Bio::Align::DNAStatistics;
85 use vars qw(@ISA %DNAChanges @Nucleotides %NucleotideIndexes
86 $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods);
87 use strict;
88 use Bio::Align::PairwiseStatistics;
89 use Bio::Root::Root;
90
91 BEGIN {
92 $GapChars = '(\.|\-)';
93 @Nucleotides = qw(A G T C);
94 $SeqCount = 2;
95 # these values come from EMBOSS distmat implementation
96 %NucleotideIndexes = ( 'A' => 0,
97 'T' => 1,
98 'C' => 2,
99 'G' => 3,
100
101 'AT' => 0,
102 'AC' => 1,
103 'AG' => 2,
104 'CT' => 3,
105 'GT' => 4,
106 'CG' => 5,
107
108 # these are wrong now
109 # 'S' => [ 1, 3],
110 # 'W' => [ 0, 4],
111 # 'Y' => [ 2, 3],
112 # 'R' => [ 0, 1],
113 # 'M' => [ 0, 3],
114 # 'K' => [ 1, 2],
115 # 'B' => [ 1, 2, 3],
116 # 'H' => [ 0, 2, 3],
117 # 'V' => [ 0, 1, 3],
118 # 'D' => [ 0, 1, 2],
119 );
120
121 $DefaultGapPenalty = 0;
122 # could put ambiguities here?
123 %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
124 'T' => [ 'A', 'G'],
125 'C' => [ 'A', 'G'],
126 'G' => [ 'C', 'T'],
127 },
128 'Transitions' => { 'A' => [ 'G' ],
129 'G' => [ 'A' ],
130 'C' => [ 'T' ],
131 'T' => [ 'C' ],
132 },
133 );
134 %DistanceMethods = ( 'jc|jukes|jukes-cantor' => 'JukesCantor',
135 'f81' => 'F81',
136 'k2|k2p|k80|kimura' => 'Kimura',
137 't92|tamura|tamura92' => 'Tamura',
138 'f84' => 'F84',
139 'tajimanei|tajima-nei' => 'TajimaNei' );
140 }
141
142 @ISA = qw( Bio::Root::Root Bio::Align::StatisticsI );
143
144 =head2 new
145
146 Title : new
147 Usage : my $obj = new Bio::Align::DNAStatistics();
148 Function: Builds a new Bio::Align::DNAStatistics object
149 Returns : Bio::Align::DNAStatistics
150 Args : none
151
152
153 =cut
154
155 sub new {
156 my ($class,@args) = @_;
157 my $self = $class->SUPER::new(@args);
158
159 $self->pairwise_stats( new Bio::Align::PairwiseStatistics());
160
161 return $self;
162 }
163
164
165 =head2 distance
166
167 Title : distance
168 Usage : my $distance_mat = $stats->distance(-align => $aln,
169 -method => $method);
170 Function: Calculates a distance matrix for all pairwise distances of
171 sequences in an alignment.
172 Returns : Array ref
173 Args : -align => Bio::Align::AlignI object
174 -method => String specifying specific distance method
175 (implementing class may assume a default)
176
177 =cut
178
179 sub distance{
180 my ($self,@args) = @_;
181 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
182 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
183 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
184 }
185 $method ||= 'JukesCantor';
186 foreach my $m ( keys %DistanceMethods ) {
187 if(defined $m && $method =~ /$m/i ) {
188 my $mtd = "D_$DistanceMethods{$m}";
189 return $self->$mtd($aln);
190 }
191 }
192 $self->warn("Unrecognized distance method $method must be one of [".
193 join(',',$self->available_distance_methods())."]");
194 return undef;
195 }
196
197 =head2 available_distance_methods
198
199 Title : available_distance_methods
200 Usage : my @methods = $stats->available_distance_methods();
201 Function: Enumerates the possible distance methods
202 Returns : Array of strings
203 Args : none
204
205
206 =cut
207
208 sub available_distance_methods{
209 my ($self,@args) = @_;
210 return values %DistanceMethods;
211 }
212
213 =head2 D - distance methods
214
215 =cut
216
217 =head2 D_JukesCantor
218
219 Title : D_JukesCantor
220 Usage : my $d = $stat->D_JukesCantor($aln)
221 Function: Calculates D (pairwise distance) between 2 sequences in an
222 alignment using the Jukes-Cantor 1 parameter model.
223 Returns : ArrayRef of all pairwise distances of all sequence pairs in the alignment
224 Args : Bio::Align::AlignI of DNA sequences
225 double - gap penalty
226
227
228 =cut
229
230 sub D_JukesCantor{
231 my ($self,$aln,$gappenalty) = @_;
232 return 0 unless $self->_check_arg($aln);
233 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
234 # ambiguities ignored at this point
235
236 my (@seqs);
237 foreach my $seq ( $aln->each_seq) {
238 push @seqs, [ split(//,uc $seq->seq())];
239 }
240 my $seqct = scalar @seqs;
241 my @DVals;
242 for(my $i = 1; $i <= $seqct; $i++ ) {
243 for( my $j = $i+1; $j <= $seqct; $j++ ) {
244 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1],
245 $seqs[$j-1]);
246 # just want diagonals
247 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
248 $matrix->[2]->[2] + $matrix->[3]->[3] );
249 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
250 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
251 $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d;
252 }
253 }
254 return \@DVals;
255 }
256
257 =head2 D_F81
258
259 Title : D_F81
260 Usage : my $d = $stat->D_F81($aln)
261 Function: Calculates D (pairwise distance) between 2 sequences in an
262 alignment using the Felsenstein 1981 distance model.
263 Returns : ArrayRef of a 2d array of all pairwise distances in the alignment
264 Args : Bio::Align::AlignI of DNA sequences
265
266
267 =cut
268
269 sub D_F81{
270 my ($self,$aln) = @_;
271 return 0 unless $self->_check_arg($aln);
272 $self->throw("This isn't implemented yet - sorry");
273 }
274
275
276 # M Kimura, J. Mol. Evol., 1980, 16, 111.
277
278 =head2 D_Kimura
279
280 Title : D_Kimura
281 Usage : my $d = $stat->D_Kimura($aln)
282 Function: Calculates D (pairwise distance) between 2 sequences in an
283 alignment using the Kimura 2 parameter model.
284 Returns : ArrayRef of pairwise distances between all sequences in alignment
285 Args : Bio::Align::AlignI of DNA sequences
286
287
288 =cut
289
290 sub D_Kimura{
291 my ($self,$aln) = @_;
292 return 0 unless $self->_check_arg($aln);
293 my $seqct = $aln->no_sequences;
294 my @KVals;
295 for( my $i = 1; $i <= $seqct; $i++ ) {
296 for( my $j = $i+1; $j <= $seqct; $j++ ) {
297 my $pairwise = $aln->select_noncont($i,$j);
298 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
299 my $P = $self->transitions($pairwise) / $L;
300 my $Q = $self->transversions($pairwise) / $L;
301
302 my $a = 1 / ( 1 - (2 * $P) - $Q);
303 my $b = 1 / ( 1 - 2 * $Q );
304 my $K = (1/2) * log ( $a ) + (1/4) * log($b);
305 $KVals[$i]->[$j] = $K;
306 $KVals[$j]->[$i] = $K;
307 }
308 }
309 return \@KVals;
310 }
311
312 # K Tamura, Mol. Biol. Evol. 1992, 9, 678.
313
314 =head2 D_Tamura
315
316 Title : D_Tamura
317 Usage :
318 Function:
319 Returns :
320 Args :
321
322
323 =cut
324
325 sub D_Tamura{
326 my ($self,$aln) = @_;
327 my $seqct = $aln->no_sequences;
328 my @KVals;
329 for( my $i = 1; $i <= $seqct; $i++ ) {
330 for( my $j = $i+1; $j <= $seqct; $j++ ) {
331 }
332 }
333 my $O = 0.25;
334 my $t = 0;
335 my $a = 0;
336 my $b = 0;
337
338
339 my $d = 4 * $O * ( 1 - $O ) * $a * $t + 2 * $b * $t;
340 return $d;
341 }
342
343 =head2 D_F84
344
345 Title : D_F84
346 Usage : my $d = $stat->D_F84($aln)
347 Function: Calculates D (pairwise distance) between 2 sequences in an
348 alignment using the Felsenstein 1984 distance model.
349 Returns : Distance value
350 Args : Bio::Align::AlignI of DNA sequences
351 double - gap penalty
352
353 =cut
354
355 sub D_F84{
356 my ($self,$aln) = @_;
357 return 0 unless $self->_check_arg($aln);
358 }
359
360 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
361
362 =head2 D_TajimaNei
363
364 Title : D_TajimaNei
365 Usage : my $d = $stat->D_TajimaNei($aln)
366 Function: Calculates D (pairwise distance) between 2 sequences in an
367 alignment using the TajimaNei 1984 distance model.
368 Returns : Distance value
369 Args : Bio::Align::AlignI of DNA sequences
370
371
372 =cut
373
374 sub D_TajimaNei{
375 my ($self,$aln) = @_;
376 $self->warn("The result from this method is not correct right now");
377 my (@seqs);
378 foreach my $seq ( $aln->each_seq) {
379 push @seqs, [ split(//,uc $seq->seq())];
380 }
381 my $seqct = scalar @seqs;
382 my @DVals;
383 for(my $i = 1; $i <= $seqct; $i++ ) {
384 for( my $j = $i+1; $j <= $seqct; $j++ ) {
385 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1],
386 $seqs[$j-1]);
387 my $fij2;
388 my $slen = $aln->length - $gaps;
389 for( my $bs = 0; $bs < 4; $bs++ ) {
390 my $fi = 0;
391 map {$fi += $matrix->[$bs]->[$_] } 0..3;
392 my $fj = 0;
393 map { $fj += $matrix->[$_]->[$bs] } 0..3;
394 my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
395 $fij2 += $fij**2;
396 }
397 my ($pair,$h) = (0,0);
398 for( my $bs = 0; $bs < 3; $bs++ ) {
399 for( my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
400 my $fij = $pfreq->[$pair++] / $slen;
401 if( $fij ) {
402
403 my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
404
405 map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
406 map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
407 map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
408 map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
409
410 $h += ( $fij*$fij / 2 ) /
411 ( ( ( $ci1 + $cj1 ) / 2 * $slen ) *
412 ( ( $ci2 + $cj2 ) /2 * $slen )
413 );
414 $self->debug( "h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
415 }
416 }
417 }
418 # just want diagonals first
419
420 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
421 $matrix->[2]->[2] + $matrix->[3]->[3] );
422 my $D = 1 - ( $m / $slen);
423
424 my $b = (1-$fij2+(($D**2)/$h)) / 2;
425 $self->debug("h is $h fij2 is $fij2 b is $b\n");
426
427 my $d = (-1 * $b) * log ( 1 - $D/ $b);
428 $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d;
429 }
430 }
431 return \@DVals;
432
433
434 }
435
436 # HKY -- HASEGAWA, M., H. KISHINO, and T. YANO. 1985
437 # Tamura and Nei 1993?
438 # GTR?
439
440 =head2 K - sequence substitution methods
441
442 =cut
443
444 =head2 K_JukesCantor
445
446 Title : K_JukesCantor
447 Usage : my $k = $stats->K_JukesCantor($aln)
448 Function: Calculates K - the number of nucleotide substitutions between
449 2 seqs - according to the Jukes-Cantor 1 parameter model
450 This only involves the number of changes between two sequences.
451 Returns : double
452 Args : Bio::Align::AlignI
453
454
455 =cut
456
457 sub K_JukesCantor{
458 my ($self,$aln) = @_;
459 return 0 unless $self->_check_arg($aln);
460 my $seqct = $aln->no_sequences;
461 my @KVals;
462 for( my $i = 1; $i <= $seqct; $i++ ) {
463 for( my $j = $i+1; $j <= $seqct; $j++ ) {
464 my $pairwise = $aln->select_noncont($i,$j);
465 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
466 my $N = $self->pairwise_stats->number_of_differences($pairwise);
467 my $p = $N / $L;
468 my $K = - ( 3 / 4) * log ( 1 - (( 4 * $p) / 3 ));
469 $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K;
470 }
471 }
472 return \@KVals;
473 }
474
475 =head2 K_TajimaNei
476
477 Title : K_TajimaNei
478 Usage : my $k = $stats->K_TajimaNei($aln)
479 Function: Calculates K - the number of nucleotide substitutions between
480 2 seqs - according to the Kimura 2 parameter model.
481 This does not assume equal frequencies among all the nucleotides.
482 Returns : ArrayRef of 2d matrix which contains pairwise K values for
483 all sequences in the alignment
484 Args : Bio::Align::AlignI
485
486 =cut
487
488 sub K_TajimaNei {
489 my ($self,$aln) = @_;
490 return 0 unless $self->_check_arg($aln);
491
492 my @seqs;
493 foreach my $seq ( $aln->each_seq) {
494 push @seqs, [ split(//,uc $seq->seq())];
495 }
496 my @KVals;
497 my $L = $self->pairwise_stats->number_of_comparable_bases($aln);
498 my $seqct = scalar @seqs;
499 for( my $i = 1; $i <= $seqct; $i++ ) {
500 for( my $j = $i+1; $j <= $seqct; $j++ ) {
501 my (%q,%y);
502 my ($first,$second) = ($seqs[$i-1],$seqs[$j-1]);
503
504 for (my $k = 0;$k<$aln->length; $k++ ) {
505 next if( $first->[$k] =~ /^$GapChars$/ ||
506 $second->[$k] =~ /^$GapChars$/);
507
508 $q{$second->[$k]}++;
509 $q{$first->[$k]}++;
510 if( $first->[$k] ne $second->[$k] ) {
511 $y{$first->[$k]}->{$second->[$k]}++;
512 }
513 }
514
515 my $q_sum = 0;
516 foreach my $let ( @Nucleotides ) {
517 # ct is the number of sequences compared (2)
518 # L is the length of the alignment without gaps
519 # $ct * $L = total number of nt compared
520 my $avg = $q{$let} / ( $SeqCount * $L );
521 $q_sum += $avg**2;
522 }
523 my $b1 = 1 - $q_sum;
524 my $h = 0;
525 for( my $i = 0; $i <= 2; $i++ ) {
526 for( my $j = $i+1; $j <= 3; $j++) {
527 $y{$Nucleotides[$i]}->{$Nucleotides[$j]} ||= 0;
528 $y{$Nucleotides[$j]}->{$Nucleotides[$i]} ||= 0;
529 my $x = ($y{$Nucleotides[$i]}->{$Nucleotides[$j]} +
530 $y{$Nucleotides[$j]}->{$Nucleotides[$i]}) / $L;
531 $h += ($x ** 2) / ( 2 * $q{$Nucleotides[$i]} *
532 $q{$Nucleotides[$j]} );
533 }
534 }
535 my $N = $self->pairwise_stats->number_of_differences($aln);
536 my $p = $N / $L;
537 my $b = ( $b1 + $p ** 2 / $h ) / 2;
538 my $K = - $b * log ( 1 - $p / $b );
539 $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K;
540 }
541 }
542 return \@KVals;
543 }
544
545
546
547 =head2 transversions
548
549 Title : transversions
550 Usage : my $transversions = $stats->transversion($aln);
551 Function: Calculates the number of transversions between two sequences in
552 an alignment
553 Returns : integer
554 Args : Bio::Align::AlignI
555
556
557 =cut
558
559 sub transversions{
560 my ($self,$aln) = @_;
561 return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
562 }
563
564 =head2 transitions
565
566 Title : transitions
567 Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
568 Function: Calculates the number of transitions in a given DNA alignment
569 Returns : integer representing the number of transitions
570 Args : Bio::Align::AlignI object
571
572
573 =cut
574
575 sub transitions{
576 my ($self,$aln) = @_;
577 return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
578 }
579
580
581 sub _trans_count_helper {
582 my ($self,$aln,$type) = @_;
583 return 0 unless( $self->_check_arg($aln) );
584 if( ! $aln->is_flush ) { $self->throw("must be flush") }
585 my (@seqs,@tcount);
586 foreach my $seq ( $aln->get_seq_by_pos(1), $aln->get_seq_by_pos(2) ) {
587 push @seqs, [ split(//,$seq->seq())];
588 }
589 my ($first,$second) = @seqs;
590
591 for (my $i = 0;$i<$aln->length; $i++ ) {
592 next if( $first->[$i] =~ /^$GapChars$/ ||
593 $second->[$i] =~ /^$GapChars$/);
594 if( $first->[$i] ne $second->[$i] ) {
595 foreach my $nt ( @{$type->{$first->[$i]}} ) {
596 if( $nt eq $second->[$i]) {
597 $tcount[$i]++;
598 }
599 }
600 }
601 }
602 my $sum = 0;
603 map { if( $_) { $sum += $_} } @tcount;
604 return $sum;
605 }
606
607 # this will generate a matrix which records across the row, the number
608 # of DNA subst
609 #
610 sub _build_nt_matrix {
611 my ($self,$seqa,$seqb) = @_;
612
613
614 my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
615 [ qw(0 0 0 0) ],
616 [ qw(0 0 0 0) ],
617 [ qw(0 0 0 0) ] ];
618 my $gaps = 0; # number of gaps
619 my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
620
621 for( my $i = 0; $i < scalar @$seqa; $i++) {
622
623 my ($ti,$tj) = ($seqa->[$i],$seqb->[$i]);
624 $ti =~ tr/U/T/;
625 $tj =~ tr/U/T/;
626
627 if( $ti =~ /^$GapChars$/) { $gaps++; next; }
628 if( $tj =~ /^$GapChars$/) { $gaps++; next }
629
630 my $ti_index = $NucleotideIndexes{$ti};
631 my $tj_index = $NucleotideIndexes{$tj};
632
633 if( ! defined $ti_index ) {
634 print "ti_index not defined for $ti\n";
635 next;
636 }
637
638 $basect_matrix->[$ti_index]->[$tj_index]++;
639
640 if( $ti ne $tj ) {
641 $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
642 }
643 }
644 return ($basect_matrix,$pfreq,$gaps);
645 }
646
647 sub _check_arg {
648 my($self,$aln ) = @_;
649 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
650 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
651 return 0;
652 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
653 $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
654 return 0;
655 }
656 return 1;
657 }
658
659 =head2 Data Methods
660
661 =cut
662
663 =head2 pairwise_stats
664
665 Title : pairwise_stats
666 Usage : $obj->pairwise_stats($newval)
667 Function:
668 Returns : value of pairwise_stats
669 Args : newvalue (optional)
670
671
672 =cut
673
674 sub pairwise_stats{
675 my ($self,$value) = @_;
676 if( defined $value) {
677 $self->{'_pairwise_stats'} = $value;
678 }
679 return $self->{'_pairwise_stats'};
680
681 }
682
683 1;