0
|
1 # $Id: DNAStatistics.pm,v 1.4 2002/10/22 07:45:10 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Align::DNAStatistics
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 use Bio::Align::DNAStatistics;
|
|
20 use Bio::AlignIO;
|
|
21
|
|
22 my $stats = new Bio::Align::PairwiseStatistics;
|
|
23 my $alignin = new Bio::AlignIO(-format => 'emboss',
|
|
24 -file => 't/data/insulin.water');
|
|
25 my $jc = $stats->distance($aln, 'Jukes-Cantor');
|
|
26 foreach my $r ( @$jc ) {
|
|
27 print "\t";
|
|
28 foreach my $r ( @$d ) {
|
|
29 print "$r\t";
|
|
30 }
|
|
31 print "\n";
|
|
32 }
|
|
33
|
|
34 =head1 DESCRIPTION
|
|
35
|
|
36 This object contains routines for calculating various statistics and
|
|
37 distances for DNA alignments. The routines are not well tested and do
|
|
38 contain errors at this point. Work is underway to correct them, but
|
|
39 do not expect this code to give you the right answer currently! Use
|
|
40 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
|
|
41 distances.
|
|
42
|
|
43 =head1 FEEDBACK
|
|
44
|
|
45 =head2 Mailing Lists
|
|
46
|
|
47 User feedback is an integral part of the evolution of this and other
|
|
48 Bioperl modules. Send your comments and suggestions preferably to
|
|
49 the Bioperl mailing list. Your participation is much appreciated.
|
|
50
|
|
51 bioperl-l@bioperl.org - General discussion
|
|
52 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
53
|
|
54 =head2 Reporting Bugs
|
|
55
|
|
56 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
57 of the bugs and their resolution. Bug reports can be submitted via
|
|
58 email or the web:
|
|
59
|
|
60 bioperl-bugs@bioperl.org
|
|
61 http://bugzilla.bioperl.org/
|
|
62
|
|
63 =head1 AUTHOR - Jason Stajich
|
|
64
|
|
65 Email jason@bioperl.org
|
|
66
|
|
67 Describe contact details here
|
|
68
|
|
69 =head1 CONTRIBUTORS
|
|
70
|
|
71 Additional contributors names and emails here
|
|
72
|
|
73 =head1 APPENDIX
|
|
74
|
|
75 The rest of the documentation details each of the object methods.
|
|
76 Internal methods are usually preceded with a _
|
|
77
|
|
78 =cut
|
|
79
|
|
80
|
|
81 # Let the code begin...
|
|
82
|
|
83
|
|
84 package Bio::Align::DNAStatistics;
|
|
85 use vars qw(@ISA %DNAChanges @Nucleotides %NucleotideIndexes
|
|
86 $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods);
|
|
87 use strict;
|
|
88 use Bio::Align::PairwiseStatistics;
|
|
89 use Bio::Root::Root;
|
|
90
|
|
91 BEGIN {
|
|
92 $GapChars = '(\.|\-)';
|
|
93 @Nucleotides = qw(A G T C);
|
|
94 $SeqCount = 2;
|
|
95 # these values come from EMBOSS distmat implementation
|
|
96 %NucleotideIndexes = ( 'A' => 0,
|
|
97 'T' => 1,
|
|
98 'C' => 2,
|
|
99 'G' => 3,
|
|
100
|
|
101 'AT' => 0,
|
|
102 'AC' => 1,
|
|
103 'AG' => 2,
|
|
104 'CT' => 3,
|
|
105 'GT' => 4,
|
|
106 'CG' => 5,
|
|
107
|
|
108 # these are wrong now
|
|
109 # 'S' => [ 1, 3],
|
|
110 # 'W' => [ 0, 4],
|
|
111 # 'Y' => [ 2, 3],
|
|
112 # 'R' => [ 0, 1],
|
|
113 # 'M' => [ 0, 3],
|
|
114 # 'K' => [ 1, 2],
|
|
115 # 'B' => [ 1, 2, 3],
|
|
116 # 'H' => [ 0, 2, 3],
|
|
117 # 'V' => [ 0, 1, 3],
|
|
118 # 'D' => [ 0, 1, 2],
|
|
119 );
|
|
120
|
|
121 $DefaultGapPenalty = 0;
|
|
122 # could put ambiguities here?
|
|
123 %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
|
|
124 'T' => [ 'A', 'G'],
|
|
125 'C' => [ 'A', 'G'],
|
|
126 'G' => [ 'C', 'T'],
|
|
127 },
|
|
128 'Transitions' => { 'A' => [ 'G' ],
|
|
129 'G' => [ 'A' ],
|
|
130 'C' => [ 'T' ],
|
|
131 'T' => [ 'C' ],
|
|
132 },
|
|
133 );
|
|
134 %DistanceMethods = ( 'jc|jukes|jukes-cantor' => 'JukesCantor',
|
|
135 'f81' => 'F81',
|
|
136 'k2|k2p|k80|kimura' => 'Kimura',
|
|
137 't92|tamura|tamura92' => 'Tamura',
|
|
138 'f84' => 'F84',
|
|
139 'tajimanei|tajima-nei' => 'TajimaNei' );
|
|
140 }
|
|
141
|
|
142 @ISA = qw( Bio::Root::Root Bio::Align::StatisticsI );
|
|
143
|
|
144 =head2 new
|
|
145
|
|
146 Title : new
|
|
147 Usage : my $obj = new Bio::Align::DNAStatistics();
|
|
148 Function: Builds a new Bio::Align::DNAStatistics object
|
|
149 Returns : Bio::Align::DNAStatistics
|
|
150 Args : none
|
|
151
|
|
152
|
|
153 =cut
|
|
154
|
|
155 sub new {
|
|
156 my ($class,@args) = @_;
|
|
157 my $self = $class->SUPER::new(@args);
|
|
158
|
|
159 $self->pairwise_stats( new Bio::Align::PairwiseStatistics());
|
|
160
|
|
161 return $self;
|
|
162 }
|
|
163
|
|
164
|
|
165 =head2 distance
|
|
166
|
|
167 Title : distance
|
|
168 Usage : my $distance_mat = $stats->distance(-align => $aln,
|
|
169 -method => $method);
|
|
170 Function: Calculates a distance matrix for all pairwise distances of
|
|
171 sequences in an alignment.
|
|
172 Returns : Array ref
|
|
173 Args : -align => Bio::Align::AlignI object
|
|
174 -method => String specifying specific distance method
|
|
175 (implementing class may assume a default)
|
|
176
|
|
177 =cut
|
|
178
|
|
179 sub distance{
|
|
180 my ($self,@args) = @_;
|
|
181 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
|
|
182 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
|
|
183 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
|
|
184 }
|
|
185 $method ||= 'JukesCantor';
|
|
186 foreach my $m ( keys %DistanceMethods ) {
|
|
187 if(defined $m && $method =~ /$m/i ) {
|
|
188 my $mtd = "D_$DistanceMethods{$m}";
|
|
189 return $self->$mtd($aln);
|
|
190 }
|
|
191 }
|
|
192 $self->warn("Unrecognized distance method $method must be one of [".
|
|
193 join(',',$self->available_distance_methods())."]");
|
|
194 return undef;
|
|
195 }
|
|
196
|
|
197 =head2 available_distance_methods
|
|
198
|
|
199 Title : available_distance_methods
|
|
200 Usage : my @methods = $stats->available_distance_methods();
|
|
201 Function: Enumerates the possible distance methods
|
|
202 Returns : Array of strings
|
|
203 Args : none
|
|
204
|
|
205
|
|
206 =cut
|
|
207
|
|
208 sub available_distance_methods{
|
|
209 my ($self,@args) = @_;
|
|
210 return values %DistanceMethods;
|
|
211 }
|
|
212
|
|
213 =head2 D - distance methods
|
|
214
|
|
215 =cut
|
|
216
|
|
217 =head2 D_JukesCantor
|
|
218
|
|
219 Title : D_JukesCantor
|
|
220 Usage : my $d = $stat->D_JukesCantor($aln)
|
|
221 Function: Calculates D (pairwise distance) between 2 sequences in an
|
|
222 alignment using the Jukes-Cantor 1 parameter model.
|
|
223 Returns : ArrayRef of all pairwise distances of all sequence pairs in the alignment
|
|
224 Args : Bio::Align::AlignI of DNA sequences
|
|
225 double - gap penalty
|
|
226
|
|
227
|
|
228 =cut
|
|
229
|
|
230 sub D_JukesCantor{
|
|
231 my ($self,$aln,$gappenalty) = @_;
|
|
232 return 0 unless $self->_check_arg($aln);
|
|
233 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
|
|
234 # ambiguities ignored at this point
|
|
235
|
|
236 my (@seqs);
|
|
237 foreach my $seq ( $aln->each_seq) {
|
|
238 push @seqs, [ split(//,uc $seq->seq())];
|
|
239 }
|
|
240 my $seqct = scalar @seqs;
|
|
241 my @DVals;
|
|
242 for(my $i = 1; $i <= $seqct; $i++ ) {
|
|
243 for( my $j = $i+1; $j <= $seqct; $j++ ) {
|
|
244 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1],
|
|
245 $seqs[$j-1]);
|
|
246 # just want diagonals
|
|
247 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
|
|
248 $matrix->[2]->[2] + $matrix->[3]->[3] );
|
|
249 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
|
|
250 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
|
|
251 $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d;
|
|
252 }
|
|
253 }
|
|
254 return \@DVals;
|
|
255 }
|
|
256
|
|
257 =head2 D_F81
|
|
258
|
|
259 Title : D_F81
|
|
260 Usage : my $d = $stat->D_F81($aln)
|
|
261 Function: Calculates D (pairwise distance) between 2 sequences in an
|
|
262 alignment using the Felsenstein 1981 distance model.
|
|
263 Returns : ArrayRef of a 2d array of all pairwise distances in the alignment
|
|
264 Args : Bio::Align::AlignI of DNA sequences
|
|
265
|
|
266
|
|
267 =cut
|
|
268
|
|
269 sub D_F81{
|
|
270 my ($self,$aln) = @_;
|
|
271 return 0 unless $self->_check_arg($aln);
|
|
272 $self->throw("This isn't implemented yet - sorry");
|
|
273 }
|
|
274
|
|
275
|
|
276 # M Kimura, J. Mol. Evol., 1980, 16, 111.
|
|
277
|
|
278 =head2 D_Kimura
|
|
279
|
|
280 Title : D_Kimura
|
|
281 Usage : my $d = $stat->D_Kimura($aln)
|
|
282 Function: Calculates D (pairwise distance) between 2 sequences in an
|
|
283 alignment using the Kimura 2 parameter model.
|
|
284 Returns : ArrayRef of pairwise distances between all sequences in alignment
|
|
285 Args : Bio::Align::AlignI of DNA sequences
|
|
286
|
|
287
|
|
288 =cut
|
|
289
|
|
290 sub D_Kimura{
|
|
291 my ($self,$aln) = @_;
|
|
292 return 0 unless $self->_check_arg($aln);
|
|
293 my $seqct = $aln->no_sequences;
|
|
294 my @KVals;
|
|
295 for( my $i = 1; $i <= $seqct; $i++ ) {
|
|
296 for( my $j = $i+1; $j <= $seqct; $j++ ) {
|
|
297 my $pairwise = $aln->select_noncont($i,$j);
|
|
298 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
|
|
299 my $P = $self->transitions($pairwise) / $L;
|
|
300 my $Q = $self->transversions($pairwise) / $L;
|
|
301
|
|
302 my $a = 1 / ( 1 - (2 * $P) - $Q);
|
|
303 my $b = 1 / ( 1 - 2 * $Q );
|
|
304 my $K = (1/2) * log ( $a ) + (1/4) * log($b);
|
|
305 $KVals[$i]->[$j] = $K;
|
|
306 $KVals[$j]->[$i] = $K;
|
|
307 }
|
|
308 }
|
|
309 return \@KVals;
|
|
310 }
|
|
311
|
|
312 # K Tamura, Mol. Biol. Evol. 1992, 9, 678.
|
|
313
|
|
314 =head2 D_Tamura
|
|
315
|
|
316 Title : D_Tamura
|
|
317 Usage :
|
|
318 Function:
|
|
319 Returns :
|
|
320 Args :
|
|
321
|
|
322
|
|
323 =cut
|
|
324
|
|
325 sub D_Tamura{
|
|
326 my ($self,$aln) = @_;
|
|
327 my $seqct = $aln->no_sequences;
|
|
328 my @KVals;
|
|
329 for( my $i = 1; $i <= $seqct; $i++ ) {
|
|
330 for( my $j = $i+1; $j <= $seqct; $j++ ) {
|
|
331 }
|
|
332 }
|
|
333 my $O = 0.25;
|
|
334 my $t = 0;
|
|
335 my $a = 0;
|
|
336 my $b = 0;
|
|
337
|
|
338
|
|
339 my $d = 4 * $O * ( 1 - $O ) * $a * $t + 2 * $b * $t;
|
|
340 return $d;
|
|
341 }
|
|
342
|
|
343 =head2 D_F84
|
|
344
|
|
345 Title : D_F84
|
|
346 Usage : my $d = $stat->D_F84($aln)
|
|
347 Function: Calculates D (pairwise distance) between 2 sequences in an
|
|
348 alignment using the Felsenstein 1984 distance model.
|
|
349 Returns : Distance value
|
|
350 Args : Bio::Align::AlignI of DNA sequences
|
|
351 double - gap penalty
|
|
352
|
|
353 =cut
|
|
354
|
|
355 sub D_F84{
|
|
356 my ($self,$aln) = @_;
|
|
357 return 0 unless $self->_check_arg($aln);
|
|
358 }
|
|
359
|
|
360 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
|
|
361
|
|
362 =head2 D_TajimaNei
|
|
363
|
|
364 Title : D_TajimaNei
|
|
365 Usage : my $d = $stat->D_TajimaNei($aln)
|
|
366 Function: Calculates D (pairwise distance) between 2 sequences in an
|
|
367 alignment using the TajimaNei 1984 distance model.
|
|
368 Returns : Distance value
|
|
369 Args : Bio::Align::AlignI of DNA sequences
|
|
370
|
|
371
|
|
372 =cut
|
|
373
|
|
374 sub D_TajimaNei{
|
|
375 my ($self,$aln) = @_;
|
|
376 $self->warn("The result from this method is not correct right now");
|
|
377 my (@seqs);
|
|
378 foreach my $seq ( $aln->each_seq) {
|
|
379 push @seqs, [ split(//,uc $seq->seq())];
|
|
380 }
|
|
381 my $seqct = scalar @seqs;
|
|
382 my @DVals;
|
|
383 for(my $i = 1; $i <= $seqct; $i++ ) {
|
|
384 for( my $j = $i+1; $j <= $seqct; $j++ ) {
|
|
385 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1],
|
|
386 $seqs[$j-1]);
|
|
387 my $fij2;
|
|
388 my $slen = $aln->length - $gaps;
|
|
389 for( my $bs = 0; $bs < 4; $bs++ ) {
|
|
390 my $fi = 0;
|
|
391 map {$fi += $matrix->[$bs]->[$_] } 0..3;
|
|
392 my $fj = 0;
|
|
393 map { $fj += $matrix->[$_]->[$bs] } 0..3;
|
|
394 my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
|
|
395 $fij2 += $fij**2;
|
|
396 }
|
|
397 my ($pair,$h) = (0,0);
|
|
398 for( my $bs = 0; $bs < 3; $bs++ ) {
|
|
399 for( my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
|
|
400 my $fij = $pfreq->[$pair++] / $slen;
|
|
401 if( $fij ) {
|
|
402
|
|
403 my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
|
|
404
|
|
405 map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
|
|
406 map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
|
|
407 map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
|
|
408 map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
|
|
409
|
|
410 $h += ( $fij*$fij / 2 ) /
|
|
411 ( ( ( $ci1 + $cj1 ) / 2 * $slen ) *
|
|
412 ( ( $ci2 + $cj2 ) /2 * $slen )
|
|
413 );
|
|
414 $self->debug( "h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
|
|
415 }
|
|
416 }
|
|
417 }
|
|
418 # just want diagonals first
|
|
419
|
|
420 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
|
|
421 $matrix->[2]->[2] + $matrix->[3]->[3] );
|
|
422 my $D = 1 - ( $m / $slen);
|
|
423
|
|
424 my $b = (1-$fij2+(($D**2)/$h)) / 2;
|
|
425 $self->debug("h is $h fij2 is $fij2 b is $b\n");
|
|
426
|
|
427 my $d = (-1 * $b) * log ( 1 - $D/ $b);
|
|
428 $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d;
|
|
429 }
|
|
430 }
|
|
431 return \@DVals;
|
|
432
|
|
433
|
|
434 }
|
|
435
|
|
436 # HKY -- HASEGAWA, M., H. KISHINO, and T. YANO. 1985
|
|
437 # Tamura and Nei 1993?
|
|
438 # GTR?
|
|
439
|
|
440 =head2 K - sequence substitution methods
|
|
441
|
|
442 =cut
|
|
443
|
|
444 =head2 K_JukesCantor
|
|
445
|
|
446 Title : K_JukesCantor
|
|
447 Usage : my $k = $stats->K_JukesCantor($aln)
|
|
448 Function: Calculates K - the number of nucleotide substitutions between
|
|
449 2 seqs - according to the Jukes-Cantor 1 parameter model
|
|
450 This only involves the number of changes between two sequences.
|
|
451 Returns : double
|
|
452 Args : Bio::Align::AlignI
|
|
453
|
|
454
|
|
455 =cut
|
|
456
|
|
457 sub K_JukesCantor{
|
|
458 my ($self,$aln) = @_;
|
|
459 return 0 unless $self->_check_arg($aln);
|
|
460 my $seqct = $aln->no_sequences;
|
|
461 my @KVals;
|
|
462 for( my $i = 1; $i <= $seqct; $i++ ) {
|
|
463 for( my $j = $i+1; $j <= $seqct; $j++ ) {
|
|
464 my $pairwise = $aln->select_noncont($i,$j);
|
|
465 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
|
|
466 my $N = $self->pairwise_stats->number_of_differences($pairwise);
|
|
467 my $p = $N / $L;
|
|
468 my $K = - ( 3 / 4) * log ( 1 - (( 4 * $p) / 3 ));
|
|
469 $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K;
|
|
470 }
|
|
471 }
|
|
472 return \@KVals;
|
|
473 }
|
|
474
|
|
475 =head2 K_TajimaNei
|
|
476
|
|
477 Title : K_TajimaNei
|
|
478 Usage : my $k = $stats->K_TajimaNei($aln)
|
|
479 Function: Calculates K - the number of nucleotide substitutions between
|
|
480 2 seqs - according to the Kimura 2 parameter model.
|
|
481 This does not assume equal frequencies among all the nucleotides.
|
|
482 Returns : ArrayRef of 2d matrix which contains pairwise K values for
|
|
483 all sequences in the alignment
|
|
484 Args : Bio::Align::AlignI
|
|
485
|
|
486 =cut
|
|
487
|
|
488 sub K_TajimaNei {
|
|
489 my ($self,$aln) = @_;
|
|
490 return 0 unless $self->_check_arg($aln);
|
|
491
|
|
492 my @seqs;
|
|
493 foreach my $seq ( $aln->each_seq) {
|
|
494 push @seqs, [ split(//,uc $seq->seq())];
|
|
495 }
|
|
496 my @KVals;
|
|
497 my $L = $self->pairwise_stats->number_of_comparable_bases($aln);
|
|
498 my $seqct = scalar @seqs;
|
|
499 for( my $i = 1; $i <= $seqct; $i++ ) {
|
|
500 for( my $j = $i+1; $j <= $seqct; $j++ ) {
|
|
501 my (%q,%y);
|
|
502 my ($first,$second) = ($seqs[$i-1],$seqs[$j-1]);
|
|
503
|
|
504 for (my $k = 0;$k<$aln->length; $k++ ) {
|
|
505 next if( $first->[$k] =~ /^$GapChars$/ ||
|
|
506 $second->[$k] =~ /^$GapChars$/);
|
|
507
|
|
508 $q{$second->[$k]}++;
|
|
509 $q{$first->[$k]}++;
|
|
510 if( $first->[$k] ne $second->[$k] ) {
|
|
511 $y{$first->[$k]}->{$second->[$k]}++;
|
|
512 }
|
|
513 }
|
|
514
|
|
515 my $q_sum = 0;
|
|
516 foreach my $let ( @Nucleotides ) {
|
|
517 # ct is the number of sequences compared (2)
|
|
518 # L is the length of the alignment without gaps
|
|
519 # $ct * $L = total number of nt compared
|
|
520 my $avg = $q{$let} / ( $SeqCount * $L );
|
|
521 $q_sum += $avg**2;
|
|
522 }
|
|
523 my $b1 = 1 - $q_sum;
|
|
524 my $h = 0;
|
|
525 for( my $i = 0; $i <= 2; $i++ ) {
|
|
526 for( my $j = $i+1; $j <= 3; $j++) {
|
|
527 $y{$Nucleotides[$i]}->{$Nucleotides[$j]} ||= 0;
|
|
528 $y{$Nucleotides[$j]}->{$Nucleotides[$i]} ||= 0;
|
|
529 my $x = ($y{$Nucleotides[$i]}->{$Nucleotides[$j]} +
|
|
530 $y{$Nucleotides[$j]}->{$Nucleotides[$i]}) / $L;
|
|
531 $h += ($x ** 2) / ( 2 * $q{$Nucleotides[$i]} *
|
|
532 $q{$Nucleotides[$j]} );
|
|
533 }
|
|
534 }
|
|
535 my $N = $self->pairwise_stats->number_of_differences($aln);
|
|
536 my $p = $N / $L;
|
|
537 my $b = ( $b1 + $p ** 2 / $h ) / 2;
|
|
538 my $K = - $b * log ( 1 - $p / $b );
|
|
539 $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K;
|
|
540 }
|
|
541 }
|
|
542 return \@KVals;
|
|
543 }
|
|
544
|
|
545
|
|
546
|
|
547 =head2 transversions
|
|
548
|
|
549 Title : transversions
|
|
550 Usage : my $transversions = $stats->transversion($aln);
|
|
551 Function: Calculates the number of transversions between two sequences in
|
|
552 an alignment
|
|
553 Returns : integer
|
|
554 Args : Bio::Align::AlignI
|
|
555
|
|
556
|
|
557 =cut
|
|
558
|
|
559 sub transversions{
|
|
560 my ($self,$aln) = @_;
|
|
561 return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
|
|
562 }
|
|
563
|
|
564 =head2 transitions
|
|
565
|
|
566 Title : transitions
|
|
567 Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
|
|
568 Function: Calculates the number of transitions in a given DNA alignment
|
|
569 Returns : integer representing the number of transitions
|
|
570 Args : Bio::Align::AlignI object
|
|
571
|
|
572
|
|
573 =cut
|
|
574
|
|
575 sub transitions{
|
|
576 my ($self,$aln) = @_;
|
|
577 return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
|
|
578 }
|
|
579
|
|
580
|
|
581 sub _trans_count_helper {
|
|
582 my ($self,$aln,$type) = @_;
|
|
583 return 0 unless( $self->_check_arg($aln) );
|
|
584 if( ! $aln->is_flush ) { $self->throw("must be flush") }
|
|
585 my (@seqs,@tcount);
|
|
586 foreach my $seq ( $aln->get_seq_by_pos(1), $aln->get_seq_by_pos(2) ) {
|
|
587 push @seqs, [ split(//,$seq->seq())];
|
|
588 }
|
|
589 my ($first,$second) = @seqs;
|
|
590
|
|
591 for (my $i = 0;$i<$aln->length; $i++ ) {
|
|
592 next if( $first->[$i] =~ /^$GapChars$/ ||
|
|
593 $second->[$i] =~ /^$GapChars$/);
|
|
594 if( $first->[$i] ne $second->[$i] ) {
|
|
595 foreach my $nt ( @{$type->{$first->[$i]}} ) {
|
|
596 if( $nt eq $second->[$i]) {
|
|
597 $tcount[$i]++;
|
|
598 }
|
|
599 }
|
|
600 }
|
|
601 }
|
|
602 my $sum = 0;
|
|
603 map { if( $_) { $sum += $_} } @tcount;
|
|
604 return $sum;
|
|
605 }
|
|
606
|
|
607 # this will generate a matrix which records across the row, the number
|
|
608 # of DNA subst
|
|
609 #
|
|
610 sub _build_nt_matrix {
|
|
611 my ($self,$seqa,$seqb) = @_;
|
|
612
|
|
613
|
|
614 my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
|
|
615 [ qw(0 0 0 0) ],
|
|
616 [ qw(0 0 0 0) ],
|
|
617 [ qw(0 0 0 0) ] ];
|
|
618 my $gaps = 0; # number of gaps
|
|
619 my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
|
|
620
|
|
621 for( my $i = 0; $i < scalar @$seqa; $i++) {
|
|
622
|
|
623 my ($ti,$tj) = ($seqa->[$i],$seqb->[$i]);
|
|
624 $ti =~ tr/U/T/;
|
|
625 $tj =~ tr/U/T/;
|
|
626
|
|
627 if( $ti =~ /^$GapChars$/) { $gaps++; next; }
|
|
628 if( $tj =~ /^$GapChars$/) { $gaps++; next }
|
|
629
|
|
630 my $ti_index = $NucleotideIndexes{$ti};
|
|
631 my $tj_index = $NucleotideIndexes{$tj};
|
|
632
|
|
633 if( ! defined $ti_index ) {
|
|
634 print "ti_index not defined for $ti\n";
|
|
635 next;
|
|
636 }
|
|
637
|
|
638 $basect_matrix->[$ti_index]->[$tj_index]++;
|
|
639
|
|
640 if( $ti ne $tj ) {
|
|
641 $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
|
|
642 }
|
|
643 }
|
|
644 return ($basect_matrix,$pfreq,$gaps);
|
|
645 }
|
|
646
|
|
647 sub _check_arg {
|
|
648 my($self,$aln ) = @_;
|
|
649 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
|
|
650 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
|
|
651 return 0;
|
|
652 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
|
|
653 $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
|
|
654 return 0;
|
|
655 }
|
|
656 return 1;
|
|
657 }
|
|
658
|
|
659 =head2 Data Methods
|
|
660
|
|
661 =cut
|
|
662
|
|
663 =head2 pairwise_stats
|
|
664
|
|
665 Title : pairwise_stats
|
|
666 Usage : $obj->pairwise_stats($newval)
|
|
667 Function:
|
|
668 Returns : value of pairwise_stats
|
|
669 Args : newvalue (optional)
|
|
670
|
|
671
|
|
672 =cut
|
|
673
|
|
674 sub pairwise_stats{
|
|
675 my ($self,$value) = @_;
|
|
676 if( defined $value) {
|
|
677 $self->{'_pairwise_stats'} = $value;
|
|
678 }
|
|
679 return $self->{'_pairwise_stats'};
|
|
680
|
|
681 }
|
|
682
|
|
683 1;
|