Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/Align/DNAStatistics.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: DNAStatistics.pm,v 1.4 2002/10/22 07:45:10 lapp Exp $ | |
2 # | |
3 # BioPerl module for Bio::Align::DNAStatistics | |
4 # | |
5 # Cared for by Jason Stajich <jason@bioperl.org> | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 use Bio::Align::DNAStatistics; | |
20 use Bio::AlignIO; | |
21 | |
22 my $stats = new Bio::Align::PairwiseStatistics; | |
23 my $alignin = new Bio::AlignIO(-format => 'emboss', | |
24 -file => 't/data/insulin.water'); | |
25 my $jc = $stats->distance($aln, 'Jukes-Cantor'); | |
26 foreach my $r ( @$jc ) { | |
27 print "\t"; | |
28 foreach my $r ( @$d ) { | |
29 print "$r\t"; | |
30 } | |
31 print "\n"; | |
32 } | |
33 | |
34 =head1 DESCRIPTION | |
35 | |
36 This object contains routines for calculating various statistics and | |
37 distances for DNA alignments. The routines are not well tested and do | |
38 contain errors at this point. Work is underway to correct them, but | |
39 do not expect this code to give you the right answer currently! Use | |
40 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the | |
41 distances. | |
42 | |
43 =head1 FEEDBACK | |
44 | |
45 =head2 Mailing Lists | |
46 | |
47 User feedback is an integral part of the evolution of this and other | |
48 Bioperl modules. Send your comments and suggestions preferably to | |
49 the Bioperl mailing list. Your participation is much appreciated. | |
50 | |
51 bioperl-l@bioperl.org - General discussion | |
52 http://bioperl.org/MailList.shtml - About the mailing lists | |
53 | |
54 =head2 Reporting Bugs | |
55 | |
56 Report bugs to the Bioperl bug tracking system to help us keep track | |
57 of the bugs and their resolution. Bug reports can be submitted via | |
58 email or the web: | |
59 | |
60 bioperl-bugs@bioperl.org | |
61 http://bugzilla.bioperl.org/ | |
62 | |
63 =head1 AUTHOR - Jason Stajich | |
64 | |
65 Email jason@bioperl.org | |
66 | |
67 Describe contact details here | |
68 | |
69 =head1 CONTRIBUTORS | |
70 | |
71 Additional contributors names and emails here | |
72 | |
73 =head1 APPENDIX | |
74 | |
75 The rest of the documentation details each of the object methods. | |
76 Internal methods are usually preceded with a _ | |
77 | |
78 =cut | |
79 | |
80 | |
81 # Let the code begin... | |
82 | |
83 | |
84 package Bio::Align::DNAStatistics; | |
85 use vars qw(@ISA %DNAChanges @Nucleotides %NucleotideIndexes | |
86 $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods); | |
87 use strict; | |
88 use Bio::Align::PairwiseStatistics; | |
89 use Bio::Root::Root; | |
90 | |
91 BEGIN { | |
92 $GapChars = '(\.|\-)'; | |
93 @Nucleotides = qw(A G T C); | |
94 $SeqCount = 2; | |
95 # these values come from EMBOSS distmat implementation | |
96 %NucleotideIndexes = ( 'A' => 0, | |
97 'T' => 1, | |
98 'C' => 2, | |
99 'G' => 3, | |
100 | |
101 'AT' => 0, | |
102 'AC' => 1, | |
103 'AG' => 2, | |
104 'CT' => 3, | |
105 'GT' => 4, | |
106 'CG' => 5, | |
107 | |
108 # these are wrong now | |
109 # 'S' => [ 1, 3], | |
110 # 'W' => [ 0, 4], | |
111 # 'Y' => [ 2, 3], | |
112 # 'R' => [ 0, 1], | |
113 # 'M' => [ 0, 3], | |
114 # 'K' => [ 1, 2], | |
115 # 'B' => [ 1, 2, 3], | |
116 # 'H' => [ 0, 2, 3], | |
117 # 'V' => [ 0, 1, 3], | |
118 # 'D' => [ 0, 1, 2], | |
119 ); | |
120 | |
121 $DefaultGapPenalty = 0; | |
122 # could put ambiguities here? | |
123 %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'], | |
124 'T' => [ 'A', 'G'], | |
125 'C' => [ 'A', 'G'], | |
126 'G' => [ 'C', 'T'], | |
127 }, | |
128 'Transitions' => { 'A' => [ 'G' ], | |
129 'G' => [ 'A' ], | |
130 'C' => [ 'T' ], | |
131 'T' => [ 'C' ], | |
132 }, | |
133 ); | |
134 %DistanceMethods = ( 'jc|jukes|jukes-cantor' => 'JukesCantor', | |
135 'f81' => 'F81', | |
136 'k2|k2p|k80|kimura' => 'Kimura', | |
137 't92|tamura|tamura92' => 'Tamura', | |
138 'f84' => 'F84', | |
139 'tajimanei|tajima-nei' => 'TajimaNei' ); | |
140 } | |
141 | |
142 @ISA = qw( Bio::Root::Root Bio::Align::StatisticsI ); | |
143 | |
144 =head2 new | |
145 | |
146 Title : new | |
147 Usage : my $obj = new Bio::Align::DNAStatistics(); | |
148 Function: Builds a new Bio::Align::DNAStatistics object | |
149 Returns : Bio::Align::DNAStatistics | |
150 Args : none | |
151 | |
152 | |
153 =cut | |
154 | |
155 sub new { | |
156 my ($class,@args) = @_; | |
157 my $self = $class->SUPER::new(@args); | |
158 | |
159 $self->pairwise_stats( new Bio::Align::PairwiseStatistics()); | |
160 | |
161 return $self; | |
162 } | |
163 | |
164 | |
165 =head2 distance | |
166 | |
167 Title : distance | |
168 Usage : my $distance_mat = $stats->distance(-align => $aln, | |
169 -method => $method); | |
170 Function: Calculates a distance matrix for all pairwise distances of | |
171 sequences in an alignment. | |
172 Returns : Array ref | |
173 Args : -align => Bio::Align::AlignI object | |
174 -method => String specifying specific distance method | |
175 (implementing class may assume a default) | |
176 | |
177 =cut | |
178 | |
179 sub distance{ | |
180 my ($self,@args) = @_; | |
181 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args); | |
182 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) { | |
183 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance"); | |
184 } | |
185 $method ||= 'JukesCantor'; | |
186 foreach my $m ( keys %DistanceMethods ) { | |
187 if(defined $m && $method =~ /$m/i ) { | |
188 my $mtd = "D_$DistanceMethods{$m}"; | |
189 return $self->$mtd($aln); | |
190 } | |
191 } | |
192 $self->warn("Unrecognized distance method $method must be one of [". | |
193 join(',',$self->available_distance_methods())."]"); | |
194 return undef; | |
195 } | |
196 | |
197 =head2 available_distance_methods | |
198 | |
199 Title : available_distance_methods | |
200 Usage : my @methods = $stats->available_distance_methods(); | |
201 Function: Enumerates the possible distance methods | |
202 Returns : Array of strings | |
203 Args : none | |
204 | |
205 | |
206 =cut | |
207 | |
208 sub available_distance_methods{ | |
209 my ($self,@args) = @_; | |
210 return values %DistanceMethods; | |
211 } | |
212 | |
213 =head2 D - distance methods | |
214 | |
215 =cut | |
216 | |
217 =head2 D_JukesCantor | |
218 | |
219 Title : D_JukesCantor | |
220 Usage : my $d = $stat->D_JukesCantor($aln) | |
221 Function: Calculates D (pairwise distance) between 2 sequences in an | |
222 alignment using the Jukes-Cantor 1 parameter model. | |
223 Returns : ArrayRef of all pairwise distances of all sequence pairs in the alignment | |
224 Args : Bio::Align::AlignI of DNA sequences | |
225 double - gap penalty | |
226 | |
227 | |
228 =cut | |
229 | |
230 sub D_JukesCantor{ | |
231 my ($self,$aln,$gappenalty) = @_; | |
232 return 0 unless $self->_check_arg($aln); | |
233 $gappenalty = $DefaultGapPenalty unless defined $gappenalty; | |
234 # ambiguities ignored at this point | |
235 | |
236 my (@seqs); | |
237 foreach my $seq ( $aln->each_seq) { | |
238 push @seqs, [ split(//,uc $seq->seq())]; | |
239 } | |
240 my $seqct = scalar @seqs; | |
241 my @DVals; | |
242 for(my $i = 1; $i <= $seqct; $i++ ) { | |
243 for( my $j = $i+1; $j <= $seqct; $j++ ) { | |
244 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1], | |
245 $seqs[$j-1]); | |
246 # just want diagonals | |
247 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] + | |
248 $matrix->[2]->[2] + $matrix->[3]->[3] ); | |
249 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty))); | |
250 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3)); | |
251 $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d; | |
252 } | |
253 } | |
254 return \@DVals; | |
255 } | |
256 | |
257 =head2 D_F81 | |
258 | |
259 Title : D_F81 | |
260 Usage : my $d = $stat->D_F81($aln) | |
261 Function: Calculates D (pairwise distance) between 2 sequences in an | |
262 alignment using the Felsenstein 1981 distance model. | |
263 Returns : ArrayRef of a 2d array of all pairwise distances in the alignment | |
264 Args : Bio::Align::AlignI of DNA sequences | |
265 | |
266 | |
267 =cut | |
268 | |
269 sub D_F81{ | |
270 my ($self,$aln) = @_; | |
271 return 0 unless $self->_check_arg($aln); | |
272 $self->throw("This isn't implemented yet - sorry"); | |
273 } | |
274 | |
275 | |
276 # M Kimura, J. Mol. Evol., 1980, 16, 111. | |
277 | |
278 =head2 D_Kimura | |
279 | |
280 Title : D_Kimura | |
281 Usage : my $d = $stat->D_Kimura($aln) | |
282 Function: Calculates D (pairwise distance) between 2 sequences in an | |
283 alignment using the Kimura 2 parameter model. | |
284 Returns : ArrayRef of pairwise distances between all sequences in alignment | |
285 Args : Bio::Align::AlignI of DNA sequences | |
286 | |
287 | |
288 =cut | |
289 | |
290 sub D_Kimura{ | |
291 my ($self,$aln) = @_; | |
292 return 0 unless $self->_check_arg($aln); | |
293 my $seqct = $aln->no_sequences; | |
294 my @KVals; | |
295 for( my $i = 1; $i <= $seqct; $i++ ) { | |
296 for( my $j = $i+1; $j <= $seqct; $j++ ) { | |
297 my $pairwise = $aln->select_noncont($i,$j); | |
298 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise); | |
299 my $P = $self->transitions($pairwise) / $L; | |
300 my $Q = $self->transversions($pairwise) / $L; | |
301 | |
302 my $a = 1 / ( 1 - (2 * $P) - $Q); | |
303 my $b = 1 / ( 1 - 2 * $Q ); | |
304 my $K = (1/2) * log ( $a ) + (1/4) * log($b); | |
305 $KVals[$i]->[$j] = $K; | |
306 $KVals[$j]->[$i] = $K; | |
307 } | |
308 } | |
309 return \@KVals; | |
310 } | |
311 | |
312 # K Tamura, Mol. Biol. Evol. 1992, 9, 678. | |
313 | |
314 =head2 D_Tamura | |
315 | |
316 Title : D_Tamura | |
317 Usage : | |
318 Function: | |
319 Returns : | |
320 Args : | |
321 | |
322 | |
323 =cut | |
324 | |
325 sub D_Tamura{ | |
326 my ($self,$aln) = @_; | |
327 my $seqct = $aln->no_sequences; | |
328 my @KVals; | |
329 for( my $i = 1; $i <= $seqct; $i++ ) { | |
330 for( my $j = $i+1; $j <= $seqct; $j++ ) { | |
331 } | |
332 } | |
333 my $O = 0.25; | |
334 my $t = 0; | |
335 my $a = 0; | |
336 my $b = 0; | |
337 | |
338 | |
339 my $d = 4 * $O * ( 1 - $O ) * $a * $t + 2 * $b * $t; | |
340 return $d; | |
341 } | |
342 | |
343 =head2 D_F84 | |
344 | |
345 Title : D_F84 | |
346 Usage : my $d = $stat->D_F84($aln) | |
347 Function: Calculates D (pairwise distance) between 2 sequences in an | |
348 alignment using the Felsenstein 1984 distance model. | |
349 Returns : Distance value | |
350 Args : Bio::Align::AlignI of DNA sequences | |
351 double - gap penalty | |
352 | |
353 =cut | |
354 | |
355 sub D_F84{ | |
356 my ($self,$aln) = @_; | |
357 return 0 unless $self->_check_arg($aln); | |
358 } | |
359 | |
360 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269. | |
361 | |
362 =head2 D_TajimaNei | |
363 | |
364 Title : D_TajimaNei | |
365 Usage : my $d = $stat->D_TajimaNei($aln) | |
366 Function: Calculates D (pairwise distance) between 2 sequences in an | |
367 alignment using the TajimaNei 1984 distance model. | |
368 Returns : Distance value | |
369 Args : Bio::Align::AlignI of DNA sequences | |
370 | |
371 | |
372 =cut | |
373 | |
374 sub D_TajimaNei{ | |
375 my ($self,$aln) = @_; | |
376 $self->warn("The result from this method is not correct right now"); | |
377 my (@seqs); | |
378 foreach my $seq ( $aln->each_seq) { | |
379 push @seqs, [ split(//,uc $seq->seq())]; | |
380 } | |
381 my $seqct = scalar @seqs; | |
382 my @DVals; | |
383 for(my $i = 1; $i <= $seqct; $i++ ) { | |
384 for( my $j = $i+1; $j <= $seqct; $j++ ) { | |
385 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1], | |
386 $seqs[$j-1]); | |
387 my $fij2; | |
388 my $slen = $aln->length - $gaps; | |
389 for( my $bs = 0; $bs < 4; $bs++ ) { | |
390 my $fi = 0; | |
391 map {$fi += $matrix->[$bs]->[$_] } 0..3; | |
392 my $fj = 0; | |
393 map { $fj += $matrix->[$_]->[$bs] } 0..3; | |
394 my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0; | |
395 $fij2 += $fij**2; | |
396 } | |
397 my ($pair,$h) = (0,0); | |
398 for( my $bs = 0; $bs < 3; $bs++ ) { | |
399 for( my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) { | |
400 my $fij = $pfreq->[$pair++] / $slen; | |
401 if( $fij ) { | |
402 | |
403 my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0); | |
404 | |
405 map { $ci1 += $matrix->[$_]->[$bs] } 0..3; | |
406 map { $cj1 += $matrix->[$bs]->[$_] } 0..3; | |
407 map { $ci2 += $matrix->[$_]->[$bs1] } 0..3; | |
408 map { $cj2 += $matrix->[$bs1]->[$_] } 0..3; | |
409 | |
410 $h += ( $fij*$fij / 2 ) / | |
411 ( ( ( $ci1 + $cj1 ) / 2 * $slen ) * | |
412 ( ( $ci2 + $cj2 ) /2 * $slen ) | |
413 ); | |
414 $self->debug( "h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n"); | |
415 } | |
416 } | |
417 } | |
418 # just want diagonals first | |
419 | |
420 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] + | |
421 $matrix->[2]->[2] + $matrix->[3]->[3] ); | |
422 my $D = 1 - ( $m / $slen); | |
423 | |
424 my $b = (1-$fij2+(($D**2)/$h)) / 2; | |
425 $self->debug("h is $h fij2 is $fij2 b is $b\n"); | |
426 | |
427 my $d = (-1 * $b) * log ( 1 - $D/ $b); | |
428 $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d; | |
429 } | |
430 } | |
431 return \@DVals; | |
432 | |
433 | |
434 } | |
435 | |
436 # HKY -- HASEGAWA, M., H. KISHINO, and T. YANO. 1985 | |
437 # Tamura and Nei 1993? | |
438 # GTR? | |
439 | |
440 =head2 K - sequence substitution methods | |
441 | |
442 =cut | |
443 | |
444 =head2 K_JukesCantor | |
445 | |
446 Title : K_JukesCantor | |
447 Usage : my $k = $stats->K_JukesCantor($aln) | |
448 Function: Calculates K - the number of nucleotide substitutions between | |
449 2 seqs - according to the Jukes-Cantor 1 parameter model | |
450 This only involves the number of changes between two sequences. | |
451 Returns : double | |
452 Args : Bio::Align::AlignI | |
453 | |
454 | |
455 =cut | |
456 | |
457 sub K_JukesCantor{ | |
458 my ($self,$aln) = @_; | |
459 return 0 unless $self->_check_arg($aln); | |
460 my $seqct = $aln->no_sequences; | |
461 my @KVals; | |
462 for( my $i = 1; $i <= $seqct; $i++ ) { | |
463 for( my $j = $i+1; $j <= $seqct; $j++ ) { | |
464 my $pairwise = $aln->select_noncont($i,$j); | |
465 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise); | |
466 my $N = $self->pairwise_stats->number_of_differences($pairwise); | |
467 my $p = $N / $L; | |
468 my $K = - ( 3 / 4) * log ( 1 - (( 4 * $p) / 3 )); | |
469 $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K; | |
470 } | |
471 } | |
472 return \@KVals; | |
473 } | |
474 | |
475 =head2 K_TajimaNei | |
476 | |
477 Title : K_TajimaNei | |
478 Usage : my $k = $stats->K_TajimaNei($aln) | |
479 Function: Calculates K - the number of nucleotide substitutions between | |
480 2 seqs - according to the Kimura 2 parameter model. | |
481 This does not assume equal frequencies among all the nucleotides. | |
482 Returns : ArrayRef of 2d matrix which contains pairwise K values for | |
483 all sequences in the alignment | |
484 Args : Bio::Align::AlignI | |
485 | |
486 =cut | |
487 | |
488 sub K_TajimaNei { | |
489 my ($self,$aln) = @_; | |
490 return 0 unless $self->_check_arg($aln); | |
491 | |
492 my @seqs; | |
493 foreach my $seq ( $aln->each_seq) { | |
494 push @seqs, [ split(//,uc $seq->seq())]; | |
495 } | |
496 my @KVals; | |
497 my $L = $self->pairwise_stats->number_of_comparable_bases($aln); | |
498 my $seqct = scalar @seqs; | |
499 for( my $i = 1; $i <= $seqct; $i++ ) { | |
500 for( my $j = $i+1; $j <= $seqct; $j++ ) { | |
501 my (%q,%y); | |
502 my ($first,$second) = ($seqs[$i-1],$seqs[$j-1]); | |
503 | |
504 for (my $k = 0;$k<$aln->length; $k++ ) { | |
505 next if( $first->[$k] =~ /^$GapChars$/ || | |
506 $second->[$k] =~ /^$GapChars$/); | |
507 | |
508 $q{$second->[$k]}++; | |
509 $q{$first->[$k]}++; | |
510 if( $first->[$k] ne $second->[$k] ) { | |
511 $y{$first->[$k]}->{$second->[$k]}++; | |
512 } | |
513 } | |
514 | |
515 my $q_sum = 0; | |
516 foreach my $let ( @Nucleotides ) { | |
517 # ct is the number of sequences compared (2) | |
518 # L is the length of the alignment without gaps | |
519 # $ct * $L = total number of nt compared | |
520 my $avg = $q{$let} / ( $SeqCount * $L ); | |
521 $q_sum += $avg**2; | |
522 } | |
523 my $b1 = 1 - $q_sum; | |
524 my $h = 0; | |
525 for( my $i = 0; $i <= 2; $i++ ) { | |
526 for( my $j = $i+1; $j <= 3; $j++) { | |
527 $y{$Nucleotides[$i]}->{$Nucleotides[$j]} ||= 0; | |
528 $y{$Nucleotides[$j]}->{$Nucleotides[$i]} ||= 0; | |
529 my $x = ($y{$Nucleotides[$i]}->{$Nucleotides[$j]} + | |
530 $y{$Nucleotides[$j]}->{$Nucleotides[$i]}) / $L; | |
531 $h += ($x ** 2) / ( 2 * $q{$Nucleotides[$i]} * | |
532 $q{$Nucleotides[$j]} ); | |
533 } | |
534 } | |
535 my $N = $self->pairwise_stats->number_of_differences($aln); | |
536 my $p = $N / $L; | |
537 my $b = ( $b1 + $p ** 2 / $h ) / 2; | |
538 my $K = - $b * log ( 1 - $p / $b ); | |
539 $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K; | |
540 } | |
541 } | |
542 return \@KVals; | |
543 } | |
544 | |
545 | |
546 | |
547 =head2 transversions | |
548 | |
549 Title : transversions | |
550 Usage : my $transversions = $stats->transversion($aln); | |
551 Function: Calculates the number of transversions between two sequences in | |
552 an alignment | |
553 Returns : integer | |
554 Args : Bio::Align::AlignI | |
555 | |
556 | |
557 =cut | |
558 | |
559 sub transversions{ | |
560 my ($self,$aln) = @_; | |
561 return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'}); | |
562 } | |
563 | |
564 =head2 transitions | |
565 | |
566 Title : transitions | |
567 Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln); | |
568 Function: Calculates the number of transitions in a given DNA alignment | |
569 Returns : integer representing the number of transitions | |
570 Args : Bio::Align::AlignI object | |
571 | |
572 | |
573 =cut | |
574 | |
575 sub transitions{ | |
576 my ($self,$aln) = @_; | |
577 return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'}); | |
578 } | |
579 | |
580 | |
581 sub _trans_count_helper { | |
582 my ($self,$aln,$type) = @_; | |
583 return 0 unless( $self->_check_arg($aln) ); | |
584 if( ! $aln->is_flush ) { $self->throw("must be flush") } | |
585 my (@seqs,@tcount); | |
586 foreach my $seq ( $aln->get_seq_by_pos(1), $aln->get_seq_by_pos(2) ) { | |
587 push @seqs, [ split(//,$seq->seq())]; | |
588 } | |
589 my ($first,$second) = @seqs; | |
590 | |
591 for (my $i = 0;$i<$aln->length; $i++ ) { | |
592 next if( $first->[$i] =~ /^$GapChars$/ || | |
593 $second->[$i] =~ /^$GapChars$/); | |
594 if( $first->[$i] ne $second->[$i] ) { | |
595 foreach my $nt ( @{$type->{$first->[$i]}} ) { | |
596 if( $nt eq $second->[$i]) { | |
597 $tcount[$i]++; | |
598 } | |
599 } | |
600 } | |
601 } | |
602 my $sum = 0; | |
603 map { if( $_) { $sum += $_} } @tcount; | |
604 return $sum; | |
605 } | |
606 | |
607 # this will generate a matrix which records across the row, the number | |
608 # of DNA subst | |
609 # | |
610 sub _build_nt_matrix { | |
611 my ($self,$seqa,$seqb) = @_; | |
612 | |
613 | |
614 my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match | |
615 [ qw(0 0 0 0) ], | |
616 [ qw(0 0 0 0) ], | |
617 [ qw(0 0 0 0) ] ]; | |
618 my $gaps = 0; # number of gaps | |
619 my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency | |
620 | |
621 for( my $i = 0; $i < scalar @$seqa; $i++) { | |
622 | |
623 my ($ti,$tj) = ($seqa->[$i],$seqb->[$i]); | |
624 $ti =~ tr/U/T/; | |
625 $tj =~ tr/U/T/; | |
626 | |
627 if( $ti =~ /^$GapChars$/) { $gaps++; next; } | |
628 if( $tj =~ /^$GapChars$/) { $gaps++; next } | |
629 | |
630 my $ti_index = $NucleotideIndexes{$ti}; | |
631 my $tj_index = $NucleotideIndexes{$tj}; | |
632 | |
633 if( ! defined $ti_index ) { | |
634 print "ti_index not defined for $ti\n"; | |
635 next; | |
636 } | |
637 | |
638 $basect_matrix->[$ti_index]->[$tj_index]++; | |
639 | |
640 if( $ti ne $tj ) { | |
641 $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++; | |
642 } | |
643 } | |
644 return ($basect_matrix,$pfreq,$gaps); | |
645 } | |
646 | |
647 sub _check_arg { | |
648 my($self,$aln ) = @_; | |
649 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) { | |
650 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics"); | |
651 return 0; | |
652 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) { | |
653 $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet); | |
654 return 0; | |
655 } | |
656 return 1; | |
657 } | |
658 | |
659 =head2 Data Methods | |
660 | |
661 =cut | |
662 | |
663 =head2 pairwise_stats | |
664 | |
665 Title : pairwise_stats | |
666 Usage : $obj->pairwise_stats($newval) | |
667 Function: | |
668 Returns : value of pairwise_stats | |
669 Args : newvalue (optional) | |
670 | |
671 | |
672 =cut | |
673 | |
674 sub pairwise_stats{ | |
675 my ($self,$value) = @_; | |
676 if( defined $value) { | |
677 $self->{'_pairwise_stats'} = $value; | |
678 } | |
679 return $self->{'_pairwise_stats'}; | |
680 | |
681 } | |
682 | |
683 1; |