Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/Align/DNAStatistics.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Align/DNAStatistics.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,683 @@ +# $Id: DNAStatistics.pm,v 1.4 2002/10/22 07:45:10 lapp Exp $ +# +# BioPerl module for Bio::Align::DNAStatistics +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment + +=head1 SYNOPSIS + + use Bio::Align::DNAStatistics; + use Bio::AlignIO; + + my $stats = new Bio::Align::PairwiseStatistics; + my $alignin = new Bio::AlignIO(-format => 'emboss', + -file => 't/data/insulin.water'); + my $jc = $stats->distance($aln, 'Jukes-Cantor'); + foreach my $r ( @$jc ) { + print "\t"; + foreach my $r ( @$d ) { + print "$r\t"; + } + print "\n"; + } + +=head1 DESCRIPTION + +This object contains routines for calculating various statistics and +distances for DNA alignments. The routines are not well tested and do +contain errors at this point. Work is underway to correct them, but +do not expect this code to give you the right answer currently! Use +dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the +distances. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Align::DNAStatistics; +use vars qw(@ISA %DNAChanges @Nucleotides %NucleotideIndexes + $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods); +use strict; +use Bio::Align::PairwiseStatistics; +use Bio::Root::Root; + +BEGIN { + $GapChars = '(\.|\-)'; + @Nucleotides = qw(A G T C); + $SeqCount = 2; + # these values come from EMBOSS distmat implementation + %NucleotideIndexes = ( 'A' => 0, + 'T' => 1, + 'C' => 2, + 'G' => 3, + + 'AT' => 0, + 'AC' => 1, + 'AG' => 2, + 'CT' => 3, + 'GT' => 4, + 'CG' => 5, + +# these are wrong now +# 'S' => [ 1, 3], +# 'W' => [ 0, 4], +# 'Y' => [ 2, 3], +# 'R' => [ 0, 1], +# 'M' => [ 0, 3], +# 'K' => [ 1, 2], +# 'B' => [ 1, 2, 3], +# 'H' => [ 0, 2, 3], +# 'V' => [ 0, 1, 3], +# 'D' => [ 0, 1, 2], + ); + + $DefaultGapPenalty = 0; + # could put ambiguities here? + %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'], + 'T' => [ 'A', 'G'], + 'C' => [ 'A', 'G'], + 'G' => [ 'C', 'T'], + }, + 'Transitions' => { 'A' => [ 'G' ], + 'G' => [ 'A' ], + 'C' => [ 'T' ], + 'T' => [ 'C' ], + }, + ); + %DistanceMethods = ( 'jc|jukes|jukes-cantor' => 'JukesCantor', + 'f81' => 'F81', + 'k2|k2p|k80|kimura' => 'Kimura', + 't92|tamura|tamura92' => 'Tamura', + 'f84' => 'F84', + 'tajimanei|tajima-nei' => 'TajimaNei' ); +} + +@ISA = qw( Bio::Root::Root Bio::Align::StatisticsI ); + +=head2 new + + Title : new + Usage : my $obj = new Bio::Align::DNAStatistics(); + Function: Builds a new Bio::Align::DNAStatistics object + Returns : Bio::Align::DNAStatistics + Args : none + + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + $self->pairwise_stats( new Bio::Align::PairwiseStatistics()); + + return $self; +} + + +=head2 distance + + Title : distance + Usage : my $distance_mat = $stats->distance(-align => $aln, + -method => $method); + Function: Calculates a distance matrix for all pairwise distances of + sequences in an alignment. + Returns : Array ref + Args : -align => Bio::Align::AlignI object + -method => String specifying specific distance method + (implementing class may assume a default) + +=cut + +sub distance{ + my ($self,@args) = @_; + my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args); + if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) { + $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance"); + } + $method ||= 'JukesCantor'; + foreach my $m ( keys %DistanceMethods ) { + if(defined $m && $method =~ /$m/i ) { + my $mtd = "D_$DistanceMethods{$m}"; + return $self->$mtd($aln); + } + } + $self->warn("Unrecognized distance method $method must be one of [". + join(',',$self->available_distance_methods())."]"); + return undef; +} + +=head2 available_distance_methods + + Title : available_distance_methods + Usage : my @methods = $stats->available_distance_methods(); + Function: Enumerates the possible distance methods + Returns : Array of strings + Args : none + + +=cut + +sub available_distance_methods{ + my ($self,@args) = @_; + return values %DistanceMethods; +} + +=head2 D - distance methods + +=cut + +=head2 D_JukesCantor + + Title : D_JukesCantor + Usage : my $d = $stat->D_JukesCantor($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the Jukes-Cantor 1 parameter model. + Returns : ArrayRef of all pairwise distances of all sequence pairs in the alignment + Args : Bio::Align::AlignI of DNA sequences + double - gap penalty + + +=cut + +sub D_JukesCantor{ + my ($self,$aln,$gappenalty) = @_; + return 0 unless $self->_check_arg($aln); + $gappenalty = $DefaultGapPenalty unless defined $gappenalty; + # ambiguities ignored at this point + + my (@seqs); + foreach my $seq ( $aln->each_seq) { + push @seqs, [ split(//,uc $seq->seq())]; + } + my $seqct = scalar @seqs; + my @DVals; + for(my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1], + $seqs[$j-1]); + # just want diagonals + my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] + + $matrix->[2]->[2] + $matrix->[3]->[3] ); + my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty))); + my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3)); + $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d; + } + } + return \@DVals; +} + +=head2 D_F81 + + Title : D_F81 + Usage : my $d = $stat->D_F81($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the Felsenstein 1981 distance model. + Returns : ArrayRef of a 2d array of all pairwise distances in the alignment + Args : Bio::Align::AlignI of DNA sequences + + +=cut + +sub D_F81{ + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); + $self->throw("This isn't implemented yet - sorry"); +} + + +# M Kimura, J. Mol. Evol., 1980, 16, 111. + +=head2 D_Kimura + + Title : D_Kimura + Usage : my $d = $stat->D_Kimura($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the Kimura 2 parameter model. + Returns : ArrayRef of pairwise distances between all sequences in alignment + Args : Bio::Align::AlignI of DNA sequences + + +=cut + +sub D_Kimura{ + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); + my $seqct = $aln->no_sequences; + my @KVals; + for( my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my $pairwise = $aln->select_noncont($i,$j); + my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise); + my $P = $self->transitions($pairwise) / $L; + my $Q = $self->transversions($pairwise) / $L; + + my $a = 1 / ( 1 - (2 * $P) - $Q); + my $b = 1 / ( 1 - 2 * $Q ); + my $K = (1/2) * log ( $a ) + (1/4) * log($b); + $KVals[$i]->[$j] = $K; + $KVals[$j]->[$i] = $K; + } + } + return \@KVals; +} + +# K Tamura, Mol. Biol. Evol. 1992, 9, 678. + +=head2 D_Tamura + + Title : D_Tamura + Usage : + Function: + Returns : + Args : + + +=cut + +sub D_Tamura{ + my ($self,$aln) = @_; + my $seqct = $aln->no_sequences; + my @KVals; + for( my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + } + } + my $O = 0.25; + my $t = 0; + my $a = 0; + my $b = 0; + + + my $d = 4 * $O * ( 1 - $O ) * $a * $t + 2 * $b * $t; + return $d; +} + +=head2 D_F84 + + Title : D_F84 + Usage : my $d = $stat->D_F84($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the Felsenstein 1984 distance model. + Returns : Distance value + Args : Bio::Align::AlignI of DNA sequences + double - gap penalty + +=cut + +sub D_F84{ + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); +} + +# Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269. + +=head2 D_TajimaNei + + Title : D_TajimaNei + Usage : my $d = $stat->D_TajimaNei($aln) + Function: Calculates D (pairwise distance) between 2 sequences in an + alignment using the TajimaNei 1984 distance model. + Returns : Distance value + Args : Bio::Align::AlignI of DNA sequences + + +=cut + +sub D_TajimaNei{ + my ($self,$aln) = @_; + $self->warn("The result from this method is not correct right now"); + my (@seqs); + foreach my $seq ( $aln->each_seq) { + push @seqs, [ split(//,uc $seq->seq())]; + } + my $seqct = scalar @seqs; + my @DVals; + for(my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i-1], + $seqs[$j-1]); + my $fij2; + my $slen = $aln->length - $gaps; + for( my $bs = 0; $bs < 4; $bs++ ) { + my $fi = 0; + map {$fi += $matrix->[$bs]->[$_] } 0..3; + my $fj = 0; + map { $fj += $matrix->[$_]->[$bs] } 0..3; + my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0; + $fij2 += $fij**2; + } + my ($pair,$h) = (0,0); + for( my $bs = 0; $bs < 3; $bs++ ) { + for( my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) { + my $fij = $pfreq->[$pair++] / $slen; + if( $fij ) { + + my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0); + + map { $ci1 += $matrix->[$_]->[$bs] } 0..3; + map { $cj1 += $matrix->[$bs]->[$_] } 0..3; + map { $ci2 += $matrix->[$_]->[$bs1] } 0..3; + map { $cj2 += $matrix->[$bs1]->[$_] } 0..3; + + $h += ( $fij*$fij / 2 ) / + ( ( ( $ci1 + $cj1 ) / 2 * $slen ) * + ( ( $ci2 + $cj2 ) /2 * $slen ) + ); + $self->debug( "h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n"); + } + } + } + # just want diagonals first + + my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] + + $matrix->[2]->[2] + $matrix->[3]->[3] ); + my $D = 1 - ( $m / $slen); + + my $b = (1-$fij2+(($D**2)/$h)) / 2; + $self->debug("h is $h fij2 is $fij2 b is $b\n"); + + my $d = (-1 * $b) * log ( 1 - $D/ $b); + $DVals[$i]->[$j] = $DVals[$j]->[$i] = $d; + } + } + return \@DVals; + + +} + +# HKY -- HASEGAWA, M., H. KISHINO, and T. YANO. 1985 +# Tamura and Nei 1993? +# GTR? + +=head2 K - sequence substitution methods + +=cut + +=head2 K_JukesCantor + + Title : K_JukesCantor + Usage : my $k = $stats->K_JukesCantor($aln) + Function: Calculates K - the number of nucleotide substitutions between + 2 seqs - according to the Jukes-Cantor 1 parameter model + This only involves the number of changes between two sequences. + Returns : double + Args : Bio::Align::AlignI + + +=cut + +sub K_JukesCantor{ + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); + my $seqct = $aln->no_sequences; + my @KVals; + for( my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my $pairwise = $aln->select_noncont($i,$j); + my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise); + my $N = $self->pairwise_stats->number_of_differences($pairwise); + my $p = $N / $L; + my $K = - ( 3 / 4) * log ( 1 - (( 4 * $p) / 3 )); + $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K; + } + } + return \@KVals; +} + +=head2 K_TajimaNei + + Title : K_TajimaNei + Usage : my $k = $stats->K_TajimaNei($aln) + Function: Calculates K - the number of nucleotide substitutions between + 2 seqs - according to the Kimura 2 parameter model. + This does not assume equal frequencies among all the nucleotides. + Returns : ArrayRef of 2d matrix which contains pairwise K values for + all sequences in the alignment + Args : Bio::Align::AlignI + +=cut + +sub K_TajimaNei { + my ($self,$aln) = @_; + return 0 unless $self->_check_arg($aln); + + my @seqs; + foreach my $seq ( $aln->each_seq) { + push @seqs, [ split(//,uc $seq->seq())]; + } + my @KVals; + my $L = $self->pairwise_stats->number_of_comparable_bases($aln); + my $seqct = scalar @seqs; + for( my $i = 1; $i <= $seqct; $i++ ) { + for( my $j = $i+1; $j <= $seqct; $j++ ) { + my (%q,%y); + my ($first,$second) = ($seqs[$i-1],$seqs[$j-1]); + + for (my $k = 0;$k<$aln->length; $k++ ) { + next if( $first->[$k] =~ /^$GapChars$/ || + $second->[$k] =~ /^$GapChars$/); + + $q{$second->[$k]}++; + $q{$first->[$k]}++; + if( $first->[$k] ne $second->[$k] ) { + $y{$first->[$k]}->{$second->[$k]}++; + } + } + + my $q_sum = 0; + foreach my $let ( @Nucleotides ) { + # ct is the number of sequences compared (2) + # L is the length of the alignment without gaps + # $ct * $L = total number of nt compared + my $avg = $q{$let} / ( $SeqCount * $L ); + $q_sum += $avg**2; + } + my $b1 = 1 - $q_sum; + my $h = 0; + for( my $i = 0; $i <= 2; $i++ ) { + for( my $j = $i+1; $j <= 3; $j++) { + $y{$Nucleotides[$i]}->{$Nucleotides[$j]} ||= 0; + $y{$Nucleotides[$j]}->{$Nucleotides[$i]} ||= 0; + my $x = ($y{$Nucleotides[$i]}->{$Nucleotides[$j]} + + $y{$Nucleotides[$j]}->{$Nucleotides[$i]}) / $L; + $h += ($x ** 2) / ( 2 * $q{$Nucleotides[$i]} * + $q{$Nucleotides[$j]} ); + } + } + my $N = $self->pairwise_stats->number_of_differences($aln); + my $p = $N / $L; + my $b = ( $b1 + $p ** 2 / $h ) / 2; + my $K = - $b * log ( 1 - $p / $b ); + $KVals[$i]->[$j] = $KVals[$j]->[$i] = $K; + } + } + return \@KVals; +} + + + +=head2 transversions + + Title : transversions + Usage : my $transversions = $stats->transversion($aln); + Function: Calculates the number of transversions between two sequences in + an alignment + Returns : integer + Args : Bio::Align::AlignI + + +=cut + +sub transversions{ + my ($self,$aln) = @_; + return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'}); +} + +=head2 transitions + + Title : transitions + Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln); + Function: Calculates the number of transitions in a given DNA alignment + Returns : integer representing the number of transitions + Args : Bio::Align::AlignI object + + +=cut + +sub transitions{ + my ($self,$aln) = @_; + return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'}); +} + + +sub _trans_count_helper { + my ($self,$aln,$type) = @_; + return 0 unless( $self->_check_arg($aln) ); + if( ! $aln->is_flush ) { $self->throw("must be flush") } + my (@seqs,@tcount); + foreach my $seq ( $aln->get_seq_by_pos(1), $aln->get_seq_by_pos(2) ) { + push @seqs, [ split(//,$seq->seq())]; + } + my ($first,$second) = @seqs; + + for (my $i = 0;$i<$aln->length; $i++ ) { + next if( $first->[$i] =~ /^$GapChars$/ || + $second->[$i] =~ /^$GapChars$/); + if( $first->[$i] ne $second->[$i] ) { + foreach my $nt ( @{$type->{$first->[$i]}} ) { + if( $nt eq $second->[$i]) { + $tcount[$i]++; + } + } + } + } + my $sum = 0; + map { if( $_) { $sum += $_} } @tcount; + return $sum; +} + +# this will generate a matrix which records across the row, the number +# of DNA subst +# +sub _build_nt_matrix { + my ($self,$seqa,$seqb) = @_; + + + my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match + [ qw(0 0 0 0) ], + [ qw(0 0 0 0) ], + [ qw(0 0 0 0) ] ]; + my $gaps = 0; # number of gaps + my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency + + for( my $i = 0; $i < scalar @$seqa; $i++) { + + my ($ti,$tj) = ($seqa->[$i],$seqb->[$i]); + $ti =~ tr/U/T/; + $tj =~ tr/U/T/; + + if( $ti =~ /^$GapChars$/) { $gaps++; next; } + if( $tj =~ /^$GapChars$/) { $gaps++; next } + + my $ti_index = $NucleotideIndexes{$ti}; + my $tj_index = $NucleotideIndexes{$tj}; + + if( ! defined $ti_index ) { + print "ti_index not defined for $ti\n"; + next; + } + + $basect_matrix->[$ti_index]->[$tj_index]++; + + if( $ti ne $tj ) { + $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++; + } + } + return ($basect_matrix,$pfreq,$gaps); +} + +sub _check_arg { + my($self,$aln ) = @_; + if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics"); + return 0; + } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) { + $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet); + return 0; + } + return 1; +} + +=head2 Data Methods + +=cut + +=head2 pairwise_stats + + Title : pairwise_stats + Usage : $obj->pairwise_stats($newval) + Function: + Returns : value of pairwise_stats + Args : newvalue (optional) + + +=cut + +sub pairwise_stats{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_pairwise_stats'} = $value; + } + return $self->{'_pairwise_stats'}; + +} + +1;