comparison mayachemtools/lib/StatisticsUtil.pm @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:73ae111cf86f
1 package StatisticsUtil;
2 #
3 # $RCSfile: StatisticsUtil.pm,v $
4 # $Date: 2015/02/28 20:47:18 $
5 # $Revision: 1.41 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use Exporter;
31
32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
33
34 @ISA = qw(Exporter);
35 @EXPORT = qw(Average AverageDeviation Covariance Correlation Euclidean Factorial FactorialDivison GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode PearsonCorrelation Permutations Product Range RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError Standardize StandardScores StandardScoresN TrimMean Variance VarianceN);
36 @EXPORT_OK = qw();
37 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
38
39 # Compute the mean of an array of numbers
40 sub Average {
41 my($XArrayRef) = @_;
42 return Mean($XArrayRef);
43 }
44
45 # Compute the average of the absolute deviation of an array of numbers: SUM( ABS(x[i] - Xmean) ) / n
46 sub AverageDeviation {
47 my($XArrayRef) = @_;
48
49 if (!@$XArrayRef) {
50 return undef;
51 }
52 my($AverageDeviation, $Mean, $Value, $SumOfDeviations);
53
54 $AverageDeviation = 0;
55 $Mean = Mean($XArrayRef);
56 foreach $Value (@$XArrayRef) {
57 $SumOfDeviations += abs($Value - $Mean);
58 }
59 $AverageDeviation = $SumOfDeviations / @$XArrayRef;
60
61 return $AverageDeviation;
62 }
63
64 # Compute correlation coefficient between two arrays of numbers
65 sub Correlation {
66 my($XArrayRef, $YArrayRef) = @_;
67 return PearsonCorrelation($XArrayRef, $YArrayRef);
68 }
69
70 # Compute the covariance between two arrays of numbers: SUM( (x[i] - Xmean) (y[i] - Ymean) ) / n
71 sub Covariance {
72 my($XArrayRef, $YArrayRef) = @_;
73
74 if (!(@$XArrayRef && @$YArrayRef && (@$XArrayRef == @$YArrayRef))) {
75 return undef;
76 }
77 my($Covariance, $XMean, $YMean, $Index, $ProductOfDeviations);
78
79 $Covariance = 0;
80 $XMean = Mean($XArrayRef);
81 $YMean = Mean($YArrayRef);
82 $ProductOfDeviations = 0;
83 for $Index (0 .. $#{@$XArrayRef}) {
84 $ProductOfDeviations += (($XArrayRef->[$Index] - $XMean) * ($YArrayRef->[$Index] - $YMean));
85 }
86 $Covariance = $ProductOfDeviations / @$XArrayRef;
87 return $Covariance;
88 }
89
90 # Compute the euclidean distance of an array of numbers: SQRT( SUM( x[i] ** 2) )
91 sub Euclidean {
92 my($XArrayRef) = @_;
93
94 if (!@$XArrayRef) {
95 return undef;
96 }
97 my($SumOfSquares);
98
99 $SumOfSquares = SumOfSquares($XArrayRef);
100
101 return sqrt $SumOfSquares;
102 }
103
104 # Compute factorial of a number...
105 sub Factorial {
106 my($Num) = @_;
107
108 return _Factorial($Num, 1);
109 }
110
111 # Perform factorial division of two numbers...
112 sub FactorialDivison {
113 my($Numerator, $Denominator) = @_;
114
115 # Only works for integer numbers...
116 if ($Numerator <= 0 || ($Numerator != int($Numerator)) ||
117 $Denominator <= 0 || ($Denominator != int($Denominator)) ) {
118 return undef;
119 }
120 my($LargerNum, $SmallerNum, $Result);
121 $LargerNum = ($Numerator > $Denominator) ? $Numerator : $Denominator;
122 $SmallerNum = ($Numerator < $Denominator) ? $Numerator : $Denominator;
123
124 $Result = _Factorial($LargerNum, $SmallerNum);
125 if ($Numerator < $Denominator) {
126 $Result = 1/$Result;
127 }
128 return $Result;
129 }
130
131 # Calculate factorial of a number upto a specific limit...
132 sub _Factorial {
133 my($Num, $Limit) = @_;
134
135 # Only works for integer numbers...
136 if ($Num <= 0 || ($Num != int($Num)) || $Limit < 1) {
137 return undef;
138 }
139
140 my($Result) = 1;
141
142 while ($Num > $Limit) {
143 $Result *= $Num;
144 $Num--;
145 }
146 return $Result;
147 }
148
149 # Generate all possible permuations or a specific permutations of items in an array
150 # and return a reference to an array containing array references to generated permuations...
151 #
152 # This alogrithm is based on the example provided by Mark Jason-Dominus, and is available
153 # at CPAN as mjd_permute standalone script.
154 #
155 sub Permutations {
156 my(@DataToPermute) = @_;
157 my($PermutationNum, $NumOfPermutations, @Permutations);
158
159 if (!@DataToPermute) {
160 return undef;
161 }
162
163 @Permutations = ();
164 $NumOfPermutations = Factorial(scalar @DataToPermute);
165
166 for ($PermutationNum = 0; $PermutationNum < $NumOfPermutations; $PermutationNum++) {
167 my @Permutation = @DataToPermute[_PermutationNumToPermutation($PermutationNum, $#DataToPermute)];
168 push @Permutations, \@Permutation;
169 }
170
171 return \@Permutations;
172 }
173
174 # Generte Nth permutation for a collection of specific size...
175 #
176 sub _PermutationNumToPermutation {
177 my($Num, $Size) = @_;
178
179 return _PatternToPermutation(_PermutationNumToPattern($Num, $Size));
180 }
181
182 # Generate Nth pattern for a collection of specific size...
183 #
184 sub _PermutationNumToPattern {
185 my($Num, $Size) = @_;
186 my($Index, @Pattern);
187
188 $Index = 1;
189
190 while ($Index <= $Size + 1) {
191 push @Pattern, $Num % $Index;
192 $Num = int($Num/$Index);
193 $Index++;
194 }
195
196 return @Pattern;
197 }
198
199 # Generate permutation of integers from pattern...
200 #
201 sub _PatternToPermutation {
202 my(@Pattern) = @_;
203 my(@Source, @Permutation);
204
205 @Source = (0 .. $#Pattern);
206
207 while (@Pattern) {
208 push @Permutation, splice(@Source, (pop @Pattern), 1);
209 }
210
211 return @Permutation;
212 }
213
214 # Compute the frequency of occurance of values in an array of numbers. Three different
215 # invocation methods are supported:
216 #
217 # Frequency(\@ArrayRef) : Using the smallest and largest values, group the numbers into
218 # 10 bins.
219 #
220 # Frequency(\@ArrayRef, $NumOfBins) : Using the smallest and largest values, group the
221 # numbers into specified bins.
222 #
223 # Frequency(\@ArrayRef, \@BinRange): Use bin range to goup the values into different bins.
224 #
225 # A hash array is returned with keys and values representing range and frequency values respectively.
226 # The frequency value for a specific key corresponds to all the values which are greater than
227 # the previous key and less than or equal to the current key. A key value representing maximum value is
228 # added for generating frequency distribution for specific number of bins, and whenever the maximum
229 # array value is greater than the maximum specified in bin range, it is also added to bin range.
230 #
231 sub Frequency {
232 my($XArrayRef) = @_;
233
234 if (!@$XArrayRef) {
235 return undef;
236 }
237
238 my($BinRange, $NumOfBins, $BinRangeSpecified);
239
240 $BinRangeSpecified = 0;
241 $NumOfBins = 10;
242 if (@_ == 2) {
243 if (ref($_[1]) eq 'ARRAY') {
244 $BinRange = $_[1];
245 if (!(@$BinRange && (@$BinRange > 1))) {
246 return undef;
247 }
248 # Make sure the bin range contains values in increasing order...
249 my($Index1, $Index2);
250 for $Index1 (0 .. $#{@$BinRange}) {
251 for $Index2 (($Index1 + 1) .. $#{@$BinRange}) {
252 if ($BinRange->[$Index1] >= $BinRange->[$Index2]) {
253 return undef;
254 }
255 }
256 }
257 $BinRangeSpecified = 1;
258 }
259 else {
260 $NumOfBins = $_[1];
261 if ($NumOfBins <= 1) {
262 return undef;
263 }
264 }
265 }
266
267 # Setup range keys...
268 my(@RangeKeys);
269 @RangeKeys = ();
270
271 my($MinValue, $MaxValue) = Range($XArrayRef);
272 if ($BinRangeSpecified) {
273 push @RangeKeys, @$BinRange;
274 if ($MaxValue > $RangeKeys[$#RangeKeys]) {
275 push @RangeKeys, $MaxValue;
276 }
277 }
278 else {
279 my($MinValue, $MaxValue) = Range($XArrayRef);
280 my($Interval) = ($MaxValue - $MinValue)/$NumOfBins;
281 my($KeyValue) = $MinValue + $Interval;
282 while ($KeyValue < $MaxValue) {
283 push @RangeKeys, $KeyValue;
284 $KeyValue += $Interval;
285 }
286 push @RangeKeys, $MaxValue;
287 }
288
289 #Setup frequency hash array...
290 my(%FrequencyMap);
291 %FrequencyMap = ();
292
293 %FrequencyMap = map { $_ => 0 } @RangeKeys;
294
295 # Count values...
296 my($Key, $Value);
297
298 VALUE: for $Value (@$XArrayRef) {
299 for $Key (@RangeKeys) {
300 if ($Value <= $Key) {
301 $FrequencyMap{$Key} += 1;
302 next VALUE;
303 }
304 }
305 }
306 return (%FrequencyMap);
307 }
308
309 # Compute the geometric mean of an array of numbers: NthROOT( PRODUCT(x[i]) )
310 sub GeometricMean {
311 my($XArrayRef) = @_;
312
313 if (!@$XArrayRef) {
314 return undef;
315 }
316 my($Mean, $Product, $Value);
317 $Product = 1;
318 foreach $Value (@$XArrayRef) {
319 if ($Value <= 0 ) {
320 return undef;
321 }
322 $Product *= $Value;
323 }
324 $Mean = $Product ** (1 / @$XArrayRef);
325 return $Mean;
326 }
327
328 # Compute the harmonic mean of an array of numbers: 1 / ( SUM(1/x[i]) / n )
329 sub HarmonicMean {
330 my($XArrayRef) = @_;
331
332 if (!@$XArrayRef) {
333 return undef;
334 }
335 my($Mean, $Sum, $Value);
336 $Sum = 0;
337 foreach $Value (@$XArrayRef) {
338 if ($Value <= 0 ) {
339 return undef;
340 }
341 $Sum += 1/$Value;
342 }
343 $Mean = 1/($Sum/@$XArrayRef);
344 return $Mean;
345 }
346
347 # Return the k-largest value from an array of numbers
348 sub KLargest {
349 my($XArrayRef, $K) = @_;
350
351 if (!(@$XArrayRef && ($K > 0) && ($K <= @$XArrayRef))) {
352 return undef;
353 }
354 my($KLargest, @SortedXArray);
355 @SortedXArray = sort { $b <=> $a } @$XArrayRef;
356 $KLargest = $SortedXArray[$K - 1];
357 return $KLargest;
358 }
359
360 # Return the k-smallest value from an array of numbers
361 sub KSmallest {
362 my($XArrayRef, $K) = @_;
363
364 if (!(@$XArrayRef && ($K > 0) && ($K <= @$XArrayRef))) {
365 return undef;
366 }
367 my($KSmallest, @SortedXArray);
368 @SortedXArray = sort { $a <=> $b } @$XArrayRef;
369 $KSmallest = $SortedXArray[$K - 1];
370 return $KSmallest;
371 }
372
373 # Compute the kurtosis of an array of numbers:
374 # [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] - {3((n - 1)^2)}/{(n - 2)(n-3)}
375 #
376 sub Kurtosis {
377 my($XArrayRef) = @_;
378
379 if (!@$XArrayRef || ((@$XArrayRef - 3) <= 0)) {
380 return undef;
381 }
382 my($Kurtosis, $Mean, $StandardDeviation, $Value);
383 $Mean = Mean($XArrayRef);
384 if (!defined $Mean) {
385 return undef;
386 }
387 $StandardDeviation = StandardDeviation($XArrayRef);
388 if (!(defined $StandardDeviation && $StandardDeviation != 0)) {
389 return undef;
390 }
391
392 my($SumOfScores, $SampleSize);
393 $SumOfScores = 0;
394 for $Value (@$XArrayRef) {
395 $SumOfScores += (($Value - $Mean)/$StandardDeviation) ** 4;
396 }
397 $SampleSize = @$XArrayRef;
398 $Kurtosis = ((($SampleSize * ($SampleSize + 1))/(($SampleSize - 1) * ($SampleSize - 2) * ($SampleSize - 3))) * $SumOfScores) - ((3 * (($SampleSize - 1) ** 2))/(($SampleSize - 2) * ($SampleSize - 3)));
399 return $Kurtosis;
400 }
401
402 # Return the smallest value from an array of numbers
403 sub Minimum {
404 my($XArrayRef) = @_;
405 return KSmallest($XArrayRef, 1);
406 }
407
408 # Return the largest value from an array of numbers
409 sub Maximum {
410 my($XArrayRef) = @_;
411 return KLargest($XArrayRef, 1);
412 }
413
414 # Compute the mean of an array of numbers: SUM( x[i] ) / n
415 sub Mean {
416 my($XArrayRef) = @_;
417
418 if (!@$XArrayRef) {
419 return undef;
420 }
421 my($Mean, $Sum, $Value);
422 $Sum = 0;
423 foreach $Value (@$XArrayRef) {
424 $Sum += $Value;
425 }
426 $Mean = $Sum / @$XArrayRef;
427 return $Mean;
428 }
429
430 # Compute the median value of an array of numbers. For an even number array, it's
431 # the average of two middle values.
432 #
433 # For even values of n: Xsorted[(n - 1)/2 + 1]
434 # For odd values of n: (Xsorted[n/2] + Xsorted[n/2 + 1])/2
435 #
436 sub Median {
437 my($XArrayRef) = @_;
438
439 if (!@$XArrayRef) {
440 return undef;
441 }
442 my($Median, @SortedXArray);
443 $Median = 0;
444 @SortedXArray = sort { $a <=> $b } @$XArrayRef;
445 if (@$XArrayRef % 2) {
446 my($MidIndex);
447 $MidIndex = int(@SortedXArray - 1)/2;
448 $Median = $SortedXArray[$MidIndex];
449 }
450 else {
451 # Even number array...
452 my($MidPosition);
453 $MidPosition = int(@SortedXArray / 2);
454 $Median = ($SortedXArray[$MidPosition - 1] + $SortedXArray[$MidPosition]) / 2;
455 }
456 return $Median;
457 }
458
459 # Return the most frequently occuring value in an array of numbers
460 sub Mode {
461 my($XArrayRef) = @_;
462
463 if (!@$XArrayRef) {
464 return undef;
465 }
466 my($Value, %ValueToCountMap, @CountList, @SortedCountList);
467 %ValueToCountMap = ();
468 @CountList = ();
469 @SortedCountList = ();
470 for $Value (@$XArrayRef) {
471 if (exists $ValueToCountMap{$Value}) {
472 $ValueToCountMap{$Value} += 1;
473 }
474 else {
475 $ValueToCountMap{$Value} = 1;
476 }
477 }
478 for $Value (keys %ValueToCountMap) {
479 push @CountList, $ValueToCountMap{$Value};
480 }
481 @SortedCountList = sort { $b <=> $a } @CountList;
482
483 # Make sure the frequency of mode value is greater than one and check for
484 # multiple modes as well...
485 #
486 my($ModeCount, $ModeValue);
487 $ModeCount = $SortedCountList[0];
488 if ($ModeCount <= 1) {
489 return undef;
490 }
491 # Get the first mode value...
492 VALUE: for $Value (keys %ValueToCountMap) {
493 if ($ValueToCountMap{$Value} == $ModeCount) {
494 $ModeValue = $Value;
495 # Set it to zero to skip it next time...
496 $ValueToCountMap{$Value} = 0;
497 last VALUE;
498 }
499 }
500
501 if (wantarray) {
502 # Retrieve all the modes...
503 my(@Modes, $Count);
504 @Modes = ();
505 push @Modes, $ModeValue;
506 for $Count (@SortedCountList) {
507 if ($Count == $ModeCount) {
508 VALUE: for $Value (keys %ValueToCountMap) {
509 if ($ValueToCountMap{$Value} == $ModeCount) {
510 push @Modes, $Value;
511 # Set it to zero to skip it next time...
512 $ValueToCountMap{$Value} = 0;
513 last VALUE;
514 }
515 }
516 }
517 }
518 return sort {$b <=> $a} @Modes;
519 }
520 else {
521 return $ModeValue;
522 }
523 }
524
525
526 # Compute the Pearson correlation coefficient between two arrays of numbers:
527 #
528 # SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )(SUM( (y[i] - Ymean)^2 )) )
529 #
530 # It returns values in the range from -1.0 to 1.0
531 sub PearsonCorrelation {
532 my($XArrayRef, $YArrayRef) = @_;
533
534 if (!(@$XArrayRef && @$YArrayRef && (@$XArrayRef == @$YArrayRef))) {
535 return undef;
536 }
537 my($Correlation, $XMean, $YMean, $Index, $XValueDeviation, $YValueDeviation, $SquareOfXDeviations, $SquareOfYDeviations, $ProductOfDeviations);
538
539 $Correlation = 0;
540 $XMean = Mean($XArrayRef);
541 $YMean = Mean($YArrayRef);
542 $ProductOfDeviations = 0; $SquareOfXDeviations = 0; $SquareOfYDeviations = 0;
543 for $Index (0 .. $#{@$XArrayRef}) {
544 $XValueDeviation = $XArrayRef->[$Index] - $XMean;
545 $YValueDeviation = $YArrayRef->[$Index] - $YMean;
546 $ProductOfDeviations += ($XValueDeviation * $YValueDeviation);
547 $SquareOfXDeviations += $XValueDeviation ** 2;
548 $SquareOfYDeviations += $YValueDeviation ** 2;
549 }
550 $Correlation = $ProductOfDeviations / sqrt($SquareOfXDeviations * $SquareOfYDeviations);
551 return $Correlation;
552 }
553
554 # Return the smallest and largest values from an array of numbers
555 sub Range {
556 my($XArrayRef) = @_;
557
558 if (!@$XArrayRef) {
559 return (undef, undef);
560 }
561 my($Smallest, $Largest, @SortedXArray);
562 @SortedXArray = sort { $a <=> $b } @$XArrayRef;
563 $Smallest = $SortedXArray[0];
564 $Largest = $SortedXArray[$#SortedXArray];
565 return ($Smallest, $Largest);
566 }
567
568 # Compute square of the Pearson correlation coefficient between two arrays of numbers.
569 #
570 sub RSquare {
571 my($XArrayRef, $YArrayRef) = @_;
572 my($RSquare, $Correlation);
573
574 $RSquare = undef;
575 $Correlation = PearsonCorrelation($XArrayRef, $YArrayRef);
576 if (defined $Correlation) {
577 $RSquare = $Correlation ** 2;
578 }
579 return $RSquare;
580 }
581
582 # Compute the skewness of an array of numbers:
583 # {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 }
584 #
585 sub Skewness {
586 my($XArrayRef) = @_;
587
588 if (!@$XArrayRef || ((@$XArrayRef - 2) <= 0)) {
589 return undef;
590 }
591 my($Skewness, $Mean, $StandardDeviation, $Value);
592 $Mean = Mean($XArrayRef);
593 if (!defined $Mean) {
594 return undef;
595 }
596 $StandardDeviation = StandardDeviation($XArrayRef);
597 if (!(defined $StandardDeviation && $StandardDeviation != 0)) {
598 return undef;
599 }
600
601 my($SumOfScores, $SampleSize);
602 $SumOfScores = 0;
603 for $Value (@$XArrayRef) {
604 $SumOfScores += (($Value - $Mean)/$StandardDeviation) ** 3;
605 }
606 $SampleSize = @$XArrayRef;
607 $Skewness = ($SampleSize/(($SampleSize - 1) * ($SampleSize - 2) )) * $SumOfScores;
608 return $Skewness;
609 }
610
611 # Compute the standard deviation of an array of numbers
612 sub StandardDeviation {
613 my($XArrayRef) = @_;
614 return _CalculateStandardDeviation($XArrayRef, 2);
615 }
616
617 # Compute the standard deviation of an array of numbers representing entire population
618 sub StandardDeviationN {
619 my($XArrayRef) = @_;
620 return _CalculateStandardDeviation($XArrayRef, 1);
621 }
622
623 # Compute the standard deviation of an array of numbers.
624 # Mode 1: SQRT ( SUM( (x[i] - mean)^2 ) / n )
625 # Mode 2: SQRT ( SUM( (x[i] - mean)^2 ) / (n - 1) )
626 #
627 sub _CalculateStandardDeviation {
628 my($XArrayRef, $Mode) = @_;
629
630 if (!@$XArrayRef) {
631 return undef;
632 }
633 my($StandardDeviation, $Value, $SquareOfDeviations, $Mean, $N);
634
635 $StandardDeviation = 0;
636 $Mean = Mean($XArrayRef);
637 $SquareOfDeviations = 0;
638 foreach $Value (@$XArrayRef) {
639 $SquareOfDeviations += ($Value - $Mean) ** 2;
640 }
641 $N = ($Mode == 1) ? @$XArrayRef : (@$XArrayRef - 1);
642 $StandardDeviation = sqrt($SquareOfDeviations / $N);
643
644 return $StandardDeviation;
645 }
646
647 # Compute the standard error using standard deviation and sample size
648 sub StandardError {
649 my($StandardDeviation, $Count) = @_;
650
651 if ($Count <= 0) {
652 return undef;
653 }
654 my($StandardError);
655 $StandardError = $StandardDeviation / sqrt($Count);
656
657 return $StandardError;
658 }
659
660 # Standardize the value using mean and standard deviation
661 sub Standardize {
662 my($Value, $Mean, $StandardDeviation) = @_;
663
664 if ($StandardDeviation <= 0) {
665 return undef;
666 }
667 my($StandardizedValue);
668 $StandardizedValue = ($Value - $Mean)/$StandardDeviation;
669
670 return $StandardizedValue;
671 }
672
673 # Compute the standard deviation above the mean for an array of numbers.
674 sub StandardScores {
675 my($XArrayRef) = @_;
676 return _CalculateStandardScores($XArrayRef, 2);
677 }
678
679 # Compute the standard deviation above the mean for an array of numbers representing entire population
680 sub StandardScoresN {
681 my($XArrayRef) = @_;
682 return _CalculateStandardScores($XArrayRef, 1);
683 }
684
685 # Compute the standard deviation above the mean for an array of numbers.
686 # Mode 1: (x[i] - mean) / n
687 # Mode 2: (x[i] - mean) / (n - 1)
688 #
689 sub _CalculateStandardScores {
690 my($XArrayRef, $Mode) = @_;
691
692 if (!@$XArrayRef) {
693 return undef;
694 }
695 my(@StandardScores, $Mean, $StandardDeviation, $Value);
696
697 $Mean = Mean($XArrayRef);
698 $StandardDeviation = _CalculateStandardDeviation($XArrayRef, $Mode);
699 if (!(defined($StandardDeviation) && $StandardDeviation > 0)) {
700 return undef;
701 }
702 @StandardScores = ();
703 for $Value (@$XArrayRef) {
704 push @StandardScores, ($Value - $Mean)/$StandardDeviation;
705 }
706
707 return @StandardScores;
708 }
709
710 # Compute the product of an array of numbers
711 sub Product {
712 my($XArrayRef) = @_;
713
714 if (!@$XArrayRef) {
715 return undef;
716 }
717 my($Product, $Value);
718 $Product = 1;
719 foreach $Value (@$XArrayRef) {
720 $Product *= $Value;
721 }
722 return $Product;
723 }
724
725 # Compute the sum of an array of numbers
726 sub Sum {
727 my($XArrayRef) = @_;
728
729 if (!@$XArrayRef) {
730 return undef;
731 }
732 my($Sum, $Value);
733 $Sum = 0;
734 foreach $Value (@$XArrayRef) {
735 $Sum += $Value;
736 }
737 return $Sum;
738 }
739
740 # Compute the sum of squares of an array of numbers
741 sub SumOfSquares {
742 my($XArrayRef) = @_;
743
744 if (!@$XArrayRef) {
745 return undef;
746 }
747 my($SumOfSquares, $Value);
748 $SumOfSquares = 0;
749 foreach $Value (@$XArrayRef) {
750 $SumOfSquares += $Value ** 2;
751 }
752 return $SumOfSquares;
753 }
754
755 # Compute the mean of an array of numbers by excluding a fraction of
756 # numbers from the top and bottom of the data set.
757 sub TrimMean {
758 my($XArrayRef, $FractionToExclude) = @_;
759
760 if (!(@$XArrayRef && $FractionToExclude > 0 && $FractionToExclude <= 1)) {
761 return undef;
762 }
763 my($NumberToExclude);
764 $NumberToExclude = int(@$XArrayRef * $FractionToExclude);
765 $NumberToExclude = ($NumberToExclude % 2) ? ($NumberToExclude - 1) : $NumberToExclude;
766 if ($NumberToExclude == @$XArrayRef) {
767 return undef;
768 }
769 my($Mean, $Sum, $Index, $FirstIndex, $LastIndex);
770 $FirstIndex = $NumberToExclude/2;
771 $LastIndex = @$XArrayRef - ($NumberToExclude/2) - 1;
772 $Sum = 0;
773 my(@SortedXArray);
774 @SortedXArray = sort { $a <=> $b } @$XArrayRef;
775 for $Index ($FirstIndex .. $LastIndex) {
776 $Sum += $SortedXArray[$Index];
777 }
778 $Mean = $Sum/(@SortedXArray - $NumberToExclude);
779 return $Mean;
780 }
781
782 # Compute the variance of an array of numbers
783 sub Variance {
784 my($XArrayRef) = @_;
785 return _CalculateVariance($XArrayRef, 2);
786 }
787
788 # Compute the variance of an array of numbers representing entire population
789 sub VarianceN {
790 my($XArrayRef) = @_;
791 return _CalculateVariance($XArrayRef, 1);
792 }
793
794 # Compute the variance of an array of numbers:
795 # Mode 1: SUM( (x[i] - Xmean)^2 / n )
796 # Mode 2: SUM( (x[i] - Xmean)^2 / (n - 1) )
797 #
798 sub _CalculateVariance {
799 my($XArrayRef, $Mode) = @_;
800
801 if (!@$XArrayRef) {
802 return undef;
803 }
804 my($Variance, $Value, $SquareOfDeviations, $Mean, $N);
805
806 $Variance = 0;
807 $Mean = Mean($XArrayRef);
808 $SquareOfDeviations = 0;
809 foreach $Value (@$XArrayRef) {
810 $SquareOfDeviations += ($Value - $Mean) ** 2;
811 }
812 $N = ($Mode == 1) ? @$XArrayRef : (@$XArrayRef - 1);
813 $Variance = $SquareOfDeviations / $N;
814
815 return $Variance;
816 }
817
818 1;
819
820 __END__
821
822 =head1 NAME
823
824 StatisticsUtil
825
826 =head1 SYNOPSIS
827
828 use StatisticsUtil;
829
830 use Statistics qw(:all);
831
832 =head1 DESCRIPTION
833
834 B<StatisticsUtil> module provides the following functions:
835
836 Average, AverageDeviation, Correlation, Covariance, Euclidean, Factorial,
837 FactorialDivison, Frequency, GeometricMean, HarmonicMean, KLargest, KSmallest,
838 Kurtosis, Maximum, Mean, Median, Minimum, Mode, PearsonCorrelation, Permutations,
839 Product, RSquare, Range, Skewness, StandardDeviation, StandardDeviationN,
840 StandardError, StandardScores, StandardScoresN, Standardize, Sum, SumOfSquares,
841 TrimMean, Variance, VarianceN
842
843 =head2 METHODS
844
845 =over 4
846
847 =item B<Average>
848
849 $Value = Average(\@DataArray);
850
851 Computes the mean of an array of numbers: SUM( x[i] ) / n
852
853 =item B<AverageDeviation>
854
855 $Value = AverageDeviation(\@DataArray);
856
857 Computes the average of the absolute deviation of an array of numbers: SUM( ABS(x[i] - Xmean) ) / n
858
859 =item B<Correlation>
860
861 $Value = Correlation(\@XDataArray, \@YDataArray);
862
863 Computes the Pearson correlation coefficient between two arrays of numbers:
864 SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )(SUM( (y[i] - Ymean)^2 )) )
865
866 =item B<Euclidean>
867
868 $Return = Euclidean(\@DataArray);
869
870 Computes the euclidean distance of an array of numbers: SQRT( SUM( x[i] ** 2) )
871
872 =item B<Covariance>
873
874 $Value = Covariance(\@XDataArray, \@YDataArray);
875
876 Computes the covariance between two arrays of numbers: SUM( (x[i] - Xmean) (y[i] - Ymean) ) / n
877
878 =item B<Factorial>
879
880 $Value = Factorial($Num);
881
882 Computes the factorial of a positive integer.
883
884 =item B<FactorialDivison>
885
886 $Value = FactorialDivision($Numerator, $Denominator);
887
888 Compute the factorial divison of two positive integers.
889
890 =item B<Frequency>
891
892 %FrequencyValues = Frequency(\@DataArray, [$NumOfBins]);
893 %FrequencyValues = Frequency(\@DataArray, [\@BinRange]);
894
895 A hash array is returned with keys and values representing range and frequency values, respectively.
896 The frequency value for a specific key corresponds to all the values which are greater than
897 the previous key and less than or equal to the current key. A key value representing maximum value is
898 added for generating frequency distribution for specific number of bins, and whenever the maximum
899 array value is greater than the maximum specified in bin range, it is also added to bin range.
900
901 =item B<GeometricMean>
902
903 $Value = GeometricMean(\@DataArray);
904
905 Computes the geometric mean of an array of numbers: NthROOT( PRODUCT(x[i]) )
906
907 =item B<HarmonicMean>
908
909 $Value = HarmonicMean(\@DataArray);
910
911 Computes the harmonic mean of an array of numbers: 1 / ( SUM(1/x[i]) / n )
912
913 =item B<KLargest>
914
915 $Value = KLargest(\@DataArray, $KthNumber);
916
917 Returns the k-largest value from an array of numbers.
918
919 =item B<KSmallest>
920
921 $Value = KSmallest(\@DataArray, $KthNumber);
922
923 Returns the k-smallest value from an array of numbers.
924
925 =item B<Kurtosis>
926
927 $Value = Kurtosis(\@DataArray);
928
929 Computes the kurtosis of an array of numbers:
930 [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] - {3((n - 1)^2)}/{(n - 2)(n-3)}
931
932 =item B<Maximum>
933
934 $Value = Maximum(\@DataArray);
935
936 Returns the largest value from an array of numbers.
937
938 =item B<Minimum>
939
940 $Value = Minimum(\@DataArray);
941
942 Returns the smallest value from an array of numbers.
943
944 =item B<Mean>
945
946 $Value = Mean(\@DataArray);
947
948 Computes the mean of an array of numbers: SUM( x[i] ) / n
949
950 =item B<Median>
951
952 $Value = Median(\@DataArray);
953
954 Computes the median value of an array of numbers. For an even number array, it's
955 the average of two middle values.
956
957 For even values of n: Xsorted[(n - 1)/2 + 1]
958 For odd values of n: (Xsorted[n/2] + Xsorted[n/2 + 1])/2
959
960 =item B<Mode>
961
962 $Value = Mode(\@DataArray);
963
964 Returns the most frequently occuring value in an array of numbers.
965
966 =item B<PearsonCorrelation>
967
968 $Value = Correlation(\@XDataArray, \@YDataArray);
969
970 Computes the Pearson correlation coefficient between two arrays of numbers:
971 SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )(SUM( (y[i] - Ymean)^2 )) )
972
973 =item B<Permutations>
974
975 $PermutationsRef = Permutations(@DataToPermute);
976
977 Generate all possible permuations or a specific permutations of items in an array
978 and return a reference to an array containing array references to generated permuations.
979
980 This alogrithm is based on the example provided by Mark Jason-Dominus, and is available
981 at CPAN as mjd_permute standalone script.
982
983 =item B<Product>
984
985 $Value = Product(\@DataArray);
986
987 Compute the product of an array of numbers.
988
989 =item B<Range>
990
991 ($Smallest, $Largest) = Range(\@DataArray);
992
993 Return the smallest and largest values from an array of numbers.
994
995 =item B<RSquare>
996
997 $Value = RSquare(\@XDataArray, \@YDataArray);
998
999 Computes square of the Pearson correlation coefficient between two arrays of numbers.
1000
1001 =item B<Skewness>
1002
1003 $Value = Skewness(\@DataArray);
1004
1005 Computes the skewness of an array of numbers:
1006 {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 }
1007
1008 =item B<StandardDeviation>
1009
1010 $Value = StandardDeviation(\@DataArray);
1011
1012 Computes the standard deviation of an array of numbers.
1013 SQRT ( SUM( (x[i] - mean)^2 ) / (n - 1) )
1014
1015 =item B<StandardDeviationN>
1016
1017 $Value = StandardDeviationN(\@DataArray);
1018
1019 Computes the standard deviation of an array of numbers representing entire population:
1020 SQRT ( SUM( (x[i] - mean)^2 ) / n )
1021
1022 =item B<StandardError>
1023
1024 $Value = StandardError($StandardDeviation, $Count);
1025
1026 Computes the standard error using standard deviation and sample size.
1027
1028 =item B<Standardize>
1029
1030 $Value = Standardize($Value, $Mean, $StandardDeviation);
1031
1032 Standardizes the value using mean and standard deviation.
1033
1034 =item B<StandardScores>
1035
1036 @Values = StandardScores(\@DataArray);
1037
1038 Computes the standard deviation above the mean for an array of numbers:
1039 (x[i] - mean) / (n - 1)
1040
1041 =item B<StandardScoresN>
1042
1043 @Values = StandardScoresN(\@DataArray);
1044
1045 Computes the standard deviation above the mean for an array of numbers representing entire population:
1046 (x[i] - mean) / n
1047
1048 =item B<Sum>
1049
1050 $Value = Sum(\@DataArray);
1051
1052 Compute the sum of an array of numbers.
1053
1054 =item B<SumOfSquares>
1055
1056 $Value = SumOfSquares(\@DataArray);
1057
1058 Computes the sum of an array of numbers.
1059
1060 =item B<TrimMean>
1061
1062 $Value = TrimMean(\@DataArray, $FractionToExclude));
1063
1064 Computes the mean of an array of numbers by excluding a fraction of
1065 numbers from the top and bottom of the data set.
1066
1067 =item B<Variance>
1068
1069 $Value = Variance(\@DataArray);
1070
1071 Computes the variance of an array of numbers: SUM( (x[i] - Xmean)^2 / (n - 1) )
1072
1073 =item B<VarianceN>
1074
1075 $Value = Variance(\@DataArray);
1076
1077 Compute the variance of an array of numbers representing entire population:
1078 SUM( (x[i] - Xmean)^2 / n )
1079
1080 =back
1081
1082 =head1 AUTHOR
1083
1084 Manish Sud <msud@san.rr.com>
1085
1086 =head1 SEE ALSO
1087
1088 Constants.pm, ConversionsUtil.pm, MathUtil.pm
1089
1090 =head1 COPYRIGHT
1091
1092 Copyright (C) 2015 Manish Sud. All rights reserved.
1093
1094 This file is part of MayaChemTools.
1095
1096 MayaChemTools is free software; you can redistribute it and/or modify it under
1097 the terms of the GNU Lesser General Public License as published by the Free
1098 Software Foundation; either version 3 of the License, or (at your option)
1099 any later version.
1100
1101 =cut