0
|
1 package StatisticsUtil;
|
|
2 #
|
|
3 # $RCSfile: StatisticsUtil.pm,v $
|
|
4 # $Date: 2015/02/28 20:47:18 $
|
|
5 # $Revision: 1.41 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability of fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use Exporter;
|
|
31
|
|
32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
|
33
|
|
34 @ISA = qw(Exporter);
|
|
35 @EXPORT = qw(Average AverageDeviation Covariance Correlation Euclidean Factorial FactorialDivison GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode PearsonCorrelation Permutations Product Range RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError Standardize StandardScores StandardScoresN TrimMean Variance VarianceN);
|
|
36 @EXPORT_OK = qw();
|
|
37 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
|
|
38
|
|
39 # Compute the mean of an array of numbers
|
|
40 sub Average {
|
|
41 my($XArrayRef) = @_;
|
|
42 return Mean($XArrayRef);
|
|
43 }
|
|
44
|
|
45 # Compute the average of the absolute deviation of an array of numbers: SUM( ABS(x[i] - Xmean) ) / n
|
|
46 sub AverageDeviation {
|
|
47 my($XArrayRef) = @_;
|
|
48
|
|
49 if (!@$XArrayRef) {
|
|
50 return undef;
|
|
51 }
|
|
52 my($AverageDeviation, $Mean, $Value, $SumOfDeviations);
|
|
53
|
|
54 $AverageDeviation = 0;
|
|
55 $Mean = Mean($XArrayRef);
|
|
56 foreach $Value (@$XArrayRef) {
|
|
57 $SumOfDeviations += abs($Value - $Mean);
|
|
58 }
|
|
59 $AverageDeviation = $SumOfDeviations / @$XArrayRef;
|
|
60
|
|
61 return $AverageDeviation;
|
|
62 }
|
|
63
|
|
64 # Compute correlation coefficient between two arrays of numbers
|
|
65 sub Correlation {
|
|
66 my($XArrayRef, $YArrayRef) = @_;
|
|
67 return PearsonCorrelation($XArrayRef, $YArrayRef);
|
|
68 }
|
|
69
|
|
70 # Compute the covariance between two arrays of numbers: SUM( (x[i] - Xmean) (y[i] - Ymean) ) / n
|
|
71 sub Covariance {
|
|
72 my($XArrayRef, $YArrayRef) = @_;
|
|
73
|
|
74 if (!(@$XArrayRef && @$YArrayRef && (@$XArrayRef == @$YArrayRef))) {
|
|
75 return undef;
|
|
76 }
|
|
77 my($Covariance, $XMean, $YMean, $Index, $ProductOfDeviations);
|
|
78
|
|
79 $Covariance = 0;
|
|
80 $XMean = Mean($XArrayRef);
|
|
81 $YMean = Mean($YArrayRef);
|
|
82 $ProductOfDeviations = 0;
|
|
83 for $Index (0 .. $#{@$XArrayRef}) {
|
|
84 $ProductOfDeviations += (($XArrayRef->[$Index] - $XMean) * ($YArrayRef->[$Index] - $YMean));
|
|
85 }
|
|
86 $Covariance = $ProductOfDeviations / @$XArrayRef;
|
|
87 return $Covariance;
|
|
88 }
|
|
89
|
|
90 # Compute the euclidean distance of an array of numbers: SQRT( SUM( x[i] ** 2) )
|
|
91 sub Euclidean {
|
|
92 my($XArrayRef) = @_;
|
|
93
|
|
94 if (!@$XArrayRef) {
|
|
95 return undef;
|
|
96 }
|
|
97 my($SumOfSquares);
|
|
98
|
|
99 $SumOfSquares = SumOfSquares($XArrayRef);
|
|
100
|
|
101 return sqrt $SumOfSquares;
|
|
102 }
|
|
103
|
|
104 # Compute factorial of a number...
|
|
105 sub Factorial {
|
|
106 my($Num) = @_;
|
|
107
|
|
108 return _Factorial($Num, 1);
|
|
109 }
|
|
110
|
|
111 # Perform factorial division of two numbers...
|
|
112 sub FactorialDivison {
|
|
113 my($Numerator, $Denominator) = @_;
|
|
114
|
|
115 # Only works for integer numbers...
|
|
116 if ($Numerator <= 0 || ($Numerator != int($Numerator)) ||
|
|
117 $Denominator <= 0 || ($Denominator != int($Denominator)) ) {
|
|
118 return undef;
|
|
119 }
|
|
120 my($LargerNum, $SmallerNum, $Result);
|
|
121 $LargerNum = ($Numerator > $Denominator) ? $Numerator : $Denominator;
|
|
122 $SmallerNum = ($Numerator < $Denominator) ? $Numerator : $Denominator;
|
|
123
|
|
124 $Result = _Factorial($LargerNum, $SmallerNum);
|
|
125 if ($Numerator < $Denominator) {
|
|
126 $Result = 1/$Result;
|
|
127 }
|
|
128 return $Result;
|
|
129 }
|
|
130
|
|
131 # Calculate factorial of a number upto a specific limit...
|
|
132 sub _Factorial {
|
|
133 my($Num, $Limit) = @_;
|
|
134
|
|
135 # Only works for integer numbers...
|
|
136 if ($Num <= 0 || ($Num != int($Num)) || $Limit < 1) {
|
|
137 return undef;
|
|
138 }
|
|
139
|
|
140 my($Result) = 1;
|
|
141
|
|
142 while ($Num > $Limit) {
|
|
143 $Result *= $Num;
|
|
144 $Num--;
|
|
145 }
|
|
146 return $Result;
|
|
147 }
|
|
148
|
|
149 # Generate all possible permuations or a specific permutations of items in an array
|
|
150 # and return a reference to an array containing array references to generated permuations...
|
|
151 #
|
|
152 # This alogrithm is based on the example provided by Mark Jason-Dominus, and is available
|
|
153 # at CPAN as mjd_permute standalone script.
|
|
154 #
|
|
155 sub Permutations {
|
|
156 my(@DataToPermute) = @_;
|
|
157 my($PermutationNum, $NumOfPermutations, @Permutations);
|
|
158
|
|
159 if (!@DataToPermute) {
|
|
160 return undef;
|
|
161 }
|
|
162
|
|
163 @Permutations = ();
|
|
164 $NumOfPermutations = Factorial(scalar @DataToPermute);
|
|
165
|
|
166 for ($PermutationNum = 0; $PermutationNum < $NumOfPermutations; $PermutationNum++) {
|
|
167 my @Permutation = @DataToPermute[_PermutationNumToPermutation($PermutationNum, $#DataToPermute)];
|
|
168 push @Permutations, \@Permutation;
|
|
169 }
|
|
170
|
|
171 return \@Permutations;
|
|
172 }
|
|
173
|
|
174 # Generte Nth permutation for a collection of specific size...
|
|
175 #
|
|
176 sub _PermutationNumToPermutation {
|
|
177 my($Num, $Size) = @_;
|
|
178
|
|
179 return _PatternToPermutation(_PermutationNumToPattern($Num, $Size));
|
|
180 }
|
|
181
|
|
182 # Generate Nth pattern for a collection of specific size...
|
|
183 #
|
|
184 sub _PermutationNumToPattern {
|
|
185 my($Num, $Size) = @_;
|
|
186 my($Index, @Pattern);
|
|
187
|
|
188 $Index = 1;
|
|
189
|
|
190 while ($Index <= $Size + 1) {
|
|
191 push @Pattern, $Num % $Index;
|
|
192 $Num = int($Num/$Index);
|
|
193 $Index++;
|
|
194 }
|
|
195
|
|
196 return @Pattern;
|
|
197 }
|
|
198
|
|
199 # Generate permutation of integers from pattern...
|
|
200 #
|
|
201 sub _PatternToPermutation {
|
|
202 my(@Pattern) = @_;
|
|
203 my(@Source, @Permutation);
|
|
204
|
|
205 @Source = (0 .. $#Pattern);
|
|
206
|
|
207 while (@Pattern) {
|
|
208 push @Permutation, splice(@Source, (pop @Pattern), 1);
|
|
209 }
|
|
210
|
|
211 return @Permutation;
|
|
212 }
|
|
213
|
|
214 # Compute the frequency of occurance of values in an array of numbers. Three different
|
|
215 # invocation methods are supported:
|
|
216 #
|
|
217 # Frequency(\@ArrayRef) : Using the smallest and largest values, group the numbers into
|
|
218 # 10 bins.
|
|
219 #
|
|
220 # Frequency(\@ArrayRef, $NumOfBins) : Using the smallest and largest values, group the
|
|
221 # numbers into specified bins.
|
|
222 #
|
|
223 # Frequency(\@ArrayRef, \@BinRange): Use bin range to goup the values into different bins.
|
|
224 #
|
|
225 # A hash array is returned with keys and values representing range and frequency values respectively.
|
|
226 # The frequency value for a specific key corresponds to all the values which are greater than
|
|
227 # the previous key and less than or equal to the current key. A key value representing maximum value is
|
|
228 # added for generating frequency distribution for specific number of bins, and whenever the maximum
|
|
229 # array value is greater than the maximum specified in bin range, it is also added to bin range.
|
|
230 #
|
|
231 sub Frequency {
|
|
232 my($XArrayRef) = @_;
|
|
233
|
|
234 if (!@$XArrayRef) {
|
|
235 return undef;
|
|
236 }
|
|
237
|
|
238 my($BinRange, $NumOfBins, $BinRangeSpecified);
|
|
239
|
|
240 $BinRangeSpecified = 0;
|
|
241 $NumOfBins = 10;
|
|
242 if (@_ == 2) {
|
|
243 if (ref($_[1]) eq 'ARRAY') {
|
|
244 $BinRange = $_[1];
|
|
245 if (!(@$BinRange && (@$BinRange > 1))) {
|
|
246 return undef;
|
|
247 }
|
|
248 # Make sure the bin range contains values in increasing order...
|
|
249 my($Index1, $Index2);
|
|
250 for $Index1 (0 .. $#{@$BinRange}) {
|
|
251 for $Index2 (($Index1 + 1) .. $#{@$BinRange}) {
|
|
252 if ($BinRange->[$Index1] >= $BinRange->[$Index2]) {
|
|
253 return undef;
|
|
254 }
|
|
255 }
|
|
256 }
|
|
257 $BinRangeSpecified = 1;
|
|
258 }
|
|
259 else {
|
|
260 $NumOfBins = $_[1];
|
|
261 if ($NumOfBins <= 1) {
|
|
262 return undef;
|
|
263 }
|
|
264 }
|
|
265 }
|
|
266
|
|
267 # Setup range keys...
|
|
268 my(@RangeKeys);
|
|
269 @RangeKeys = ();
|
|
270
|
|
271 my($MinValue, $MaxValue) = Range($XArrayRef);
|
|
272 if ($BinRangeSpecified) {
|
|
273 push @RangeKeys, @$BinRange;
|
|
274 if ($MaxValue > $RangeKeys[$#RangeKeys]) {
|
|
275 push @RangeKeys, $MaxValue;
|
|
276 }
|
|
277 }
|
|
278 else {
|
|
279 my($MinValue, $MaxValue) = Range($XArrayRef);
|
|
280 my($Interval) = ($MaxValue - $MinValue)/$NumOfBins;
|
|
281 my($KeyValue) = $MinValue + $Interval;
|
|
282 while ($KeyValue < $MaxValue) {
|
|
283 push @RangeKeys, $KeyValue;
|
|
284 $KeyValue += $Interval;
|
|
285 }
|
|
286 push @RangeKeys, $MaxValue;
|
|
287 }
|
|
288
|
|
289 #Setup frequency hash array...
|
|
290 my(%FrequencyMap);
|
|
291 %FrequencyMap = ();
|
|
292
|
|
293 %FrequencyMap = map { $_ => 0 } @RangeKeys;
|
|
294
|
|
295 # Count values...
|
|
296 my($Key, $Value);
|
|
297
|
|
298 VALUE: for $Value (@$XArrayRef) {
|
|
299 for $Key (@RangeKeys) {
|
|
300 if ($Value <= $Key) {
|
|
301 $FrequencyMap{$Key} += 1;
|
|
302 next VALUE;
|
|
303 }
|
|
304 }
|
|
305 }
|
|
306 return (%FrequencyMap);
|
|
307 }
|
|
308
|
|
309 # Compute the geometric mean of an array of numbers: NthROOT( PRODUCT(x[i]) )
|
|
310 sub GeometricMean {
|
|
311 my($XArrayRef) = @_;
|
|
312
|
|
313 if (!@$XArrayRef) {
|
|
314 return undef;
|
|
315 }
|
|
316 my($Mean, $Product, $Value);
|
|
317 $Product = 1;
|
|
318 foreach $Value (@$XArrayRef) {
|
|
319 if ($Value <= 0 ) {
|
|
320 return undef;
|
|
321 }
|
|
322 $Product *= $Value;
|
|
323 }
|
|
324 $Mean = $Product ** (1 / @$XArrayRef);
|
|
325 return $Mean;
|
|
326 }
|
|
327
|
|
328 # Compute the harmonic mean of an array of numbers: 1 / ( SUM(1/x[i]) / n )
|
|
329 sub HarmonicMean {
|
|
330 my($XArrayRef) = @_;
|
|
331
|
|
332 if (!@$XArrayRef) {
|
|
333 return undef;
|
|
334 }
|
|
335 my($Mean, $Sum, $Value);
|
|
336 $Sum = 0;
|
|
337 foreach $Value (@$XArrayRef) {
|
|
338 if ($Value <= 0 ) {
|
|
339 return undef;
|
|
340 }
|
|
341 $Sum += 1/$Value;
|
|
342 }
|
|
343 $Mean = 1/($Sum/@$XArrayRef);
|
|
344 return $Mean;
|
|
345 }
|
|
346
|
|
347 # Return the k-largest value from an array of numbers
|
|
348 sub KLargest {
|
|
349 my($XArrayRef, $K) = @_;
|
|
350
|
|
351 if (!(@$XArrayRef && ($K > 0) && ($K <= @$XArrayRef))) {
|
|
352 return undef;
|
|
353 }
|
|
354 my($KLargest, @SortedXArray);
|
|
355 @SortedXArray = sort { $b <=> $a } @$XArrayRef;
|
|
356 $KLargest = $SortedXArray[$K - 1];
|
|
357 return $KLargest;
|
|
358 }
|
|
359
|
|
360 # Return the k-smallest value from an array of numbers
|
|
361 sub KSmallest {
|
|
362 my($XArrayRef, $K) = @_;
|
|
363
|
|
364 if (!(@$XArrayRef && ($K > 0) && ($K <= @$XArrayRef))) {
|
|
365 return undef;
|
|
366 }
|
|
367 my($KSmallest, @SortedXArray);
|
|
368 @SortedXArray = sort { $a <=> $b } @$XArrayRef;
|
|
369 $KSmallest = $SortedXArray[$K - 1];
|
|
370 return $KSmallest;
|
|
371 }
|
|
372
|
|
373 # Compute the kurtosis of an array of numbers:
|
|
374 # [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] - {3((n - 1)^2)}/{(n - 2)(n-3)}
|
|
375 #
|
|
376 sub Kurtosis {
|
|
377 my($XArrayRef) = @_;
|
|
378
|
|
379 if (!@$XArrayRef || ((@$XArrayRef - 3) <= 0)) {
|
|
380 return undef;
|
|
381 }
|
|
382 my($Kurtosis, $Mean, $StandardDeviation, $Value);
|
|
383 $Mean = Mean($XArrayRef);
|
|
384 if (!defined $Mean) {
|
|
385 return undef;
|
|
386 }
|
|
387 $StandardDeviation = StandardDeviation($XArrayRef);
|
|
388 if (!(defined $StandardDeviation && $StandardDeviation != 0)) {
|
|
389 return undef;
|
|
390 }
|
|
391
|
|
392 my($SumOfScores, $SampleSize);
|
|
393 $SumOfScores = 0;
|
|
394 for $Value (@$XArrayRef) {
|
|
395 $SumOfScores += (($Value - $Mean)/$StandardDeviation) ** 4;
|
|
396 }
|
|
397 $SampleSize = @$XArrayRef;
|
|
398 $Kurtosis = ((($SampleSize * ($SampleSize + 1))/(($SampleSize - 1) * ($SampleSize - 2) * ($SampleSize - 3))) * $SumOfScores) - ((3 * (($SampleSize - 1) ** 2))/(($SampleSize - 2) * ($SampleSize - 3)));
|
|
399 return $Kurtosis;
|
|
400 }
|
|
401
|
|
402 # Return the smallest value from an array of numbers
|
|
403 sub Minimum {
|
|
404 my($XArrayRef) = @_;
|
|
405 return KSmallest($XArrayRef, 1);
|
|
406 }
|
|
407
|
|
408 # Return the largest value from an array of numbers
|
|
409 sub Maximum {
|
|
410 my($XArrayRef) = @_;
|
|
411 return KLargest($XArrayRef, 1);
|
|
412 }
|
|
413
|
|
414 # Compute the mean of an array of numbers: SUM( x[i] ) / n
|
|
415 sub Mean {
|
|
416 my($XArrayRef) = @_;
|
|
417
|
|
418 if (!@$XArrayRef) {
|
|
419 return undef;
|
|
420 }
|
|
421 my($Mean, $Sum, $Value);
|
|
422 $Sum = 0;
|
|
423 foreach $Value (@$XArrayRef) {
|
|
424 $Sum += $Value;
|
|
425 }
|
|
426 $Mean = $Sum / @$XArrayRef;
|
|
427 return $Mean;
|
|
428 }
|
|
429
|
|
430 # Compute the median value of an array of numbers. For an even number array, it's
|
|
431 # the average of two middle values.
|
|
432 #
|
|
433 # For even values of n: Xsorted[(n - 1)/2 + 1]
|
|
434 # For odd values of n: (Xsorted[n/2] + Xsorted[n/2 + 1])/2
|
|
435 #
|
|
436 sub Median {
|
|
437 my($XArrayRef) = @_;
|
|
438
|
|
439 if (!@$XArrayRef) {
|
|
440 return undef;
|
|
441 }
|
|
442 my($Median, @SortedXArray);
|
|
443 $Median = 0;
|
|
444 @SortedXArray = sort { $a <=> $b } @$XArrayRef;
|
|
445 if (@$XArrayRef % 2) {
|
|
446 my($MidIndex);
|
|
447 $MidIndex = int(@SortedXArray - 1)/2;
|
|
448 $Median = $SortedXArray[$MidIndex];
|
|
449 }
|
|
450 else {
|
|
451 # Even number array...
|
|
452 my($MidPosition);
|
|
453 $MidPosition = int(@SortedXArray / 2);
|
|
454 $Median = ($SortedXArray[$MidPosition - 1] + $SortedXArray[$MidPosition]) / 2;
|
|
455 }
|
|
456 return $Median;
|
|
457 }
|
|
458
|
|
459 # Return the most frequently occuring value in an array of numbers
|
|
460 sub Mode {
|
|
461 my($XArrayRef) = @_;
|
|
462
|
|
463 if (!@$XArrayRef) {
|
|
464 return undef;
|
|
465 }
|
|
466 my($Value, %ValueToCountMap, @CountList, @SortedCountList);
|
|
467 %ValueToCountMap = ();
|
|
468 @CountList = ();
|
|
469 @SortedCountList = ();
|
|
470 for $Value (@$XArrayRef) {
|
|
471 if (exists $ValueToCountMap{$Value}) {
|
|
472 $ValueToCountMap{$Value} += 1;
|
|
473 }
|
|
474 else {
|
|
475 $ValueToCountMap{$Value} = 1;
|
|
476 }
|
|
477 }
|
|
478 for $Value (keys %ValueToCountMap) {
|
|
479 push @CountList, $ValueToCountMap{$Value};
|
|
480 }
|
|
481 @SortedCountList = sort { $b <=> $a } @CountList;
|
|
482
|
|
483 # Make sure the frequency of mode value is greater than one and check for
|
|
484 # multiple modes as well...
|
|
485 #
|
|
486 my($ModeCount, $ModeValue);
|
|
487 $ModeCount = $SortedCountList[0];
|
|
488 if ($ModeCount <= 1) {
|
|
489 return undef;
|
|
490 }
|
|
491 # Get the first mode value...
|
|
492 VALUE: for $Value (keys %ValueToCountMap) {
|
|
493 if ($ValueToCountMap{$Value} == $ModeCount) {
|
|
494 $ModeValue = $Value;
|
|
495 # Set it to zero to skip it next time...
|
|
496 $ValueToCountMap{$Value} = 0;
|
|
497 last VALUE;
|
|
498 }
|
|
499 }
|
|
500
|
|
501 if (wantarray) {
|
|
502 # Retrieve all the modes...
|
|
503 my(@Modes, $Count);
|
|
504 @Modes = ();
|
|
505 push @Modes, $ModeValue;
|
|
506 for $Count (@SortedCountList) {
|
|
507 if ($Count == $ModeCount) {
|
|
508 VALUE: for $Value (keys %ValueToCountMap) {
|
|
509 if ($ValueToCountMap{$Value} == $ModeCount) {
|
|
510 push @Modes, $Value;
|
|
511 # Set it to zero to skip it next time...
|
|
512 $ValueToCountMap{$Value} = 0;
|
|
513 last VALUE;
|
|
514 }
|
|
515 }
|
|
516 }
|
|
517 }
|
|
518 return sort {$b <=> $a} @Modes;
|
|
519 }
|
|
520 else {
|
|
521 return $ModeValue;
|
|
522 }
|
|
523 }
|
|
524
|
|
525
|
|
526 # Compute the Pearson correlation coefficient between two arrays of numbers:
|
|
527 #
|
|
528 # SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )(SUM( (y[i] - Ymean)^2 )) )
|
|
529 #
|
|
530 # It returns values in the range from -1.0 to 1.0
|
|
531 sub PearsonCorrelation {
|
|
532 my($XArrayRef, $YArrayRef) = @_;
|
|
533
|
|
534 if (!(@$XArrayRef && @$YArrayRef && (@$XArrayRef == @$YArrayRef))) {
|
|
535 return undef;
|
|
536 }
|
|
537 my($Correlation, $XMean, $YMean, $Index, $XValueDeviation, $YValueDeviation, $SquareOfXDeviations, $SquareOfYDeviations, $ProductOfDeviations);
|
|
538
|
|
539 $Correlation = 0;
|
|
540 $XMean = Mean($XArrayRef);
|
|
541 $YMean = Mean($YArrayRef);
|
|
542 $ProductOfDeviations = 0; $SquareOfXDeviations = 0; $SquareOfYDeviations = 0;
|
|
543 for $Index (0 .. $#{@$XArrayRef}) {
|
|
544 $XValueDeviation = $XArrayRef->[$Index] - $XMean;
|
|
545 $YValueDeviation = $YArrayRef->[$Index] - $YMean;
|
|
546 $ProductOfDeviations += ($XValueDeviation * $YValueDeviation);
|
|
547 $SquareOfXDeviations += $XValueDeviation ** 2;
|
|
548 $SquareOfYDeviations += $YValueDeviation ** 2;
|
|
549 }
|
|
550 $Correlation = $ProductOfDeviations / sqrt($SquareOfXDeviations * $SquareOfYDeviations);
|
|
551 return $Correlation;
|
|
552 }
|
|
553
|
|
554 # Return the smallest and largest values from an array of numbers
|
|
555 sub Range {
|
|
556 my($XArrayRef) = @_;
|
|
557
|
|
558 if (!@$XArrayRef) {
|
|
559 return (undef, undef);
|
|
560 }
|
|
561 my($Smallest, $Largest, @SortedXArray);
|
|
562 @SortedXArray = sort { $a <=> $b } @$XArrayRef;
|
|
563 $Smallest = $SortedXArray[0];
|
|
564 $Largest = $SortedXArray[$#SortedXArray];
|
|
565 return ($Smallest, $Largest);
|
|
566 }
|
|
567
|
|
568 # Compute square of the Pearson correlation coefficient between two arrays of numbers.
|
|
569 #
|
|
570 sub RSquare {
|
|
571 my($XArrayRef, $YArrayRef) = @_;
|
|
572 my($RSquare, $Correlation);
|
|
573
|
|
574 $RSquare = undef;
|
|
575 $Correlation = PearsonCorrelation($XArrayRef, $YArrayRef);
|
|
576 if (defined $Correlation) {
|
|
577 $RSquare = $Correlation ** 2;
|
|
578 }
|
|
579 return $RSquare;
|
|
580 }
|
|
581
|
|
582 # Compute the skewness of an array of numbers:
|
|
583 # {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 }
|
|
584 #
|
|
585 sub Skewness {
|
|
586 my($XArrayRef) = @_;
|
|
587
|
|
588 if (!@$XArrayRef || ((@$XArrayRef - 2) <= 0)) {
|
|
589 return undef;
|
|
590 }
|
|
591 my($Skewness, $Mean, $StandardDeviation, $Value);
|
|
592 $Mean = Mean($XArrayRef);
|
|
593 if (!defined $Mean) {
|
|
594 return undef;
|
|
595 }
|
|
596 $StandardDeviation = StandardDeviation($XArrayRef);
|
|
597 if (!(defined $StandardDeviation && $StandardDeviation != 0)) {
|
|
598 return undef;
|
|
599 }
|
|
600
|
|
601 my($SumOfScores, $SampleSize);
|
|
602 $SumOfScores = 0;
|
|
603 for $Value (@$XArrayRef) {
|
|
604 $SumOfScores += (($Value - $Mean)/$StandardDeviation) ** 3;
|
|
605 }
|
|
606 $SampleSize = @$XArrayRef;
|
|
607 $Skewness = ($SampleSize/(($SampleSize - 1) * ($SampleSize - 2) )) * $SumOfScores;
|
|
608 return $Skewness;
|
|
609 }
|
|
610
|
|
611 # Compute the standard deviation of an array of numbers
|
|
612 sub StandardDeviation {
|
|
613 my($XArrayRef) = @_;
|
|
614 return _CalculateStandardDeviation($XArrayRef, 2);
|
|
615 }
|
|
616
|
|
617 # Compute the standard deviation of an array of numbers representing entire population
|
|
618 sub StandardDeviationN {
|
|
619 my($XArrayRef) = @_;
|
|
620 return _CalculateStandardDeviation($XArrayRef, 1);
|
|
621 }
|
|
622
|
|
623 # Compute the standard deviation of an array of numbers.
|
|
624 # Mode 1: SQRT ( SUM( (x[i] - mean)^2 ) / n )
|
|
625 # Mode 2: SQRT ( SUM( (x[i] - mean)^2 ) / (n - 1) )
|
|
626 #
|
|
627 sub _CalculateStandardDeviation {
|
|
628 my($XArrayRef, $Mode) = @_;
|
|
629
|
|
630 if (!@$XArrayRef) {
|
|
631 return undef;
|
|
632 }
|
|
633 my($StandardDeviation, $Value, $SquareOfDeviations, $Mean, $N);
|
|
634
|
|
635 $StandardDeviation = 0;
|
|
636 $Mean = Mean($XArrayRef);
|
|
637 $SquareOfDeviations = 0;
|
|
638 foreach $Value (@$XArrayRef) {
|
|
639 $SquareOfDeviations += ($Value - $Mean) ** 2;
|
|
640 }
|
|
641 $N = ($Mode == 1) ? @$XArrayRef : (@$XArrayRef - 1);
|
|
642 $StandardDeviation = sqrt($SquareOfDeviations / $N);
|
|
643
|
|
644 return $StandardDeviation;
|
|
645 }
|
|
646
|
|
647 # Compute the standard error using standard deviation and sample size
|
|
648 sub StandardError {
|
|
649 my($StandardDeviation, $Count) = @_;
|
|
650
|
|
651 if ($Count <= 0) {
|
|
652 return undef;
|
|
653 }
|
|
654 my($StandardError);
|
|
655 $StandardError = $StandardDeviation / sqrt($Count);
|
|
656
|
|
657 return $StandardError;
|
|
658 }
|
|
659
|
|
660 # Standardize the value using mean and standard deviation
|
|
661 sub Standardize {
|
|
662 my($Value, $Mean, $StandardDeviation) = @_;
|
|
663
|
|
664 if ($StandardDeviation <= 0) {
|
|
665 return undef;
|
|
666 }
|
|
667 my($StandardizedValue);
|
|
668 $StandardizedValue = ($Value - $Mean)/$StandardDeviation;
|
|
669
|
|
670 return $StandardizedValue;
|
|
671 }
|
|
672
|
|
673 # Compute the standard deviation above the mean for an array of numbers.
|
|
674 sub StandardScores {
|
|
675 my($XArrayRef) = @_;
|
|
676 return _CalculateStandardScores($XArrayRef, 2);
|
|
677 }
|
|
678
|
|
679 # Compute the standard deviation above the mean for an array of numbers representing entire population
|
|
680 sub StandardScoresN {
|
|
681 my($XArrayRef) = @_;
|
|
682 return _CalculateStandardScores($XArrayRef, 1);
|
|
683 }
|
|
684
|
|
685 # Compute the standard deviation above the mean for an array of numbers.
|
|
686 # Mode 1: (x[i] - mean) / n
|
|
687 # Mode 2: (x[i] - mean) / (n - 1)
|
|
688 #
|
|
689 sub _CalculateStandardScores {
|
|
690 my($XArrayRef, $Mode) = @_;
|
|
691
|
|
692 if (!@$XArrayRef) {
|
|
693 return undef;
|
|
694 }
|
|
695 my(@StandardScores, $Mean, $StandardDeviation, $Value);
|
|
696
|
|
697 $Mean = Mean($XArrayRef);
|
|
698 $StandardDeviation = _CalculateStandardDeviation($XArrayRef, $Mode);
|
|
699 if (!(defined($StandardDeviation) && $StandardDeviation > 0)) {
|
|
700 return undef;
|
|
701 }
|
|
702 @StandardScores = ();
|
|
703 for $Value (@$XArrayRef) {
|
|
704 push @StandardScores, ($Value - $Mean)/$StandardDeviation;
|
|
705 }
|
|
706
|
|
707 return @StandardScores;
|
|
708 }
|
|
709
|
|
710 # Compute the product of an array of numbers
|
|
711 sub Product {
|
|
712 my($XArrayRef) = @_;
|
|
713
|
|
714 if (!@$XArrayRef) {
|
|
715 return undef;
|
|
716 }
|
|
717 my($Product, $Value);
|
|
718 $Product = 1;
|
|
719 foreach $Value (@$XArrayRef) {
|
|
720 $Product *= $Value;
|
|
721 }
|
|
722 return $Product;
|
|
723 }
|
|
724
|
|
725 # Compute the sum of an array of numbers
|
|
726 sub Sum {
|
|
727 my($XArrayRef) = @_;
|
|
728
|
|
729 if (!@$XArrayRef) {
|
|
730 return undef;
|
|
731 }
|
|
732 my($Sum, $Value);
|
|
733 $Sum = 0;
|
|
734 foreach $Value (@$XArrayRef) {
|
|
735 $Sum += $Value;
|
|
736 }
|
|
737 return $Sum;
|
|
738 }
|
|
739
|
|
740 # Compute the sum of squares of an array of numbers
|
|
741 sub SumOfSquares {
|
|
742 my($XArrayRef) = @_;
|
|
743
|
|
744 if (!@$XArrayRef) {
|
|
745 return undef;
|
|
746 }
|
|
747 my($SumOfSquares, $Value);
|
|
748 $SumOfSquares = 0;
|
|
749 foreach $Value (@$XArrayRef) {
|
|
750 $SumOfSquares += $Value ** 2;
|
|
751 }
|
|
752 return $SumOfSquares;
|
|
753 }
|
|
754
|
|
755 # Compute the mean of an array of numbers by excluding a fraction of
|
|
756 # numbers from the top and bottom of the data set.
|
|
757 sub TrimMean {
|
|
758 my($XArrayRef, $FractionToExclude) = @_;
|
|
759
|
|
760 if (!(@$XArrayRef && $FractionToExclude > 0 && $FractionToExclude <= 1)) {
|
|
761 return undef;
|
|
762 }
|
|
763 my($NumberToExclude);
|
|
764 $NumberToExclude = int(@$XArrayRef * $FractionToExclude);
|
|
765 $NumberToExclude = ($NumberToExclude % 2) ? ($NumberToExclude - 1) : $NumberToExclude;
|
|
766 if ($NumberToExclude == @$XArrayRef) {
|
|
767 return undef;
|
|
768 }
|
|
769 my($Mean, $Sum, $Index, $FirstIndex, $LastIndex);
|
|
770 $FirstIndex = $NumberToExclude/2;
|
|
771 $LastIndex = @$XArrayRef - ($NumberToExclude/2) - 1;
|
|
772 $Sum = 0;
|
|
773 my(@SortedXArray);
|
|
774 @SortedXArray = sort { $a <=> $b } @$XArrayRef;
|
|
775 for $Index ($FirstIndex .. $LastIndex) {
|
|
776 $Sum += $SortedXArray[$Index];
|
|
777 }
|
|
778 $Mean = $Sum/(@SortedXArray - $NumberToExclude);
|
|
779 return $Mean;
|
|
780 }
|
|
781
|
|
782 # Compute the variance of an array of numbers
|
|
783 sub Variance {
|
|
784 my($XArrayRef) = @_;
|
|
785 return _CalculateVariance($XArrayRef, 2);
|
|
786 }
|
|
787
|
|
788 # Compute the variance of an array of numbers representing entire population
|
|
789 sub VarianceN {
|
|
790 my($XArrayRef) = @_;
|
|
791 return _CalculateVariance($XArrayRef, 1);
|
|
792 }
|
|
793
|
|
794 # Compute the variance of an array of numbers:
|
|
795 # Mode 1: SUM( (x[i] - Xmean)^2 / n )
|
|
796 # Mode 2: SUM( (x[i] - Xmean)^2 / (n - 1) )
|
|
797 #
|
|
798 sub _CalculateVariance {
|
|
799 my($XArrayRef, $Mode) = @_;
|
|
800
|
|
801 if (!@$XArrayRef) {
|
|
802 return undef;
|
|
803 }
|
|
804 my($Variance, $Value, $SquareOfDeviations, $Mean, $N);
|
|
805
|
|
806 $Variance = 0;
|
|
807 $Mean = Mean($XArrayRef);
|
|
808 $SquareOfDeviations = 0;
|
|
809 foreach $Value (@$XArrayRef) {
|
|
810 $SquareOfDeviations += ($Value - $Mean) ** 2;
|
|
811 }
|
|
812 $N = ($Mode == 1) ? @$XArrayRef : (@$XArrayRef - 1);
|
|
813 $Variance = $SquareOfDeviations / $N;
|
|
814
|
|
815 return $Variance;
|
|
816 }
|
|
817
|
|
818 1;
|
|
819
|
|
820 __END__
|
|
821
|
|
822 =head1 NAME
|
|
823
|
|
824 StatisticsUtil
|
|
825
|
|
826 =head1 SYNOPSIS
|
|
827
|
|
828 use StatisticsUtil;
|
|
829
|
|
830 use Statistics qw(:all);
|
|
831
|
|
832 =head1 DESCRIPTION
|
|
833
|
|
834 B<StatisticsUtil> module provides the following functions:
|
|
835
|
|
836 Average, AverageDeviation, Correlation, Covariance, Euclidean, Factorial,
|
|
837 FactorialDivison, Frequency, GeometricMean, HarmonicMean, KLargest, KSmallest,
|
|
838 Kurtosis, Maximum, Mean, Median, Minimum, Mode, PearsonCorrelation, Permutations,
|
|
839 Product, RSquare, Range, Skewness, StandardDeviation, StandardDeviationN,
|
|
840 StandardError, StandardScores, StandardScoresN, Standardize, Sum, SumOfSquares,
|
|
841 TrimMean, Variance, VarianceN
|
|
842
|
|
843 =head2 METHODS
|
|
844
|
|
845 =over 4
|
|
846
|
|
847 =item B<Average>
|
|
848
|
|
849 $Value = Average(\@DataArray);
|
|
850
|
|
851 Computes the mean of an array of numbers: SUM( x[i] ) / n
|
|
852
|
|
853 =item B<AverageDeviation>
|
|
854
|
|
855 $Value = AverageDeviation(\@DataArray);
|
|
856
|
|
857 Computes the average of the absolute deviation of an array of numbers: SUM( ABS(x[i] - Xmean) ) / n
|
|
858
|
|
859 =item B<Correlation>
|
|
860
|
|
861 $Value = Correlation(\@XDataArray, \@YDataArray);
|
|
862
|
|
863 Computes the Pearson correlation coefficient between two arrays of numbers:
|
|
864 SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )(SUM( (y[i] - Ymean)^2 )) )
|
|
865
|
|
866 =item B<Euclidean>
|
|
867
|
|
868 $Return = Euclidean(\@DataArray);
|
|
869
|
|
870 Computes the euclidean distance of an array of numbers: SQRT( SUM( x[i] ** 2) )
|
|
871
|
|
872 =item B<Covariance>
|
|
873
|
|
874 $Value = Covariance(\@XDataArray, \@YDataArray);
|
|
875
|
|
876 Computes the covariance between two arrays of numbers: SUM( (x[i] - Xmean) (y[i] - Ymean) ) / n
|
|
877
|
|
878 =item B<Factorial>
|
|
879
|
|
880 $Value = Factorial($Num);
|
|
881
|
|
882 Computes the factorial of a positive integer.
|
|
883
|
|
884 =item B<FactorialDivison>
|
|
885
|
|
886 $Value = FactorialDivision($Numerator, $Denominator);
|
|
887
|
|
888 Compute the factorial divison of two positive integers.
|
|
889
|
|
890 =item B<Frequency>
|
|
891
|
|
892 %FrequencyValues = Frequency(\@DataArray, [$NumOfBins]);
|
|
893 %FrequencyValues = Frequency(\@DataArray, [\@BinRange]);
|
|
894
|
|
895 A hash array is returned with keys and values representing range and frequency values, respectively.
|
|
896 The frequency value for a specific key corresponds to all the values which are greater than
|
|
897 the previous key and less than or equal to the current key. A key value representing maximum value is
|
|
898 added for generating frequency distribution for specific number of bins, and whenever the maximum
|
|
899 array value is greater than the maximum specified in bin range, it is also added to bin range.
|
|
900
|
|
901 =item B<GeometricMean>
|
|
902
|
|
903 $Value = GeometricMean(\@DataArray);
|
|
904
|
|
905 Computes the geometric mean of an array of numbers: NthROOT( PRODUCT(x[i]) )
|
|
906
|
|
907 =item B<HarmonicMean>
|
|
908
|
|
909 $Value = HarmonicMean(\@DataArray);
|
|
910
|
|
911 Computes the harmonic mean of an array of numbers: 1 / ( SUM(1/x[i]) / n )
|
|
912
|
|
913 =item B<KLargest>
|
|
914
|
|
915 $Value = KLargest(\@DataArray, $KthNumber);
|
|
916
|
|
917 Returns the k-largest value from an array of numbers.
|
|
918
|
|
919 =item B<KSmallest>
|
|
920
|
|
921 $Value = KSmallest(\@DataArray, $KthNumber);
|
|
922
|
|
923 Returns the k-smallest value from an array of numbers.
|
|
924
|
|
925 =item B<Kurtosis>
|
|
926
|
|
927 $Value = Kurtosis(\@DataArray);
|
|
928
|
|
929 Computes the kurtosis of an array of numbers:
|
|
930 [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] - {3((n - 1)^2)}/{(n - 2)(n-3)}
|
|
931
|
|
932 =item B<Maximum>
|
|
933
|
|
934 $Value = Maximum(\@DataArray);
|
|
935
|
|
936 Returns the largest value from an array of numbers.
|
|
937
|
|
938 =item B<Minimum>
|
|
939
|
|
940 $Value = Minimum(\@DataArray);
|
|
941
|
|
942 Returns the smallest value from an array of numbers.
|
|
943
|
|
944 =item B<Mean>
|
|
945
|
|
946 $Value = Mean(\@DataArray);
|
|
947
|
|
948 Computes the mean of an array of numbers: SUM( x[i] ) / n
|
|
949
|
|
950 =item B<Median>
|
|
951
|
|
952 $Value = Median(\@DataArray);
|
|
953
|
|
954 Computes the median value of an array of numbers. For an even number array, it's
|
|
955 the average of two middle values.
|
|
956
|
|
957 For even values of n: Xsorted[(n - 1)/2 + 1]
|
|
958 For odd values of n: (Xsorted[n/2] + Xsorted[n/2 + 1])/2
|
|
959
|
|
960 =item B<Mode>
|
|
961
|
|
962 $Value = Mode(\@DataArray);
|
|
963
|
|
964 Returns the most frequently occuring value in an array of numbers.
|
|
965
|
|
966 =item B<PearsonCorrelation>
|
|
967
|
|
968 $Value = Correlation(\@XDataArray, \@YDataArray);
|
|
969
|
|
970 Computes the Pearson correlation coefficient between two arrays of numbers:
|
|
971 SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )(SUM( (y[i] - Ymean)^2 )) )
|
|
972
|
|
973 =item B<Permutations>
|
|
974
|
|
975 $PermutationsRef = Permutations(@DataToPermute);
|
|
976
|
|
977 Generate all possible permuations or a specific permutations of items in an array
|
|
978 and return a reference to an array containing array references to generated permuations.
|
|
979
|
|
980 This alogrithm is based on the example provided by Mark Jason-Dominus, and is available
|
|
981 at CPAN as mjd_permute standalone script.
|
|
982
|
|
983 =item B<Product>
|
|
984
|
|
985 $Value = Product(\@DataArray);
|
|
986
|
|
987 Compute the product of an array of numbers.
|
|
988
|
|
989 =item B<Range>
|
|
990
|
|
991 ($Smallest, $Largest) = Range(\@DataArray);
|
|
992
|
|
993 Return the smallest and largest values from an array of numbers.
|
|
994
|
|
995 =item B<RSquare>
|
|
996
|
|
997 $Value = RSquare(\@XDataArray, \@YDataArray);
|
|
998
|
|
999 Computes square of the Pearson correlation coefficient between two arrays of numbers.
|
|
1000
|
|
1001 =item B<Skewness>
|
|
1002
|
|
1003 $Value = Skewness(\@DataArray);
|
|
1004
|
|
1005 Computes the skewness of an array of numbers:
|
|
1006 {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 }
|
|
1007
|
|
1008 =item B<StandardDeviation>
|
|
1009
|
|
1010 $Value = StandardDeviation(\@DataArray);
|
|
1011
|
|
1012 Computes the standard deviation of an array of numbers.
|
|
1013 SQRT ( SUM( (x[i] - mean)^2 ) / (n - 1) )
|
|
1014
|
|
1015 =item B<StandardDeviationN>
|
|
1016
|
|
1017 $Value = StandardDeviationN(\@DataArray);
|
|
1018
|
|
1019 Computes the standard deviation of an array of numbers representing entire population:
|
|
1020 SQRT ( SUM( (x[i] - mean)^2 ) / n )
|
|
1021
|
|
1022 =item B<StandardError>
|
|
1023
|
|
1024 $Value = StandardError($StandardDeviation, $Count);
|
|
1025
|
|
1026 Computes the standard error using standard deviation and sample size.
|
|
1027
|
|
1028 =item B<Standardize>
|
|
1029
|
|
1030 $Value = Standardize($Value, $Mean, $StandardDeviation);
|
|
1031
|
|
1032 Standardizes the value using mean and standard deviation.
|
|
1033
|
|
1034 =item B<StandardScores>
|
|
1035
|
|
1036 @Values = StandardScores(\@DataArray);
|
|
1037
|
|
1038 Computes the standard deviation above the mean for an array of numbers:
|
|
1039 (x[i] - mean) / (n - 1)
|
|
1040
|
|
1041 =item B<StandardScoresN>
|
|
1042
|
|
1043 @Values = StandardScoresN(\@DataArray);
|
|
1044
|
|
1045 Computes the standard deviation above the mean for an array of numbers representing entire population:
|
|
1046 (x[i] - mean) / n
|
|
1047
|
|
1048 =item B<Sum>
|
|
1049
|
|
1050 $Value = Sum(\@DataArray);
|
|
1051
|
|
1052 Compute the sum of an array of numbers.
|
|
1053
|
|
1054 =item B<SumOfSquares>
|
|
1055
|
|
1056 $Value = SumOfSquares(\@DataArray);
|
|
1057
|
|
1058 Computes the sum of an array of numbers.
|
|
1059
|
|
1060 =item B<TrimMean>
|
|
1061
|
|
1062 $Value = TrimMean(\@DataArray, $FractionToExclude));
|
|
1063
|
|
1064 Computes the mean of an array of numbers by excluding a fraction of
|
|
1065 numbers from the top and bottom of the data set.
|
|
1066
|
|
1067 =item B<Variance>
|
|
1068
|
|
1069 $Value = Variance(\@DataArray);
|
|
1070
|
|
1071 Computes the variance of an array of numbers: SUM( (x[i] - Xmean)^2 / (n - 1) )
|
|
1072
|
|
1073 =item B<VarianceN>
|
|
1074
|
|
1075 $Value = Variance(\@DataArray);
|
|
1076
|
|
1077 Compute the variance of an array of numbers representing entire population:
|
|
1078 SUM( (x[i] - Xmean)^2 / n )
|
|
1079
|
|
1080 =back
|
|
1081
|
|
1082 =head1 AUTHOR
|
|
1083
|
|
1084 Manish Sud <msud@san.rr.com>
|
|
1085
|
|
1086 =head1 SEE ALSO
|
|
1087
|
|
1088 Constants.pm, ConversionsUtil.pm, MathUtil.pm
|
|
1089
|
|
1090 =head1 COPYRIGHT
|
|
1091
|
|
1092 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
1093
|
|
1094 This file is part of MayaChemTools.
|
|
1095
|
|
1096 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
1097 the terms of the GNU Lesser General Public License as published by the Free
|
|
1098 Software Foundation; either version 3 of the License, or (at your option)
|
|
1099 any later version.
|
|
1100
|
|
1101 =cut
|