#!/usr/bin/perl -w
#
# $RCSfile: AnalyzeTextFilesData.pl,v $
# $Date: 2015/02/28 20:46:04 $
# $Revision: 1.36 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use StatisticsUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect column information for all the text files...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# Generate output files; files flagged as not okay during the checks above
# are silently skipped here (a warning was already issued for them)...
my($FileIndex);
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    AnalyzeTextFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Analyze data...
#
# Collects, for the text file at position $Index in @TextFilesList, the values
# of every column listed in UniqueColNumsToAnalyze and dispatches them to the
# analysis routines selected through the specified statistical functions.
#
# Parameters:
#   $Index - position of the file in @TextFilesList and in the per-file arrays
#            of %TextFilesInfo.
#
# When $OptionsInfo{CheckData} is set, values rejected by IsNumerical are left
# out of the collected data and reported according to $OptionsInfo{DetailLevel}.
# Dies when the input file can't be opened.
#
sub AnalyzeTextFile {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $Line, $InDelim, $ColNum, $Value, @LineWords, @ColNumsToAnalyze, %ColValuesToAnalyzeMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColNumsToAnalyze = @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]};
  %ColValuesToAnalyzeMap = ();
  for $ColNum (@ColNumsToAnalyze) {
    @{$ColValuesToAnalyzeMap{$ColNum}} = ();
  }

  my($LineCount, $InvalidLineCount, @InvalidColLabels);

  # Three-argument open: the file name is never interpreted as an open mode...
  open $TextFileHandle, '<', $TextFile or die "Error: Can't open $TextFile: $! \n";

  # Skip over column labels line in text file and collect appropriate column data
  # for analysis...
  $Line = GetTextLine($TextFileHandle);
  $LineCount = 1;
  $InvalidLineCount = 0;
  while ($Line = GetTextLine($TextFileHandle)) {
    $LineCount++;
    @LineWords = quotewords($InDelim, 0, $Line);
    @InvalidColLabels = ();
    COLNUM: for $ColNum (@ColNumsToAnalyze) {
      $Value = $LineWords[$ColNum];
      if ($OptionsInfo{CheckData}) {
        if (!IsNumerical($Value)) {
          # Remember the label for reporting and leave the value out...
          push @InvalidColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
          next COLNUM;
        }
      }
      push @{$ColValuesToAnalyzeMap{$ColNum}}, $Value;
    }
    if (@InvalidColLabels) {
      $InvalidLineCount++;
      if ($OptionsInfo{DetailLevel} >=4 ) {
        print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed: $Line \n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 3) {
        print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed...\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for columns to be analyzed...\n";
      }
    }
  }
  if ($InvalidLineCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidLineCount line(s)...\n";
  }
  close $TextFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction);
  @SpecifiedFunctionNames = ();

  # Functions listed in the pattern below are written to their own output
  # files by dedicated routines; everything else goes through PerformAnalysis...
  for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%ColValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
    if ($OptionsInfo{AllColumnPairs}) {
      PerformMatrixAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformColumnPairAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
    PerformStandardScoresAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
    PerformFrequencyAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
}

# Calculate values for various statistical functions...
#
# Writes one output file, <OutFileRoot><FileNameMode>.<OutFileExt>, containing
# a header row followed by one row per column in ColNumsToAnalyze; each row
# holds the column label and one value per function name.
#
# Parameters:
#   $Index                     - position of the file in the per-file arrays.
#   $SpecifiedFunctionNamesRef - reference to an array of function names; apart
#                                from Count, each name is called as a function
#                                of the current package.
#   $ColValuesToAnalyzeMapRef  - reference to a hash mapping column number to a
#                                reference to an array of collected values.
#
# Columns without any collected values get empty strings for every function.
# Dies when the output file can't be opened.
#
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, $Line, $SpecifiedFunction, $Label, @ColLabels, @ColNumsToAnalyze);

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $TextFilesInfo{OutFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "ColumnID";
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      # Prefix the rank and drop the letter K from the function name...
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each column to be analyzed...
  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};

  my($ColValuesRef, $ColNum, $Value, $FunctionRef, $StandardDeviationValue, @RowValues, %CalculatedValues);

  # Cache of standard deviation values shared by the StandardDeviation and
  # StandardError functions...
  %CalculatedValues = ();
  for $ColNum (@ColNumsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum];
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!@{$ColValuesRef}) {
        # Invalid column values...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = @{$ColValuesRef};
      }
      else {
        # Look up the function by name. Taking a code reference through
        # \&{"name"} is allowed under "use strict", so strictures don't need
        # to be turned off for the rest of the subroutine...
        $FunctionRef = \&{$SpecifiedFunction};

        if ($SpecifiedFunction =~ /^KLargest$/i) {
          $Value = $FunctionRef->($ColValuesRef, $OptionsInfo{KLargest});
        }
        elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
          $Value = $FunctionRef->($ColValuesRef, $OptionsInfo{KSmallest});
        }
        elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
          if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
            $CalculatedValues{$ColNum}{StandardDeviation} = $FunctionRef->($ColValuesRef);
          }
          $Value = $CalculatedValues{$ColNum}{StandardDeviation};
        }
        elsif ($SpecifiedFunction =~ /^StandardError$/i) {
          if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
            $CalculatedValues{$ColNum}{StandardDeviation} = StandardDeviation($ColValuesRef);
          }
          $StandardDeviationValue = $CalculatedValues{$ColNum}{StandardDeviation};
          if (defined $StandardDeviationValue) {
            # Pass the number of values explicitly: an array in an argument
            # list is flattened into its elements, it is not evaluated as a count.
            # NOTE(review): assumes StandardError takes (standard deviation,
            # count) - confirm against StatisticsUtil.
            $Value = $FunctionRef->($StandardDeviationValue, scalar(@{$ColValuesRef}));
          }
        }
        elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
          $Value = $FunctionRef->($ColValuesRef, $OptionsInfo{TrimFraction});
        }
        else {
          $Value = $FunctionRef->($ColValuesRef);
        }
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  close $NewTextFileHandle;
}

# Calculate covariance, correlation, rsquare for specified column pairs....
#
# Writes <OutFileRoot>ColumnPairsAnalysis.<OutFileExt> with one row per column
# pair taken from ColPairs1ToAnalyze and ColPairs2ToAnalyze. Each row holds the
# two column labels followed by the requested values, in this order:
# Correlation (also written when only RSquare is requested), RSquare and
# Covariance.
#
# Parameters:
#   $Index                    - position of the file in the per-file arrays.
#   $ColValuesToAnalyzeMapRef - reference to a hash mapping column number to a
#                               reference to an array of collected values.
#
# A pair whose columns hold a different number of values is skipped with a
# warning and gets empty strings. Dies when the output file can't be opened.
#
# NOTE(review): only the number of values is compared; when data checking
# drops different rows from the two columns, the remaining values may no
# longer line up row by row - confirm whether that matters to callers.
#
sub PerformColumnPairAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "ColumnPairsAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  # Three-argument open: the file name is never interpreted as an open mode...
  open $NewTextFileHandle, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out the column labels...
  @ColLabels = ();
  push @ColLabels, ("ColumnID1", "ColumnID2");
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @ColLabels, "Correlation";
    if ($CalculateRSquare) {
      push @ColLabels, "RSquare";
    }
  }
  if ($CalculateCovariance) {
    push @ColLabels, "Covariance";
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Round a value to the requested precision; adding zero gets rid of trailing
  # zeros. Undefined and empty values are written out as empty strings...
  my($FormatValue);
  $FormatValue = sub {
    my($RawValue) = @_;
    return (defined($RawValue) && length($RawValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0) : "";
  };

  # Go over each column pair...
  my($CorrelationValue, $RSquareValue, $CovarianceValue, $ColIndex, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColPairs1ToAnalyze, @ColPairs2ToAnalyze, @RowValues);

  @ColPairs1ToAnalyze = @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]};
  @ColPairs2ToAnalyze = @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]};
  for $ColIndex (0 .. $#ColPairs1ToAnalyze) {
    @RowValues = ();
    $ColNum1 = $ColPairs1ToAnalyze[$ColIndex];
    $ColNum2 = $ColPairs2ToAnalyze[$ColIndex];
    $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
    $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};

    # Setup column ids...
    push @RowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum1];
    push @RowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum2];

    if (@$ColValuesRef1 != @$ColValuesRef2) {
      # Print a warning and keep the row layout intact using empty values...
      warn "Warning: Skipping analysis for column pair $TextFilesInfo{ColLabels}[$Index][$ColNum1], $TextFilesInfo{ColLabels}[$Index][$ColNum2]: Number of valid data values must be same.\n";
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @RowValues, "";
        if ($CalculateRSquare) {
          push @RowValues, "";
        }
      }
      if ($CalculateCovariance) {
        push @RowValues, "";
      }
    }
    else {
      # Calculate appropriate value...
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2);
        push @RowValues, $FormatValue->($CorrelationValue);
        if ($CalculateRSquare) {
          # RSquare is derived from the unrounded correlation value...
          $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
          push @RowValues, $FormatValue->($RSquareValue);
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2);
        push @RowValues, $FormatValue->($CovarianceValue);
      }
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  close $NewTextFileHandle;
}

# Generate histogram numbers...
#
# Writes, for every column in ColNumsToAnalyze, a separate file named
# <OutFileRoot><ColumnLabel>FrequencyAnalysis.<OutFileExt> holding a
# "Bins"/"Frequency" header row followed by one row per bin returned by
# Frequency, in ascending numerical bin order.
#
# Parameters:
#   $Index                    - position of the file in the per-file arrays.
#   $ColValuesToAnalyzeMapRef - reference to a hash mapping column number to a
#                               reference to an array of collected values.
#
# Frequency is called with $OptionsInfo{BinRange} when that list isn't empty
# and with $OptionsInfo{NumOfBins} otherwise. A column without any collected
# values produces a file containing only the header row. Dies when an output
# file can't be opened.
#
sub PerformFrequencyAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, @RowValues, $Line, $ColNum, @ColNumsToAnalyze, $ColValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);

  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};
  for $ColNum (@ColNumsToAnalyze) {
    $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
    print "Generating new text file $NewTextFile...\n";

    # The file name contains a column label read from the input file, so use
    # the three-argument open to make sure it's never interpreted as a mode...
    open $NewTextFileHandle, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ();
    push @ColLabels , ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";

    # Calculate and write out frequency values...
    %FrequencyMap = ();
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    if (@$ColValuesRef) {
      if (@{$OptionsInfo{BinRange}}) {
        %FrequencyMap = Frequency($ColValuesRef, \@{$OptionsInfo{BinRange}});
      }
      else {
        %FrequencyMap = Frequency($ColValuesRef, $OptionsInfo{NumOfBins});
      }
    }
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      # Round to the requested precision; adding zero gets rid of trailing zeros...
      @RowValues = ();
      $Value = (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
      push @RowValues, $Value;
      $Value = (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";
      push @RowValues, $Value;

      $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileHandle "$Line\n";
    }
    close $NewTextFileHandle;
  }
}

# Calculate covariance, correlation/rsquare matrices....
388 sub PerformMatrixAnalysis { 389 my($Index, $ColValuesToAnalyzeMapRef) = @_; 390 my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance); 391 392 $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0; 393 $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0; 394 $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0; 395 396 $CorrelationTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CorrelationMatrix." . $TextFilesInfo{OutFileExt}[$Index]; 397 $RSquareTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "RSquareMatrix." . $TextFilesInfo{OutFileExt}[$Index]; 398 $CovarianceTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CovarianceMatrix." . $TextFilesInfo{OutFileExt}[$Index]; 399 400 my($TextFilesList, $Delimiter); 401 $TextFilesList = ""; 402 if ($CalculateCorrelation || $CalculateRSquare) { 403 $TextFilesList = $CorrelationTextFile; 404 if ($CalculateRSquare) { 405 $TextFilesList .= ", $CorrelationTextFile"; 406 } 407 } 408 $Delimiter = length($TextFilesList) ? "," : ""; 409 if ($CalculateCovariance) { 410 $TextFilesList .= "${Delimiter} ${CorrelationTextFile}"; 411 } 412 if ($TextFilesList =~ /\,/) { 413 print "Generating new text files $TextFilesList...\n" 414 } 415 else { 416 print "Generating new text file $TextFilesList...\n" 417 } 418 if ($CalculateCorrelation || $CalculateRSquare) { 419 open CORRELATIONTEXTFILE, ">$CorrelationTextFile" or die "Error: Can't open $CorrelationTextFile: $! \n"; 420 if ($CalculateRSquare) { 421 open RSQUARETEXTFILE, ">$RSquareTextFile" or die "Error: Can't open $RSquareTextFile: $! \n"; 422 } 423 } 424 if ($CalculateCovariance) { 425 open COVARIANCETEXTFILE, ">$CovarianceTextFile" or die "Error: Can't open $CovarianceTextFile: $! 
\n"; 426 } 427 428 my($Line, $Value, $CorrelationValue, $RSquareValue, $CovarianceValue, $ColNum, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues); 429 430 # Write out the column labels... 431 @ColLabels = (); 432 push @ColLabels, ""; 433 for $ColNum (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 434 push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum]; 435 } 436 $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 437 if ($CalculateCorrelation || $CalculateRSquare) { 438 print CORRELATIONTEXTFILE "$Line\n"; 439 if ($CalculateRSquare) { 440 print RSQUARETEXTFILE "$Line\n"; 441 } 442 } 443 if ($CalculateCovariance) { 444 print COVARIANCETEXTFILE "$Line\n"; 445 } 446 447 # Due to symmetric nature of these matrices, only one half needs to be 448 # calculated. So, just calculate the lower half and copy it to upper half... 449 my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap); 450 451 %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = (); 452 for $ColNum1 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 453 for $ColNum2 (0 .. $ColNum1) { 454 $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}}; 455 $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}}; 456 if ($CalculateCorrelation || $CalculateRSquare) { 457 $CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2); 458 $CorrelationValue = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : ""; 459 $CorrelationMatrixMap{$ColNum1}{$ColNum2} = $CorrelationValue; 460 if ($ColNum1 != $ColNum2) { 461 $CorrelationMatrixMap{$ColNum2}{$ColNum1} = $CorrelationValue; 462 } 463 if ($CalculateRSquare) { 464 $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : ""; 465 $RSquareValue = (length($RSquareValue)) ? 
(sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : ""; 466 $RSquareMatrixMap{$ColNum1}{$ColNum2} = $RSquareValue; 467 if ($ColNum1 != $ColNum2) { 468 $RSquareMatrixMap{$ColNum2}{$ColNum1} = $RSquareValue; 469 } 470 } 471 } 472 if ($CalculateCovariance) { 473 $CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2); 474 $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : ""; 475 $CovarianceMatrixMap{$ColNum1}{$ColNum2} = $CovarianceValue; 476 if ($ColNum1 != $ColNum2) { 477 $CovarianceMatrixMap{$ColNum2}{$ColNum1} = $CovarianceValue; 478 } 479 } 480 } 481 } 482 483 # Write out the matrices... 484 for $ColNum1 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 485 @CorrelationRowValues = (); 486 @RSquareRowValues = (); 487 @CovarianceRowValues = (); 488 if ($CalculateCorrelation || $CalculateRSquare) { 489 push @CorrelationRowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum1]; 490 if ($CalculateRSquare) { 491 push @RSquareRowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum1]; 492 } 493 } 494 if ($CalculateCovariance) { 495 push @CovarianceRowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum1]; 496 } 497 for $ColNum2 (0 .. 
($TextFilesInfo{ColCount}[$Index] - 1)) { 498 if ($CalculateCorrelation || $CalculateRSquare) { 499 push @CorrelationRowValues, $CorrelationMatrixMap{$ColNum1}{$ColNum2}; 500 if ($CalculateRSquare) { 501 push @RSquareRowValues, $RSquareMatrixMap{$ColNum1}{$ColNum2}; 502 } 503 } 504 if ($CalculateCovariance) { 505 push @CovarianceRowValues, $CovarianceMatrixMap{$ColNum1}{$ColNum2}; 506 } 507 } 508 if ($CalculateCorrelation || $CalculateRSquare) { 509 $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 510 print CORRELATIONTEXTFILE "$Line\n"; 511 if ($CalculateRSquare) { 512 $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 513 print RSQUARETEXTFILE "$Line\n"; 514 } 515 } 516 if ($CalculateCovariance) { 517 $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 518 print COVARIANCETEXTFILE "$Line\n"; 519 } 520 } 521 if ($CalculateCorrelation || $CalculateRSquare) { 522 close CORRELATIONTEXTFILE; 523 if ($CalculateRSquare) { 524 close RSQUARETEXTFILE; 525 } 526 } 527 if ($CalculateCovariance) { 528 close COVARIANCETEXTFILE; 529 } 530 } 531 532 # Calculate standard scores... 533 sub PerformStandardScoresAnalysis { 534 my($Index, $ColValuesToAnalyzeMapRef) = @_; 535 my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $Label, $NewLine); 536 537 $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0; 538 $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0; 539 540 $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "StandardScores." . $TextFilesInfo{OutFileExt}[$Index]; 541 print "Generating new text file $NewTextFile...\n"; 542 open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n"; 543 544 my($ColValuesRef, $ColNum, @ColNumsToAnalyze); 545 # Write out column labels... 
546 @ColLabels = (); 547 @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]}; 548 for $ColNum (@ColNumsToAnalyze) { 549 $Label = $TextFilesInfo{ColLabels}[$Index][$ColNum]; 550 if ($StandardScores) { 551 push @ColLabels, "${Label}\(StandardScores)"; 552 } 553 if ($StandardScoresN) { 554 push @ColLabels, "${Label}\(StandardScoresN)"; 555 } 556 } 557 $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 558 print NEWTEXTFILE "$NewLine\n"; 559 560 # Go over each column to be analyzed and calculate standard deviation 561 # and mean values... 562 my(%StandardDeviationMap, %StandardDeviationNMap, %MeanMap); 563 %StandardDeviationMap = (); 564 %StandardDeviationNMap = (); 565 %MeanMap = (); 566 for $ColNum (@ColNumsToAnalyze) { 567 $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}}; 568 if (!exists($MeanMap{$ColNum})) { 569 $MeanMap{$ColNum} = Mean($ColValuesRef); 570 } 571 if ($StandardScores) { 572 if (!exists($StandardDeviationMap{$ColNum})) { 573 $StandardDeviationMap{$ColNum} = StandardDeviation($ColValuesRef); 574 } 575 } 576 if ($StandardScoresN) { 577 if (!exists($StandardDeviationNMap{$ColNum})) { 578 $StandardDeviationNMap{$ColNum} = StandardDeviationN($ColValuesRef); 579 } 580 } 581 } 582 # 583 # Go over each row and calculate standard scores for each column 584 # using (x[i] - mean) / (n - 1) for StandardScores and (x[i] - mean) / n 585 # for StandardScoresN; write out the calculated values as well... 586 587 my($TextFile, $InDelim, $Line, $Value, $ValueOkay, $ScoreValue, @RowValues, @LineWords); 588 $TextFile = $TextFilesList[$Index]; 589 $InDelim = $TextFilesInfo{InDelim}[$Index]; 590 591 open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! 
\n"; 592 $Line = GetTextLine(\*TEXTFILE); 593 while ($Line = GetTextLine(\*TEXTFILE)) { 594 @LineWords = quotewords($InDelim, 0, $Line); 595 @RowValues = (); 596 COLNUM: for $ColNum (@ColNumsToAnalyze) { 597 $Value = $LineWords[$ColNum]; 598 $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1; 599 if ($StandardScores) { 600 $ScoreValue = $ValueOkay ? (($Value - $MeanMap{$ColNum})/$StandardDeviationMap{$ColNum}) : ""; 601 $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : ""; 602 push @RowValues, $ScoreValue; 603 } 604 if ($StandardScoresN) { 605 $ScoreValue = $ValueOkay ? (($Value - $MeanMap{$ColNum})/$StandardDeviationNMap{$ColNum}) : ""; 606 $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : ""; 607 push @RowValues, $ScoreValue; 608 } 609 } 610 $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 611 print NEWTEXTFILE "$NewLine\n"; 612 } 613 close TEXTFILE; 614 close NEWTEXTFILE; 615 } 616 617 # Make sure the specified columns exists in text files... 618 sub ProcessColumnsInfo { 619 my($Index, $TextFile, $ColNum, $NewColNum, $ColIndex, @ColNumsToAnalyze, %UniqueColNumsToAnalyzeMap); 620 621 @{$TextFilesInfo{ColNumsToAnalyze}} = (); 622 @{$TextFilesInfo{ColPairs1ToAnalyze}} = (); 623 @{$TextFilesInfo{ColPairs2ToAnalyze}} = (); 624 @{$TextFilesInfo{UniqueColNumsToAnalyze}} = (); 625 626 FILELIST: for $Index (0 .. 
$#TextFilesList) { 627 $TextFile = $TextFilesList[$Index]; 628 629 @{$TextFilesInfo{ColNumsToAnalyze}[$Index]} = (); 630 @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]} = (); 631 @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]} = (); 632 @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]} = (); 633 634 %UniqueColNumsToAnalyzeMap = (); 635 636 if ($TextFilesInfo{FileOkay}[$Index]) { 637 @ColNumsToAnalyze = (); 638 if (@{$OptionsInfo{SpecifiedColumns}}) { 639 if ($OptionsInfo{ColMode} =~ /^colnum$/i) { 640 for $ColNum (@{$OptionsInfo{SpecifiedColumns}}) { 641 if ($ColNum >=1 && $ColNum <= $TextFilesInfo{ColCount}[$Index]) { 642 $NewColNum = $ColNum -1; 643 push @ColNumsToAnalyze, $NewColNum; 644 } 645 } 646 } 647 else { 648 my($ColLabel); 649 for $ColLabel (@{$OptionsInfo{SpecifiedColumns}}) { 650 if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) { 651 push @ColNumsToAnalyze, $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel}; 652 } 653 } 654 } 655 } 656 elsif (defined $OptionsInfo{Columns} && $OptionsInfo{Columns} =~ /^All$/i) { 657 for $ColNum (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 658 push @ColNumsToAnalyze, $ColNum; 659 } 660 } 661 else { 662 push @ColNumsToAnalyze, 0; 663 } 664 if (@ColNumsToAnalyze) { 665 push @{$TextFilesInfo{ColNumsToAnalyze}[$Index]}, @ColNumsToAnalyze; 666 # Set up unique columns map as well... 667 for $ColNum (@ColNumsToAnalyze) { 668 if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) { 669 $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum; 670 } 671 } 672 } 673 else { 674 warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist.\n"; 675 $TextFilesInfo{FileOkay}[$Index] = 0; 676 next FILELIST; 677 } 678 if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) { 679 # Make sure specific frequency files don't exist... 
680 my($FrequencyFile); 681 for $ColNum (@ColNumsToAnalyze) { 682 $FrequencyFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index]; 683 if (-e $FrequencyFile) { 684 warn "Warning: Ignoring file $TextFile: The file $FrequencyFile already exists.\n"; 685 $TextFilesInfo{FileOkay}[$Index] = 0; 686 next FILELIST; 687 } 688 } 689 } 690 # Setup specified column pairs... 691 if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) { 692 my(@ColPairsToAnalyze, $ColNum1, $ColNum2); 693 if (@{$OptionsInfo{SpecifiedColumnPairs}}) { 694 # Make sure both columns exist... 695 if ($OptionsInfo{ColMode} =~ /^colnum$/i) { 696 for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) { 697 $ColNum1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex]; 698 $ColNum2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1]; 699 if ($ColNum1 >=1 && $ColNum1 <= $TextFilesInfo{ColCount}[$Index] && $ColNum2 >=1 && $ColNum2 <= $TextFilesInfo{ColCount}[$Index]) { 700 $ColNum1 -= 1; 701 $ColNum2 -= 1; 702 push @ColPairsToAnalyze, ($ColNum1, $ColNum2); 703 } 704 } 705 } 706 else { 707 my($ColLabel1, $ColLabel2); 708 for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) { 709 $ColLabel1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex]; 710 $ColLabel2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1]; 711 if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1}) && exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2})) { 712 $ColNum1 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1}; 713 $ColNum2 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2}; 714 push @ColPairsToAnalyze, ($ColNum1, $ColNum2); 715 } 716 } 717 } 718 } 719 elsif ($OptionsInfo{AllColumnPairs}) { 
720 for $ColNum1 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 721 for $ColNum2 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 722 push @ColPairsToAnalyze, ($ColNum1, $ColNum2); 723 } 724 } 725 } 726 else { 727 if ($TextFilesInfo{ColCount}[$Index] >= 2) { 728 push @ColPairsToAnalyze, (0,1); 729 } 730 } 731 if (@ColPairsToAnalyze) { 732 if (@ColPairsToAnalyze % 2) { 733 warn "Warning: Ignoring file $TextFile: Invalid number values specified using \"--columnpairs\" option: It must contain even number of valid values.\n"; 734 $TextFilesInfo{FileOkay}[$Index] = 0; 735 next FILELIST; 736 } 737 else { 738 for ($ColIndex = 0; $ColIndex < @ColPairsToAnalyze; $ColIndex += 2) { 739 push @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex]; 740 push @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex + 1]; 741 } 742 # Set up unique columns map as well... 743 for $ColNum (@ColPairsToAnalyze) { 744 if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) { 745 $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum; 746 } 747 } 748 } 749 } 750 } 751 # Setup uniques columns array... 752 push @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]}, (sort keys %UniqueColNumsToAnalyzeMap); 753 } 754 } 755 } 756 757 # Retrieve information about input text files... 758 sub RetrieveTextFilesInfo { 759 my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $OutFile, $OutFileExt, $ColNum, $ColLabel); 760 761 %TextFilesInfo = (); 762 763 @{$TextFilesInfo{FileOkay}} = (); 764 @{$TextFilesInfo{ColCount}} = (); 765 @{$TextFilesInfo{ColLabels}} = (); 766 @{$TextFilesInfo{ColLabelToNumMap}} = (); 767 @{$TextFilesInfo{InDelim}} = (); 768 @{$TextFilesInfo{OutFileRoot}} = (); 769 @{$TextFilesInfo{OutFileExt}} = (); 770 771 FILELIST: for $Index (0 .. 
$#TextFilesList) { 772 $TextFile = $TextFilesList[$Index]; 773 774 $TextFilesInfo{FileOkay}[$Index] = 0; 775 $TextFilesInfo{ColCount}[$Index] = 0; 776 $TextFilesInfo{InDelim}[$Index] = ""; 777 $TextFilesInfo{OutFileRoot}[$Index] = ""; 778 $TextFilesInfo{OutFileExt}[$Index] = ""; 779 780 @{$TextFilesInfo{ColLabels}[$Index]} = (); 781 %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = (); 782 783 if (!(-e $TextFile)) { 784 warn "Warning: Ignoring file $TextFile: It doesn't exist\n"; 785 next FILELIST; 786 } 787 if (!CheckFileType($TextFile, "csv tsv")) { 788 warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n"; 789 next FILELIST; 790 } 791 ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile); 792 if ($FileExt =~ /^tsv$/i) { 793 $InDelim = "\t"; 794 } 795 else { 796 $InDelim = "\,"; 797 if ($Options{indelim} !~ /^(comma|semicolon)$/i) { 798 warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n"; 799 next FILELIST; 800 } 801 if ($Options{indelim} =~ /^semicolon$/i) { 802 $InDelim = "\;"; 803 } 804 } 805 806 if (!open TEXTFILE, "$TextFile") { 807 warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n"; 808 next FILELIST; 809 } 810 811 $Line = GetTextLine(\*TEXTFILE); 812 @ColLabels = quotewords($InDelim, 0, $Line); 813 close TEXTFILE; 814 815 $FileDir = ""; $FileName = ""; $FileExt = ""; 816 ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile); 817 $FileExt = "csv"; 818 if ($Options{outdelim} =~ /^tab$/i) { 819 $FileExt = "tsv"; 820 } 821 $OutFileExt = $FileExt; 822 if ($Options{root} && (@TextFilesList == 1)) { 823 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root}); 824 if ($RootFileName && $RootFileExt) { 825 $FileName = $RootFileName; 826 } 827 else { 828 $FileName = $Options{root}; 829 } 830 $OutFileRoot = $FileName; 831 } 832 else { 833 $OutFileRoot = $FileName; 834 } 835 $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . 
".$OutFileExt"; 836 837 if (lc($OutFile) eq lc($TextFile)) { 838 warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n"; 839 next FILELIST; 840 } 841 if (!$Options{overwrite}) { 842 if (-e $OutFile) { 843 warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n"; 844 next FILELIST; 845 } 846 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) { 847 if ($OptionsInfo{AllColumnPairs}) { 848 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) && (-e "${OutFileRoot}CovarianceMatrix.${FileExt}")) { 849 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}Covariance.${FileExt} already exists.\n"; 850 next FILELIST; 851 } 852 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) && (-e "${OutFileRoot}CorrelationMatrix.${FileExt}")) { 853 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}CorrelationMatrix.${FileExt} already exists.\n"; 854 next FILELIST; 855 } 856 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) && (-e "${OutFileRoot}RSquareMatrix.${FileExt}")) { 857 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}RSquareMatrix.${FileExt} already exists.\n"; 858 next FILELIST; 859 } 860 } 861 else { 862 if (-e "${OutFileRoot}ColumnPairsAnalysis.${FileExt}") { 863 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}ColumnPairsAnalysis.${FileExt} already exists.\n"; 864 next FILELIST; 865 } 866 } 867 } 868 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) && (-e "${OutFileRoot}StandardScores.${FileExt}")) { 869 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}StandardScores.${FileExt} already exists.\n"; 870 next FILELIST; 871 } 872 } 873 874 $TextFilesInfo{FileOkay}[$Index] = 1; 875 
$TextFilesInfo{InDelim}[$Index] = $InDelim;
$TextFilesInfo{OutFileRoot}[$Index] = "$OutFileRoot";
$TextFilesInfo{OutFileExt}[$Index] = "$OutFileExt";

# Record the column labels read from the first line of the file, along with a
# label to column number (zero based) lookup table...
$TextFilesInfo{ColCount}[$Index] = @ColLabels;
push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
for $ColNum (0 .. $#ColLabels) {
  $ColLabel = $ColLabels[$ColNum];
  $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
}
  }
}

# Process option values...
#
# Reads the command line values stored in the file-level %Options hash and
# fills in the file-level %OptionsInfo hash:
#
#   . Mode, DetailLevel, FileNameMode
#   . SpecifiedStatisticalFunctions: list of functions to calculate
#   . SpecifiedStatisticalFunctionsMap: lower case function name to supported
#     function name
#   . OutDelim, OutQuote, Overwrite, Root, CheckData, Precision
#   . KLargest, KSmallest, TrimFraction
#   . NumOfBins or BinRange for frequency calculation
#   . ColMode, Columns, SpecifiedColumns, AllColumnPairs, SpecifiedColumnPairs
#
# Takes no arguments and returns nothing useful. Dies with an error message
# for invalid values of --mode, --frequencybins, --columns or --columnpairs.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{DetailLevel} = $Options{detail};

  # Setup supported statistical functions...
  my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap);
  %SupportedStatisticaFunctionsMap = ();
  @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);

  # Function names are matched without regard to case...
  for $SupportedFunction (@SupportedStatisticaFunctions) {
    $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction;
  }

  # Setup a list of functions to use for analysis...
  my($SpecifiedFunction);
  %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = ();
  @{$OptionsInfo{SpecifiedStatisticalFunctions}} = ();

  # Check mode values...
  if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
  }
  elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum);
  }
  elsif ($Options{mode} =~ /^All$/i ) {
    $OptionsInfo{FileNameMode} = "AllStatistics";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @SupportedStatisticaFunctions;
  }
  else {
    $OptionsInfo{FileNameMode} = "SpecifiedStatistics";
    # Comma delimited list of functions...
    my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions);
    $Mode = $Options{mode};
    $Mode =~ s/ //g;
    @SpecifiedFunctions = split ",", $Mode;

    # An empty value, or one containing only spaces and commas, splits into
    # an empty list; reject it instead of running with nothing to calculate...
    if (!@SpecifiedFunctions) {
      warn "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid: It doesn't contain any function name.\n";
      die "Allowed values: ", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
    }

    @UnsupportedSpecifiedFunctions = ();
    for $SpecifiedFunction (@SpecifiedFunctions) {
      if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) {
        push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $SpecifiedFunction;
      }
      else {
        push @UnsupportedSpecifiedFunctions, $SpecifiedFunction;
      }
    }
    if (@UnsupportedSpecifiedFunctions) {
      if (@UnsupportedSpecifiedFunctions > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n";
      }
      die "Allowed values: ", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
    }
  }

  # Map each specified function, ignoring duplicates, to its supported name...
  FUNCTION: for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} ) {
      next FUNCTION;
    }
    $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)};
  }

  # Patterns are anchored to match the validation of these two options
  # performed in SetupScriptUsage...
  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i ) ? 1 : 0;

  $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
  $OptionsInfo{Root} = defined $Options{root} ? $Options{root} : undef;

  # Data checking is turned off by --fast option...
  $OptionsInfo{CheckData} = $Options{fast} ? 0 : 1;
  $OptionsInfo{Precision} = $Options{precision};

  $OptionsInfo{KLargest} = $Options{klargest};
  $OptionsInfo{KSmallest} = $Options{ksmallest};

  $OptionsInfo{TrimFraction} = $Options{trimfraction};

  # Setup frequency bin values: either an explicit comma delimited bin range
  # or a number of bins...
  $OptionsInfo{NumOfBins} = 10;
  @{$OptionsInfo{BinRange}} = ();
  if ($Options{frequencybins} =~ /\,/) {
    my($BinValue, @SpecifiedBinRange);
    @SpecifiedBinRange = split /\,/, $Options{frequencybins};
    if (@SpecifiedBinRange < 2) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
    }
    for $BinValue (@SpecifiedBinRange) {
      if (!IsNumerical($BinValue)) {
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
      }
    }
    # Comparing neighboring values is enough to establish a strictly ascending
    # order for the whole list...
    my($BinIndex);
    for $BinIndex (1 .. $#SpecifiedBinRange) {
      if ($SpecifiedBinRange[$BinIndex - 1] >= $SpecifiedBinRange[$BinIndex]) {
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
      }
    }
    push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange;
  }
  else {
    $OptionsInfo{NumOfBins} = $Options{frequencybins};
    if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
    }
  }

  # Setup specified columns...
  $OptionsInfo{ColMode} = $Options{colmode};
  $OptionsInfo{Columns} = defined $Options{columns} ? $Options{columns} : undef;

  @{$OptionsInfo{SpecifiedColumns}} = ();
  if (defined $Options{columns} && $Options{columns} !~ /^All$/i) {
    my(@SpecifiedValues) = split ",", $Options{columns};
    # Column values are checked here only for column numbers...
    if ($Options{colmode} =~ /^colnum$/i) {
      my($ColValue);
      for $ColValue (@SpecifiedValues) {
        if (!IsPositiveInteger($ColValue)) {
          die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n";
        }
      }
    }
    push @{$OptionsInfo{SpecifiedColumns}}, @SpecifiedValues;
  }

  # Setup specified column pairs...
  @{$OptionsInfo{SpecifiedColumnPairs}} = ();
  $OptionsInfo{AllColumnPairs} = (defined($Options{columnpairs}) && $Options{columnpairs} =~ /^AllPairs$/i) ? 1 : 0;
  if (defined($Options{columnpairs}) && !$OptionsInfo{AllColumnPairs}) {
    my(@SpecifiedValues) = split ",", $Options{columnpairs};
    if (@SpecifiedValues % 2) {
      die "Error: Invalid number of values specified using \"--columnpairs\" option: It must contain even number of values.\n";
    }
    if ($Options{colmode} =~ /^colnum$/i) {
      my($ColValue);
      for $ColValue (@SpecifiedValues) {
        if (!IsPositiveInteger($ColValue)) {
          die "Error: Column value, $ColValue, specified using \"--columnpairs\" is not valid: Allowed integer values: > 0.\n";
        }
      }
    }
    push @{$OptionsInfo{SpecifiedColumnPairs}}, @SpecifiedValues;
  }

}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes the file-level %Options hash with default values, updates it
# using GetOptions, changes to the working directory when one is specified,
# and validates option values. Takes no arguments and dies with an error
# message on the first invalid value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{colmode} = "colnum";
  $Options{detail} = 1;
  $Options{indelim} = "comma";
  $Options{frequencybins} = 10;
  $Options{klargest} = 2;
  $Options{ksmallest} = 2;
  $Options{mode} = "DescriptiveStatisticsBasic";
  $Options{outdelim} = "comma";
  $Options{precision} = 2;
  $Options{quote} = "yes";
  $Options{trimfraction} = 0.1;

  if (!GetOptions(\%Options, "colmode|c=s", "columns=s", "columnpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "indelim=s", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"-c --colmode\" is not valid. Allowed values: colnum or collabel\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{klargest})) {
    die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{ksmallest})) {
    die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n";
  }
  # The range is compared only after the value has passed the IsFloat check...
  if (!IsFloat($Options{trimfraction}) || $Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
    die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
  }
}
