#!/usr/bin/perl -w
#
# $RCSfile: AnalyzeSDFilesData.pl,v $
# $Date: 2015/02/28 20:46:04 $
# $Revision: 1.27 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use SDFileUtil;
use TextUtil;
use StatisticsUtil;

# File-scoped variables shared with the code below: %Options is consulted for
# option values such as "help"; the Benchmark objects are used to report the
# total run time once all files have been processed.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";

# Direct method call instead of indirect object syntax ("new Benchmark"), which
# Perl can misparse when a subroutine named "new" is in scope...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Build the list of SD files from the command line arguments...
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sd sdf");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();
ProcessSDFilesDataLabelsInfo();

# Generate output files...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Files not flagged as okay by the checks above are skipped...
  next unless $SDFilesInfo{FileOkay}[$FileIndex];
  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  AnalyzeSDFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Direct method call instead of indirect object syntax ("new Benchmark")...
$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Analyze data for the SD file at position $Index in @SDFilesList.
#
# The file is read once and, for each label listed in
# $SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index], the values found in the
# compound records are collected. The collected values are then handed over to
# the analysis subroutines matching the specified statistical functions.
#
# When $OptionsInfo{CheckData} is set, values failing IsNumerical() are left
# out and reported according to $OptionsInfo{DetailLevel}. A label missing from
# a compound record contributes no value for that record.
#
sub AnalyzeSDFile {
  my($Index) = @_;
  my($SDFile, $SDFileHandle, $DataLabel, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);

  $SDFile = $SDFilesList[$Index];
  @DataLabelsToAnalyze = @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]};
  %DataFieldValuesToAnalyzeMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @{$DataFieldValuesToAnalyzeMap{$DataLabel}} = ();
  }

  # Collect appropriate data field label values for analysis...
  my($CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels, $InvalidCmpdMsg);

  # Three-argument open with a lexical handle: the file name is never parsed
  # for a mode and no global filehandle is shared with other subroutines...
  open $SDFileHandle, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  $CmpdCount = 0;
  $InvalidCmpdCount = 0;
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @InvalidCmpdDataLabels = ();
    DATALABEL: for $DataLabel (@DataLabelsToAnalyze) {
      next DATALABEL unless exists $DataFieldValues{$DataLabel};
      $DataValue = $DataFieldValues{$DataLabel};
      if ($OptionsInfo{CheckData} && !IsNumerical($DataValue)) {
        push @InvalidCmpdDataLabels, $DataLabel;
        next DATALABEL;
      }
      push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
    }
    if (@InvalidCmpdDataLabels) {
      $InvalidCmpdCount++;
      if ($OptionsInfo{DetailLevel} >= 2) {
        # All three detail levels share the same leading text...
        $InvalidCmpdMsg = "Compound record $CmpdCount contains " . scalar(@InvalidCmpdDataLabels) . " non-numerical or empty value(s) for data field";
        if ($OptionsInfo{DetailLevel} >= 4) {
          print $InvalidCmpdMsg, "(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0), " - to be analyzed:\n$CmpdString \n";
        }
        elsif ($OptionsInfo{DetailLevel} >= 3) {
          print $InvalidCmpdMsg, "(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0), " - to be analyzed...\n";
        }
        else {
          print $InvalidCmpdMsg, " to be analyzed...\n";
        }
      }
    }
  }
  if ($InvalidCmpdCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
  }
  close $SDFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction, $FunctionsMapRef);
  @SpecifiedFunctionNames = ();
  $FunctionsMapRef = $OptionsInfo{SpecifiedStatisticalFunctionsMap};

  # Functions producing one value per data field go into a single file; the
  # functions excluded here are written to files of their own further below...
  for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $FunctionsMapRef->{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($FunctionsMapRef->{covariance}) || exists($FunctionsMapRef->{correlation}) || exists($FunctionsMapRef->{rsquare})) {
    if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
      PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
  }
  if (exists($FunctionsMapRef->{standardscores}) || exists($FunctionsMapRef->{standardscoresn})) {
    PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($FunctionsMapRef->{frequency})) {
    PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }

}

# Calculate values for various statistical functions and write them out to
# <NewTextFileRoot><FileNameMode>.<NewTextFileExt>: one row per data field
# label, one column per function named in @{$SpecifiedFunctionNamesRef}.
#
# $DataValuesToAnalyzeMapRef maps each data field label to a reference to the
# array of its collected values. A label without any values gets empty cells.
#
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, $Line, $SpecifiedFunction, $Label, @ColLabels, @DataLabelsToAnalyze);

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $SDFilesInfo{NewTextFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "DataLabel";
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # Prefix the label with the suffixed k value and drop the letter K...
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each column to be analyzed...
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};

  my($DataValuesRef, $DataLabel, $Value, $ValuesCount, $FunctionRef, @RowValues, %CalculatedValues);
  %CalculatedValues = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $DataLabel;
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    $ValuesCount = scalar(@{$DataValuesRef});
    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!$ValuesCount) {
        # Invalid column values...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = $ValuesCount;
      }
      else {
        # Statistical functions are invoked by name. Taking a code reference
        # with \&{...} is permitted under "use strict", so strictures no longer
        # need to be switched off for the rest of this subroutine...
        $FunctionRef = \&{$SpecifiedFunction};
        if ($SpecifiedFunction =~ /^KLargest$/i) {
          $Value = $FunctionRef->($DataValuesRef, $OptionsInfo{KLargest});
        }
        elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
          $Value = $FunctionRef->($DataValuesRef, $OptionsInfo{KSmallest});
        }
        elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
          # Standard deviation is cached as standard error needs it as well...
          if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
            $CalculatedValues{$DataLabel}{StandardDeviation} = $FunctionRef->($DataValuesRef);
          }
          $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
        }
        elsif ($SpecifiedFunction =~ /^StandardError$/i) {
          if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
            $CalculatedValues{$DataLabel}{StandardDeviation} = StandardDeviation($DataValuesRef);
          }
          if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
            # Pass the number of values explicitly: an array placed in the
            # argument list is flattened, which would hand over the first data
            # value in place of the count...
            $Value = $FunctionRef->($CalculatedValues{$DataLabel}{StandardDeviation}, $ValuesCount);
          }
        }
        elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
          $Value = $FunctionRef->($DataValuesRef, $OptionsInfo{TrimFraction});
        }
        else {
          $Value = $FunctionRef->($DataValuesRef);
        }
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  close $NewTextFileHandle;
}

# Calculate covariance, correlation, rsquare for specified data field label pairs....
#
# Writes <NewTextFileRoot>DataFieldPairsAnalysis.<NewTextFileExt> containing one
# row for each pair taken from $SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]
# and $SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]. The correlation column is
# written whenever correlation or rsquare has been specified.
#
# $DataValuesToAnalyzeMapRef maps each data field label to a reference to the
# array of its collected values. Pairs whose value counts differ are skipped
# with a warning and get empty cells.
#
sub PerformDataLabelPairAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "DataFieldPairsAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";

  # Three-argument open with a lexical handle: the file name is never parsed
  # for a mode and no global filehandle is shared with other subroutines...
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out the column labels...
  @ColLabels = ();
  push @ColLabels, ("DataLabel1", "DataLabel2");
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @ColLabels, "Correlation";
    if ($CalculateRSquare) {
      push @ColLabels, "RSquare";
    }
  }
  if ($CalculateCovariance) {
    push @ColLabels, "Covariance";
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Format a calculated value using the specified precision; adding zero gets
  # rid of trailing zeros. Undefined or empty values are written as empty cells...
  my($FormatValue);
  $FormatValue = sub {
    my($RawValue) = @_;
    return (defined($RawValue) && length($RawValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0) : "";
  };

  # Go over each data field pair...
  my($CorrelationValue, $RSquareValue, $CovarianceValue, $LabelIndex, $DataLabel1, $DataLabel2, $DataValues1, $DataValues2, @DataLabelPairs1ToAnalyze, @DataLabelPairs2ToAnalyze, @RowValues);

  @DataLabelPairs1ToAnalyze = @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]};
  @DataLabelPairs2ToAnalyze = @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]};
  for $LabelIndex (0 .. $#DataLabelPairs1ToAnalyze) {
    $DataLabel1 = $DataLabelPairs1ToAnalyze[$LabelIndex];
    $DataLabel2 = $DataLabelPairs2ToAnalyze[$LabelIndex];
    $DataValues1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
    $DataValues2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};

    # Setup column ids...
    @RowValues = ($DataLabel1, $DataLabel2);

    if (@$DataValues1 != @$DataValues2) {
      # Print a warning...
      warn "Warning: Skipping analysis for data field pair $DataLabel1, $DataLabel2: Number of valid data values must be same.\n";
      # One empty cell for each value column written in the header...
      push @RowValues, ("") x (@ColLabels - 2);
    }
    else {
      # Calculate appropriate value...
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($DataValues1, $DataValues2);
        push @RowValues, $FormatValue->($CorrelationValue);
        if ($CalculateRSquare) {
          # RSquare is the square of the unrounded correlation value...
          $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
          push @RowValues, $FormatValue->($RSquareValue);
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValues1, $DataValues2);
        push @RowValues, $FormatValue->($CovarianceValue);
      }
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  close $NewTextFileHandle;
}

# Generate histogram numbers...
#
# For each label in $SDFilesInfo{DataLabelsToAnalyze}[$Index], writes the file
# <NewTextFileRoot><DataLabel>FrequencyAnalysis.<NewTextFileExt> containing a
# "Bins", "Frequency" header followed by one row per bin in ascending numerical
# bin order. Frequency() is called with $OptionsInfo{BinRange} when it is not
# empty and with $OptionsInfo{NumOfBins} otherwise.
#
# $DataValuesToAnalyzeMapRef maps each data field label to a reference to the
# array of its collected values. A label without any values gets a file with
# the header only.
#
sub PerformFrequencyAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, @RowValues, $Line, $DataLabel, @DataLabelsToAnalyze, $DataValuesRef, $BinValue, $FrequencyValue, %FrequencyMap);

  # Format a number using the specified precision; adding zero gets rid of
  # trailing zeros. Undefined or empty values are written as empty cells...
  my($FormatValue);
  $FormatValue = sub {
    my($RawValue) = @_;
    return (defined($RawValue) && length($RawValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0) : "";
  };

  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $DataLabel . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
    print "Generating new text file $NewTextFile...\n";

    # The file name contains a data field label taken from the SD file; the
    # three-argument open makes sure it is never parsed for a mode. A fresh
    # lexical handle is used for each file...
    $NewTextFileHandle = undef;
    open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";

    # Calculate and write out frequency values...
    %FrequencyMap = ();
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (@$DataValuesRef) {
      if (@{$OptionsInfo{BinRange}}) {
        %FrequencyMap = Frequency($DataValuesRef, \@{$OptionsInfo{BinRange}});
      }
      else {
        %FrequencyMap = Frequency($DataValuesRef, $OptionsInfo{NumOfBins});
      }
    }
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      @RowValues = ($FormatValue->($BinValue), $FormatValue->($FrequencyValue));

      $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileHandle "$Line\n";
    }
    close $NewTextFileHandle;
  }
}

# Calculate covariance, correlation/rsquare matrices....
#
# Writes <NewTextFileRoot>CorrelationMatrix.<ext>, <NewTextFileRoot>RSquareMatrix.<ext>
# and <NewTextFileRoot>CovarianceMatrix.<ext> as requested by the specified
# statistical functions; the correlation matrix is written for rsquare as well.
# The matrices cover all data labels when $OptionsInfo{AllDataLabelPairs} is set
# and the common data labels otherwise.
#
# $DataValuesToAnalyzeMapRef maps each data field label to a reference to the
# array of its collected values.
#
# NOTE(review): each matrix row starts with its data label while the header row
# has no cell for that column, so header cells sit one column to the left of
# the values; the layout has been left as is - confirm before changing.
#
sub PerformMatrixAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance, $WriteCorrelation);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;
  $WriteCorrelation = ($CalculateCorrelation || $CalculateRSquare) ? 1 : 0;

  $CorrelationTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CorrelationMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $RSquareTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "RSquareMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $CovarianceTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CovarianceMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];

  # Report the files actually being generated...
  my(@TextFilesList, $FilesLabel);
  @TextFilesList = ();
  if ($WriteCorrelation) {
    push @TextFilesList, $CorrelationTextFile;
    if ($CalculateRSquare) {
      push @TextFilesList, $RSquareTextFile;
    }
  }
  if ($CalculateCovariance) {
    push @TextFilesList, $CovarianceTextFile;
  }
  $FilesLabel = (@TextFilesList > 1) ? "files" : "file";
  print "Generating new text $FilesLabel ", join(", ", @TextFilesList), "...\n";

  # Three-argument opens with lexical handles: file names are never parsed for
  # a mode and no global filehandles are shared with other subroutines...
  my($CorrelationFileHandle, $RSquareFileHandle, $CovarianceFileHandle);
  if ($WriteCorrelation) {
    open $CorrelationFileHandle, ">", $CorrelationTextFile or die "Error: Can't open $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      open $RSquareFileHandle, ">", $RSquareTextFile or die "Error: Can't open $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    open $CovarianceFileHandle, ">", $CovarianceTextFile or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $CorrelationValue, $RSquareValue, $CovarianceValue, $HasCorrelation, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues, @DataLabelsToAnalyze);

  @DataLabelsToAnalyze = $OptionsInfo{AllDataLabelPairs} ? @{$SDFilesInfo{AllDataLabels}[$Index]} : @{$SDFilesInfo{CommonDataLabels}[$Index]};

  # Write out the column labels: these are the labels making up the matrices...
  @ColLabels = @DataLabelsToAnalyze;
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  if ($WriteCorrelation) {
    print $CorrelationFileHandle "$Line\n";
    if ($CalculateRSquare) {
      print $RSquareFileHandle "$Line\n";
    }
  }
  if ($CalculateCovariance) {
    print $CovarianceFileHandle "$Line\n";
  }

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2);

  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();

  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
    $DataValuesRef1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
    for $LabelIndex2 (0 .. $LabelIndex1) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      $DataValuesRef2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
      if ($WriteCorrelation) {
        $CorrelationValue = Correlation($DataValuesRef1, $DataValuesRef2);
        $HasCorrelation = (defined($CorrelationValue) && length($CorrelationValue)) ? 1 : 0;
        if ($CalculateRSquare) {
          # Square the unrounded correlation value so that rounding is applied
          # only once...
          $RSquareValue = $HasCorrelation ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue ** 2) + 0) : "";
          $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $RSquareValue;
          $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $RSquareValue;
        }
        $CorrelationValue = $HasCorrelation ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
        $CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $CorrelationValue;
        $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $CorrelationValue;
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValuesRef1, $DataValuesRef2);
        $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
        $CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $CovarianceValue;
        $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $CovarianceValue;
      }
    }
  }

  # Write out the matrices...
  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];

    # Each row starts with its data label...
    @CorrelationRowValues = ($DataLabel1);
    @RSquareRowValues = ($DataLabel1);
    @CovarianceRowValues = ($DataLabel1);

    for $LabelIndex2 (0 .. $#DataLabelsToAnalyze) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      if ($WriteCorrelation) {
        push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
        if ($CalculateRSquare) {
          push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
        }
      }
      if ($CalculateCovariance) {
        push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
      }
    }
    if ($WriteCorrelation) {
      $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CorrelationFileHandle "$Line\n";
      if ($CalculateRSquare) {
        $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $RSquareFileHandle "$Line\n";
      }
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CovarianceFileHandle "$Line\n";
    }
  }
  if ($WriteCorrelation) {
    close $CorrelationFileHandle;
    if ($CalculateRSquare) {
      close $RSquareFileHandle;
    }
  }
  if ($CalculateCovariance) {
    close $CovarianceFileHandle;
  }
}

# Calculate standard scores and write them out to
# <NewTextFileRoot>StandardScores.<NewTextFileExt>: one row per compound record
# and, for each label in $SDFilesInfo{DataLabelsToAnalyze}[$Index], a
# StandardScores and/or a StandardScoresN column as specified.
#
# $DataValuesToAnalyzeMapRef maps each data field label to a reference to the
# array of its collected values; it is used to calculate mean and standard
# deviation values. An empty cell is written when the data field is missing
# from a compound record, when its value fails the data check, or when no
# non-zero standard deviation is available.
#
sub PerformStandardScoresAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, $NewTextFileHandle, @ColLabels, $NewLine);

  $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "StandardScores." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  my($DataLabel, @DataLabelsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    if ($StandardScores) {
      push @ColLabels, "${DataLabel}\(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${DataLabel}\(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values...
  my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (!exists($MeanMap{$DataLabel})) {
      $MeanMap{$DataLabel} = Mean($DataValuesRef);
    }
    if ($StandardScores && !exists($StandardDeviationMap{$DataLabel})) {
      $StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
    }
    if ($StandardScoresN && !exists($StandardDeviationNMap{$DataLabel})) {
      $StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
    }
  }

  # Calculate a formatted score as (x[i] - mean) / deviation. An empty value is
  # returned, instead of dying on a division by zero, when the mean is not
  # available or the deviation is missing or zero...
  my($CalculateScore);
  $CalculateScore = sub {
    my($RawValue, $Mean, $Deviation) = @_;
    if (!defined($Mean) || !defined($Deviation) || !length($Deviation) || $Deviation == 0) {
      return "";
    }
    return sprintf("%.$OptionsInfo{Precision}f", ($RawValue - $Mean)/$Deviation) + 0;
  };

  # Go over each compound record and calculate standard scores for each data
  # field using (x[i] - mean) / StandardDeviation for StandardScores and
  # (x[i] - mean) / StandardDeviationN for StandardScoresN; write out the
  # calculated values as well...
  my($SDFile, $SDFileHandle, $Value, $ValueOkay, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
  $SDFile = $SDFilesList[$Index];

  open $SDFileHandle, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @RowValues = ();
    for $DataLabel (@DataLabelsToAnalyze) {
      # A data field missing from this record has no score, irrespective of
      # the data check...
      $Value = "";
      $ValueOkay = 0;
      if (exists $DataFieldValues{$DataLabel}) {
        $Value = $DataFieldValues{$DataLabel};
        $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;
      }
      if ($StandardScores) {
        push @RowValues, ($ValueOkay ? $CalculateScore->($Value, $MeanMap{$DataLabel}, $StandardDeviationMap{$DataLabel}) : "");
      }
      if ($StandardScoresN) {
        push @RowValues, ($ValueOkay ? $CalculateScore->($Value, $MeanMap{$DataLabel}, $StandardDeviationNMap{$DataLabel}) : "");
      }
    }
    $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$NewLine\n";
  }
  close $SDFileHandle;
  close $NewTextFileHandle;

}

# Make sure the specified data field labels exist in SD files and set up, for
# each okay file, the following lists in %SDFilesInfo:
#
#   DataLabelsToAnalyze - specified labels present in the file, all labels or
#       common labels
#   DataLabelPairs1ToAnalyze, DataLabelPairs2ToAnalyze - first and second
#       labels of the pairs used for correlation, rsquare and covariance
#   UniqueDataLabelsToAnalyze - sorted unique labels from the lists above
#
# A file is flagged as not okay when none of the labels to analyze is available
# or when, without overwrite, one of its frequency files already exists.
#
sub ProcessSDFilesDataLabelsInfo {
  my($Index, $DataFieldIndex, $SDFile, $DataLabel, @DataLabelsToAnalyze, %UniqueDataLabelsToAnalyzeMap);

  @{$SDFilesInfo{DataLabelsToAnalyze}} = ();
  @{$SDFilesInfo{DataLabelPairs1ToAnalyze}} = ();
  @{$SDFilesInfo{DataLabelPairs2ToAnalyze}} = ();
  @{$SDFilesInfo{UniqueDataLabelsToAnalyze}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]} = ();

    %UniqueDataLabelsToAnalyzeMap = ();

    next FILELIST unless $SDFilesInfo{FileOkay}[$Index];

    @DataLabelsToAnalyze = ();
    if (@{$OptionsInfo{SpecifiedDataLabels}}) {
      # Keep only the specified labels present in this file...
      push @DataLabelsToAnalyze, grep { exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$_}) } @{$OptionsInfo{SpecifiedDataLabels}};
    }
    elsif (defined($OptionsInfo{DataFields}) && $OptionsInfo{DataFields} =~ /^All$/i) {
      push @DataLabelsToAnalyze, @{$SDFilesInfo{AllDataLabels}[$Index]};
    }
    else {
      push @DataLabelsToAnalyze, @{$SDFilesInfo{CommonDataLabels}[$Index]};
    }
    if (!@DataLabelsToAnalyze) {
      warn "Warning: Ignoring file $SDFile: None of the data field labels specified, @{$OptionsInfo{SpecifiedDataLabels}}, using \"--datafields\" option exist.\n";
      $SDFilesInfo{FileOkay}[$Index] = 0;
      next FILELIST;
    }
    push @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]}, @DataLabelsToAnalyze;
    # Set up unique data field label map as well...
    for $DataLabel (@DataLabelsToAnalyze) {
      $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
    }

    if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
      # Make sure specific frequency files don't exist. The names are built in
      # the same way as in PerformFrequencyAnalysis...
      my($FrequencyFile);
      for $DataLabel (@DataLabelsToAnalyze) {
        $FrequencyFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $DataLabel . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
        if (-e $FrequencyFile) {
          warn "Warning: Ignoring file $SDFile: The file $FrequencyFile already exists.\n";
          $SDFilesInfo{FileOkay}[$Index] = 0;
          next FILELIST;
        }
      }
    }

    # Setup specified data field label pairs...
    if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) {
      my(@DataLabelPairsToAnalyze, $DataLabel1, $DataLabel2, $PairsRef, $PairLabelsRef);
      if (@{$OptionsInfo{SpecifiedDataLabelPairs}}) {
        # Make sure both data field labels exist; a trailing label without a
        # partner is not used...
        $PairsRef = $OptionsInfo{SpecifiedDataLabelPairs};
        for ($DataFieldIndex = 0; (($DataFieldIndex + 1) < @{$PairsRef}); $DataFieldIndex += 2 ) {
          $DataLabel1 = $PairsRef->[$DataFieldIndex];
          $DataLabel2 = $PairsRef->[$DataFieldIndex + 1];
          if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel1}) && exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel2})) {
            push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
          }
        }
      }
      else {
        # Pair up each label with every label, itself included...
        $PairLabelsRef = $OptionsInfo{AllDataLabelPairs} ? $SDFilesInfo{AllDataLabels}[$Index] : $SDFilesInfo{CommonDataLabels}[$Index];
        for $DataLabel1 (@{$PairLabelsRef}) {
          for $DataLabel2 (@{$PairLabelsRef}) {
            push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
          }
        }
      }
      if (@DataLabelPairsToAnalyze) {
        # Labels are always added above two at a time; the check is kept as a
        # safeguard...
        if (@DataLabelPairsToAnalyze % 2) {
          warn "Warning: Ignoring file $SDFile: Invalid number values specified using \"--datafieldpairs\" option: It must contain even number of valid values.\n";
          $SDFilesInfo{FileOkay}[$Index] = 0;
          next FILELIST;
        }
        for ($DataFieldIndex = 0; $DataFieldIndex < @DataLabelPairsToAnalyze; $DataFieldIndex += 2) {
          push @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex];
          push @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex + 1];
        }
        # Set up unique data field label map as well...
        for $DataLabel (@DataLabelPairsToAnalyze) {
          $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
        }
      }
    }
    # Setup unique data field label array...
    push @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]}, (sort keys %UniqueDataLabelsToAnalyzeMap);
  }
}

# Retrieve information about input SD files and set up, for each file, the
# following entries in %SDFilesInfo: FileOkay, CmpdCount, NewTextFileRoot,
# NewTextFileExt, AllDataLabels, AllDataLabelsMap and CommonDataLabels.
#
# A file is left flagged as not okay when it doesn't exist, is not a SD file,
# can't be opened or, without overwrite, when one of the output files checked
# here already exists.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFile, $OutFileRoot, $OutFileExt);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileRoot}} = ();
  @{$SDFilesInfo{NewTextFileExt}} = ();

  # These two names are kept in case code outside this subroutine still relies
  # on them; the per file data is stored under AllDataLabels and AllDataLabelsMap...
  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{AllDataFieldLabelsMap}} = ();

  @{$SDFilesInfo{AllDataLabels}} = ();
  @{$SDFilesInfo{AllDataLabelsMap}} = ();
  @{$SDFilesInfo{CommonDataLabels}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "";
    $SDFilesInfo{NewTextFileExt}[$Index] = "";

    @{$SDFilesInfo{AllDataLabels}[$Index]} = ();
    %{$SDFilesInfo{AllDataLabelsMap}[$Index]} = ();
    @{$SDFilesInfo{CommonDataLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new text files...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $OutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $OutFileExt = "tsv";
    }
    if ($Options{root} && (@SDFilesList == 1)) {
      # The specified root is used only for a single input file...
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $Options{root};
      }
    }
    $OutFileRoot = $FileName;
    $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";

    if (!$OptionsInfo{Overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
        next FILELIST;
      }

      # Collect the names of the other files generated for the specified
      # functions. The names use the output file extension and match the ones
      # used by the subroutines writing out these files...
      my(@OutFilesToCheck, $OutFileToCheck, $FunctionsMapRef);
      @OutFilesToCheck = ();
      $FunctionsMapRef = $OptionsInfo{SpecifiedStatisticalFunctionsMap};
      if (exists($FunctionsMapRef->{covariance}) || exists($FunctionsMapRef->{correlation}) || exists($FunctionsMapRef->{rsquare})) {
        if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
          if (exists($FunctionsMapRef->{covariance})) {
            push @OutFilesToCheck, "${OutFileRoot}CovarianceMatrix.${OutFileExt}";
          }
          # Correlation matrix is written out for rsquare as well...
          if (exists($FunctionsMapRef->{correlation}) || exists($FunctionsMapRef->{rsquare})) {
            push @OutFilesToCheck, "${OutFileRoot}CorrelationMatrix.${OutFileExt}";
          }
          if (exists($FunctionsMapRef->{rsquare})) {
            push @OutFilesToCheck, "${OutFileRoot}RSquareMatrix.${OutFileExt}";
          }
        }
        else {
          push @OutFilesToCheck, "${OutFileRoot}DataFieldPairsAnalysis.${OutFileExt}";
        }
      }
      if (exists($FunctionsMapRef->{standardscores}) || exists($FunctionsMapRef->{standardscoresn})) {
        push @OutFilesToCheck, "${OutFileRoot}StandardScores.${OutFileExt}";
      }
      for $OutFileToCheck (@OutFilesToCheck) {
        if (-e $OutFileToCheck) {
          warn "Warning: Ignoring file $SDFile: The file $OutFileToCheck already exists.\n";
          next FILELIST;
        }
      }
    }

    # Three-argument open with a lexical handle: the file name is never parsed
    # for a mode and no global filehandle is shared with other subroutines...
    my($SDFileHandle);
    if (!open($SDFileHandle, "<", $SDFile)) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CmpdCount, $Label, $DataFieldLabelsRef, $CommonDataFieldLabelsRef, @DataFieldLabels, @CommonDataFieldLabels);
    $CmpdCount = 0;
    @DataFieldLabels = ();
    @CommonDataFieldLabels = ();
    ($CmpdCount, $DataFieldLabelsRef, $CommonDataFieldLabelsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
    push @DataFieldLabels, @{$DataFieldLabelsRef};
    push @CommonDataFieldLabels, @{$CommonDataFieldLabelsRef};
    close $SDFileHandle;

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "$OutFileRoot";
    $SDFilesInfo{NewTextFileExt}[$Index] = "$OutFileExt";

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
    push @{$SDFilesInfo{AllDataLabels}[$Index]}, @DataFieldLabels;
    push @{$SDFilesInfo{CommonDataLabels}[$Index]}, @CommonDataFieldLabels;
    for $Label (@DataFieldLabels) {
      $SDFilesInfo{AllDataLabelsMap}[$Index]{$Label} = $Label;
    }
  }
}

# Process option values...
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{DataFields} = defined $Options{datafields} ? $Options{datafields} : undef;

  $OptionsInfo{DetailLevel} = $Options{detail};

  # Setup supported statistical functions...
869 my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap); 870 871 %SupportedStatisticaFunctionsMap = (); 872 @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN); 873 874 for $SupportedFunction (@SupportedStatisticaFunctions) { 875 $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction; 876 } 877 878 # Setup a list of functions to use for analysis... 879 my($SpecifiedFunction); 880 881 %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = (); 882 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = (); 883 884 # Check mode values... 885 if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) { 886 $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic"; 887 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum); 888 } 889 elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) { 890 $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll"; 891 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum); 892 } 893 elsif ($Options{mode} =~ /^All$/i ) { 894 $OptionsInfo{FileNameMode} = "AllStatistics"; 895 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @SupportedStatisticaFunctions; 896 } 897 else { 898 $OptionsInfo{FileNameMode} = "SpecifiedStatistics"; 899 900 # Comma delimited list of functions... 
901 my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions); 902 903 $Mode = $Options{mode}; 904 $Mode =~ s/ //g; 905 @SpecifiedFunctions = split ",", $Mode; 906 @UnsupportedSpecifiedFunctions = (); 907 for $SpecifiedFunction (@SpecifiedFunctions) { 908 if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) { 909 push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $SpecifiedFunction; 910 } 911 else { 912 push @UnsupportedSpecifiedFunctions, $SpecifiedFunction; 913 } 914 } 915 if (@UnsupportedSpecifiedFunctions) { 916 if (@UnsupportedSpecifiedFunctions > 1) { 917 warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n"; 918 } 919 else { 920 warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n"; 921 } 922 die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n"; 923 } 924 } 925 926 FUNCTION: for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) { 927 if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} ) { 928 next FUNCTION; 929 } 930 $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)}; 931 } 932 933 # Setup delimiter and quotes... 934 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,"); 935 $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0; 936 937 $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef; 938 $OptionsInfo{Root} = defined $Options{root} ? $Options{root} : undef; 939 940 # Setup miscellaneous options... 941 $OptionsInfo{CheckData} = $Options{fast} ? 
0 : 1; 942 $OptionsInfo{Precision} = $Options{precision}; 943 944 $OptionsInfo{KLargest} = $Options{klargest}; 945 $OptionsInfo{KSmallest} = $Options{ksmallest}; 946 947 $OptionsInfo{TrimFraction} = $Options{trimfraction}; 948 949 # Setup frequency bin values... 950 $OptionsInfo{NumOfBins} = 10; 951 @{$OptionsInfo{BinRange}} = (); 952 if ($Options{frequencybins} =~ /\,/) { 953 my($BinValue, @SpecifiedBinRange); 954 @SpecifiedBinRange = split /\,/, $Options{frequencybins}; 955 if (@SpecifiedBinRange < 2) { 956 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n"; 957 } 958 for $BinValue (@SpecifiedBinRange) { 959 if (!IsNumerical($BinValue)) { 960 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n"; 961 } 962 } 963 my($Index1, $Index2); 964 for $Index1 (0 .. $#SpecifiedBinRange) { 965 for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) { 966 if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) { 967 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n"; 968 } 969 } 970 } 971 push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange; 972 } 973 else { 974 $OptionsInfo{NumOfBins} = $Options{frequencybins}; 975 if (!IsPositiveInteger($OptionsInfo{NumOfBins})) { 976 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n"; 977 } 978 } 979 980 # Setup specified data field labels... 
981 @{$OptionsInfo{SpecifiedDataLabels}} = (); 982 if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) { 983 my(@SpecifiedValues) = split ",", $Options{datafields}; 984 push @{$OptionsInfo{SpecifiedDataLabels}}, @SpecifiedValues; 985 } 986 @{$OptionsInfo{SpecifiedDataLabelPairs}} = (); 987 $OptionsInfo{AllDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^AllPairs$/i) ? 1 : 0; 988 $OptionsInfo{CommonDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^CommonPairs$/i) ? 1 : 0; 989 if (defined($Options{datafieldpairs}) && !$OptionsInfo{AllDataLabelPairs} && !$OptionsInfo{CommonDataLabelPairs}) { 990 my(@SpecifiedValues) = split ",", $Options{datafieldpairs}; 991 if (@SpecifiedValues % 2) { 992 die "Error: Invalid number of values specified using \"--datafieldpairs\" option: It must contain even number of values.\n"; 993 } 994 push @{$OptionsInfo{SpecifiedDataLabelPairs}}, @SpecifiedValues; 995 } 996 997 } 998 999 # Setup script usage and retrieve command line arguments specified using various options... 1000 sub SetupScriptUsage { 1001 1002 # Retrieve all the options... 
1003 %Options = (); 1004 $Options{detail} = 0; 1005 $Options{datafields} = "Common"; 1006 $Options{datafieldpairs} = "CommonPairs"; 1007 $Options{frequencybins} = 10; 1008 $Options{klargest} = 2; 1009 $Options{ksmallest} = 2; 1010 $Options{mode} = "DescriptiveStatisticsBasic"; 1011 $Options{outdelim} = "comma"; 1012 $Options{precision} = 2; 1013 $Options{quote} = "yes"; 1014 $Options{trimfraction} = 0.1; 1015 1016 if (!GetOptions(\%Options, "datafields=s", "datafieldpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s")) { 1017 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1018 } 1019 if ($Options{workingdir}) { 1020 if (! -d $Options{workingdir}) { 1021 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1022 } 1023 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1024 } 1025 if (!IsInteger($Options{detail})) { 1026 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: >= 0\n"; 1027 } 1028 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { 1029 die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; 1030 } 1031 if ($Options{quote} !~ /^(yes|no)$/i) { 1032 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; 1033 } 1034 if (!IsPositiveInteger($Options{precision})) { 1035 die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. 
Allowed values: > 0 \n"; 1036 } 1037 if (!IsPositiveInteger($Options{klargest})) { 1038 die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n"; 1039 } 1040 if (!IsPositiveInteger($Options{ksmallest})) { 1041 die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n"; 1042 } 1043 if (IsFloat($Options{trimfraction})) { 1044 if ($Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) { 1045 die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n"; 1046 } 1047 } 1048 else { 1049 die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n"; 1050 } 1051 } 1052