MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: AnalyzeSDFilesData.pl,v $
   4 # $Date: 2015/02/28 20:46:04 $
   5 # $Revision: 1.27 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use SDFileUtil;
  37 use TextUtil;
  38 use StatisticsUtil;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName: Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my(@SDFilesList);
  57 @SDFilesList = ExpandFileNames(\@ARGV, "sd sdf");
  58 
  59 print "Processing options...\n";
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 # Collect information about SD files...
  64 print "Checking input SD file(s)...\n";
  65 my(%SDFilesInfo);
  66 RetrieveSDFilesInfo();
  67 ProcessSDFilesDataLabelsInfo();
  68 
  69 # Generate output files...
  70 my($FileIndex);
  71 if (@SDFilesList > 1) {
  72   print "\nProcessing SD files...\n";
  73 }
  74 for $FileIndex (0 .. $#SDFilesList) {
  75   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  76     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  77     AnalyzeSDFile($FileIndex);
  78   }
  79 }
  80 print "\n$ScriptName:Done...\n\n";
  81 
  82 $EndTime = new Benchmark;
  83 $TotalTime = timediff ($EndTime, $StartTime);
  84 print "Total time: ", timestr($TotalTime), "\n";
  85 
  86 ###############################################################################
  87 
  88 # Analyze data...
  89 sub AnalyzeSDFile {
  90   my($Index) = @_;
  91   my($SDFile, $DataLabel, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);
  92 
  93   $SDFile = $SDFilesList[$Index];
  94   @DataLabelsToAnalyze = @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]};
  95   %DataFieldValuesToAnalyzeMap = ();
  96   for $DataLabel (@DataLabelsToAnalyze) {
  97     @{$DataFieldValuesToAnalyzeMap{$DataLabel}} = ();
  98   }
  99 
 100   # Collect appropriate data field label values for analysis...
 101   my($CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels);
 102   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 103   $CmpdCount = 0;
 104   $InvalidCmpdCount = 0;
 105   while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 106     $CmpdCount++;
 107     @CmpdLines = split "\n", $CmpdString;
 108     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 109     @InvalidCmpdDataLabels = ();
 110     DATALABEL: for $DataLabel (@DataLabelsToAnalyze) {
 111       if (exists $DataFieldValues{$DataLabel}) {
 112         $DataValue = $DataFieldValues{$DataLabel};
 113         if ($OptionsInfo{CheckData}) {
 114           if (!IsNumerical($DataValue)) {
 115             push @InvalidCmpdDataLabels, $DataLabel;
 116             next DATALABEL;
 117           }
 118         }
 119         push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
 120       }
 121     }
 122     if (@InvalidCmpdDataLabels) {
 123       $InvalidCmpdCount++;
 124       if ($OptionsInfo{DetailLevel} >=4 ) {
 125         print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed:\n$CmpdString \n";
 126       }
 127       elsif ($OptionsInfo{DetailLevel} >= 3) {
 128         print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed...\n";
 129       }
 130       elsif ($OptionsInfo{DetailLevel} >= 2) {
 131         print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field to be analyzed...\n";
 132       }
 133     }
 134   }
 135   if ($InvalidCmpdCount && ($OptionsInfo{DetailLevel} >= 1)) {
 136     print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
 137   }
 138   close SDFILE;
 139 
 140   # Perform the analysis...
 141   my(@SpecifiedFunctionNames, $SpecifiedFunction);
 142   @SpecifiedFunctionNames = ();
 143 
 144   for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
 145     if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
 146       push @SpecifiedFunctionNames, $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)};
 147     }
 148   }
 149   if (@SpecifiedFunctionNames) {
 150     PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap)
 151   }
 152   if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
 153     if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
 154       PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 155     }
 156     else {
 157       # Perform pairwise analysis for specified columns and write out calculated values - correlation
 158       # rsquare, or covariance - in the same file.
 159       PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 160     }
 161   }
 162   if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
 163     PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 164   }
 165   if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
 166     PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 167   }
 168 
 169 }
 170 
 171 # Calculate values for various statistical functions...
 172 sub PerformAnalysis {
 173   my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
 174   my($NewTextFile, $Line, $SpecifiedFunction, $Label, @ColLabels, @DataLabelsToAnalyze);
 175 
 176   $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $SDFilesInfo{NewTextFileExt}[$Index];
 177 
 178   print "Generating new text file $NewTextFile...\n";
 179   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 180 
 181   # Write out column labels...
 182   @ColLabels = ();
 183   push @ColLabels, "DataLabel";
 184   for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
 185     $Label = $SpecifiedFunction;
 186     if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
 187       my($KthValue);
 188       $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
 189       $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
 190       $Label =~ s/K//g;
 191     }
 192     elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
 193       $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
 194     }
 195     push @ColLabels, $Label;
 196   }
 197   $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 198   print NEWTEXTFILE "$Line\n";
 199 
 200   # Go over each column to be analyzed...
 201   @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
 202 
 203   # Turn off "strict"; otherwise, invoking statistical functions using function name string
 204   # is problematic.
 205   no strict;
 206 
 207   my($DataValuesRef, $DataLabel, $Value, @RowValues, %CalculatedValues);
 208   %CalculatedValues = ();
 209   for $DataLabel (@DataLabelsToAnalyze) {
 210     @RowValues = ();
 211     # Setup column id...
 212     push @RowValues, $DataLabel;
 213     $DataValuesRef =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 214     FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
 215       $Value = "";
 216       if (!@{$DataValuesToAnalyzeMapRef->{$DataLabel}}) {
 217         # Invalid column values...
 218         push @RowValues, $Value;
 219         next FUNCTIONNAME;
 220       }
 221       if ($SpecifiedFunction =~ /^Count$/i) {
 222         $Value = @{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 223       }
 224       elsif ($SpecifiedFunction =~ /^KLargest$/i) {
 225         $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KLargest});
 226       }
 227       elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
 228         $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KSmallest});
 229       }
 230       elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
 231         if (exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
 232           $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
 233         }
 234         else {
 235           $Value = &$SpecifiedFunction($DataValuesRef);
 236           $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
 237         }
 238       }
 239       elsif ($SpecifiedFunction =~ /^StandardError$/i) {
 240         if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
 241           $Value = StandardDeviation($DataValuesRef);
 242           $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
 243         }
 244         if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
 245           $Value = &$SpecifiedFunction($CalculatedValues{$DataLabel}{StandardDeviation}, @{$DataValuesToAnalyzeMapRef->{$DataLabel}});
 246         }
 247       }
 248       elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
 249         $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{TrimFraction});
 250       }
 251       else {
 252         $Value = &$SpecifiedFunction($DataValuesRef);
 253       }
 254       # Format the output value. And add zero to get rid of tariling zeros...
 255       $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
 256       push @RowValues, $Value;
 257     }
 258     $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 259     print NEWTEXTFILE "$Line\n";
 260   }
 261   close NEWTEXTFILE;
 262 }
 263 
 264 # Calculate covariance, correlation, rsquare for specified data field label pairs....
 265 sub PerformDataLabelPairAnalysis {
 266   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 267   my($NewTextFile, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);
 268 
 269   $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
 270   $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
 271   $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;
 272 
 273   $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "DataFieldPairsAnalysis." .  $SDFilesInfo{NewTextFileExt}[$Index];
 274   print "Generating new text file $NewTextFile...\n";
 275   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 276 
 277   # Write out the column labels...
 278   @ColLabels = ();
 279   push @ColLabels, ("DataLabel1", "DataLabel2");
 280   if ($CalculateCorrelation || $CalculateRSquare) {
 281     push @ColLabels, "Correlation";
 282     if ($CalculateRSquare) {
 283       push @ColLabels, "RSquare";
 284     }
 285   }
 286   if ($CalculateCovariance) {
 287     push @ColLabels, "Covariance";
 288   }
 289   $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 290   print NEWTEXTFILE "$Line\n";
 291 
 292   # Go over each data field pair...
 293   my($CorrelationValue, $RSquareValue, $CovarianceValue,  $LabelIndex, $DataLabel1, $DataLabel2, $DataValues1, $DataValues2, @DataLabelPairs1ToAnalyze, @DataLabelPairs2ToAnalyze, @RowValues, $Value);
 294 
 295   @DataLabelPairs1ToAnalyze = @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]};
 296   @DataLabelPairs2ToAnalyze = @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]};
 297   for $LabelIndex (0 .. $#DataLabelPairs1ToAnalyze) {
 298     @RowValues = ();
 299     $DataLabel1 = $DataLabelPairs1ToAnalyze[$LabelIndex];
 300     $DataLabel2 = $DataLabelPairs2ToAnalyze[$LabelIndex];
 301     $DataValues1 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
 302     $DataValues2 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
 303 
 304     # Setup column ids...
 305     push @RowValues, $DataLabel1;
 306     push @RowValues, $DataLabel2;
 307 
 308     if (@$DataValues1 != @$DataValues2) {
 309       # Print a warning...
 310       warn "Warning: Skipping analysis for data field pair $DataLabel1, $DataLabel2: Number of valid data values must be same.\n";
 311       if ($CalculateCorrelation || $CalculateRSquare) {
 312         push @RowValues, "";
 313         if ($CalculateRSquare) {
 314           push @RowValues, "";
 315         }
 316       }
 317       if ($CalculateCovariance) {
 318         push @RowValues, "";
 319       }
 320     }
 321     else {
 322       # Calculate appropriate value...
 323       if ($CalculateCorrelation || $CalculateRSquare) {
 324         $CorrelationValue = Correlation($DataValues1, $DataValues2);
 325         $Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
 326         push @RowValues, $Value;
 327         if ($CalculateRSquare) {
 328           $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
 329           $Value = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
 330           push @RowValues, $Value;
 331         }
 332       }
 333       if ($CalculateCovariance) {
 334         $CovarianceValue = Covariance($DataValues1, $DataValues2);
 335         $Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
 336         push @RowValues, $Value;
 337       }
 338     }
 339     $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 340     print NEWTEXTFILE "$Line\n";
 341   }
 342   close NEWTEXTFILE;
 343 }
 344 
 345 # Generate histogram numbers...
 346 sub PerformFrequencyAnalysis {
 347   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 348   my($NewTextFile, $ColLabel, @ColLabels, @RowValues, $Line, $DataLabel, @DataLabelsToAnalyze, $DataValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);
 349 
 350   @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
 351   for $DataLabel (@DataLabelsToAnalyze) {
 352     $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $DataLabel . "FrequencyAnalysis." .  $SDFilesInfo{NewTextFileExt}[$Index];
 353     print "Generating new text file $NewTextFile...\n";
 354     open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 355 
 356     # Write out the column labels...
 357     @ColLabels = ();
 358     push @ColLabels , ("Bins", "Frequency");
 359     $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 360     print NEWTEXTFILE "$Line\n";
 361 
 362     #Calculate and write out frequency values...
 363     %FrequencyMap = ();
 364     $DataValuesRef =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 365     if (@$DataValuesRef) {
 366       if (@{$OptionsInfo{BinRange}}) {
 367         %FrequencyMap = Frequency($DataValuesRef, \@{$OptionsInfo{BinRange}});
 368       }
 369       else {
 370         %FrequencyMap = Frequency($DataValuesRef, $OptionsInfo{NumOfBins});
 371       }
 372     }
 373     for $BinValue (sort { $a <=> $b }  keys %FrequencyMap) {
 374       $FrequencyValue = $FrequencyMap{$BinValue};
 375 
 376       @RowValues = ();
 377       $Value = (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
 378       push @RowValues, $Value;
 379       $Value = (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";
 380       push @RowValues, $Value;
 381 
 382       $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 383       print NEWTEXTFILE "$Line\n";
 384     }
 385     close NEWTEXTFILE;
 386   }
 387 }
 388 
 389 # Calculate covariance, correlation/rsquare matrices....
 390 sub PerformMatrixAnalysis {
 391   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 392   my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);
 393 
 394   $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
 395   $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
 396   $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;
 397 
 398   $CorrelationTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CorrelationMatrix." .  $SDFilesInfo{NewTextFileExt}[$Index];
 399   $RSquareTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "RSquareMatrix." .  $SDFilesInfo{NewTextFileExt}[$Index];
 400   $CovarianceTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CovarianceMatrix." .  $SDFilesInfo{NewTextFileExt}[$Index];
 401 
 402   my($TextFilesList, $Delimiter);
 403   $TextFilesList =  "";
 404   if ($CalculateCorrelation || $CalculateRSquare) {
 405     $TextFilesList = $CorrelationTextFile;
 406     if ($CalculateRSquare) {
 407       $TextFilesList .= ", $CorrelationTextFile";
 408     }
 409   }
 410   $Delimiter = length($TextFilesList) ? "," : "";
 411   if ($CalculateCovariance) {
 412     $TextFilesList .= "${Delimiter} ${CorrelationTextFile}";
 413   }
 414   if ($TextFilesList =~ /\,/) {
 415     print "Generating new text files $TextFilesList...\n"
 416   }
 417   else {
 418     print "Generating new text file $TextFilesList...\n"
 419   }
 420   if ($CalculateCorrelation || $CalculateRSquare) {
 421     open CORRELATIONTEXTFILE, ">$CorrelationTextFile" or die "Error: Can't open $CorrelationTextFile: $! \n";
 422     if ($CalculateRSquare) {
 423       open RSQUARETEXTFILE, ">$RSquareTextFile" or die "Error: Can't open $RSquareTextFile: $! \n";
 424     }
 425   }
 426   if ($CalculateCovariance) {
 427     open COVARIANCETEXTFILE, ">$CovarianceTextFile" or die "Error: Can't open $CovarianceTextFile: $! \n";
 428   }
 429 
 430   my($Line, $Value, $CorrelationValue, $RSquareValue, $CovarianceValue, $DataLabel, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);
 431 
 432   # Write out the column labels...
 433   @ColLabels = ();
 434   push @ColLabels, @{$SDFilesInfo{AllDataLabels}[$Index]};
 435   $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 436   if ($CalculateCorrelation || $CalculateRSquare) {
 437     print CORRELATIONTEXTFILE "$Line\n";
 438     if ($CalculateRSquare) {
 439       print RSQUARETEXTFILE "$Line\n";
 440     }
 441   }
 442   if ($CalculateCovariance) {
 443     print COVARIANCETEXTFILE "$Line\n";
 444   }
 445 
 446   # Due to symmetric nature of these matrices, only one half needs to be
 447   # calculated. So, just calculate the lower half and copy it to upper half...
 448   my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2, @DataLabelsToAnalyze);
 449 
 450   %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
 451   @DataLabelsToAnalyze = ();
 452   @DataLabelsToAnalyze = $OptionsInfo{AllDataLabelPairs} ? @{$SDFilesInfo{AllDataLabels}[$Index]} : @{$SDFilesInfo{CommonDataLabels}[$Index]};
 453 
 454   for $LabelIndex1 (0 .. (@DataLabelsToAnalyze - 1)) {
 455     $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
 456     for $LabelIndex2 (0 .. $LabelIndex1) {
 457       $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
 458       $DataValuesRef1 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
 459       $DataValuesRef2 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
 460       if ($CalculateCorrelation || $CalculateRSquare) {
 461         $CorrelationValue = Correlation($DataValuesRef1, $DataValuesRef2);
 462         $CorrelationValue = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
 463         $CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $CorrelationValue;
 464         if ($DataLabel1 ne $DataLabel2) {
 465           $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $CorrelationValue;
 466         }
 467         if ($CalculateRSquare) {
 468           $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
 469           $RSquareValue = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
 470           $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $RSquareValue;
 471           if ($DataLabel1 ne $DataLabel2) {
 472             $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $RSquareValue;
 473           }
 474         }
 475       }
 476       if ($CalculateCovariance) {
 477         $CovarianceValue = Covariance($DataValuesRef1, $DataValuesRef2);
 478         $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
 479         $CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $CovarianceValue;
 480         if ($DataLabel1 ne $DataLabel2) {
 481           $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $CovarianceValue;
 482         }
 483       }
 484     }
 485   }
 486 
 487   # Write out the matrices...
 488   for $LabelIndex1 (0 .. (@DataLabelsToAnalyze - 1)) {
 489     $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
 490     @CorrelationRowValues = ();
 491     @RSquareRowValues = ();
 492     @CovarianceRowValues = ();
 493     if ($CalculateCorrelation || $CalculateRSquare) {
 494       push @CorrelationRowValues, $DataLabel1;
 495       if ($CalculateRSquare) {
 496         push @RSquareRowValues, $DataLabel1;
 497       }
 498     }
 499     if ($CalculateCovariance) {
 500       push @CovarianceRowValues, $DataLabel;
 501     }
 502     for $LabelIndex2 (0 .. (@DataLabelsToAnalyze - 1)) {
 503       $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
 504       if ($CalculateCorrelation || $CalculateRSquare) {
 505         push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
 506         if ($CalculateRSquare) {
 507           push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
 508         }
 509       }
 510       if ($CalculateCovariance) {
 511         push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
 512       }
 513     }
 514     if ($CalculateCorrelation || $CalculateRSquare) {
 515       $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 516       print CORRELATIONTEXTFILE "$Line\n";
 517       if ($CalculateRSquare) {
 518         $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 519         print RSQUARETEXTFILE "$Line\n";
 520       }
 521     }
 522     if ($CalculateCovariance) {
 523       $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 524       print COVARIANCETEXTFILE "$Line\n";
 525     }
 526   }
 527   if ($CalculateCorrelation || $CalculateRSquare) {
 528     close CORRELATIONTEXTFILE;
 529     if ($CalculateRSquare) {
 530       close RSQUARETEXTFILE;
 531     }
 532   }
 533   if ($CalculateCovariance) {
 534     close COVARIANCETEXTFILE;
 535   }
 536 }
 537 
 538 # Calculate standard scores...
 539 sub PerformStandardScoresAnalysis {
 540   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 541   my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $Label, $NewLine);
 542 
 543   $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
 544   $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;
 545 
 546   $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "StandardScores." .  $SDFilesInfo{NewTextFileExt}[$Index];
 547   print "Generating new text file $NewTextFile...\n";
 548   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 549 
 550   my($DataLabel, @DataLabelsToAnalyze);
 551   # Write out column labels...
 552   @ColLabels = ();
 553   @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
 554   for $DataLabel (@DataLabelsToAnalyze) {
 555     if ($StandardScores) {
 556       push @ColLabels, "${DataLabel}\(StandardScores)";
 557     }
 558     if ($StandardScoresN) {
 559       push @ColLabels, "${DataLabel}\(StandardScoresN)";
 560     }
 561   }
 562   $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 563   print NEWTEXTFILE "$NewLine\n";
 564 
 565   # Go over each column to be analyzed and calculate standard deviation
 566   # and mean values...
 567   my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
 568   %StandardDeviationMap = ();
 569   %StandardDeviationNMap = ();
 570   %MeanMap = ();
 571   for $DataLabel (@DataLabelsToAnalyze) {
 572     $DataValuesRef =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 573     if (!exists($MeanMap{$DataLabel})) {
 574       $MeanMap{$DataLabel} = Mean($DataValuesRef);
 575     }
 576     if ($StandardScores) {
 577       if (!exists($StandardDeviationMap{$DataLabel})) {
 578         $StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
 579       }
 580     }
 581     if ($StandardScoresN) {
 582       if (!exists($StandardDeviationNMap{$DataLabel})) {
 583         $StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
 584       }
 585     }
 586   }
 587   #
 588   # Go over each data field and calculate standard scores for each column
 589   # using (x[i] - mean) / (n - 1) for StandardScores and (x[i] - mean) / n
 590   # for StandardScoresN; write out the calculated values as well...
 591 
 592   my($SDFile, $Value, $ValueOkay, $ScoreValue, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
 593   $SDFile = $SDFilesList[$Index];
 594 
 595   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 596   while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 597     @CmpdLines = split "\n", $CmpdString;
 598     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 599     @RowValues = ();
 600     for $DataLabel (@DataLabelsToAnalyze) {
 601       $Value = "";
 602       if (exists $DataFieldValues{$DataLabel}) {
 603         $Value = $DataFieldValues{$DataLabel};
 604       }
 605       $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;
 606       if ($StandardScores) {
 607         $ScoreValue = $ValueOkay ? (($Value - $MeanMap{$DataLabel})/$StandardDeviationMap{$DataLabel}) : "";
 608         $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : "";
 609         push @RowValues, $ScoreValue;
 610       }
 611       if ($StandardScoresN) {
 612         $ScoreValue = $ValueOkay ? (($Value - $MeanMap{$DataLabel})/$StandardDeviationNMap{$DataLabel}) : "";
 613         $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : "";
 614         push @RowValues, $ScoreValue;
 615       }
 616     }
 617     $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 618     print NEWTEXTFILE "$NewLine\n";
 619   }
 620   close SDFILE;
 621   close NEWTEXTFILE;
 622 
 623 }
 624 
 625 # Make sure the specified data field labels exists in SD files...
 626 sub ProcessSDFilesDataLabelsInfo {
 627   my($Index, $DataFieldIndex, $SDFile, $DataLabel, @DataLabelsToAnalyze, %UniqueDataLabelsToAnalyzeMap);
 628 
 629   @{$SDFilesInfo{DataLabelsToAnalyze}} = ();
 630   @{$SDFilesInfo{DataLabelPairs1ToAnalyze}} = ();
 631   @{$SDFilesInfo{DataLabelPairs2ToAnalyze}} = ();
 632   @{$SDFilesInfo{UniqueDataLabelsToAnalyze}} = ();
 633 
 634   FILELIST: for $Index (0 .. $#SDFilesList) {
 635     $SDFile = $SDFilesList[$Index];
 636 
 637     @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]} = ();
 638     @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]} = ();
 639     @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]} = ();
 640     @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]} = ();
 641 
 642     %UniqueDataLabelsToAnalyzeMap = ();
 643 
 644     if ($SDFilesInfo{FileOkay}[$Index]) {
 645       @DataLabelsToAnalyze = ();
 646       if (@{$OptionsInfo{SpecifiedDataLabels}}) {
 647         for $DataLabel (@{$OptionsInfo{SpecifiedDataLabels}}) {
 648           if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel})) {
 649             push @DataLabelsToAnalyze, $DataLabel;
 650           }
 651         }
 652       }
 653       elsif (defined($OptionsInfo{DataFields}) && $OptionsInfo{DataFields} =~ /^All$/i) {
 654         push @DataLabelsToAnalyze, @{$SDFilesInfo{AllDataLabels}[$Index]};
 655       }
 656       else {
 657         push @DataLabelsToAnalyze, @{$SDFilesInfo{CommonDataLabels}[$Index]};
 658       }
 659       if (@DataLabelsToAnalyze) {
 660         push @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]}, @DataLabelsToAnalyze;
 661         # Set up unique data field label map as well...
 662         for $DataLabel (@DataLabelsToAnalyze) {
 663           if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
 664             $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
 665           }
 666         }
 667       }
 668       else {
 669         warn "Warning: Ignoring file $SDFile: None of the data field labels specified, @{$OptionsInfo{SpecifiedDataLabels}}, using \"--datafields\" option exist.\n";
 670         $SDFilesInfo{FileOkay}[$Index] = 0;
 671         next FILELIST;
 672       }
 673       if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
 674         # Make sure specific frequency files don't exist...
 675         my($FrequencyFile);
 676         for $DataLabel (@DataLabelsToAnalyze) {
 677           $FrequencyFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel} . "FrequencyAnalysis." .  $SDFilesInfo{NewTextFileExt}[$Index];
 678           if (-e $FrequencyFile) {
 679             warn "Warning: Ignoring file $SDFile: The file $FrequencyFile already exists.\n";
 680             $SDFilesInfo{FileOkay}[$Index] = 0;
 681             next FILELIST;
 682           }
 683         }
 684       }
 685       # Setup specified data field label pairs...
 686       if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) {
 687         my(@DataLabelPairsToAnalyze, $DataLabel1, $DataLabel2);
 688         if (@{$OptionsInfo{SpecifiedDataLabelPairs}}) {
 689           # Make sure both data field labels exist...
 690           my($DataFieldIndex);
 691           for ($DataFieldIndex = 0; (($DataFieldIndex + 1) < @{$OptionsInfo{SpecifiedDataLabelPairs}}); $DataFieldIndex += 2 ) {
 692             $DataLabel1 = $OptionsInfo{SpecifiedDataLabelPairs}[$DataFieldIndex];
 693             $DataLabel2 = $OptionsInfo{SpecifiedDataLabelPairs}[$DataFieldIndex + 1];
 694             if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel1}) && exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel2})) {
 695               push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
 696             }
 697           }
 698         }
 699         elsif ($OptionsInfo{AllDataLabelPairs}) {
 700           for $DataLabel1 (@{$SDFilesInfo{AllDataLabels}[$Index]}) {
 701             for $DataLabel2 (@{$SDFilesInfo{AllDataLabels}[$Index]}) {
 702               push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
 703             }
 704           }
 705         }
 706         else {
 707           for $DataLabel1 (@{$SDFilesInfo{CommonDataLabels}[$Index]}) {
 708             for $DataLabel2 (@{$SDFilesInfo{CommonDataLabels}[$Index]}) {
 709               push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
 710             }
 711           }
 712         }
 713         if (@DataLabelPairsToAnalyze) {
 714           if (@DataLabelPairsToAnalyze % 2) {
 715             warn "Warning: Ignoring file $SDFile: Invalid number  values specified using \"--datafieldpairs\" option: It must contain even number of valid values.\n";
 716             $SDFilesInfo{FileOkay}[$Index] = 0;
 717             next FILELIST;
 718           }
 719           else {
 720             for ($DataFieldIndex = 0; $DataFieldIndex < @DataLabelPairsToAnalyze; $DataFieldIndex += 2) {
 721               push @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex];
 722               push @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex + 1];
 723             }
 724             # Set up unique data field labe map as well...
 725             for $DataLabel (@DataLabelPairsToAnalyze) {
 726               if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
 727                 $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
 728               }
 729             }
 730           }
 731         }
 732       }
 733       # Setup unique data field label array...
 734       push @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]}, (sort keys %UniqueDataLabelsToAnalyzeMap);
 735     }
 736   }
 737 }
 738 
 739 # Retrieve information about input SD files...
 740 sub RetrieveSDFilesInfo {
 741   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFile, $OutFileRoot, $OutFileExt, $CmpdCount);
 742 
 743   %SDFilesInfo = ();
 744 
 745   @{$SDFilesInfo{FileOkay}} = ();
 746   @{$SDFilesInfo{CmpdCount}} = ();
 747   @{$SDFilesInfo{NewTextFileRoot}} = ();
 748   @{$SDFilesInfo{NewTextFileExt}} = ();
 749 
 750   @{$SDFilesInfo{AllDataFieldLabels}} = ();
 751   @{$SDFilesInfo{AllDataFieldLabelsMap}} = ();
 752   @{$SDFilesInfo{CommonDataLabels}} = ();
 753 
 754   FILELIST: for $Index (0 .. $#SDFilesList) {
 755     $SDFile = $SDFilesList[$Index];
 756 
 757     $SDFilesInfo{FileOkay}[$Index] = 0;
 758 
 759     $SDFilesInfo{CmpdCount}[$Index] = 0;
 760     $SDFilesInfo{NewTextFileRoot}[$Index] = "";
 761     $SDFilesInfo{NewTextFileExt}[$Index] = "";
 762 
 763     @{$SDFilesInfo{AllDataLabels}[$Index]} = ();
 764     %{$SDFilesInfo{AllDataLabelsMap}[$Index]} = ();
 765     @{$SDFilesInfo{CommonDataLabels}[$Index]} = ();
 766 
 767     if (!(-e $SDFile)) {
 768       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 769       next FILELIST;
 770     }
 771     if (!CheckFileType($SDFile, "sd sdf")) {
 772       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 773       next FILELIST;
 774     }
 775 
 776     # Generate appropriate name for the new text files...
 777     $FileDir = ""; $FileName = ""; $FileExt = "";
 778     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 779     $OutFileExt = "csv";
 780     if ($Options{outdelim} =~ /^tab$/i) {
 781       $OutFileExt = "tsv";
 782     }
 783     if ($Options{root} && (@SDFilesList == 1)) {
 784       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 785       if ($RootFileName && $RootFileExt) {
 786         $FileName = $RootFileName;
 787       }
 788       else {
 789         $FileName = $Options{root};
 790       }
 791       $OutFileRoot = $FileName;
 792     }
 793     else {
 794       $OutFileRoot = $FileName;
 795     }
 796     $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";
 797 
 798     if (!$OptionsInfo{Overwrite}) {
 799       if (-e $OutFile) {
 800         warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
 801         next FILELIST;
 802       }
 803       if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
 804         if ($OptionsInfo{AllDataLabelPairs}) {
 805           if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) && (-e "${OutFileRoot}CovarianceMatrix.${FileExt}")) {
 806             warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}Covariance.${FileExt} already exists.\n";
 807             next FILELIST;
 808           }
 809           if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) && (-e "${OutFileRoot}CorrelationMatrix.${FileExt}")) {
 810             warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}CorrelationMatrix.${FileExt} already exists.\n";
 811             next FILELIST;
 812           }
 813           if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) && (-e "${OutFileRoot}RSquareMatrix.${FileExt}")) {
 814             warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}RSquareMatrix.${FileExt} already exists.\n";
 815             next FILELIST;
 816           }
 817         }
 818         else {
 819           if (-e "${OutFileRoot}ColumnPairsAnalysis.${FileExt}") {
 820             warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}ColumnPairsAnalysis.${FileExt} already exists.\n";
 821             next FILELIST;
 822           }
 823         }
 824       }
 825       if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) && (-e "${OutFileRoot}StandardScores.${FileExt}")) {
 826         warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}StandardScores.${FileExt} already exists.\n";
 827         next FILELIST;
 828       }
 829     }
 830 
 831     if (!open SDFILE, "$SDFile") {
 832       warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 833       next FILELIST;
 834     }
 835 
 836     my($CmpdCount, $Label, $DataFieldLabelsRef, $CommonDataFieldLabelsRef, @DataFieldLabels, @CommonDataFieldLabels);
 837     $CmpdCount = 0;
 838     @DataFieldLabels = ();
 839     @CommonDataFieldLabels = ();
 840     ($CmpdCount, $DataFieldLabelsRef, $CommonDataFieldLabelsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 841     push @DataFieldLabels, @{$DataFieldLabelsRef};
 842     push @CommonDataFieldLabels, @{$CommonDataFieldLabelsRef};
 843     close SDFILE;
 844 
 845     $SDFilesInfo{FileOkay}[$Index] = 1;
 846     $SDFilesInfo{NewTextFileRoot}[$Index] = "$OutFileRoot";
 847     $SDFilesInfo{NewTextFileExt}[$Index] = "$OutFileExt";
 848 
 849     $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
 850     push @{$SDFilesInfo{AllDataLabels}[$Index]}, @DataFieldLabels;
 851     push @{$SDFilesInfo{CommonDataLabels}[$Index]}, @CommonDataFieldLabels;
 852     for $Label (@DataFieldLabels) {
 853       $SDFilesInfo{AllDataLabelsMap}[$Index]{$Label} = $Label;
 854     }
 855   }
 856 }
 857 
 858 # Process option values...
 859 sub ProcessOptions {
 860   %OptionsInfo = ();
 861 
 862   $OptionsInfo{Mode} = $Options{mode};
 863 
 864   $OptionsInfo{DataFields} = defined $Options{datafields} ? $Options{datafields} : undef;
 865 
 866   $OptionsInfo{DetailLevel} = $Options{detail};
 867 
 868   # Setup supported statistical functions...
 869   my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap);
 870 
 871   %SupportedStatisticaFunctionsMap = ();
 872   @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
 873 
 874   for $SupportedFunction (@SupportedStatisticaFunctions) {
 875     $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction;
 876   }
 877 
 878   # Setup a list of functions to use for analysis...
 879   my($SpecifiedFunction);
 880 
 881   %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = ();
 882   @{$OptionsInfo{SpecifiedStatisticalFunctions}} = ();
 883 
 884   # Check mode values...
 885   if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
 886     $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic";
 887     @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
 888   }
 889   elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
 890     $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll";
 891     @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance  RSquare Frequency  KLargest KSmallest Sum);
 892   }
 893   elsif ($Options{mode} =~ /^All$/i ) {
 894     $OptionsInfo{FileNameMode} = "AllStatistics";
 895     @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @SupportedStatisticaFunctions;
 896   }
 897   else {
 898     $OptionsInfo{FileNameMode} = "SpecifiedStatistics";
 899 
 900     # Comma delimited list of functions...
 901     my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions);
 902 
 903     $Mode = $Options{mode};
 904     $Mode =~ s/ //g;
 905     @SpecifiedFunctions = split ",", $Mode;
 906     @UnsupportedSpecifiedFunctions = ();
 907     for $SpecifiedFunction (@SpecifiedFunctions) {
 908       if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) {
 909         push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $SpecifiedFunction;
 910       }
 911       else {
 912         push @UnsupportedSpecifiedFunctions, $SpecifiedFunction;
 913       }
 914     }
 915     if (@UnsupportedSpecifiedFunctions) {
 916       if (@UnsupportedSpecifiedFunctions > 1) {
 917         warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
 918       }
 919       else {
 920         warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n";
 921       }
 922       die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
 923     }
 924   }
 925 
 926   FUNCTION: for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
 927     if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} ) {
 928       next FUNCTION;
 929     }
 930     $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)};
 931   }
 932 
 933   # Setup delimiter and quotes...
 934   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
 935   $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;
 936 
 937   $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
 938   $OptionsInfo{Root} = defined $Options{root} ? $Options{root} : undef;
 939 
 940   # Setup miscellaneous options...
 941   $OptionsInfo{CheckData} = $Options{fast} ? 0 : 1;
 942   $OptionsInfo{Precision} = $Options{precision};
 943 
 944   $OptionsInfo{KLargest} = $Options{klargest};
 945   $OptionsInfo{KSmallest} = $Options{ksmallest};
 946 
 947   $OptionsInfo{TrimFraction} = $Options{trimfraction};
 948 
 949   # Setup frequency bin values...
 950   $OptionsInfo{NumOfBins} = 10;
 951   @{$OptionsInfo{BinRange}} = ();
 952   if ($Options{frequencybins} =~ /\,/) {
 953     my($BinValue, @SpecifiedBinRange);
 954     @SpecifiedBinRange = split /\,/,  $Options{frequencybins};
 955     if (@SpecifiedBinRange < 2) {
 956       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
 957     }
 958     for $BinValue (@SpecifiedBinRange) {
 959       if (!IsNumerical($BinValue)) {
 960         die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
 961       }
 962     }
 963     my($Index1, $Index2);
 964     for $Index1 (0 .. $#SpecifiedBinRange) {
 965       for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) {
 966         if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) {
 967           die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
 968         }
 969       }
 970     }
 971     push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange;
 972   }
 973   else {
 974     $OptionsInfo{NumOfBins} = $Options{frequencybins};
 975     if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
 976       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
 977     }
 978   }
 979 
 980   # Setup specified data field labels...
 981   @{$OptionsInfo{SpecifiedDataLabels}} = ();
 982   if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) {
 983     my(@SpecifiedValues) = split ",", $Options{datafields};
 984     push @{$OptionsInfo{SpecifiedDataLabels}}, @SpecifiedValues;
 985   }
 986   @{$OptionsInfo{SpecifiedDataLabelPairs}} = ();
 987   $OptionsInfo{AllDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^AllPairs$/i) ? 1 : 0;
 988   $OptionsInfo{CommonDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^CommonPairs$/i) ? 1 : 0;
 989   if (defined($Options{datafieldpairs}) && !$OptionsInfo{AllDataLabelPairs} && !$OptionsInfo{CommonDataLabelPairs}) {
 990     my(@SpecifiedValues) = split ",", $Options{datafieldpairs};
 991     if (@SpecifiedValues % 2) {
 992       die "Error: Invalid number of values specified using \"--datafieldpairs\" option: It must contain even number of values.\n";
 993     }
 994     push @{$OptionsInfo{SpecifiedDataLabelPairs}}, @SpecifiedValues;
 995   }
 996 
 997 }
 998 
 999 # Setup script usage  and retrieve command line arguments specified using various options...
1000 sub SetupScriptUsage {
1001 
1002   # Retrieve all the options...
1003   %Options = ();
1004   $Options{detail} = 0;
1005   $Options{datafields} = "Common";
1006   $Options{datafieldpairs} = "CommonPairs";
1007   $Options{frequencybins} = 10;
1008   $Options{klargest} = 2;
1009   $Options{ksmallest} = 2;
1010   $Options{mode} = "DescriptiveStatisticsBasic";
1011   $Options{outdelim} = "comma";
1012   $Options{precision} = 2;
1013   $Options{quote} = "yes";
1014   $Options{trimfraction} = 0.1;
1015 
1016   if (!GetOptions(\%Options, "datafields=s", "datafieldpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s")) {
1017     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
1018   }
1019   if ($Options{workingdir}) {
1020     if (! -d $Options{workingdir}) {
1021       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
1022     }
1023     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
1024   }
1025   if (!IsInteger($Options{detail})) {
1026     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: >= 0\n";
1027   }
1028   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
1029     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
1030   }
1031   if ($Options{quote} !~ /^(yes|no)$/i) {
1032     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
1033   }
1034   if (!IsPositiveInteger($Options{precision})) {
1035     die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
1036   }
1037   if (!IsPositiveInteger($Options{klargest})) {
1038     die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n";
1039   }
1040   if (!IsPositiveInteger($Options{ksmallest})) {
1041     die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n";
1042   }
1043   if (IsFloat($Options{trimfraction})) {
1044     if ($Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
1045       die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
1046     }
1047   }
1048   else {
1049     die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
1050   }
1051 }
1052