Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/AnalyzeSDFilesData.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: AnalyzeSDFilesData.pl,v $ | |
| 4 # $Date: 2015/02/28 20:46:04 $ | |
| 5 # $Revision: 1.27 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability or fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use Text::ParseWords; | |
| 34 use Benchmark; | |
| 35 use FileUtil; | |
| 36 use SDFileUtil; | |
| 37 use TextUtil; | |
| 38 use StatisticsUtil; | |
| 39 | |
# Main script flow: print a start message, read the options, expand the SD file
# names given on the command line, validate the files and run the analysis on
# each file that is still marked as okay. The file scoped lexicals declared
# here are used by the subroutines defined further down in the file.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so that progress messages show up as soon as they are printed...
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";

# Regular class method call instead of the indirect object form "new Benchmark",
# which perl has to guess how to parse...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sd sdf");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();
ProcessSDFilesDataLabelsInfo();

# Generate output files...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Files flagged as not okay during the checks above are silently skipped...
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    AnalyzeSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 85 | |
| 86 ############################################################################### | |
| 87 | |
# Collect the values of the data fields to analyze from one SD file and run the
# requested statistical analyses on them.
#
# Parameters:
#   $Index - position of the SD file in @SDFilesList; the same index is used
#            for the per file entries in %SDFilesInfo.
#
# The values are gathered per data label listed in
# $SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]. When $OptionsInfo{CheckData}
# is set, values rejected by IsNumerical are left out of the analysis and are
# reported according to $OptionsInfo{DetailLevel}. Dies when the SD file can't
# be opened.
sub AnalyzeSDFile {
  my($Index) = @_;
  my($SDFile, $DataLabel, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);

  $SDFile = $SDFilesList[$Index];
  @DataLabelsToAnalyze = @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]};

  # Start out with an empty value list for each data label, so that labels
  # without any usable value are still present in the map...
  %DataFieldValuesToAnalyzeMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @{$DataFieldValuesToAnalyzeMap{$DataLabel}} = ();
  }

  # Collect appropriate data field label values for analysis...
  my($SDFileHandle, $CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels);

  # Three argument open with a lexical handle: the file name is never
  # interpreted as an open mode and the handle is private to this subroutine...
  open($SDFileHandle, "<", $SDFile) or die "Error: Can't open $SDFile: $! \n";

  $CmpdCount = 0;
  $InvalidCmpdCount = 0;
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    @InvalidCmpdDataLabels = ();
    DATALABEL: for $DataLabel (@DataLabelsToAnalyze) {
      # Data fields missing from this compound record are simply skipped...
      if (!exists $DataFieldValues{$DataLabel}) {
        next DATALABEL;
      }
      $DataValue = $DataFieldValues{$DataLabel};
      if ($OptionsInfo{CheckData} && !IsNumerical($DataValue)) {
        push @InvalidCmpdDataLabels, $DataLabel;
        next DATALABEL;
      }
      push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
    }

    if (@InvalidCmpdDataLabels) {
      $InvalidCmpdCount++;
      # The higher the detail level, the more information about the record is printed...
      if ($OptionsInfo{DetailLevel} >=4 ) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed:\n$CmpdString \n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 3) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed...\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field to be analyzed...\n";
      }
    }
  }
  if ($InvalidCmpdCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
  }
  close $SDFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction);
  @SpecifiedFunctionNames = ();

  # Functions handled by the dedicated subroutines below are left out of the
  # list passed to PerformAnalysis...
  for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap);
  }

  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
    if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
      PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
    PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
    PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }

}
| 170 | |
# Calculate values for various statistical functions and write them out to a
# new text file: one row per data label, one column per function.
#
# Parameters:
#   $Index                     - position of the SD file in @SDFilesList.
#   $SpecifiedFunctionNamesRef - reference to the list of function names to
#                                invoke; the functions are called by name.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping a data label to
#                                the list of its values.
#
# Data labels without any values get empty cells. Dies when the output file
# can't be opened.
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, $Line, $SpecifiedFunction, $Label, @ColLabels, @DataLabelsToAnalyze);

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $SDFilesInfo{NewTextFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  # Three argument open with a lexical handle keeps the file name from being
  # interpreted as part of the open mode...
  open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "DataLabel";
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # The label is made up of the k value processed by AddNumberSuffix
      # followed by the function name with the letter K taken out...
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each column to be analyzed...
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};

  # Statistical functions are invoked through their name strings, which needs
  # symbolic references; all other strictures stay enabled...
  no strict 'refs';

  my($DataValuesRef, $DataLabel, $Value, @RowValues, %CalculatedValues);
  %CalculatedValues = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $DataLabel;
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!@{$DataValuesRef}) {
        # Invalid column values...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = scalar(@{$DataValuesRef});
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KLargest});
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KSmallest});
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        # The standard deviation is cached as it's also needed for the standard error...
        if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $CalculatedValues{$DataLabel}{StandardDeviation} = &$SpecifiedFunction($DataValuesRef);
        }
        $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $CalculatedValues{$DataLabel}{StandardDeviation} = StandardDeviation($DataValuesRef);
        }
        if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
          # Pass the number of values explicitly. An array placed in the argument
          # list is flattened, which would hand over the first data value in
          # place of the count.
          # NOTE(review): assumes StandardError takes (StandardDeviation, Count) - confirm against StatisticsUtil.
          $Value = &$SpecifiedFunction($CalculatedValues{$DataLabel}{StandardDeviation}, scalar(@{$DataValuesRef}));
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{TrimFraction});
      }
      else {
        $Value = &$SpecifiedFunction($DataValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  close $NewTextFileHandle;
}
| 263 | |
# Calculate correlation, rsquare and covariance values for the specified pairs
# of data field labels and write them out to one text file, a row per pair.
#
# Parameters:
#   $Index                     - position of the SD file in @SDFilesList.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping a data label to
#                                the list of its values.
#
# A correlation column is written whenever correlation or rsquare is requested.
# Pairs with different numbers of values are reported with a warning and get
# empty cells. Dies when the output file can't be opened.
sub PerformDataLabelPairAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;

  my $WantCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  my $WantRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  my $WantCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  # RSquare is derived from the correlation, so the correlation column is
  # written out for either request...
  my $WriteCorrelation = ($WantCorrelation || $WantRSquare) ? 1 : 0;

  my $OutFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "DataFieldPairsAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $OutFile...\n";
  open NEWTEXTFILE, ">$OutFile" or die "Error: Can't open $OutFile: $! \n";

  # Column labels: the two data labels followed by the value columns...
  my @ValueColLabels = ();
  if ($WriteCorrelation) {
    push @ValueColLabels, "Correlation";
    push @ValueColLabels, "RSquare" if ($WantRSquare);
  }
  push @ValueColLabels, "Covariance" if ($WantCovariance);

  my @HeaderLabels = ("DataLabel1", "DataLabel2", @ValueColLabels);
  my $OutLine = JoinWords(\@HeaderLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$OutLine\n";

  # Round to the requested precision; adding zero drops trailing zeros.
  # Undefined and empty values come back as empty strings...
  my $FormatValue = sub {
    my($RawValue) = @_;
    if (defined($RawValue) && length($RawValue)) {
      return sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0;
    }
    return "";
  };

  # Go over each data field pair...
  my @FirstLabels = @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]};
  my @SecondLabels = @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]};

  for my $PairIndex (0 .. $#FirstLabels) {
    my $LabelA = $FirstLabels[$PairIndex];
    my $LabelB = $SecondLabels[$PairIndex];
    my $ValuesA = \@{$DataValuesToAnalyzeMapRef->{$LabelA}};
    my $ValuesB = \@{$DataValuesToAnalyzeMapRef->{$LabelB}};

    # Setup column ids...
    my @Cells = ($LabelA, $LabelB);

    if (@$ValuesA == @$ValuesB) {
      # Calculate appropriate values...
      if ($WriteCorrelation) {
        my $Correlation = Correlation($ValuesA, $ValuesB);
        push @Cells, $FormatValue->($Correlation);
        if ($WantRSquare) {
          my $RSquare = (defined($Correlation) && length($Correlation)) ? ($Correlation ** 2) : "";
          push @Cells, $FormatValue->($RSquare);
        }
      }
      if ($WantCovariance) {
        push @Cells, $FormatValue->(Covariance($ValuesA, $ValuesB));
      }
    }
    else {
      # Print a warning and fill in an empty cell for each value column...
      warn "Warning: Skipping analysis for data field pair $LabelA, $LabelB: Number of valid data values must be same.\n";
      push @Cells, ("") x scalar(@ValueColLabels);
    }

    $OutLine = JoinWords(\@Cells, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$OutLine\n";
  }
  close NEWTEXTFILE;
}
| 344 | |
# Generate histogram numbers: for each data label to analyze, write out a text
# file containing the bins and the frequency of the values in each bin.
#
# Parameters:
#   $Index                     - position of the SD file in @SDFilesList.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping a data label to
#                                the list of its values.
#
# Bins come from $OptionsInfo{BinRange} when it's not empty; otherwise
# $OptionsInfo{NumOfBins} is handed to Frequency. A data label without values
# gets a file containing only the column labels. Dies when an output file
# can't be opened.
sub PerformFrequencyAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;

  # Round to the requested precision; adding zero drops trailing zeros...
  my $FormatValue = sub {
    my($RawValue) = @_;
    return (length($RawValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0) : "";
  };

  my @Labels = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for my $Label (@Labels) {
    my $OutFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $Label . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
    print "Generating new text file $OutFile...\n";
    open NEWTEXTFILE, ">$OutFile" or die "Error: Can't open $OutFile: $! \n";

    # Write out the column labels...
    my @HeaderLabels = ("Bins", "Frequency");
    my $OutLine = JoinWords(\@HeaderLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$OutLine\n";

    # Calculate frequency values; nothing is calculated for an empty value list...
    my %BinToFrequency = ();
    my $ValuesRef = \@{$DataValuesToAnalyzeMapRef->{$Label}};
    if (@$ValuesRef) {
      %BinToFrequency = (@{$OptionsInfo{BinRange}})
        ? Frequency($ValuesRef, \@{$OptionsInfo{BinRange}})
        : Frequency($ValuesRef, $OptionsInfo{NumOfBins});
    }

    # Write out a row for each bin in ascending numerical order of the bins...
    for my $Bin (sort { $a <=> $b } keys %BinToFrequency) {
      my @Cells = ($FormatValue->($Bin), $FormatValue->($BinToFrequency{$Bin}));
      $OutLine = JoinWords(\@Cells, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print NEWTEXTFILE "$OutLine\n";
    }
    close NEWTEXTFILE;
  }
}
| 388 | |
# Calculate covariance, correlation and rsquare matrices for the data labels of
# one SD file and write each matrix to its own text file.
#
# Parameters:
#   $Index                     - position of the SD file in @SDFilesList.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping a data label to
#                                the list of its values.
#
# The labels are taken from $SDFilesInfo{AllDataLabels}[$Index] when
# $OptionsInfo{AllDataLabelPairs} is set and from
# $SDFilesInfo{CommonDataLabels}[$Index] otherwise. The correlation matrix file
# is written whenever correlation or rsquare is requested. Each row starts with
# its data label; the header row lists only the data labels without a leading
# cell for the row label column. Dies when an output file can't be opened.
sub PerformMatrixAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance, $WriteCorrelation);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  # RSquare values are derived from correlation values, so the correlation
  # matrix is generated for either request...
  $WriteCorrelation = ($CalculateCorrelation || $CalculateRSquare) ? 1 : 0;

  $CorrelationTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CorrelationMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $RSquareTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "RSquareMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $CovarianceTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CovarianceMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];

  # Report the names of the files which are actually generated...
  my(@TextFilesList, $TextFilesLabel);
  @TextFilesList = ();
  if ($WriteCorrelation) {
    push @TextFilesList, $CorrelationTextFile;
    if ($CalculateRSquare) {
      push @TextFilesList, $RSquareTextFile;
    }
  }
  if ($CalculateCovariance) {
    push @TextFilesList, $CovarianceTextFile;
  }
  $TextFilesLabel = (@TextFilesList > 1) ? "files" : "file";
  print "Generating new text $TextFilesLabel ", join(", ", @TextFilesList), "...\n";

  # Three argument opens with lexical handles keep file names from being
  # interpreted as part of the open mode...
  my($CorrelationFileHandle, $RSquareFileHandle, $CovarianceFileHandle);
  if ($WriteCorrelation) {
    open($CorrelationFileHandle, ">", $CorrelationTextFile) or die "Error: Can't open $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      open($RSquareFileHandle, ">", $RSquareTextFile) or die "Error: Can't open $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    open($CovarianceFileHandle, ">", $CovarianceTextFile) or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $RawCorrelationValue, $CorrelationValue, $RSquareValue, $CovarianceValue, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @DataLabelsToAnalyze, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);

  @DataLabelsToAnalyze = $OptionsInfo{AllDataLabelPairs} ? @{$SDFilesInfo{AllDataLabels}[$Index]} : @{$SDFilesInfo{CommonDataLabels}[$Index]};

  # Write out the column labels using the same labels as the matrix rows and
  # columns, so that the header always matches the values underneath it...
  $Line = JoinWords(\@DataLabelsToAnalyze, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  if ($WriteCorrelation) {
    print $CorrelationFileHandle "$Line\n";
    if ($CalculateRSquare) {
      print $RSquareFileHandle "$Line\n";
    }
  }
  if ($CalculateCovariance) {
    print $CovarianceFileHandle "$Line\n";
  }

  # Round to the requested precision; adding zero drops trailing zeros.
  # Undefined and empty values come back as empty strings...
  my $FormatValue = sub {
    my($RawValue) = @_;
    return (defined($RawValue) && length($RawValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0) : "";
  };

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2);

  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();

  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
    $DataValuesRef1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
    for $LabelIndex2 (0 .. $LabelIndex1) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      $DataValuesRef2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
      if ($WriteCorrelation) {
        $RawCorrelationValue = Correlation($DataValuesRef1, $DataValuesRef2);
        $CorrelationValue = $FormatValue->($RawCorrelationValue);
        $CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $CorrelationValue;
        $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $CorrelationValue;
        if ($CalculateRSquare) {
          # Square the unrounded correlation value, as it's done for data label
          # pairs, to avoid compounding the rounding error...
          $RSquareValue = (defined($RawCorrelationValue) && length($RawCorrelationValue)) ? ($RawCorrelationValue ** 2) : "";
          $RSquareValue = $FormatValue->($RSquareValue);
          $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $RSquareValue;
          $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $RSquareValue;
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = $FormatValue->(Covariance($DataValuesRef1, $DataValuesRef2));
        $CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $CovarianceValue;
        $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $CovarianceValue;
      }
    }
  }

  # Write out the matrices...
  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];

    # Each row starts with its data label, including the covariance rows...
    @CorrelationRowValues = ($DataLabel1);
    @RSquareRowValues = ($DataLabel1);
    @CovarianceRowValues = ($DataLabel1);

    for $LabelIndex2 (0 .. $#DataLabelsToAnalyze) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      if ($WriteCorrelation) {
        push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
        if ($CalculateRSquare) {
          push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
        }
      }
      if ($CalculateCovariance) {
        push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
      }
    }
    if ($WriteCorrelation) {
      $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CorrelationFileHandle "$Line\n";
      if ($CalculateRSquare) {
        $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $RSquareFileHandle "$Line\n";
      }
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CovarianceFileHandle "$Line\n";
    }
  }
  if ($WriteCorrelation) {
    close $CorrelationFileHandle;
    if ($CalculateRSquare) {
      close $RSquareFileHandle;
    }
  }
  if ($CalculateCovariance) {
    close $CovarianceFileHandle;
  }
}
| 537 | |
# Calculate standard scores for the data fields to analyze and write them out
# to a new text file, a row per compound record.
#
# Parameters:
#   $Index                     - position of the SD file in @SDFilesList.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping a data label to
#                                the list of its values.
#
# A score is (value - mean) divided by the value returned by StandardDeviation
# for the StandardScores columns and by StandardDeviationN for the
# StandardScoresN columns. An empty cell is written when the value fails the
# data check or when no score can be calculated because the mean is undefined
# or the deviation is undefined or zero. Dies when a file can't be opened.
sub PerformStandardScoresAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, $NewTextFileHandle, @ColLabels, $NewLine);

  $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "StandardScores." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  # Three argument open with a lexical handle keeps the file name from being
  # interpreted as part of the open mode...
  open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

  my($DataLabel, @DataLabelsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    if ($StandardScores) {
      push @ColLabels, "${DataLabel}\(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${DataLabel}\(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values; a label listed more than once is calculated only once...
  my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (!exists($MeanMap{$DataLabel})) {
      $MeanMap{$DataLabel} = Mean($DataValuesRef);
    }
    if ($StandardScores && !exists($StandardDeviationMap{$DataLabel})) {
      $StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
    }
    if ($StandardScoresN && !exists($StandardDeviationNMap{$DataLabel})) {
      $StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
    }
  }

  # Calculate a formatted score as (value - mean) / deviation. An undefined
  # mean or an undefined or zero deviation - e.g. a data field with the same
  # value in every record - can't be scored; an empty string is returned in
  # place of dying on a division by zero...
  my $CalculateScore = sub {
    my($DataValue, $Mean, $Deviation) = @_;
    if (!defined($Mean) || !defined($Deviation) || $Deviation == 0) {
      return "";
    }
    my $Score = ($DataValue - $Mean)/$Deviation;
    # Round to the requested precision; adding zero drops trailing zeros...
    return sprintf("%.$OptionsInfo{Precision}f", $Score) + 0;
  };

  # Go over each compound record in the SD file, calculate standard scores for
  # its data fields and write out the calculated values as well...
  my($SDFile, $SDFileHandle, $Value, $ValueOkay, $ScoreValue, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
  $SDFile = $SDFilesList[$Index];

  open($SDFileHandle, "<", $SDFile) or die "Error: Can't open $SDFile: $! \n";
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @RowValues = ();
    for $DataLabel (@DataLabelsToAnalyze) {
      # A data field missing from the record is treated as an empty value...
      $Value = "";
      if (exists $DataFieldValues{$DataLabel}) {
        $Value = $DataFieldValues{$DataLabel};
      }
      $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;
      if ($StandardScores) {
        $ScoreValue = $ValueOkay ? $CalculateScore->($Value, $MeanMap{$DataLabel}, $StandardDeviationMap{$DataLabel}) : "";
        push @RowValues, $ScoreValue;
      }
      if ($StandardScoresN) {
        $ScoreValue = $ValueOkay ? $CalculateScore->($Value, $MeanMap{$DataLabel}, $StandardDeviationNMap{$DataLabel}) : "";
        push @RowValues, $ScoreValue;
      }
    }
    $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$NewLine\n";
  }
  close $SDFileHandle;
  close $NewTextFileHandle;

}
| 624 | |
# Make sure the specified data field labels exist in SD files...
#
# For each file in @SDFilesList which is still marked okay in %SDFilesInfo, set up
# these per file lists in %SDFilesInfo: DataLabelsToAnalyze, DataLabelPairs1ToAnalyze,
# DataLabelPairs2ToAnalyze and UniqueDataLabelsToAnalyze. The label pairs are set up
# only for correlation, covariance or rsquare functions.
#
# A file is marked not okay, with a warning, when it has no data field labels to
# analyze or, without overwrite, when one of its frequency analysis files already
# exists.
#
sub ProcessSDFilesDataLabelsInfo {
  my($Index, $SDFile, $DataLabel, $DataLabel1, $DataLabel2, $PairIndex, $FrequencyFile, $PairFunctionsSpecified, @DataLabelsToAnalyze, @DataLabelPairsToAnalyze, %UniqueDataLabelsToAnalyzeMap);

  @{$SDFilesInfo{DataLabelsToAnalyze}} = ();
  @{$SDFilesInfo{DataLabelPairs1ToAnalyze}} = ();
  @{$SDFilesInfo{DataLabelPairs2ToAnalyze}} = ();
  @{$SDFilesInfo{UniqueDataLabelsToAnalyze}} = ();

  # Data field label pairs are needed only by these functions; same for all files...
  $PairFunctionsSpecified = (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]} = ();

    if (!$SDFilesInfo{FileOkay}[$Index]) {
      next FILELIST;
    }

    # Collect data field labels: specified ones present in the file, all, or common...
    @DataLabelsToAnalyze = ();
    if (@{$OptionsInfo{SpecifiedDataLabels}}) {
      @DataLabelsToAnalyze = grep { exists $SDFilesInfo{AllDataLabelsMap}[$Index]{$_} } @{$OptionsInfo{SpecifiedDataLabels}};
    }
    elsif (defined($OptionsInfo{DataFields}) && $OptionsInfo{DataFields} =~ /^All$/i) {
      @DataLabelsToAnalyze = @{$SDFilesInfo{AllDataLabels}[$Index]};
    }
    else {
      @DataLabelsToAnalyze = @{$SDFilesInfo{CommonDataLabels}[$Index]};
    }

    if (!@DataLabelsToAnalyze) {
      # Blame the "--datafields" values only when explicit labels were specified...
      if (@{$OptionsInfo{SpecifiedDataLabels}}) {
        warn "Warning: Ignoring file $SDFile: None of the data field labels specified, @{$OptionsInfo{SpecifiedDataLabels}}, using \"--datafields\" option exist.\n";
      }
      else {
        warn "Warning: Ignoring file $SDFile: It doesn't contain any data field labels to analyze.\n";
      }
      $SDFilesInfo{FileOkay}[$Index] = 0;
      next FILELIST;
    }
    push @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]}, @DataLabelsToAnalyze;

    # Set up unique data field label map as well...
    %UniqueDataLabelsToAnalyzeMap = ();
    for $DataLabel (@DataLabelsToAnalyze) {
      $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
    }

    if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
      # Make sure specific frequency files don't exist...
      for $DataLabel (@DataLabelsToAnalyze) {
        $FrequencyFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel} . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
        if (-e $FrequencyFile) {
          warn "Warning: Ignoring file $SDFile: The file $FrequencyFile already exists.\n";
          $SDFilesInfo{FileOkay}[$Index] = 0;
          next FILELIST;
        }
      }
    }

    # Setup specified data field label pairs...
    if ($PairFunctionsSpecified) {
      # Flat list of label1, label2, label1, label2...; labels are always added in
      # pairs and, as a result, the list always contains an even number of values.
      @DataLabelPairsToAnalyze = ();
      if (@{$OptionsInfo{SpecifiedDataLabelPairs}}) {
        # Make sure both data field labels exist...
        for ($PairIndex = 0; ($PairIndex + 1) < @{$OptionsInfo{SpecifiedDataLabelPairs}}; $PairIndex += 2) {
          $DataLabel1 = $OptionsInfo{SpecifiedDataLabelPairs}[$PairIndex];
          $DataLabel2 = $OptionsInfo{SpecifiedDataLabelPairs}[$PairIndex + 1];
          if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel1}) && exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel2})) {
            push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
          }
        }
      }
      elsif ($OptionsInfo{AllDataLabelPairs}) {
        for $DataLabel1 (@{$SDFilesInfo{AllDataLabels}[$Index]}) {
          for $DataLabel2 (@{$SDFilesInfo{AllDataLabels}[$Index]}) {
            push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
          }
        }
      }
      else {
        for $DataLabel1 (@{$SDFilesInfo{CommonDataLabels}[$Index]}) {
          for $DataLabel2 (@{$SDFilesInfo{CommonDataLabels}[$Index]}) {
            push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
          }
        }
      }

      for ($PairIndex = 0; $PairIndex < @DataLabelPairsToAnalyze; $PairIndex += 2) {
        push @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$PairIndex];
        push @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$PairIndex + 1];
      }

      # Set up unique data field label map as well...
      for $DataLabel (@DataLabelPairsToAnalyze) {
        $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
      }
    }

    # Setup unique data field label array...
    push @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]}, (sort keys %UniqueDataLabelsToAnalyzeMap);
  }
}
| 738 | |
# Retrieve information about input SD files...
#
# Resets %SDFilesInfo and, for each file in @SDFilesList, records FileOkay, CmpdCount,
# NewTextFileRoot, NewTextFileExt, AllDataLabels, AllDataLabelsMap and CommonDataLabels.
#
# A file is skipped with a warning, and left with FileOkay set to 0, when it doesn't
# exist, isn't a SD file, can't be opened or, without overwrite, when one of the output
# files checked here already exists.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFile, $OutFileRoot, $OutFileExt);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileRoot}} = ();
  @{$SDFilesInfo{NewTextFileExt}} = ();

  # Same key names as the ones filled in for each file below...
  @{$SDFilesInfo{AllDataLabels}} = ();
  @{$SDFilesInfo{AllDataLabelsMap}} = ();
  @{$SDFilesInfo{CommonDataLabels}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "";
    $SDFilesInfo{NewTextFileExt}[$Index] = "";

    @{$SDFilesInfo{AllDataLabels}[$Index]} = ();
    %{$SDFilesInfo{AllDataLabelsMap}[$Index]} = ();
    @{$SDFilesInfo{CommonDataLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new text files...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    # The root option is used only for a single input file...
    if ($Options{root} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $Options{root};
      }
    }
    $OutFileRoot = $FileName;
    $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";

    if (!$OptionsInfo{Overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
        next FILELIST;
      }

      # Additional output files are new text files as well: check them using the
      # output file extension instead of the extension of the input SD file.
      #
      # NOTE(review): the name suffixes below are kept as found in this check; confirm
      # they match the names used where these files are written.
      my($AdditionalFile, @AdditionalFiles);
      @AdditionalFiles = ();
      if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
        if ($OptionsInfo{AllDataLabelPairs}) {
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance})) {
            push @AdditionalFiles, "${OutFileRoot}CovarianceMatrix.${OutFileExt}";
          }
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation})) {
            push @AdditionalFiles, "${OutFileRoot}CorrelationMatrix.${OutFileExt}";
          }
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
            push @AdditionalFiles, "${OutFileRoot}RSquareMatrix.${OutFileExt}";
          }
        }
        else {
          push @AdditionalFiles, "${OutFileRoot}ColumnPairsAnalysis.${OutFileExt}";
        }
      }
      if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores})) {
        push @AdditionalFiles, "${OutFileRoot}StandardScores.${OutFileExt}";
      }
      for $AdditionalFile (@AdditionalFiles) {
        if (-e $AdditionalFile) {
          warn "Warning: Ignoring file $SDFile: The file $AdditionalFile already exists.\n";
          next FILELIST;
        }
      }
    }

    # Three argument open: the file name is never interpreted as a mode or a command...
    if (!open SDFILE, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CmpdCount, $Label, $DataFieldLabelsRef, $CommonDataFieldLabelsRef);
    ($CmpdCount, $DataFieldLabelsRef, $CommonDataFieldLabelsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
    close SDFILE;

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "$OutFileRoot";
    $SDFilesInfo{NewTextFileExt}[$Index] = "$OutFileExt";

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
    push @{$SDFilesInfo{AllDataLabels}[$Index]}, @{$DataFieldLabelsRef};
    push @{$SDFilesInfo{CommonDataLabels}[$Index]}, @{$CommonDataFieldLabelsRef};
    for $Label (@{$DataFieldLabelsRef}) {
      $SDFilesInfo{AllDataLabelsMap}[$Index]{$Label} = $Label;
    }
  }
}
| 857 | |
# Process option values...
#
# Copies command line option values from %Options into %OptionsInfo and sets up
# the derived values: list and map of statistical functions to use, output file
# name mode, output delimiter and quote flag, frequency bins, and the specified
# data field labels and label pairs. Dies on an invalid mode, frequency bins or
# data field pairs value.
#
sub ProcessOptions {
  my($FunctionName, $FunctionKey, @AllFunctions, %AllFunctionsMap);

  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{DataFields} = $Options{datafields};
  $OptionsInfo{DetailLevel} = $Options{detail};

  # Supported statistical functions along with a lower case name lookup...
  @AllFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
  %AllFunctionsMap = map { (lc($_), $_) } @AllFunctions;

  # Functions to use for analysis...
  %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = ();
  @{$OptionsInfo{SpecifiedStatisticalFunctions}} = ();

  if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
  }
  elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum);
  }
  elsif ($Options{mode} =~ /^All$/i ) {
    $OptionsInfo{FileNameMode} = "AllStatistics";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @AllFunctions;
  }
  else {
    # Comma delimited list of functions; spaces are not significant...
    my($ModeValue, @ValidFunctions, @InvalidFunctions);

    $OptionsInfo{FileNameMode} = "SpecifiedStatistics";

    ($ModeValue = $Options{mode}) =~ s/ //g;
    @ValidFunctions = ();
    @InvalidFunctions = ();
    for $FunctionName (split ",", $ModeValue) {
      if (exists $AllFunctionsMap{lc($FunctionName)}) {
        push @ValidFunctions, $FunctionName;
      }
      else {
        push @InvalidFunctions, $FunctionName;
      }
    }
    push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, @ValidFunctions;

    if (@InvalidFunctions) {
      if (@InvalidFunctions == 1) {
        warn "Error: The value specified, $InvalidFunctions[0] , for option \"-m --mode\" is not valid.\n";
      }
      else {
        warn "Error: The values specified - ", JoinWords(\@InvalidFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      die "Allowed values:", JoinWords(\@AllFunctions, ", ", 0), "\n";
    }
  }

  # Map lower case function names to supported names; repeated names are set once...
  for $FunctionName (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    $FunctionKey = lc($FunctionName);
    if (!exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey}) {
      $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey} = $AllFunctionsMap{$FunctionKey};
    }
  }

  # Setup delimiter and quotes...
  if ($Options{outdelim} =~ /tab/i ) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;

  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{Root} = $Options{root};

  # Setup miscellaneous options...
  $OptionsInfo{CheckData} = $Options{fast} ? 0 : 1;
  $OptionsInfo{Precision} = $Options{precision};

  $OptionsInfo{KLargest} = $Options{klargest};
  $OptionsInfo{KSmallest} = $Options{ksmallest};

  $OptionsInfo{TrimFraction} = $Options{trimfraction};

  # Setup frequency bin values: number of bins or an explicit bin range...
  $OptionsInfo{NumOfBins} = 10;
  @{$OptionsInfo{BinRange}} = ();
  if ($Options{frequencybins} !~ /\,/) {
    $OptionsInfo{NumOfBins} = $Options{frequencybins};
    if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
    }
  }
  else {
    my($BinValue, $LowIndex, $HighIndex, @BinValues);

    @BinValues = split /\,/, $Options{frequencybins};
    if (@BinValues < 2) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
    }
    for $BinValue (@BinValues) {
      if (!IsNumerical($BinValue)) {
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
      }
    }
    # Each value must be smaller than all the values after it...
    for ($LowIndex = 0; $LowIndex < $#BinValues; $LowIndex++) {
      for ($HighIndex = $LowIndex + 1; $HighIndex <= $#BinValues; $HighIndex++) {
        if ($BinValues[$LowIndex] >= $BinValues[$HighIndex]) {
          die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
        }
      }
    }
    push @{$OptionsInfo{BinRange}}, @BinValues;
  }

  # Setup specified data field labels...
  @{$OptionsInfo{SpecifiedDataLabels}} = ();
  if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) {
    push @{$OptionsInfo{SpecifiedDataLabels}}, split(",", $Options{datafields});
  }

  # Setup specified data field label pairs...
  my($PairsValue) = $Options{datafieldpairs};

  @{$OptionsInfo{SpecifiedDataLabelPairs}} = ();
  $OptionsInfo{AllDataLabelPairs} = (defined($PairsValue) && $PairsValue =~ /^AllPairs$/i) ? 1 : 0;
  $OptionsInfo{CommonDataLabelPairs} = (defined($PairsValue) && $PairsValue =~ /^CommonPairs$/i) ? 1 : 0;
  if (defined($PairsValue) && !$OptionsInfo{AllDataLabelPairs} && !$OptionsInfo{CommonDataLabelPairs}) {
    my(@PairLabels) = split ",", $PairsValue;
    if (@PairLabels % 2) {
      die "Error: Invalid number of values specified using \"--datafieldpairs\" option: It must contain even number of values.\n";
    }
    push @{$OptionsInfo{SpecifiedDataLabelPairs}}, @PairLabels;
  }

}
| 998 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with default values, lets GetOptions override them using the command
# line, changes to the working directory when one is specified, and dies with an error
# message for any option value which fails its validity check.
#
sub SetupScriptUsage {

  # Default values for the options...
  %Options = (
    detail => 0,
    datafields => "Common",
    datafieldpairs => "CommonPairs",
    frequencybins => 10,
    klargest => 2,
    ksmallest => 2,
    mode => "DescriptiveStatisticsBasic",
    outdelim => "comma",
    precision => 2,
    quote => "yes",
    trimfraction => 0.1,
  );

  # Retrieve all the options...
  GetOptions(\%Options, "datafields=s", "datafieldpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s")
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  if ($Options{workingdir}) {
    (-d $Options{workingdir})
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values...
  IsInteger($Options{detail})
    or die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: >= 0\n";

  ($Options{outdelim} =~ /^(comma|semicolon|tab)$/i)
    or die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";

  ($Options{quote} =~ /^(yes|no)$/i)
    or die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";

  IsPositiveInteger($Options{precision})
    or die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";

  IsPositiveInteger($Options{klargest})
    or die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n";

  IsPositiveInteger($Options{ksmallest})
    or die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n";

  # Range is checked only for a float value...
  if (!IsFloat($Options{trimfraction}) || $Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
    die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
  }
}
| 1052 | |
| 1053 __END__ | |
| 1054 | |
| 1055 =head1 NAME | |
| 1056 | |
| 1057 AnalyzeSDFilesData.pl - Analyze numerical data field values in SDFile(s) | |
| 1058 | |
| 1059 =head1 SYNOPSIS | |
| 1060 | |
| 1061 AnalyzeSDFilesData.pl SDFile(s)... | |
| 1062 | |
| 1063 AnalyzeSDFilesData.pl [B<--datafields> "fieldlabel,[fieldlabel,...]" | All] | |
| 1064 [B<--datafieldpairs> "fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]" | AllPairs] [B<-d, --detail> infolevel] | |
| 1065 [B<-f, --fast>] [B<--frequencybins> number | "number,number,[number,...]"] | |
| 1066 [B<-h, --help>] [B<--klargest> number] [B<--ksmallest> number] | |
| 1067 [B<-m, --mode> DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]"] | |
| 1068 [B<--trimfraction> number] [B<-w, --workingdir> dirname] SDFiles(s)... | |
| 1069 | |
| 1070 =head1 DESCRIPTION | |
| 1071 | |
| 1072 Analyze numerical data field values in I<SDFile(s)> using a combination of various statistical | |
| 1073 functions; Non-numerical values are simply ignored. For I<Correlation, RSquare, and | |
| 1074 Covariance> analysis, the count of valid values in specified data field pairs must be same; | |
| 1075 otherwise, column data field pair is ignored. The file names are separated by space. The valid file | |
| 1076 extensions are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in a | |
| 1077 current directory can be specified either by I<*.sdf> or the current directory name. | |
| 1078 | |
| 1079 =head1 OPTIONS | |
| 1080 | |
| 1081 =over 4 | |
| 1082 | |
| 1083 =item B<--datafields> I<"fieldlabel,[fieldlabel,...]" | Common | All> | |
| 1084 | |
| 1085 Data fields to use for analysis. Possible values: list of comma separated data field | |
| 1086 labels, data fields common to all records, or all data fields. Default value: I<Common>. | |
| 1087 Examples: | |
| 1088 | |
| 1089 ALogP,MolWeight,EC50 | |
| 1090 "MolWeight,PSA" | |
| 1091 | |
| 1092 =item B<--datafieldpairs> I<"fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]" | CommonPairs | AllPairs> | |
| 1093 | |
| 1094 This value is mode specific and is only used for I<Correlation, Covariance, or | |
| 1095 RSquare> value of B<-m, --mode> option. It specifies data field label pairs to use | |
| 1096 for data analysis during I<Correlation>, I<Covariance>, and I<RSquare> calculations. Possible values: | |
| 1097 comma delimited list of data field label pairs, data field label pairs common to all records, | |
| 1098 or all data field pairs. Default value: I<CommonPairs>. Example: | |
| 1099 | |
| 1100 MolWeight,EC50,NumN+O,PSA | |
| 1101 | |
| 1102 For I<AllPairs> value of B<--datafieldpairs> option, all data field label pairs are used for | |
| 1103 I<Correlation> and I<Covariance> calculations. | |
| 1104 | |
| 1105 =item B<-d, --detail> I<infolevel> | |
| 1106 | |
| 1107 Level of information to print about column values being ignored. Default: I<0>. Possible values: | |
| 1108 0, 1, 2, 3, or 4. | |
| 1109 | |
| 1110 =item B<-f, --fast> | |
| 1111 | |
| 1112 In this mode, all the data field values specified for analysis are assumed to contain numerical | |
| 1113 data and no checking is performed before analysis. By default, only numerical data is | |
| 1114 used for analysis; other types of data are ignored. | |
| 1115 | |
| 1116 =item B<--frequencybins> I<number | "number,number,[number,...]"> | |
| 1117 | |
| 1118 Specify number of bins or bin range to use for frequency analysis. Default value: I<10> | |
| 1119 | |
| 1120 Number of bins value along with the smallest and largest value for a column is used to | |
| 1121 group the column values into different groups. | |
| 1122 | |
| 1123 The bin range list is used to group values for a column into different groups; It must contain | |
| 1124 values in ascending order. Examples: | |
| 1125 | |
| 1126 10,20,30 | |
| 1127 0.1,0.2,0.3,0.4,0.5 | |
| 1128 | |
| 1129 The frequency value calculated for a specific bin corresponds to all the column values | |
| 1130 which are greater than the previous bin value and less than or equal to the current bin value. | |
| 1131 | |
| 1132 =item B<-h, --help> | |
| 1133 | |
| 1134 Print this help message. | |
| 1135 | |
| 1136 =item B<--klargest> I<number> | |
| 1137 | |
| 1138 Kth largest value to find by I<KLargest> function. Default value: I<2>. Valid values: positive | |
| 1139 integers. | |
| 1140 | |
| 1141 =item B<--ksmallest> I<number> | |
| 1142 | |
| 1143 Kth smallest value to find by I<KSmallest> function. Default value: I<2>. Valid values: positive | |
| 1144 integers. | |
| 1145 | |
| 1146 =item B<-m, --mode> I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]"> | |
| 1147 | |
| 1148 Specify how to analyze data in SDFile(s): calculate basic or all descriptive statistics; or | |
| 1149 use a comma delimited list of supported statistical functions. Possible values: | |
| 1150 I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1,[function2]...">. Default | |
| 1151 value: I<DescriptiveStatisticsBasic> | |
| 1152 | |
| 1153 I<DescriptiveStatisticsBasic> includes these functions: I<Count, Maximum, Minimum, Mean, | |
| 1154 Median, Sum, StandardDeviation, StandardError, Variance>. | |
| 1155 | |
| 1156 I<DescriptiveStatisticsAll>, in addition to I<DescriptiveStatisticsBasic> functions, includes: | |
| 1157 I<GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, Mode, RSquare, | |
| 1158 Skewness, TrimMean>. | |
| 1159 | |
| 1160 I<All> uses complete list of supported functions: I<Average, AverageDeviation, Correlation, | |
| 1161 Count, Covariance, GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, | |
| 1162 Maximum, Minimum, Mean, Median, Mode, RSquare, Skewness, Sum, | |
| 1163 SumOfSquares, StandardDeviation, StandardDeviationN, StandardError, StandardScores, | |
| 1164 StandardScoresN, TrimMean, Variance, VarianceN>. The function names ending with N | |
| 1165 calculate corresponding values assuming an entire population instead of a population sample. | |
| 1166 Here are the formulas for these functions: | |
| 1167 | |
| 1168 Average: See Mean | |
| 1169 | |
| 1170 AverageDeviation: SUM( ABS(x[i] - Xmean) ) / n | |
| 1171 | |
| 1172 Correlation: See Pearson Correlation | |
| 1173 | |
| 1174 Covariance: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / n | |
| 1175 | |
| 1176 GeometricMean: NthROOT( PRODUCT(x[i]) ) | |
| 1177 | |
| 1178 HarmonicMean: 1 / ( SUM(1/x[i]) / n ) | |
| 1179 | |
| 1180 Mean: SUM( x[i] ) / n | |
| 1181 | |
| 1182 Median: Xsorted[(n - 1)/2 + 1] for odd values of n; (Xsorted[n/2] + Xsorted[n/2 + 1])/2 | |
| 1183 for even values of n. | |
| 1184 | |
| 1185 Kurtosis: [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] - | |
| 1186 {3((n - 1)^2)}/{(n - 2)(n-3)} | |
| 1187 | |
| 1188 PearsonCorrelation: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 ) | |
| 1189 (SUM( (y[i] - Ymean)^2 )) ) | |
| 1190 | |
| 1191 RSquare: PearsonCorrelation^2 | |
| 1192 | |
| 1193 Skewness: {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 } | |
| 1194 | |
| 1195 StandardDeviation: SQRT ( SUM( (x[i] - Mean)^2 ) / (n - 1) ) | |
| 1196 | |
| 1197 StandardDeviationN: SQRT ( SUM( (x[i] - Mean)^2 ) / n ) | |
| 1198 | |
| 1199 StandardError: StandardDeviation / SQRT( n ) | |
| 1200 | |
| 1201 StandardScores: (x[i] - Mean) / StandardDeviation | |
| 1202 | |
| 1203 StandardScoresN: (x[i] - Mean) / StandardDeviationN | |
| 1204 | |
| 1205 Variance: SUM( (x[i] - Xmean)^2 / (n - 1) ) | |
| 1206 | |
| 1207 VarianceN: SUM( (x[i] - Xmean)^2 / n ) | |
| 1208 | |
| 1209 =item B<-o, --overwrite> | |
| 1210 | |
| 1211 Overwrite existing files. | |
| 1212 | |
| 1213 =item B<--outdelim> I<comma | tab | semicolon> | |
| 1214 | |
| 1215 Output text file delimiter. Possible values: I<comma, tab, or semicolon> | |
| 1216 Default value: I<comma>. | |
| 1217 | |
| 1218 =item B<-p, --precision> I<number> | |
| 1219 | |
| 1220 Precision of calculated values in the output file. Default: up to I<2> decimal places. | |
| 1221 Valid values: positive integers. | |
| 1222 | |
| 1223 =item B<-q, --quote> I<yes | no> | |
| 1224 | |
| 1225 Put quotes around column values in output text file. Possible values: I<yes or | |
| 1226 no>. Default value: I<yes>. | |
| 1227 | |
| 1228 =item B<-r, --root> I<rootname> | |
| 1229 | |
| 1230 New text file name is generated using the root: <Root><Mode>.<Ext>. Default new file | |
| 1231 name: <InitialSDFileName><Mode>.<Ext>. Based on the specified analysis, | |
| 1232 <Mode> corresponds to one of these values: DescriptiveStatisticsBasic, | |
| 1233 DescriptiveStatisticsAll, AllStatistics, SpecifiedStatistics, Covariance, Correlation, | |
| 1234 Frequency, or StandardScores. The csv, and tsv <Ext> values are used for | |
| 1235 comma/semicolon, and tab delimited text files respectively. This option is ignored for | |
| 1236 multiple input files. | |
| 1237 | |
| 1238 =item B<--trimfraction> I<number> | |
| 1239 | |
| 1240 Fraction of data to exclude from the top and bottom of the data set during | |
| 1241 I<TrimMean> calculation. Default value: I<0.1> Valid values: > 0 and < 1. | |
| 1242 | |
| 1243 =item B<-w, --workingdir> I<text> | |
| 1244 | |
| 1245 Location of working directory. Default: current directory. | |
| 1246 | |
| 1247 =back | |
| 1248 | |
| 1249 =head1 EXAMPLES | |
| 1250 | |
| 1251 To calculate basic statistics for data in all common data fields and generate a | |
| 1252 NewSample1DescriptiveStatisticsBasic.csv file, type: | |
| 1253 | |
| 1254 % AnalyzeSDFilesData.pl -o -r NewSample1 Sample1.sdf | |
| 1255 | |
| 1256 To calculate basic statistics for MolWeight data field and generate a | |
| 1257 NewSample1DescriptiveStatisticsBasic.csv file, type: | |
| 1258 | |
| 1259 % AnalyzeSDFilesData.pl --datafields MolWeight -o -r NewSample1 | |
| 1260 Sample1.sdf | |
| 1261 | |
| 1262 To calculate all available statistics for MolWeight data field and all data field pairs, | |
| 1263 and generate NewSample1DescriptiveStatisticsAll.csv, NewSample1CorrelationMatrix.csv, | |
| 1264 NewSample1CorrelationMatrix.csv, and NewSample1MolWeightFrequencyAnalysis.csv | |
| 1265 files, type: | |
| 1266 | |
| 1267 % AnalyzeSDFilesData.pl -m DescriptiveStatisticsAll --datafields | |
| 1268 MolWeight -o --datafieldpairs AllPairs -r NewSample1 Sample1.sdf | |
| 1269 | |
| 1270 To compute frequency distribution of MolWeight data field into five bins and | |
| 1271 generate NewSample1MolWeightFrequencyAnalysis.csv, type: | |
| 1272 | |
| 1273 % AnalyzeSDFilesData.pl -m Frequency --frequencybins 5 --datafields | |
| 1274 MolWeight -o -r NewSample1 Sample1.sdf | |
| 1275 | |
| 1276 To compute frequency distribution of data in MolWeight data field into specified bin range | |
| 1277 values, and generate NewSample1MolWeightFrequencyAnalysis.csv, type: | |
| 1278 | |
| 1279 % AnalyzeSDFilesData.pl -m Frequency --frequencybins "100,200,400" | |
| 1280 --datafields MolWeight -o -r NewSample1 Sample1.sdf | |
| 1281 | |
| 1282 To calculate all available statistics for data in all data fields and pairs, type: | |
| 1283 | |
| 1284 % AnalyzeSDFilesData.pl -m All --datafields All --datafieldpairs | |
| 1285 AllPairs -o -r NewSample1 Sample1.sdf | |
| 1286 | |
| 1287 =head1 AUTHOR | |
| 1288 | |
| 1289 Manish Sud <msud@san.rr.com> | |
| 1290 | |
| 1291 =head1 SEE ALSO | |
| 1292 | |
| 1293 FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl | |
| 1294 | |
| 1295 =head1 COPYRIGHT | |
| 1296 | |
| 1297 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 1298 | |
| 1299 This file is part of MayaChemTools. | |
| 1300 | |
| 1301 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 1302 the terms of the GNU Lesser General Public License as published by the Free | |
| 1303 Software Foundation; either version 3 of the License, or (at your option) | |
| 1304 any later version. | |
| 1305 | |
| 1306 =cut |
