comparison mayachemtools/bin/AnalyzeSDFilesData.pl @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:73ae111cf86f
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: AnalyzeSDFilesData.pl,v $
4 # $Date: 2015/02/28 20:46:04 $
5 # $Revision: 1.27 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability or fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use SDFileUtil;
37 use TextUtil;
38 use StatisticsUtil;
39
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method-call syntax is used instead of the indirect object form
# ("new Benchmark"), which the parser can misinterpret.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# File-level list of input files; the subroutines below read it directly.
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sd sdf");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();
ProcessSDFilesDataLabelsInfo();

# Generate output files...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Files rejected during the checks above have FileOkay cleared and are skipped.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    AnalyzeSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
85
86 ###############################################################################
87
# Analyze data...
#
# Reads the SD file at position $Index of @SDFilesList, collects the values of
# every data field label listed in $SDFilesInfo{UniqueDataLabelsToAnalyze} and
# passes them to the analysis routines selected through %OptionsInfo.
#
# Arguments:
#   $Index - position of the SD file in @SDFilesList.
#
# Dies when the SD file can't be opened.
sub AnalyzeSDFile {
  my($Index) = @_;
  my($SDFile, $DataLabel, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);

  $SDFile = $SDFilesList[$Index];
  @DataLabelsToAnalyze = @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]};

  # Start every label with an empty value list so that labels missing from all
  # compound records are still present in the map.
  %DataFieldValuesToAnalyzeMap = map { $_ => [] } @DataLabelsToAnalyze;

  # Collect appropriate data field label values for analysis...
  my($CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels);

  # Three-argument open: the file name is never parsed for mode characters.
  open SDFILE, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  $CmpdCount = 0;
  $InvalidCmpdCount = 0;
  while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @InvalidCmpdDataLabels = ();
    DATALABEL: for $DataLabel (@DataLabelsToAnalyze) {
      next DATALABEL unless exists $DataFieldValues{$DataLabel};
      $DataValue = $DataFieldValues{$DataLabel};
      # Values are validated only when data checking is turned on.
      if ($OptionsInfo{CheckData} && !IsNumerical($DataValue)) {
        push @InvalidCmpdDataLabels, $DataLabel;
        next DATALABEL;
      }
      push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
    }
    if (@InvalidCmpdDataLabels) {
      $InvalidCmpdCount++;
      # The higher the detail level, the more of the offending record is shown.
      if ($OptionsInfo{DetailLevel} >= 4) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed:\n$CmpdString \n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 3) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed...\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field to be analyzed...\n";
      }
    }
  }
  if ($InvalidCmpdCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
  }
  close SDFILE;

  # Perform the analysis...
  my(@SpecifiedFunctionNames);

  # Functions handled by the dedicated routines below are excluded here; the
  # remaining ones are mapped to the names stored in the functions map.
  @SpecifiedFunctionNames =
    map { $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($_)} }
    grep { $_ !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i }
    @{$OptionsInfo{SpecifiedStatisticalFunctions}};

  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
    if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
      PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
    PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
    PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }

}
170
# Calculate values for various statistical functions...
#
# Writes a text file containing one row per data field label and one column
# per requested statistical function.
#
# Arguments:
#   $Index                     - position of the SD file in @SDFilesList.
#   $SpecifiedFunctionNamesRef - reference to the list of function names to invoke.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping data field labels
#                                to references of their collected values.
#
# Dies when the output file can't be opened.
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $Line, $SpecifiedFunction, $Label, @ColLabels, @DataLabelsToAnalyze);

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $SDFilesInfo{NewTextFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  # Three-argument open: the file name is never parsed for mode characters.
  open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "DataLabel";
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # Turn, for example, KLargest with k = 3 into a label built from the
      # suffixed number and the function name without its "K".
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$Line\n";

  # Go over each column to be analyzed...
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};

  # Statistical functions are invoked through their name strings, which needs
  # symbolic references; the remaining "strict" checks stay in force.
  no strict 'refs';

  my($DataValuesRef, $DataLabel, $Value, @RowValues, %CalculatedValues);
  %CalculatedValues = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $DataLabel;
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!@{$DataValuesRef}) {
        # Invalid column values...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = scalar(@{$DataValuesRef});
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KLargest});
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KSmallest});
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        # Standard deviation is cached as StandardError needs it as well.
        if (exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
        }
        else {
          $Value = &$SpecifiedFunction($DataValuesRef);
          $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
        }
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $Value = StandardDeviation($DataValuesRef);
          $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
        }
        if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
          # NOTE(review): the data values are passed as a flattened list after
          # the standard deviation. Confirm that StandardError expects this
          # rather than the number of values.
          $Value = &$SpecifiedFunction($CalculatedValues{$DataLabel}{StandardDeviation}, @{$DataValuesToAnalyzeMapRef->{$DataLabel}});
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{TrimFraction});
      }
      else {
        $Value = &$SpecifiedFunction($DataValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$Line\n";
  }
  close NEWTEXTFILE;
}
263
# Calculate covariance, correlation, rsquare for specified data field label pairs....
#
# Writes a single text file with one row per data field label pair; the
# columns present depend on which of correlation, rsquare and covariance were
# requested. Correlation is always written when rsquare is requested.
#
# Arguments:
#   $Index                     - position of the SD file in @SDFilesList.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping data field labels
#                                to references of their collected values.
#
# Dies when the output file can't be opened.
sub PerformDataLabelPairAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "DataFieldPairsAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  # Three-argument open: the file name is never parsed for mode characters.
  open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out the column labels...
  @ColLabels = ("DataLabel1", "DataLabel2");
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @ColLabels, "Correlation";
    if ($CalculateRSquare) {
      push @ColLabels, "RSquare";
    }
  }
  if ($CalculateCovariance) {
    push @ColLabels, "Covariance";
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$Line\n";

  # Format a calculated value using the specified precision; adding zero gets
  # rid of trailing zeros. Missing values are written out as empty strings.
  my $FormatValue = sub {
    my($RawValue) = @_;
    return (defined($RawValue) && length($RawValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0) : "";
  };

  # Go over each data field pair...
  my($CorrelationValue, $CovarianceValue, $LabelIndex, $DataLabel1, $DataLabel2, $DataValues1, $DataValues2, @DataLabelPairs1ToAnalyze, @DataLabelPairs2ToAnalyze, @RowValues);

  @DataLabelPairs1ToAnalyze = @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]};
  @DataLabelPairs2ToAnalyze = @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]};
  for $LabelIndex (0 .. $#DataLabelPairs1ToAnalyze) {
    $DataLabel1 = $DataLabelPairs1ToAnalyze[$LabelIndex];
    $DataLabel2 = $DataLabelPairs2ToAnalyze[$LabelIndex];
    $DataValues1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
    $DataValues2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};

    # Setup column ids...
    @RowValues = ($DataLabel1, $DataLabel2);

    if (@$DataValues1 != @$DataValues2) {
      # Print a warning and write out empty values for all requested columns...
      warn "Warning: Skipping analysis for data field pair $DataLabel1, $DataLabel2: Number of valid data values must be same.\n";
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @RowValues, "";
        if ($CalculateRSquare) {
          push @RowValues, "";
        }
      }
      if ($CalculateCovariance) {
        push @RowValues, "";
      }
    }
    else {
      # Calculate appropriate value...
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($DataValues1, $DataValues2);
        push @RowValues, $FormatValue->($CorrelationValue);
        if ($CalculateRSquare) {
          # R-square is derived from the unrounded correlation value.
          push @RowValues, (defined($CorrelationValue) && length($CorrelationValue)) ? $FormatValue->($CorrelationValue ** 2) : "";
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValues1, $DataValues2);
        push @RowValues, $FormatValue->($CovarianceValue);
      }
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$Line\n";
  }
  close NEWTEXTFILE;
}
344
# Generate histogram numbers...
#
# Writes one text file per data field label containing two columns: the bin
# value and the frequency reported for it by Frequency(). Bins come either
# from the specified bin range or from the specified number of bins.
#
# Arguments:
#   $Index                     - position of the SD file in @SDFilesList.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping data field labels
#                                to references of their collected values.
#
# Dies when an output file can't be opened.
sub PerformFrequencyAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, @ColLabels, @RowValues, $Line, $DataLabel, @DataLabelsToAnalyze, $DataValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);

  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $DataLabel . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
    print "Generating new text file $NewTextFile...\n";
    # Three-argument open: the file name embeds a data field label read from
    # the SD file and must never be parsed for mode characters.
    open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$Line\n";

    # Calculate and write out frequency values...
    %FrequencyMap = ();
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (@$DataValuesRef) {
      # An explicit bin range takes precedence over the number of bins.
      if (@{$OptionsInfo{BinRange}}) {
        %FrequencyMap = Frequency($DataValuesRef, \@{$OptionsInfo{BinRange}});
      }
      else {
        %FrequencyMap = Frequency($DataValuesRef, $OptionsInfo{NumOfBins});
      }
    }
    # Bins are written out in ascending numerical order.
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      # Adding zero to the formatted values gets rid of trailing zeros.
      @RowValues = ();
      $Value = (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
      push @RowValues, $Value;
      $Value = (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";
      push @RowValues, $Value;

      $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print NEWTEXTFILE "$Line\n";
    }
    close NEWTEXTFILE;
  }
}
388
# Calculate covariance, correlation/rsquare matrices....
#
# Writes up to three text files - correlation, rsquare and covariance matrix -
# for either all or the common data field labels of a SD file. The correlation
# matrix is always written when rsquare is requested.
#
# Arguments:
#   $Index                     - position of the SD file in @SDFilesList.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping data field labels
#                                to references of their collected values.
#
# Dies when an output file can't be opened.
sub PerformMatrixAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $CorrelationTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CorrelationMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $RSquareTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "RSquareMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $CovarianceTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CovarianceMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];

  # Report exactly those files which are about to be generated...
  my(@TextFiles);
  @TextFiles = ();
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @TextFiles, $CorrelationTextFile;
    if ($CalculateRSquare) {
      push @TextFiles, $RSquareTextFile;
    }
  }
  if ($CalculateCovariance) {
    push @TextFiles, $CovarianceTextFile;
  }
  print "Generating new text file", ((@TextFiles > 1) ? "s" : ""), " ", join(", ", @TextFiles), "...\n";

  # Three-argument open: the file names are never parsed for mode characters.
  if ($CalculateCorrelation || $CalculateRSquare) {
    open CORRELATIONTEXTFILE, ">", $CorrelationTextFile or die "Error: Can't open $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      open RSQUARETEXTFILE, ">", $RSquareTextFile or die "Error: Can't open $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    open COVARIANCETEXTFILE, ">", $CovarianceTextFile or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $RawCorrelationValue, $HasCorrelationValue, $CorrelationValue, $RSquareValue, $CovarianceValue, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues, @DataLabelsToAnalyze);

  # Labels making up the rows and columns of the matrices...
  @DataLabelsToAnalyze = $OptionsInfo{AllDataLabelPairs} ? @{$SDFilesInfo{AllDataLabels}[$Index]} : @{$SDFilesInfo{CommonDataLabels}[$Index]};

  # Write out the column labels. These are the labels being analyzed, so that
  # the header agrees with the matrix rows for common data labels as well...
  @ColLabels = @DataLabelsToAnalyze;
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  if ($CalculateCorrelation || $CalculateRSquare) {
    print CORRELATIONTEXTFILE "$Line\n";
    if ($CalculateRSquare) {
      print RSQUARETEXTFILE "$Line\n";
    }
  }
  if ($CalculateCovariance) {
    print COVARIANCETEXTFILE "$Line\n";
  }

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2);

  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();

  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
    for $LabelIndex2 (0 .. $LabelIndex1) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      $DataValuesRef1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
      $DataValuesRef2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
      if ($CalculateCorrelation || $CalculateRSquare) {
        $RawCorrelationValue = Correlation($DataValuesRef1, $DataValuesRef2);
        $HasCorrelationValue = (defined($RawCorrelationValue) && length($RawCorrelationValue)) ? 1 : 0;
        # Adding zero to the formatted value gets rid of trailing zeros.
        $CorrelationValue = $HasCorrelationValue ? (sprintf("%.$OptionsInfo{Precision}f", $RawCorrelationValue) + 0) : "";
        $CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $CorrelationValue;
        $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $CorrelationValue;
        if ($CalculateRSquare) {
          # Square the unrounded correlation to avoid compounding rounding errors.
          $RSquareValue = $HasCorrelationValue ? (sprintf("%.$OptionsInfo{Precision}f", $RawCorrelationValue ** 2) + 0) : "";
          $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $RSquareValue;
          $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $RSquareValue;
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValuesRef1, $DataValuesRef2);
        $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
        $CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $CovarianceValue;
        $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $CovarianceValue;
      }
    }
  }

  # Write out the matrices...
  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];

    # Each row starts with its own data label...
    @CorrelationRowValues = ($DataLabel1);
    @RSquareRowValues = ($DataLabel1);
    @CovarianceRowValues = ($DataLabel1);

    for $LabelIndex2 (0 .. $#DataLabelsToAnalyze) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
        if ($CalculateRSquare) {
          push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
        }
      }
      if ($CalculateCovariance) {
        push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
      }
    }
    if ($CalculateCorrelation || $CalculateRSquare) {
      $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print CORRELATIONTEXTFILE "$Line\n";
      if ($CalculateRSquare) {
        $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print RSQUARETEXTFILE "$Line\n";
      }
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print COVARIANCETEXTFILE "$Line\n";
    }
  }
  if ($CalculateCorrelation || $CalculateRSquare) {
    close CORRELATIONTEXTFILE;
    if ($CalculateRSquare) {
      close RSQUARETEXTFILE;
    }
  }
  if ($CalculateCovariance) {
    close COVARIANCETEXTFILE;
  }
}
537
# Calculate standard scores...
#
# Writes a text file containing, for each compound record in the SD file, the
# standard score of every data field value to be analyzed: one column per data
# field label and requested score type (StandardScores and/or StandardScoresN).
#
# Arguments:
#   $Index                     - position of the SD file in @SDFilesList.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping data field labels
#                                to references of their collected values.
#
# Dies when the output file or the SD file can't be opened.
sub PerformStandardScoresAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $NewLine);

  $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "StandardScores." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  # Three-argument open: the file name is never parsed for mode characters.
  open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  my($DataLabel, @DataLabelsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    if ($StandardScores) {
      push @ColLabels, "${DataLabel}\(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${DataLabel}\(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values...
  my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (!exists($MeanMap{$DataLabel})) {
      $MeanMap{$DataLabel} = Mean($DataValuesRef);
    }
    if ($StandardScores && !exists($StandardDeviationMap{$DataLabel})) {
      $StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
    }
    if ($StandardScoresN && !exists($StandardDeviationNMap{$DataLabel})) {
      $StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
    }
  }

  # Calculate a formatted standard score, (x[i] - mean) / deviation. An empty
  # string is returned when mean or deviation is unavailable or the deviation
  # is zero, instead of dying with a division by zero error. Adding zero to
  # the formatted value gets rid of trailing zeros.
  my $CalculateScore = sub {
    my($DataValue, $Mean, $Deviation) = @_;
    if (!defined($Mean) || !defined($Deviation) || $Deviation == 0) {
      return "";
    }
    return sprintf("%.$OptionsInfo{Precision}f", ($DataValue - $Mean)/$Deviation) + 0;
  };

  # Go over each data field and calculate standard scores for each column
  # using the standard deviation for StandardScores and the standard
  # deviation N for StandardScoresN; write out the calculated values as well...
  my($SDFile, $Value, $ValueOkay, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
  $SDFile = $SDFilesList[$Index];

  open SDFILE, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @RowValues = ();
    for $DataLabel (@DataLabelsToAnalyze) {
      $Value = exists($DataFieldValues{$DataLabel}) ? $DataFieldValues{$DataLabel} : "";
      # Values are validated only when data checking is turned on.
      $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;
      if ($StandardScores) {
        push @RowValues, $ValueOkay ? $CalculateScore->($Value, $MeanMap{$DataLabel}, $StandardDeviationMap{$DataLabel}) : "";
      }
      if ($StandardScoresN) {
        push @RowValues, $ValueOkay ? $CalculateScore->($Value, $MeanMap{$DataLabel}, $StandardDeviationNMap{$DataLabel}) : "";
      }
    }
    $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$NewLine\n";
  }
  close SDFILE;
  close NEWTEXTFILE;

}
624
# Make sure the specified data field labels exists in SD files...
#
# For every usable SD file, work out which data field labels and which label
# pairs are to be analyzed and record them in %SDFilesInfo. A file gets its
# FileOkay flag cleared when none of the labels to analyze is available, when
# one of its frequency output files already exists while overwriting is off,
# or when an odd number of pair labels has been collected.
sub ProcessSDFilesDataLabelsInfo {
  my($FileNum, $Position, $FileName, $Label, $InfoKey, @Labels, %SeenLabels);

  my @InfoKeys = qw(DataLabelsToAnalyze DataLabelPairs1ToAnalyze DataLabelPairs2ToAnalyze UniqueDataLabelsToAnalyze);
  for $InfoKey (@InfoKeys) {
    @{$SDFilesInfo{$InfoKey}} = ();
  }

  # Pairs are set up only when a pairwise statistical function is requested.
  my $PairsNeeded = (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;

  FILELIST: for $FileNum (0 .. $#SDFilesList) {
    $FileName = $SDFilesList[$FileNum];

    for $InfoKey (@InfoKeys) {
      @{$SDFilesInfo{$InfoKey}[$FileNum]} = ();
    }
    %SeenLabels = ();

    next FILELIST unless $SDFilesInfo{FileOkay}[$FileNum];

    # Explicitly specified labels win; otherwise use all or common labels...
    if (@{$OptionsInfo{SpecifiedDataLabels}}) {
      @Labels = grep { exists($SDFilesInfo{AllDataLabelsMap}[$FileNum]{$_}) } @{$OptionsInfo{SpecifiedDataLabels}};
    }
    elsif (defined($OptionsInfo{DataFields}) && $OptionsInfo{DataFields} =~ /^All$/i) {
      @Labels = @{$SDFilesInfo{AllDataLabels}[$FileNum]};
    }
    else {
      @Labels = @{$SDFilesInfo{CommonDataLabels}[$FileNum]};
    }

    unless (@Labels) {
      warn "Warning: Ignoring file $FileName: None of the data field labels specified, @{$OptionsInfo{SpecifiedDataLabels}}, using \"--datafields\" option exist.\n";
      $SDFilesInfo{FileOkay}[$FileNum] = 0;
      next FILELIST;
    }
    push @{$SDFilesInfo{DataLabelsToAnalyze}[$FileNum]}, @Labels;

    # Set up unique data field label map as well...
    @SeenLabels{@Labels} = @Labels;

    if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
      # Make sure specific frequency files don't exist...
      for $Label (@Labels) {
        my $FrequencyFile = $SDFilesInfo{NewTextFileRoot}[$FileNum] . $SDFilesInfo{AllDataLabelsMap}[$FileNum]{$Label} . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$FileNum];
        next unless -e $FrequencyFile;
        warn "Warning: Ignoring file $FileName: The file $FrequencyFile already exists.\n";
        $SDFilesInfo{FileOkay}[$FileNum] = 0;
        next FILELIST;
      }
    }

    # Setup specified data field label pairs...
    if ($PairsNeeded) {
      my(@Pairs);
      if (@{$OptionsInfo{SpecifiedDataLabelPairs}}) {
        # Make sure both data field labels exist...
        my @Requested = @{$OptionsInfo{SpecifiedDataLabelPairs}};
        $Position = 0;
        while (($Position + 1) < @Requested) {
          my($First, $Second) = @Requested[$Position, $Position + 1];
          if (exists($SDFilesInfo{AllDataLabelsMap}[$FileNum]{$First}) && exists($SDFilesInfo{AllDataLabelsMap}[$FileNum]{$Second})) {
            push @Pairs, ($First, $Second);
          }
          $Position += 2;
        }
      }
      else {
        # Every label is paired with every label, itself included...
        my @PairLabels = $OptionsInfo{AllDataLabelPairs} ? @{$SDFilesInfo{AllDataLabels}[$FileNum]} : @{$SDFilesInfo{CommonDataLabels}[$FileNum]};
        @Pairs = map { my $Left = $_; map { ($Left, $_) } @PairLabels } @PairLabels;
      }

      if (@Pairs) {
        if (@Pairs % 2) {
          warn "Warning: Ignoring file $FileName: Invalid number values specified using \"--datafieldpairs\" option: It must contain even number of valid values.\n";
          $SDFilesInfo{FileOkay}[$FileNum] = 0;
          next FILELIST;
        }
        # Split the flat list into first and second members of each pair...
        $Position = 0;
        while ($Position < @Pairs) {
          push @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$FileNum]}, $Pairs[$Position];
          push @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$FileNum]}, $Pairs[$Position + 1];
          $Position += 2;
        }
        # Set up unique data field label map as well...
        @SeenLabels{@Pairs} = @Pairs;
      }
    }

    # Setup unique data field label array...
    push @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$FileNum]}, (sort keys %SeenLabels);
  }
}
738
# Retrieve information about input SD files...
#
# Goes through the files in @SDFilesList and sets up %SDFilesInfo. Each value in
# %SDFilesInfo is an array indexed by the position of the file in @SDFilesList:
#
#   FileOkay         - 1 for a file which passed all the checks below; otherwise, 0
#   CmpdCount        - First value returned by GetAllAndCommonCmpdDataHeaderLabels
#   NewTextFileRoot  - Root used to name the new text files
#   NewTextFileExt   - Extension of the new text files: csv or tsv
#   AllDataLabels    - Second value returned by GetAllAndCommonCmpdDataHeaderLabels
#   AllDataLabelsMap - Hash with each label in AllDataLabels mapped to itself
#   CommonDataLabels - Third value returned by GetAllAndCommonCmpdDataHeaderLabels
#
# A file is ignored, after a warning, when it doesn't exist, is not a SD file,
# can't be opened, or - unless overwriting is turned on - one of the text files
# to be generated for it already exists.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFile, $OutFileRoot, $OutFileExt);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileRoot}} = ();
  @{$SDFilesInfo{NewTextFileExt}} = ();

  # Keys which are actually filled in for each file below...
  @{$SDFilesInfo{AllDataLabels}} = ();
  @{$SDFilesInfo{AllDataLabelsMap}} = ();
  @{$SDFilesInfo{CommonDataLabels}} = ();

  # Keys set up by earlier versions of this function; kept so that any code
  # looking at them still finds empty lists...
  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{AllDataFieldLabelsMap}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until the file has been checked and read...
    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "";
    $SDFilesInfo{NewTextFileExt}[$Index] = "";

    @{$SDFilesInfo{AllDataLabels}[$Index]} = ();
    %{$SDFilesInfo{AllDataLabelsMap}[$Index]} = ();
    @{$SDFilesInfo{CommonDataLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new text files...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    if ($Options{root} && (@SDFilesList == 1)) {
      # Root is only used for a single input file; drop any extension specified with it...
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $FileName = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    $OutFileRoot = $FileName;
    $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";

    if (!$OptionsInfo{Overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
        next FILELIST;
      }

      # Collect names of the additional text files for the specified functions. These
      # names use the extension of the new text files ($OutFileExt, stored as
      # NewTextFileExt below) and not the extension of the input SD file...
      my($FunctionsMapRef, $FunctionName, $OtherOutFile, @OtherOutFiles);

      $FunctionsMapRef = \%{$OptionsInfo{SpecifiedStatisticalFunctionsMap}};
      @OtherOutFiles = ();

      if ($OptionsInfo{AllDataLabelPairs}) {
        for $FunctionName (qw(Covariance Correlation RSquare)) {
          if (exists $FunctionsMapRef->{lc($FunctionName)}) {
            push @OtherOutFiles, "${OutFileRoot}${FunctionName}Matrix.${OutFileExt}";
          }
        }
      }
      elsif (grep { exists $FunctionsMapRef->{$_} } qw(covariance correlation rsquare)) {
        push @OtherOutFiles, "${OutFileRoot}ColumnPairsAnalysis.${OutFileExt}";
      }
      if (exists $FunctionsMapRef->{standardscores}) {
        push @OtherOutFiles, "${OutFileRoot}StandardScores.${OutFileExt}";
      }

      for $OtherOutFile (@OtherOutFiles) {
        if (-e $OtherOutFile) {
          warn "Warning: Ignoring file $SDFile: The file $OtherOutFile already exists.\n";
          next FILELIST;
        }
      }
    }

    # Three argument open keeps any special characters in the file name from
    # being interpreted as an open mode...
    my($SDFileHandle);
    if (!open $SDFileHandle, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CmpdCount, $Label, $DataFieldLabelsRef, $CommonDataFieldLabelsRef);
    ($CmpdCount, $DataFieldLabelsRef, $CommonDataFieldLabelsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
    close $SDFileHandle;

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "$OutFileRoot";
    $SDFilesInfo{NewTextFileExt}[$Index] = "$OutFileExt";

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
    push @{$SDFilesInfo{AllDataLabels}[$Index]}, @{$DataFieldLabelsRef};
    push @{$SDFilesInfo{CommonDataLabels}[$Index]}, @{$CommonDataFieldLabelsRef};
    for $Label (@{$DataFieldLabelsRef}) {
      $SDFilesInfo{AllDataLabelsMap}[$Index]{$Label} = $Label;
    }
  }
}
857
# Process option values...
#
# Uses the command line option values in %Options to set up %OptionsInfo, which
# holds the processed values: list and map of statistical functions to use,
# output delimiter and quote settings, frequency bin values, and specified
# data labels and data label pairs.
#
# Dies with an error message for an invalid value of "-m --mode",
# "--frequencybins", or "--datafieldpairs" option.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{DataFields} = $Options{datafields};
  $OptionsInfo{DetailLevel} = $Options{detail};

  # Setup supported statistical functions along with a map from lower case
  # function names to function names...
  my($SupportedFunction, @SupportedStatisticalFunctions, %SupportedStatisticalFunctionsMap);

  @SupportedStatisticalFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);

  %SupportedStatisticalFunctionsMap = ();
  for $SupportedFunction (@SupportedStatisticalFunctions) {
    $SupportedStatisticalFunctionsMap{lc($SupportedFunction)} = $SupportedFunction;
  }

  # Setup a list of functions to use for analysis...
  my($SpecifiedFunction);

  %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = ();
  @{$OptionsInfo{SpecifiedStatisticalFunctions}} = ();

  # Check mode values...
  if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
  }
  elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum);
  }
  elsif ($Options{mode} =~ /^All$/i ) {
    $OptionsInfo{FileNameMode} = "AllStatistics";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @SupportedStatisticalFunctions;
  }
  else {
    $OptionsInfo{FileNameMode} = "SpecifiedStatistics";

    # Comma delimited list of functions...
    my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions);

    $Mode = $Options{mode};
    $Mode =~ s/ //g;
    @SpecifiedFunctions = split ",", $Mode;

    # An empty list - for example, "" or "," - would otherwise run without
    # performing any analysis...
    if (!@SpecifiedFunctions) {
      warn "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid: It doesn't contain any function names.\n";
      die "Allowed values: ", JoinWords(\@SupportedStatisticalFunctions, ", ", 0), "\n";
    }

    @UnsupportedSpecifiedFunctions = ();
    for $SpecifiedFunction (@SpecifiedFunctions) {
      if (exists($SupportedStatisticalFunctionsMap{lc($SpecifiedFunction)})) {
        push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $SpecifiedFunction;
      }
      else {
        push @UnsupportedSpecifiedFunctions, $SpecifiedFunction;
      }
    }
    if (@UnsupportedSpecifiedFunctions) {
      if (@UnsupportedSpecifiedFunctions > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n";
      }
      die "Allowed values: ", JoinWords(\@SupportedStatisticalFunctions, ", ", 0), "\n";
    }
  }

  # Map lower case name of each specified function to its supported name; a
  # function specified multiple times is mapped only once...
  FUNCTION: for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} ) {
      next FUNCTION;
    }
    $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} = $SupportedStatisticalFunctionsMap{lc($SpecifiedFunction)};
  }

  # Setup delimiter and quotes...
  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;

  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{Root} = $Options{root};

  # Setup miscellaneous options...
  $OptionsInfo{CheckData} = $Options{fast} ? 0 : 1;
  $OptionsInfo{Precision} = $Options{precision};

  $OptionsInfo{KLargest} = $Options{klargest};
  $OptionsInfo{KSmallest} = $Options{ksmallest};

  $OptionsInfo{TrimFraction} = $Options{trimfraction};

  # Setup frequency bin values: either a list of bin values or number of bins...
  $OptionsInfo{NumOfBins} = 10;
  @{$OptionsInfo{BinRange}} = ();
  if ($Options{frequencybins} =~ /\,/) {
    my($BinValue, $BinIndex, @SpecifiedBinRange);

    @SpecifiedBinRange = split /\,/, $Options{frequencybins};
    if (@SpecifiedBinRange < 2) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
    }
    for $BinValue (@SpecifiedBinRange) {
      if (!IsNumerical($BinValue)) {
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
      }
    }
    # Each value being larger than the previous one is enough to make sure that the
    # whole list is in ascending order...
    for $BinIndex (1 .. $#SpecifiedBinRange) {
      if ($SpecifiedBinRange[$BinIndex - 1] >= $SpecifiedBinRange[$BinIndex]) {
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
      }
    }
    push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange;
  }
  else {
    $OptionsInfo{NumOfBins} = $Options{frequencybins};
    if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
    }
  }

  # Setup specified data field labels...
  @{$OptionsInfo{SpecifiedDataLabels}} = ();
  if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) {
    push @{$OptionsInfo{SpecifiedDataLabels}}, (split ",", $Options{datafields});
  }

  # Setup specified data field label pairs...
  @{$OptionsInfo{SpecifiedDataLabelPairs}} = ();
  $OptionsInfo{AllDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^AllPairs$/i) ? 1 : 0;
  $OptionsInfo{CommonDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^CommonPairs$/i) ? 1 : 0;
  if (defined($Options{datafieldpairs}) && !$OptionsInfo{AllDataLabelPairs} && !$OptionsInfo{CommonDataLabelPairs}) {
    my(@SpecifiedValues) = split ",", $Options{datafieldpairs};
    if (@SpecifiedValues % 2) {
      die "Error: Invalid number of values specified using \"--datafieldpairs\" option: It must contain even number of values.\n";
    }
    push @{$OptionsInfo{SpecifiedDataLabelPairs}}, @SpecifiedValues;
  }

}
998
# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, updates it using the command line
# options retrieved by GetOptions, changes to the working directory when one is
# specified, and dies with an error message for an invalid option value.
#
sub SetupScriptUsage {

  # Setup default values for options...
  %Options = ();
  $Options{detail} = 0;
  $Options{datafields} = "Common";
  $Options{datafieldpairs} = "CommonPairs";
  $Options{frequencybins} = 10;
  $Options{klargest} = 2;
  $Options{ksmallest} = 2;
  $Options{mode} = "DescriptiveStatisticsBasic";
  $Options{outdelim} = "comma";
  $Options{precision} = 2;
  $Options{quote} = "yes";
  $Options{trimfraction} = 0.1;

  # Retrieve all the options...
  if (!GetOptions(\%Options, "datafields=s", "datafieldpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Check for a specified value instead of a true value to handle a directory named 0...
  if (defined($Options{workingdir}) && length($Options{workingdir})) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Negative values are explicitly rejected to match the allowed values listed in
  # the error message...
  if (!IsInteger($Options{detail}) || $Options{detail} < 0) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: >= 0\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{klargest})) {
    die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{ksmallest})) {
    die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n";
  }
  # The numerical comparisons are performed only after the value passes the float check...
  if (!IsFloat($Options{trimfraction}) || $Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
    die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
  }
}
1052
1053 __END__
1054
1055 =head1 NAME
1056
1057 AnalyzeSDFilesData.pl - Analyze numerical data field values in SDFile(s)
1058
1059 =head1 SYNOPSIS
1060
1061 AnalyzeSDFilesData.pl SDFile(s)...
1062
1063 AnalyzeSDFilesData.pl [B<--datafields> "fieldlabel,[fieldlabel,...]" | Common | All]
1064 [B<--datafieldpairs> "fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]" | CommonPairs | AllPairs] [B<-d, --detail> infolevel]
1065 [B<-f, --fast>] [B<--frequencybins> number | "number,number,[number,...]"]
1066 [B<-h, --help>] [B<--klargest> number] [B<--ksmallest> number]
1067 [B<-m, --mode> DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]"]
1068 [B<-o, --overwrite>] [B<--outdelim> comma | tab | semicolon] [B<-p, --precision> number] [B<-q, --quote> yes | no] [B<-r, --root> rootname] [B<--trimfraction> number] [B<-w, --workingdir> dirname] SDFile(s)...
1069
1070 =head1 DESCRIPTION
1071
1072 Analyze numerical data field values in I<SDFile(s)> using a combination of various statistical
1073 functions; non-numerical values are simply ignored. For I<Correlation, RSquare, and
1074 Covariance> analysis, the count of valid values in specified data field pairs must be the same;
1075 otherwise, the data field pair is ignored. The file names are separated by spaces. The valid file
1076 extensions are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in the
1077 current directory can be specified either by I<*.sdf> or the current directory name.
1078
1079 =head1 OPTIONS
1080
1081 =over 4
1082
1083 =item B<--datafields> I<"fieldlabel,[fieldlabel,...]" | Common | All>
1084
1085 Data fields to use for analysis. Possible values: list of comma separated data field
1086 labels, data fields common to all records, or all data fields. Default value: I<Common>.
1087 Examples:
1088
1089 ALogP,MolWeight,EC50
1090 "MolWeight,PSA"
1091
1092 =item B<--datafieldpairs> I<"fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]" | CommonPairs | AllPairs>
1093
1094 This value is mode specific and is only used for I<Correlation, RSquare, or
1095 Covariance> values of B<-m, --mode> option. It specifies data field label pairs to use
1096 for data analysis during I<Correlation>, I<RSquare>, and I<Covariance> calculations. Possible values:
1097 comma delimited list of data field label pairs, data field label pairs common to all records,
1098 or all data field pairs. Default value: I<CommonPairs>. Example:
1099
1100 MolWeight,EC50,NumN+O,PSA
1101
1102 For I<AllPairs> value of B<--datafieldpairs> option, all data field label pairs are used for
1103 I<Correlation> and I<Covariance> calculations.
1104
1105 =item B<-d, --detail> I<infolevel>
1106
1107 Level of information to print about column values being ignored. Default: I<0>. Possible values:
1108 0, 1, 2, 3, or 4.
1109
1110 =item B<-f, --fast>
1111
1112 In this mode, all the data field values specified for analysis are assumed to contain numerical
1113 data and no checking is performed before analysis. By default, only numerical data is
1114 used for analysis; other types of data are ignored.
1115
1116 =item B<--frequencybins> I<number | "number,number,[number,...]">
1117
1118 Specify number of bins or bin range to use for frequency analysis. Default value: I<10>
1119
1120 Number of bins value along with the smallest and largest value for a column is used to
1121 group the column values into different groups.
1122
1123 The bin range list is used to group values for a column into different groups; It must contain
1124 values in ascending order. Examples:
1125
1126 10,20,30
1127 0.1,0.2,0.3,0.4,0.5
1128
1129 The frequency value calculated for a specific bin corresponds to all the column values
1130 which are greater than the previous bin value and less than or equal to the current bin value.
1131
1132 =item B<-h, --help>
1133
1134 Print this help message.
1135
1136 =item B<--klargest> I<number>
1137
1138 Kth largest value to find by I<KLargest> function. Default value: I<2>. Valid values: positive
1139 integers.
1140
1141 =item B<--ksmallest> I<number>
1142
1143 Kth smallest value to find by I<KSmallest> function. Default value: I<2>. Valid values: positive
1144 integers.
1145
1146 =item B<-m, --mode> I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]">
1147
1148 Specify how to analyze data in SDFile(s): calculate basic or all descriptive statistics; use all
1149 supported statistical functions; or use a comma delimited list of supported statistical functions. Possible values:
1150 I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1,[function2]...">. Default
1151 value: I<DescriptiveStatisticsBasic>
1152
1153 I<DescriptiveStatisticsBasic> includes these functions: I<Count, Maximum, Minimum, Mean,
1154 Median, Sum, StandardDeviation, StandardError, Variance>.
1155
1156 I<DescriptiveStatisticsAll>, in addition to I<DescriptiveStatisticsBasic> functions, includes:
1157 I<GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, Mode, RSquare,
1158 Skewness, TrimMean>.
1159
1160 I<All> uses complete list of supported functions: I<Average, AverageDeviation, Correlation,
1161 Count, Covariance, GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis,
1162 Maximum, Minimum, Mean, Median, Mode, RSquare, Skewness, Sum,
1163 SumOfSquares, StandardDeviation, StandardDeviationN, StandardError, StandardScores,
1164 StandardScoresN, TrimMean, Variance, VarianceN>. The function names ending with N
1165 calculate corresponding values assuming an entire population instead of a population sample.
1166 Here are the formulas for these functions:
1167
1168 Average: See Mean
1169
1170 AverageDeviation: SUM( ABS(x[i] - Xmean) ) / n
1171
1172 Correlation: See Pearson Correlation
1173
1174 Covariance: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / n
1175
1176 GeometricMean: NthROOT( PRODUCT(x[i]) )
1177
1178 HarmonicMean: 1 / ( SUM(1/x[i]) / n )
1179
1180 Mean: SUM( x[i] ) / n
1181
1182 Median: Xsorted[(n - 1)/2 + 1] for odd values of n; (Xsorted[n/2] + Xsorted[n/2 + 1])/2
1183 for even values of n.
1184
1185 Kurtosis: [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] -
1186 {3((n - 1)^2)}/{(n - 2)(n-3)}
1187
1188 PearsonCorrelation: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )
1189 (SUM( (y[i] - Ymean)^2 )) )
1190
1191 RSquare: PearsonCorrelation^2
1192
1193 Skewness: {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 }
1194
1195 StandardDeviation: SQRT ( SUM( (x[i] - Mean)^2 ) / (n - 1) )
1196
1197 StandardDeviationN: SQRT ( SUM( (x[i] - Mean)^2 ) / n )
1198
1199 StandardError: StandardDeviation / SQRT( n )
1200
1201 StandardScores: (x[i] - Mean) / StandardDeviation
1202
1203 StandardScoresN: (x[i] - Mean) / StandardDeviationN
1204
1205 Variance: SUM( (x[i] - Xmean)^2 / (n - 1) )
1206
1207 VarianceN: SUM( (x[i] - Xmean)^2 / n )
1208
1209 =item B<-o, --overwrite>
1210
1211 Overwrite existing files.
1212
1213 =item B<--outdelim> I<comma | tab | semicolon>
1214
1215 Output text file delimiter. Possible values: I<comma, tab, or semicolon>
1216 Default value: I<comma>.
1217
1218 =item B<-p, --precision> I<number>
1219
1220 Precision of calculated values in the output file. Default: up to I<2> decimal places.
1221 Valid values: positive integers.
1222
1223 =item B<-q, --quote> I<yes | no>
1224
1225 Put quotes around column values in output text file. Possible values: I<yes or
1226 no>. Default value: I<yes>.
1227
1228 =item B<-r, --root> I<rootname>
1229
1230 New text file name is generated using the root: <Root><Mode>.<Ext>. Default new file
1231 name: <InitialSDFileName><Mode>.<Ext>. Based on the specified analysis,
1232 <Mode> corresponds to one of these values: DescriptiveStatisticsBasic,
1233 DescriptiveStatisticsAll, AllStatistics, SpecifiedStatistics, Covariance, Correlation,
1234 Frequency, or StandardScores. The csv, and tsv <Ext> values are used for
1235 comma/semicolon, and tab delimited text files respectively. This option is ignored for
1236 multiple input files.
1237
1238 =item B<--trimfraction> I<number>
1239
1240 Fraction of data to exclude from the top and bottom of the data set during
1241 I<TrimMean> calculation. Default value: I<0.1>. Valid values: > 0 and < 1.
1242
1243 =item B<-w, --workingdir> I<dirname>
1244
1245 Location of working directory. Default: current directory.
1246
1247 =back
1248
1249 =head1 EXAMPLES
1250
1251 To calculate basic statistics for data in all common data fields and generate a
1252 NewSample1DescriptiveStatisticsBasic.csv file, type:
1253
1254 % AnalyzeSDFilesData.pl -o -r NewSample1 Sample1.sdf
1255
1256 To calculate basic statistics for MolWeight data field and generate a
1257 NewSample1DescriptiveStatisticsBasic.csv file, type:
1258
1259 % AnalyzeSDFilesData.pl --datafields MolWeight -o -r NewSample1
1260 Sample1.sdf
1261
1262 To calculate all available descriptive statistics for MolWeight data field and all data field pairs,
1263 and generate NewSample1DescriptiveStatisticsAll.csv, NewSample1RSquareMatrix.csv,
1264 and NewSample1MolWeightFrequencyAnalysis.csv
1265 files, type:
1266
1267 % AnalyzeSDFilesData.pl -m DescriptiveStatisticsAll --datafields
1268 MolWeight -o --datafieldpairs AllPairs -r NewSample1 Sample1.sdf
1269
1270 To compute frequency distribution of MolWeight data field into five bins and
1271 generate NewSample1MolWeightFrequencyAnalysis.csv, type:
1272
1273 % AnalyzeSDFilesData.pl -m Frequency --frequencybins 5 --datafields
1274 MolWeight -o -r NewSample1 Sample1.sdf
1275
1276 To compute frequency distribution of data in MolWeight data field into specified bin range
1277 values, and generate NewSample1MolWeightFrequencyAnalysis.csv, type:
1278
1279 % AnalyzeSDFilesData.pl -m Frequency --frequencybins "100,200,400"
1280 --datafields MolWeight -o -r NewSample1 Sample1.sdf
1281
1282 To calculate all available statistics for data in all data fields and pairs, type:
1283
1284 % AnalyzeSDFilesData.pl -m All --datafields All --datafieldpairs
1285 AllPairs -o -r NewSample1 Sample1.sdf
1286
1287 =head1 AUTHOR
1288
1289 Manish Sud <msud@san.rr.com>
1290
1291 =head1 SEE ALSO
1292
1293 FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl
1294
1295 =head1 COPYRIGHT
1296
1297 Copyright (C) 2015 Manish Sud. All rights reserved.
1298
1299 This file is part of MayaChemTools.
1300
1301 MayaChemTools is free software; you can redistribute it and/or modify it under
1302 the terms of the GNU Lesser General Public License as published by the Free
1303 Software Foundation; either version 3 of the License, or (at your option)
1304 any later version.
1305
1306 =cut