0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: AnalyzeSDFilesData.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:04 $
|
|
5 # $Revision: 1.27 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability or fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Text::ParseWords;
|
|
34 use Benchmark;
|
|
35 use FileUtil;
|
|
36 use SDFileUtil;
|
|
37 use TextUtil;
|
|
38 use StatisticsUtil;
|
|
39
|
|
# File-scoped state; the subroutines defined further down in this file read
# and update these variables directly...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Turn on autoflush for STDOUT...
$| = 1;

# Announce the start and begin timing...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Setup script usage and show it when help is requested or no file is given...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my(@SDFilesList) = ExpandFileNames(\@ARGV, "sd sdf");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Gather information about SD files and data field labels to analyze...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();
ProcessSDFilesDataLabelsInfo();

# Analyze each usable SD file and generate its output files...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);
for $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  AnalyzeSDFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
85
|
|
86 ###############################################################################
|
|
87
|
|
# Analyze data in a SD file...
#
# Reads all compound records from the SD file at position $Index in
# @SDFilesList, collects the values of the data fields listed in
# $SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index], and passes the collected
# values to the analysis routines corresponding to the statistical functions
# specified in %OptionsInfo.
#
# Parameters:
#   $Index - Index of the SD file in @SDFilesList and in %SDFilesInfo arrays.
#
sub AnalyzeSDFile {
  my($Index) = @_;
  my($SDFile, $DataLabel, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);

  $SDFile = $SDFilesList[$Index];
  @DataLabelsToAnalyze = @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]};

  # Start out with an empty list of values for each data field label...
  %DataFieldValuesToAnalyzeMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @{$DataFieldValuesToAnalyzeMap{$DataLabel}} = ();
  }

  # Collect appropriate data field label values for analysis...
  my($CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels);

  # Use three-argument open: the file name is taken literally and can't be
  # interpreted as an open mode or a pipe...
  open SDFILE, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  $CmpdCount = 0;
  $InvalidCmpdCount = 0;
  while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @InvalidCmpdDataLabels = ();
    DATALABEL: for $DataLabel (@DataLabelsToAnalyze) {
      # Data fields missing in a compound record are silently skipped...
      if (exists $DataFieldValues{$DataLabel}) {
        $DataValue = $DataFieldValues{$DataLabel};
        if ($OptionsInfo{CheckData}) {
          if (!IsNumerical($DataValue)) {
            # Track the label for reporting and leave the value out of analysis...
            push @InvalidCmpdDataLabels, $DataLabel;
            next DATALABEL;
          }
        }
        push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
      }
    }
    if (@InvalidCmpdDataLabels) {
      $InvalidCmpdCount++;
      # Amount of information reported depends on the detail level...
      if ($OptionsInfo{DetailLevel} >=4 ) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed:\n$CmpdString \n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 3) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed...\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field to be analyzed...\n";
      }
    }
  }
  if ($InvalidCmpdCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
  }
  close SDFILE;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction);
  @SpecifiedFunctionNames = ();

  # Functions handled by dedicated routines below are left out of the list of
  # functions written to the main analysis file...
  for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap)
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
    if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
      PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
    PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
    PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }

}
|
|
170
|
|
# Calculate values for various statistical functions...
#
# Writes a text file containing one row for each data field label in
# $SDFilesInfo{DataLabelsToAnalyze}[$Index] and one column for each specified
# statistical function.
#
# Parameters:
#   $Index - Index of the SD file in %SDFilesInfo arrays.
#   $SpecifiedFunctionNamesRef - Reference to an array of function names; each
#       name, except Count, is invoked as a function using its name string.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#       to references of arrays containing values to analyze.
#
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $Line, $SpecifiedFunction, $Label, @ColLabels, @DataLabelsToAnalyze);

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $SDFilesInfo{NewTextFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  # Three-argument open keeps the file name from being parsed for mode characters...
  open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "DataLabel";
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # Prefix the label with the specified k value and drop the "K"...
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$Line\n";

  # Go over each column to be analyzed...
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};

  # Turn off strict "refs"; otherwise, invoking statistical functions using function
  # name string is not allowed. Other strictures stay in effect.
  no strict 'refs';

  my($DataValuesRef, $DataLabel, $Value, $ValuesCount, @RowValues, %CalculatedValues);

  # Cache of standard deviation values shared by StandardDeviation and StandardError...
  %CalculatedValues = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $DataLabel;
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    $ValuesCount = scalar(@{$DataValuesRef});
    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!$ValuesCount) {
        # Invalid column values...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = $ValuesCount;
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KLargest});
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KSmallest});
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        if (exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
        }
        else {
          $Value = &$SpecifiedFunction($DataValuesRef);
          $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
        }
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $Value = StandardDeviation($DataValuesRef);
          $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
        }
        if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
          # Pass the number of values as a scalar. Passing the array itself would
          # flatten all data values into the argument list and the first data value
          # would end up being used as the count.
          # NOTE(review): assumes StandardError($StandardDeviation, $Count) as its
          # signature - confirm against StatisticsUtil.
          $Value = &$SpecifiedFunction($CalculatedValues{$DataLabel}{StandardDeviation}, $ValuesCount);
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{TrimFraction});
      }
      else {
        $Value = &$SpecifiedFunction($DataValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$Line\n";
  }
  close NEWTEXTFILE;
}
|
|
263
|
|
# Calculate covariance, correlation, rsquare for specified data field label pairs....
#
# Writes a text file containing one row for each pair of data field labels
# taken from $SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index] and
# $SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]. A Correlation column is
# written out whenever correlation or rsquare is specified.
#
# Parameters:
#   $Index - Index of the SD file in %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#       to references of arrays containing values to analyze.
#
sub PerformDataLabelPairAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "DataFieldPairsAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  # Three-argument open keeps the file name from being parsed for mode characters...
  open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out the column labels...
  @ColLabels = ();
  push @ColLabels, ("DataLabel1", "DataLabel2");
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @ColLabels, "Correlation";
    if ($CalculateRSquare) {
      push @ColLabels, "RSquare";
    }
  }
  if ($CalculateCovariance) {
    push @ColLabels, "Covariance";
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$Line\n";

  # Format a calculated value using specified precision and add zero to get rid of
  # trailing zeros; undefined or empty values are written out as empty strings...
  my($FormatValue);
  $FormatValue = sub {
    my($CalculatedValue) = @_;
    return (defined($CalculatedValue) && length($CalculatedValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CalculatedValue) + 0) : "";
  };

  # Go over each data field pair...
  my($CorrelationValue, $RSquareValue, $CovarianceValue, $LabelIndex, $DataLabel1, $DataLabel2, $DataValues1, $DataValues2, @DataLabelPairs1ToAnalyze, @DataLabelPairs2ToAnalyze, @RowValues);

  @DataLabelPairs1ToAnalyze = @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]};
  @DataLabelPairs2ToAnalyze = @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]};
  for $LabelIndex (0 .. $#DataLabelPairs1ToAnalyze) {
    @RowValues = ();
    $DataLabel1 = $DataLabelPairs1ToAnalyze[$LabelIndex];
    $DataLabel2 = $DataLabelPairs2ToAnalyze[$LabelIndex];
    $DataValues1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
    $DataValues2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};

    # Setup column ids...
    push @RowValues, $DataLabel1;
    push @RowValues, $DataLabel2;

    if (@$DataValues1 != @$DataValues2) {
      # Print a warning and write out empty values to keep the columns aligned...
      warn "Warning: Skipping analysis for data field pair $DataLabel1, $DataLabel2: Number of valid data values must be same.\n";
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @RowValues, "";
        if ($CalculateRSquare) {
          push @RowValues, "";
        }
      }
      if ($CalculateCovariance) {
        push @RowValues, "";
      }
    }
    else {
      # Calculate appropriate value...
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($DataValues1, $DataValues2);
        push @RowValues, $FormatValue->($CorrelationValue);
        if ($CalculateRSquare) {
          # RSquare is the square of the unformatted correlation value...
          $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
          push @RowValues, $FormatValue->($RSquareValue);
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValues1, $DataValues2);
        push @RowValues, $FormatValue->($CovarianceValue);
      }
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$Line\n";
  }
  close NEWTEXTFILE;
}
|
|
344
|
|
# Generate histogram numbers...
#
# Writes one text file, containing Bins and Frequency columns, for each data
# field label in $SDFilesInfo{DataLabelsToAnalyze}[$Index]. Bins are specified
# either as a bin range, $OptionsInfo{BinRange}, or as a number of bins,
# $OptionsInfo{NumOfBins}. No rows are written out for data field labels
# without any values.
#
# Parameters:
#   $Index - Index of the SD file in %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#       to references of arrays containing values to analyze.
#
sub PerformFrequencyAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, @ColLabels, @RowValues, $Line, $DataLabel, @DataLabelsToAnalyze, $DataValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);

  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $DataLabel . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
    print "Generating new text file $NewTextFile...\n";
    # The file name contains a data field label. Three-argument open makes sure
    # it's never parsed for mode or pipe characters...
    open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ();
    push @ColLabels , ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$Line\n";

    # Calculate and write out frequency values...
    %FrequencyMap = ();
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (@$DataValuesRef) {
      if (@{$OptionsInfo{BinRange}}) {
        %FrequencyMap = Frequency($DataValuesRef, \@{$OptionsInfo{BinRange}});
      }
      else {
        %FrequencyMap = Frequency($DataValuesRef, $OptionsInfo{NumOfBins});
      }
    }
    # Write out bins in ascending numerical order...
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      # Format the values and add zero to get rid of trailing zeros...
      @RowValues = ();
      $Value = (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
      push @RowValues, $Value;
      $Value = (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";
      push @RowValues, $Value;

      $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print NEWTEXTFILE "$Line\n";
    }
    close NEWTEXTFILE;
  }
}
|
|
388
|
|
# Calculate covariance, correlation/rsquare matrices....
#
# Writes up to three text files - correlation, rsquare, and covariance
# matrices - for all or common data field labels of a SD file. A correlation
# matrix file is generated whenever correlation or rsquare is specified.
#
# Parameters:
#   $Index - Index of the SD file in %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#       to references of arrays containing values to analyze.
#
sub PerformMatrixAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $CorrelationTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CorrelationMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $RSquareTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "RSquareMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $CovarianceTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CovarianceMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];

  # Report the names of the files which are actually going to be generated...
  my(@TextFiles, $TextFilesList);
  @TextFiles = ();
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @TextFiles, $CorrelationTextFile;
    if ($CalculateRSquare) {
      push @TextFiles, $RSquareTextFile;
    }
  }
  if ($CalculateCovariance) {
    push @TextFiles, $CovarianceTextFile;
  }
  $TextFilesList = join(", ", @TextFiles);
  if (@TextFiles > 1) {
    print "Generating new text files $TextFilesList...\n"
  }
  else {
    print "Generating new text file $TextFilesList...\n"
  }

  # Three-argument open keeps the file names from being parsed for mode characters...
  if ($CalculateCorrelation || $CalculateRSquare) {
    open CORRELATIONTEXTFILE, ">", $CorrelationTextFile or die "Error: Can't open $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      open RSQUARETEXTFILE, ">", $RSquareTextFile or die "Error: Can't open $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    open COVARIANCETEXTFILE, ">", $CovarianceTextFile or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $CorrelationValue, $RSquareValue, $CovarianceValue, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);

  # Write out the column labels...
  # NOTE(review): the header always lists all data field labels, without a cell
  # for the row label column, even when only common data field labels are
  # analyzed - confirm that this is the intended layout.
  @ColLabels = ();
  push @ColLabels, @{$SDFilesInfo{AllDataLabels}[$Index]};
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  if ($CalculateCorrelation || $CalculateRSquare) {
    print CORRELATIONTEXTFILE "$Line\n";
    if ($CalculateRSquare) {
      print RSQUARETEXTFILE "$Line\n";
    }
  }
  if ($CalculateCovariance) {
    print COVARIANCETEXTFILE "$Line\n";
  }

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2, @DataLabelsToAnalyze);

  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
  @DataLabelsToAnalyze = $OptionsInfo{AllDataLabelPairs} ? @{$SDFilesInfo{AllDataLabels}[$Index]} : @{$SDFilesInfo{CommonDataLabels}[$Index]};

  for $LabelIndex1 (0 .. (@DataLabelsToAnalyze - 1)) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
    for $LabelIndex2 (0 .. $LabelIndex1) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      $DataValuesRef1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
      $DataValuesRef2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($DataValuesRef1, $DataValuesRef2);
        # Format the value and add zero to get rid of trailing zeros...
        $CorrelationValue = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
        $CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $CorrelationValue;
        if ($DataLabel1 ne $DataLabel2) {
          $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $CorrelationValue;
        }
        if ($CalculateRSquare) {
          # RSquare is calculated from the formatted correlation value...
          $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
          $RSquareValue = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
          $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $RSquareValue;
          if ($DataLabel1 ne $DataLabel2) {
            $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $RSquareValue;
          }
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValuesRef1, $DataValuesRef2);
        $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
        $CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $CovarianceValue;
        if ($DataLabel1 ne $DataLabel2) {
          $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $CovarianceValue;
        }
      }
    }
  }

  # Write out the matrices...
  for $LabelIndex1 (0 .. (@DataLabelsToAnalyze - 1)) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
    @CorrelationRowValues = ();
    @RSquareRowValues = ();
    @CovarianceRowValues = ();

    # Each row starts with its data field label...
    if ($CalculateCorrelation || $CalculateRSquare) {
      push @CorrelationRowValues, $DataLabel1;
      if ($CalculateRSquare) {
        push @RSquareRowValues, $DataLabel1;
      }
    }
    if ($CalculateCovariance) {
      # Use the label of the current row, like the other two matrices...
      push @CovarianceRowValues, $DataLabel1;
    }
    for $LabelIndex2 (0 .. (@DataLabelsToAnalyze - 1)) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
        if ($CalculateRSquare) {
          push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
        }
      }
      if ($CalculateCovariance) {
        push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
      }
    }
    if ($CalculateCorrelation || $CalculateRSquare) {
      $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print CORRELATIONTEXTFILE "$Line\n";
      if ($CalculateRSquare) {
        $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print RSQUARETEXTFILE "$Line\n";
      }
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print COVARIANCETEXTFILE "$Line\n";
    }
  }
  if ($CalculateCorrelation || $CalculateRSquare) {
    close CORRELATIONTEXTFILE;
    if ($CalculateRSquare) {
      close RSQUARETEXTFILE;
    }
  }
  if ($CalculateCovariance) {
    close COVARIANCETEXTFILE;
  }
}
|
|
537
|
|
# Calculate standard scores...
#
# Writes a text file containing one row for each compound record in the SD
# file and, for each data field label in
# $SDFilesInfo{DataLabelsToAnalyze}[$Index], a StandardScores and/or a
# StandardScoresN column. An empty value is written out whenever a score
# can't be calculated.
#
# Parameters:
#   $Index - Index of the SD file in @SDFilesList and in %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#       to references of arrays containing values to analyze.
#
sub PerformStandardScoresAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $NewLine);

  $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "StandardScores." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  # Three-argument open keeps the file name from being parsed for mode characters...
  open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  my($DataLabel, @DataLabelsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    if ($StandardScores) {
      push @ColLabels, "${DataLabel}\(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${DataLabel}\(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values...
  my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (!exists($MeanMap{$DataLabel})) {
      $MeanMap{$DataLabel} = Mean($DataValuesRef);
    }
    if ($StandardScores) {
      if (!exists($StandardDeviationMap{$DataLabel})) {
        $StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
      }
    }
    if ($StandardScoresN) {
      if (!exists($StandardDeviationNMap{$DataLabel})) {
        $StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
      }
    }
  }

  # Calculate a formatted score, (value - mean) / deviation, for a data value. An
  # empty string is returned for invalid values and for undefined or zero
  # deviation - e.g. a column without any values or with identical values -
  # instead of dying with a division by zero error...
  my($CalculateScore);
  $CalculateScore = sub {
    my($ValueOkay, $Value, $Mean, $Deviation) = @_;

    if (!$ValueOkay || !defined($Mean) || !defined($Deviation) || $Deviation == 0) {
      return "";
    }
    # Format the value and add zero to get rid of trailing zeros...
    return sprintf("%.$OptionsInfo{Precision}f", ($Value - $Mean)/$Deviation) + 0;
  };

  #
  # Go over each data field and calculate standard scores for each column
  # using (x[i] - mean) / StandardDeviation for StandardScores and
  # (x[i] - mean) / StandardDeviationN for StandardScoresN; write out the
  # calculated values as well...

  my($SDFile, $Value, $ValueOkay, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
  $SDFile = $SDFilesList[$Index];

  open SDFILE, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @RowValues = ();
    for $DataLabel (@DataLabelsToAnalyze) {
      # Missing data fields are treated as empty values...
      $Value = "";
      if (exists $DataFieldValues{$DataLabel}) {
        $Value = $DataFieldValues{$DataLabel};
      }
      # Values are validated only when data checking is turned on...
      $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;
      if ($StandardScores) {
        push @RowValues, $CalculateScore->($ValueOkay, $Value, $MeanMap{$DataLabel}, $StandardDeviationMap{$DataLabel});
      }
      if ($StandardScoresN) {
        push @RowValues, $CalculateScore->($ValueOkay, $Value, $MeanMap{$DataLabel}, $StandardDeviationNMap{$DataLabel});
      }
    }
    $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$NewLine\n";
  }
  close SDFILE;
  close NEWTEXTFILE;

}
|
|
624
|
|
# Make sure the specified data field labels exist in SD files...
#
# For each SD file marked okay, sets up the following entries in %SDFilesInfo:
# data field labels to analyze, first and second labels of data field label
# pairs to analyze, and a sorted list of unique labels covering both. Files
# without any labels to analyze, or with already existing frequency analysis
# files when overwriting is off, are marked as not okay.
#
sub ProcessSDFilesDataLabelsInfo {
  my($Index, $SDFile, $DataLabel, $InfoKey, @InfoKeys, @DataLabelsToAnalyze, %UniqueDataLabelsToAnalyzeMap);

  @InfoKeys = qw(DataLabelsToAnalyze DataLabelPairs1ToAnalyze DataLabelPairs2ToAnalyze UniqueDataLabelsToAnalyze);
  for $InfoKey (@InfoKeys) {
    @{$SDFilesInfo{$InfoKey}} = ();
  }

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Every file, okay or not, gets empty lists to start with...
    for $InfoKey (@InfoKeys) {
      @{$SDFilesInfo{$InfoKey}[$Index]} = ();
    }
    %UniqueDataLabelsToAnalyzeMap = ();

    next FILELIST unless $SDFilesInfo{FileOkay}[$Index];

    # Pick data field labels: specified ones present in the file, all, or common...
    @DataLabelsToAnalyze = ();
    if (@{$OptionsInfo{SpecifiedDataLabels}}) {
      push @DataLabelsToAnalyze, grep { exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$_}) } @{$OptionsInfo{SpecifiedDataLabels}};
    }
    elsif (defined($OptionsInfo{DataFields}) && $OptionsInfo{DataFields} =~ /^All$/i) {
      push @DataLabelsToAnalyze, @{$SDFilesInfo{AllDataLabels}[$Index]};
    }
    else {
      push @DataLabelsToAnalyze, @{$SDFilesInfo{CommonDataLabels}[$Index]};
    }

    if (!@DataLabelsToAnalyze) {
      warn "Warning: Ignoring file $SDFile: None of the data field labels specified, @{$OptionsInfo{SpecifiedDataLabels}}, using \"--datafields\" option exist.\n";
      $SDFilesInfo{FileOkay}[$Index] = 0;
      next FILELIST;
    }
    push @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]}, @DataLabelsToAnalyze;

    # Track unique data field labels; each label maps to itself...
    @UniqueDataLabelsToAnalyzeMap{@DataLabelsToAnalyze} = @DataLabelsToAnalyze;

    if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
      # Make sure specific frequency files don't exist...
      my($FrequencyFile);
      for $DataLabel (@DataLabelsToAnalyze) {
        $FrequencyFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel} . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
        next unless (-e $FrequencyFile);

        warn "Warning: Ignoring file $SDFile: The file $FrequencyFile already exists.\n";
        $SDFilesInfo{FileOkay}[$Index] = 0;
        next FILELIST;
      }
    }

    # Setup specified data field label pairs...
    if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) {
      my(@DataLabelPairsToAnalyze, $DataLabel1, $DataLabel2);

      if (@{$OptionsInfo{SpecifiedDataLabelPairs}}) {
        # Walk the specified labels two at a time and keep the pairs with both
        # labels present in the file; an unpaired trailing label is ignored...
        my(@SpecifiedLabels);
        @SpecifiedLabels = @{$OptionsInfo{SpecifiedDataLabelPairs}};
        while (@SpecifiedLabels >= 2) {
          ($DataLabel1, $DataLabel2) = splice(@SpecifiedLabels, 0, 2);
          if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel1}) && exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel2})) {
            push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
          }
        }
      }
      else {
        # Pair up each label with every label, including itself, from all or
        # common data field labels...
        my($PairLabelsRef);
        $PairLabelsRef = $OptionsInfo{AllDataLabelPairs} ? $SDFilesInfo{AllDataLabels}[$Index] : $SDFilesInfo{CommonDataLabels}[$Index];
        for $DataLabel1 (@{$PairLabelsRef}) {
          for $DataLabel2 (@{$PairLabelsRef}) {
            push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
          }
        }
      }

      if (@DataLabelPairsToAnalyze) {
        if (@DataLabelPairsToAnalyze % 2) {
          warn "Warning: Ignoring file $SDFile: Invalid number values specified using \"--datafieldpairs\" option: It must contain even number of valid values.\n";
          $SDFilesInfo{FileOkay}[$Index] = 0;
          next FILELIST;
        }

        # Split the flat list of pairs into first and second label lists...
        my(@RemainingLabels);
        @RemainingLabels = @DataLabelPairsToAnalyze;
        while (@RemainingLabels) {
          ($DataLabel1, $DataLabel2) = splice(@RemainingLabels, 0, 2);
          push @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]}, $DataLabel1;
          push @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]}, $DataLabel2;
        }

        # Set up unique data field label map as well...
        @UniqueDataLabelsToAnalyzeMap{@DataLabelPairsToAnalyze} = @DataLabelPairsToAnalyze;
      }
    }

    # Setup unique data field label array...
    push @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]}, (sort keys %UniqueDataLabelsToAnalyzeMap);
  }
}
|
|
738
|
|
# Retrieve information about input SD files...
#
# Resets the file-scoped %SDFilesInfo hash and fills it with one slot per entry
# of @SDFilesList, stored at the same index. Per-file slots:
#
#   FileOkay         - 1 when the file passed every check below; 0 otherwise
#   CmpdCount        - first value returned by GetAllAndCommonCmpdDataHeaderLabels
#   NewTextFileRoot  - root used to build the names of the generated text files
#   NewTextFileExt   - "tsv" for tab delimited output; "csv" otherwise
#   AllDataLabels    - labels from the second value returned by the same call
#   AllDataLabelsMap - label => label lookup built from AllDataLabels
#   CommonDataLabels - labels from the third value returned by the same call
#
# A file is skipped with a warning, leaving FileOkay at 0, when it doesn't exist,
# fails CheckFileType for "sd sdf", can't be opened or - unless overwriting was
# requested - one of the text files to be generated for it already exists.
#
# Takes no arguments; reads @SDFilesList, %Options and %OptionsInfo.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileName, $OutFile, $OutFileRoot, $OutFileExt);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileRoot}} = ();
  @{$SDFilesInfo{NewTextFileExt}} = ();

  # Initialize the slots under the key names which are filled in for each file below...
  @{$SDFilesInfo{AllDataLabels}} = ();
  @{$SDFilesInfo{AllDataLabelsMap}} = ();
  @{$SDFilesInfo{CommonDataLabels}} = ();

  # NOTE(review): These two key names were initialized here originally but are never
  # filled in by this subroutine; retained in case code outside this view reads them.
  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{AllDataFieldLabelsMap}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until all the checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "";
    $SDFilesInfo{NewTextFileExt}[$Index] = "";

    @{$SDFilesInfo{AllDataLabels}[$Index]} = ();
    %{$SDFilesInfo{AllDataLabelsMap}[$Index]} = ();
    @{$SDFilesInfo{CommonDataLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new text files. Only the name part of
    # the input file is needed: the extension of generated files depends on the
    # output delimiter and not on the input file...
    (undef, $FileName, undef) = ParseFileName($SDFile);
    $OutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $OutFileExt = "tsv";
    }
    if ($Options{root} && (@SDFilesList == 1)) {
      # The root is honored only for a single input file; any extension
      # specified as part of it is dropped...
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $FileName = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    $OutFileRoot = $FileName;
    $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";

    if (!$OptionsInfo{Overwrite}) {
      my($StatFunctionsMapRef, $ExistingFile);

      $StatFunctionsMapRef = $OptionsInfo{SpecifiedStatisticalFunctionsMap};

      if (-e $OutFile) {
        warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
        next FILELIST;
      }

      # All the names checked below are built using $OutFileExt, the same extension
      # used for $OutFile above and saved as NewTextFileExt; checking against the
      # extension of the input SD file would look for files which are never written...
      if (exists($StatFunctionsMapRef->{covariance}) || exists($StatFunctionsMapRef->{correlation}) || exists($StatFunctionsMapRef->{rsquare})) {
        if ($OptionsInfo{AllDataLabelPairs}) {
          my($MatrixSpecRef);
          for $MatrixSpecRef (["covariance", "CovarianceMatrix"], ["correlation", "CorrelationMatrix"], ["rsquare", "RSquareMatrix"]) {
            my($FunctionKey, $FileSuffix) = @{$MatrixSpecRef};
            $ExistingFile = "${OutFileRoot}${FileSuffix}.${OutFileExt}";
            if (exists($StatFunctionsMapRef->{$FunctionKey}) && (-e $ExistingFile)) {
              warn "Warning: Ignoring file $SDFile: The file $ExistingFile already exists.\n";
              next FILELIST;
            }
          }
        }
        else {
          $ExistingFile = "${OutFileRoot}ColumnPairsAnalysis.${OutFileExt}";
          if (-e $ExistingFile) {
            warn "Warning: Ignoring file $SDFile: The file $ExistingFile already exists.\n";
            next FILELIST;
          }
        }
      }
      $ExistingFile = "${OutFileRoot}StandardScores.${OutFileExt}";
      if (exists($StatFunctionsMapRef->{standardscores}) && (-e $ExistingFile)) {
        warn "Warning: Ignoring file $SDFile: The file $ExistingFile already exists.\n";
        next FILELIST;
      }
    }

    # Three argument open keeps any special characters in the file name from
    # being interpreted as an open mode...
    my($SDFileHandle);
    if (!open $SDFileHandle, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CmpdCount, $Label, $DataFieldLabelsRef, $CommonDataFieldLabelsRef);
    ($CmpdCount, $DataFieldLabelsRef, $CommonDataFieldLabelsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
    close $SDFileHandle;

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "$OutFileRoot";
    $SDFilesInfo{NewTextFileExt}[$Index] = "$OutFileExt";

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
    push @{$SDFilesInfo{AllDataLabels}[$Index]}, @{$DataFieldLabelsRef};
    push @{$SDFilesInfo{CommonDataLabels}[$Index]}, @{$CommonDataFieldLabelsRef};
    for $Label (@{$DataFieldLabelsRef}) {
      $SDFilesInfo{AllDataLabelsMap}[$Index]{$Label} = $Label;
    }
  }
}
|
|
857
|
|
# Process option values...
#
# Resets the file-scoped %OptionsInfo hash and fills it in from the raw command
# line values held in the file-scoped %Options hash: the list and lookup map of
# statistical functions to run, the label used in output file names, output
# delimiter and quoting, frequency bins, and the data field labels and label
# pairs to analyze.
#
# Dies with an error message when the value of "-m, --mode", "--frequencybins"
# or "--datafieldpairs" is not acceptable. Takes no arguments.
#
sub ProcessOptions {
  %OptionsInfo = ();

  # Values which are carried over as specified...
  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{DataFields} = $Options{datafields};
  $OptionsInfo{DetailLevel} = $Options{detail};

  # All the statistical functions known to the script along with a lower case
  # name to canonical name lookup used to match names irrespective of their case...
  my(@KnownFunctions, %KnownFunctionsMap);

  @KnownFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
  %KnownFunctionsMap = map { (lc($_), $_) } @KnownFunctions;

  %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = ();
  @{$OptionsInfo{SpecifiedStatisticalFunctions}} = ();

  # Named modes, tried in order: pattern for the mode value, label used in the
  # output file name, and the functions to run...
  my(@NamedModes, $NamedModeRef, $MatchedModeRef);

  @NamedModes = (
    [qr/^DescriptiveStatisticsBasic$/i, "DescriptiveStatisticsBasic", [qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum)]],
    [qr/^DescriptiveStatisticsAll$/i, "DescriptiveStatisticsAll", [qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum)]],
    [qr/^All$/i, "AllStatistics", \@KnownFunctions],
  );

  NAMEDMODE: for $NamedModeRef (@NamedModes) {
    if ($Options{mode} =~ $NamedModeRef->[0]) {
      $MatchedModeRef = $NamedModeRef;
      last NAMEDMODE;
    }
  }

  if ($MatchedModeRef) {
    $OptionsInfo{FileNameMode} = $MatchedModeRef->[1];
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @{$MatchedModeRef->[2]};
  }
  else {
    # Anything else is taken as a comma delimited list of function names;
    # spaces are dropped before the list is split...
    my($ModeValue, @RequestedFunctions, @RejectedFunctions);

    $OptionsInfo{FileNameMode} = "SpecifiedStatistics";

    $ModeValue = $Options{mode};
    $ModeValue =~ s/ //g;
    @RequestedFunctions = split ",", $ModeValue;

    push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, grep { exists $KnownFunctionsMap{lc($_)} } @RequestedFunctions;
    @RejectedFunctions = grep { !exists $KnownFunctionsMap{lc($_)} } @RequestedFunctions;

    if (@RejectedFunctions) {
      if (@RejectedFunctions == 1) {
        warn "Error: The value specified, $RejectedFunctions[0] , for option \"-m --mode\" is not valid.\n";
      }
      else {
        warn "Error: The values specified - ", JoinWords(\@RejectedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      die "Allowed values:", JoinWords(\@KnownFunctions, ", ", 0), "\n";
    }
  }

  # Map lower case name of each function to run to its canonical name; a name
  # appearing more than once is mapped only the first time around...
  my($FunctionName, $FunctionKey);

  for $FunctionName (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    $FunctionKey = lc($FunctionName);
    $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey} = $KnownFunctionsMap{$FunctionKey}
      unless exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey};
  }

  # Setup delimiter and quotes...
  if ($Options{outdelim} =~ /tab/i ) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;

  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{Root} = $Options{root};

  # Setup miscellaneous options...
  $OptionsInfo{CheckData} = $Options{fast} ? 0 : 1;
  $OptionsInfo{Precision} = $Options{precision};

  $OptionsInfo{KLargest} = $Options{klargest};
  $OptionsInfo{KSmallest} = $Options{ksmallest};

  $OptionsInfo{TrimFraction} = $Options{trimfraction};

  # Setup frequency bin values: a value without any comma is a number of bins;
  # otherwise, it's a list of bin values...
  $OptionsInfo{NumOfBins} = 10;
  @{$OptionsInfo{BinRange}} = ();

  if ($Options{frequencybins} !~ /\,/) {
    $OptionsInfo{NumOfBins} = $Options{frequencybins};
    unless (IsPositiveInteger($OptionsInfo{NumOfBins})) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
    }
  }
  else {
    my($BinBoundary, $LowerIndex, $UpperIndex, @BinBoundaries);

    @BinBoundaries = split /\,/, $Options{frequencybins};
    if (@BinBoundaries < 2) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
    }
    for $BinBoundary (@BinBoundaries) {
      unless (IsNumerical($BinBoundary)) {
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
      }
    }
    # Each value must be smaller than all the values following it...
    for $LowerIndex (0 .. $#BinBoundaries) {
      for $UpperIndex (($LowerIndex + 1) .. $#BinBoundaries) {
        if ($BinBoundaries[$LowerIndex] >= $BinBoundaries[$UpperIndex]) {
          die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
        }
      }
    }
    push @{$OptionsInfo{BinRange}}, @BinBoundaries;
  }

  # Setup specified data field labels; the list stays empty for All and Common values...
  @{$OptionsInfo{SpecifiedDataLabels}} = ();
  if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) {
    push @{$OptionsInfo{SpecifiedDataLabels}}, split(",", $Options{datafields});
  }

  # Setup specified data field label pairs; the list stays empty for AllPairs
  # and CommonPairs values...
  my($PairsValue, @PairLabels);

  $PairsValue = $Options{datafieldpairs};

  @{$OptionsInfo{SpecifiedDataLabelPairs}} = ();
  $OptionsInfo{AllDataLabelPairs} = (defined($PairsValue) && $PairsValue =~ /^AllPairs$/i) ? 1 : 0;
  $OptionsInfo{CommonDataLabelPairs} = (defined($PairsValue) && $PairsValue =~ /^CommonPairs$/i) ? 1 : 0;

  if (defined($PairsValue) && !$OptionsInfo{AllDataLabelPairs} && !$OptionsInfo{CommonDataLabelPairs}) {
    @PairLabels = split ",", $PairsValue;
    if (@PairLabels % 2) {
      die "Error: Invalid number of values specified using \"--datafieldpairs\" option: It must contain even number of values.\n";
    }
    push @{$OptionsInfo{SpecifiedDataLabelPairs}}, @PairLabels;
  }

}
|
|
998
|
|
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets the file-scoped %Options hash to the default option values, lets
# GetOptions update it from the command line and validates the values of the
# detail, outdelim, quote, precision, klargest, ksmallest and trimfraction
# options. Dies with an error message on the first problem found.
#
# When "-w, --workingdir" is specified, the script changes into that directory
# before the rest of the values are validated. Takes no arguments.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = (
    detail => 0,
    datafields => "Common",
    datafieldpairs => "CommonPairs",
    frequencybins => 10,
    klargest => 2,
    ksmallest => 2,
    mode => "DescriptiveStatisticsBasic",
    outdelim => "comma",
    precision => 2,
    quote => "yes",
    trimfraction => 0.1,
  );

  if (!GetOptions(\%Options, "datafields=s", "datafieldpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # The "=i" option type lets a negative integer through; reject it here to
  # enforce the range stated in the error message. The comparison is reached
  # only after the value has passed the integer check...
  if (!IsInteger($Options{detail}) || $Options{detail} < 0) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: >= 0\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{klargest})) {
    die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{ksmallest})) {
    die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n";
  }

  # A value failing the float check and a value outside the allowed range are
  # reported using the same message; the range is checked only after the value
  # has passed the float check...
  if (!IsFloat($Options{trimfraction}) || $Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
    die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
  }
}
|
|
1052
|
|
1053 __END__
|
|
1054
|
|
1055 =head1 NAME
|
|
1056
|
|
1057 AnalyzeSDFilesData.pl - Analyze numerical data field values in SDFile(s)
|
|
1058
|
|
1059 =head1 SYNOPSIS
|
|
1060
|
|
1061 AnalyzeSDFilesData.pl SDFile(s)...
|
|
1062
|
|
1063 AnalyzeSDFilesData.pl [B<--datafields> "fieldlabel,[fieldlabel,...]" | Common | All]
|
|
1064 [B<--datafieldpairs> "fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]" | CommonPairs | AllPairs] [B<-d, --detail> infolevel]
|
|
1065 [B<-f, --fast>] [B<--frequencybins> number | "number,number,[number,...]"]
|
|
1066 [B<-h, --help>] [B<--klargest> number] [B<--ksmallest> number]
|
|
1067 [B<-m, --mode> DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]"]
|
|
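1068 [B<-o, --overwrite>] [B<--outdelim> comma | tab | semicolon] [B<-p, --precision> number] [B<-q, --quote> yes | no] [B<-r, --root> rootname] [B<--trimfraction> number] [B<-w, --workingdir> dirname] SDFile(s)...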
|
|
1069
|
|
1070 =head1 DESCRIPTION
|
|
1071
|
|
1072 Analyze numerical data field values in I<SDFile(s)> using a combination of various statistical
|
|
1073 functions; Non-numerical values are simply ignored. For I<Correlation, RSquare, and
|
|
1074 Covariance> analysis, the count of valid values in specified data field pairs must be same;
|
|
1075 otherwise, the data field pair is ignored. The file names are separated by spaces. The valid file
|
|
1076 extensions are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in a
|
|
1077 current directory can be specified either by I<*.sdf> or the current directory name.
|
|
1078
|
|
1079 =head1 OPTIONS
|
|
1080
|
|
1081 =over 4
|
|
1082
|
|
1083 =item B<--datafields> I<"fieldlabel,[fieldlabel,...]" | Common | All>
|
|
1084
|
|
1085 Data fields to use for analysis. Possible values: list of comma separated data field
|
|
1086 labels, data fields common to all records, or all data fields. Default value: I<Common>.
|
|
1087 Examples:
|
|
1088
|
|
1089 ALogP,MolWeight,EC50
|
|
1090 "MolWeight,PSA"
|
|
1091
|
|
1092 =item B<--datafieldpairs> I<"fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]" | CommonPairs | AllPairs>
|
|
1093
|
|
1094 This value is mode specific and is only used for I<Correlation, RSquare, or
|
|
1095 Covariance> value of B<-m, --mode> option. It specifies data field label pairs to use
|
|
1096 for data analysis during I<Correlation>, I<RSquare>, and I<Covariance> calculations. Possible values:
|
|
1097 comma delimited list of data field label pairs, data field label pairs common to all records,
|
|
1098 or all data field pairs. Default value: I<CommonPairs>. Example:
|
|
1099
|
|
1100 MolWeight,EC50,NumN+O,PSA
|
|
1101
|
|
1102 For I<AllPairs> value of B<--datafieldpairs> option, all data field label pairs are used for
|
|
1103 I<Correlation> and I<Covariance> calculations.
|
|
1104
|
|
1105 =item B<-d, --detail> I<infolevel>
|
|
1106
|
|
1107 Level of information to print about column values being ignored. Default: I<0>. Possible values:
|
|
1108 0, 1, 2, 3, or 4.
|
|
1109
|
|
1110 =item B<-f, --fast>
|
|
1111
|
|
1112 In this mode, all the data field values specified for analysis are assumed to contain numerical
|
|
1113 data and no checking is performed before analysis. By default, only numerical data is
|
|
1114 used for analysis; other types of column data is ignored.
|
|
1115
|
|
1116 =item B<--frequencybins> I<number | "number,number,[number,...]">
|
|
1117
|
|
1118 Specify number of bins or bin range to use for frequency analysis. Default value: I<10>
|
|
1119
|
|
1120 Number of bins value along with the smallest and largest value for a column is used to
|
|
1121 group the column values into different groups.
|
|
1122
|
|
1123 The bin range list is used to group values for a column into different groups; It must contain
|
|
1124 values in ascending order. Examples:
|
|
1125
|
|
1126 10,20,30
|
|
1127 0.1,0.2,0.3,0.4,0.5
|
|
1128
|
|
1129 The frequency value calculated for a specific bin corresponds to all the column values
|
|
1130 which are greater than the previous bin value and less than or equal to the current bin value.
|
|
1131
|
|
1132 =item B<-h, --help>
|
|
1133
|
|
1134 Print this help message.
|
|
1135
|
|
1136 =item B<--klargest> I<number>
|
|
1137
|
|
1138 Kth largest value to find by I<KLargest> function. Default value: I<2>. Valid values: positive
|
|
1139 integers.
|
|
1140
|
|
1141 =item B<--ksmallest> I<number>
|
|
1142
|
|
1143 Kth smallest value to find by I<KSmallest> function. Default value: I<2>. Valid values: positive
|
|
1144 integers.
|
|
1145
|
|
1146 =item B<-m, --mode> I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]">
|
|
1147
|
|
1148 Specify how to analyze data in SDFile(s): calculate basic or all descriptive statistics; or
|
|
1149 use a comma delimited list of supported statistical functions. Possible values:
|
|
1150 I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1,[function2]...">. Default
|
|
1151 value: I<DescriptiveStatisticsBasic>
|
|
1152
|
|
1153 I<DescriptiveStatisticsBasic> includes these functions: I<Count, Maximum, Minimum, Mean,
|
|
1154 Median, Sum, StandardDeviation, StandardError, Variance>.
|
|
1155
|
|
1156 I<DescriptiveStatisticsAll>, in addition to I<DescriptiveStatisticsBasic> functions, includes:
|
|
1157 I<GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, Mode, RSquare,
|
|
1158 Skewness, TrimMean>.
|
|
1159
|
|
1160 I<All> uses complete list of supported functions: I<Average, AverageDeviation, Correlation,
|
|
1161 Count, Covariance, GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis,
|
|
1162 Maximum, Minimum, Mean, Median, Mode, RSquare, Skewness, Sum,
|
|
1163 SumOfSquares, StandardDeviation, StandardDeviationN, StandardError, StandardScores,
|
|
1164 StandardScoresN, TrimMean, Variance, VarianceN>. The function names ending with N
|
|
1165 calculate corresponding values assuming an entire population instead of a population sample.
|
|
1166 Here are the formulas for these functions:
|
|
1167
|
|
1168 Average: See Mean
|
|
1169
|
|
1170 AverageDeviation: SUM( ABS(x[i] - Xmean) ) / n
|
|
1171
|
|
1172 Correlation: See Pearson Correlation
|
|
1173
|
|
1174 Covariance: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / n
|
|
1175
|
|
1176 GeometricMean: NthROOT( PRODUCT(x[i]) )
|
|
1177
|
|
1178 HarmonicMean: 1 / ( SUM(1/x[i]) / n )
|
|
1179
|
|
1180 Mean: SUM( x[i] ) / n
|
|
1181
|
|
1182 Median: Xsorted[(n - 1)/2 + 1] for odd values of n; (Xsorted[n/2] + Xsorted[n/2 + 1])/2
|
|
1183 for even values of n.
|
|
1184
|
|
1185 Kurtosis: [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] -
|
|
1186 {3((n - 1)^2)}/{(n - 2)(n-3)}
|
|
1187
|
|
1188 PearsonCorrelation: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )
|
|
1189 (SUM( (y[i] - Ymean)^2 )) )
|
|
1190
|
|
1191 RSquare: PearsonCorrelation^2
|
|
1192
|
|
1193 Skewness: {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 }
|
|
1194
|
|
1195 StandardDeviation: SQRT ( SUM( (x[i] - Mean)^2 ) / (n - 1) )
|
|
1196
|
|
1197 StandardDeviationN: SQRT ( SUM( (x[i] - Mean)^2 ) / n )
|
|
1198
|
|
1199 StandardError: StandardDeviation / SQRT( n )
|
|
1200
|
|
1201 StandardScores: (x[i] - Mean) / StandardDeviation
|
|
1202
|
|
1203 StandardScoresN: (x[i] - Mean) / StandardDeviationN
|
|
1204
|
|
1205 Variance: SUM( (x[i] - Xmean)^2 / (n - 1) )
|
|
1206
|
|
1207 VarianceN: SUM( (x[i] - Xmean)^2 / n )
|
|
1208
|
|
1209 =item B<-o, --overwrite>
|
|
1210
|
|
1211 Overwrite existing files.
|
|
1212
|
|
1213 =item B<--outdelim> I<comma | tab | semicolon>
|
|
1214
|
|
1215 Output text file delimiter. Possible values: I<comma, tab, or semicolon>
|
|
1216 Default value: I<comma>.
|
|
1217
|
|
1218 =item B<-p, --precision> I<number>
|
|
1219
|
|
1220 Precision of calculated values in the output file. Default: up to I<2> decimal places.
|
|
1221 Valid values: positive integers.
|
|
1222
|
|
1223 =item B<-q, --quote> I<yes | no>
|
|
1224
|
|
1225 Put quotes around column values in output text file. Possible values: I<yes or
|
|
1226 no>. Default value: I<yes>.
|
|
1227
|
|
1228 =item B<-r, --root> I<rootname>
|
|
1229
|
|
1230 New text file name is generated using the root: <Root><Mode>.<Ext>. Default new file
|
|
1231 name: <InitialSDFileName><Mode>.<Ext>. Based on the specified analysis,
|
|
1232 <Mode> corresponds to one of these values: DescriptiveStatisticsBasic,
|
|
1233 DescriptiveStatisticsAll, AllStatistics, SpecifiedStatistics, Covariance, Correlation,
|
|
1234 Frequency, or StandardScores. The csv, and tsv <Ext> values are used for
|
|
1235 comma/semicolon, and tab delimited text files respectively. This option is ignored for
|
|
1236 multiple input files.
|
|
1237
|
|
1238 =item B<--trimfraction> I<number>
|
|
1239
|
|
1240 Fraction of data to exclude from the top and bottom of the data set during
|
|
1241 I<TrimMean> calculation. Default value: I<0.1> Valid values: > 0 and < 1.
|
|
1242
|
|
1243 =item B<-w, --workingdir> I<dirname>
|
|
1244
|
|
1245 Location of working directory. Default: current directory.
|
|
1246
|
|
1247 =back
|
|
1248
|
|
1249 =head1 EXAMPLES
|
|
1250
|
|
1251 To calculate basic statistics for data in all common data fields and generate a
|
|
1252 NewSample1DescriptiveStatisticsBasic.csv file, type:
|
|
1253
|
|
1254 % AnalyzeSDFilesData.pl -o -r NewSample1 Sample1.sdf
|
|
1255
|
|
1256 To calculate basic statistics for MolWeight data field and generate a
|
|
1257 NewSample1DescriptiveStatisticsBasic.csv file, type:
|
|
1258
|
|
1259 % AnalyzeSDFilesData.pl --datafields MolWeight -o -r NewSample1
|
|
1260 Sample1.sdf
|
|
1261
|
|
1262 To calculate all available statistics for MolWeight data field and all data field pairs,
|
|
1263 and generate NewSample1DescriptiveStatisticsAll.csv, NewSample1RSquareMatrix.csv,
|
|
1264 and NewSample1MolWeightFrequencyAnalysis.csv
|
|
1265 files, type:
|
|
1266
|
|
1267 % AnalyzeSDFilesData.pl -m DescriptiveStatisticsAll --datafields
|
|
1268 MolWeight -o --datafieldpairs AllPairs -r NewSample1 Sample1.sdf
|
|
1269
|
|
1270 To compute frequency distribution of MolWeight data field into five bins and
|
|
1271 generate NewSample1MolWeightFrequencyAnalysis.csv, type:
|
|
1272
|
|
1273 % AnalyzeSDFilesData.pl -m Frequency --frequencybins 5 --datafields
|
|
1274 MolWeight -o -r NewSample1 Sample1.sdf
|
|
1275
|
|
1276 To compute frequency distribution of data in MolWeight data field into specified bin range
|
|
1277 values, and generate NewSample1MolWeightFrequencyAnalysis.csv, type:
|
|
1278
|
|
1279 % AnalyzeSDFilesData.pl -m Frequency --frequencybins "100,200,400"
|
|
1280 --datafields MolWeight -o -r NewSample1 Sample1.sdf
|
|
1281
|
|
1282 To calculate all available statistics for data in all data fields and pairs, type:
|
|
1283
|
|
1284 % AnalyzeSDFilesData.pl -m All --datafields All --datafieldpairs
|
|
1285 AllPairs -o -r NewSample1 Sample1.sdf
|
|
1286
|
|
1287 =head1 AUTHOR
|
|
1288
|
|
1289 Manish Sud <msud@san.rr.com>
|
|
1290
|
|
1291 =head1 SEE ALSO
|
|
1292
|
|
1293 FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl
|
|
1294
|
|
1295 =head1 COPYRIGHT
|
|
1296
|
|
1297 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
1298
|
|
1299 This file is part of MayaChemTools.
|
|
1300
|
|
1301 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
1302 the terms of the GNU Lesser General Public License as published by the Free
|
|
1303 Software Foundation; either version 3 of the License, or (at your option)
|
|
1304 any later version.
|
|
1305
|
|
1306 =cut
|