0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: AnalyzeTextFilesData.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:04 $
|
|
5 # $Revision: 1.36 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability of fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Text::ParseWords;
|
|
34 use Benchmark;
|
|
35 use FileUtil;
|
|
36 use TextUtil;
|
|
37 use StatisticsUtil;
|
|
38
|
|
# Script-level variables. They are declared at file scope so that the
# subroutines defined further down in this file can use them.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";

# Direct method call; the indirect object form "new Benchmark" is ambiguous
# to the parser and is avoided.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect column information for all the text files...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# Generate output files...
my($FileIndex);
print "\nProcessing text files...\n" if (@TextFilesList > 1);

for $FileIndex (0 .. $#TextFilesList) {
  # Only files whose FileOkay flag is set are analyzed...
  next unless $TextFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  AnalyzeTextFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
84
|
|
85 ###############################################################################
|
|
86
|
|
# Read the data for all the columns to be analyzed from one text file and
# pass it on to the analysis routines selected through the options.
#
# Arguments:
#   $Index - Position of the text file in @TextFilesList; the same position is
#            used to look up the per-file entries in %TextFilesInfo.
#
# The first line of the file is skipped as the column labels line. When
# $OptionsInfo{CheckData} is set, values failing IsNumerical are not collected
# and the lines containing them are reported according to
# $OptionsInfo{DetailLevel}.
#
# Dies when the text file can't be opened.
sub AnalyzeTextFile {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $Line, $InDelim, $ColNum, $Value, @LineWords, @ColNumsToAnalyze, %ColValuesToAnalyzeMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColNumsToAnalyze = @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]};

  # Start out with an empty list of values for each column...
  %ColValuesToAnalyzeMap = ();
  for $ColNum (@ColNumsToAnalyze) {
    $ColValuesToAnalyzeMap{$ColNum} = [];
  }

  my($LineCount, $InvalidLineCount, @InvalidColLabels);

  # Three-argument open with a lexical handle: special characters in the file
  # name are never interpreted as an open mode or as a command...
  open($TextFileHandle, "<", $TextFile) or die "Error: Can't open $TextFile: $! \n";

  # Skip over column labels line in text file and collect appropriate column data
  # for analysis...
  $Line = GetTextLine($TextFileHandle);
  $LineCount = 1;
  $InvalidLineCount = 0;

  # Test for a defined, non-empty line instead of truth: a data line consisting
  # of just "0" is false in boolean context and would end the loop too early...
  while (defined($Line = GetTextLine($TextFileHandle)) && length($Line)) {
    $LineCount++;
    @LineWords = quotewords($InDelim, 0, $Line);
    @InvalidColLabels = ();

    for $ColNum (@ColNumsToAnalyze) {
      $Value = $LineWords[$ColNum];
      if ($OptionsInfo{CheckData} && !IsNumerical($Value)) {
        # Remember the label for reporting and leave the value out...
        push @InvalidColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
        next;
      }
      push @{$ColValuesToAnalyzeMap{$ColNum}}, $Value;
    }

    next unless @InvalidColLabels;

    # Report the line; higher detail levels print more information...
    $InvalidLineCount++;
    if ($OptionsInfo{DetailLevel} >= 4) {
      print "Line number $LineCount contains ", scalar(@InvalidColLabels), " non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0), " - to be analyzed: $Line \n";
    }
    elsif ($OptionsInfo{DetailLevel} >= 3) {
      print "Line number $LineCount contains ", scalar(@InvalidColLabels), " non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0), " - to be analyzed...\n";
    }
    elsif ($OptionsInfo{DetailLevel} >= 2) {
      print "Line number $LineCount contains ", scalar(@InvalidColLabels), " non-numerical or empty value(s) for columns to be analyzed...\n";
    }
  }
  if ($InvalidLineCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidLineCount line(s)...\n";
  }
  close $TextFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction, $FunctionsMapRef);
  $FunctionsMapRef = $OptionsInfo{SpecifiedStatisticalFunctionsMap};

  # Functions calculating a single value per column are handled together; the
  # ones matched below are written to their own files further down...
  @SpecifiedFunctionNames = ();
  for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $FunctionsMapRef->{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%ColValuesToAnalyzeMap);
  }

  if (exists($FunctionsMapRef->{covariance}) || exists($FunctionsMapRef->{correlation}) || exists($FunctionsMapRef->{rsquare})) {
    if ($OptionsInfo{AllColumnPairs}) {
      PerformMatrixAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformColumnPairAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
  }
  if (exists($FunctionsMapRef->{standardscores}) || exists($FunctionsMapRef->{standardscoresn})) {
    PerformStandardScoresAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
  if (exists($FunctionsMapRef->{frequency})) {
    PerformFrequencyAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
}
|
|
169
|
|
# Calculate values for the specified statistical functions and write them to a
# new text file: one row per analyzed column and one column per function.
#
# Arguments:
#   $Index                     - Position of the text file in the per-file
#                                arrays of %TextFilesInfo.
#   $SpecifiedFunctionNamesRef - Reference to an array of function names; each
#                                name, other than Count, is called as a
#                                function by that name.
#   $ColValuesToAnalyzeMapRef  - Reference to a hash mapping a column number to
#                                a reference to an array of its data values.
#
# An empty value is written for every function of a column without any data
# values and for any function returning an undefined or empty value.
#
# Dies when the output file can't be opened.
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, $Line, $SpecifiedFunction, $Label, @ColLabels, @ColNumsToAnalyze);

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $TextFilesInfo{OutFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "ColumnID";
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # Prefix the label with the result of AddNumberSuffix for the k value
      # and take out every "K" from it...
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each column to be analyzed...
  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};

  # Statistical functions are invoked using function name strings, which needs
  # symbolic references; all other strictures stay turned on.
  no strict 'refs';

  my($ColValuesRef, $ColNum, $Value, @RowValues, %CalculatedValues);

  # Standard deviation is cached for each column as it's used both by
  # StandardDeviation and StandardError...
  %CalculatedValues = ();
  for $ColNum (@ColNumsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum];
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};

    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!@{$ColValuesRef}) {
        # No data values for the column: write out an empty value...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = scalar(@{$ColValuesRef});
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = &$SpecifiedFunction($ColValuesRef, $OptionsInfo{KLargest});
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = &$SpecifiedFunction($ColValuesRef, $OptionsInfo{KSmallest});
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
          $CalculatedValues{$ColNum}{StandardDeviation} = &$SpecifiedFunction($ColValuesRef);
        }
        $Value = $CalculatedValues{$ColNum}{StandardDeviation};
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
          $CalculatedValues{$ColNum}{StandardDeviation} = StandardDeviation($ColValuesRef);
        }
        # An undefined standard deviation leads to an empty value...
        $Value = $CalculatedValues{$ColNum}{StandardDeviation};
        if (defined $Value) {
          # Pass the number of values and not the values themselves: an array
          # in an argument list is flattened, which would hand the first data
          # value to the function as its second argument.
          # NOTE(review): assumes StandardError takes (StandardDeviation, Count) - confirm.
          $Value = &$SpecifiedFunction($CalculatedValues{$ColNum}{StandardDeviation}, scalar(@{$ColValuesRef}));
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = &$SpecifiedFunction($ColValuesRef, $OptionsInfo{TrimFraction});
      }
      else {
        $Value = &$SpecifiedFunction($ColValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  close $NewTextFileHandle;
}
|
|
262
|
|
# Calculate correlation, rsquare, and covariance for the specified column pairs
# and write out the requested values, one row per column pair, to a single new
# text file.
#
# Arguments:
#   $Index                    - Position of the text file in the per-file
#                               arrays of %TextFilesInfo.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping a column number to
#                               a reference to an array of its data values.
#
# A Correlation column is written out for rsquare as well. Column pairs with
# different numbers of data values are skipped with a warning and get empty
# values.
#
# Dies when the output file can't be opened.
sub PerformColumnPairAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "ColumnPairsAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";

  # Three-argument open with a lexical handle: special characters in the file
  # name are never interpreted as part of the open mode...
  open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

  # Write out the column labels...
  my(@ValueLabels);
  @ValueLabels = ();
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @ValueLabels, "Correlation";
    push @ValueLabels, "RSquare" if ($CalculateRSquare);
  }
  push @ValueLabels, "Covariance" if ($CalculateCovariance);

  @ColLabels = ("ColumnID1", "ColumnID2", @ValueLabels);
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Format a calculated value using the specified precision; zero is added to
  # get rid of trailing zeros. Undefined or empty values stay empty...
  my($FormatValue);
  $FormatValue = sub {
    my($CalculatedValue) = @_;
    return (defined($CalculatedValue) && length($CalculatedValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CalculatedValue) + 0) : "";
  };

  # Go over each column pair...
  my($CorrelationValue, $RSquareValue, $ColIndex, $ColNum1, $ColNum2, $ColLabel1, $ColLabel2, $ColValuesRef1, $ColValuesRef2, @ColPairs1ToAnalyze, @ColPairs2ToAnalyze, @RowValues);

  @ColPairs1ToAnalyze = @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]};
  @ColPairs2ToAnalyze = @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]};
  for $ColIndex (0 .. $#ColPairs1ToAnalyze) {
    $ColNum1 = $ColPairs1ToAnalyze[$ColIndex];
    $ColNum2 = $ColPairs2ToAnalyze[$ColIndex];
    $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
    $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};

    # Setup column ids...
    $ColLabel1 = $TextFilesInfo{ColLabels}[$Index][$ColNum1];
    $ColLabel2 = $TextFilesInfo{ColLabels}[$Index][$ColNum2];
    @RowValues = ($ColLabel1, $ColLabel2);

    if (@$ColValuesRef1 != @$ColValuesRef2) {
      # Print a warning and write out an empty value for each value column...
      warn "Warning: Skipping analysis for column pair $ColLabel1, $ColLabel2: Number of valid data values must be same.\n";
      push @RowValues, ("") x scalar(@ValueLabels);
    }
    else {
      # Calculate appropriate value...
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2);
        push @RowValues, $FormatValue->($CorrelationValue);
        if ($CalculateRSquare) {
          # RSquare is the square of the unrounded correlation value...
          $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
          push @RowValues, $FormatValue->($RSquareValue);
        }
      }
      if ($CalculateCovariance) {
        push @RowValues, $FormatValue->(Covariance($ColValuesRef1, $ColValuesRef2));
      }
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  close $NewTextFileHandle;
}
|
|
342
|
|
# Generate histogram numbers: for each column to be analyzed, write out a new
# text file containing the "Bins" and "Frequency" values returned by Frequency.
#
# Arguments:
#   $Index                    - Position of the text file in the per-file
#                               arrays of %TextFilesInfo.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping a column number to
#                               a reference to an array of its data values.
#
# The name of each output file contains the label of its column. Frequency is
# called with $OptionsInfo{BinRange}, when it contains any values, or with
# $OptionsInfo{NumOfBins}. Only the column labels line is written out for a
# column without any data values.
#
# Dies when an output file can't be opened.
sub PerformFrequencyAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, @RowValues, $Line, $ColNum, @ColNumsToAnalyze, $ColValuesRef, $BinValue, $FrequencyValue, %FrequencyMap);

  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};
  for $ColNum (@ColNumsToAnalyze) {
    $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
    print "Generating new text file $NewTextFile...\n";

    # Three-argument open with a lexical handle: special characters in the
    # file name are never interpreted as part of the open mode...
    open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";

    # Calculate frequency values...
    %FrequencyMap = ();
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    if (@$ColValuesRef) {
      if (@{$OptionsInfo{BinRange}}) {
        %FrequencyMap = Frequency($ColValuesRef, \@{$OptionsInfo{BinRange}});
      }
      else {
        %FrequencyMap = Frequency($ColValuesRef, $OptionsInfo{NumOfBins});
      }
    }

    # Write out frequency values sorted numerically by bin values. Zero is
    # added to the formatted values to get rid of trailing zeros...
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      @RowValues = ();
      push @RowValues, (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
      push @RowValues, (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";

      $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileHandle "$Line\n";
    }
    close $NewTextFileHandle;
  }
}
|
|
386
|
|
# Calculate correlation, rsquare, and covariance matrices for all the columns
# in a text file and write each requested matrix to its own new text file.
#
# Arguments:
#   $Index                    - Position of the text file in the per-file
#                               arrays of %TextFilesInfo.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping a column number to
#                               a reference to an array of its data values.
#
# The correlation matrix file is generated for rsquare as well. The first line
# and the first column of each matrix file contain the column labels.
#
# Dies when an output file can't be opened.
sub PerformMatrixAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance, $WriteCorrelation);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  # Correlation values are needed, and written out, for rsquare as well...
  $WriteCorrelation = ($CalculateCorrelation || $CalculateRSquare) ? 1 : 0;

  $CorrelationTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CorrelationMatrix." . $TextFilesInfo{OutFileExt}[$Index];
  $RSquareTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "RSquareMatrix." . $TextFilesInfo{OutFileExt}[$Index];
  $CovarianceTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CovarianceMatrix." . $TextFilesInfo{OutFileExt}[$Index];

  # List each file being generated by its own name in the message...
  my(@NewTextFiles, $NewTextFilesList);
  @NewTextFiles = ();
  push @NewTextFiles, $CorrelationTextFile if ($WriteCorrelation);
  push @NewTextFiles, $RSquareTextFile if ($CalculateRSquare);
  push @NewTextFiles, $CovarianceTextFile if ($CalculateCovariance);
  $NewTextFilesList = join(", ", @NewTextFiles);
  if (@NewTextFiles > 1) {
    print "Generating new text files $NewTextFilesList...\n";
  }
  else {
    print "Generating new text file $NewTextFilesList...\n";
  }

  # Three-argument opens with lexical handles: special characters in the file
  # names are never interpreted as part of the open mode...
  my($CorrelationFileHandle, $RSquareFileHandle, $CovarianceFileHandle);
  if ($WriteCorrelation) {
    open($CorrelationFileHandle, ">", $CorrelationTextFile) or die "Error: Can't open $CorrelationTextFile: $! \n";
  }
  if ($CalculateRSquare) {
    open($RSquareFileHandle, ">", $RSquareTextFile) or die "Error: Can't open $RSquareTextFile: $! \n";
  }
  if ($CalculateCovariance) {
    open($CovarianceFileHandle, ">", $CovarianceTextFile) or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $LastColNum, $ValueFormat, $HasCorrelation, $CorrelationValue, $RSquareValue, $CovarianceValue, $ColNum, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);

  $LastColNum = $TextFilesInfo{ColCount}[$Index] - 1;
  $ValueFormat = "%.$OptionsInfo{Precision}f";

  # Write out the column labels; the first label, for the column containing
  # the row labels, is empty...
  @ColLabels = ("");
  for $ColNum (0 .. $LastColNum) {
    push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $CorrelationFileHandle "$Line\n" if ($WriteCorrelation);
  print $RSquareFileHandle "$Line\n" if ($CalculateRSquare);
  print $CovarianceFileHandle "$Line\n" if ($CalculateCovariance);

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap);

  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
  for $ColNum1 (0 .. $LastColNum) {
    for $ColNum2 (0 .. $ColNum1) {
      $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
      $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};

      if ($WriteCorrelation) {
        $CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2);
        $HasCorrelation = (defined($CorrelationValue) && length($CorrelationValue)) ? 1 : 0;

        if ($CalculateRSquare) {
          # Square the correlation value before it's rounded for output to
          # avoid compounding the rounding error...
          $RSquareValue = $HasCorrelation ? (sprintf($ValueFormat, $CorrelationValue ** 2) + 0) : "";
          $RSquareMatrixMap{$ColNum1}{$ColNum2} = $RSquareValue;
          $RSquareMatrixMap{$ColNum2}{$ColNum1} = $RSquareValue;
        }

        # Format the value. And add zero to get rid of trailing zeros...
        $CorrelationValue = $HasCorrelation ? (sprintf($ValueFormat, $CorrelationValue) + 0) : "";
        $CorrelationMatrixMap{$ColNum1}{$ColNum2} = $CorrelationValue;
        $CorrelationMatrixMap{$ColNum2}{$ColNum1} = $CorrelationValue;
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2);
        $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf($ValueFormat, $CovarianceValue) + 0) : "";
        $CovarianceMatrixMap{$ColNum1}{$ColNum2} = $CovarianceValue;
        $CovarianceMatrixMap{$ColNum2}{$ColNum1} = $CovarianceValue;
      }
    }
  }

  # Write out the matrices; each row starts with its column label...
  for $ColNum1 (0 .. $LastColNum) {
    @CorrelationRowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1]);
    @RSquareRowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1]);
    @CovarianceRowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1]);

    for $ColNum2 (0 .. $LastColNum) {
      push @CorrelationRowValues, $CorrelationMatrixMap{$ColNum1}{$ColNum2} if ($WriteCorrelation);
      push @RSquareRowValues, $RSquareMatrixMap{$ColNum1}{$ColNum2} if ($CalculateRSquare);
      push @CovarianceRowValues, $CovarianceMatrixMap{$ColNum1}{$ColNum2} if ($CalculateCovariance);
    }

    if ($WriteCorrelation) {
      $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CorrelationFileHandle "$Line\n";
    }
    if ($CalculateRSquare) {
      $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $RSquareFileHandle "$Line\n";
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CovarianceFileHandle "$Line\n";
    }
  }

  close $CorrelationFileHandle if ($WriteCorrelation);
  close $RSquareFileHandle if ($CalculateRSquare);
  close $CovarianceFileHandle if ($CalculateCovariance);
}
|
|
531
|
|
# Calculate standard scores for each column to be analyzed and write them out,
# one row per data line of the text file, to a new text file.
#
# Arguments:
#   $Index                    - Position of the text file in @TextFilesList and
#                               in the per-file arrays of %TextFilesInfo.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping a column number to
#                               a reference to an array of its data values.
#
# A score is calculated as (value - mean) / deviation, where deviation is the
# value returned by StandardDeviation for StandardScores and by
# StandardDeviationN for StandardScoresN. An empty score is written out for a
# value failing the data check and for a column whose mean is undefined or
# whose deviation is undefined or zero.
#
# Dies when the input or the output file can't be opened.
sub PerformStandardScoresAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, $NewTextFileHandle, @ColLabels, $Label, $NewLine);

  $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "StandardScores." . $TextFilesInfo{OutFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";

  # Three-argument open with a lexical handle: special characters in the file
  # name are never interpreted as part of the open mode...
  open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

  my($ColValuesRef, $ColNum, @ColNumsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};
  for $ColNum (@ColNumsToAnalyze) {
    $Label = $TextFilesInfo{ColLabels}[$Index][$ColNum];
    if ($StandardScores) {
      push @ColLabels, "${Label}\(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${Label}\(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values. The exists checks make sure the values are calculated
  # only once for a column specified multiple times...
  my(%StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $ColNum (@ColNumsToAnalyze) {
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    if (!exists($MeanMap{$ColNum})) {
      $MeanMap{$ColNum} = Mean($ColValuesRef);
    }
    if ($StandardScores && !exists($StandardDeviationMap{$ColNum})) {
      $StandardDeviationMap{$ColNum} = StandardDeviation($ColValuesRef);
    }
    if ($StandardScoresN && !exists($StandardDeviationNMap{$ColNum})) {
      $StandardDeviationNMap{$ColNum} = StandardDeviationN($ColValuesRef);
    }
  }

  # Calculate and format a score. An undefined mean or an undefined or zero
  # deviation leads to an empty score instead of a fatal division by zero in
  # the middle of writing out the file. Zero is added to the formatted score
  # to get rid of trailing zeros...
  my($CalculateScore);
  $CalculateScore = sub {
    my($DataValue, $Mean, $Deviation) = @_;
    if (!(defined($Mean) && defined($Deviation) && $Deviation != 0)) {
      return "";
    }
    return sprintf("%.$OptionsInfo{Precision}f", ($DataValue - $Mean)/$Deviation) + 0;
  };

  # Go over each row and calculate standard scores for each column using
  # (x[i] - mean) / deviation; write out the calculated values as well...
  my($TextFile, $TextFileHandle, $InDelim, $Line, $Value, $ValueOkay, $ScoreValue, @RowValues, @LineWords);
  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];

  open($TextFileHandle, "<", $TextFile) or die "Error: Can't open $TextFile: $! \n";

  # Skip over column labels line...
  $Line = GetTextLine($TextFileHandle);

  # Test for a defined, non-empty line instead of truth: a data line consisting
  # of just "0" is false in boolean context and would end the loop too early...
  while (defined($Line = GetTextLine($TextFileHandle)) && length($Line)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    @RowValues = ();
    for $ColNum (@ColNumsToAnalyze) {
      $Value = $LineWords[$ColNum];
      $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;
      if ($StandardScores) {
        $ScoreValue = $ValueOkay ? $CalculateScore->($Value, $MeanMap{$ColNum}, $StandardDeviationMap{$ColNum}) : "";
        push @RowValues, $ScoreValue;
      }
      if ($StandardScoresN) {
        $ScoreValue = $ValueOkay ? $CalculateScore->($Value, $MeanMap{$ColNum}, $StandardDeviationNMap{$ColNum}) : "";
        push @RowValues, $ScoreValue;
      }
    }
    $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$NewLine\n";
  }
  close $TextFileHandle;
  close $NewTextFileHandle;
}
|
|
616
|
|
617 # Make sure the specified columns exists in text files...
|
|
618 sub ProcessColumnsInfo {
|
|
619 my($Index, $TextFile, $ColNum, $NewColNum, $ColIndex, @ColNumsToAnalyze, %UniqueColNumsToAnalyzeMap);
|
|
620
|
|
621 @{$TextFilesInfo{ColNumsToAnalyze}} = ();
|
|
622 @{$TextFilesInfo{ColPairs1ToAnalyze}} = ();
|
|
623 @{$TextFilesInfo{ColPairs2ToAnalyze}} = ();
|
|
624 @{$TextFilesInfo{UniqueColNumsToAnalyze}} = ();
|
|
625
|
|
626 FILELIST: for $Index (0 .. $#TextFilesList) {
|
|
627 $TextFile = $TextFilesList[$Index];
|
|
628
|
|
629 @{$TextFilesInfo{ColNumsToAnalyze}[$Index]} = ();
|
|
630 @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]} = ();
|
|
631 @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]} = ();
|
|
632 @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]} = ();
|
|
633
|
|
634 %UniqueColNumsToAnalyzeMap = ();
|
|
635
|
|
636 if ($TextFilesInfo{FileOkay}[$Index]) {
|
|
637 @ColNumsToAnalyze = ();
|
|
638 if (@{$OptionsInfo{SpecifiedColumns}}) {
|
|
639 if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
|
|
640 for $ColNum (@{$OptionsInfo{SpecifiedColumns}}) {
|
|
641 if ($ColNum >=1 && $ColNum <= $TextFilesInfo{ColCount}[$Index]) {
|
|
642 $NewColNum = $ColNum -1;
|
|
643 push @ColNumsToAnalyze, $NewColNum;
|
|
644 }
|
|
645 }
|
|
646 }
|
|
647 else {
|
|
648 my($ColLabel);
|
|
649 for $ColLabel (@{$OptionsInfo{SpecifiedColumns}}) {
|
|
650 if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
|
|
651 push @ColNumsToAnalyze, $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
|
|
652 }
|
|
653 }
|
|
654 }
|
|
655 }
|
|
656 elsif (defined $OptionsInfo{Columns} && $OptionsInfo{Columns} =~ /^All$/i) {
|
|
657 for $ColNum (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) {
|
|
658 push @ColNumsToAnalyze, $ColNum;
|
|
659 }
|
|
660 }
|
|
661 else {
|
|
662 push @ColNumsToAnalyze, 0;
|
|
663 }
|
|
664 if (@ColNumsToAnalyze) {
|
|
665 push @{$TextFilesInfo{ColNumsToAnalyze}[$Index]}, @ColNumsToAnalyze;
|
|
666 # Set up unique columns map as well...
|
|
667 for $ColNum (@ColNumsToAnalyze) {
|
|
668 if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) {
|
|
669 $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum;
|
|
670 }
|
|
671 }
|
|
672 }
|
|
673 else {
|
|
674 warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist.\n";
|
|
675 $TextFilesInfo{FileOkay}[$Index] = 0;
|
|
676 next FILELIST;
|
|
677 }
|
|
678 if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
|
|
679 # Make sure specific frequency files don't exist...
|
|
680 my($FrequencyFile);
|
|
681 for $ColNum (@ColNumsToAnalyze) {
|
|
682 $FrequencyFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
|
|
683 if (-e $FrequencyFile) {
|
|
684 warn "Warning: Ignoring file $TextFile: The file $FrequencyFile already exists.\n";
|
|
685 $TextFilesInfo{FileOkay}[$Index] = 0;
|
|
686 next FILELIST;
|
|
687 }
|
|
688 }
|
|
689 }
|
|
690 # Setup specified column pairs...
|
|
691 if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) {
|
|
692 my(@ColPairsToAnalyze, $ColNum1, $ColNum2);
|
|
693 if (@{$OptionsInfo{SpecifiedColumnPairs}}) {
|
|
694 # Make sure both columns exist...
|
|
695 if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
|
|
696 for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) {
|
|
697 $ColNum1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex];
|
|
698 $ColNum2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1];
|
|
699 if ($ColNum1 >=1 && $ColNum1 <= $TextFilesInfo{ColCount}[$Index] && $ColNum2 >=1 && $ColNum2 <= $TextFilesInfo{ColCount}[$Index]) {
|
|
700 $ColNum1 -= 1;
|
|
701 $ColNum2 -= 1;
|
|
702 push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
|
|
703 }
|
|
704 }
|
|
705 }
|
|
706 else {
|
|
707 my($ColLabel1, $ColLabel2);
|
|
708 for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) {
|
|
709 $ColLabel1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex];
|
|
710 $ColLabel2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1];
|
|
711 if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1}) && exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2})) {
|
|
712 $ColNum1 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1};
|
|
713 $ColNum2 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2};
|
|
714 push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
|
|
715 }
|
|
716 }
|
|
717 }
|
|
718 }
|
|
719 elsif ($OptionsInfo{AllColumnPairs}) {
|
|
720 for $ColNum1 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) {
|
|
721 for $ColNum2 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) {
|
|
722 push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
|
|
723 }
|
|
724 }
|
|
725 }
|
|
726 else {
|
|
727 if ($TextFilesInfo{ColCount}[$Index] >= 2) {
|
|
728 push @ColPairsToAnalyze, (0,1);
|
|
729 }
|
|
730 }
|
|
731 if (@ColPairsToAnalyze) {
|
|
732 if (@ColPairsToAnalyze % 2) {
|
|
733 warn "Warning: Ignoring file $TextFile: Invalid number values specified using \"--columnpairs\" option: It must contain even number of valid values.\n";
|
|
734 $TextFilesInfo{FileOkay}[$Index] = 0;
|
|
735 next FILELIST;
|
|
736 }
|
|
737 else {
|
|
738 for ($ColIndex = 0; $ColIndex < @ColPairsToAnalyze; $ColIndex += 2) {
|
|
739 push @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex];
|
|
740 push @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex + 1];
|
|
741 }
|
|
742 # Set up unique columns map as well...
|
|
743 for $ColNum (@ColPairsToAnalyze) {
|
|
744 if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) {
|
|
745 $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum;
|
|
746 }
|
|
747 }
|
|
748 }
|
|
749 }
|
|
750 }
|
|
751 # Setup uniques columns array...
|
|
752 push @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]}, (sort keys %UniqueColNumsToAnalyzeMap);
|
|
753 }
|
|
754 }
|
|
755 }
|
|
756
|
|
# Retrieve information about input text files...
#
# Rebuilds the file-level %TextFilesInfo hash from scratch. For each file in
# @TextFilesList, these per-file entries (indexed by position in the list) are
# set: FileOkay, ColCount, ColLabels, ColLabelToNumMap, InDelim, OutFileRoot and
# OutFileExt. A file failing any check is reported using warn and is left with
# FileOkay set to 0 and empty defaults for the other entries.
#
# Takes no arguments; reads %Options, %OptionsInfo and @TextFilesList.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $OutFile, $OutFileExt, $ColNum, $ColLabel);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFileRoot}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start out with the file marked as not okay; it's flipped to okay only
    # after all the checks below have passed...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFileRoot}[$Index] = "";
    $TextFilesInfo{OutFileExt}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Input delimiter: tab for tsv files; otherwise, comma or semicolon as
    # specified using "--indelim" option...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($Options{indelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Use three-argument open with an explicit read mode so that any special
    # characters in the file name are never interpreted as a mode or a pipe...
    if (!open(TEXTFILE, "<", $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # Only the first line is needed: it supplies the column labels...
    $Line = GetTextLine(\*TEXTFILE);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close TEXTFILE;

    # Setup output file root and extension; the extension is determined by
    # "--outdelim" option and not by the input file...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    $FileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $FileExt = "tsv";
    }
    $OutFileExt = $FileExt;
    if ($Options{root} && (@TextFilesList == 1)) {
      # "--root" is used only for a single input file; any extension present
      # in the specified root name is dropped...
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $Options{root};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
    }
    $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";

    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite}) {
      # Make sure none of the output files to be generated already exist...
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }
      if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
        if ($OptionsInfo{AllColumnPairs}) {
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) && (-e "${OutFileRoot}CovarianceMatrix.${FileExt}")) {
            # Report the same file name which was tested above...
            warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}CovarianceMatrix.${FileExt} already exists.\n";
            next FILELIST;
          }
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) && (-e "${OutFileRoot}CorrelationMatrix.${FileExt}")) {
            warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}CorrelationMatrix.${FileExt} already exists.\n";
            next FILELIST;
          }
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) && (-e "${OutFileRoot}RSquareMatrix.${FileExt}")) {
            warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}RSquareMatrix.${FileExt} already exists.\n";
            next FILELIST;
          }
        }
        else {
          if (-e "${OutFileRoot}ColumnPairsAnalysis.${FileExt}") {
            warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}ColumnPairsAnalysis.${FileExt} already exists.\n";
            next FILELIST;
          }
        }
      }
      if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) && (-e "${OutFileRoot}StandardScores.${FileExt}")) {
        warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}StandardScores.${FileExt} already exists.\n";
        next FILELIST;
      }
    }

    # All checks passed: record file information...
    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutFileRoot}[$Index] = "$OutFileRoot";
    $TextFilesInfo{OutFileExt}[$Index] = "$OutFileExt";

    # Column labels along with a label to zero-based column number map...
    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
}
|
|
887
|
|
# Process option values...
#
# Rebuilds the file-level %OptionsInfo hash from the raw command line values
# in %Options: the list and map of statistical functions to use, output file
# name mode, output delimiter and quoting, frequency bins, and the specified
# columns and column pairs. Dies with an error message for any invalid
# "--mode", "--frequencybins", "--columns" or "--columnpairs" value.
#
# Takes no arguments.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{DetailLevel} = $Options{detail};

  # Setup supported statistical functions...
  my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap);
  %SupportedStatisticaFunctionsMap = ();
  @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);

  # Map lower case function names to their canonical names for case
  # insensitive lookup...
  for $SupportedFunction (@SupportedStatisticaFunctions) {
    $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction;
  }

  # Setup a list of functions to use for analysis...
  my($SpecifiedFunction);
  %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = ();
  @{$OptionsInfo{SpecifiedStatisticalFunctions}} = ();
  # Check mode values...
  if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
  }
  elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum);
  }
  elsif ($Options{mode} =~ /^All$/i ) {
    $OptionsInfo{FileNameMode} = "AllStatistics";
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @SupportedStatisticaFunctions;
  }
  else {
    $OptionsInfo{FileNameMode} = "SpecifiedStatistics";
    # Comma delimited list of functions...
    my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions);
    $Mode = $Options{mode};
    $Mode =~ s/ //g;
    @SpecifiedFunctions = split ",", $Mode;
    @UnsupportedSpecifiedFunctions = ();
    for $SpecifiedFunction (@SpecifiedFunctions) {
      if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) {
        push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $SpecifiedFunction;
      }
      else {
        push @UnsupportedSpecifiedFunctions, $SpecifiedFunction;
      }
    }
    if (@UnsupportedSpecifiedFunctions) {
      if (@UnsupportedSpecifiedFunctions > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n";
      }
      die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
    }
    # A value such as "" or "," contains no function names at all: split
    # returns an empty list and there would be nothing to calculate...
    if (!@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
      warn "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid: It must contain at least one function name.\n";
      die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
    }
  }

  # Setup a map of unique specified functions: lower case name to canonical
  # name; duplicates are skipped...
  FUNCTION: for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} ) {
      next FUNCTION;
    }
    $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)};
  }

  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;

  # These stay undefined unless specified on the command line...
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{Root} = $Options{root};

  $OptionsInfo{CheckData} = $Options{fast} ? 0 : 1;
  $OptionsInfo{Precision} = $Options{precision};

  $OptionsInfo{KLargest} = $Options{klargest};
  $OptionsInfo{KSmallest} = $Options{ksmallest};

  $OptionsInfo{TrimFraction} = $Options{trimfraction};

  # Setup frequency bin values: a value containing a comma is an explicit bin
  # range; otherwise, it's the number of bins...
  $OptionsInfo{NumOfBins} = 10;
  @{$OptionsInfo{BinRange}} = ();
  if ($Options{frequencybins} =~ /\,/) {
    my($BinValue, @SpecifiedBinRange);
    @SpecifiedBinRange = split /\,/, $Options{frequencybins};
    if (@SpecifiedBinRange < 2) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
    }
    for $BinValue (@SpecifiedBinRange) {
      if (!IsNumerical($BinValue)) {
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
      }
    }
    # Each value must be strictly smaller than all the values following it...
    my($Index1, $Index2);
    for $Index1 (0 .. $#SpecifiedBinRange) {
      for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) {
        if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) {
          die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
        }
      }
    }
    push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange;
  }
  else {
    $OptionsInfo{NumOfBins} = $Options{frequencybins};
    if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
    }
  }

  # Setup specified columns...
  $OptionsInfo{ColMode} = $Options{colmode};
  $OptionsInfo{Columns} = $Options{columns};

  # SpecifiedColumns stays empty for "All" or when "--columns" isn't used...
  @{$OptionsInfo{SpecifiedColumns}} = ();
  if (defined $Options{columns} && $Options{columns} !~ /^All$/i) {
    my(@SpecifiedValues) = split ",", $Options{columns};
    if ($Options{colmode} =~ /^colnum$/i) {
      my($ColValue);
      for $ColValue (@SpecifiedValues) {
        if (!IsPositiveInteger($ColValue)) {
          die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n";
        }
      }
    }
    push @{$OptionsInfo{SpecifiedColumns}}, @SpecifiedValues;
  }

  # Setup specified column pairs: a flat list with consecutive values
  # forming a pair; it stays empty for "AllPairs"...
  @{$OptionsInfo{SpecifiedColumnPairs}} = ();
  $OptionsInfo{AllColumnPairs} = (defined($Options{columnpairs}) && $Options{columnpairs} =~ /^AllPairs$/i) ? 1 : 0;
  if (defined($Options{columnpairs}) && !$OptionsInfo{AllColumnPairs}) {
    my(@SpecifiedValues) = split ",", $Options{columnpairs};
    if (@SpecifiedValues % 2) {
      die "Error: Invalid number of values specified using \"--columnpairs\" option: It must contain even number of values.\n";
    }
    if ($Options{colmode} =~ /^colnum$/i) {
      my($ColValue);
      for $ColValue (@SpecifiedValues) {
        if (!IsPositiveInteger($ColValue)) {
          die "Error: Column value, $ColValue, specified using \"--columnpairs\" is not valid: Allowed integer values: > 0.\n";
        }
      }
    }
    push @{$OptionsInfo{SpecifiedColumnPairs}}, @SpecifiedValues;
  }

}
|
|
1036
|
|
# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with default values, lets GetOptions override them from the
# command line, switches to the working directory when one is specified, and
# dies with an error message on the first option value failing validation.
#
sub SetupScriptUsage {

  # Default values for the options; any of these may be overridden below...
  %Options = (
    colmode => "colnum",
    detail => 1,
    indelim => "comma",
    frequencybins => 10,
    klargest => 2,
    ksmallest => 2,
    mode => "DescriptiveStatisticsBasic",
    outdelim => "comma",
    precision => 2,
    quote => "yes",
    trimfraction => 0.1,
  );

  # Retrieve all the options...
  my(@OptionSpecs) = ("colmode|c=s", "columns=s", "columnpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "indelim=s", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Change to the working directory before anything else is validated...
  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  unless ($Options{colmode} =~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"-c --colmode\" is not valid. Allowed values: colnum or collabel\n";
  }
  unless (IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
  unless ($Options{indelim} =~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  unless ($Options{outdelim} =~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  unless ($Options{quote} =~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }

  # Options sharing the same positive integer check and message format,
  # listed in the order in which they are validated...
  my($IntegerCheck, $OptionName, $OptionLabel);
  for $IntegerCheck (["precision", "-p --precision"], ["klargest", "--klargest"], ["ksmallest", "--ksmallest"]) {
    ($OptionName, $OptionLabel) = @{$IntegerCheck};
    if (!IsPositiveInteger($Options{$OptionName})) {
      die "Error: The value specified, $Options{$OptionName}, for option \"$OptionLabel\" is not valid. Allowed values: > 0 \n";
    }
  }

  # Trim fraction must be a float strictly between 0 and 1; range is checked
  # only after the value is known to be a float...
  if (!IsFloat($Options{trimfraction}) || $Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
    die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
  }
}
|
|
1096
|
|
1097 __END__
|
|
1098
|
|
1099 =head1 NAME
|
|
1100
|
|
1101 AnalyzeTextFilesData.pl - Analyze numerical column data in TextFile(s)
|
|
1102
|
|
1103 =head1 SYNOPSIS
|
|
1104
|
|
1105 AnalyzeTextFilesData.pl TextFile(s)...
|
|
1106
|
|
1107 AnalyzeTextFilesData.pl [B<-c, --colmode> colnum | collabel] [B<--columns> "colnum,[colnum,...]" | "collabel,[collabel,...]" | All]
|
|
1108 [B<--columnpairs> "colnum,colnum,[colnum,colnum]..." | "collabel,collabel,[collabel,collabel]..." | AllPairs]
|
|
1109 [B<-d, --detail> infolevel] [B<-f, --fast>] [B<--frequencybins> number | "number,number,[number,...]"] [B<-h, --help>]
|
|
1110 [B<--indelim> comma | semicolon] [B<--klargest> number] [B<--ksmallest> number]
|
|
1111 [B<-m, --mode> DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]"]
|
|
1112 [B<-o, --overwrite>] [B<--outdelim> comma | tab | semicolon] [B<-p, --precision> number]
|
|
1113 [B<-q, --quote> yes | no] [B<-r, --root> rootname] [B<--trimfraction> number] [B<-w, --workingdir> dirname] TextFiles(s)...
|
|
1114
|
|
1115 =head1 DESCRIPTION
|
|
1116
|
|
1117 Analyze numerical column data in I<TextFile(s)> using a combination of various statistical
|
|
1118 functions; Non-numerical values are simply ignored. For I<Correlation, RSquare, and Covariance>
|
|
1119 analysis, the count of valid values in specified column pair must be same; otherwise, column
|
|
1120 pair is ignored. The file names are separated by space. The valid file extensions are I<.csv>
|
|
1121 and I<.tsv> for comma/semicolon and tab delimited text files respectively. All other
|
|
1122 file names are ignored. All the text files in a current directory can be specified by
|
|
1123 I<*.csv>, I<*.tsv>, or the current directory name. The B<--indelim> option determines
|
|
1124 the format of I<TextFile(s)>. Any file which doesn't correspond to the format indicated
|
|
1125 by B<--indelim> option is ignored.
|
|
1126
|
|
1127 =head1 OPTIONS
|
|
1128
|
|
1129 =over 4
|
|
1130
|
|
1131 =item B<-c, --colmode> I<colnum | collabel>
|
|
1132
|
|
1133 Specify how columns are identified in TextFile(s): using column number or column
|
|
1134 label. Possible values: I<colnum or collabel>. Default value: I<colnum>.
|
|
1135
|
|
1136 =item B<--columns> I<"colnum,[colnum,...]" | "collabel,[collabel]..." | All>
|
|
1137
|
|
1138 This value is mode specific. It's a list of comma delimited columns to use
|
|
1139 for data analysis. Default value: I<First column>.
|
|
1140
|
|
1141 This value is ignored during I<Correlation/Pearson Correlation> and I<Covariance>
|
|
1142 data analysis; B<--columnpairs> option is used instead.
|
|
1143
|
|
1144 For I<colnum> value of B<-c, --colmode> option, input values format is:
|
|
1145 I<colnum,colnum,...>. Example:
|
|
1146
|
|
1147 1,3,5
|
|
1148
|
|
1149 For I<collabel> value of B<-c, --colmode> option, input values format is:
|
|
1150 I<collabel,collabel,..>. Example:
|
|
1151
|
|
1152 ALogP,MolWeight,EC50
|
|
1153
|
|
1154 =item B<--columnpairs> I<"colnum,colnum,[colnum,colnum,...]" | "collabel,collabel,[collabel,collabel,...]" | AllPairs>
|
|
1155
|
|
1156 This value is mode specific and is only used for I<Correlation, PearsonCorrelation, or
|
|
1157 Covariance> value of B<-m, --mode> option. It is a comma delimited list of column pairs
|
|
1158 to use for data analysis during I<Correlation> and I<Covariance> calculations. Default value:
|
|
1159 I<First column, Second column>.
|
|
1160
|
|
1161 For I<colnum> value of B<-c, --colmode> option, input values format is:
|
|
1162 I<colnum,colnum,[colnum,colnum]...>. Example:
|
|
1163
|
|
1164 1,3,5,6,1,6
|
|
1165
|
|
1166 For I<collabel> value of B<-c, --colmode> option, input values format is:
|
|
1167 I<collabel,collabel,[collabel,collabel]..>. Example:
|
|
1168
|
|
1169 MolWeight,EC50,NumN+O,PSA
|
|
1170
|
|
1171 For I<AllPairs> value of B<--columnpairs> option, all column pairs are used for I<Correlation>
|
|
1172 and I<Covariance> calculations.
|
|
1173
|
|
1174 =item B<-d, --detail> I<infolevel>
|
|
1175
|
|
1176 Level of information to print about column values being ignored. Default: I<1>. Possible values:
|
|
1177 1, 2, 3, or 4.
|
|
1178
|
|
1179 =item B<-f, --fast>
|
|
1180
|
|
1181 In this mode, all the columns specified for analysis are assumed to contain numerical
|
|
1182 data and no checking is performed before analysis. By default, only numerical data is
|
|
1183 used for analysis; other types of column data is ignored.
|
|
1184
|
|
1185 =item B<--frequencybins> I<number | "number,number,[number,...]">
|
|
1186
|
|
1187 Specify number of bins or bin range to use for frequency analysis. Default value: I<10>
|
|
1188
|
|
1189 Number of bins value along with the smallest and largest value for a column is used to
|
|
1190 group the column values into different groups.
|
|
1191
|
|
1192 The bin range list is used to group values for a column into different groups; It must contain
|
|
1193 values in ascending order. Examples:
|
|
1194
|
|
1195 10,20,30
|
|
1196 0.1,0.2,0.3,0.4,0.5
|
|
1197
|
|
1198 The frequency value calculated for a specific bin corresponds to all the column values
|
|
1199 which are greater than the previous bin value and less than or equal to the current bin value.
|
|
1200
|
|
1201 =item B<-h, --help>
|
|
1202
|
|
1203 Print this help message.
|
|
1204
|
|
1205 =item B<--indelim> I<comma | semicolon>
|
|
1206
|
|
1207 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
|
|
1208 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
|
|
1209 delimiter.
|
|
1210
|
|
1211 =item B<--klargest> I<number>
|
|
1212
|
|
1213 Kth largest value to find by I<KLargest> function. Default value: I<2>. Valid values: positive
|
|
1214 integers.
|
|
1215
|
|
1216 =item B<--ksmallest> I<number>
|
|
1217
|
|
1218 Kth smallest value to find by I<KSmallest> function. Default value: I<2>. Valid values: positive
|
|
1219 integers.
|
|
1220
|
|
1221 =item B<-m, --mode> I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]">
|
|
1222
|
|
1223 Specify how to analyze data in TextFile(s): calculate basic or all descriptive statistics; or
|
|
1224 use a comma delimited list of supported statistical functions. Possible values:
|
|
1225 I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | "function1,[function2]...">. Default
|
|
1226 value: I<DescriptiveStatisticsBasic>
|
|
1227
|
|
1228 I<DescriptiveStatisticsBasic> includes these functions: I<Count, Maximum, Minimum, Mean,
|
|
1229 Median, Sum, StandardDeviation, StandardError, Variance>.
|
|
1230
|
|
1231 I<DescriptiveStatisticsAll>, in addition to I<DescriptiveStatisticsBasic> functions, includes:
|
|
1232 I<GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, Mode, RSquare,
|
|
1233 Skewness, TrimMean>.
|
|
1234
|
|
1235 I<All> uses complete list of supported functions: I<Average, AverageDeviation, Correlation,
|
|
1236 Count, Covariance, GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis,
|
|
1237 Maximum, Minimum, Mean, Median, Mode, RSquare, Skewness, Sum,
|
|
1238 SumOfSquares, StandardDeviation, StandardDeviationN, StandardError, StandardScores,
|
|
1239 StandardScoresN, TrimMean, Variance, VarianceN>. The function names ending with N
|
|
1240 calculate corresponding values assuming an entire population instead of a population sample.
|
|
1241
|
|
1242 Here are the formulas for these functions:
|
|
1243
|
|
1244 Average: See Mean
|
|
1245
|
|
1246 AverageDeviation: SUM( ABS(x[i] - Xmean) ) / n
|
|
1247
|
|
1248 Correlation: See Pearson Correlation
|
|
1249
|
|
1250 Covariance: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / n
|
|
1251
|
|
1252 GeometricMean: NthROOT( PRODUCT(x[i]) )
|
|
1253
|
|
1254 HarmonicMean: 1 / ( SUM(1/x[i]) / n )
|
|
1255
|
|
1256 Mean: SUM( x[i] ) / n
|
|
1257
|
|
1258 Median: Xsorted[(n - 1)/2 + 1] for odd values of n; (Xsorted[n/2] + Xsorted[n/2 + 1])/2
|
|
1259 for even values of n.
|
|
1260
|
|
1261 Kurtosis: [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] -
|
|
1262 {3((n - 1)^2)}/{(n - 2)(n-3)}
|
|
1263
|
|
1264 PearsonCorrelation: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )
|
|
1265 (SUM( (y[i] - Ymean)^2 )) )
|
|
1266
|
|
1267 RSquare: PearsonCorrelation^2
|
|
1268
|
|
1269 Skewness: {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 }
|
|
1270
|
|
1271 StandardDeviation: SQRT ( SUM( (x[i] - Mean)^2 ) / (n - 1) )
|
|
1272
|
|
1273 StandardDeviationN: SQRT ( SUM( (x[i] - Mean)^2 ) / n )
|
|
1274
|
|
1275 StandardError: StandardDeviation / SQRT( n )
|
|
1276
|
|
1277 StandardScore: (x[i] - Mean) / StandardDeviation
|
|
1278
|
|
1279 StandardScoreN: (x[i] - Mean) / StandardDeviationN
|
|
1280
|
|
1281 Variance: SUM( (x[i] - Xmean)^2 / (n - 1) )
|
|
1282
|
|
1283 VarianceN: SUM( (x[i] - Xmean)^2 / n )
|
|
1284
|
|
1285 =item B<-o, --overwrite>
|
|
1286
|
|
1287 Overwrite existing files.
|
|
1288
|
|
1289 =item B<--outdelim> I<comma | tab | semicolon>
|
|
1290
|
|
1291 Output text file delimiter. Possible values: I<comma, tab, or semicolon>
|
|
1292 Default value: I<comma>.
|
|
1293
|
|
1294 =item B<-p, --precision> I<number>
|
|
1295
|
|
1296 Precision of calculated values in the output file. Default: up to I<2> decimal places.
|
|
1297 Valid values: positive integers.
|
|
1298
|
|
1299 =item B<-q, --quote> I<yes | no>
|
|
1300
|
|
1301 Put quotes around column values in output text file. Possible values: I<yes or
|
|
1302 no>. Default value: I<yes>.
|
|
1303
|
|
1304 =item B<-r, --root> I<rootname>
|
|
1305
|
|
1306 New text file name is generated using the root: <Root><Mode>.<Ext>. Default new file
|
|
1307 name: <InitialTextFileName><Mode>.<Ext>. Based on the specified analysis,
|
|
1308 <Mode> corresponds to one of these values: DescriptiveStatisticsBasic,
|
|
1309 DescriptiveStatisticsAll, AllStatistics, SpecifiedStatistics, Covariance, Correlation,
|
|
1310 Frequency, or StandardScores. The csv, and tsv <Ext> values are used for
|
|
1311 comma/semicolon, and tab delimited text files respectively. This option is ignored for
|
|
1312 multiple input files.
|
|
1313
|
|
1314 =item B<--trimfraction> I<number>
|
|
1315
|
|
1316 Fraction of data to exclude from the top and bottom of the data set during
|
|
1317 I<TrimMean> calculation. Default value: I<0.1>. Valid values: > 0 and < 1.
|
|
1318
|
|
1319 =item B<-w --workingdir> I<text>
|
|
1320
|
|
1321 Location of working directory. Default: current directory.
|
|
1322
|
|
1323 =back
|
|
1324
|
|
1325 =head1 EXAMPLES
|
|
1326
|
|
1327 To calculate basic statistics for data in first column and generate a
|
|
1328 NewSample1DescriptiveStatisticsBasic.csv file, type:
|
|
1329
|
|
1330 % AnalyzeTextFilesData.pl -o -r NewSample1 Sample1.csv
|
|
1331
|
|
1332 To calculate basic statistics for data in third column and generate a
|
|
1333 NewSample1DescriptiveStatisticsBasic.csv file, type:
|
|
1334
|
|
1335 % AnalyzeTextFilesData.pl --columns 3 -o -r NewSample1 Sample1.csv
|
|
1336
|
|
1337 To calculate basic statistics for data in MolWeight column and generate a
|
|
1338 NewSample1DescriptiveStatisticsBasic.csv file, type:
|
|
1339
|
|
1340 % AnalyzeTextFilesData.pl --colmode collabel --columns MolWeight -o
|
|
1341 -r NewSample1 Sample1.csv
|
|
1342
|
|
1343 To calculate all available statistics for data in third column and all column pairs,
|
|
1344 and generate NewSample1DescriptiveStatisticsAll.csv, NewSample1CorrelationMatrix.csv,
|
|
1345 NewSample1RSquareMatrix.csv, and NewSample1MolWeightFrequencyAnalysis.csv files,
|
|
1346 type:
|
|
1347
|
|
1348 % AnalyzeTextFilesData.pl -m DescriptiveStatisticsAll --columns 3 -o
|
|
1349 --columnpairs AllPairs -r NewSample1 Sample1.csv
|
|
1350
|
|
1351 To compute frequency distribution of data in third column into five bins and
|
|
1352 generate NewSample1MolWeightFrequencyAnalysis.csv, type:
|
|
1353
|
|
1354 % AnalyzeTextFilesData.pl -m Frequency --frequencybins 5 --columns 3
|
|
1355 -o -r NewSample1 Sample1.csv
|
|
1356
|
|
1357 To compute frequency distribution of data in third column into specified bin range
|
|
1358 values, and generate NewSample1MolWeightFrequencyAnalysis.csv, type:
|
|
1359
|
|
1360 % AnalyzeTextFilesData.pl -m Frequency --frequencybins "100,200,400"
|
|
1361 --columns 3 -o -r NewSample1 Sample1.csv
|
|
1362
|
|
1363 To calculate all available statistics for data in all columns and column pairs, type:
|
|
1364
|
|
1365 % AnalyzeTextFilesData.pl -m All --columns All --columnpairs
|
|
1366 AllPairs -o -r NewSample1 Sample1.csv
|
|
1367
|
|
1368 =head1 AUTHOR
|
|
1369
|
|
1370 Manish Sud <msud@san.rr.com>
|
|
1371
|
|
1372 =head1 SEE ALSO
|
|
1373
|
|
1374 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl
|
|
1375
|
|
1376 =head1 COPYRIGHT
|
|
1377
|
|
1378 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
1379
|
|
1380 This file is part of MayaChemTools.
|
|
1381
|
|
1382 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
1383 the terms of the GNU Lesser General Public License as published by the Free
|
|
1384 Software Foundation; either version 3 of the License, or (at your option)
|
|
1385 any later version.
|
|
1386
|
|
1387 =cut
|