Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/AnalyzeTextFilesData.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: AnalyzeTextFilesData.pl,v $ | |
4 # $Date: 2015/02/28 20:46:04 $ | |
5 # $Revision: 1.36 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use FileUtil; | |
36 use TextUtil; | |
37 use StatisticsUtil; | |
38 | |
# Script-level state. %OptionsInfo, %TextFilesInfo and @TextFilesList, declared
# further down, are file-scoped lexicals read by the subroutines that follow.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Flush STDOUT after every write...
$| = 1;

# Announce startup and start the timer...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; print usage and quit when help is
# requested or no file names are given on the command line...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || !@ARGV);

my(@TextFilesList) = ExpandFileNames(\@ARGV, "csv tsv");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Gather column information for every text file...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# Analyze each file that has not been flagged as unusable...
my($FileIndex);
print "\nProcessing text files...\n" if (@TextFilesList > 1);
for $FileIndex (0 .. $#TextFilesList) {
  next unless $TextFilesInfo{FileOkay}[$FileIndex];
  print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  AnalyzeTextFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
84 | |
85 ############################################################################### | |
86 | |
# Collect the values of the columns to be analyzed from one text file and run
# the requested analyses on them.
#
# Parameters:
#   $Index - Position of the file in @TextFilesList; the same index is used for
#            the per-file arrays in %TextFilesInfo.
#
# The first line of the file is treated as the column labels line and is
# skipped. When $OptionsInfo{CheckData} is set, values for which IsNumerical()
# is false are left out of the column data, and the lines containing them are
# reported at a verbosity controlled by $OptionsInfo{DetailLevel}.
#
# Dies when the input file can't be opened.
sub AnalyzeTextFile {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $InDelim, $Line, $LineCount, $InvalidLineCount, @ColNumsToAnalyze, %ColValuesToAnalyzeMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColNumsToAnalyze = @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]};

  # Give every column an (initially empty) list of values so that a column
  # without any usable value still has an entry in the map...
  %ColValuesToAnalyzeMap = map { ($_ => []) } @ColNumsToAnalyze;

  # Three-argument open: the file name is never interpreted as a mode or a pipe...
  open $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";

  # Skip over column labels line in text file and collect appropriate column data
  # for analysis. The loop ends as soon as GetTextLine returns a false value...
  $Line = GetTextLine($TextFileHandle);
  $LineCount = 1;
  $InvalidLineCount = 0;
  while ($Line = GetTextLine($TextFileHandle)) {
    $LineCount++;
    my(@LineWords) = quotewords($InDelim, 0, $Line);
    my(@InvalidColLabels) = ();

    for my $ColNum (@ColNumsToAnalyze) {
      my($Value) = $LineWords[$ColNum];
      if ($OptionsInfo{CheckData} && !IsNumerical($Value)) {
        # Remember the label for reporting and leave the value out...
        push @InvalidColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
        next;
      }
      push @{$ColValuesToAnalyzeMap{$ColNum}}, $Value;
    }

    next unless @InvalidColLabels;

    $InvalidLineCount++;
    if ($OptionsInfo{DetailLevel} >= 4) {
      print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed: $Line \n";
    }
    elsif ($OptionsInfo{DetailLevel} >= 3) {
      print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed...\n";
    }
    elsif ($OptionsInfo{DetailLevel} >= 2) {
      print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for columns to be analyzed...\n";
    }
  }
  if ($InvalidLineCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidLineCount line(s)...\n";
  }
  close $TextFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction);
  @SpecifiedFunctionNames = ();

  # Functions named in the pattern below are handled by dedicated subroutines;
  # everything else goes to PerformAnalysis under its mapped name...
  for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%ColValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
    if ($OptionsInfo{AllColumnPairs}) {
      PerformMatrixAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformColumnPairAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
    PerformStandardScoresAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
    PerformFrequencyAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
}
169 | |
# Calculate values for various statistical functions and write them out as a
# table with one row per analyzed column and one column per function.
#
# Parameters:
#   $Index                     - Index of the file in %TextFilesInfo arrays.
#   $SpecifiedFunctionNamesRef - Reference to the list of function names to
#                                apply; apart from Count, each name is invoked
#                                as a subroutine of that name.
#   $ColValuesToAnalyzeMapRef  - Reference to a hash mapping column number to
#                                a reference to the list of its values.
#
# Output goes to <OutFileRoot><FileNameMode>.<OutFileExt>. A column without any
# values gets an empty cell for every function. Dies when the output file can't
# be opened.
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, $Line, $SpecifiedFunction, $Label, @ColLabels, @ColNumsToAnalyze);

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $TextFilesInfo{OutFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ("ColumnID");
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # Prefix the function name with the suffixed k value and drop the "K"...
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print {$NewTextFileHandle} "$Line\n";

  # Go over each column to be analyzed...
  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};

  # Statistical functions are invoked through their name strings, which needs
  # symbolic references; all other strictures stay in effect.
  no strict 'refs';

  my($ColValuesRef, $ColNum, $Value, @RowValues, %CalculatedValues);
  %CalculatedValues = ();
  for $ColNum (@ColNumsToAnalyze) {
    # Setup column id...
    @RowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum]);
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!@{$ColValuesRef}) {
        # No values for this column: emit an empty cell...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = scalar(@{$ColValuesRef});
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = &$SpecifiedFunction($ColValuesRef, $OptionsInfo{KLargest});
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = &$SpecifiedFunction($ColValuesRef, $OptionsInfo{KSmallest});
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        # The standard deviation is cached per column as StandardError needs it too...
        if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
          $CalculatedValues{$ColNum}{StandardDeviation} = &$SpecifiedFunction($ColValuesRef);
        }
        $Value = $CalculatedValues{$ColNum}{StandardDeviation};
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
          $CalculatedValues{$ColNum}{StandardDeviation} = StandardDeviation($ColValuesRef);
        }
        if (defined $CalculatedValues{$ColNum}{StandardDeviation}) {
          # Pass the number of values, not the values themselves: an array in
          # an argument list is flattened unless forced into scalar context.
          # NOTE(review): assumes StandardError takes (StandardDeviation, Count) -
          # confirm against StatisticsUtil.
          $Value = &$SpecifiedFunction($CalculatedValues{$ColNum}{StandardDeviation}, scalar(@{$ColValuesRef}));
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = &$SpecifiedFunction($ColValuesRef, $OptionsInfo{TrimFraction});
      }
      else {
        $Value = &$SpecifiedFunction($ColValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print {$NewTextFileHandle} "$Line\n";
  }
  close $NewTextFileHandle;
}
262 | |
# Calculate covariance, correlation and rsquare for the specified column pairs
# and write one row per pair to <OutFileRoot>ColumnPairsAnalysis.<OutFileExt>.
#
# Parameters:
#   $Index                    - Index of the file in %TextFilesInfo arrays.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping column number to
#                               a reference to the list of its values.
#
# A pair whose columns hold different numbers of values is reported with a
# warning and gets empty cells. Dies when the output file can't be opened.
sub PerformColumnPairAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;

  my $FunctionsMapRef = \%{$OptionsInfo{SpecifiedStatisticalFunctionsMap}};
  my $WantCorrelation = exists($FunctionsMapRef->{correlation}) ? 1 : 0;
  my $WantRSquare = exists($FunctionsMapRef->{rsquare}) ? 1 : 0;
  my $WantCovariance = exists($FunctionsMapRef->{covariance}) ? 1 : 0;

  # The correlation column is written for rsquare requests as well...
  my $NeedCorrelation = ($WantCorrelation || $WantRSquare) ? 1 : 0;

  my $OutFile = $TextFilesInfo{OutFileRoot}[$Index] . "ColumnPairsAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
  print "Generating new text file $OutFile...\n";
  open NEWTEXTFILE, ">$OutFile" or die "Error: Can't open $OutFile: $! \n";

  # Column labels line...
  my @ValueLabels = ();
  push @ValueLabels, "Correlation" if $NeedCorrelation;
  push @ValueLabels, "RSquare" if $WantRSquare;
  push @ValueLabels, "Covariance" if $WantCovariance;

  my @HeaderLabels = ("ColumnID1", "ColumnID2", @ValueLabels);
  my $HeaderLine = JoinWords(\@HeaderLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$HeaderLine\n";

  # Round to the requested precision; adding zero drops trailing zeros. Missing
  # or empty values become empty cells...
  my $FormatValue = sub {
    my($RawValue) = @_;
    return "" unless (defined($RawValue) && length($RawValue));
    return sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0;
  };

  my @FirstColNums = @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]};
  my @SecondColNums = @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]};

  for my $PairIndex (0 .. $#FirstColNums) {
    my $ColNum1 = $FirstColNums[$PairIndex];
    my $ColNum2 = $SecondColNums[$PairIndex];
    my $ValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
    my $ValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};

    # Every row starts with the labels of the two columns...
    my @RowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1], $TextFilesInfo{ColLabels}[$Index][$ColNum2]);

    if (@$ValuesRef1 != @$ValuesRef2) {
      warn "Warning: Skipping analysis for column pair $TextFilesInfo{ColLabels}[$Index][$ColNum1], $TextFilesInfo{ColLabels}[$Index][$ColNum2]: Number of valid data values must be same.\n";
      # One empty cell for each value column in the header...
      push @RowValues, ("") x scalar(@ValueLabels);
    }
    else {
      if ($NeedCorrelation) {
        my $PairCorrelation = Correlation($ValuesRef1, $ValuesRef2);
        push @RowValues, $FormatValue->($PairCorrelation);
        if ($WantRSquare) {
          my $HasCorrelation = (defined($PairCorrelation) && length($PairCorrelation));
          push @RowValues, ($HasCorrelation ? $FormatValue->($PairCorrelation ** 2) : "");
        }
      }
      if ($WantCovariance) {
        my $PairCovariance = Covariance($ValuesRef1, $ValuesRef2);
        push @RowValues, $FormatValue->($PairCovariance);
      }
    }

    my $RowLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$RowLine\n";
  }
  close NEWTEXTFILE;
}
342 | |
# Generate histogram numbers: for each analyzed column, write a two-column
# Bins/Frequency table to <OutFileRoot><ColumnLabel>FrequencyAnalysis.<OutFileExt>.
#
# Parameters:
#   $Index                    - Index of the file in %TextFilesInfo arrays.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping column number to
#                               a reference to the list of its values.
#
# Bins come from $OptionsInfo{BinRange} when it is non-empty, otherwise from
# $OptionsInfo{NumOfBins}. A column without values yields a file containing
# only the column labels line. Dies when an output file can't be opened.
sub PerformFrequencyAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;

  # Round to the requested precision; adding zero drops trailing zeros...
  my $FormatValue = sub {
    my($RawValue) = @_;
    return length($RawValue) ? (sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0) : "";
  };

  my @ColNums = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};
  for my $ColNum (@ColNums) {
    my $OutFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
    print "Generating new text file $OutFile...\n";
    open NEWTEXTFILE, ">$OutFile" or die "Error: Can't open $OutFile: $! \n";

    # Column labels line...
    my @HeaderLabels = ("Bins", "Frequency");
    my $HeaderLine = JoinWords(\@HeaderLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$HeaderLine\n";

    # Frequency values, when the column has any data...
    my %BinCounts = ();
    my $ValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    if (@$ValuesRef) {
      %BinCounts = @{$OptionsInfo{BinRange}}
        ? Frequency($ValuesRef, \@{$OptionsInfo{BinRange}})
        : Frequency($ValuesRef, $OptionsInfo{NumOfBins});
    }

    # One row per bin, in ascending numerical order of the bin values...
    for my $Bin (sort { $a <=> $b } keys %BinCounts) {
      my @RowValues = ($FormatValue->($Bin), $FormatValue->($BinCounts{$Bin}));
      my $RowLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print NEWTEXTFILE "$RowLine\n";
    }
    close NEWTEXTFILE;
  }
}
386 | |
# Calculate covariance, correlation and rsquare matrices over all columns of a
# file and write each requested matrix to its own file:
#
#   <OutFileRoot>CorrelationMatrix.<OutFileExt>  - correlation or rsquare requested
#   <OutFileRoot>RSquareMatrix.<OutFileExt>      - rsquare requested
#   <OutFileRoot>CovarianceMatrix.<OutFileExt>   - covariance requested
#
# Parameters:
#   $Index                    - Index of the file in %TextFilesInfo arrays.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping column number to
#                               a reference to the list of its values.
#
# The first row and the first column of every matrix hold the column labels.
# RSquare is calculated from the unrounded correlation value and rounded once.
# Dies when an output file can't be opened.
sub PerformMatrixAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($CalculateCorrelation, $CalculateRSquare, $CalculateCovariance, $WriteCorrelation);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  # The correlation matrix is written for rsquare requests as well...
  $WriteCorrelation = ($CalculateCorrelation || $CalculateRSquare) ? 1 : 0;

  my($CorrelationTextFile, $RSquareTextFile, $CovarianceTextFile, @NewTextFiles);
  $CorrelationTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CorrelationMatrix." . $TextFilesInfo{OutFileExt}[$Index];
  $RSquareTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "RSquareMatrix." . $TextFilesInfo{OutFileExt}[$Index];
  $CovarianceTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CovarianceMatrix." . $TextFilesInfo{OutFileExt}[$Index];

  # Report the files that are actually going to be generated...
  @NewTextFiles = ();
  push @NewTextFiles, $CorrelationTextFile if $WriteCorrelation;
  push @NewTextFiles, $RSquareTextFile if $CalculateRSquare;
  push @NewTextFiles, $CovarianceTextFile if $CalculateCovariance;
  print "Generating new text ", ((@NewTextFiles > 1) ? "files" : "file"), " ", join(", ", @NewTextFiles), "...\n";

  my($CorrelationFileHandle, $RSquareFileHandle, $CovarianceFileHandle);
  if ($WriteCorrelation) {
    open $CorrelationFileHandle, ">", $CorrelationTextFile or die "Error: Can't open $CorrelationTextFile: $! \n";
  }
  if ($CalculateRSquare) {
    open $RSquareFileHandle, ">", $RSquareTextFile or die "Error: Can't open $RSquareTextFile: $! \n";
  }
  if ($CalculateCovariance) {
    open $CovarianceFileHandle, ">", $CovarianceTextFile or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $LastColNum, $ColNum, $ColNum1, $ColNum2, @ColLabels);
  $LastColNum = $TextFilesInfo{ColCount}[$Index] - 1;

  # Write out the column labels; the first cell stays empty as the first column
  # of each row holds a label...
  @ColLabels = ("");
  for $ColNum (0 .. $LastColNum) {
    push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print {$CorrelationFileHandle} "$Line\n" if $WriteCorrelation;
  print {$RSquareFileHandle} "$Line\n" if $CalculateRSquare;
  print {$CovarianceFileHandle} "$Line\n" if $CalculateCovariance;

  # Round to the requested precision; adding zero drops trailing zeros. Missing
  # or empty values become empty cells...
  my $FormatValue = sub {
    my($RawValue) = @_;
    return "" unless (defined($RawValue) && length($RawValue));
    return sprintf("%.$OptionsInfo{Precision}f", $RawValue) + 0;
  };

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  my($ColValuesRef1, $ColValuesRef2, $CorrelationValue, $RSquareValue, $CovarianceValue, %CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap);

  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
  for $ColNum1 (0 .. $LastColNum) {
    $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
    for $ColNum2 (0 .. $ColNum1) {
      $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};
      if ($WriteCorrelation) {
        $CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2);
        if ($CalculateRSquare) {
          # Square the raw correlation so the result is rounded only once...
          $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? $FormatValue->($CorrelationValue ** 2) : "";
          $RSquareMatrixMap{$ColNum1}{$ColNum2} = $RSquareValue;
          $RSquareMatrixMap{$ColNum2}{$ColNum1} = $RSquareValue;
        }
        $CorrelationValue = $FormatValue->($CorrelationValue);
        $CorrelationMatrixMap{$ColNum1}{$ColNum2} = $CorrelationValue;
        $CorrelationMatrixMap{$ColNum2}{$ColNum1} = $CorrelationValue;
      }
      if ($CalculateCovariance) {
        $CovarianceValue = $FormatValue->(scalar(Covariance($ColValuesRef1, $ColValuesRef2)));
        $CovarianceMatrixMap{$ColNum1}{$ColNum2} = $CovarianceValue;
        $CovarianceMatrixMap{$ColNum2}{$ColNum1} = $CovarianceValue;
      }
    }
  }

  # Write out the matrices...
  my(@CorrelationRowValues, @RSquareRowValues, @CovarianceRowValues);
  for $ColNum1 (0 .. $LastColNum) {
    # Each row starts with its column label...
    @CorrelationRowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1]);
    @RSquareRowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1]);
    @CovarianceRowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1]);

    for $ColNum2 (0 .. $LastColNum) {
      push @CorrelationRowValues, $CorrelationMatrixMap{$ColNum1}{$ColNum2} if $WriteCorrelation;
      push @RSquareRowValues, $RSquareMatrixMap{$ColNum1}{$ColNum2} if $CalculateRSquare;
      push @CovarianceRowValues, $CovarianceMatrixMap{$ColNum1}{$ColNum2} if $CalculateCovariance;
    }

    if ($WriteCorrelation) {
      $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print {$CorrelationFileHandle} "$Line\n";
    }
    if ($CalculateRSquare) {
      $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print {$RSquareFileHandle} "$Line\n";
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print {$CovarianceFileHandle} "$Line\n";
    }
  }

  close $CorrelationFileHandle if $WriteCorrelation;
  close $RSquareFileHandle if $CalculateRSquare;
  close $CovarianceFileHandle if $CalculateCovariance;
}
531 | |
# Calculate standard scores for each analyzed column and write them, one row
# per input data line, to <OutFileRoot>StandardScores.<OutFileExt>.
#
# Parameters:
#   $Index                    - Index of the file in %TextFilesInfo arrays.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping column number to
#                               a reference to the list of its values.
#
# For each column a (StandardScores) and/or a (StandardScoresN) output column
# is written, depending on which of the two functions were requested. A cell is
# left empty when the input value fails the $OptionsInfo{CheckData} numerical
# check, or when the mean or the standard deviation of the column is undefined
# or the standard deviation is zero. Dies when a file can't be opened.
sub PerformStandardScoresAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, $NewTextFileHandle, @ColLabels, $Label, $NewLine);

  $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "StandardScores." . $TextFilesInfo{OutFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  my($ColValuesRef, $ColNum, @ColNumsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};
  for $ColNum (@ColNumsToAnalyze) {
    $Label = $TextFilesInfo{ColLabels}[$Index][$ColNum];
    if ($StandardScores) {
      push @ColLabels, "${Label}\(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${Label}\(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print {$NewTextFileHandle} "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values. The exists checks avoid recalculation for a column
  # number that shows up more than once in the list...
  my(%StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $ColNum (@ColNumsToAnalyze) {
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    if (!exists($MeanMap{$ColNum})) {
      $MeanMap{$ColNum} = Mean($ColValuesRef);
    }
    if ($StandardScores && !exists($StandardDeviationMap{$ColNum})) {
      $StandardDeviationMap{$ColNum} = StandardDeviation($ColValuesRef);
    }
    if ($StandardScoresN && !exists($StandardDeviationNMap{$ColNum})) {
      $StandardDeviationNMap{$ColNum} = StandardDeviationN($ColValuesRef);
    }
  }

  # Score for a single value: (x[i] - mean) / deviation, rounded to the requested
  # precision with trailing zeros dropped. An empty string is returned when the
  # score can't be calculated, instead of dying on a division by zero...
  my $CalculateScore = sub {
    my($DataValue, $Mean, $Deviation) = @_;
    if (!defined($Mean) || !defined($Deviation) || !length($Deviation) || $Deviation == 0) {
      return "";
    }
    return sprintf("%.$OptionsInfo{Precision}f", ($DataValue - $Mean)/$Deviation) + 0;
  };

  # Go over each row and calculate standard scores for each column using
  # StandardDeviation for StandardScores and StandardDeviationN for
  # StandardScoresN; write out the calculated values as well...
  my($TextFile, $TextFileHandle, $InDelim, $Line, $Value, $ValueOkay, @RowValues, @LineWords);
  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];

  open $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";
  # Skip over the column labels line...
  $Line = GetTextLine($TextFileHandle);
  while ($Line = GetTextLine($TextFileHandle)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    @RowValues = ();
    for $ColNum (@ColNumsToAnalyze) {
      $Value = $LineWords[$ColNum];
      $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;
      if ($StandardScores) {
        push @RowValues, ($ValueOkay ? $CalculateScore->($Value, $MeanMap{$ColNum}, $StandardDeviationMap{$ColNum}) : "");
      }
      if ($StandardScoresN) {
        push @RowValues, ($ValueOkay ? $CalculateScore->($Value, $MeanMap{$ColNum}, $StandardDeviationNMap{$ColNum}) : "");
      }
    }
    $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print {$NewTextFileHandle} "$NewLine\n";
  }
  close $TextFileHandle;
  close $NewTextFileHandle;
}
616 | |
617 # Make sure the specified columns exists in text files... | |
618 sub ProcessColumnsInfo { | |
619 my($Index, $TextFile, $ColNum, $NewColNum, $ColIndex, @ColNumsToAnalyze, %UniqueColNumsToAnalyzeMap); | |
620 | |
621 @{$TextFilesInfo{ColNumsToAnalyze}} = (); | |
622 @{$TextFilesInfo{ColPairs1ToAnalyze}} = (); | |
623 @{$TextFilesInfo{ColPairs2ToAnalyze}} = (); | |
624 @{$TextFilesInfo{UniqueColNumsToAnalyze}} = (); | |
625 | |
626 FILELIST: for $Index (0 .. $#TextFilesList) { | |
627 $TextFile = $TextFilesList[$Index]; | |
628 | |
629 @{$TextFilesInfo{ColNumsToAnalyze}[$Index]} = (); | |
630 @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]} = (); | |
631 @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]} = (); | |
632 @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]} = (); | |
633 | |
634 %UniqueColNumsToAnalyzeMap = (); | |
635 | |
636 if ($TextFilesInfo{FileOkay}[$Index]) { | |
637 @ColNumsToAnalyze = (); | |
638 if (@{$OptionsInfo{SpecifiedColumns}}) { | |
639 if ($OptionsInfo{ColMode} =~ /^colnum$/i) { | |
640 for $ColNum (@{$OptionsInfo{SpecifiedColumns}}) { | |
641 if ($ColNum >=1 && $ColNum <= $TextFilesInfo{ColCount}[$Index]) { | |
642 $NewColNum = $ColNum -1; | |
643 push @ColNumsToAnalyze, $NewColNum; | |
644 } | |
645 } | |
646 } | |
647 else { | |
648 my($ColLabel); | |
649 for $ColLabel (@{$OptionsInfo{SpecifiedColumns}}) { | |
650 if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) { | |
651 push @ColNumsToAnalyze, $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel}; | |
652 } | |
653 } | |
654 } | |
655 } | |
656 elsif (defined $OptionsInfo{Columns} && $OptionsInfo{Columns} =~ /^All$/i) { | |
657 for $ColNum (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { | |
658 push @ColNumsToAnalyze, $ColNum; | |
659 } | |
660 } | |
661 else { | |
662 push @ColNumsToAnalyze, 0; | |
663 } | |
664 if (@ColNumsToAnalyze) { | |
665 push @{$TextFilesInfo{ColNumsToAnalyze}[$Index]}, @ColNumsToAnalyze; | |
666 # Set up unique columns map as well... | |
667 for $ColNum (@ColNumsToAnalyze) { | |
668 if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) { | |
669 $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum; | |
670 } | |
671 } | |
672 } | |
673 else { | |
674 warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist.\n"; | |
675 $TextFilesInfo{FileOkay}[$Index] = 0; | |
676 next FILELIST; | |
677 } | |
678 if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) { | |
679 # Make sure specific frequency files don't exist... | |
680 my($FrequencyFile); | |
681 for $ColNum (@ColNumsToAnalyze) { | |
682 $FrequencyFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index]; | |
683 if (-e $FrequencyFile) { | |
684 warn "Warning: Ignoring file $TextFile: The file $FrequencyFile already exists.\n"; | |
685 $TextFilesInfo{FileOkay}[$Index] = 0; | |
686 next FILELIST; | |
687 } | |
688 } | |
689 } | |
690 # Setup specified column pairs... | |
691 if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) { | |
692 my(@ColPairsToAnalyze, $ColNum1, $ColNum2); | |
693 if (@{$OptionsInfo{SpecifiedColumnPairs}}) { | |
694 # Make sure both columns exist... | |
695 if ($OptionsInfo{ColMode} =~ /^colnum$/i) { | |
696 for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) { | |
697 $ColNum1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex]; | |
698 $ColNum2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1]; | |
699 if ($ColNum1 >=1 && $ColNum1 <= $TextFilesInfo{ColCount}[$Index] && $ColNum2 >=1 && $ColNum2 <= $TextFilesInfo{ColCount}[$Index]) { | |
700 $ColNum1 -= 1; | |
701 $ColNum2 -= 1; | |
702 push @ColPairsToAnalyze, ($ColNum1, $ColNum2); | |
703 } | |
704 } | |
705 } | |
706 else { | |
707 my($ColLabel1, $ColLabel2); | |
708 for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) { | |
709 $ColLabel1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex]; | |
710 $ColLabel2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1]; | |
711 if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1}) && exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2})) { | |
712 $ColNum1 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1}; | |
713 $ColNum2 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2}; | |
714 push @ColPairsToAnalyze, ($ColNum1, $ColNum2); | |
715 } | |
716 } | |
717 } | |
718 } | |
719 elsif ($OptionsInfo{AllColumnPairs}) { | |
720 for $ColNum1 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { | |
721 for $ColNum2 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { | |
722 push @ColPairsToAnalyze, ($ColNum1, $ColNum2); | |
723 } | |
724 } | |
725 } | |
726 else { | |
727 if ($TextFilesInfo{ColCount}[$Index] >= 2) { | |
728 push @ColPairsToAnalyze, (0,1); | |
729 } | |
730 } | |
731 if (@ColPairsToAnalyze) { | |
732 if (@ColPairsToAnalyze % 2) { | |
733 warn "Warning: Ignoring file $TextFile: Invalid number values specified using \"--columnpairs\" option: It must contain even number of valid values.\n"; | |
734 $TextFilesInfo{FileOkay}[$Index] = 0; | |
735 next FILELIST; | |
736 } | |
737 else { | |
738 for ($ColIndex = 0; $ColIndex < @ColPairsToAnalyze; $ColIndex += 2) { | |
739 push @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex]; | |
740 push @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex + 1]; | |
741 } | |
742 # Set up unique columns map as well... | |
743 for $ColNum (@ColPairsToAnalyze) { | |
744 if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) { | |
745 $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum; | |
746 } | |
747 } | |
748 } | |
749 } | |
750 } | |
751 # Setup uniques columns array... | |
752 push @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]}, (sort keys %UniqueColNumsToAnalyzeMap); | |
753 } | |
754 } | |
755 } | |
756 | |
# Retrieve information about input text files...
#
# Resets the file-level %TextFilesInfo hash and fills in, for each file in
# @TextFilesList (indexed by its position in that list), these entries:
#
#   FileOkay, ColCount, ColLabels, ColLabelToNumMap, InDelim, OutFileRoot,
#   OutFileExt
#
# A file is marked okay only after it has been found on disk, accepted as a
# csv/tsv file, opened successfully, and - unless overwriting is requested -
# none of the output files checked below already exist. Otherwise a warning
# is issued, FileOkay stays 0 and the remaining entries keep their empty
# initial values.
#
# Takes no arguments; reads %Options, %OptionsInfo and @TextFilesList.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $OutFile, $OutFileExt, $ColNum, $ColLabel);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFileRoot}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Assume the file is unusable until all the checks below have passed...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFileRoot}[$Index] = "";
    $TextFilesInfo{OutFileExt}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Input delimiter: tab for tsv files; --indelim decides for csv files...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($Options{indelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Three argument open on a lexical handle: the file name is never
    # interpreted as an open mode or a pipe...
    my($TextFileHandle);
    if (!open $TextFileHandle, "<", $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # Column labels come from the first line returned by GetTextLine...
    $Line = GetTextLine($TextFileHandle);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFileHandle;

    # Output file extension is determined by --outdelim and not by the input file...
    $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    # --root is honored only for a single input file; $FileName parsed above
    # is used otherwise...
    if ($Options{root} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $OutFileRoot = $RootFileName;
      }
      else {
        $OutFileRoot = $Options{root};
      }
    }
    else {
      $OutFileRoot = $FileName;
    }
    $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";

    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }

    # Without --overwrite, don't clobber any output file which already exists...
    if (!$Options{overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }
      if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
        if ($OptionsInfo{AllColumnPairs}) {
          # One matrix file for each requested pairwise function...
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) && (-e "${OutFileRoot}CovarianceMatrix.${OutFileExt}")) {
            warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}CovarianceMatrix.${OutFileExt} already exists.\n";
            next FILELIST;
          }
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) && (-e "${OutFileRoot}CorrelationMatrix.${OutFileExt}")) {
            warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}CorrelationMatrix.${OutFileExt} already exists.\n";
            next FILELIST;
          }
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) && (-e "${OutFileRoot}RSquareMatrix.${OutFileExt}")) {
            warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}RSquareMatrix.${OutFileExt} already exists.\n";
            next FILELIST;
          }
        }
        else {
          if (-e "${OutFileRoot}ColumnPairsAnalysis.${OutFileExt}") {
            warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}ColumnPairsAnalysis.${OutFileExt} already exists.\n";
            next FILELIST;
          }
        }
      }
      if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) && (-e "${OutFileRoot}StandardScores.${OutFileExt}")) {
        warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}StandardScores.${OutFileExt} already exists.\n";
        next FILELIST;
      }
    }

    # All checks passed: record file information...
    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutFileRoot}[$Index] = "$OutFileRoot";
    $TextFilesInfo{OutFileExt}[$Index] = "$OutFileExt";

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;

    # Map column labels to zero based column numbers...
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
}
887 | |
# Process option values...
#
# Resets the file-level %OptionsInfo hash and fills it in from the raw command
# line values held in %Options: analysis mode and the list of statistical
# functions to use, output formatting, frequency bins, and the columns and
# column pairs to analyze.
#
# Dies with an error message for unsupported function names in --mode, an
# invalid --frequencybins value, or invalid --columns/--columnpairs values.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{DetailLevel} = $Options{detail};

  # Supported statistical functions along with a lower case name lookup table...
  my @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
  my %SupportedStatisticaFunctionsMap = map { (lc($_), $_) } @SupportedStatisticaFunctions;

  # Pick the functions to use and the mode name embedded in output file names...
  $OptionsInfo{SpecifiedStatisticalFunctionsMap} = {};
  $OptionsInfo{SpecifiedStatisticalFunctions} = [];

  if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic";
    $OptionsInfo{SpecifiedStatisticalFunctions} = [qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum)];
  }
  elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll";
    $OptionsInfo{SpecifiedStatisticalFunctions} = [qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum)];
  }
  elsif ($Options{mode} =~ /^All$/i ) {
    $OptionsInfo{FileNameMode} = "AllStatistics";
    $OptionsInfo{SpecifiedStatisticalFunctions} = [@SupportedStatisticaFunctions];
  }
  else {
    # Anything else is a comma delimited list of function names; spaces are dropped...
    $OptionsInfo{FileNameMode} = "SpecifiedStatistics";

    (my $ModeValue = $Options{mode}) =~ s/ //g;

    my @UnsupportedSpecifiedFunctions = ();
    for my $FunctionName (split ",", $ModeValue) {
      if (exists($SupportedStatisticaFunctionsMap{lc($FunctionName)})) {
        push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $FunctionName;
      }
      else {
        push @UnsupportedSpecifiedFunctions, $FunctionName;
      }
    }

    if (@UnsupportedSpecifiedFunctions) {
      if (@UnsupportedSpecifiedFunctions > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n";
      }
      die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
    }
  }

  # Lower case name to supported name lookup for the functions in use;
  # names which show up more than once are entered only once...
  for my $FunctionName (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    my $FunctionKey = lc($FunctionName);
    next if exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey};
    $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey} = $SupportedStatisticaFunctionsMap{$FunctionKey};
  }

  # Output delimiter and quoting...
  if ($Options{outdelim} =~ /tab/i ) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;

  # Values handed over as is; Overwrite and Root stay undefined when not specified...
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{Root} = $Options{root};

  $OptionsInfo{CheckData} = $Options{fast} ? 0 : 1;
  $OptionsInfo{Precision} = $Options{precision};

  $OptionsInfo{KLargest} = $Options{klargest};
  $OptionsInfo{KSmallest} = $Options{ksmallest};

  $OptionsInfo{TrimFraction} = $Options{trimfraction};

  # Setup frequency bin values: a value containing a comma is an explicit bin
  # range; anything else is a bin count...
  $OptionsInfo{NumOfBins} = 10;
  $OptionsInfo{BinRange} = [];

  if ($Options{frequencybins} =~ /\,/) {
    my @SpecifiedBinRange = split /\,/, $Options{frequencybins};

    if (@SpecifiedBinRange < 2) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
    }
    for my $BinValue (@SpecifiedBinRange) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n" unless IsNumerical($BinValue);
    }

    # Each bin value must be smaller than all the bin values which follow it...
    for my $LowIndex (0 .. $#SpecifiedBinRange) {
      for my $HighIndex (($LowIndex + 1) .. $#SpecifiedBinRange) {
        next if $SpecifiedBinRange[$LowIndex] < $SpecifiedBinRange[$HighIndex];
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
      }
    }
    @{$OptionsInfo{BinRange}} = @SpecifiedBinRange;
  }
  else {
    $OptionsInfo{NumOfBins} = $Options{frequencybins};
    die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n" unless IsPositiveInteger($OptionsInfo{NumOfBins});
  }

  # Setup specified columns...
  $OptionsInfo{ColMode} = $Options{colmode};
  $OptionsInfo{Columns} = $Options{columns};

  my $ColumnsAreNumbers = ($Options{colmode} =~ /^colnum$/i) ? 1 : 0;

  # An explicit column list is kept only when --columns is something other than All...
  $OptionsInfo{SpecifiedColumns} = [];
  if (defined $Options{columns} && $Options{columns} !~ /^All$/i) {
    my @SpecifiedValues = split ",", $Options{columns};
    if ($ColumnsAreNumbers) {
      for my $ColValue (@SpecifiedValues) {
        die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n" unless IsPositiveInteger($ColValue);
      }
    }
    @{$OptionsInfo{SpecifiedColumns}} = @SpecifiedValues;
  }

  # Setup specified column pairs: AllPairs or a flat list holding an even number of values...
  $OptionsInfo{SpecifiedColumnPairs} = [];
  $OptionsInfo{AllColumnPairs} = (defined($Options{columnpairs}) && $Options{columnpairs} =~ /^AllPairs$/i) ? 1 : 0;

  if (defined($Options{columnpairs}) && !$OptionsInfo{AllColumnPairs}) {
    my @SpecifiedValues = split ",", $Options{columnpairs};
    if (@SpecifiedValues % 2) {
      die "Error: Invalid number of values specified using \"--columnpairs\" option: It must contain even number of values.\n";
    }
    if ($ColumnsAreNumbers) {
      for my $ColValue (@SpecifiedValues) {
        die "Error: Column value, $ColValue, specified using \"--columnpairs\" is not valid: Allowed integer values: > 0.\n" unless IsPositiveInteger($ColValue);
      }
    }
    @{$OptionsInfo{SpecifiedColumnPairs}} = @SpecifiedValues;
  }

}
1036 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Loads default values into the file-level %Options hash, lets GetOptions
# override them from the command line, changes into the working directory
# when one is specified, and validates option values. Dies with an error
# message as soon as an invalid value is found.
#
sub SetupScriptUsage {

  # Default values; anything specified on the command line takes precedence...
  %Options = (
    colmode       => "colnum",
    detail        => 1,
    indelim       => "comma",
    frequencybins => 10,
    klargest      => 2,
    ksmallest     => 2,
    mode          => "DescriptiveStatisticsBasic",
    outdelim      => "comma",
    precision     => 2,
    quote         => "yes",
    trimfraction  => 0.1,
  );

  # Retrieve all the options...
  my @OptionSpecifications = (
    "colmode|c=s", "columns=s", "columnpairs=s", "detail|d=i", "frequencybins=s", "fast|f",
    "help|h", "indelim=s", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s",
    "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s",
  );
  GetOptions(\%Options, @OptionSpecifications)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to the working directory before any further processing...
  if ($Options{workingdir}) {
    die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n" unless -d $Options{workingdir};
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Options restricted to a fixed set of keywords...
  die "Error: The value specified, $Options{colmode}, for option \"-c --colmode\" is not valid. Allowed values: colnum or collabel\n"
    unless $Options{colmode} =~ /^(colnum|collabel)$/i;

  die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"
    unless IsPositiveInteger($Options{detail});

  die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n"
    unless $Options{indelim} =~ /^(comma|semicolon)$/i;

  die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"
    unless $Options{outdelim} =~ /^(comma|semicolon|tab)$/i;

  die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"
    unless $Options{quote} =~ /^(yes|no)$/i;

  # Options which must be positive integers...
  die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n"
    unless IsPositiveInteger($Options{precision});

  die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n"
    unless IsPositiveInteger($Options{klargest});

  die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n"
    unless IsPositiveInteger($Options{ksmallest});

  # Trim fraction must be a float in between 0 and 1; a single check covers
  # non float and out of range values as the error message is the same...
  if (!IsFloat($Options{trimfraction}) || $Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
    die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
  }
}
1096 | |
1097 __END__ | |
1098 | |
1099 =head1 NAME | |
1100 | |
1101 AnalyzeTextFilesData.pl - Analyze numerical column data in TextFile(s) | |
1102 | |
1103 =head1 SYNOPSIS | |
1104 | |
1105 AnalyzeTextFilesData.pl TextFile(s)... | |
1106 | |
1107 AnalyzeTextFilesData.pl [B<-c, --colmode> colnum | collabel] [B<--columns> "colnum,[colnum,...]" | "collabel,[collabel,...]" | All] | |
1108 [B<--columnpairs> "colnum,colnum,[colnum,colnum]..." | "collabel,collabel,[collabel,collabel]..." | AllPairs] | |
1109 [B<-d, --detail> infolevel] [B<-f, --fast>] [B<--frequencybins> number | "number,number,[number,...]"] [B<-h, --help>] | |
1110 [B<--indelim> comma | semicolon] [B<--klargest> number] [B<--ksmallest> number] | |
1111 [B<-m, --mode> DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]"] | |
1112 [B<-o, --overwrite>] [B<--outdelim> comma | tab | semicolon] [B<-p, --precision> number] | |
1113 [B<-q, --quote> yes | no] [B<-r, --root> rootname] [B<--trimfraction> number] [B<-w, --workingdir> dirname] TextFiles(s)... | |
1114 | |
1115 =head1 DESCRIPTION | |
1116 | |
1117 Analyze numerical column data in I<TextFile(s)> using a combination of various statistical | |
1118 functions; Non-numerical values are simply ignored. For I<Correlation, RSquare, and Covariance> | |
1119 analysis, the count of valid values in specified column pair must be same; otherwise, column | |
1120 pair is ignored. The file names are separated by space. The valid file extensions are I<.csv> | |
1121 and I<.tsv> for comma/semicolon and tab delimited text files respectively. All other | |
1122 file names are ignored. All the text files in a current directory can be specified by | |
1123 I<*.csv>, I<*.tsv>, or the current directory name. The B<--indelim> option determines | |
1124 the format of I<TextFile(s)>. Any file which doesn't correspond to the format indicated | |
1125 by B<--indelim> option is ignored. | |
1126 | |
1127 =head1 OPTIONS | |
1128 | |
1129 =over 4 | |
1130 | |
1131 =item B<-c, --colmode> I<colnum | collabel> | |
1132 | |
1133 Specify how columns are identified in TextFile(s): using column number or column | |
1134 label. Possible values: I<colnum or collabel>. Default value: I<colnum>. | |
1135 | |
1136 =item B<--columns> I<"colnum,[colnum,...]" | "collabel,[collabel]..." | All> | |
1137 | |
1138 This value is mode specific. It's a list of comma delimited columns to use | |
1139 for data analysis. Default value: I<First column>. | |
1140 | |
1141 This value is ignored during I<Correlation/Pearson Correlation> and I<Covariance> | |
1142 data analysis; B<--columnpairs> option is used instead. | |
1143 | |
1144 For I<colnum> value of B<-c, --colmode> option, input values format is: | |
1145 I<colnum,colnum,...>. Example: | |
1146 | |
1147 1,3,5 | |
1148 | |
1149 For I<collabel> value of B<-c, --colmode> option, input values format is: | |
1150 I<collabel,collabel,..>. Example: | |
1151 | |
1152 ALogP,MolWeight,EC50 | |
1153 | |
1154 =item B<--columnpairs> I<"colnum,colnum,[colnum,colnum,...]" | "collabel,collabel,[collabel,collabel,...]" | AllPairs> | |
1155 | |
1156 This value is mode specific and is only used for I<Correlation, PearsonCorrelation, or | |
1157 Covariance> value of B<-m, --mode> option. It is a comma delimited list of column pairs | |
1158 to use for data analysis during I<Correlation> and I<Covariance> calculations. Default value: | |
1159 I<First column, Second column>. | |
1160 | |
1161 For I<colnum> value of B<-c, --colmode> option, input values format is: | |
1162 I<colnum,colnum,[colnum,colnum]...>. Example: | |
1163 | |
1164 1,3,5,6,1,6 | |
1165 | |
1166 For I<collabel> value of B<-c, --colmode> option, input values format is: | |
1167 I<collabel,collabel,[collabel,collabel]..>. Example: | |
1168 | |
1169 MolWeight,EC50,NumN+O,PSA | |
1170 | |
1171 For I<AllPairs> value of B<--columnpairs> option, all column pairs are used for I<Correlation> | |
1172 and I<Covariance> calculations. | |
1173 | |
1174 =item B<-d, --detail> I<infolevel> | |
1175 | |
1176 Level of information to print about column values being ignored. Default: I<1>. Possible values: | |
1177 1, 2, 3, or 4. | |
1178 | |
1179 =item B<-f, --fast> | |
1180 | |
1181 In this mode, all the columns specified for analysis are assumed to contain numerical | |
1182 data and no checking is performed before analysis. By default, only numerical data is | |
1183 used for analysis; other types of column data are ignored. | |
1184 | |
1185 =item B<--frequencybins> I<number | "number,number,[number,...]"> | |
1186 | |
1187 Specify number of bins or bin range to use for frequency analysis. Default value: I<10> | |
1188 | |
1189 Number of bins value along with the smallest and largest value for a column is used to | |
1190 group the column values into different groups. | |
1191 | |
1192 The bin range list is used to group values for a column into different groups; It must contain | |
1193 values in ascending order. Examples: | |
1194 | |
1195 10,20,30 | |
1196 0.1,0.2,0.3,0.4,0.5 | |
1197 | |
1198 The frequency value calculated for a specific bin corresponds to all the column values | |
1199 which are greater than the previous bin value and less than or equal to the current bin value. | |
1200 | |
1201 =item B<-h, --help> | |
1202 | |
1203 Print this help message. | |
1204 | |
1205 =item B<--indelim> I<comma | semicolon> | |
1206 | |
1207 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. | |
1208 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a | |
1209 delimiter. | |
1210 | |
1211 =item B<--klargest> I<number> | |
1212 | |
1213 Kth largest value to find by I<KLargest> function. Default value: I<2> Valid values: positive | |
1214 integers. | |
1215 | |
1216 =item B<--ksmallest> I<number> | |
1217 | |
1218 Kth smallest value to find by I<KSmallest> function. Default value: I<2>. Valid values: positive | |
1219 integers. | |
1220 | |
1221 =item B<-m, --mode> I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]"> | |
1222 | |
1223 Specify how to analyze data in TextFile(s): calculate basic or all descriptive statistics; or | |
1224 use a comma delimited list of supported statistical functions. Possible values: | |
1225 I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1,[function2]...">. Default | |
1226 value: I<DescriptiveStatisticsBasic> | |
1227 | |
1228 I<DescriptiveStatisticsBasic> includes these functions: I<Count, Maximum, Minimum, Mean, | |
1229 Median, Sum, StandardDeviation, StandardError, Variance>. | |
1230 | |
1231 I<DescriptiveStatisticsAll>, in addition to I<DescriptiveStatisticsBasic> functions, includes: | |
1232 I<GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, Mode, RSquare, | |
1233 Skewness, TrimMean>. | |
1234 | |
1235 I<All> uses complete list of supported functions: I<Average, AverageDeviation, Correlation, | |
1236 Count, Covariance, GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, | |
1237 Maximum, Minimum, Mean, Median, Mode, RSquare, Skewness, Sum, | |
1238 SumOfSquares, StandardDeviation, StandardDeviationN, StandardError, StandardScores, | |
1239 StandardScoresN, TrimMean, Variance, VarianceN>. The function names ending with N | |
1240 calculate corresponding values assuming an entire population instead of a population sample. | |
1241 | |
1242 Here are the formulas for these functions: | |
1243 | |
1244 Average: See Mean | |
1245 | |
1246 AverageDeviation: SUM( ABS(x[i] - Xmean) ) / n | |
1247 | |
1248 Correlation: See Pearson Correlation | |
1249 | |
1250 Covariance: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / n | |
1251 | |
1252 GeometricMean: NthROOT( PRODUCT(x[i]) ) | |
1253 | |
1254 HarmonicMean: 1 / ( SUM(1/x[i]) / n ) | |
1255 | |
1256 Mean: SUM( x[i] ) / n | |
1257 | |
1258 Median: Xsorted[(n - 1)/2 + 1] for odd values of n; (Xsorted[n/2] + Xsorted[n/2 + 1])/2 | |
1259 for even values of n. | |
1260 | |
1261 Kurtosis: [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] - | |
1262 {3((n - 1)^2)}/{(n - 2)(n-3)} | |
1263 | |
1264 PearsonCorrelation: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 ) | |
1265 (SUM( (y[i] - Ymean)^2 )) ) | |
1266 | |
1267 RSquare: PearsonCorrelation^2 | |
1268 | |
1269 Skewness: {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 } | |
1270 | |
1271 StandardDeviation: SQRT ( SUM( (x[i] - Mean)^2 ) / (n - 1) ) | |
1272 | |
1273 StandardDeviationN: SQRT ( SUM( (x[i] - Mean)^2 ) / n ) | |
1274 | |
1275 StandardError: StandardDeviation / SQRT( n ) | |
1276 | |
1277 StandardScore: (x[i] - Mean) / StandardDeviation | |
1278 | |
1279 StandardScoreN: (x[i] - Mean) / StandardDeviationN | |
1280 | |
1281 Variance: SUM( (x[i] - Xmean)^2 / (n - 1) ) | |
1282 | |
1283 VarianceN: SUM( (x[i] - Xmean)^2 / n ) | |
1284 | |
1285 =item B<-o, --overwrite> | |
1286 | |
1287 Overwrite existing files. | |
1288 | |
1289 =item B<--outdelim> I<comma | tab | semicolon> | |
1290 | |
1291 Output text file delimiter. Possible values: I<comma, tab, or semicolon> | |
1292 Default value: I<comma>. | |
1293 | |
1294 =item B<-p, --precision> I<number> | |
1295 | |
1296 Precision of calculated values in the output file. Default: up to I<2> decimal places. | |
1297 Valid values: positive integers. | |
1298 | |
1299 =item B<-q, --quote> I<yes | no> | |
1300 | |
1301 Put quotes around column values in output text file. Possible values: I<yes or | |
1302 no>. Default value: I<yes>. | |
1303 | |
1304 =item B<-r, --root> I<rootname> | |
1305 | |
1306 New text file name is generated using the root: <Root><Mode>.<Ext>. Default new file | |
1307 name: <InitialTextFileName><Mode>.<Ext>. Based on the specified analysis, | |
1308 <Mode> corresponds to one of these values: DescriptiveStatisticsBasic, | |
1309 DescriptiveStatisticsAll, AllStatistics, SpecifiedStatistics, Covariance, Correlation, | |
1310 Frequency, or StandardScores. The csv, and tsv <Ext> values are used for | |
1311 comma/semicolon, and tab delimited text files respectively. This option is ignored for | |
1312 multiple input files. | |
1313 | |
1314 =item B<--trimfraction> I<number> | |
1315 | |
1316 Fraction of data to exclude from the top and bottom of the data set during | |
1317 I<TrimMean> calculation. Default value: I<0.1>. Valid values: > 0 and < 1. | |
1318 | |
1319 =item B<-w --workingdir> I<text> | |
1320 | |
1321 Location of working directory. Default: current directory. | |
1322 | |
1323 =back | |
1324 | |
1325 =head1 EXAMPLES | |
1326 | |
1327 To calculate basic statistics for data in first column and generate a | |
1328 NewSample1DescriptiveStatisticsBasic.csv file, type: | |
1329 | |
1330 % AnalyzeTextFilesData.pl -o -r NewSample1 Sample1.csv | |
1331 | |
1332 To calculate basic statistics for data in third column and generate a | |
1333 NewSample1DescriptiveStatisticsBasic.csv file, type: | |
1334 | |
1335 % AnalyzeTextFilesData.pl --columns 3 -o -r NewSample1 Sample1.csv | |
1336 | |
1337 To calculate basic statistics for data in MolWeight column and generate a | |
1338 NewSample1DescriptiveStatisticsBasic.csv file, type: | |
1339 | |
1340 % AnalyzeTextFilesData.pl -colmode collabel --columns MolWeight -o | |
1341 -r NewSample1 Sample1.csv | |
1342 | |
1343 To calculate all available statistics for data in third column and all column pairs, | |
1344 and generate NewSample1DescriptiveStatisticsAll.csv, NewSample1RSquareMatrix.csv, | |
1345 and NewSample1MolWeightFrequencyAnalysis.csv files, | |
1346 type: | |
1347 | |
1348 % AnalyzeTextFilesData.pl -m DescriptiveStatisticsAll --columns 3 -o | |
1349 --columnpairs AllPairs -r NewSample1 Sample1.csv | |
1350 | |
1351 To compute frequency distribution of data in third column into five bins and | |
1352 generate NewSample1MolWeightFrequencyAnalysis.csv, type: | |
1353 | |
1354 % AnalyzeTextFilesData.pl -m Frequency --frequencybins 5 --columns 3 | |
1355 -o -r NewSample1 Sample1.csv | |
1356 | |
1357 To compute frequency distribution of data in third column into specified bin range | |
1358 values, and generate NewSample1MolWeightFrequencyAnalysis.csv, type: | |
1359 | |
1360 % AnalyzeTextFilesData.pl -m Frequency --frequencybins "100,200,400" | |
1361 --columns 3 -o -r NewSample1 Sample1.csv | |
1362 | |
1363 To calculate all available statistics for data in all columns and column pairs, type: | |
1364 | |
1365 % AnalyzeTextFilesData.pl -m All --columns All --columnpairs | |
1366 AllPairs -o -r NewSample1 Sample1.csv | |
1367 | |
1368 =head1 AUTHOR | |
1369 | |
1370 Manish Sud <msud@san.rr.com> | |
1371 | |
1372 =head1 SEE ALSO | |
1373 | |
1374 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl | |
1375 | |
1376 =head1 COPYRIGHT | |
1377 | |
1378 Copyright (C) 2015 Manish Sud. All rights reserved. | |
1379 | |
1380 This file is part of MayaChemTools. | |
1381 | |
1382 MayaChemTools is free software; you can redistribute it and/or modify it under | |
1383 the terms of the GNU Lesser General Public License as published by the Free | |
1384 Software Foundation; either version 3 of the License, or (at your option) | |
1385 any later version. | |
1386 | |
1387 =cut |