comparison bin/AnalyzeTextFilesData.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: AnalyzeTextFilesData.pl,v $
4 # $Date: 2015/02/28 20:46:04 $
5 # $Revision: 1.36 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability or fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use StatisticsUtil;
38
# Script-wide state; the subroutines defined further down read and update these...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Turn on autoflush for STDOUT so that progress messages show up right away...
$| = 1;

# Announce the script and start the clock...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Set up the options; print usage and quit when help is requested or no
# input file is specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || !@ARGV);

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Gather column information for each input file before any analysis...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# Analyze each file which passed the checks above...
my($FileIndex);
print "\nProcessing text files...\n" if (@TextFilesList > 1);
for $FileIndex (0 .. $#TextFilesList) {
  next unless ($TextFilesInfo{FileOkay}[$FileIndex]);
  print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  AnalyzeTextFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
84
85 ###############################################################################
86
# Read the data rows of one text file, collect the values of the columns to be
# analyzed and run every requested analysis on them...
#
# Parameters:
#   $Index - Position of the file in @TextFilesList and in the per-file arrays
#            of %TextFilesInfo.
#
sub AnalyzeTextFile {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $Line, $InDelim, $ColNum, $Value, @LineWords, @ColNumsToAnalyze, %ColValuesToAnalyzeMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColNumsToAnalyze = @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]};

  # Every column starts with an empty value list; a column without any valid
  # value still has an entry in the map...
  %ColValuesToAnalyzeMap = ();
  for $ColNum (@ColNumsToAnalyze) {
    @{$ColValuesToAnalyzeMap{$ColNum}} = ();
  }

  my($LineCount, $InvalidLineCount, @InvalidColLabels);

  # Three-argument open: the file name is never interpreted as a mode or a pipe...
  open $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";

  # Skip over column labels line in text file and collect appropriate column data
  # for analysis...
  $Line = GetTextLine($TextFileHandle);
  $LineCount = 1;
  $InvalidLineCount = 0;
  while ($Line = GetTextLine($TextFileHandle)) {
    $LineCount++;
    @LineWords = quotewords($InDelim, 0, $Line);
    @InvalidColLabels = ();
    COLNUM: for $ColNum (@ColNumsToAnalyze) {
      $Value = $LineWords[$ColNum];
      if ($OptionsInfo{CheckData} && !IsNumerical($Value)) {
        # Remember the column label for reporting and leave the value out...
        push @InvalidColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
        next COLNUM;
      }
      push @{$ColValuesToAnalyzeMap{$ColNum}}, $Value;
    }
    if (@InvalidColLabels) {
      $InvalidLineCount++;
      # The higher the detail level, the more is reported about the line...
      if ($OptionsInfo{DetailLevel} >= 4) {
        print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed: $Line \n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 3) {
        print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed...\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for columns to be analyzed...\n";
      }
    }
  }
  if ($InvalidLineCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidLineCount line(s)...\n";
  }
  close $TextFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction, $FunctionsMapRef);
  @SpecifiedFunctionNames = ();
  $FunctionsMapRef = $OptionsInfo{SpecifiedStatisticalFunctionsMap};

  # Functions handled by the dedicated routines below are left out of the
  # per-column analysis...
  for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $FunctionsMapRef->{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%ColValuesToAnalyzeMap);
  }
  if (exists($FunctionsMapRef->{covariance}) || exists($FunctionsMapRef->{correlation}) || exists($FunctionsMapRef->{rsquare})) {
    if ($OptionsInfo{AllColumnPairs}) {
      PerformMatrixAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformColumnPairAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
  }
  if (exists($FunctionsMapRef->{standardscores}) || exists($FunctionsMapRef->{standardscoresn})) {
    PerformStandardScoresAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
  if (exists($FunctionsMapRef->{frequency})) {
    PerformFrequencyAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
}
169
# Calculate values for various statistical functions and write them out, one
# row per analyzed column, to <OutFileRoot><FileNameMode>.<OutFileExt>...
#
# Parameters:
#   $Index                     - Position of the file in the per-file arrays of %TextFilesInfo.
#   $SpecifiedFunctionNamesRef - Reference to the list of function names to invoke.
#   $ColValuesToAnalyzeMapRef  - Reference to a hash mapping column number to a list of values.
#
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, $Line, $SpecifiedFunction, $Label, @ColLabels, @ColNumsToAnalyze);

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $TextFilesInfo{OutFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ("ColumnID");
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # Label is the k value with a number suffix followed by the function name
      # without any "K"...
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each column to be analyzed...
  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};

  my($ColValuesRef, $ColNum, $Value, $ValueCount, @RowValues, %CalculatedValues);
  %CalculatedValues = ();
  for $ColNum (@ColNumsToAnalyze) {
    # Statistical functions are invoked through their name strings; only
    # symbolic references need to be allowed for that...
    no strict 'refs';

    @RowValues = ();
    # Setup column id...
    push @RowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum];
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    $ValueCount = scalar(@{$ColValuesRef});

    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!$ValueCount) {
        # No valid column values: write out an empty value...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = $ValueCount;
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = &$SpecifiedFunction($ColValuesRef, $OptionsInfo{KLargest});
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = &$SpecifiedFunction($ColValuesRef, $OptionsInfo{KSmallest});
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        # Standard deviation is cached: it may already have been calculated
        # for the standard error...
        if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
          $CalculatedValues{$ColNum}{StandardDeviation} = &$SpecifiedFunction($ColValuesRef);
        }
        $Value = $CalculatedValues{$ColNum}{StandardDeviation};
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
          $Value = StandardDeviation($ColValuesRef);
          $CalculatedValues{$ColNum}{StandardDeviation} = $Value;
        }
        if (defined $CalculatedValues{$ColNum}{StandardDeviation}) {
          # NOTE(review): StandardError presumably takes the standard deviation
          # and the number of values - confirm against StatisticsUtil. Pass the
          # count instead of the flattened list of values, which would make
          # the first data value to be used as the count.
          $Value = &$SpecifiedFunction($CalculatedValues{$ColNum}{StandardDeviation}, $ValueCount);
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = &$SpecifiedFunction($ColValuesRef, $OptionsInfo{TrimFraction});
      }
      else {
        $Value = &$SpecifiedFunction($ColValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  # Buffered write errors show up at close...
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}
262
# Calculate covariance, correlation, rsquare for specified column pairs and
# write them out, one row per pair, to <OutFileRoot>ColumnPairsAnalysis.<OutFileExt>...
#
# A correlation column is written out for rsquare as well. Pairs with different
# number of valid values get a warning and empty values.
#
# Parameters:
#   $Index                    - Position of the file in the per-file arrays of %TextFilesInfo.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping column number to a list of values.
#
sub PerformColumnPairAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "ColumnPairsAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out the column labels...
  @ColLabels = ("ColumnID1", "ColumnID2");
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @ColLabels, "Correlation";
    if ($CalculateRSquare) {
      push @ColLabels, "RSquare";
    }
  }
  if ($CalculateCovariance) {
    push @ColLabels, "Covariance";
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each column pair...
  my($CorrelationValue, $RSquareValue, $CovarianceValue, $ColIndex, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, $PairOkay, @ColPairs1ToAnalyze, @ColPairs2ToAnalyze, @RowValues, $Value);

  @ColPairs1ToAnalyze = @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]};
  @ColPairs2ToAnalyze = @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]};
  for $ColIndex (0 .. $#ColPairs1ToAnalyze) {
    $ColNum1 = $ColPairs1ToAnalyze[$ColIndex];
    $ColNum2 = $ColPairs2ToAnalyze[$ColIndex];
    $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
    $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};

    # Setup column ids...
    @RowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1], $TextFilesInfo{ColLabels}[$Index][$ColNum2]);

    # Both columns must have the same number of valid values...
    $PairOkay = (@$ColValuesRef1 == @$ColValuesRef2) ? 1 : 0;
    if (!$PairOkay) {
      warn "Warning: Skipping analysis for column pair $TextFilesInfo{ColLabels}[$Index][$ColNum1], $TextFilesInfo{ColLabels}[$Index][$ColNum2]: Number of valid data values must be same.\n";
    }

    if ($CalculateCorrelation || $CalculateRSquare) {
      $CorrelationValue = $PairOkay ? Correlation($ColValuesRef1, $ColValuesRef2) : "";
      $Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
      push @RowValues, $Value;
      if ($CalculateRSquare) {
        # Rsquare is calculated from the unrounded correlation value...
        $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
        $Value = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
        push @RowValues, $Value;
      }
    }
    if ($CalculateCovariance) {
      $CovarianceValue = $PairOkay ? Covariance($ColValuesRef1, $ColValuesRef2) : "";
      $Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  # Buffered write errors show up at close...
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}
342
# Generate histogram numbers: one file per analyzed column named
# <OutFileRoot><ColumnLabel>FrequencyAnalysis.<OutFileExt> containing bins and
# frequency values...
#
# Bins come from the BinRange option when it is not empty; otherwise, NumOfBins
# option is passed to Frequency.
#
# Parameters:
#   $Index                    - Position of the file in the per-file arrays of %TextFilesInfo.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping column number to a list of values.
#
sub PerformFrequencyAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, @RowValues, $Line, $ColNum, @ColNumsToAnalyze, $ColValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);

  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};
  for $ColNum (@ColNumsToAnalyze) {
    $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
    print "Generating new text file $NewTextFile...\n";
    open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";

    # Calculate and write out frequency values; a column without any valid
    # value gets a file containing only the column labels...
    %FrequencyMap = ();
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    if (@$ColValuesRef) {
      if (@{$OptionsInfo{BinRange}}) {
        %FrequencyMap = Frequency($ColValuesRef, \@{$OptionsInfo{BinRange}});
      }
      else {
        %FrequencyMap = Frequency($ColValuesRef, $OptionsInfo{NumOfBins});
      }
    }
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      @RowValues = ();
      $Value = (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
      push @RowValues, $Value;
      $Value = (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";
      push @RowValues, $Value;

      $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileHandle "$Line\n";
    }
    # Buffered write errors show up at close...
    close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
  }
}
386
# Calculate covariance, correlation/rsquare matrices over all columns of a file
# and write each requested matrix out to its own file:
# <OutFileRoot>CorrelationMatrix, <OutFileRoot>RSquareMatrix and
# <OutFileRoot>CovarianceMatrix with <OutFileExt> as extension...
#
# Correlation matrix is written out for rsquare as well.
#
# Parameters:
#   $Index                    - Position of the file in the per-file arrays of %TextFilesInfo.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping column number to a list of values.
#
sub PerformMatrixAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $CorrelationTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CorrelationMatrix." . $TextFilesInfo{OutFileExt}[$Index];
  $RSquareTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "RSquareMatrix." . $TextFilesInfo{OutFileExt}[$Index];
  $CovarianceTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CovarianceMatrix." . $TextFilesInfo{OutFileExt}[$Index];

  my($WriteCorrelation, $LastColNum);
  $WriteCorrelation = ($CalculateCorrelation || $CalculateRSquare) ? 1 : 0;
  $LastColNum = $TextFilesInfo{ColCount}[$Index] - 1;

  # Report the names of the files which are actually generated...
  my(@NewTextFiles);
  @NewTextFiles = ();
  if ($WriteCorrelation) {
    push @NewTextFiles, $CorrelationTextFile;
    if ($CalculateRSquare) {
      push @NewTextFiles, $RSquareTextFile;
    }
  }
  if ($CalculateCovariance) {
    push @NewTextFiles, $CovarianceTextFile;
  }
  if (@NewTextFiles > 1) {
    print "Generating new text files ", join(", ", @NewTextFiles), "...\n";
  }
  else {
    print "Generating new text file ", join(", ", @NewTextFiles), "...\n";
  }

  my($CorrelationFileHandle, $RSquareFileHandle, $CovarianceFileHandle);
  if ($WriteCorrelation) {
    open $CorrelationFileHandle, ">", $CorrelationTextFile or die "Error: Can't open $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      open $RSquareFileHandle, ">", $RSquareTextFile or die "Error: Can't open $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    open $CovarianceFileHandle, ">", $CovarianceTextFile or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $CorrelationValue, $RawCorrelationValue, $RSquareValue, $CovarianceValue, $ColNum, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);

  # Write out the column labels; first label is empty as the first column
  # holds row labels...
  @ColLabels = ("");
  for $ColNum (0 .. $LastColNum) {
    push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  if ($WriteCorrelation) {
    print $CorrelationFileHandle "$Line\n";
    if ($CalculateRSquare) {
      print $RSquareFileHandle "$Line\n";
    }
  }
  if ($CalculateCovariance) {
    print $CovarianceFileHandle "$Line\n";
  }

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap);

  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
  for $ColNum1 (0 .. $LastColNum) {
    for $ColNum2 (0 .. $ColNum1) {
      $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
      $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};
      if ($WriteCorrelation) {
        $RawCorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2);
        $CorrelationValue = (defined($RawCorrelationValue) && length($RawCorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RawCorrelationValue) + 0) : "";
        $CorrelationMatrixMap{$ColNum1}{$ColNum2} = $CorrelationValue;
        $CorrelationMatrixMap{$ColNum2}{$ColNum1} = $CorrelationValue;
        if ($CalculateRSquare) {
          # Square the unrounded correlation value, as it's done during the
          # column pair analysis, to avoid compounding the rounding error...
          $RSquareValue = (defined($RawCorrelationValue) && length($RawCorrelationValue)) ? ($RawCorrelationValue ** 2) : "";
          $RSquareValue = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
          $RSquareMatrixMap{$ColNum1}{$ColNum2} = $RSquareValue;
          $RSquareMatrixMap{$ColNum2}{$ColNum1} = $RSquareValue;
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2);
        $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
        $CovarianceMatrixMap{$ColNum1}{$ColNum2} = $CovarianceValue;
        $CovarianceMatrixMap{$ColNum2}{$ColNum1} = $CovarianceValue;
      }
    }
  }

  # Write out the matrices; each row starts with its column label...
  for $ColNum1 (0 .. $LastColNum) {
    @CorrelationRowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1]);
    @RSquareRowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1]);
    @CovarianceRowValues = ($TextFilesInfo{ColLabels}[$Index][$ColNum1]);
    for $ColNum2 (0 .. $LastColNum) {
      if ($WriteCorrelation) {
        push @CorrelationRowValues, $CorrelationMatrixMap{$ColNum1}{$ColNum2};
        if ($CalculateRSquare) {
          push @RSquareRowValues, $RSquareMatrixMap{$ColNum1}{$ColNum2};
        }
      }
      if ($CalculateCovariance) {
        push @CovarianceRowValues, $CovarianceMatrixMap{$ColNum1}{$ColNum2};
      }
    }
    if ($WriteCorrelation) {
      $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CorrelationFileHandle "$Line\n";
      if ($CalculateRSquare) {
        $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $RSquareFileHandle "$Line\n";
      }
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CovarianceFileHandle "$Line\n";
    }
  }

  # Buffered write errors show up at close...
  if ($WriteCorrelation) {
    close $CorrelationFileHandle or die "Error: Can't close $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      close $RSquareFileHandle or die "Error: Can't close $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    close $CovarianceFileHandle or die "Error: Can't close $CovarianceTextFile: $! \n";
  }
}
531
# Calculate standard scores for each analyzed column and write them out, one
# row per input data row, to <OutFileRoot>StandardScores.<OutFileExt>...
#
# The input file is read again in order to keep the scores lined up with the
# input rows; a value which fails the data check, or a column without a usable
# mean or standard deviation, gets an empty score.
#
# Parameters:
#   $Index                    - Position of the file in @TextFilesList and in the per-file arrays of %TextFilesInfo.
#   $ColValuesToAnalyzeMapRef - Reference to a hash mapping column number to a list of values.
#
sub PerformStandardScoresAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, $NewTextFileHandle, @ColLabels, $Label, $NewLine);

  $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "StandardScores." . $TextFilesInfo{OutFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  my($ColValuesRef, $ColNum, @ColNumsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};
  for $ColNum (@ColNumsToAnalyze) {
    $Label = $TextFilesInfo{ColLabels}[$Index][$ColNum];
    if ($StandardScores) {
      push @ColLabels, "${Label}(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${Label}(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values; a column specified multiple times is calculated once...
  my(%StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $ColNum (@ColNumsToAnalyze) {
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    if (!exists($MeanMap{$ColNum})) {
      $MeanMap{$ColNum} = Mean($ColValuesRef);
    }
    if ($StandardScores && !exists($StandardDeviationMap{$ColNum})) {
      $StandardDeviationMap{$ColNum} = StandardDeviation($ColValuesRef);
    }
    if ($StandardScoresN && !exists($StandardDeviationNMap{$ColNum})) {
      $StandardDeviationNMap{$ColNum} = StandardDeviationN($ColValuesRef);
    }
  }

  # Score is (x[i] - mean) / standard deviation. An undefined mean or an undefined
  # or zero standard deviation leads to an empty score instead of a division by zero...
  my($CalculateScore);
  $CalculateScore = sub {
    my($DataValue, $Mean, $StandardDeviation) = @_;

    if (!defined($Mean) || !defined($StandardDeviation) || $StandardDeviation == 0) {
      return "";
    }
    # Format the score. And add zero to get rid of trailing zeros...
    return sprintf("%.$OptionsInfo{Precision}f", ($DataValue - $Mean)/$StandardDeviation) + 0;
  };

  # Go over each row and calculate standard scores for each column using
  # StandardDeviation for StandardScores and StandardDeviationN for
  # StandardScoresN; write out the calculated values as well...
  my($TextFile, $TextFileHandle, $InDelim, $Line, $Value, $ValueOkay, $ScoreValue, @RowValues, @LineWords);
  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];

  open $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";
  # Skip over column labels line...
  $Line = GetTextLine($TextFileHandle);
  while ($Line = GetTextLine($TextFileHandle)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    @RowValues = ();
    for $ColNum (@ColNumsToAnalyze) {
      $Value = $LineWords[$ColNum];
      $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;
      if ($StandardScores) {
        $ScoreValue = $ValueOkay ? $CalculateScore->($Value, $MeanMap{$ColNum}, $StandardDeviationMap{$ColNum}) : "";
        push @RowValues, $ScoreValue;
      }
      if ($StandardScoresN) {
        $ScoreValue = $ValueOkay ? $CalculateScore->($Value, $MeanMap{$ColNum}, $StandardDeviationNMap{$ColNum}) : "";
        push @RowValues, $ScoreValue;
      }
    }
    $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$NewLine\n";
  }
  close $TextFileHandle;
  # Buffered write errors show up at close...
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}
616
# Make sure the specified columns exist in text files and set up, for each
# file, the columns and column pairs to analyze...
#
# The following per-file entries of %TextFilesInfo are filled in:
#   ColNumsToAnalyze       - Zero based column numbers to analyze.
#   ColPairs1ToAnalyze     - First column number of each column pair.
#   ColPairs2ToAnalyze     - Second column number of each column pair.
#   UniqueColNumsToAnalyze - Sorted unique column numbers from the lists above.
#
# FileOkay is set to 0, after a warning, for a file without any valid column or
# with an already existing frequency output file in the absence of overwrite
# option.
#
sub ProcessColumnsInfo {
  my($Index, $TextFile, $ColNum, $ColIndex, $ColCount, $ColModeIsColNum, $FrequencySpecified, $ColumnPairFunctionSpecified, @ColNumsToAnalyze, %UniqueColNumsToAnalyzeMap);

  @{$TextFilesInfo{ColNumsToAnalyze}} = ();
  @{$TextFilesInfo{ColPairs1ToAnalyze}} = ();
  @{$TextFilesInfo{ColPairs2ToAnalyze}} = ();
  @{$TextFilesInfo{UniqueColNumsToAnalyze}} = ();

  # These options are same for all the files...
  $FrequencySpecified = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency}) ? 1 : 0;
  $ColumnPairFunctionSpecified = (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    @{$TextFilesInfo{ColNumsToAnalyze}[$Index]} = ();
    @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]} = ();
    @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]} = ();
    @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]} = ();

    %UniqueColNumsToAnalyzeMap = ();

    next FILELIST if (!$TextFilesInfo{FileOkay}[$Index]);

    $ColCount = $TextFilesInfo{ColCount}[$Index];

    # Setup specified columns: explicit column numbers or labels, all the
    # columns, or the first column by default...
    @ColNumsToAnalyze = ();
    if (@{$OptionsInfo{SpecifiedColumns}}) {
      $ColModeIsColNum = ($OptionsInfo{ColMode} =~ /^colnum$/i) ? 1 : 0;
      if ($ColModeIsColNum) {
        # Specified column numbers start from 1...
        for $ColNum (@{$OptionsInfo{SpecifiedColumns}}) {
          if ($ColNum >= 1 && $ColNum <= $ColCount) {
            push @ColNumsToAnalyze, ($ColNum - 1);
          }
        }
      }
      else {
        my($ColLabel);
        for $ColLabel (@{$OptionsInfo{SpecifiedColumns}}) {
          if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
            push @ColNumsToAnalyze, $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
          }
        }
      }
    }
    elsif (defined $OptionsInfo{Columns} && $OptionsInfo{Columns} =~ /^All$/i) {
      push @ColNumsToAnalyze, (0 .. ($ColCount - 1));
    }
    else {
      push @ColNumsToAnalyze, 0;
    }

    if (!@ColNumsToAnalyze) {
      warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist.\n";
      $TextFilesInfo{FileOkay}[$Index] = 0;
      next FILELIST;
    }
    push @{$TextFilesInfo{ColNumsToAnalyze}[$Index]}, @ColNumsToAnalyze;
    # Set up unique columns map as well...
    for $ColNum (@ColNumsToAnalyze) {
      if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) {
        $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum;
      }
    }

    if (!$OptionsInfo{Overwrite} && $FrequencySpecified) {
      # Make sure specific frequency files don't exist...
      my($FrequencyFile);
      for $ColNum (@ColNumsToAnalyze) {
        $FrequencyFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
        if (-e $FrequencyFile) {
          warn "Warning: Ignoring file $TextFile: The file $FrequencyFile already exists.\n";
          $TextFilesInfo{FileOkay}[$Index] = 0;
          next FILELIST;
        }
      }
    }

    # Setup specified column pairs...
    if ($ColumnPairFunctionSpecified) {
      my(@ColPairsToAnalyze, $ColNum1, $ColNum2);
      if (@{$OptionsInfo{SpecifiedColumnPairs}}) {
        # Make sure both columns exist; a trailing unpaired value is ignored...
        if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
          for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) {
            $ColNum1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex];
            $ColNum2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1];
            if ($ColNum1 >= 1 && $ColNum1 <= $ColCount && $ColNum2 >= 1 && $ColNum2 <= $ColCount) {
              push @ColPairsToAnalyze, ($ColNum1 - 1, $ColNum2 - 1);
            }
          }
        }
        else {
          my($ColLabel1, $ColLabel2);
          for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) {
            $ColLabel1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex];
            $ColLabel2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1];
            if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1}) && exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2})) {
              $ColNum1 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1};
              $ColNum2 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2};
              push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
            }
          }
        }
      }
      elsif ($OptionsInfo{AllColumnPairs}) {
        for $ColNum1 (0 .. ($ColCount - 1)) {
          for $ColNum2 (0 .. ($ColCount - 1)) {
            push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
          }
        }
      }
      elsif ($ColCount >= 2) {
        # Default is the first two columns...
        push @ColPairsToAnalyze, (0,1);
      }

      if (@ColPairsToAnalyze % 2) {
        warn "Warning: Ignoring file $TextFile: Invalid number values specified using \"--columnpairs\" option: It must contain even number of valid values.\n";
        $TextFilesInfo{FileOkay}[$Index] = 0;
        next FILELIST;
      }
      for ($ColIndex = 0; $ColIndex < @ColPairsToAnalyze; $ColIndex += 2) {
        push @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex];
        push @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex + 1];
      }
      # Set up unique columns map as well...
      for $ColNum (@ColPairsToAnalyze) {
        if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) {
          $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum;
        }
      }
    }

    # Setup unique columns array; column numbers are sorted numerically as the
    # default string sort places 10 before 2...
    push @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]}, (sort { $a <=> $b } keys %UniqueColNumsToAnalyzeMap);
  }
}
756
# Retrieve information about input text files...
#
# Rebuilds the global %TextFilesInfo hash with one entry per file listed in
# @TextFilesList. For each file index, these arrays are filled in:
#
#   FileOkay          1 when the file passed all the checks below; 0 otherwise
#   ColCount          Number of column labels found on the first line
#   ColLabels         Column labels retrieved from the first line
#   ColLabelToNumMap  Column label to zero based column number map
#   InDelim           Tab for tsv files; comma or semicolon for csv files
#   OutFileRoot       Output file name without mode suffix and extension
#   OutFileExt        "tsv" for tab value of --outdelim; otherwise, "csv"
#
# A file is ignored with a warning, and stays marked as not okay, when: it
# doesn't exist; CheckFileType rejects it as a csv/tsv file; --indelim value
# is not valid for a csv file; it can't be opened; the generated output file
# name matches the input file name (case insensitive); or, without
# --overwrite, any output file to be generated for it already exists.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $OutFile, $OutFileExt, $ColNum, $ColLabel);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFileRoot}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start out pessimistic: any "next FILELIST" below leaves these defaults in place...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFileRoot}[$Index] = "";
    $TextFilesInfo{OutFileExt}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Input delimiter is driven by file extension for tsv files and by --indelim for csv files...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($Options{indelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Use three argument open with an explicit read mode so that characters in a user
    # supplied file name are never interpreted as an open mode or a pipe...
    my($TextFileHandle);
    if (!open $TextFileHandle, "<", $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # Only the first line is needed: it holds the column labels...
    $Line = GetTextLine($TextFileHandle);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFileHandle;

    # Setup output file name: <Root><Mode>.<Ext>...
    $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    $OutFileRoot = $FileName;
    if ($Options{root} && (@TextFilesList == 1)) {
      # --root is honored only for a single input file; drop any extension it carries...
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";

    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }

    if (!$Options{overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }

      # Collect names of additional output files implied by the specified functions. The same
      # name is used for the existence test and the warning, so the message always reports
      # the file which was actually found...
      my($FunctionsMapRef, @AdditionalOutFiles, $AdditionalOutFile);
      $FunctionsMapRef = $OptionsInfo{SpecifiedStatisticalFunctionsMap};
      @AdditionalOutFiles = ();

      if (exists($FunctionsMapRef->{covariance}) || exists($FunctionsMapRef->{correlation}) || exists($FunctionsMapRef->{rsquare})) {
        if ($OptionsInfo{AllColumnPairs}) {
          if (exists($FunctionsMapRef->{covariance})) {
            push @AdditionalOutFiles, "${OutFileRoot}CovarianceMatrix.${OutFileExt}";
          }
          if (exists($FunctionsMapRef->{correlation})) {
            push @AdditionalOutFiles, "${OutFileRoot}CorrelationMatrix.${OutFileExt}";
          }
          if (exists($FunctionsMapRef->{rsquare})) {
            push @AdditionalOutFiles, "${OutFileRoot}RSquareMatrix.${OutFileExt}";
          }
        }
        else {
          push @AdditionalOutFiles, "${OutFileRoot}ColumnPairsAnalysis.${OutFileExt}";
        }
      }
      if (exists($FunctionsMapRef->{standardscores})) {
        push @AdditionalOutFiles, "${OutFileRoot}StandardScores.${OutFileExt}";
      }

      for $AdditionalOutFile (@AdditionalOutFiles) {
        if (-e $AdditionalOutFile) {
          warn "Warning: Ignoring file $TextFile: The file $AdditionalOutFile already exists.\n";
          next FILELIST;
        }
      }
    }

    # All checks passed: record information for this file...
    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutFileRoot}[$Index] = "$OutFileRoot";
    $TextFilesInfo{OutFileExt}[$Index] = "$OutFileExt";

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
}
887
# Process option values...
#
# Resets the global %OptionsInfo hash and fills it in from the command line
# values held in %Options: analysis mode and the list/map of statistical
# functions to use, output delimiter and quoting, overwrite/root/fast flags,
# precision, KLargest/KSmallest/TrimMean parameters, frequency bins, and the
# columns and column pairs specified for analysis.
#
# Dies with an error message for unsupported function names in --mode,
# invalid --frequencybins values, and invalid --columns or --columnpairs
# values.
#
sub ProcessOptions {
  my(@KnownFunctions, %KnownFunctionsMap, $ModeValue, $FunctionName);

  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{DetailLevel} = $Options{detail};

  # All the statistical functions available for analysis along with a lower case
  # name to function name map...
  @KnownFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
  %KnownFunctionsMap = map { (lc($_), $_) } @KnownFunctions;

  # Figure out which functions to use from the mode value...
  $OptionsInfo{SpecifiedStatisticalFunctionsMap} = {};
  $OptionsInfo{SpecifiedStatisticalFunctions} = [];

  $ModeValue = $Options{mode};
  if ($ModeValue =~ /^DescriptiveStatisticsBasic$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic";
    push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
  }
  elsif ($ModeValue =~ /^DescriptiveStatisticsAll$/i ) {
    $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll";
    push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum);
  }
  elsif ($ModeValue =~ /^All$/i ) {
    $OptionsInfo{FileNameMode} = "AllStatistics";
    push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, @KnownFunctions;
  }
  else {
    # Anything else is treated as a comma delimited list of function names; spaces
    # are dropped before splitting...
    my(@RejectedNames);

    $OptionsInfo{FileNameMode} = "SpecifiedStatistics";
    $ModeValue =~ tr/ //d;

    @RejectedNames = ();
    for $FunctionName (split ",", $ModeValue) {
      if (!exists($KnownFunctionsMap{lc($FunctionName)})) {
        push @RejectedNames, $FunctionName;
        next;
      }
      push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $FunctionName;
    }

    if (@RejectedNames) {
      if (@RejectedNames == 1) {
        warn "Error: The value specified, @RejectedNames , for option \"-m --mode\" is not valid.\n";
      }
      else {
        warn "Error: The values specified - ", JoinWords(\@RejectedNames, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      die "Allowed values:", JoinWords(\@KnownFunctions, ", ", 0), "\n";
    }
  }

  # Map lower case name of each function in use to its supported name; the first
  # occurrence of a name wins...
  for $FunctionName (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    my($FunctionKey) = lc($FunctionName);
    $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey} = $KnownFunctionsMap{$FunctionKey}
      unless exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey};
  }

  # Output delimiter and quoting...
  if ($Options{outdelim} =~ /tab/i ) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = 0;
  if ($Options{quote} =~ /yes/i ) {
    $OptionsInfo{OutQuote} = 1;
  }

  # Undefined values stay undefined...
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{Root} = $Options{root};

  # Data is checked unless --fast has been specified...
  $OptionsInfo{CheckData} = 1;
  if ($Options{fast}) {
    $OptionsInfo{CheckData} = 0;
  }
  $OptionsInfo{Precision} = $Options{precision};

  $OptionsInfo{KLargest} = $Options{klargest};
  $OptionsInfo{KSmallest} = $Options{ksmallest};

  $OptionsInfo{TrimFraction} = $Options{trimfraction};

  # Frequency bins: a value without a comma is number of bins; otherwise, it's an explicit
  # bin range and number of bins stays at 10...
  $OptionsInfo{NumOfBins} = 10;
  $OptionsInfo{BinRange} = [];
  if ($Options{frequencybins} !~ /\,/) {
    $OptionsInfo{NumOfBins} = $Options{frequencybins};
    IsPositiveInteger($OptionsInfo{NumOfBins})
      or die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
  }
  else {
    my(@BinValues, $BinValue, $EarlierIndex, $LaterIndex);

    @BinValues = split /\,/, $Options{frequencybins};
    die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n"
      if (@BinValues < 2);

    for $BinValue (@BinValues) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n"
        unless IsNumerical($BinValue);
    }

    # Each value must be smaller than every value which follows it...
    for $LaterIndex (1 .. $#BinValues) {
      for $EarlierIndex (0 .. ($LaterIndex - 1)) {
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n"
          if ($BinValues[$EarlierIndex] >= $BinValues[$LaterIndex]);
      }
    }
    push @{$OptionsInfo{BinRange}}, @BinValues;
  }

  # Columns specified for analysis...
  $OptionsInfo{ColMode} = $Options{colmode};
  $OptionsInfo{Columns} = $Options{columns};

  $OptionsInfo{SpecifiedColumns} = [];
  if (defined $Options{columns} && $Options{columns} !~ /^All$/i) {
    my(@ColumnValues, $ColumnValue);

    @ColumnValues = split ",", $Options{columns};
    if ($Options{colmode} =~ /^colnum$/i) {
      # Column numbers must be positive integers; labels are taken as specified...
      for $ColumnValue (@ColumnValues) {
        die "Error: Column value, $ColumnValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n"
          unless IsPositiveInteger($ColumnValue);
      }
    }
    push @{$OptionsInfo{SpecifiedColumns}}, @ColumnValues;
  }

  # Column pairs specified for analysis...
  $OptionsInfo{SpecifiedColumnPairs} = [];
  $OptionsInfo{AllColumnPairs} = 0;
  if (defined($Options{columnpairs}) && $Options{columnpairs} =~ /^AllPairs$/i) {
    $OptionsInfo{AllColumnPairs} = 1;
  }
  if (defined($Options{columnpairs}) && !$OptionsInfo{AllColumnPairs}) {
    my(@PairValues, $PairValue);

    @PairValues = split ",", $Options{columnpairs};
    die "Error: Invalid number of values specified using \"--columnpairs\" option: It must contain even number of values.\n"
      if (@PairValues % 2);

    if ($Options{colmode} =~ /^colnum$/i) {
      for $PairValue (@PairValues) {
        die "Error: Column value, $PairValue, specified using \"--columnpairs\" is not valid: Allowed integer values: > 0.\n"
          unless IsPositiveInteger($PairValue);
      }
    }
    push @{$OptionsInfo{SpecifiedColumnPairs}}, @PairValues;
  }

}
1036
# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes the global %Options hash with default values, lets GetOptions
# override them from the command line, changes into the working directory
# when one is specified, and dies with an error message on the first option
# value which fails validation.
#
sub SetupScriptUsage {
  my(@OptionSpecs);

  # Default values; any option absent from this list stays undefined unless specified...
  %Options = (
    colmode => "colnum",
    detail => 1,
    indelim => "comma",
    frequencybins => 10,
    klargest => 2,
    ksmallest => 2,
    mode => "DescriptiveStatisticsBasic",
    outdelim => "comma",
    precision => 2,
    quote => "yes",
    trimfraction => 0.1,
  );

  # Retrieve all the options...
  @OptionSpecs = ("colmode|c=s", "columns=s", "columnpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "indelim=s", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to working directory before anything else looks at files...
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values; the first invalid value ends the script...
  die "Error: The value specified, $Options{colmode}, for option \"-c --colmode\" is not valid. Allowed values: colnum or collabel\n"
    unless $Options{colmode} =~ /^(colnum|collabel)$/i;

  die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"
    unless IsPositiveInteger($Options{detail});

  die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n"
    unless $Options{indelim} =~ /^(comma|semicolon)$/i;

  die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"
    unless $Options{outdelim} =~ /^(comma|semicolon|tab)$/i;

  die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"
    unless $Options{quote} =~ /^(yes|no)$/i;

  die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n"
    unless IsPositiveInteger($Options{precision});

  die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n"
    unless IsPositiveInteger($Options{klargest});

  die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n"
    unless IsPositiveInteger($Options{ksmallest});

  # Trim fraction must be a float lying strictly between 0 and 1...
  if (!IsFloat($Options{trimfraction}) || $Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
    die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
  }
}
1096
1097 __END__
1098
1099 =head1 NAME
1100
1101 AnalyzeTextFilesData.pl - Analyze numerical column data in TextFile(s)
1102
1103 =head1 SYNOPSIS
1104
1105 AnalyzeTextFilesData.pl TextFile(s)...
1106
1107 AnalyzeTextFilesData.pl [B<-c, --colmode> colnum | collabel] [B<--columns> "colnum,[colnum,...]" | "collabel,[collabel,...]" | All]
1108 [B<--columnpairs> "colnum,colnum,[colnum,colnum]..." | "collabel,collabel,[collabel,collabel]..." | AllPairs]
1109 [B<-d, --detail> infolevel] [B<-f, --fast>] [B<--frequencybins> number | "number,number,[number,...]"] [B<-h, --help>]
1110 [B<--indelim> comma | semicolon] [B<--klargest> number] [B<--ksmallest> number]
1111 [B<-m, --mode> DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]"]
1112 [B<-o, --overwrite>] [B<--outdelim> comma | tab | semicolon] [B<-p, --precision> number]
1113 [B<-q, --quote> yes | no] [B<-r, --root> rootname] [B<--trimfraction> number] [B<-w, --workingdir> dirname] TextFiles(s)...
1114
1115 =head1 DESCRIPTION
1116
1117 Analyze numerical column data in I<TextFile(s)> using a combination of various statistical
1118 functions; Non-numerical values are simply ignored. For I<Correlation, RSquare, and Covariance>
1119 analysis, the count of valid values in specified column pair must be same; otherwise, column
1120 pair is ignored. The file names are separated by space. The valid file extensions are I<.csv>
1121 and I<.tsv> for comma/semicolon and tab delimited text files respectively. All other
1122 file names are ignored. All the text files in a current directory can be specified by
1123 I<*.csv>, I<*.tsv>, or the current directory name. The B<--indelim> option determines
1124 the format of I<TextFile(s)>. Any file which doesn't correspond to the format indicated
1125 by B<--indelim> option is ignored.
1126
1127 =head1 OPTIONS
1128
1129 =over 4
1130
1131 =item B<-c, --colmode> I<colnum | collabel>
1132
1133 Specify how columns are identified in TextFile(s): using column number or column
1134 label. Possible values: I<colnum or collabel>. Default value: I<colnum>.
1135
1136 =item B<--columns> I<"colnum,[colnum,...]" | "collabel,[collabel]..." | All>
1137
1138 This value is mode specific. It's a list of comma delimited columns to use
1139 for data analysis. Default value: I<First column>.
1140
1141 This value is ignored during I<Correlation/Pearson Correlation> and I<Covariance>
1142 data analysis; B<--columnpairs> option is used instead.
1143
1144 For I<colnum> value of B<-c, --colmode> option, input values format is:
1145 I<colnum,colnum,...>. Example:
1146
1147 1,3,5
1148
1149 For I<collabel> value of B<-c, --colmode> option, input values format is:
1150 I<collabel,collabel,..>. Example:
1151
1152 ALogP,MolWeight,EC50
1153
1154 =item B<--columnpairs> I<"colnum,colnum,[colnum,colnum,...]" | "collabel,collabel,[collabel,collabel,...]" | AllPairs>
1155
1156 This value is mode specific and is only used for I<Correlation, PearsonCorrelation, or
1157 Covariance> value of B<-m, --mode> option. It is a comma delimited list of column pairs
1158 to use for data analysis during I<Correlation> and I<Covariance> calculations. Default value:
1159 I<First column, Second column>.
1160
1161 For I<colnum> value of B<-c, --colmode> option, input values format is:
1162 I<colnum,colnum,[colnum,colnum]...>. Example:
1163
1164 1,3,5,6,1,6
1165
1166 For I<collabel> value of B<-c, --colmode> option, input values format is:
1167 I<collabel,collabel,[collabel,collabel]..>. Example:
1168
1169 MolWeight,EC50,NumN+O,PSA
1170
1171 For I<AllPairs> value of B<--columnpairs> option, all column pairs are used for I<Correlation>
1172 and I<Covariance> calculations.
1173
1174 =item B<-d, --detail> I<infolevel>
1175
1176 Level of information to print about column values being ignored. Default: I<1>. Possible values:
1177 1, 2, 3, or 4.
1178
1179 =item B<-f, --fast>
1180
1181 In this mode, all the columns specified for analysis are assumed to contain numerical
1182 data and no checking is performed before analysis. By default, only numerical data is
1183 used for analysis; other types of column data are ignored.
1184
1185 =item B<--frequencybins> I<number | "number,number,[number,...]">
1186
1187 Specify number of bins or bin range to use for frequency analysis. Default value: I<10>
1188
1189 Number of bins value along with the smallest and largest value for a column is used to
1190 group the column values into different groups.
1191
1192 The bin range list is used to group values for a column into different groups; It must contain
1193 values in ascending order. Examples:
1194
1195 10,20,30
1196 0.1,0.2,0.3,0.4,0.5
1197
1198 The frequency value calculated for a specific bin corresponds to all the column values
1199 which are greater than the previous bin value and less than or equal to the current bin value.
1200
1201 =item B<-h, --help>
1202
1203 Print this help message.
1204
1205 =item B<--indelim> I<comma | semicolon>
1206
1207 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
1208 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
1209 delimiter.
1210
1211 =item B<--klargest> I<number>
1212
1213 Kth largest value to find by I<KLargest> function. Default value: I<2> Valid values: positive
1214 integers.
1215
1216 =item B<--ksmallest> I<number>
1217
1218 Kth smallest value to find by I<KSmallest> function. Default value: I<2>. Valid values: positive
1219 integers.
1220
1221 =item B<-m, --mode> I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]">
1222
1223 Specify how to analyze data in TextFile(s): calculate basic or all descriptive statistics; or
1224 use a comma delimited list of supported statistical functions. Possible values:
1225 I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1,[function2]...">. Default
1226 value: I<DescriptiveStatisticsBasic>.
1227
1228 I<DescriptiveStatisticsBasic> includes these functions: I<Count, Maximum, Minimum, Mean,
1229 Median, Sum, StandardDeviation, StandardError, Variance>.
1230
1231 I<DescriptiveStatisticsAll>, in addition to I<DescriptiveStatisticsBasic> functions, includes:
1232 I<GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, Mode, RSquare,
1233 Skewness, TrimMean>.
1234
1235 I<All> uses complete list of supported functions: I<Average, AverageDeviation, Correlation,
1236 Count, Covariance, GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis,
1237 Maximum, Minimum, Mean, Median, Mode, RSquare, Skewness, Sum,
1238 SumOfSquares, StandardDeviation, StandardDeviationN, StandardError, StandardScores,
1239 StandardScoresN, TrimMean, Variance, VarianceN>. The function names ending with N
1240 calculate corresponding values assuming an entire population instead of a population sample.
1241
1242 Here are the formulas for these functions:
1243
1244 Average: See Mean
1245
1246 AverageDeviation: SUM( ABS(x[i] - Xmean) ) / n
1247
1248 Correlation: See Pearson Correlation
1249
1250 Covariance: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / n
1251
1252 GeometricMean: NthROOT( PRODUCT(x[i]) )
1253
1254 HarmonicMean: 1 / ( SUM(1/x[i]) / n )
1255
1256 Mean: SUM( x[i] ) / n
1257
1258 Median: Xsorted[(n - 1)/2 + 1] for odd values of n; (Xsorted[n/2] + Xsorted[n/2 + 1])/2
1259 for even values of n.
1260
1261 Kurtosis: [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] -
1262 {3((n - 1)^2)}/{(n - 2)(n-3)}
1263
1264 PearsonCorrelation: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )
1265 (SUM( (y[i] - Ymean)^2 )) )
1266
1267 RSquare: PearsonCorrelation^2
1268
1269 Skewness: {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 }
1270
1271 StandardDeviation: SQRT ( SUM( (x[i] - Mean)^2 ) / (n - 1) )
1272
1273 StandardDeviationN: SQRT ( SUM( (x[i] - Mean)^2 ) / n )
1274
1275 StandardError: StandardDeviation / SQRT( n )
1276
1277 StandardScore: (x[i] - Mean) / (n - 1)
1278
1279 StandardScoreN: (x[i] - Mean) / n
1280
1281 Variance: SUM( (x[i] - Xmean)^2 / (n - 1) )
1282
1283 VarianceN: SUM( (x[i] - Xmean)^2 / n )
1284
1285 =item B<-o, --overwrite>
1286
1287 Overwrite existing files.
1288
1289 =item B<--outdelim> I<comma | tab | semicolon>
1290
1291 Output text file delimiter. Possible values: I<comma, tab, or semicolon>
1292 Default value: I<comma>.
1293
1294 =item B<-p, --precision> I<number>
1295
1296 Precision of calculated values in the output file. Default: up to I<2> decimal places.
1297 Valid values: positive integers.
1298
1299 =item B<-q, --quote> I<yes | no>
1300
1301 Put quotes around column values in output text file. Possible values: I<yes or
1302 no>. Default value: I<yes>.
1303
1304 =item B<-r, --root> I<rootname>
1305
1306 New text file name is generated using the root: <Root><Mode>.<Ext>. Default new file
1307 name: <InitialTextFileName><Mode>.<Ext>. Based on the specified analysis,
1308 <Mode> corresponds to one of these values: DescriptiveStatisticsBasic,
1309 DescriptiveStatisticsAll, AllStatistics, SpecifiedStatistics, Covariance, Correlation,
1310 Frequency, or StandardScores. The csv, and tsv <Ext> values are used for
1311 comma/semicolon, and tab delimited text files respectively. This option is ignored for
1312 multiple input files.
1313
1314 =item B<--trimfraction> I<number>
1315
1316 Fraction of data to exclude from the top and bottom of the data set during
1317 I<TrimMean> calculation. Default value: I<0.1>. Valid values: > 0 and < 1.
1318
1319 =item B<-w, --workingdir> I<dirname>
1320
1321 Location of working directory. Default: current directory.
1322
1323 =back
1324
1325 =head1 EXAMPLES
1326
1327 To calculate basic statistics for data in first column and generate a
1328 NewSample1DescriptiveStatisticsBasic.csv file, type:
1329
1330 % AnalyzeTextFilesData.pl -o -r NewSample1 Sample1.csv
1331
1332 To calculate basic statistics for data in third column and generate a
1333 NewSample1DescriptiveStatisticsBasic.csv file, type:
1334
1335 % AnalyzeTextFilesData.pl --columns 3 -o -r NewSample1 Sample1.csv
1336
1337 To calculate basic statistics for data in MolWeight column and generate a
1338 NewSample1DescriptiveStatisticsBasic.csv file, type:
1339
1340 % AnalyzeTextFilesData.pl -colmode collabel --columns MolWeight -o
1341 -r NewSample1 Sample1.csv
1342
1343 To calculate all available statistics for data in third column and all column pairs,
1344 and generate NewSample1DescriptiveStatisticsAll.csv, NewSample1CorrelationMatrix.csv,
1345 NewSample1CorrelationMatrix.csv, and NewSample1MolWeightFrequencyAnalysis.csv files,
1346 type:
1347
1348 % AnalyzeTextFilesData.pl -m DescriptiveStatisticsAll --columns 3 -o
1349 --columnpairs AllPairs -r NewSample1 Sample1.csv
1350
1351 To compute frequency distribution of data in third column into five bins and
1352 generate NewSample1MolWeightFrequencyAnalysis.csv, type:
1353
1354 % AnalyzeTextFilesData.pl -m Frequency --frequencybins 5 --columns 3
1355 -o -r NewSample1 Sample1.csv
1356
1357 To compute frequency distribution of data in third column into specified bin range
1358 values, and generate NewSample1MolWeightFrequencyAnalysis.csv, type:
1359
1360 % AnalyzeTextFilesData.pl -m Frequency --frequencybins "100,200,400"
1361 --columns 3 -o -r NewSample1 Sample1.csv
1362
1363 To calculate all available statistics for data in all columns and column pairs, type:
1364
1365 % AnalyzeTextFilesData.pl -m All --columns All --columnpairs
1366 AllPairs -o -r NewSample1 Sample1.csv
1367
1368 =head1 AUTHOR
1369
1370 Manish Sud <msud@san.rr.com>
1371
1372 =head1 SEE ALSO
1373
1374 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl
1375
1376 =head1 COPYRIGHT
1377
1378 Copyright (C) 2015 Manish Sud. All rights reserved.
1379
1380 This file is part of MayaChemTools.
1381
1382 MayaChemTools is free software; you can redistribute it and/or modify it under
1383 the terms of the GNU Lesser General Public License as published by the Free
1384 Software Foundation; either version 3 of the License, or (at your option)
1385 any later version.
1386
1387 =cut