annotate mayachemtools/bin/AnalyzeSDFilesData.pl @ 9:ab29fa5c8c1f draft default tip

Uploaded
author deepakjadmin
date Thu, 15 Dec 2016 14:18:03 -0500
parents 73ae111cf86f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1 #!/usr/bin/perl -w
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
2 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
3 # $RCSfile: AnalyzeSDFilesData.pl,v $
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
4 # $Date: 2015/02/28 20:46:04 $
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
5 # $Revision: 1.27 $
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
6 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
7 # Author: Manish Sud <msud@san.rr.com>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
8 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
10 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
11 # This file is part of MayaChemTools.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
12 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
14 # the terms of the GNU Lesser General Public License as published by the Free
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
15 # Software Foundation; either version 3 of the License, or (at your option) any
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
16 # later version.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
17 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
18 # MayaChemTools is distributed in the hope that it will be useful, but without
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
19 # any warranty; without even the implied warranty of merchantability of fitness
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
20 # for a particular purpose. See the GNU Lesser General Public License for more
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
21 # details.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
22 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
23 # You should have received a copy of the GNU Lesser General Public License
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
26 # Boston, MA, 02111-1307, USA.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
27 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
28
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
29 use strict;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
30 use FindBin; use lib "$FindBin::Bin/../lib";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
31 use Getopt::Long;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
32 use File::Basename;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
33 use Text::ParseWords;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
34 use Benchmark;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
35 use FileUtil;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
36 use SDFileUtil;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
37 use TextUtil;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
38 use StatisticsUtil;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
39
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Driver script: parse options, validate the input SD files and analyze each
# usable file in turn, reporting the elapsed time at the end.
#
# The lexicals declared at file scope here (@SDFilesList, %OptionsInfo,
# %SDFilesInfo, ...) are read by the subroutines defined later in this file.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of indirect object syntax ("new Benchmark"),
# which is parsed heuristically and can be misread by perl.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sd sdf");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();
ProcessSDFilesDataLabelsInfo();

# Generate output files...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Only files flagged as okay in %SDFilesInfo are analyzed; others are skipped silently here.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    AnalyzeSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
85
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
86 ###############################################################################
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
87
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Analyze data for one SD file...
#
# Reads every compound record of the SD file at position $Index, collects the
# values of the data labels listed in $SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]
# and passes the collected values to the analysis routines selected through
# %OptionsInfo.
#
# Parameters:
#   $Index - position of the file in @SDFilesList and in the per-file arrays
#            of %SDFilesInfo.
#
# Dies when the SD file cannot be opened.
sub AnalyzeSDFile {
  my($Index) = @_;
  my($SDFile, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);

  $SDFile = $SDFilesList[$Index];
  @DataLabelsToAnalyze = @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]};

  # Start every label with an empty value list so that labels never seen in
  # the file still have an entry in the map.
  %DataFieldValuesToAnalyzeMap = ();
  for my $DataLabel (@DataLabelsToAnalyze) {
    @{$DataFieldValuesToAnalyzeMap{$DataLabel}} = ();
  }

  # Collect appropriate data field label values for analysis...
  my($CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels);

  # Three-argument open with a lexical handle: the file name comes from the
  # command line, and the two-argument form would interpret characters such
  # as a leading ">" or a trailing "|" in it as an open mode or a pipe.
  open my $SDFileHandle, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  $CmpdCount = 0;
  $InvalidCmpdCount = 0;
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @InvalidCmpdDataLabels = ();
    DATALABEL: for my $DataLabel (@DataLabelsToAnalyze) {
      # A label absent from this record contributes no value and is not
      # counted as invalid.
      if (exists $DataFieldValues{$DataLabel}) {
        $DataValue = $DataFieldValues{$DataLabel};
        if ($OptionsInfo{CheckData}) {
          if (!IsNumerical($DataValue)) {
            push @InvalidCmpdDataLabels, $DataLabel;
            next DATALABEL;
          }
        }
        push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
      }
    }
    if (@InvalidCmpdDataLabels) {
      $InvalidCmpdCount++;
      # The higher the detail level, the more of the offending record is shown.
      if ($OptionsInfo{DetailLevel} >=4 ) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed:\n$CmpdString \n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 3) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed...\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field to be analyzed...\n";
      }
    }
  }
  if ($InvalidCmpdCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
  }
  close $SDFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames);
  @SpecifiedFunctionNames = ();

  # Functions handled by the dedicated routines further down are excluded
  # here; everything else goes to PerformAnalysis under its mapped name.
  for my $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
    if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
      PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
    PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
    PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }

}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
170
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Calculate values for various statistical functions...
#
# Writes one delimited text file for the SD file at position $Index: a header
# row ("DataLabel" followed by one column per function) and then one row per
# data label in $SDFilesInfo{DataLabelsToAnalyze}[$Index] holding the
# calculated values. A label without any collected values gets empty cells.
#
# Parameters:
#   $Index                     - position of the file in the per-file arrays
#                                of %SDFilesInfo.
#   $SpecifiedFunctionNamesRef - reference to an array of function names; all
#                                except Count are invoked by name.
#   $DataValuesToAnalyzeMapRef - reference to a hash mapping a data label to
#                                an array of its collected values.
#
# Dies when the output file cannot be opened or closed.
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $Line, $Label, @ColLabels, @DataLabelsToAnalyze);

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $SDFilesInfo{NewTextFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  # Three-argument open with a lexical handle keeps characters in the file
  # name from being interpreted as part of the open mode.
  open my $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "DataLabel";
  for my $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      # Prefix the suffixed rank and drop every "K" from the resulting label.
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print {$NewTextFileHandle} "$Line\n";

  # Go over each column to be analyzed...
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};

  # Statistical functions are invoked through their name held in a string,
  # which needs symbolic references; only that stricture is relaxed so that
  # variable and bareword checks stay active.
  no strict 'refs';

  my($DataValuesRef, $Value, @RowValues, %CalculatedValues);
  %CalculatedValues = ();
  for my $DataLabel (@DataLabelsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $DataLabel;
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    FUNCTIONNAME: for my $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!@{$DataValuesRef}) {
        # Invalid column values...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = @{$DataValuesRef};
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KLargest});
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KSmallest});
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        # The standard deviation is cached per label because StandardError
        # needs it as well.
        if (exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
        }
        else {
          $Value = &$SpecifiedFunction($DataValuesRef);
          $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
        }
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $Value = StandardDeviation($DataValuesRef);
          $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
        }
        if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
          # Pass the number of values, not the values themselves: an array in
          # an argument list is flattened, which would hand the first data
          # value over in place of the sample size.
          # NOTE(review): assumes StatisticsUtil::StandardError takes
          # (standard deviation, count) - confirm against StatisticsUtil.pm.
          $Value = &$SpecifiedFunction($CalculatedValues{$DataLabel}{StandardDeviation}, scalar(@{$DataValuesRef}));
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{TrimFraction});
      }
      else {
        $Value = &$SpecifiedFunction($DataValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print {$NewTextFileHandle} "$Line\n";
  }
  # Buffered write errors only surface when the handle is closed.
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
263
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
264 # Calculate covariance, correlation, rsquare for specified data field label pairs....
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
265 sub PerformDataLabelPairAnalysis {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
266 my($Index, $DataValuesToAnalyzeMapRef) = @_;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
267 my($NewTextFile, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
268
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
269 $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
270 $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
271 $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
272
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
273 $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "DataFieldPairsAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
274 print "Generating new text file $NewTextFile...\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
275 open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
276
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
277 # Write out the column labels...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
278 @ColLabels = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
279 push @ColLabels, ("DataLabel1", "DataLabel2");
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
280 if ($CalculateCorrelation || $CalculateRSquare) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
281 push @ColLabels, "Correlation";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
282 if ($CalculateRSquare) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
283 push @ColLabels, "RSquare";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
284 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
285 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
286 if ($CalculateCovariance) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
287 push @ColLabels, "Covariance";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
288 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
289 $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
290 print NEWTEXTFILE "$Line\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
291
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
292 # Go over each data field pair...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
293 my($CorrelationValue, $RSquareValue, $CovarianceValue, $LabelIndex, $DataLabel1, $DataLabel2, $DataValues1, $DataValues2, @DataLabelPairs1ToAnalyze, @DataLabelPairs2ToAnalyze, @RowValues, $Value);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
294
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
295 @DataLabelPairs1ToAnalyze = @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
296 @DataLabelPairs2ToAnalyze = @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
297 for $LabelIndex (0 .. $#DataLabelPairs1ToAnalyze) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
298 @RowValues = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
299 $DataLabel1 = $DataLabelPairs1ToAnalyze[$LabelIndex];
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
300 $DataLabel2 = $DataLabelPairs2ToAnalyze[$LabelIndex];
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
301 $DataValues1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
302 $DataValues2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
303
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
304 # Setup column ids...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
305 push @RowValues, $DataLabel1;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
306 push @RowValues, $DataLabel2;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
307
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
308 if (@$DataValues1 != @$DataValues2) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
309 # Print a warning...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
310 warn "Warning: Skipping analysis for data field pair $DataLabel1, $DataLabel2: Number of valid data values must be same.\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
311 if ($CalculateCorrelation || $CalculateRSquare) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
312 push @RowValues, "";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
313 if ($CalculateRSquare) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
314 push @RowValues, "";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
315 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
316 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
317 if ($CalculateCovariance) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
318 push @RowValues, "";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
319 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
320 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
321 else {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
322 # Calculate appropriate value...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
323 if ($CalculateCorrelation || $CalculateRSquare) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
324 $CorrelationValue = Correlation($DataValues1, $DataValues2);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
325 $Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
326 push @RowValues, $Value;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
327 if ($CalculateRSquare) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
328 $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
329 $Value = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
330 push @RowValues, $Value;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
331 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
332 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
333 if ($CalculateCovariance) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
334 $CovarianceValue = Covariance($DataValues1, $DataValues2);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
335 $Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
336 push @RowValues, $Value;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
337 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
338 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
339 $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
340 print NEWTEXTFILE "$Line\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
341 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
342 close NEWTEXTFILE;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
343 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
344
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Generate histogram numbers...
#
# For each data label to analyze in the SD file at position $Index, write out a
# two column (Bins, Frequency) text file named:
#
#   <NewTextFileRoot><DataLabel>FrequencyAnalysis.<NewTextFileExt>
#
# Arguments:
#   $Index - Position of the SD file in the file level %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping each data label to
#     a reference to an array containing the values to analyze.
#
# Bins are taken from $OptionsInfo{BinRange} when it's not empty; otherwise,
# $OptionsInfo{NumOfBins} is passed to Frequency. A data label without any
# values still gets a text file containing only the column labels line.
#
sub PerformFrequencyAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $Line, $DataLabel, $DataValuesRef, $BinValue, $FrequencyValue, $Value, @ColLabels, @RowValues, %FrequencyMap);

  for $DataLabel (@{$SDFilesInfo{DataLabelsToAnalyze}[$Index]}) {
    $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $DataLabel . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
    print "Generating new text file $NewTextFile...\n";

    # Use three argument open: the file name contains the data label and must
    # never be interpreted as part of the open mode...
    open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$Line\n";

    # Calculate frequency values; nothing to calculate without any data values...
    %FrequencyMap = ();
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (@$DataValuesRef) {
      if (@{$OptionsInfo{BinRange}}) {
        %FrequencyMap = Frequency($DataValuesRef, \@{$OptionsInfo{BinRange}});
      }
      else {
        %FrequencyMap = Frequency($DataValuesRef, $OptionsInfo{NumOfBins});
      }
    }

    # Write out one line per bin in ascending numerical order of bin values...
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      @RowValues = ();
      # Adding 0 to the formatted string drops any trailing zeros...
      $Value = (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
      push @RowValues, $Value;
      $Value = (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";
      push @RowValues, $Value;

      $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print NEWTEXTFILE "$Line\n";
    }
    close NEWTEXTFILE;
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
388
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Calculate covariance, correlation/rsquare matrices....
#
# Arguments:
#   $Index - Position of the SD file in the file level %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping each data label to
#     a reference to an array containing the values to analyze.
#
# Which matrices get generated is controlled by the correlation, rsquare and
# covariance keys of $OptionsInfo{SpecifiedStatisticalFunctionsMap}. Each matrix
# is written to its own text file:
#
#   <NewTextFileRoot>CorrelationMatrix.<NewTextFileExt>
#   <NewTextFileRoot>RSquareMatrix.<NewTextFileExt>
#   <NewTextFileRoot>CovarianceMatrix.<NewTextFileExt>
#
# A request for rsquare also generates the correlation matrix file.
#
sub PerformMatrixAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance, $GenerateCorrelation);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  # RSquare values are derived from correlation values; so correlation matrix
  # is generated for rsquare as well...
  $GenerateCorrelation = ($CalculateCorrelation || $CalculateRSquare) ? 1 : 0;

  $CorrelationTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CorrelationMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $RSquareTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "RSquareMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $CovarianceTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CovarianceMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];

  # List the names of the text files which are actually going to be generated...
  my($TextFilesList, @TextFiles);
  @TextFiles = ();
  if ($GenerateCorrelation) {
    push @TextFiles, $CorrelationTextFile;
    if ($CalculateRSquare) {
      push @TextFiles, $RSquareTextFile;
    }
  }
  if ($CalculateCovariance) {
    push @TextFiles, $CovarianceTextFile;
  }
  $TextFilesList = join(", ", @TextFiles);
  if (@TextFiles > 1) {
    print "Generating new text files $TextFilesList...\n"
  }
  else {
    print "Generating new text file $TextFilesList...\n"
  }

  # Use three argument open to make sure file names are never interpreted as
  # part of the open mode...
  if ($GenerateCorrelation) {
    open CORRELATIONTEXTFILE, ">", $CorrelationTextFile or die "Error: Can't open $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      open RSQUARETEXTFILE, ">", $RSquareTextFile or die "Error: Can't open $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    open COVARIANCETEXTFILE, ">", $CovarianceTextFile or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $RawCorrelationValue, $CorrelationValue, $RSquareValue, $CovarianceValue, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);

  # Write out the column labels...
  #
  # NOTE(review): The column labels line always lists all data labels and has
  # no leading entry for the row label column, whereas the rows written below
  # start with a row label and may cover only the common data labels. It's
  # left unchanged here; confirm the intended layout before changing it.
  #
  @ColLabels = ();
  push @ColLabels, @{$SDFilesInfo{AllDataLabels}[$Index]};
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  if ($GenerateCorrelation) {
    print CORRELATIONTEXTFILE "$Line\n";
    if ($CalculateRSquare) {
      print RSQUARETEXTFILE "$Line\n";
    }
  }
  if ($CalculateCovariance) {
    print COVARIANCETEXTFILE "$Line\n";
  }

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2, @DataLabelsToAnalyze);

  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
  @DataLabelsToAnalyze = $OptionsInfo{AllDataLabelPairs} ? @{$SDFilesInfo{AllDataLabels}[$Index]} : @{$SDFilesInfo{CommonDataLabels}[$Index]};

  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
    $DataValuesRef1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};

    for $LabelIndex2 (0 .. $LabelIndex1) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      $DataValuesRef2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};

      if ($GenerateCorrelation) {
        $RawCorrelationValue = Correlation($DataValuesRef1, $DataValuesRef2);
        if (!(defined($RawCorrelationValue) && length($RawCorrelationValue))) {
          $RawCorrelationValue = "";
        }

        # Adding 0 to the formatted string drops any trailing zeros...
        $CorrelationValue = length($RawCorrelationValue) ? (sprintf("%.$OptionsInfo{Precision}f", $RawCorrelationValue) + 0) : "";
        $CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $CorrelationValue;
        $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $CorrelationValue;

        if ($CalculateRSquare) {
          # Square the unrounded correlation value, as it's done during the
          # analysis of data label pairs, to avoid compounding rounding errors...
          $RSquareValue = length($RawCorrelationValue) ? (sprintf("%.$OptionsInfo{Precision}f", $RawCorrelationValue ** 2) + 0) : "";
          $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $RSquareValue;
          $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $RSquareValue;
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValuesRef1, $DataValuesRef2);
        $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
        $CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $CovarianceValue;
        $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $CovarianceValue;
      }
    }
  }

  # Write out the matrices: each row starts with its data label...
  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];

    @CorrelationRowValues = ($DataLabel1);
    @RSquareRowValues = ($DataLabel1);
    @CovarianceRowValues = ($DataLabel1);

    for $LabelIndex2 (0 .. $#DataLabelsToAnalyze) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      if ($GenerateCorrelation) {
        push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
        if ($CalculateRSquare) {
          push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
        }
      }
      if ($CalculateCovariance) {
        push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
      }
    }

    if ($GenerateCorrelation) {
      $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print CORRELATIONTEXTFILE "$Line\n";
      if ($CalculateRSquare) {
        $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print RSQUARETEXTFILE "$Line\n";
      }
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print COVARIANCETEXTFILE "$Line\n";
    }
  }

  if ($GenerateCorrelation) {
    close CORRELATIONTEXTFILE;
    if ($CalculateRSquare) {
      close RSQUARETEXTFILE;
    }
  }
  if ($CalculateCovariance) {
    close COVARIANCETEXTFILE;
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
537
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Calculate standard scores...
#
# Arguments:
#   $Index - Position of the SD file in the file level %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping each data label to
#     a reference to an array containing the values to analyze.
#
# The SD file is read again and one line per compound is written out to
# <NewTextFileRoot>StandardScores.<NewTextFileExt>. For each data label to
# analyze, the line contains a StandardScores and/or a StandardScoresN value
# depending on the standardscores and standardscoresn keys of
# $OptionsInfo{SpecifiedStatisticalFunctionsMap}.
#
# An empty value is written out whenever a score can't be calculated: the data
# value fails the numerical check performed during $OptionsInfo{CheckData}, or
# the mean or deviation is undefined, or the deviation is zero.
#
sub PerformStandardScoresAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $NewLine);

  $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "StandardScores." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  # Use three argument open to make sure the file name is never interpreted as
  # part of the open mode...
  open NEWTEXTFILE, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  my($DataLabel, @DataLabelsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    if ($StandardScores) {
      push @ColLabels, "${DataLabel}\(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${DataLabel}\(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values; a data label specified multiple times is processed
  # only once...
  my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (!exists($MeanMap{$DataLabel})) {
      $MeanMap{$DataLabel} = Mean($DataValuesRef);
    }
    if ($StandardScores && !exists($StandardDeviationMap{$DataLabel})) {
      $StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
    }
    if ($StandardScoresN && !exists($StandardDeviationNMap{$DataLabel})) {
      $StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
    }
  }

  #
  # Go over each data field and calculate standard scores for each column
  # using (x[i] - mean) / deviation, where deviation comes from
  # StandardDeviation for StandardScores and from StandardDeviationN for
  # StandardScoresN; write out the calculated values as well...
  #
  my($SDFile, $Value, $ValueOkay, $ScoreValue, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
  $SDFile = $SDFilesList[$Index];

  open SDFILE, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @RowValues = ();
    for $DataLabel (@DataLabelsToAnalyze) {
      # A data field missing from the compound is treated as an empty value...
      $Value = exists($DataFieldValues{$DataLabel}) ? $DataFieldValues{$DataLabel} : "";
      $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;

      if ($StandardScores) {
        $ScoreValue = $ValueOkay ? _CalculateStandardScore($Value, $MeanMap{$DataLabel}, $StandardDeviationMap{$DataLabel}, $OptionsInfo{Precision}) : "";
        push @RowValues, $ScoreValue;
      }
      if ($StandardScoresN) {
        $ScoreValue = $ValueOkay ? _CalculateStandardScore($Value, $MeanMap{$DataLabel}, $StandardDeviationNMap{$DataLabel}, $OptionsInfo{Precision}) : "";
        push @RowValues, $ScoreValue;
      }
    }
    $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print NEWTEXTFILE "$NewLine\n";
  }
  close SDFILE;
  close NEWTEXTFILE;

}

# Calculate (Value - Mean) / Deviation and format it using specified precision...
#
# An empty string is returned, instead of dying on a division by zero, when
# the mean or deviation is undefined or the deviation is zero; for example,
# deviation is zero when all the values of a data field are identical.
#
sub _CalculateStandardScore {
  my($Value, $Mean, $Deviation, $Precision) = @_;

  if (!(defined($Mean) && defined($Deviation) && $Deviation != 0)) {
    return "";
  }
  # Adding 0 to the formatted string drops any trailing zeros...
  return sprintf("%.${Precision}f", ($Value - $Mean)/$Deviation) + 0;
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
624
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Work out, for each usable SD file, which data field labels and which data
# field label pairs are going to be analyzed...
#
# Inputs (file level variables):
#   @SDFilesList  - SD file names; indices line up with the per file lists below.
#   %SDFilesInfo  - FileOkay, AllDataLabels, AllDataLabelsMap, CommonDataLabels,
#                   NewTextFileRoot and NewTextFileExt are read for each file.
#   %OptionsInfo  - SpecifiedDataLabels, DataFields, SpecifiedDataLabelPairs,
#                   AllDataLabelPairs, Overwrite and SpecifiedStatisticalFunctionsMap.
#
# Outputs (per file lists stored in %SDFilesInfo):
#   DataLabelsToAnalyze       - labels used for single data field statistics.
#   DataLabelPairs1ToAnalyze  - first label of each label pair.
#   DataLabelPairs2ToAnalyze  - second label of each label pair.
#   UniqueDataLabelsToAnalyze - sorted list of every distinct label above.
#
# A file is dropped from further processing, by resetting its FileOkay flag
# after a warning, when no data field labels are left to analyze or when a
# frequency analysis output file already exists and overwriting is off.
#
sub ProcessSDFilesDataLabelsInfo {
  my($SDFile, $DataLabel, $LabelsKey, @DataLabelsToAnalyze, %UniqueDataLabelsToAnalyzeMap);

  @{$SDFilesInfo{DataLabelsToAnalyze}} = ();
  @{$SDFilesInfo{DataLabelPairs1ToAnalyze}} = ();
  @{$SDFilesInfo{DataLabelPairs2ToAnalyze}} = ();
  @{$SDFilesInfo{UniqueDataLabelsToAnalyze}} = ();

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Every file gets empty lists, even the ones which are skipped...
    @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]} = ();

    %UniqueDataLabelsToAnalyzeMap = ();

    if (!$SDFilesInfo{FileOkay}[$Index]) {
      next FILELIST;
    }

    # Collect data field labels: explicitly specified ones which are present in
    # the file, all labels, or only the labels common to all compounds...
    @DataLabelsToAnalyze = ();
    if (@{$OptionsInfo{SpecifiedDataLabels}}) {
      @DataLabelsToAnalyze = grep { exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$_}) } @{$OptionsInfo{SpecifiedDataLabels}};
    }
    elsif (defined($OptionsInfo{DataFields}) && $OptionsInfo{DataFields} =~ /^All$/i) {
      @DataLabelsToAnalyze = @{$SDFilesInfo{AllDataLabels}[$Index]};
    }
    else {
      @DataLabelsToAnalyze = @{$SDFilesInfo{CommonDataLabels}[$Index]};
    }

    if (!@DataLabelsToAnalyze) {
      if (@{$OptionsInfo{SpecifiedDataLabels}}) {
        warn "Warning: Ignoring file $SDFile: None of the data field labels specified, @{$OptionsInfo{SpecifiedDataLabels}}, using \"--datafields\" option exist.\n";
      }
      else {
        # No labels were specified by the user: the file itself has nothing to offer...
        warn "Warning: Ignoring file $SDFile: It doesn't contain any data field labels to analyze.\n";
      }
      $SDFilesInfo{FileOkay}[$Index] = 0;
      next FILELIST;
    }

    push @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]}, @DataLabelsToAnalyze;

    # Set up unique data field label map as well...
    @UniqueDataLabelsToAnalyzeMap{@DataLabelsToAnalyze} = @DataLabelsToAnalyze;

    if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
      # Make sure specific frequency files don't exist...
      for $DataLabel (@DataLabelsToAnalyze) {
        my $FrequencyFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel} . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
        if (-e $FrequencyFile) {
          warn "Warning: Ignoring file $SDFile: The file $FrequencyFile already exists.\n";
          $SDFilesInfo{FileOkay}[$Index] = 0;
          next FILELIST;
        }
      }
    }

    # Setup specified data field label pairs...
    if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) {
      my($PairIndex, $DataLabel1, $DataLabel2, @DataLabelPairsToAnalyze);

      # Flat list of labels: two consecutive entries make up one pair. Labels are
      # always added two at a time, so the list is guaranteed to have even size...
      @DataLabelPairsToAnalyze = ();
      if (@{$OptionsInfo{SpecifiedDataLabelPairs}}) {
        # Keep only those pairs for which both data field labels exist; a
        # dangling last label without a partner is ignored...
        for ($PairIndex = 0; (($PairIndex + 1) < @{$OptionsInfo{SpecifiedDataLabelPairs}}); $PairIndex += 2) {
          ($DataLabel1, $DataLabel2) = @{$OptionsInfo{SpecifiedDataLabelPairs}}[$PairIndex, $PairIndex + 1];
          if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel1}) && exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel2})) {
            push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
          }
        }
      }
      else {
        # Pair up every label with every label, including itself, using either
        # all labels or just the common ones...
        $LabelsKey = $OptionsInfo{AllDataLabelPairs} ? "AllDataLabels" : "CommonDataLabels";
        for $DataLabel1 (@{$SDFilesInfo{$LabelsKey}[$Index]}) {
          for $DataLabel2 (@{$SDFilesInfo{$LabelsKey}[$Index]}) {
            push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
          }
        }
      }

      # Split flat list into first and second member lists...
      for ($PairIndex = 0; $PairIndex < @DataLabelPairsToAnalyze; $PairIndex += 2) {
        push @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$PairIndex];
        push @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$PairIndex + 1];
      }

      # Set up unique data field label map as well...
      @UniqueDataLabelsToAnalyzeMap{@DataLabelPairsToAnalyze} = @DataLabelPairsToAnalyze;
    }

    # Setup unique data field label array...
    push @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]}, (sort keys %UniqueDataLabelsToAnalyzeMap);
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
738
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Retrieve information about input SD files...
#
# For each file in @SDFilesList, the following per file entries are set up in
# %SDFilesInfo, which is reset first:
#
#   FileOkay         - 1 when the file is going to be processed; otherwise, 0.
#   CmpdCount        - compound count returned by GetAllAndCommonCmpdDataHeaderLabels.
#   NewTextFileRoot  - root used to name the output text files.
#   NewTextFileExt   - extension of the output text files: csv or tsv.
#   AllDataLabels    - all data field labels returned for the file.
#   AllDataLabelsMap - hash mapping each of those labels to itself.
#   CommonDataLabels - common data field labels returned for the file.
#
# A file is ignored, with a warning and FileOkay left at 0, when it doesn't
# exist, is not a SD file, can't be opened or, unless overwriting is on, when
# any of the output files to be generated for it already exists.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $FileName, $OutFile, $OutFileRoot, $OutFileExt);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileRoot}} = ();
  @{$SDFilesInfo{NewTextFileExt}} = ();

  @{$SDFilesInfo{AllDataLabels}} = ();
  @{$SDFilesInfo{AllDataLabelsMap}} = ();
  @{$SDFilesInfo{CommonDataLabels}} = ();

  # These two keys are not used by this function: they are retained for
  # backward compatibility with earlier initialization...
  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{AllDataFieldLabelsMap}} = ();

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until all the checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "";
    $SDFilesInfo{NewTextFileExt}[$Index] = "";

    @{$SDFilesInfo{AllDataLabels}[$Index]} = ();
    %{$SDFilesInfo{AllDataLabelsMap}[$Index]} = ();
    @{$SDFilesInfo{CommonDataLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new text files...
    $FileName = (ParseFileName($SDFile))[1];
    $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    if ($Options{root} && (@SDFilesList == 1)) {
      # Root option is only honored for a single input file; any extension
      # specified as part of it is dropped...
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $FileName = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    $OutFileRoot = $FileName;
    $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";

    if (!$OptionsInfo{Overwrite}) {
      my($ExistingFile, $FunctionsMapRef, @AdditionalOutFiles);

      if (-e $OutFile) {
        warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
        next FILELIST;
      }

      # Collect names of additional output files, in the order they are checked.
      # Output files are text files and use the output file extension, not the
      # extension of the input SD file...
      $FunctionsMapRef = $OptionsInfo{SpecifiedStatisticalFunctionsMap};
      @AdditionalOutFiles = ();
      if (exists($FunctionsMapRef->{covariance}) || exists($FunctionsMapRef->{correlation}) || exists($FunctionsMapRef->{rsquare})) {
        if ($OptionsInfo{AllDataLabelPairs}) {
          if (exists($FunctionsMapRef->{covariance})) {
            push @AdditionalOutFiles, "${OutFileRoot}CovarianceMatrix.${OutFileExt}";
          }
          if (exists($FunctionsMapRef->{correlation})) {
            push @AdditionalOutFiles, "${OutFileRoot}CorrelationMatrix.${OutFileExt}";
          }
          if (exists($FunctionsMapRef->{rsquare})) {
            push @AdditionalOutFiles, "${OutFileRoot}RSquareMatrix.${OutFileExt}";
          }
        }
        else {
          push @AdditionalOutFiles, "${OutFileRoot}ColumnPairsAnalysis.${OutFileExt}";
        }
      }
      if (exists($FunctionsMapRef->{standardscores})) {
        push @AdditionalOutFiles, "${OutFileRoot}StandardScores.${OutFileExt}";
      }

      for $ExistingFile (@AdditionalOutFiles) {
        if (-e $ExistingFile) {
          warn "Warning: Ignoring file $SDFile: The file $ExistingFile already exists.\n";
          next FILELIST;
        }
      }
    }

    # Three argument open: the file name is never interpreted as a mode or a command...
    if (!open SDFILE, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CmpdCount, $Label, $DataFieldLabelsRef, $CommonDataFieldLabelsRef);
    ($CmpdCount, $DataFieldLabelsRef, $CommonDataFieldLabelsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
    close SDFILE;

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "$OutFileRoot";
    $SDFilesInfo{NewTextFileExt}[$Index] = "$OutFileExt";

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;

    # Store copies of the returned label lists...
    push @{$SDFilesInfo{AllDataLabels}[$Index]}, @{$DataFieldLabelsRef};
    push @{$SDFilesInfo{CommonDataLabels}[$Index]}, @{$CommonDataFieldLabelsRef};
    for $Label (@{$DataFieldLabelsRef}) {
      $SDFilesInfo{AllDataLabelsMap}[$Index]{$Label} = $Label;
    }
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
857
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
858 # Process option values...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
859 sub ProcessOptions {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
860 %OptionsInfo = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
861
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
862 $OptionsInfo{Mode} = $Options{mode};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
863
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
864 $OptionsInfo{DataFields} = defined $Options{datafields} ? $Options{datafields} : undef;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
865
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
866 $OptionsInfo{DetailLevel} = $Options{detail};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
867
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
868 # Setup supported statistical functions...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
869 my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
870
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
871 %SupportedStatisticaFunctionsMap = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
872 @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
873
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
874 for $SupportedFunction (@SupportedStatisticaFunctions) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
875 $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
876 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
877
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
878 # Setup a list of functions to use for analysis...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
879 my($SpecifiedFunction);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
880
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
881 %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
882 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
883
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
884 # Check mode values...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
885 if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
886 $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
887 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
888 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
889 elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
890 $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
891 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
892 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
893 elsif ($Options{mode} =~ /^All$/i ) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
894 $OptionsInfo{FileNameMode} = "AllStatistics";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
895 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @SupportedStatisticaFunctions;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
896 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
897 else {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
898 $OptionsInfo{FileNameMode} = "SpecifiedStatistics";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
899
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
900 # Comma delimited list of functions...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
901 my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
902
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
903 $Mode = $Options{mode};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
904 $Mode =~ s/ //g;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
905 @SpecifiedFunctions = split ",", $Mode;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
906 @UnsupportedSpecifiedFunctions = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
907 for $SpecifiedFunction (@SpecifiedFunctions) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
908 if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
909 push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $SpecifiedFunction;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
910 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
911 else {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
912 push @UnsupportedSpecifiedFunctions, $SpecifiedFunction;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
913 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
914 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
915 if (@UnsupportedSpecifiedFunctions) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
916 if (@UnsupportedSpecifiedFunctions > 1) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
917 warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
918 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
919 else {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
920 warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
921 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
922 die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
923 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
924 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
925
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
926 FUNCTION: for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
927 if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} ) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
928 next FUNCTION;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
929 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
930 $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
931 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
932
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
933 # Setup delimiter and quotes...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
934 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
935 $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
936
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
937 $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
938 $OptionsInfo{Root} = defined $Options{root} ? $Options{root} : undef;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
939
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
940 # Setup miscellaneous options...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
941 $OptionsInfo{CheckData} = $Options{fast} ? 0 : 1;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
942 $OptionsInfo{Precision} = $Options{precision};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
943
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
944 $OptionsInfo{KLargest} = $Options{klargest};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
945 $OptionsInfo{KSmallest} = $Options{ksmallest};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
946
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
947 $OptionsInfo{TrimFraction} = $Options{trimfraction};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
948
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
949 # Setup frequency bin values...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
950 $OptionsInfo{NumOfBins} = 10;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
951 @{$OptionsInfo{BinRange}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
952 if ($Options{frequencybins} =~ /\,/) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
953 my($BinValue, @SpecifiedBinRange);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
954 @SpecifiedBinRange = split /\,/, $Options{frequencybins};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
955 if (@SpecifiedBinRange < 2) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
956 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
957 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
958 for $BinValue (@SpecifiedBinRange) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
959 if (!IsNumerical($BinValue)) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
960 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
961 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
962 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
963 my($Index1, $Index2);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
964 for $Index1 (0 .. $#SpecifiedBinRange) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
965 for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
966 if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
967 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
968 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
969 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
970 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
971 push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
972 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
973 else {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
974 $OptionsInfo{NumOfBins} = $Options{frequencybins};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
975 if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
976 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
977 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
978 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
979
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
980 # Setup specified data field labels...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
981 @{$OptionsInfo{SpecifiedDataLabels}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
982 if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
983 my(@SpecifiedValues) = split ",", $Options{datafields};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
984 push @{$OptionsInfo{SpecifiedDataLabels}}, @SpecifiedValues;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
985 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
986 @{$OptionsInfo{SpecifiedDataLabelPairs}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
987 $OptionsInfo{AllDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^AllPairs$/i) ? 1 : 0;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
988 $OptionsInfo{CommonDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^CommonPairs$/i) ? 1 : 0;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
989 if (defined($Options{datafieldpairs}) && !$OptionsInfo{AllDataLabelPairs} && !$OptionsInfo{CommonDataLabelPairs}) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
990 my(@SpecifiedValues) = split ",", $Options{datafieldpairs};
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
991 if (@SpecifiedValues % 2) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
992 die "Error: Invalid number of values specified using \"--datafieldpairs\" option: It must contain even number of values.\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
993 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
994 push @{$OptionsInfo{SpecifiedDataLabelPairs}}, @SpecifiedValues;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
995 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
996
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
997 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
998
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
999 # Setup script usage and retrieve command line arguments specified using various options...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
sub SetupScriptUsage {
  # Populate the file-level %Options hash with default values, overlay whatever
  # was given on the command line, switch to the requested working directory
  # and validate the simple scalar options. Every validation failure is fatal:
  # the sub dies with a message naming the offending option.
  #
  # IsInteger(), IsPositiveInteger() and IsFloat() are helpers defined outside
  # this sub.

  # Defaults; GetOptions() below overwrites the entries supplied by the user.
  %Options = (
    detail         => 0,
    datafields     => "Common",
    datafieldpairs => "CommonPairs",
    frequencybins  => 10,
    klargest       => 2,
    ksmallest      => 2,
    mode           => "DescriptiveStatisticsBasic",
    outdelim       => "comma",
    precision      => 2,
    quote          => "yes",
    trimfraction   => 0.1,
  );

  # Getopt::Long specifications for all supported command line options.
  my(@OptionSpecs) = (
    "datafields=s", "datafieldpairs=s", "detail|d=i", "frequencybins=s",
    "fast|f", "help|h", "klargest=i", "ksmallest=i", "mode|m=s",
    "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s",
    "trimfraction=f", "workingdir|w=s",
  );
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Change into the working directory before any further processing.
  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Information level...
  unless (IsInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: >= 0\n";
  }

  # Output formatting keywords...
  unless ($Options{outdelim} =~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  unless ($Options{quote} =~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }

  # Options which must hold positive integers...
  unless (IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
  }
  unless (IsPositiveInteger($Options{klargest})) {
    die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n";
  }
  unless (IsPositiveInteger($Options{ksmallest})) {
    die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n";
  }

  # Trim fraction must be a float strictly between 0 and 1. The range tests
  # are only reached when IsFloat() accepts the value.
  if (!IsFloat($Options{trimfraction}) || $Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
    die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1052
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1053 __END__
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1054
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1055 =head1 NAME
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1056
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1057 AnalyzeSDFilesData.pl - Analyze numerical data field values in SDFile(s)
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1058
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1059 =head1 SYNOPSIS
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1060
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1061 AnalyzeSDFilesData.pl SDFile(s)...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1062
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1063 AnalyzeSDFilesData.pl [B<--datafields> "fieldlabel,[fieldlabel,...]" | Common | All]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1064 [B<--datafieldpairs> "fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]" | CommonPairs | AllPairs] [B<-d, --detail> infolevel]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1065 [B<-f, --fast>] [B<--frequencybins> number | "number,number,[number,...]"]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1066 [B<-h, --help>] [B<--klargest> number] [B<--ksmallest> number]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1067 [B<-m, --mode> DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]"]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1068 [B<--trimfraction> number] [B<-w, --workingdir> dirname] SDFile(s)...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1069
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1070 =head1 DESCRIPTION
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1071
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1072 Analyze numerical data field values in I<SDFile(s)> using a combination of various statistical
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1073 functions; Non-numerical values are simply ignored. For I<Correlation, RSquare, and
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1074 Covariance> analysis, the count of valid values in specified data field pairs must be same;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1075 otherwise, column data field pair is ignored. The file names are separated by space. The valid file
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1076 extensions are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in a
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1077 current directory can be specified either by I<*.sdf> or the current directory name.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1078
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1079 =head1 OPTIONS
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1080
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1081 =over 4
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1082
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1083 =item B<--datafields> I<"fieldlabel,[fieldlabel,...]" | Common | All>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1084
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1085 Data fields to use for analysis. Possible values: list of comma separated data field
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1086 labels, data fields common to all records, or all data fields. Default value: I<Common>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1087 Examples:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1088
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1089 ALogP,MolWeight,EC50
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1090 "MolWeight,PSA"
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1091
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1092 =item B<--datafieldpairs> I<"fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]" | CommonPairs | AllPairs>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1093
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1094 This value is mode specific and is only used for I<Correlation, PearsonCorrelation, or
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1095 Covariance> value of B<-m, --mode> option. It specifies data field label pairs to use
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1096 for data analysis during I<Correlation> and I<Covariance> calculations. Possible values:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1097 comma delimited list of data field label pairs, data field label pairs common to all records,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1098 or all data field pairs. Default value: I<CommonPairs>. Example:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1099
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1100 MolWeight,EC50,NumN+O,PSA
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1101
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1102 For I<AllPairs> value of B<--datafieldpairs> option, all data field label pairs are used for
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1103 I<Correlation> and I<Covariance> calculations.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1104
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1105 =item B<-d, --detail> I<infolevel>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1106
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1107 Level of information to print about column values being ignored. Default: I<0>. Possible values:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1108 0, 1, 2, 3, or 4.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1109
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1110 =item B<-f, --fast>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1111
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1112 In this mode, all the data field values specified for analysis are assumed to contain numerical
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1113 data and no checking is performed before analysis. By default, only numerical data is
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1114 used for analysis; other types of column data are ignored.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1115
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1116 =item B<--frequencybins> I<number | "number,number,[number,...]">
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1117
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1118 Specify number of bins or bin range to use for frequency analysis. Default value: I<10>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1119
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1120 Number of bins value along with the smallest and largest value for a column is used to
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1121 group the column values into different groups.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1122
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1123 The bin range list is used to group values for a column into different groups; It must contain
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1124 values in ascending order. Examples:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1125
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1126 10,20,30
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1127 0.1,0.2,0.3,0.4,0.5
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1128
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1129 The frequency value calculated for a specific bin corresponds to all the column values
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1130 which are greater than the previous bin value and less than or equal to the current bin value.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1131
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1132 =item B<-h, --help>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1133
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1134 Print this help message.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1135
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1136 =item B<--klargest> I<number>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1137
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1138 Kth largest value to find by I<KLargest> function. Default value: I<2>. Valid values: positive
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1139 integers.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1140
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1141 =item B<--ksmallest> I<number>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1142
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1143 Kth smallest value to find by I<KSmallest> function. Default value: I<2>. Valid values: positive
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1144 integers.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1145
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1146 =item B<-m, --mode> I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1, [function2,...]">
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1147
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1148 Specify how to analyze data in SDFile(s): calculate basic or all descriptive statistics; or
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1149 use a comma delimited list of supported statistical functions. Possible values:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1150 I<DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | "function1,[function2]...">. Default
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1151 value: I<DescriptiveStatisticsBasic>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1152
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1153 I<DescriptiveStatisticsBasic> includes these functions: I<Count, Maximum, Minimum, Mean,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1154 Median, Sum, StandardDeviation, StandardError, Variance>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1155
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1156 I<DescriptiveStatisticsAll>, in addition to I<DescriptiveStatisticsBasic> functions, includes:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1157 I<GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, Mode, RSquare,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1158 Skewness, TrimMean>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1159
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1160 I<All> uses complete list of supported functions: I<Average, AverageDeviation, Correlation,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1161 Count, Covariance, GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1162 Maximum, Minimum, Mean, Median, Mode, RSquare, Skewness, Sum,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1163 SumOfSquares, StandardDeviation, StandardDeviationN, StandardError, StandardScores,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1164 StandardScoresN, TrimMean, Variance, VarianceN>. The function names ending with N
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1165 calculate corresponding values assuming an entire population instead of a population sample.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1166 Here are the formulas for these functions:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1167
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1168 Average: See Mean
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1169
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1170 AverageDeviation: SUM( ABS(x[i] - Xmean) ) / n
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1171
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1172 Correlation: See Pearson Correlation
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1173
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1174 Covariance: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / n
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1175
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1176 GeometricMean: NthROOT( PRODUCT(x[i]) )
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1177
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1178 HarmonicMean: 1 / ( SUM(1/x[i]) / n )
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1179
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1180 Mean: SUM( x[i] ) / n
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1181
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1182 Median: Xsorted[(n - 1)/2 + 1] for odd values of n; (Xsorted[n/2] + Xsorted[n/2 + 1])/2
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1183 for even values of n.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1184
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1185 Kurtosis: [ {n(n + 1)/(n - 1)(n - 2)(n - 3)} SUM{ ((x[i] - Xmean)/STDDEV)^4 } ] -
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1186 {3((n - 1)^2)}/{(n - 2)(n-3)}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1187
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1188 PearsonCorrelation: SUM( (x[i] - Xmean)(y[i] - Ymean) ) / SQRT( SUM( (x[i] - Xmean)^2 )
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1189 (SUM( (y[i] - Ymean)^2 )) )
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1190
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1191 RSquare: PearsonCorrelation^2
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1192
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1193 Skewness: {n/(n - 1)(n - 2)} SUM{ ((x[i] - Xmean)/STDDEV)^3 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1194
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1195 StandardDeviation: SQRT ( SUM( (x[i] - Mean)^2 ) / (n - 1) )
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1196
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1197 StandardDeviationN: SQRT ( SUM( (x[i] - Mean)^2 ) / n )
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1198
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1199 StandardError: StandardDeviation / SQRT( n )
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1200
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1201 StandardScore: (x[i] - Mean) / StandardDeviation
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1202
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1203 StandardScoreN: (x[i] - Mean) / StandardDeviationN
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1204
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1205 Variance: SUM( (x[i] - Xmean)^2 / (n - 1) )
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1206
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1207 VarianceN: SUM( (x[i] - Xmean)^2 / n )
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1208
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1209 =item B<-o, --overwrite>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1210
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1211 Overwrite existing files.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1212
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1213 =item B<--outdelim> I<comma | tab | semicolon>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1214
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1215 Output text file delimiter. Possible values: I<comma, tab, or semicolon>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1216 Default value: I<comma>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1217
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1218 =item B<-p, --precision> I<number>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1219
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1220 Precision of calculated values in the output file. Default: up to I<2> decimal places.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1221 Valid values: positive integers.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1222
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1223 =item B<-q, --quote> I<yes | no>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1224
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1225 Put quotes around column values in output text file. Possible values: I<yes or
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1226 no>. Default value: I<yes>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1227
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1228 =item B<-r, --root> I<rootname>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1229
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1230 New text file name is generated using the root: <Root>.<Ext>. Default new file
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1231 name: <InitialSDFileName><Mode>.<Ext>. Based on the specified analysis,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1232 <Mode> corresponds to one of these values: DescriptiveStatisticsBasic,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1233 DescriptiveStatisticsAll, AllStatistics, SpecifiedStatistics, Covariance, Correlation,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1234 Frequency, or StandardScores. The csv, and tsv <Ext> values are used for
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1235 comma/semicolon, and tab delimited text files respectively. This option is ignored for
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1236 multiple input files.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1237
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1238 =item B<--trimfraction> I<number>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1239
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1240 Fraction of data to exclude from the top and bottom of the data set during
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1241 I<TrimMean> calculation. Default value: I<0.1>. Valid values: > 0 and < 1.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1242
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1243 =item B<-w, --workingdir> I<text>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1244
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1245 Location of working directory. Default: current directory.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1246
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1247 =back
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1248
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1249 =head1 EXAMPLES
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1250
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1251 To calculate basic statistics for data in all common data fields and generate a
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1252 NewSample1DescriptiveStatisticsBasic.csv file, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1253
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1254 % AnalyzeSDFilesData.pl -o -r NewSample1 Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1255
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1256 To calculate basic statistics for MolWeight data field and generate a
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1257 NewSample1DescriptiveStatisticsBasic.csv file, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1258
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1259 % AnalyzeSDFilesData.pl --datafields MolWeight -o -r NewSample1
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1260 Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1261
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1262 To calculate all available statistics for MolWeight data field and all data field pairs,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1263 and generate NewSample1DescriptiveStatisticsAll.csv, NewSample1CorrelationMatrix.csv,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1264 and NewSample1MolWeightFrequencyAnalysis.csv
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1265 files, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1266
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1267 % AnalyzeSDFilesData.pl -m DescriptiveStatisticsAll --datafields
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1268 MolWeight -o --datafieldpairs AllPairs -r NewSample1 Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1269
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1270 To compute frequency distribution of MolWeight data field into five bins and
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1271 generate NewSample1MolWeightFrequencyAnalysis.csv, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1272
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1273 % AnalyzeSDFilesData.pl -m Frequency --frequencybins 5 --datafields
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1274 MolWeight -o -r NewSample1 Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1275
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1276 To compute frequency distribution of data in MolWeight data field into specified bin range
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1277 values, and generate NewSample1MolWeightFrequencyAnalysis.csv, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1278
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1279 % AnalyzeSDFilesData.pl -m Frequency --frequencybins "100,200,400"
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1280 --datafields MolWeight -o -r NewSample1 Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1281
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1282 To calculate all available statistics for data in all data fields and pairs, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1283
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1284 % AnalyzeSDFilesData.pl -m All --datafields All --datafieldpairs
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1285 AllPairs -o -r NewSample1 Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1286
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1287 =head1 AUTHOR
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1288
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1289 Manish Sud <msud@san.rr.com>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1290
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1291 =head1 SEE ALSO
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1292
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1293 FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1294
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1295 =head1 COPYRIGHT
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1296
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1297 Copyright (C) 2015 Manish Sud. All rights reserved.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1298
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1299 This file is part of MayaChemTools.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1300
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1301 MayaChemTools is free software; you can redistribute it and/or modify it under
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1302 the terms of the GNU Lesser General Public License as published by the Free
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1303 Software Foundation; either version 3 of the License, or (at your option)
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1304 any later version.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1305
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1306 =cut