annotate mayachemtools/bin/InfoTextFiles.pl @ 9:ab29fa5c8c1f draft default tip

Uploaded
author deepakjadmin
date Thu, 15 Dec 2016 14:18:03 -0500
parents 73ae111cf86f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1 #!/usr/bin/perl -w
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
2 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
3 # $RCSfile: InfoTextFiles.pl,v $
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
4 # $Date: 2015/02/28 20:46:20 $
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
5 # $Revision: 1.30 $
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
6 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
7 # Author: Manish Sud <msud@san.rr.com>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
8 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
10 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
11 # This file is part of MayaChemTools.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
12 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
14 # the terms of the GNU Lesser General Public License as published by the Free
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
15 # Software Foundation; either version 3 of the License, or (at your option) any
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
16 # later version.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
17 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
18 # MayaChemTools is distributed in the hope that it will be useful, but without
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
19 # any warranty; without even the implied warranty of merchantability or fitness
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
20 # for a particular purpose. See the GNU Lesser General Public License for more
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
21 # details.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
22 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
23 # You should have received a copy of the GNU Lesser General Public License
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
26 # Boston, MA, 02111-1307, USA.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
27 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
28
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
29 use strict;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
30 use FindBin; use lib "$FindBin::Bin/../lib";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
31 use Getopt::Long;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
32 use File::Basename;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
33 use Text::ParseWords;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
34 use Benchmark;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
35 use FileUtil;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
36 use TextUtil;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
37
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Main flow of the script: announce start, collect options and input files,
# gather per-file information, list it for each usable file, and report the
# total run time.
#
# The lexicals declared here at file scope (%Options, @TextFilesList,
# %OptionsInfo, %TextFilesInfo, ...) are shared with the subroutines defined
# later in the file, so they are intentionally not narrowed in scope.
#
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct class method call; the indirect object form ("new Benchmark") is
# ambiguous to the parser and discouraged.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# Generate output files...
my($FileIndex);
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for $FileIndex (0 .. $#TextFilesList) {
  # Files that failed the checks in RetrieveTextFilesInfo are silently skipped
  # here; a warning has already been issued for them.
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ListTextFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
85
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
86 ###############################################################################
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
87
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# List appropriate information for one input text file...
#
# Parameters:
#   $Index - Position of the file in @TextFilesList; the same index selects
#            the file's entries in %TextFilesInfo.
#
# Output (all written to STDOUT):
#   . Number of lines, number of columns and the column labels.
#   . With $OptionsInfo{ParseLines}: each data line (everything after the
#     first line) is split on the file's delimiter and, depending on
#     $OptionsInfo{CountEmpty}, $OptionsInfo{CheckData} and
#     $OptionsInfo{CheckNumericalData}, per-line diagnostics and per-column
#     counts are reported. $OptionsInfo{DetailLevel} >= 2 prints a message
#     for each offending line; >= 3 additionally prints the line itself.
#   . Without $OptionsInfo{ParseLines}: lines are only counted.
#   . File size and last modification time as recorded in %TextFilesInfo.
#
# Dies when the file can't be opened.
#
sub ListTextFileInfo {
  my($Index) = @_;

  my $TextFile = $TextFilesList[$Index];
  my $InDelim = $TextFilesInfo{InDelim}[$Index];
  my @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as an open mode and the handle can't clash with other
  # bareword handles used in the file.
  open(my $TextFileHandle, '<', $TextFile) or die "Error: Can't open $TextFile: $! \n";

  my $LineCount = 0;
  my $EmptyLinesCount = 0;
  my $EmptyColDataLinesCount = 0;
  my $GreaterThanMaxColLinesCount = 0;

  # Per-column counters keyed by column label; PrintDataInformation reports a
  # missing key as zero.
  my(%EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap);

  if ($OptionsInfo{ParseLines}) {
    # Skip over column labels line. Test for definedness rather than truth so
    # that a label line consisting of a lone "0" without a line terminator is
    # still recognized as a line.
    if (defined(my $ColLabelsLine = <$TextFileHandle>)) {
      $LineCount++;
      LINE: while (my $Line = <$TextFileHandle>) {
        $LineCount++;
        my $PrintTextLine = 0;
        $Line =~ s/(\r\n)|(\r)|\n//g;
        my @LineWords = quotewords($InDelim, 0, $Line);

        if ($OptionsInfo{CountEmpty}) {
          # Count lines with no data...
          if (!@LineWords) {
            $EmptyLinesCount++;
            if ($OptionsInfo{DetailLevel} >= 2) {
              print "Line number $LineCount is empty...\n";
            }
            # Nothing else to check on a line without any words...
            next LINE;
          }

          # Count lines with empty data for some columns...
          my $EmptyColValueFound = 0;
          VALUE: for my $Value (@LineWords) {
            if (!IsNotEmpty($Value)) {
              $EmptyColValueFound = 1;
              # One empty value is enough to flag the line...
              last VALUE;
            }
          }
          if ($EmptyColValueFound) {
            $EmptyColDataLinesCount++;
            if ($OptionsInfo{DetailLevel} >= 2) {
              print "Line number $LineCount contains empty column value(s)...\n";
            }
            $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
          }

          # Count lines with columns greater than the column label line...
          if (@LineWords > @ColLabels) {
            $GreaterThanMaxColLinesCount++;
            if ($OptionsInfo{DetailLevel} >= 2) {
              print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
            }
            $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
          }

          # Count empty and non-empty values for each column; values beyond
          # the last labeled column are ignored...
          for my $ColNum (0 .. $#LineWords) {
            next unless ($ColNum < @ColLabels);
            my $Label = $ColLabels[$ColNum];
            if (IsNotEmpty($LineWords[$ColNum])) {
              $NonEmptyColValuesCountMap{$Label}++;
            }
            else {
              $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
              $EmptyColValuesCountMap{$Label}++;
            }
          }
        }

        if ($OptionsInfo{CheckData}) {
          # Classify each labeled column value as numerical or non-numerical;
          # empty values are counted as neither...
          for my $ColNum (0 .. $#LineWords) {
            next unless ($ColNum < @ColLabels);
            my $Label = $ColLabels[$ColNum];
            if (IsNumerical($LineWords[$ColNum])) {
              $NumericalColValuesCountMap{$Label}++;
            }
            elsif (IsNotEmpty($LineWords[$ColNum])) {
              $NonNumericalColValuesCountMap{$Label}++;
            }
          }
        }

        if ($OptionsInfo{CheckNumericalData}) {
          # Check only the columns recorded in NumericalDataColNums for this
          # file; columns missing from a short line are skipped...
          my $NonNumericalDataFound = 0;
          for my $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) {
            next unless ($ColNum < @LineWords);
            if (!IsNumerical($LineWords[$ColNum])) {
              $NonNumericalDataFound = 1;
              $SpecifiedNonNumericalColValuesCountMap{$ColLabels[$ColNum]}++;
            }
          }
          if ($NonNumericalDataFound) {
            $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
            if ($OptionsInfo{DetailLevel} >=2 ) {
              print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
            }
          }
        }

        if ($PrintTextLine) {
          print "Line $LineCount: $Line\n\n";
        }
      }
    }
  }
  else {
    # No parsing requested: just count the lines...
    while (<$TextFileHandle>) {
      $LineCount++;
    }
  }
  close($TextFileHandle);

  print "\nNumber of lines: $LineCount\n";
  print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n";
  print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";

  if ($OptionsInfo{CountEmpty}) {
    print "\nNumber of lines with no data: $EmptyLinesCount\n";
    print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
    print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
    PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
    PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
  }

  if ($OptionsInfo{CheckData}) {
    print "\n";
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
    PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
    print "\n";
  }

  if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) {
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@{$TextFilesInfo{NumericalDataColLabels}[$Index]}, \%SpecifiedNonNumericalColValuesCountMap);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n";
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
264
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Total size of all the files...
#
# Sums the recorded sizes of the input files flagged as okay in %TextFilesInfo
# and prints the total. Nothing is printed unless more than one file is okay.
#
sub ListTotalSizeOfFiles {
  my(@OkayFileIndices, $OkayFilesCount, $CombinedSize);

  # Positions in @TextFilesList of the files that passed the earlier checks...
  @OkayFileIndices = grep { $TextFilesInfo{FileOkay}[$_] } (0 .. $#TextFilesList);
  $OkayFilesCount = scalar(@OkayFileIndices);

  $CombinedSize = 0;
  $CombinedSize += $TextFilesInfo{FileSize}[$_] for @OkayFileIndices;

  if ($OkayFilesCount > 1) {
    print "\nTotal size of $OkayFilesCount files: ", FormatFileSize($CombinedSize), "\n";
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
282
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# List data information...
#
# Prints a single line of the form:
#
#   InfoLabel:  "Label1" - Value1, "Label2" - Value2...
#
# Parameters:
#   $InfoLabel              - Text printed in front of the label/value list.
#   $DataLabelRef           - Reference to an array of labels, listed in order.
#   $DataLabelToValueMapRef - Reference to a hash mapping labels to values; a
#                             label without an entry is reported as 0.
#
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;

  my @LabelValueEntries = map {
    my $DataValue = exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0;
    " \"$_\" - " . $DataValue;
  } @{$DataLabelRef};

  # Entries are comma separated without a trailing comma...
  my $Summary = join(",", @LabelValueEntries);
  print "$InfoLabel: $Summary\n";
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
295
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
296 # Retrieve information about input text files...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
297 sub RetrieveTextFilesInfo {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
298 my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
299
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
300 %TextFilesInfo = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
301 @{$TextFilesInfo{FileOkay}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
302 @{$TextFilesInfo{ColCount}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
303 @{$TextFilesInfo{ColLabels}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
304 @{$TextFilesInfo{ColLabelToNumMap}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
305 @{$TextFilesInfo{InDelim}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
306 @{$TextFilesInfo{FileSize}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
307 @{$TextFilesInfo{FileLastModified}} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
308
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
309 FILELIST: for $Index (0 .. $#TextFilesList) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
310 $TextFile = $TextFilesList[$Index];
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
311
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
312 $TextFilesInfo{FileOkay}[$Index] = 0;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
313 $TextFilesInfo{ColCount}[$Index] = 0;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
314 $TextFilesInfo{InDelim}[$Index] = "";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
315 $TextFilesInfo{FileSize}[$Index] = 0;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
316 $TextFilesInfo{FileLastModified}[$Index] = '';
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
317 @{$TextFilesInfo{ColLabels}[$Index]} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
318 %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
319
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
320 if (!(-e $TextFile)) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
321 warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
322 next FILELIST;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
323 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
324 if (!CheckFileType($TextFile, "csv tsv")) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
325 warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
326 next FILELIST;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
327 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
328 ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
329 if ($FileExt =~ /^tsv$/i) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
330 $InDelim = "\t";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
331 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
332 else {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
333 $InDelim = "\,";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
334 if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
335 warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
336 next FILELIST;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
337 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
338 if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
339 $InDelim = "\;";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
340 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
341 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
342
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
343 if (!open TEXTFILE, "$TextFile") {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
344 warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
345 next FILELIST;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
346 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
347
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
348 $Line = GetTextLine(\*TEXTFILE);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
349 @ColLabels = quotewords($InDelim, 0, $Line);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
350 close TEXTFILE;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
351
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
352 $TextFilesInfo{FileOkay}[$Index] = 1;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
353 $TextFilesInfo{InDelim}[$Index] = $InDelim;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
354
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
355 $TextFilesInfo{ColCount}[$Index] = @ColLabels;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
356 push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
357 for $ColNum (0 .. $#ColLabels) {
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
358 $ColLabel = $ColLabels[$ColNum];
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
359 $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
360 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
361 $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
362 ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
363 $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
364 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
365
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
366 }
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
367
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Resolve the columns named by "--numericaldatacols" against every text file...
#
# For each entry of @TextFilesList, two parallel per-file lists are stored in
# %TextFilesInfo: NumericalDataColNums (zero-based column indices) and
# NumericalDataColLabels (the corresponding column labels). Files not marked
# FileOkay end up with empty lists.
#
# In colnum mode the specified values are one-based column numbers; a number
# larger than the column count of a file is silently skipped for that file. In
# any other mode the specified values are column labels; a label which is not
# present in a file is silently skipped for that file.
#
sub ProcessColumnsInfo {
  @{$TextFilesInfo{NumericalDataColNums}} = ();
  @{$TextFilesInfo{NumericalDataColLabels}} = ();

  for my $FileIndex (0 .. $#TextFilesList) {
    my(@MatchedColNums, @MatchedColLabels);

    if ($TextFilesInfo{FileOkay}[$FileIndex]) {
      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        for my $OneBasedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          next unless ($OneBasedColNum <= $TextFilesInfo{ColCount}[$FileIndex]);

          # Column positions are tracked zero-based from here on...
          my $ColIndex = $OneBasedColNum - 1;
          push @MatchedColNums, $ColIndex;
          push @MatchedColLabels, $TextFilesInfo{ColLabels}[$FileIndex][$ColIndex];
        }
      }
      else {
        for my $WantedLabel (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          next unless (exists($TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$WantedLabel}));

          push @MatchedColNums, $TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$WantedLabel};
          push @MatchedColLabels, $WantedLabel;
        }
      }
    }

    # Every file index gets a list, even when nothing matched...
    @{$TextFilesInfo{NumericalDataColNums}[$FileIndex]} = @MatchedColNums;
    @{$TextFilesInfo{NumericalDataColLabels}[$FileIndex]} = @MatchedColLabels;
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
408
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Derive the %OptionsInfo settings from the raw command line values held in
# %Options...
#
# Mode and InDelim are copied as is. All, Count, Empty and NumericalDataCols
# fall back to 0 and DetailLevel falls back to 1 when the corresponding option
# value is false. The ParseLines, CountEmpty, CheckData and CheckNumericalData
# flags (0 or 1) are computed from the all, empty, datacheck and
# numericaldatacols options. The comma separated "--numericaldatacols" value is
# split into the SpecifiedNumericalDataCols list.
#
# Dies when, in colnum mode, a specified column value is rejected by
# IsPositiveInteger.
#
sub ProcessOptions {
  my($AllFlag, $EmptyFlag, $DataCheckFlag, $NumericalCols) = @Options{qw(all empty datacheck numericaldatacols)};

  %OptionsInfo = (
    Mode => $Options{mode},
    All => $AllFlag || 0,
    Count => $Options{count} || 0,
    DetailLevel => $Options{detail} || 1,
    Empty => $EmptyFlag || 0,
    InDelim => $Options{indelim},
    NumericalDataCols => $NumericalCols || 0,
    ParseLines => ($AllFlag || $EmptyFlag || $NumericalCols) ? 1 : 0,
    CountEmpty => ($AllFlag || $EmptyFlag) ? 1 : 0,
    CheckData => ($AllFlag || $DataCheckFlag) ? 1 : 0,
    CheckNumericalData => ($AllFlag || $NumericalCols) ? 1 : 0,
    SpecifiedNumericalDataCols => [],
  );

  if ($NumericalCols) {
    # Store the list first; validation below may die with it already in place...
    push @{$OptionsInfo{SpecifiedNumericalDataCols}}, split(",", $NumericalCols);
    if ($Options{mode} =~ /^colnum$/i) {
      for my $ColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        IsPositiveInteger($ColNum)
          or die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
      }
    }
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
444
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Parse the command line into %Options and validate the supplied values...
#
# Defaults seeded before parsing: detail 1, mode "colnum" and indelim "comma".
# When "-w --workingdir" is specified, the process changes into that directory.
#
# Dies when GetOptions reports a failure, when the working directory is not a
# directory or can't be entered, and when the "--mode", "--indelim" or
# "--detail" value is not valid.
#
sub SetupScriptUsage {
  my(@OptionSpecs, $WorkingDir);

  @OptionSpecs = (
    "all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h",
    "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s",
  );

  # Seed the defaults and let the command line override them...
  %Options = (detail => 1, mode => "colnum", indelim => "comma");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  $WorkingDir = $Options{workingdir};
  if ($WorkingDir) {
    (-d $WorkingDir)
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $WorkingDir or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  unless ($Options{mode} =~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";
  }
  unless ($Options{indelim} =~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
472
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
473 __END__
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
474
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
475 =head1 NAME
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
476
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
477 InfoTextFiles.pl - List information about TextFile(s)
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
478
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
479 =head1 SYNOPSIS
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
480
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
481 InfoTextFiles.pl TextFile(s)...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
482
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
483 InfoTextFiles.pl [B<-a, --all>] [B<-c, --count>] [B<--datacheck>] [B<-d, --detail> infolevel] [B<-e, --empty>]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
484 [B<-h, --help>] [B<--indelim> comma | semicolon] [B<-m, --mode> colnum | collabel]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
485 [B<-n, --numericaldatacols> colnum,[colnum,...] | collabel,[collabel,...]]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
486 [B<-w, --workingdir> dirname] TextFile(s)...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
487
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
488 =head1 DESCRIPTION
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
489
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
490 List information about I<TextFile(s)> contents: number of lines and columns, empty
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
491 column values, and so on. The file names are separated by spaces.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
492 The valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
493 text files respectively. All other file names are ignored. All the text files in the
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
494 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
495 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
496 which doesn't correspond to the format indicated by B<--indelim> option is ignored.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
497
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
498 =head1 OPTIONS
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
499
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
500 =over 4
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
501
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
502 =item B<-a, --all>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
503
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
504 List all the available information.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
505
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
506 =item B<-c, --count>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
507
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
508 List number of rows and columns. This is B<default behavior>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
509
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
510 =item B<--datacheck>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
511
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
512 List number of numerical and non-numerical values for each column.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
513
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
514 =item B<-d, --detail> I<infolevel>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
515
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
516 Level of information to print about lines being ignored. Default: I<1>. Possible values:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
517 I<1, 2 or 3>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
518
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
519 =item B<-e, --empty>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
520
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
521 List number of empty row and column values.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
522
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
523 =item B<-h, --help>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
524
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
525 Print this help message.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
526
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
527 =item B<--indelim> I<comma | semicolon>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
528
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
529 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
530 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
531 delimiter.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
532
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
533 =item B<-m, --mode> I<colnum | collabel>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
534
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
535 Specify how to identify numerical data columns: using column number or column label.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
536 Possible values: I<colnum or collabel>. Default value: I<colnum>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
537
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
538 =item B<-n, --numericaldatacols> I<colnum,[colnum,...] | collabel,[collabel,...]>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
539
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
540 This value is mode specific. It is a list of column numbers or labels to check for
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
541 presence of numerical data only; otherwise, the value is flagged. Default value: I<all;all;...>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
542
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
543 For I<colnum> mode, input value format is a comma delimited list: I<colnum,colnum,...>. Example:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
544
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
545 1,3,5
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
546 "2,4,6"
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
547
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
548 For I<collabel> mode, input value format is a comma delimited list: I<collabel,collabel,...>. Example:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
549
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
550 "MW,SumNO,SumNHOH"
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
551
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
552
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
553 =item B<-w, --workingdir> I<dirname>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
554
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
555 Location of working directory. Default: current directory.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
556
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
557 =back
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
558
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
559 =head1 EXAMPLES
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
560
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
561 To count number of lines and columns in Text file(s), type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
562
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
563 % InfoTextFiles.pl Sample1.csv
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
564 % InfoTextFiles.pl Sample1.csv Sample1.tsv
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
565 % InfoTextFiles.pl *.csv *.tsv
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
566
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
567 To count number of lines, columns and empty values in Sample1.csv file and print
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
568 detailed information, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
569
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
570 % InfoTextFiles.pl -d 3 -e Sample1.csv
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
571
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
572 To track all available information and non-numerical values for Mol_ID and MolWeight
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
573 columns in Sample1.csv file and print detailed information, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
574
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
575 % InfoTextFiles.pl -d 3 -a -m collabel -n Mol_ID,MolWeight Sample1.csv
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
576
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
577 =head1 AUTHOR
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
578
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
579 Manish Sud <msud@san.rr.com>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
580
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
581 =head1 SEE ALSO
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
582
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
583 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
584
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
585 =head1 COPYRIGHT
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
586
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
587 Copyright (C) 2015 Manish Sud. All rights reserved.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
588
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
589 This file is part of MayaChemTools.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
590
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
591 MayaChemTools is free software; you can redistribute it and/or modify it under
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
592 the terms of the GNU Lesser General Public License as published by the Free
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
593 Software Foundation; either version 3 of the License, or (at your option)
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
594 any later version.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
595
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
596 =cut