annotate mayachemtool/mayachemtools/bin/InfoTextFiles.pl @ 0:68300206e90d draft default tip

Uploaded
author deepakjadmin
date Thu, 05 Nov 2015 02:41:30 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
1 #!/usr/bin/perl -w
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
2 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
3 # $RCSfile: InfoTextFiles.pl,v $
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
4 # $Date: 2015/02/28 20:46:20 $
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
5 # $Revision: 1.30 $
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
6 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
7 # Author: Manish Sud <msud@san.rr.com>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
8 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
10 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
11 # This file is part of MayaChemTools.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
12 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
14 # the terms of the GNU Lesser General Public License as published by the Free
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
15 # Software Foundation; either version 3 of the License, or (at your option) any
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
16 # later version.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
17 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
18 # MayaChemTools is distributed in the hope that it will be useful, but without
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
19 # any warranty; without even the implied warranty of merchantability or fitness
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
20 # for a particular purpose. See the GNU Lesser General Public License for more
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
21 # details.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
22 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
23 # You should have received a copy of the GNU Lesser General Public License
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
26 # Boston, MA, 02111-1307, USA.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
27 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
28
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
29 use strict;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
30 use FindBin; use lib "$FindBin::Bin/../lib";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
31 use Getopt::Long;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
32 use File::Basename;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
33 use Text::ParseWords;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
34 use Benchmark;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
35 use FileUtil;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
36 use TextUtil;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
37
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Script driver: announce start, parse options, collect the csv/tsv files named
# on the command line, list information for each usable file, and report the
# elapsed time.
#
# The file-scoped lexicals declared here (%Options, %OptionsInfo, @TextFilesList
# and %TextFilesInfo) are shared with the subroutines defined later in this file,
# so their names and declaration order must be kept.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of the ambiguous indirect object form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  # No file names or an explicit help request: show usage taken from the script's POD...
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# List information for each text file which passed the checks...
my($FileIndex);
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ListTextFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
85
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
86 ###############################################################################
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
87
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
88 # List appropriate information...
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
sub ListTextFileInfo {
  # Print information about the text file at position $Index in @TextFilesList.
  #
  # The number of lines, number of columns and column labels are always listed,
  # followed by the file size and last modification time recorded in
  # %TextFilesInfo. When $OptionsInfo{ParseLines} is set, each data line is also
  # split on the file's delimiter and, depending on $OptionsInfo{CountEmpty},
  # $OptionsInfo{CheckData} and $OptionsInfo{CheckNumericalData}, per-line and
  # per-column counts are collected and printed. $OptionsInfo{DetailLevel} >= 2
  # reports offending line numbers; >= 3 also echoes the offending lines.
  #
  # Dies if the file can't be opened.
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $HeaderLine, $Line, $InDelim, $LineCount, $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount, $Label, $Value, $ColNum, $EmptyColValueFound, $PrintTextLine, $NonNumericalDataFound, @ColLabels, @LineWords, %EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  # Three-argument open with a lexical handle: the file name is never parsed for
  # mode characters and no global bareword handle is shared with other subs...
  open $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";

  $LineCount = 0;
  $EmptyLinesCount = 0;
  $EmptyColDataLinesCount = 0;
  $GreaterThanMaxColLinesCount = 0;

  %EmptyColValuesCountMap = ();
  %NonEmptyColValuesCountMap = ();
  %SpecifiedNonNumericalColValuesCountMap = ();
  %NonNumericalColValuesCountMap = ();
  %NumericalColValuesCountMap = ();

  if ($OptionsInfo{ParseLines}) {
    # Skip over the column labels line; test for definedness so that a header
    # such as a lone "0" without a trailing newline still counts as a line...
    $HeaderLine = <$TextFileHandle>;
    if (defined($HeaderLine)) {
      $LineCount++;
      LINE: while (defined($Line = <$TextFileHandle>)) {
        $LineCount++;
        $PrintTextLine = 0;
        # Strip any flavor of line terminator...
        $Line =~ s/(\r\n)|(\r)|\n//g;
        @LineWords = quotewords($InDelim, 0, $Line);

        if ($OptionsInfo{CountEmpty}) {
          # Count lines with no data...
          if (!@LineWords) {
            $EmptyLinesCount++;
            if ($OptionsInfo{DetailLevel} >= 2) {
              print "Line number $LineCount is empty...\n";
            }
            next LINE;
          }

          # Count lines with empty data for some columns; one empty value is
          # enough to flag the line, so stop at the first one found...
          $EmptyColValueFound = 0;
          VALUE: for $Value (@LineWords) {
            if (!IsNotEmpty($Value)) {
              $EmptyColValueFound = 1;
              last VALUE;
            }
          }
          if ($EmptyColValueFound) {
            $EmptyColDataLinesCount++;
            if ($OptionsInfo{DetailLevel} >= 2) {
              print "Line number $LineCount contains empty column value(s)...\n";
            }
            $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
          }

          # Count lines with columns greater than the column label line...
          if (@LineWords > @ColLabels) {
            $GreaterThanMaxColLinesCount++;
            if ($OptionsInfo{DetailLevel} >= 2) {
              print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
            }
            $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
          }

          # Count empty and non-empty values for each column; values beyond the
          # last column label have no label to be counted under...
          COLUMN: for $ColNum (0 .. $#LineWords) {
            last COLUMN if ($ColNum >= @ColLabels);
            $Label = $ColLabels[$ColNum];
            if (IsNotEmpty($LineWords[$ColNum])) {
              $NonEmptyColValuesCountMap{$Label}++;
            }
            else {
              $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
              $EmptyColValuesCountMap{$Label}++;
            }
          }
        }

        if ($OptionsInfo{CheckData}) {
          # Classify each labeled value as numerical or non-numerical; empty
          # values are counted in neither map...
          COLUMN: for $ColNum (0 .. $#LineWords) {
            last COLUMN if ($ColNum >= @ColLabels);
            $Label = $ColLabels[$ColNum];
            if (IsNumerical($LineWords[$ColNum])) {
              $NumericalColValuesCountMap{$Label}++;
            }
            elsif (IsNotEmpty($LineWords[$ColNum])) {
              $NonNumericalColValuesCountMap{$Label}++;
            }
          }
        }

        if ($OptionsInfo{CheckNumericalData}) {
          # Check only the columns recorded in NumericalDataColNums for this file...
          $NonNumericalDataFound = 0;
          for $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) {
            if ($ColNum < @LineWords && !IsNumerical($LineWords[$ColNum])) {
              $NonNumericalDataFound = 1;
              $Label = $ColLabels[$ColNum];
              $SpecifiedNonNumericalColValuesCountMap{$Label}++;
            }
          }
          if ($NonNumericalDataFound) {
            $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
            if ($OptionsInfo{DetailLevel} >=2 ) {
              print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
            }
          }
        }

        if ($PrintTextLine) {
          print "Line $LineCount: $Line\n\n";
        }
      }
    }
  }
  else {
    # Only a line count is needed; read into a lexical so that the caller's $_
    # is left untouched...
    while (defined($Line = <$TextFileHandle>)) {
      $LineCount++;
    }
  }
  close $TextFileHandle;

  print "\nNumber of lines: $LineCount\n";
  print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n";
  print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";

  if ($OptionsInfo{CountEmpty}) {
    print "\nNumber of lines with no data: $EmptyLinesCount\n";
    print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
    print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
    PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
    PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
  }

  if ($OptionsInfo{CheckData}) {
    print "\n";
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
    PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
    print "\n";
  }

  if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) {
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@{$TextFilesInfo{NumericalDataColLabels}[$Index]}, \%SpecifiedNonNumericalColValuesCountMap);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n";
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
264
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
265 # Total size of all the files...
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
sub ListTotalSizeOfFiles {
  # Print the combined size of all files flagged as okay in %TextFilesInfo.
  # Nothing is printed unless more than one such file exists.

  # Positions in @TextFilesList of the files which passed the checks...
  my(@OkayFileIndices) = grep { $TextFilesInfo{FileOkay}[$_] } (0 .. $#TextFilesList);
  my($OkayFilesCount) = scalar(@OkayFileIndices);

  my($CombinedSize) = 0;
  for my $OkayFileIndex (@OkayFileIndices) {
    $CombinedSize += $TextFilesInfo{FileSize}[$OkayFileIndex];
  }

  # A total is only reported when several files contributed to it...
  if ($OkayFilesCount > 1) {
    print "\nTotal size of $OkayFilesCount files: ", FormatFileSize($CombinedSize), "\n";
  }
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
282
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
283 # List data information...
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
sub PrintDataInformation {
  # Print one line of the form:
  #
  #   InfoLabel:  "Label1" - Value1, "Label2" - Value2...
  #
  # $DataLabelRef is a reference to the list of labels to report, in order, and
  # $DataLabelToValueMapRef is a reference to a hash of label => value. Labels
  # missing from the hash are reported with a value of 0.
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;

  my(@LabelValuePairs) = map {
    my($ReportedValue) = exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0;
    " \"$_\" - " . $ReportedValue;
  } @{$DataLabelRef};

  # Joining on commas leaves no trailing comma to strip...
  my($PairsText) = join(",", @LabelValuePairs);
  print "$InfoLabel: $PairsText\n";
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
295
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
296 # Retrieve information about input text files...
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
297 sub RetrieveTextFilesInfo {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
298 my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString);
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
299
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
300 %TextFilesInfo = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
301 @{$TextFilesInfo{FileOkay}} = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
302 @{$TextFilesInfo{ColCount}} = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
303 @{$TextFilesInfo{ColLabels}} = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
304 @{$TextFilesInfo{ColLabelToNumMap}} = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
305 @{$TextFilesInfo{InDelim}} = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
306 @{$TextFilesInfo{FileSize}} = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
307 @{$TextFilesInfo{FileLastModified}} = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
308
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
309 FILELIST: for $Index (0 .. $#TextFilesList) {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
310 $TextFile = $TextFilesList[$Index];
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
311
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
312 $TextFilesInfo{FileOkay}[$Index] = 0;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
313 $TextFilesInfo{ColCount}[$Index] = 0;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
314 $TextFilesInfo{InDelim}[$Index] = "";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
315 $TextFilesInfo{FileSize}[$Index] = 0;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
316 $TextFilesInfo{FileLastModified}[$Index] = '';
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
317 @{$TextFilesInfo{ColLabels}[$Index]} = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
318 %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
319
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
320 if (!(-e $TextFile)) {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
321 warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
322 next FILELIST;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
323 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
324 if (!CheckFileType($TextFile, "csv tsv")) {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
325 warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
326 next FILELIST;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
327 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
328 ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
329 if ($FileExt =~ /^tsv$/i) {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
330 $InDelim = "\t";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
331 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
332 else {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
333 $InDelim = "\,";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
334 if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
335 warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
336 next FILELIST;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
337 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
338 if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
339 $InDelim = "\;";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
340 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
341 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
342
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
343 if (!open TEXTFILE, "$TextFile") {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
344 warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
345 next FILELIST;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
346 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
347
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
348 $Line = GetTextLine(\*TEXTFILE);
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
349 @ColLabels = quotewords($InDelim, 0, $Line);
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
350 close TEXTFILE;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
351
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
352 $TextFilesInfo{FileOkay}[$Index] = 1;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
353 $TextFilesInfo{InDelim}[$Index] = $InDelim;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
354
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
355 $TextFilesInfo{ColCount}[$Index] = @ColLabels;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
356 push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
357 for $ColNum (0 .. $#ColLabels) {
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
358 $ColLabel = $ColLabels[$ColNum];
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
359 $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
360 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
361 $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
362 ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
363 $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
364 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
365
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
366 }
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
367
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Make sure specified numerical data columns are okay...
#
# For each entry in @TextFilesList, the per-file lists NumericalDataColNums and
# NumericalDataColLabels in %TextFilesInfo are reset to empty. For files marked
# FileOkay, they are then filled with the zero-based column numbers and the labels
# of the requested columns found in that file. Requested columns which are not
# present in a file are skipped without any message.
#
# Reads : @TextFilesList, $OptionsInfo{Mode}, $OptionsInfo{SpecifiedNumericalDataCols},
#         $TextFilesInfo{FileOkay | ColCount | ColLabels | ColLabelToNumMap}
# Writes: $TextFilesInfo{NumericalDataColNums | NumericalDataColLabels}
#
sub ProcessColumnsInfo {
  my($FileIndex);

  @{$TextFilesInfo{NumericalDataColNums}} = ();
  @{$TextFilesInfo{NumericalDataColLabels}} = ();

  for $FileIndex (0 .. $#TextFilesList) {
    # Every file gets an entry, even the ones being ignored...
    @{$TextFilesInfo{NumericalDataColNums}[$FileIndex]} = ();
    @{$TextFilesInfo{NumericalDataColLabels}[$FileIndex]} = ();

    next unless $TextFilesInfo{FileOkay}[$FileIndex];

    my(@MatchedColNums, @MatchedColLabels);

    if ($OptionsInfo{Mode} =~ /^colnum$/i) {
      # Requested values are one-based column numbers; store them zero-based...
      for my $RequestedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        next unless $RequestedColNum <= $TextFilesInfo{ColCount}[$FileIndex];

        my $ZeroBasedColNum = $RequestedColNum - 1;
        push @MatchedColNums, $ZeroBasedColNum;
        push @MatchedColLabels, $TextFilesInfo{ColLabels}[$FileIndex][$ZeroBasedColNum];
      }
    }
    else {
      # Requested values are column labels; look up their column numbers...
      for my $RequestedColLabel (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        next unless exists($TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$RequestedColLabel});

        push @MatchedColNums, $TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$RequestedColLabel};
        push @MatchedColLabels, $RequestedColLabel;
      }
    }

    next unless @MatchedColNums;

    push @{$TextFilesInfo{NumericalDataColNums}[$FileIndex]}, @MatchedColNums;
    push @{$TextFilesInfo{NumericalDataColLabels}[$FileIndex]}, @MatchedColLabels;
  }
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
408
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Process option values...
#
# Rebuilds %OptionsInfo from the command line values collected in %Options:
#
#   . Mode and InDelim are copied over as specified.
#   . All, Count, Empty and NumericalDataCols fall back to 0, and DetailLevel
#     falls back to 1, whenever the corresponding option is unset or false.
#   . ParseLines, CountEmpty, CheckData and CheckNumericalData are 0/1 flags
#     derived from the all, empty, datacheck and numericaldatacols options.
#   . SpecifiedNumericalDataCols holds the comma delimited values of the
#     numericaldatacols option; it stays empty when the option is not used.
#
# Dies when mode is colnum and any of the specified numerical data column
# values fails the IsPositiveInteger check.
#
sub ProcessOptions {
  %OptionsInfo = ();

  # Values used as specified...
  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{InDelim} = $Options{indelim};

  # Values with a fallback...
  $OptionsInfo{All} = $Options{all} || 0;
  $OptionsInfo{Count} = $Options{count} || 0;
  $OptionsInfo{DetailLevel} = $Options{detail} || 1;
  $OptionsInfo{Empty} = $Options{empty} || 0;
  $OptionsInfo{NumericalDataCols} = $Options{numericaldatacols} || 0;

  # Flags controlling what gets collected while going over the files...
  $OptionsInfo{CountEmpty} = ($Options{all} || $Options{empty}) ? 1 : 0;
  $OptionsInfo{ParseLines} = ($OptionsInfo{CountEmpty} || $Options{numericaldatacols}) ? 1 : 0;
  $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
  $OptionsInfo{CheckNumericalData} = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;

  # Column numbers or labels to check for numerical data...
  @{$OptionsInfo{SpecifiedNumericalDataCols}} = ();
  if ($Options{numericaldatacols}) {
    @{$OptionsInfo{SpecifiedNumericalDataCols}} = split ",", $Options{numericaldatacols};

    if ($Options{mode} =~ /^colnum$/i) {
      # Column numbers must be positive integers; labels are taken as is...
      for my $SpecifiedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        next if IsPositiveInteger($SpecifiedColNum);
        die "Error: Invalid value $SpecifiedColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
      }
    }
  }
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
444
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets %Options to its defaults (detail: 1; mode: colnum; indelim: comma), lets
# GetOptions fill in whatever was specified on the command line, and validates
# the values of workingdir, mode, indelim and detail options.
#
# Dies with an explanatory message when GetOptions reports a failure, when the
# working directory is not a directory or chdir to it fails, or when mode,
# indelim or detail values are not valid. A specified working directory becomes
# the current directory before the remaining options are validated.
#
sub SetupScriptUsage {
  my(@OptionSpecs);

  # Default values...
  %Options = (detail => 1, mode => "colnum", indelim => "comma");

  # Retrieve all the options...
  @OptionSpecs = ("all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch over to the working directory...
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate the rest of the option values...
  die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n"
    if $Options{mode} !~ /^(colnum|collabel)$/i;

  die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n"
    if $Options{indelim} !~ /^(comma|semicolon)$/i;

  die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"
    if !IsPositiveInteger($Options{detail});
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
472
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
473 __END__
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
474
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
475 =head1 NAME
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
476
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
477 InfoTextFiles.pl - List information about TextFile(s)
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
478
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
479 =head1 SYNOPSIS
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
480
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
481 InfoTextFiles.pl TextFile(s)...
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
482
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
483 InfoTextFiles.pl [B<-a, --all>] [B<-c, --count>] [B<--datacheck>] [B<-d, --detail> infolevel] [B<-e, --empty>]
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
484 [B<-h, --help>] [B<--indelim> comma | semicolon] [B<-m, --mode> colnum | collabel]
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
485 [B<-n, --numericaldatacols> colnum,[colnum,...] | collabel,[collabel,...]]
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
486 [B<-w, --workingdir> dirname] TextFile(s)...
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
487
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
488 =head1 DESCRIPTION
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
489
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
490 List information about I<TextFile(s)> contents: number of lines and columns, empty
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
491 column values, and so on. The file names are separated by spaces.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
492 The valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
493 text files respectively. All other file names are ignored. All the text files in the
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
494 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
495 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
496 which doesn't correspond to the format indicated by B<--indelim> option is ignored.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
497
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
498 =head1 OPTIONS
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
499
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
500 =over 4
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
501
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
502 =item B<-a, --all>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
503
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
504 List all the available information.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
505
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
506 =item B<-c, --count>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
507
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
508 List number of rows and columns. This is B<default behavior>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
509
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
510 =item B<--datacheck>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
511
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
512 List number of numerical and non-numerical values for each column.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
513
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
514 =item B<-d, --detail> I<infolevel>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
515
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
516 Level of information to print about lines being ignored. Default: I<1>. Possible values:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
517 I<1, 2 or 3>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
518
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
519 =item B<-e, --empty>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
520
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
521 List number of empty row and column values.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
522
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
523 =item B<-h, --help>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
524
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
525 Print this help message.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
526
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
527 =item B<--indelim> I<comma | semicolon>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
528
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
529 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
530 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
531 delimiter.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
532
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
533 =item B<-m, --mode> I<colnum | collabel>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
534
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
535 Specify how to identify numerical data columns: using column number or column label.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
536 Possible values: I<colnum or collabel>. Default value: I<colnum>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
537
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
538 =item B<-n, --numericaldatacols> I<colnum,[colnum,...] | collabel,[collabel,...]>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
539
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
540 This value is mode specific. It is a list of column numbers or labels to check for
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
541 presence of numerical data only; otherwise, the value is flagged. Default value: I<all;all;...>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
542
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
543 For I<colnum> mode, input value format is: I<colnum,colnum,...>. The same comma delimited list is applied to each I<TextFile>. Example:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
544
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
545 1,3,5
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
546 "2,4,6"
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
547
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
548 For I<collabel> mode, input value format is: I<collabel,collabel,...>. The same comma delimited list is applied to each I<TextFile>. Example:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
549
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
550 "MW,SumNO,SumNHOH"
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
551
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
552
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
553 =item B<-w, --workingdir> I<dirname>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
554
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
555 Location of working directory. Default: current directory.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
556
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
557 =back
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
558
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
559 =head1 EXAMPLES
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
560
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
561 To count number of lines and columns in Text file(s), type:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
562
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
563 % InfoTextFiles.pl Sample1.csv
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
564 % InfoTextFiles.pl Sample1.csv Sample1.tsv
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
565 % InfoTextFiles.pl *.csv *.tsv
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
566
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
567 To count number of lines, columns and empty values in Sample1.csv file and print
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
568 detailed information, type:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
569
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
570 % InfoTextFiles.pl -d 3 -e Sample1.csv
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
571
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
572 To track all available information and non-numerical values for Mol_ID and MolWeight
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
573 columns in Sample1.csv file and print detailed information, type:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
574
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
575 % InfoTextFiles.pl -d 3 -a -m collabel -n Mol_ID,MolWeight Sample1.csv
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
576
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
577 =head1 AUTHOR
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
578
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
579 Manish Sud <msud@san.rr.com>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
580
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
581 =head1 SEE ALSO
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
582
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
583 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
584
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
585 =head1 COPYRIGHT
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
586
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
587 Copyright (C) 2015 Manish Sud. All rights reserved.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
588
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
589 This file is part of MayaChemTools.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
590
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
591 MayaChemTools is free software; you can redistribute it and/or modify it under
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
592 the terms of the GNU Lesser General Public License as published by the Free
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
593 Software Foundation; either version 3 of the License, or (at your option)
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
594 any later version.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
595
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
596 =cut