0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: InfoTextFiles.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:20 $
|
|
5 # $Revision: 1.30 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability or fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Text::ParseWords;
|
|
34 use Benchmark;
|
|
35 use FileUtil;
|
|
36 use TextUtil;
|
|
37
|
|
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Turn on autoflush for STDOUT so progress messages show up as they are printed...
$| = 1;

# Announce the start and begin timing the run...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Parse the command line; show usage when help is requested or no file is named...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList) = ExpandFileNames(\@ARGV, "csv tsv");

# Turn raw option values into the settings used by the rest of the script...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect per-file information and resolve the specified numerical data columns...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# List information for each usable file, followed by the combined size...
my($FileIndex);
print "\nProcessing text files...\n" if @TextFilesList > 1;

for $FileIndex (0 .. $#TextFilesList) {
  next unless $TextFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  ListTextFileInfo($FileIndex);
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
85
|
|
86 ###############################################################################
|
|
87
|
|
# List appropriate information...
#
# Print a report for one input text file: number of lines, number of columns and
# column labels, followed by the optional empty value, data type and specified
# numerical column summaries, and finally the file size and modification time.
#
# Arguments:
#   $Index - position of the file in @TextFilesList; the same position is used to
#            look up the per-file entries in %TextFilesInfo.
#
# Uses the file-level @TextFilesList, %TextFilesInfo and %OptionsInfo. Data lines
# are parsed only when $OptionsInfo{ParseLines} is set; otherwise lines are just
# counted. Dies when the file can't be opened.
#
sub ListTextFileInfo {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $HeaderLine, $Line, $InDelim, $Label, $ReportLines, $ShowLineText, $PrintTextLine, $EmptyColValueFound, $NonNumericalDataFound, @ColLabels, @LineWords);
  my(%EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  # Three-argument open: the file name comes from the command line and must never
  # be interpreted as an open mode or a pipe...
  open $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";

  my($LineCount, $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount) = (0, 0, 0, 0);

  # Detail level 2 and above reports flagged line numbers; level 3 and above also
  # echoes the text of the flagged line...
  $ReportLines = ($OptionsInfo{DetailLevel} >= 2) ? 1 : 0;
  $ShowLineText = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;

  if ($OptionsInfo{ParseLines}) {
    # Skip over the column labels line. Test for definedness and not truth: a
    # label line consisting of just "0" is still a line...
    $HeaderLine = <$TextFileHandle>;
    if (defined $HeaderLine) {
      $LineCount++;
      LINE: while ($Line = <$TextFileHandle>) {
        $LineCount++;
        $PrintTextLine = 0;
        $Line =~ s/(\r\n)|(\r)|\n//g;
        @LineWords = quotewords($InDelim, 0, $Line);

        if ($OptionsInfo{CountEmpty}) {
          # Count lines with no data...
          if (!@LineWords) {
            $EmptyLinesCount++;
            print "Line number $LineCount is empty...\n" if $ReportLines;
            next LINE;
          }

          # Count lines with empty data for some columns; one empty value is
          # enough to flag the line, so stop at the first one...
          $EmptyColValueFound = 0;
          VALUE: for my $Value (@LineWords) {
            if (!IsNotEmpty($Value)) {
              $EmptyColValueFound = 1;
              last VALUE;
            }
          }
          if ($EmptyColValueFound) {
            $EmptyColDataLinesCount++;
            print "Line number $LineCount contains empty column value(s)...\n" if $ReportLines;
            $PrintTextLine = $ShowLineText;
          }

          # Count lines with columns greater than the column label line...
          if (@LineWords > @ColLabels) {
            $GreaterThanMaxColLinesCount++;
            if ($ReportLines) {
              print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
            }
            $PrintTextLine = $ShowLineText;
          }

          # Count empty and non-empty values for each labeled column; values
          # beyond the last column label have no label to be counted under...
          for my $ColNum (0 .. $#LineWords) {
            last if $ColNum >= @ColLabels;

            $Label = $ColLabels[$ColNum];
            if (IsNotEmpty($LineWords[$ColNum])) {
              $NonEmptyColValuesCountMap{$Label}++;
            }
            else {
              $PrintTextLine = $ShowLineText;
              $EmptyColValuesCountMap{$Label}++;
            }
          }
        }

        if ($OptionsInfo{CheckData}) {
          # Count numerical and non-numerical values for each labeled column;
          # empty values are counted as neither...
          for my $ColNum (0 .. $#LineWords) {
            last if $ColNum >= @ColLabels;

            $Label = $ColLabels[$ColNum];
            if (IsNumerical($LineWords[$ColNum])) {
              $NumericalColValuesCountMap{$Label}++;
            }
            elsif (IsNotEmpty($LineWords[$ColNum])) {
              $NonNumericalColValuesCountMap{$Label}++;
            }
          }
        }

        if ($OptionsInfo{CheckNumericalData}) {
          # Flag non-numerical values in the columns specified as numerical...
          $NonNumericalDataFound = 0;
          for my $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) {
            next unless $ColNum < @LineWords;
            next if IsNumerical($LineWords[$ColNum]);

            $NonNumericalDataFound = 1;
            $SpecifiedNonNumericalColValuesCountMap{$ColLabels[$ColNum]}++;
          }
          if ($NonNumericalDataFound) {
            $PrintTextLine = $ShowLineText;
            if ($ReportLines) {
              print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
            }
          }
        }

        if ($PrintTextLine) {
          print "Line $LineCount: $Line\n\n";
        }
      }
    }
  }
  else {
    # Nothing to inspect; just count the lines...
    while (<$TextFileHandle>) {
      $LineCount++;
    }
  }
  close $TextFileHandle;

  print "\nNumber of lines: $LineCount\n";
  print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n";
  print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";

  if ($OptionsInfo{CountEmpty}) {
    print "\nNumber of lines with no data: $EmptyLinesCount\n";
    print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
    print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
    PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
    PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
  }

  if ($OptionsInfo{CheckData}) {
    print "\n";
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
    PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
    print "\n";
  }

  if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) {
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@{$TextFilesInfo{NumericalDataColLabels}[$Index]}, \%SpecifiedNonNumericalColValuesCountMap);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n";
}
|
|
264
|
|
# Total size of all the files...
#
# Add up the sizes recorded in %TextFilesInfo for every file marked okay and
# print the total; nothing is printed unless more than one file is okay.
#
sub ListTotalSizeOfFiles {
  my($OkayFilesCount, $CombinedSize) = (0, 0);

  for my $FileNum (0 .. $#TextFilesList) {
    next unless $TextFilesInfo{FileOkay}[$FileNum];

    $OkayFilesCount++;
    $CombinedSize += $TextFilesInfo{FileSize}[$FileNum];
  }

  if ($OkayFilesCount > 1) {
    print "\nTotal size of $OkayFilesCount files: ", FormatFileSize($CombinedSize), "\n";
  }
}
|
|
282
|
|
# List data information...
#
# Print one line of the form '<InfoLabel>:  "label" - count, "label" - count'.
#
# Arguments:
#   $InfoLabel              - text printed before the colon.
#   $DataLabelRef           - reference to the list of labels, in print order.
#   $DataLabelToValueMapRef - reference to a hash of label to value; a label
#                             missing from the hash is printed with a value of 0.
#
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;

  my(@Entries) = map {
    my($Count) = exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0;
    " \"$_\" - " . $Count;
  } @{$DataLabelRef};

  print "$InfoLabel: " . join(",", @Entries) . "\n";
}
|
|
295
|
|
# Retrieve information about input text files...
#
# Fill the file-level %TextFilesInfo with one entry per file in @TextFilesList,
# indexed by the position of the file in that list:
#
#   FileOkay         - 1 when the file exists, is a csv/tsv file and could be
#                      opened; 0 otherwise.
#   InDelim          - delimiter used to split lines: tab for tsv files; comma or
#                      semicolon, per $OptionsInfo{InDelim}, for all other files.
#   ColCount         - number of column labels on the first line.
#   ColLabels        - list of column labels.
#   ColLabelToNumMap - hash of column label to zero-based column number.
#   FileSize         - value returned by FileSize().
#   FileLastModified - "<time>; <date>" string built from the values returned by
#                      FormattedFileModificationTimeAndDate().
#
# Files that fail a check are reported with a warning and left marked not okay.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString, @ColLabels);

  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{FileSize}} = ();
  @{$TextFilesInfo{FileLastModified}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start with "not okay" defaults so that a skipped file still has an entry...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{FileSize}[$Index] = 0;
    $TextFilesInfo{FileLastModified}[$Index] = '';
    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Three-argument open: the file name comes from the command line and must
    # never be interpreted as an open mode or a pipe. The bareword handle is
    # kept because it is handed to GetTextLine() as a glob reference...
    if (!open(TEXTFILE, "<", $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # NOTE(review): GetTextLine() is defined outside this file; the guard below
    # assumes it may return an undefined value when no line is available, in
    # which case the file is treated as having no columns - confirm.
    $Line = GetTextLine(\*TEXTFILE);
    @ColLabels = defined($Line) ? quotewords($InDelim, 0, $Line) : ();
    close TEXTFILE;

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }

    $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
    $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }

}
|
|
367
|
|
# Make sure specified numerical data columns are okay...
#
# For each file in @TextFilesList, store in %TextFilesInfo the zero-based column
# numbers (NumericalDataColNums) and labels (NumericalDataColLabels) of the
# columns named in $OptionsInfo{SpecifiedNumericalDataCols} which are present in
# that file. Column numbers beyond the column count and unknown labels are
# dropped silently; files not marked okay get empty lists.
#
sub ProcessColumnsInfo {
  $TextFilesInfo{NumericalDataColNums} = [];
  $TextFilesInfo{NumericalDataColLabels} = [];

  for my $FileNum (0 .. $#TextFilesList) {
    my(@ValidColNums, @ValidColLabels);

    if ($TextFilesInfo{FileOkay}[$FileNum]) {
      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        # Specified column numbers start at 1...
        for my $OneBasedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          next if $OneBasedColNum > $TextFilesInfo{ColCount}[$FileNum];

          my($ZeroBasedColNum) = $OneBasedColNum - 1;
          push @ValidColNums, $ZeroBasedColNum;
          push @ValidColLabels, $TextFilesInfo{ColLabels}[$FileNum][$ZeroBasedColNum];
        }
      }
      else {
        for my $SpecifiedLabel (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          next unless exists($TextFilesInfo{ColLabelToNumMap}[$FileNum]{$SpecifiedLabel});

          push @ValidColNums, $TextFilesInfo{ColLabelToNumMap}[$FileNum]{$SpecifiedLabel};
          push @ValidColLabels, $SpecifiedLabel;
        }
      }
    }

    $TextFilesInfo{NumericalDataColNums}[$FileNum] = [@ValidColNums];
    $TextFilesInfo{NumericalDataColLabels}[$FileNum] = [@ValidColLabels];
  }
}
|
|
408
|
|
# Process option values...
#
# Translate the raw command line values in %Options into the file-level
# %OptionsInfo used by the rest of the script:
#
#   Mode, InDelim, DetailLevel - copied from the corresponding options; the
#                                detail level falls back to 1.
#   All, Count, Empty,
#   NumericalDataCols          - option value, or 0 when not specified.
#   ParseLines                 - 1 when any requested report needs the data lines
#                                to be split into column values.
#   CountEmpty                 - 1 to report empty lines and empty column values.
#   CheckData                  - 1 to report numerical/non-numerical value counts.
#   CheckNumericalData         - 1 to check the specified numerical data columns.
#   SpecifiedNumericalDataCols - list of column numbers or labels obtained by
#                                splitting the "--numericaldatacols" value on commas.
#
# Dies when a column number specified in colnum mode is not a positive integer.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{All} = $Options{all} ? $Options{all} : 0;
  $OptionsInfo{Count} = $Options{count} ? $Options{count} : 0;

  $OptionsInfo{DetailLevel} = $Options{detail} ? $Options{detail} : 1;

  $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : 0;

  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{NumericalDataCols} = $Options{numericaldatacols} ? $Options{numericaldatacols} : 0;

  # Every report other than the plain line and column count looks at individual
  # column values. "--datacheck" must be part of this test as well: otherwise
  # using it by itself never parses a data line and all its counts come out as 0...
  $OptionsInfo{ParseLines} = ($Options{all} || $Options{empty} || $Options{datacheck} || $Options{numericaldatacols}) ? 1 : 0;
  $OptionsInfo{CountEmpty} = ($Options{all} || $Options{empty}) ? 1 : 0;
  $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
  $OptionsInfo{CheckNumericalData} = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;

  @{$OptionsInfo{SpecifiedNumericalDataCols}} = ();
  if ($Options{numericaldatacols}) {
    @{$OptionsInfo{SpecifiedNumericalDataCols}} = split ",", $Options{numericaldatacols};
    if ($Options{mode} =~ /^colnum$/i) {
      # Column numbers start at 1...
      for my $ColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        if (!IsPositiveInteger($ColNum)) {
          die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
        }
      }
    }
  }

}
|
|
444
|
|
# Setup script usage and retrieve command line arguments specified using various options...
#
# Load the file-level %Options with default values, let GetOptions() override
# them from the command line, switch to the working directory when one is
# specified, and validate the mode, input delimiter and detail level values.
# Dies with an explanatory message on any invalid option or value.
#
sub SetupScriptUsage {

  # Default values for options which need one...
  %Options = (
    detail => 1,
    mode => "colnum",
    indelim => "comma",
  );

  # Retrieve all the options...
  my(@OptionSpecs) = ("all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Move to the working directory before any input file name is looked at...
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate the remaining option values...
  $Options{mode} =~ /^(colnum|collabel)$/i
    or die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";

  $Options{indelim} =~ /^(comma|semicolon)$/i
    or die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";

  IsPositiveInteger($Options{detail})
    or die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
}
|
|
472
|
|
473 __END__
|
|
474
|
|
475 =head1 NAME
|
|
476
|
|
477 InfoTextFiles.pl - List information about TextFile(s)
|
|
478
|
|
479 =head1 SYNOPSIS
|
|
480
|
|
481 InfoTextFiles.pl TextFile(s)...
|
|
482
|
|
483 InfoTextFiles.pl [B<-a, --all>] [B<-c, --count>] [B<--datacheck>] [B<-d, --detail> infolevel] [B<-e, --empty>]
|
|
484 [B<-h, --help>] [B<--indelim> comma | semicolon] [B<-m, --mode> colnum | collabel]
|
|
485 [B<-n, --numericaldatacols> colnum,[colnum,...] | collabel,[collabel,...]]
|
|
486 [B<-w, --workingdir> dirname] TextFile(s)...
|
|
487
|
|
488 =head1 DESCRIPTION
|
|
489
|
|
490 List information about I<TextFile(s)> contents: number of lines and columns, empty
|
|
491 column values, and so on. The file names are separated by spaces.
|
|
492 The valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited
|
|
493 text files respectively. All other file names are ignored. All the text files in the
|
|
494 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory
|
|
495 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file
|
|
496 which doesn't correspond to the format indicated by B<--indelim> option is ignored.
|
|
497
|
|
498 =head1 OPTIONS
|
|
499
|
|
500 =over 4
|
|
501
|
|
502 =item B<-a, --all>
|
|
503
|
|
504 List all the available information.
|
|
505
|
|
506 =item B<-c, --count>
|
|
507
|
|
508 List number of rows and columns. This is B<default behavior>.
|
|
509
|
|
510 =item B<--datacheck>
|
|
511
|
|
512 List number of numerical and non-numerical values for each column.
|
|
513
|
|
514 =item B<-d, --detail> I<infolevel>
|
|
515
|
|
516 Level of information to print about lines flagged during the checks. Default: I<1>. Possible values:
|
|
517 I<1, 2 or 3>.
|
|
518
|
|
519 =item B<-e, --empty>
|
|
520
|
|
521 List number of empty rows and empty column values.
|
|
522
|
|
523 =item B<-h, --help>
|
|
524
|
|
525 Print this help message.
|
|
526
|
|
527 =item B<--indelim> I<comma | semicolon>
|
|
528
|
|
529 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
|
|
530 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
|
|
531 delimiter.
|
|
532
|
|
533 =item B<-m, --mode> I<colnum | collabel>
|
|
534
|
|
535 Specify how to identify numerical data columns: using column number or column label.
|
|
536 Possible values: I<colnum or collabel>. Default value: I<colnum>.
|
|
537
|
|
538 =item B<-n, --numericaldatacols> I<colnum,[colnum,...] | collabel,[collabel,...]>
|
|
539
|
|
540 This value is mode specific. It is a list of column numbers or labels to check for
|
|
541 presence of numerical data only; otherwise, the value is flagged. Default value: I<none>.
|
|
542
|
|
543 For I<colnum> mode, input value format is: I<colnum,colnum,...>. Example:
|
|
544
|
|
545 1,3,5
|
|
546 "2,4,6"
|
|
547
|
|
548 For I<collabel> mode, input value format is: I<collabel,collabel,...>. Example:
|
|
549
|
|
550 "MW,SumNO,SumNHOH"
|
|
551
|
|
552
|
|
553 =item B<-w, --workingdir> I<dirname>
|
|
554
|
|
555 Location of working directory. Default: current directory.
|
|
556
|
|
557 =back
|
|
558
|
|
559 =head1 EXAMPLES
|
|
560
|
|
561 To count number of lines and columns in Text file(s), type:
|
|
562
|
|
563 % InfoTextFiles.pl Sample1.csv
|
|
564 % InfoTextFiles.pl Sample1.csv Sample1.tsv
|
|
565 % InfoTextFiles.pl *.csv *.tsv
|
|
566
|
|
567 To count number of lines, columns and empty values in Sample1.csv file and print
|
|
568 detailed information, type:
|
|
569
|
|
570 % InfoTextFiles.pl -d 3 -e Sample1.csv
|
|
571
|
|
572 To track all available information and non-numerical values for Mol_ID and MolWeight
|
|
573 columns in Sample1.csv file and print detailed information, type:
|
|
574
|
|
575 % InfoTextFiles.pl -d 3 -a -m collabel -n Mol_ID,MolWeight Sample1.csv
|
|
576
|
|
577 =head1 AUTHOR
|
|
578
|
|
579 Manish Sud <msud@san.rr.com>
|
|
580
|
|
581 =head1 SEE ALSO
|
|
582
|
|
583 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl
|
|
584
|
|
585 =head1 COPYRIGHT
|
|
586
|
|
587 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
588
|
|
589 This file is part of MayaChemTools.
|
|
590
|
|
591 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
592 the terms of the GNU Lesser General Public License as published by the Free
|
|
593 Software Foundation; either version 3 of the License, or (at your option)
|
|
594 any later version.
|
|
595
|
|
596 =cut
|