#!/usr/bin/perl -w
#
# $RCSfile: InfoTextFiles.pl,v $
# $Date: 2015/02/28 20:46:20 $
# $Revision: 1.30 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;

# File-scoped state shared with the subroutines defined further down.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# Generate output files...
my($FileIndex);
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for $FileIndex (0 .. $#TextFilesList) {
  # Files flagged as not okay were already reported while being checked.
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ListTextFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# List appropriate information...
#
# Prints the line count, column count and column labels for one input file,
# followed by whichever optional reports were requested through %OptionsInfo
# (CountEmpty, CheckData, CheckNumericalData), and finally the file size and
# last modification stamp recorded in %TextFilesInfo.
#
# Parameters:
#   $Index - Position of the file in @TextFilesList and in the parallel
#            per-file arrays kept in %TextFilesInfo.
#
# Dies when the file can't be opened for reading.
#
# NOTE(review): IsNotEmpty, IsNumerical, JoinWords and FormatFileSize are not
# defined in this file; presumably they are exported by TextUtil/FileUtil.
#
sub ListTextFileInfo {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $ColLabelsLine, $Line, $InDelim, $LineCount,
     $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount,
     $Label, $Value, $ColNum, $EmptyColValueFound, $PrintTextLine,
     $NonNumericalDataFound, $ReportLines, $EchoLines, @ColLabels, @LineWords,
     %EmptyColValuesCountMap, %NonEmptyColValuesCountMap,
     %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap,
     %NumericalColValuesCountMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  # Three-argument open: the file name is never interpreted as an open mode,
  # so names starting with ">" or ending with "|" are read as plain files.
  open($TextFileHandle, "<", $TextFile) or die "Error: Can't open $TextFile: $! \n";

  $LineCount = 0;
  $EmptyLinesCount = 0;
  $EmptyColDataLinesCount = 0;
  $GreaterThanMaxColLinesCount = 0;

  %EmptyColValuesCountMap = ();
  %NonEmptyColValuesCountMap = ();
  %SpecifiedNonNumericalColValuesCountMap = ();
  %NonNumericalColValuesCountMap = ();
  %NumericalColValuesCountMap = ();

  # Detail level 2 and up reports offending line numbers; level 3 and up
  # additionally echoes the offending line itself.
  $ReportLines = ($OptionsInfo{DetailLevel} >= 2) ? 1 : 0;
  $EchoLines = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;

  if ($OptionsInfo{ParseLines}) {
    # Skip over column labels from old file. Test for definedness instead of
    # truth so that a label line which evaluates to false is still consumed.
    $ColLabelsLine = <$TextFileHandle>;
    if (defined $ColLabelsLine) {
      $LineCount++;
      LINE: while (defined($Line = <$TextFileHandle>)) {
        $LineCount++;
        $PrintTextLine = 0;
        $Line =~ s/(\r\n)|(\r)|\n//g;
        @LineWords = quotewords($InDelim, 0, $Line);

        if ($OptionsInfo{CountEmpty}) {
          # Count lines with no data...
          if (!@LineWords) {
            $EmptyLinesCount++;
            if ($ReportLines) {
              print "Line number $LineCount is empty...\n";
            }
            next LINE;
          }

          # Count lines with empty data for some columns...
          $EmptyColValueFound = 0;
          VALUE: for $Value (@LineWords) {
            if (!IsNotEmpty($Value)) {
              $EmptyColValueFound = 1;
              # One empty value is enough to flag the line.
              last VALUE;
            }
          }
          if ($EmptyColValueFound) {
            $EmptyColDataLinesCount++;
            if ($ReportLines) {
              print "Line number $LineCount contains empty column value(s)...\n";
            }
            $PrintTextLine = $EchoLines;
          }

          # Count lines with columns greater than the column label line...
          if (@LineWords > @ColLabels) {
            $GreaterThanMaxColLinesCount++;
            if ($ReportLines) {
              print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
            }
            $PrintTextLine = $EchoLines;
          }

          # Count empty values for each column; values beyond the last
          # labeled column have no label to be counted under.
          for $ColNum (0 .. $#LineWords) {
            if ($ColNum < @ColLabels) {
              $Label = $ColLabels[$ColNum];
              if (IsNotEmpty($LineWords[$ColNum])) {
                $NonEmptyColValuesCountMap{$Label}++;
              }
              else {
                $PrintTextLine = $EchoLines;
                $EmptyColValuesCountMap{$Label}++;
              }
            }
          }
        }

        if ($OptionsInfo{CheckData}) {
          # Classify each labeled value as numerical or non-numerical; empty
          # values are counted in neither map.
          for $ColNum (0 .. $#LineWords) {
            if ($ColNum < @ColLabels) {
              $Label = $ColLabels[$ColNum];
              if (IsNumerical($LineWords[$ColNum])) {
                $NumericalColValuesCountMap{$Label}++;
              }
              elsif (IsNotEmpty($LineWords[$ColNum])) {
                $NonNumericalColValuesCountMap{$Label}++;
              }
            }
          }
        }

        if ($OptionsInfo{CheckNumericalData}) {
          # Only the columns explicitly specified as numerical are checked.
          $NonNumericalDataFound = 0;
          for $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) {
            if ($ColNum < @LineWords) {
              if (!IsNumerical($LineWords[$ColNum])) {
                $NonNumericalDataFound = 1;
                $Label = $ColLabels[$ColNum];
                $SpecifiedNonNumericalColValuesCountMap{$Label}++;
              }
            }
          }
          if ($NonNumericalDataFound) {
            $PrintTextLine = $EchoLines;
            if ($ReportLines) {
              print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
            }
          }
        }

        if ($PrintTextLine) {
          print "Line $LineCount: $Line\n\n";
        }
      }
    }
  }
  else {
    # No per-line analysis requested: just count the lines.
    while (<$TextFileHandle>) {
      $LineCount++;
    }
  }
  close $TextFileHandle;

  print "\nNumber of lines: $LineCount\n";
  print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n";
  print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";

  if ($OptionsInfo{CountEmpty}) {
    print "\nNumber of lines with no data: $EmptyLinesCount\n";
    print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
    print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
    PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
    PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
  }

  if ($OptionsInfo{CheckData}) {
    print "\n";
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
    PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
    print "\n";
  }

  if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) {
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@{$TextFilesInfo{NumericalDataColLabels}[$Index]}, \%SpecifiedNonNumericalColValuesCountMap);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n";
}

# Total size of all the files...
#
# Prints the combined size of all input files that passed validation. Nothing
# is printed unless more than one file is okay, since the per-file report
# already shows the size of a single file.
#
sub ListTotalSizeOfFiles {
  my($FileOkayCount, $TotalSize, $Index);

  $FileOkayCount = 0;
  $TotalSize = 0;

  for $Index (0 .. $#TextFilesList) {
    if ($TextFilesInfo{FileOkay}[$Index]) {
      $FileOkayCount++;
      $TotalSize += $TextFilesInfo{FileSize}[$Index];
    }
  }
  if ($FileOkayCount > 1) {
    print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
  }
}

# List data information...
#
# Prints one line of the form:
#
#   <InfoLabel>:  "<Label1>" - <Count1>, "<Label2>" - <Count2>...
#
# Parameters:
#   $InfoLabel              - Text printed before the colon.
#   $DataLabelRef           - Reference to the list of labels, in print order.
#   $DataLabelToValueMapRef - Reference to a hash mapping label to count;
#                             labels missing from the hash are printed as 0.
#
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
  my($Line);

  # Each entry carries its own leading space; entries are comma separated.
  $Line = join ",", map {
    " \"$_\" - " . (exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0)
  } @{$DataLabelRef};

  print "$InfoLabel: $Line\n";
}

# Retrieve information about input text files...
#
# Fills %TextFilesInfo with parallel per-file arrays indexed like
# @TextFilesList: FileOkay, ColCount, ColLabels, ColLabelToNumMap, InDelim,
# FileSize and FileLastModified. A file which doesn't exist, isn't a csv/tsv
# file, is a csv file paired with an invalid --indelim value, or can't be
# opened is reported with a warning and left with FileOkay set to 0.
#
# NOTE(review): CheckFileType, ParseFileName, GetTextLine, FileSize and
# FormattedFileModificationTimeAndDate are not defined in this file;
# presumably they are exported by FileUtil/TextUtil.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString);

  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{FileSize}} = ();
  @{$TextFilesInfo{FileLastModified}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start from "not okay" defaults so that every early exit below leaves a
    # complete, consistent entry behind.
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{FileSize}[$Index] = 0;
    $TextFilesInfo{FileLastModified}[$Index] = '';
    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # The delimiter follows the file extension: tab for tsv files; comma or
    # semicolon, as selected by --indelim, for everything else.
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = ",";
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = ";";
      }
    }

    # Three-argument open: the file name is never interpreted as an open mode,
    # so a name starting with ">" can't truncate a file. The TEXTFILE glob is
    # kept because GetTextLine is handed a reference to it.
    if (!open(TEXTFILE, "<", $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # Only the first line is needed here: it supplies the column labels.
    $Line = GetTextLine(\*TEXTFILE);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close TEXTFILE;

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
    $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
    $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }

}

# Make sure specified numerical data columns are okay...
#
# For every okay file, resolves the columns listed in
# $OptionsInfo{SpecifiedNumericalDataCols} into zero-based column numbers
# (NumericalDataColNums) and their labels (NumericalDataColLabels). In colnum
# mode the specified values are one-based column numbers; otherwise they are
# column labels. Columns a file doesn't have are silently skipped.
#
sub ProcessColumnsInfo {
  my($Index);

  @{$TextFilesInfo{NumericalDataColNums}} = ();
  @{$TextFilesInfo{NumericalDataColLabels}} = ();

  for $Index (0 .. $#TextFilesList) {
    @{$TextFilesInfo{NumericalDataColNums}[$Index]} = ();
    @{$TextFilesInfo{NumericalDataColLabels}[$Index]} = ();

    if ($TextFilesInfo{FileOkay}[$Index]) {
      my($SpecifiedColNum, $ColNum, $ColLabel, @SpecifiedColNums, @SpecifiedColLabels);

      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        for $SpecifiedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          if ($SpecifiedColNum <= $TextFilesInfo{ColCount}[$Index]) {
            # Command line column numbers start at 1; array indices at 0.
            $ColNum = $SpecifiedColNum - 1;
            push @SpecifiedColNums, $ColNum;
            push @SpecifiedColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
          }
        }
      }
      else {
        for $ColLabel (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
            $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
            push @SpecifiedColNums, $ColNum;
            push @SpecifiedColLabels, $ColLabel;
          }
        }
      }
      if (@SpecifiedColNums) {
        push @{$TextFilesInfo{NumericalDataColNums}[$Index]}, @SpecifiedColNums;
        push @{$TextFilesInfo{NumericalDataColLabels}[$Index]}, @SpecifiedColLabels;
      }
    }
  }
}

# Process option values...
#
# Copies the raw command line values in %Options into %OptionsInfo and
# derives the flags which drive the per-file analysis:
#
#   ParseLines         - data lines must be read and split into columns
#   CountEmpty         - report empty lines and empty column values
#   CheckData          - report numerical/non-numerical counts for all columns
#   CheckNumericalData - report non-numerical values in the specified columns
#
# Dies when --numericaldatacols contains a value which is not a positive
# integer while --mode is colnum.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{All} = $Options{all} ? $Options{all} : 0;
  $OptionsInfo{Count} = $Options{count} ? $Options{count} : 0;

  $OptionsInfo{DetailLevel} = $Options{detail} ? $Options{detail} : 1;

  $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : 0;

  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{NumericalDataCols} = $Options{numericaldatacols} ? $Options{numericaldatacols} : 0;

  # Every analysis below works on parsed data lines, so each option which
  # turns one of them on must turn on line parsing as well. This includes
  # --datacheck: without it here, the CheckData report would show 0 for
  # every column because no data line would ever be examined.
  $OptionsInfo{ParseLines} = ($Options{all} || $Options{empty} || $Options{datacheck} || $Options{numericaldatacols}) ? 1 : 0;
  $OptionsInfo{CountEmpty} = ($Options{all} || $Options{empty}) ? 1 : 0;
  $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
  $OptionsInfo{CheckNumericalData} = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;

  @{$OptionsInfo{SpecifiedNumericalDataCols}} = ();
  if ($Options{numericaldatacols}) {
    @{$OptionsInfo{SpecifiedNumericalDataCols}} = split ",", $Options{numericaldatacols};
    if ($Options{mode} =~ /^colnum$/i) {
      # Column numbers are one-based, so anything below 1 is rejected.
      my($ColNum);
      for $ColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        if (!IsPositiveInteger($ColNum)) {
          die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
        }
      }
    }
  }

}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with the default values, parses the command line with
# GetOptions, changes into the working directory when one is specified, and
# validates the values of the --mode, --indelim and --detail options. Dies
# with an explanatory message on any invalid option or value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{detail} = 1;
  $Options{mode} = "colnum";
  $Options{indelim} = "comma";
  if (!GetOptions(\%Options, "all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}