MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoTextFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.30 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 
  38 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  39 
  40 # Autoflush STDOUT
  41 $| = 1;
  42 
  43 # Starting message...
  44 $ScriptName = basename($0);
  45 print "\n$ScriptName: Starting...\n\n";
  46 $StartTime = new Benchmark;
  47 
  48 # Get the options and setup script...
  49 SetupScriptUsage();
  50 if ($Options{help} || @ARGV < 1) {
  51   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  52 }
  53 
  54 my(@TextFilesList);
  55 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  56 
  57 # Process options...
  58 print "Processing options...\n";
  59 my(%OptionsInfo);
  60 ProcessOptions();
  61 
  62 print "Checking input text file(s)...\n";
  63 my(%TextFilesInfo);
  64 RetrieveTextFilesInfo();
  65 ProcessColumnsInfo();
  66 
  67 # Generate output files...
  68 my($FileIndex);
  69 if (@TextFilesList > 1) {
  70   print "\nProcessing text files...\n";
  71 }
  72 for $FileIndex (0 .. $#TextFilesList) {
  73   if ($TextFilesInfo{FileOkay}[$FileIndex]) {
  74     print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  75     ListTextFileInfo($FileIndex);
  76   }
  77 }
  78 ListTotalSizeOfFiles();
  79 
  80 print "\n$ScriptName:Done...\n\n";
  81 
  82 $EndTime = new Benchmark;
  83 $TotalTime = timediff ($EndTime, $StartTime);
  84 print "Total time: ", timestr($TotalTime), "\n";
  85 
  86 ###############################################################################
  87 
  88 # List appropriate information...
  89 sub ListTextFileInfo {
  90   my($Index) = @_;
  91   my($TextFile,  $Line, $InDelim, $LineCount, $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount, $Label, $Value, $ColNum, $EmptyColValueFound, $PrintTextLine, $NonNumericalDataFound, @ColLabels, @LineWords, %EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap,);
  92 
  93   $TextFile = $TextFilesList[$Index];
  94   $InDelim = $TextFilesInfo{InDelim}[$Index];
  95   @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
  96 
  97   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
  98 
  99   $LineCount = 0;
 100   $EmptyLinesCount = 0;
 101   $EmptyColDataLinesCount = 0;
 102   $GreaterThanMaxColLinesCount = 0;
 103 
 104   %EmptyColValuesCountMap = ();
 105   %NonEmptyColValuesCountMap = ();
 106   %SpecifiedNonNumericalColValuesCountMap = ();
 107   %NonNumericalColValuesCountMap = ();
 108   %NumericalColValuesCountMap = ();
 109 
 110   if ($OptionsInfo{ParseLines}) {
 111     # Skip over column labels from old file...
 112     if (<TEXTFILE>) {
 113       $LineCount++;
 114       LINE: while ($Line = <TEXTFILE>) {
 115         $LineCount++;
 116         $PrintTextLine = 0;
 117         $Line =~ s/(\r\n)|(\r)|\n//g;
 118         @LineWords = quotewords($InDelim, 0, $Line);
 119         if ($OptionsInfo{CountEmpty}) {
 120           # Count lines with no data...
 121           if (!@LineWords) {
 122             $EmptyLinesCount++;
 123             if ($OptionsInfo{DetailLevel} >= 2) {
 124               print "Line number $LineCount is empty...\n";
 125             }
 126             next LINE;
 127           }
 128           # Count lines with empty data for some columns...
 129           $EmptyColValueFound = 0;
 130           VALUE: for $Value (@LineWords) {
 131               if (!IsNotEmpty($Value)) {
 132                 $EmptyColValueFound = 1;
 133                 next VALUE;
 134               }
 135           }
 136           if ($EmptyColValueFound) {
 137             $EmptyColDataLinesCount++;
 138             if ($OptionsInfo{DetailLevel} >= 2) {
 139               print "Line number $LineCount contains empty column value(s)...\n";
 140             }
 141             $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
 142           }
 143           # Count lines with columns greater than the column label line...
 144           if (@LineWords > @ColLabels) {
 145             $GreaterThanMaxColLinesCount++;
 146             if ($OptionsInfo{DetailLevel} >= 2) {
 147               print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
 148             }
 149             $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
 150           }
 151           # Count empty values for each coulmn...
 152           for $ColNum (0 .. $#LineWords) {
 153             if ($ColNum < @ColLabels) {
 154               $Label = $ColLabels[$ColNum];
 155               if (IsNotEmpty($LineWords[$ColNum])) {
 156                 if (exists($NonEmptyColValuesCountMap{$Label})) {
 157                   $NonEmptyColValuesCountMap{$Label} += 1;
 158                 }
 159                 else {
 160                   $NonEmptyColValuesCountMap{$Label} = 1;
 161                 }
 162               }
 163               else {
 164                 $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
 165                 if (exists($EmptyColValuesCountMap{$Label})) {
 166                   $EmptyColValuesCountMap{$Label} += 1;
 167                 }
 168                 else {
 169                   $EmptyColValuesCountMap{$Label} = 1;
 170                 }
 171               }
 172             }
 173           }
 174         }
 175         if ($OptionsInfo{CheckData}) {
 176           for $ColNum (0 .. $#LineWords) {
 177             if ($ColNum < @ColLabels) {
 178               if (IsNumerical($LineWords[$ColNum])) {
 179                 $Label = $ColLabels[$ColNum];
 180                 if (exists($NumericalColValuesCountMap{$Label})) {
 181                   $NumericalColValuesCountMap{$Label} += 1;
 182                 }
 183                 else {
 184                   $NumericalColValuesCountMap{$Label} = 1;
 185                 }
 186               }
 187               else {
 188                 $Label = $ColLabels[$ColNum];
 189                 if (IsNotEmpty($LineWords[$ColNum])) {
 190                   if (exists($NonNumericalColValuesCountMap{$Label})) {
 191                     $NonNumericalColValuesCountMap{$Label} += 1;
 192                   }
 193                   else {
 194                     $NonNumericalColValuesCountMap{$Label} = 1;
 195                   }
 196                 }
 197               }
 198             }
 199           }
 200         }
 201         if ($OptionsInfo{CheckNumericalData}) {
 202           $NonNumericalDataFound = 0;
 203           for $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) {
 204             if ($ColNum < @LineWords) {
 205               if (!IsNumerical($LineWords[$ColNum])) {
 206                 $NonNumericalDataFound = 1;
 207                 $Label = $ColLabels[$ColNum];
 208                 if (exists($SpecifiedNonNumericalColValuesCountMap{$Label})) {
 209                   $SpecifiedNonNumericalColValuesCountMap{$Label} += 1;
 210                 }
 211                 else {
 212                   $SpecifiedNonNumericalColValuesCountMap{$Label} = 1;
 213                 }
 214               }
 215             }
 216           }
 217           if ($NonNumericalDataFound) {
 218             $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
 219             if ($OptionsInfo{DetailLevel} >=2 ) {
 220               print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
 221             }
 222           }
 223         }
 224         if ($PrintTextLine) {
 225           print "Line $LineCount: $Line\n\n";
 226         }
 227       }
 228     }
 229   }
 230   else {
 231     while (<TEXTFILE>) {
 232       $LineCount++;
 233     }
 234   }
 235   close TEXTFILE;
 236 
 237   print "\nNumber of lines: $LineCount\n";
 238   print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n";
 239   print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";
 240 
 241   if ($OptionsInfo{CountEmpty}) {
 242     print "\nNumber of lines with no data: $EmptyLinesCount\n";
 243     print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
 244     print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
 245     PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
 246     PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
 247   }
 248 
 249   if ($OptionsInfo{CheckData}) {
 250     print "\n";
 251     PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
 252     PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
 253     print "\n";
 254   }
 255 
 256   if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) {
 257     PrintDataInformation("Number of non-numerical data values for each column(s)", \@{$TextFilesInfo{NumericalDataColLabels}[$Index]}, \%SpecifiedNonNumericalColValuesCountMap);
 258   }
 259 
 260   # File size and modification information...
 261   print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n";
 262   print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n";
 263 }
 264 
 265 # Total size of all the fiels...
 266 sub ListTotalSizeOfFiles {
 267   my($FileOkayCount, $TotalSize, $Index);
 268 
 269   $FileOkayCount = 0;
 270   $TotalSize = 0;
 271 
 272   for $Index (0 .. $#TextFilesList) {
 273     if ($TextFilesInfo{FileOkay}[$Index]) {
 274       $FileOkayCount++;
 275       $TotalSize += $TextFilesInfo{FileSize}[$Index];
 276     }
 277   }
 278   if ($FileOkayCount > 1) {
 279     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 280   }
 281 }
 282 
 283 # List data information...
 284 sub PrintDataInformation {
 285   my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
 286   my($Line, $Label);
 287 
 288   $Line = "";
 289   for $Label (@{$DataLabelRef}) {
 290     $Line .= " \"$Label\" - " . (exists($DataLabelToValueMapRef->{$Label}) ? $DataLabelToValueMapRef->{$Label} : 0) . ",";
 291   }
 292   $Line =~ s/\,$//g;
 293   print "$InfoLabel: $Line\n";
 294 }
 295 
 296 # Retrieve information about input text files...
 297 sub RetrieveTextFilesInfo {
 298   my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels,  $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString);
 299 
 300   %TextFilesInfo = ();
 301   @{$TextFilesInfo{FileOkay}} = ();
 302   @{$TextFilesInfo{ColCount}} = ();
 303   @{$TextFilesInfo{ColLabels}} = ();
 304   @{$TextFilesInfo{ColLabelToNumMap}} = ();
 305   @{$TextFilesInfo{InDelim}} = ();
 306   @{$TextFilesInfo{FileSize}} = ();
 307   @{$TextFilesInfo{FileLastModified}} = ();
 308 
 309   FILELIST: for $Index (0 .. $#TextFilesList) {
 310     $TextFile = $TextFilesList[$Index];
 311 
 312     $TextFilesInfo{FileOkay}[$Index] = 0;
 313     $TextFilesInfo{ColCount}[$Index] = 0;
 314     $TextFilesInfo{InDelim}[$Index] = "";
 315     $TextFilesInfo{FileSize}[$Index] = 0;
 316     $TextFilesInfo{FileLastModified}[$Index] = '';
 317     @{$TextFilesInfo{ColLabels}[$Index]} = ();
 318     %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();
 319 
 320     if (!(-e $TextFile)) {
 321       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 322       next FILELIST;
 323     }
 324     if (!CheckFileType($TextFile, "csv tsv")) {
 325       warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
 326       next FILELIST;
 327     }
 328     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 329     if ($FileExt =~ /^tsv$/i) {
 330       $InDelim = "\t";
 331     }
 332     else {
 333       $InDelim = "\,";
 334       if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
 335         warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
 336         next FILELIST;
 337       }
 338       if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
 339         $InDelim = "\;";
 340       }
 341     }
 342 
 343     if (!open TEXTFILE, "$TextFile") {
 344       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 345       next FILELIST;
 346     }
 347 
 348     $Line = GetTextLine(\*TEXTFILE);
 349     @ColLabels = quotewords($InDelim, 0, $Line);
 350     close TEXTFILE;
 351 
 352     $TextFilesInfo{FileOkay}[$Index] = 1;
 353     $TextFilesInfo{InDelim}[$Index] = $InDelim;
 354 
 355     $TextFilesInfo{ColCount}[$Index] = @ColLabels;
 356     push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
 357     for $ColNum (0 .. $#ColLabels) {
 358       $ColLabel = $ColLabels[$ColNum];
 359       $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
 360     }
 361     $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
 362     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
 363     $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 364   }
 365 
 366 }
 367 
 368 # Make sure specified numerical data columns are okay...
 369 sub ProcessColumnsInfo {
 370   my($Index, $TextFile);
 371 
 372   @{$TextFilesInfo{NumericalDataColNums}} = ();
 373   @{$TextFilesInfo{NumericalDataColLabels}} = ();
 374 
 375   FILELIST: for $Index (0 .. $#TextFilesList) {
 376     $TextFile = $TextFilesList[$Index];
 377     @{$TextFilesInfo{NumericalDataColNums}[$Index]} = ();
 378     @{$TextFilesInfo{NumericalDataColLabels}[$Index]} = ();
 379 
 380     if ($TextFilesInfo{FileOkay}[$Index]) {
 381       my($SpecifiedColNum, $ColNum, $ColLabel, @SpecifiedColNums, @SpecifiedColLabels);
 382       @SpecifiedColNums = ();
 383       if ($OptionsInfo{Mode} =~ /^colnum$/i) {
 384         for $SpecifiedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
 385           if ($SpecifiedColNum <= $TextFilesInfo{ColCount}[$Index]) {
 386             $ColNum = $SpecifiedColNum - 1;
 387             push @SpecifiedColNums, $ColNum;
 388             push @SpecifiedColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
 389           }
 390         }
 391       }
 392       else {
 393         for $ColLabel (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
 394           if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
 395             $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
 396             push @SpecifiedColNums, $ColNum;
 397             push @SpecifiedColLabels, $ColLabel;
 398           }
 399         }
 400       }
 401       if (@SpecifiedColNums) {
 402         push @{$TextFilesInfo{NumericalDataColNums}[$Index]}, @SpecifiedColNums;
 403         push @{$TextFilesInfo{NumericalDataColLabels}[$Index]}, @SpecifiedColLabels;
 404       }
 405     }
 406   }
 407 }
 408 
 409 # Process option values...
 410 sub ProcessOptions {
 411   %OptionsInfo = ();
 412 
 413   $OptionsInfo{Mode} = $Options{mode};
 414 
 415   $OptionsInfo{All} = $Options{all} ? $Options{all} : 0;
 416   $OptionsInfo{Count} = $Options{count} ? $Options{count} : 0;
 417 
 418   $OptionsInfo{DetailLevel} = $Options{detail} ? $Options{detail} : 1;
 419 
 420   $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : 0;
 421 
 422   $OptionsInfo{InDelim} = $Options{indelim};
 423   $OptionsInfo{NumericalDataCols} = $Options{numericaldatacols} ? $Options{numericaldatacols} : 0;
 424 
 425   $OptionsInfo{ParseLines} = ($Options{all} || $Options{empty} || $Options{numericaldatacols}) ? 1 : 0;
 426   $OptionsInfo{CountEmpty} = ($Options{all} || $Options{empty}) ? 1 : 0;
 427   $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
 428   $OptionsInfo{CheckNumericalData} = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;
 429 
 430   @{$OptionsInfo{SpecifiedNumericalDataCols}} = ();
 431   if ($Options{numericaldatacols}) {
 432     @{$OptionsInfo{SpecifiedNumericalDataCols}} = split ",", $Options{numericaldatacols};
 433     if ($Options{mode} =~ /^colnum$/i) {
 434       my($ColNum);
 435       for $ColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
 436         if (!IsPositiveInteger($ColNum)) {
 437           die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
 438         }
 439       }
 440     }
 441   }
 442 
 443 }
 444 
 445 # Setup script usage  and retrieve command line arguments specified using various options...
 446 sub SetupScriptUsage {
 447 
 448   # Retrieve all the options...
 449   %Options = ();
 450   $Options{detail} = 1;
 451   $Options{mode} = "colnum";
 452   $Options{indelim} = "comma";
 453   if (!GetOptions(\%Options, "all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s")) {
 454     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 455   }
 456   if ($Options{workingdir}) {
 457     if (! -d $Options{workingdir}) {
 458       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 459     }
 460     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 461   }
 462   if ($Options{mode} !~ /^(colnum|collabel)$/i) {
 463     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";
 464   }
 465   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 466     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 467   }
 468   if (!IsPositiveInteger($Options{detail})) {
 469     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 470   }
 471 }
 472