MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ExtractFromTextFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:19 $
   5 # $Revision: 1.42 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use FileHandle;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 
  # File-scoped state. The subroutines defined further down in this file read
  # and write these lexicals directly instead of receiving them as arguments.
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  my(@TextFilesList, %OptionsInfo, %TextFilesInfo, $FileIndex);

  # Autoflush STDOUT so that progress messages show up as soon as they are printed.
  $| = 1;

  $StartTime = Benchmark->new;

  # Starting message...
  $ScriptName = basename($0);
  print "\n$ScriptName:Starting...\n\n";

  # Get the options and setup script; print the usage text and stop when help
  # is requested or no input file is named on the command line.
  SetupScriptUsage();
  die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

  @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

  # Process options...
  print "Processing options...\n";
  ProcessOptions();

  # Collect column information for all the text files...
  print "Checking input text file(s)...\n";
  RetrieveTextFilesInfo();
  RetrieveColumnsAndRowsInfo();

  # Generate output files, skipping every input that was flagged as not okay
  # during the checks above.
  print "\nProcessing text files...\n" if (@TextFilesList > 1);
  for $FileIndex (0 .. $#TextFilesList) {
    next unless $TextFilesInfo{FileOkay}[$FileIndex];
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ExtractFromTextFile($FileIndex);
  }
  print "\n$ScriptName:Done...\n\n";

  # Report the elapsed time for the whole run.
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  86 
  87 ###############################################################################
  88 
  # Extract appropriate data from text file...
  #
  # Dispatches on $OptionsInfo{Mode} (case-insensitive): "categories" and "rows"
  # go to their dedicated extractors; any other value is handled as column
  # extraction.
  #
  # Parameters:
  #   $Index - position of the file in @TextFilesList.
  sub ExtractFromTextFile {
    my($Index) = @_;
    my $Mode = $OptionsInfo{Mode};

    return ExtractCategoryData($Index) if ($Mode =~ /^categories$/i);
    return ExtractRowsData($Index) if ($Mode =~ /^rows$/i);
    return ExtractColumnData($Index);
  }
 103 
 # Generate category files...
 #
 # Writes a summary file listing every distinct value found in the category
 # column together with its row count, and then one file per category holding
 # the column labels followed by the rows that belong to that category.
 #
 # Parameters:
 #   $Index - position of the input file in @TextFilesList and in the parallel
 #            per-file arrays kept in %TextFilesInfo.
 #
 # Dies when the input file, the summary file or a category file can't be opened.
 sub ExtractCategoryData {
   my($Index) = @_;

   my $TextFile = $TextFilesList[$Index];
   my $NewTextFile = $TextFilesInfo{OutFile}[$Index];
   my $CategoryCol = $TextFilesInfo{CategoryColNum}[$Index];
   my $InDelim = $TextFilesInfo{InDelim}[$Index];
   my @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

   # Collect category data...
   my(%CategoriesNameToCountMap, %CategoriesNameToLinesMap);

   # Three-argument open: the file name is never interpreted as a mode or a pipe.
   open TEXTFILE, "<", $TextFile or die "Couldn't open $TextFile: $! \n";

   # Skip label line (read into a lexical so that the global $_ is left alone)...
   my $LabelLine = <TEXTFILE>;

   while (my $Line = GetTextLine(\*TEXTFILE)) {
     my @LineWords = quotewords($InDelim, 0, $Line);

     # $CategoryCol is a zero-based index, so it has to be strictly smaller than
     # the number of fields on the line. Rows without the column, or with an
     # undefined value in it, are collected under the empty category name.
     my $CategoryName = "";
     if ($CategoryCol < @LineWords && defined $LineWords[$CategoryCol]) {
       $CategoryName = $LineWords[$CategoryCol];
     }

     $CategoriesNameToCountMap{$CategoryName} += 1;
     push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line;
   }
   close TEXTFILE;

   # Setup file names for individual category files; spaces are removed from
   # the generated names...
   my %CategoriesNameToFileNameMap;
   for my $CategoryName (keys %CategoriesNameToCountMap) {
     my $CategoryFile = $TextFilesInfo{CategoryOutFileRoot}[$Index] . "$CategoryName" . ".$TextFilesInfo{OutFileExt}[$Index]";
     $CategoryFile =~ s/ //g;
     $CategoriesNameToFileNameMap{$CategoryName} = $CategoryFile;
   }

   # Write out summary file...
   print "Generating file $NewTextFile...\n";
   open NEWTEXTFILE, ">", $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

   # Write out column labels...
   my @SummaryWords = ("Category","Count");
   my $SummaryLine = JoinWords(\@SummaryWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
   print NEWTEXTFILE "$SummaryLine\n";

   # Write out the category names and count...
   my @SortedCategoryNames = sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap;
   for my $CategoryName (@SortedCategoryNames) {
     @SummaryWords = ("$CategoryName","$CategoriesNameToCountMap{$CategoryName}");
     $SummaryLine = JoinWords(\@SummaryWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
     print NEWTEXTFILE "$SummaryLine\n";
   }
   close NEWTEXTFILE;

   # Write out a file for each category. Each file is opened, written and closed
   # in turn so that only one category handle is open at any time, no matter how
   # many distinct categories the input contains.
   my $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
   print "\nGenerating text files for each category...\n";

   for my $CategoryName (@SortedCategoryNames) {
     my $CategoryFile = $CategoriesNameToFileNameMap{$CategoryName};
     print "Generating file $CategoryFile...\n";

     open my $CategoryFileHandle, ">", $CategoryFile or die "Couldn't open $CategoryFile: $! \n";
     print $CategoryFileHandle "$ColLabelLine\n";
     for my $CategoryLine (@{$CategoriesNameToLinesMap{$CategoryName}}) {
       # Re-split and re-join to apply the output delimiter and quoting.
       my @LineWords = quotewords($InDelim, 0, $CategoryLine);
       my $OutLine = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
       print $CategoryFileHandle "$OutLine\n";
     }
     close $CategoryFileHandle;
   }
 }
 192 
 # Extract data for specific columns...
 #
 # Writes a new text file containing only the columns listed in
 # $TextFilesInfo{ColNumsToExtract}[$Index] (zero-based column indices), in that
 # order, using the output delimiter and quoting from %OptionsInfo.
 #
 # Parameters:
 #   $Index - position of the input file in @TextFilesList.
 #
 # Dies when the input or the output file can't be opened.
 sub ExtractColumnData {
   my($Index) = @_;

   my $TextFile = $TextFilesList[$Index];
   my $NewTextFile = $TextFilesInfo{OutFile}[$Index];
   my $InDelim = $TextFilesInfo{InDelim}[$Index];
   my @ColNumsToExtract = @{$TextFilesInfo{ColNumsToExtract}[$Index]};

   print "Generating file $NewTextFile...\n";

   # Three-argument open: the file names are never interpreted as a mode or a pipe.
   open TEXTFILE, "<", $TextFile or die "Couldn't open $TextFile: $! \n";
   open NEWTEXTFILE, ">", $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

   # Skip label line in the input (read into a lexical so that the global $_ is
   # left alone); the labels written out come from %TextFilesInfo.
   my $LabelLine = <TEXTFILE>;

   # Write out column labels...
   my @ColLabels = map { $TextFilesInfo{ColLabels}[$Index][$_] } @ColNumsToExtract;
   my $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
   print NEWTEXTFILE "$ColLabelLine\n";

   while (my $Line = GetTextLine(\*TEXTFILE)) {
     my @LineWords = quotewords($InDelim, 0, $Line);

     # Columns missing from a short line, or holding an undefined value, are
     # written out as empty strings so that every row keeps the same width.
     my @ColValues = map {
       ($_ < @LineWords && defined $LineWords[$_]) ? $LineWords[$_] : ""
     } @ColNumsToExtract;

     my $ColValuesLine = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
     print NEWTEXTFILE "$ColValuesLine\n";
   }
   close NEWTEXTFILE;
   close TEXTFILE;
 }
 233 
 # Extract data for specific rows...
 #
 # Opens the input and output files, writes the column labels and then hands
 # both file handles to the row extractor selected by
 # $OptionsInfo{SpecifiedRowsMode}. When the mode matches none of the known
 # values, only the label line is written.
 #
 # Parameters:
 #   $Index - position of the input file in @TextFilesList.
 #
 # Dies when the input or the output file can't be opened.
 sub ExtractRowsData {
   my($Index) = @_;

   my $TextFile = $TextFilesList[$Index];
   my $NewTextFile = $TextFilesInfo{OutFile}[$Index];
   my $SpecifiedRowsMode = $OptionsInfo{SpecifiedRowsMode};

   print "Generating file $NewTextFile...\n";

   # Three-argument open: the file names are never interpreted as a mode or a pipe.
   open TEXTFILE, "<", $TextFile or die "Couldn't open $TextFile: $! \n";
   open NEWTEXTFILE, ">", $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

   # Consume the label line from the input so that the extractors start at the
   # first data row; the labels written out come from %TextFilesInfo.
   my $LabelLine = <TEXTFILE>;

   # Write out column labels...
   my @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
   my $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
   print NEWTEXTFILE "$ColLabelLine\n";

   if ($SpecifiedRowsMode =~ /^rowsbycolvalue$/i) {
     ExtractRowsByColValue($Index, \*TEXTFILE, \*NEWTEXTFILE);
   }
   elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluelist$/i) {
     ExtractRowsByColValueList($Index, \*TEXTFILE, \*NEWTEXTFILE);
   }
   elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluerange$/i) {
     ExtractRowsByColValueRange($Index, \*TEXTFILE, \*NEWTEXTFILE);
   }
   elsif ($SpecifiedRowsMode =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
     ExtractRowByMinOrMaxColValue($Index, \*TEXTFILE, \*NEWTEXTFILE);
   }
   elsif ($SpecifiedRowsMode =~ /^rownums$/i) {
     ExtractRowsByRowNums($Index, \*TEXTFILE, \*NEWTEXTFILE);
   }
   elsif ($SpecifiedRowsMode =~ /^rownumrange$/i) {
     ExtractRowsByRowNumRange($Index, \*TEXTFILE, \*NEWTEXTFILE);
   }

   close NEWTEXTFILE;
   close TEXTFILE;
 }
 279 
 # Extract rows by column value...
 #
 # $TextFilesInfo{RowValues}[$Index] is read as a flat list of triplets:
 # zero-based column index, reference value, criterion. A row is written out
 # only when it passes every triplet: "le" and "ge" compare numerically, "eq"
 # compares as strings. Rows that lack one of the referenced columns are skipped.
 #
 # Parameters:
 #   $Index          - position of the input file in @TextFilesList.
 #   $TextFileRef    - handle to read the data rows from.
 #   $NewTextFileRef - handle to write the matching rows to.
 sub ExtractRowsByColValue {
   my($Index, $TextFileRef, $NewTextFileRef) = @_;

   my $InDelim = $TextFilesInfo{InDelim}[$Index];
   my $Triplets = $TextFilesInfo{RowValues}[$Index];

   LINE: while (my $Line = GetTextLine($TextFileRef)) {
     my @Fields = quotewords($InDelim, 0, $Line);

     for (my $Pos = 0; $Pos < @{$Triplets}; $Pos += 3) {
       my($ColNum, $RefValue, $Criterion) = @{$Triplets}[$Pos .. $Pos + 2];

       next LINE if ($ColNum > $#Fields);
       my $FieldValue = $Fields[$ColNum];

       if ($Criterion =~ /^le$/i) {
         next LINE if ($FieldValue > $RefValue);
       }
       elsif ($Criterion =~ /^ge$/i) {
         next LINE if ($FieldValue < $RefValue);
       }
       elsif ($Criterion =~ /^eq$/i) {
         next LINE if ($FieldValue ne $RefValue);
       }
     }

     # Every criterion passed: write it out...
     my $OutLine = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
     print $NewTextFileRef "$OutLine\n";
   }
 }
 # Extract rows by column value list...
 #
 # The first entry of $TextFilesInfo{RowValues}[$Index] is the zero-based
 # column index; the remaining entries are the accepted values. A row is written
 # out when its value in that column is, as a string, one of the accepted values.
 #
 # Parameters:
 #   $Index          - position of the input file in @TextFilesList.
 #   $TextFileRef    - handle to read the data rows from.
 #   $NewTextFileRef - handle to write the matching rows to.
 sub ExtractRowsByColValueList {
   my($Index, $TextFileRef, $NewTextFileRef) = @_;

   my $InDelim = $TextFilesInfo{InDelim}[$Index];
   my($ColNum, @AcceptedValues) = @{$TextFilesInfo{RowValues}[$Index]};

   # Setup a col value map for constant time membership checks...
   my %AcceptedValueMap = map { ($_ => $_) } @AcceptedValues;

   while (my $Line = GetTextLine($TextFileRef)) {
     my @Fields = quotewords($InDelim, 0, $Line);

     # Rows too short to contain the column can't match.
     next if ($ColNum > $#Fields);

     my $FieldValue = $Fields[$ColNum];
     next unless (exists $AcceptedValueMap{$FieldValue});

     my $OutLine = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
     print $NewTextFileRef "$OutLine\n";
   }
 }
 345 
 # Extract row by minimum or maximum column value...
 #
 # Scans all rows and writes out the single row whose value in the column
 # $TextFilesInfo{RowValues}[$Index][0] is numerically largest when
 # $OptionsInfo{SpecifiedRowsMode} is "rowbymaxcolvalue", and numerically
 # smallest otherwise. On ties the row seen first wins; rows lacking the column
 # are skipped. Nothing is written when no row qualifies.
 #
 # Parameters:
 #   $Index          - position of the input file in @TextFilesList.
 #   $TextFileRef    - handle to read the data rows from.
 #   $NewTextFileRef - handle to write the selected row to.
 sub ExtractRowByMinOrMaxColValue {
   my($Index, $TextFileRef, $NewTextFileRef) = @_;

   my $InDelim = $TextFilesInfo{InDelim}[$Index];
   my $ColNum = $TextFilesInfo{RowValues}[$Index][0];

   my($BestValue, $BestLine, $HaveBest) = ('', '', 0);

   while (my $Line = GetTextLine($TextFileRef)) {
     my @Fields = quotewords($InDelim, 0, $Line);
     next if ($ColNum > $#Fields);

     # The first usable row seeds the search without any comparison.
     if (!$HaveBest) {
       ($BestValue, $BestLine, $HaveBest) = ($Fields[$ColNum], $Line, 1);
       next;
     }

     my $IsBetter;
     if ($OptionsInfo{SpecifiedRowsMode} =~ /^rowbymaxcolvalue$/i) {
       $IsBetter = ($Fields[$ColNum] > $BestValue);
     }
     else {
       $IsBetter = ($Fields[$ColNum] < $BestValue);
     }
     ($BestValue, $BestLine) = ($Fields[$ColNum], $Line) if $IsBetter;
   }

   if ($BestLine) {
     my @BestFields = quotewords($InDelim, 0, $BestLine);
     my $OutLine = JoinWords(\@BestFields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
     print $NewTextFileRef "$OutLine\n";
   }
 }
 385 
 # Extract rows by column value range...
 #
 # $TextFilesInfo{RowValues}[$Index] supplies the zero-based column index, the
 # minimum and the maximum, in that order. A row is written out when its value
 # in that column lies numerically within [minimum, maximum], both ends
 # included. Rows lacking the column are skipped.
 #
 # Parameters:
 #   $Index          - position of the input file in @TextFilesList.
 #   $TextFileRef    - handle to read the data rows from.
 #   $NewTextFileRef - handle to write the matching rows to.
 sub ExtractRowsByColValueRange {
   my($Index, $TextFileRef, $NewTextFileRef) = @_;

   my $InDelim = $TextFilesInfo{InDelim}[$Index];
   my($ColNum, $LowerBound, $UpperBound) = @{$TextFilesInfo{RowValues}[$Index]}[0 .. 2];

   while (my $Line = GetTextLine($TextFileRef)) {
     my @Fields = quotewords($InDelim, 0, $Line);
     next if ($ColNum > $#Fields);

     my $FieldValue = $Fields[$ColNum];
     next unless ($FieldValue >= $LowerBound && $FieldValue <= $UpperBound);

     my $OutLine = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
     print $NewTextFileRef "$OutLine\n";
   }
 }
 408 
 # Extract rows by row number range...
 #
 # Writes out the rows whose number lies within the inclusive range given by
 # the first two entries of $TextFilesInfo{RowValues}[$Index]. The counter
 # starts at 1 and is incremented before each comparison, so the first line read
 # from $TextFileRef is row number 2. Reading stops once the counter passes the
 # upper end of the range.
 #
 # Parameters:
 #   $Index          - position of the input file in @TextFilesList.
 #   $TextFileRef    - handle to read the data rows from.
 #   $NewTextFileRef - handle to write the selected rows to.
 sub ExtractRowsByRowNumRange {
   my($Index, $TextFileRef, $NewTextFileRef) = @_;

   my $InDelim = $TextFilesInfo{InDelim}[$Index];
   my($FirstRowNum, $LastRowNum) = @{$TextFilesInfo{RowValues}[$Index]}[0, 1];

   my $RowNum = 1;
   while (my $Line = GetTextLine($TextFileRef)) {
     $RowNum++;

     my $InRange = ($RowNum >= $FirstRowNum && $RowNum <= $LastRowNum);

     # Past the end of the range: nothing further can match.
     last if (!$InRange && $RowNum > $LastRowNum);
     next unless $InRange;

     my @Fields = quotewords($InDelim, 0, $Line);
     my $OutLine = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
     print $NewTextFileRef "$OutLine\n";
   }
 }
 431 
 # Extract rows by row numbers...
 #
 # Writes out the rows whose number appears in $TextFilesInfo{RowValues}[$Index].
 # The counter starts at 1 and is incremented before each lookup, so the first
 # line read from $TextFileRef is row number 2. Row numbers are matched as hash
 # keys, i.e. as strings. Reading stops once the counter passes the numerically
 # largest requested row number.
 #
 # Parameters:
 #   $Index          - position of the input file in @TextFilesList.
 #   $TextFileRef    - handle to read the data rows from.
 #   $NewTextFileRef - handle to write the selected rows to.
 sub ExtractRowsByRowNums {
   my($Index, $TextFileRef, $NewTextFileRef) = @_;

   my $InDelim = $TextFilesInfo{InDelim}[$Index];
   my @RequestedRowNums = @{$TextFilesInfo{RowValues}[$Index]};

   # Setup a row nums map and find the largest requested row number...
   my %RequestedRowNumMap = map { ($_ => $_) } @RequestedRowNums;
   my $LargestRowNum = $RequestedRowNums[0];
   for my $Requested (@RequestedRowNums) {
     $LargestRowNum = $Requested if ($Requested > $LargestRowNum);
   }

   my $RowNum = 1;
   while (my $Line = GetTextLine($TextFileRef)) {
     $RowNum++;

     if (exists $RequestedRowNumMap{$RowNum}) {
       my @Fields = quotewords($InDelim, 0, $Line);
       my $OutLine = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
       print $NewTextFileRef "$OutLine\n";
       next;
     }

     last if ($RowNum > $LargestRowNum);
   }
 }
 462 
 # Retrieve text file columns and rows information for specified options...
 #
 # Runs the column processing first and the row processing second; takes no
 # arguments.
 sub RetrieveColumnsAndRowsInfo {
   ProcessColumnsInfo();
   return ProcessRowsInfo();
 }
 468 
 # Make sure the specified columns exist in text files...
 #
 # For every input file still flagged okay, this fills in:
 #   $TextFilesInfo{CategoryColNum}[$Index]   - zero-based category column
 #                                              ("categories" mode; 0 when no
 #                                              category column is specified).
 #   $TextFilesInfo{ColNumsToExtract}[$Index] - zero-based columns to extract
 #                                              ("columns" mode; column 0 when
 #                                              none are specified).
 # Columns are given as 1-based numbers when $OptionsInfo{ColMode} is "colnum"
 # and as column labels otherwise. A file is flagged not okay, with a warning,
 # when its category column, or every one of its requested columns, is missing.
 sub ProcessColumnsInfo {
   my $CategoryColSpec = $OptionsInfo{SpecifiedCategoryCol};

   @{$TextFilesInfo{$_}} = () for qw(CategoryColNum ColNumsToExtract);

   for my $FileNum (0 .. $#TextFilesList) {
     my $FilePath = $TextFilesList[$FileNum];

     $TextFilesInfo{CategoryColNum}[$FileNum] = 0;
     @{$TextFilesInfo{ColNumsToExtract}[$FileNum]} = ();

     next unless $TextFilesInfo{FileOkay}[$FileNum];

     my $ColCount = $TextFilesInfo{ColCount}[$FileNum];
     my $LabelToNumMap = $TextFilesInfo{ColLabelToNumMap}[$FileNum];

     if ($OptionsInfo{Mode} =~ /^categories$/i) {
       # Stays 0 (first column) when no category column is specified; becomes
       # undef when the specified one can't be found in this file.
       my $CategoryColNum = 0;
       if ($CategoryColSpec) {
         if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
           $CategoryColNum = ($CategoryColSpec <= $ColCount) ? $CategoryColSpec - 1 : undef;
         }
         else {
           $CategoryColNum = exists($LabelToNumMap->{$CategoryColSpec}) ? $LabelToNumMap->{$CategoryColSpec} : undef;
         }
       }

       if (defined $CategoryColNum) {
         $TextFilesInfo{CategoryColNum}[$FileNum] = $CategoryColNum;
       }
       else {
         warn "Warning: Ignoring file $FilePath: Category column specified, $CategoryColSpec, using \"--categorycol\" option doesn't exist\n";
         $TextFilesInfo{FileOkay}[$FileNum] = 0;
       }
     }
     elsif ($OptionsInfo{Mode} =~ /^columns$/i) {
       my @FoundColNums;

       if (!@{$OptionsInfo{SpecifiedColumns}}) {
         # No columns specified: default to the first column.
         @FoundColNums = (0);
       }
       elsif ($OptionsInfo{ColMode} =~ /^colnum$/i) {
         # Keep the in-range 1-based numbers, converted to zero-based indices.
         @FoundColNums = map { $_ - 1 } grep { $_ >= 1 && $_ <= $ColCount } @{$OptionsInfo{SpecifiedColumns}};
       }
       else {
         # Keep the labels present in this file, converted to their indices.
         @FoundColNums = map { $LabelToNumMap->{$_} } grep { exists($LabelToNumMap->{$_}) } @{$OptionsInfo{SpecifiedColumns}};
       }

       if (@FoundColNums) {
         push @{$TextFilesInfo{ColNumsToExtract}[$FileNum]}, @FoundColNums;
       }
       else {
         warn "Warning: Ignoring file $FilePath: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist\n";
         $TextFilesInfo{FileOkay}[$FileNum] = 0;
       }
     }
   }
 }
 554 
 # Process specified rows info...
 #
 # In "rows" mode, turns $OptionsInfo{SpecifiedRowValues} into the per-file list
 # $TextFilesInfo{RowValues}[$Index] read by the row extractors:
 #   rowsbycolvalue                 - triplets of zero-based column index,
 #                                    value and criterion.
 #   rowsbycolvaluelist, rowsbycolvaluerange,
 #   rowbymincolvalue, rowbymaxcolvalue
 #                                  - zero-based column index followed by the
 #                                    remaining specified values.
 #   rownums, rownumrange           - the specified values, unchanged.
 # Column IDs are column labels when $OptionsInfo{ColMode} is "collabel" and
 # 1-based column numbers otherwise.
 #
 # A file for which nothing usable remains is flagged not okay with a warning.
 # When only some of the rowsbycolvalue triplets refer to missing columns, the
 # file is still processed with the remaining triplets and a warning names the
 # ignored columns, as dropping a criterion makes the row selection less strict.
 sub ProcessRowsInfo {
   @{$TextFilesInfo{RowValues}} = ();

   FILELIST: for my $Index (0 .. $#TextFilesList) {
     my $TextFile = $TextFilesList[$Index];
     @{$TextFilesInfo{RowValues}[$Index]} = ();

     next FILELIST if ($OptionsInfo{Mode} !~ /^rows$/i);
     next FILELIST if (!$TextFilesInfo{FileOkay}[$Index]);

     # Map a column ID to its zero-based index in the current file; returns
     # undef when the file has no such column.
     my $ResolveColNum = sub {
       my($ColID) = @_;
       if ($OptionsInfo{ColMode} =~ /^collabel$/i) {
         return undef unless (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID});
         return $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID};
       }
       return ($ColID >= 1 && $ColID <= $TextFilesInfo{ColCount}[$Index]) ? $ColID - 1 : undef;
     };

     my @SpecifiedRowValues = @{$OptionsInfo{SpecifiedRowValues}};
     my(@RowValues, @MissingColIDs);

     if ($OptionsInfo{RowsMode} =~ /^rowsbycolvalue$/i) {
       for (my $ValueIndex = 0; $ValueIndex < @SpecifiedRowValues; $ValueIndex += 3) {
         my($ColID, $Value, $Criterion) = @SpecifiedRowValues[$ValueIndex .. $ValueIndex + 2];
         my $ColNum = $ResolveColNum->($ColID);
         if (defined $ColNum) {
           push @RowValues, ($ColNum, $Value, $Criterion);
         }
         else {
           push @MissingColIDs, $ColID;
         }
       }
     }
     elsif ($OptionsInfo{RowsMode} =~ /^(rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue)$/i) {
       # Process column id; the rest of the specified values are kept as is...
       my($ColID, @RemainingValues) = @SpecifiedRowValues;
       my $ColNum = $ResolveColNum->($ColID);
       if (defined $ColNum) {
         push @RowValues, $ColNum, @RemainingValues;
       }
       else {
         push @MissingColIDs, $ColID;
       }
     }
     elsif ($OptionsInfo{RowsMode} =~ /^(rownums|rownumrange)$/i) {
       push @RowValues, @SpecifiedRowValues;
     }

     my $MissingColIDsList = join(", ", @MissingColIDs);
     if (@RowValues) {
       push @{$TextFilesInfo{RowValues}[$Index]}, @RowValues;
       if (@MissingColIDs) {
         warn "Warning: File $TextFile: Ignoring row selection criteria for column(s), $MissingColIDsList, specified using \"--rows\" option: Column(s) don't exist\n";
       }
     }
     else {
       warn "Warning: Ignoring file $TextFile: Column specified, $MissingColIDsList, using \"--rows\" option doesn't exist\n";
       $TextFilesInfo{FileOkay}[$Index] = 0;
     }
   }
 }
 639 
 # Retrieve information about input text files...
 #
 # Resets %TextFilesInfo and, for every file in @TextFilesList, records in the
 # parallel per-file arrays: whether the file is okay to process (FileOkay), its
 # input delimiter (InDelim), column labels (ColLabels), column count (ColCount),
 # label to zero-based column index map (ColLabelToNumMap), output file name and
 # extension (OutFile, OutFileExt) and the root used to name category files
 # (CategoryOutFileRoot).
 #
 # A file is left flagged not okay, with a warning, when it doesn't exist, isn't
 # a csv or tsv file, can't be opened, is a csv file while "--indelim" is neither
 # comma nor semicolon, would be overwritten by its own output, or when its
 # output file already exists and overwriting wasn't requested.
 sub RetrieveTextFilesInfo {
   %TextFilesInfo = ();

   @{$TextFilesInfo{FileOkay}} = ();
   @{$TextFilesInfo{ColCount}} = ();
   @{$TextFilesInfo{ColLabels}} = ();
   @{$TextFilesInfo{ColLabelToNumMap}} = ();
   @{$TextFilesInfo{InDelim}} = ();
   @{$TextFilesInfo{OutFile}} = ();
   @{$TextFilesInfo{OutFileExt}} = ();
   @{$TextFilesInfo{CategoryOutFileRoot}} = ();

   FILELIST: for my $Index (0 .. $#TextFilesList) {
     my $TextFile = $TextFilesList[$Index];

     $TextFilesInfo{FileOkay}[$Index] = 0;
     $TextFilesInfo{ColCount}[$Index] = 0;
     $TextFilesInfo{InDelim}[$Index] = "";
     $TextFilesInfo{OutFile}[$Index] = "";
     $TextFilesInfo{OutFileExt}[$Index] = "";
     $TextFilesInfo{CategoryOutFileRoot}[$Index] = "";

     @{$TextFilesInfo{ColLabels}[$Index]} = ();
     %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

     if (!(-e $TextFile)) {
       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
       next FILELIST;
     }
     if (!CheckFileType($TextFile, "csv tsv")) {
       warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
       next FILELIST;
     }

     # Pick the input delimiter: tab for tsv files, otherwise comma or semicolon
     # as requested by the "--indelim" option.
     my($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
     my $InDelim;
     if ($FileExt =~ /^tsv$/i) {
       $InDelim = "\t";
     }
     else {
       if (!($OptionsInfo{InDelim} =~ /^(comma|semicolon)$/i)) {
         warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
         next FILELIST;
       }
       $InDelim = ($OptionsInfo{InDelim} =~ /^semicolon$/i) ? "\;" : "\,";
     }

     # Three-argument open: the file name is never interpreted as a mode or a pipe.
     if (!open(TEXTFILE, "<", $TextFile)) {
       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
       next FILELIST;
     }

     # Only the first line is needed here: it holds the column labels.
     my $Line = GetTextLine(\*TEXTFILE);
     my @ColLabels = quotewords($InDelim, 0, $Line);
     close TEXTFILE;

     # Output extension follows the output delimiter. ProcessOptions stores the
     # delimiter character itself in $OptionsInfo{OutDelim}, so compare against
     # a literal tab; the option keyword "tab" is accepted as well.
     my $OutFileExt = "csv";
     if ($OptionsInfo{OutDelim} eq "\t" || $OptionsInfo{OutDelim} =~ /^tab$/i) {
       $OutFileExt = "tsv";
     }

     # An explicit output root is honored only for a single input file;
     # otherwise the name is derived from the input file name and the mode.
     my $OutFileRoot;
     if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
       if ($RootFileName && $RootFileExt) {
         $FileName = $RootFileName;
       }
       else {
         $FileName = $OptionsInfo{OutFileRoot};
       }
       $OutFileRoot = $FileName;
     }
     else {
       $OutFileRoot = $FileName;
       $OutFileRoot .= ($OptionsInfo{Mode} =~ /^categories$/i) ? "CategoriesSummary" : (($OptionsInfo{Mode} =~ /^rows$/i) ? "ExtractedRows" : "ExtractedColumns");
     }
     my $CategoryOutFileRoot = "$FileName" . "Category";

     my $OutFile = $OutFileRoot . ".$OutFileExt";
     if (lc($OutFile) eq lc($TextFile)) {
       warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
       next FILELIST;
     }

     if (!$OptionsInfo{Overwrite}) {
       if (-e $OutFile) {
         warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
         next FILELIST;
       }
     }

     # All checks passed: record the information for this file.
     $TextFilesInfo{FileOkay}[$Index] = 1;
     $TextFilesInfo{InDelim}[$Index] = $InDelim;
     $TextFilesInfo{CategoryOutFileRoot}[$Index] = $CategoryOutFileRoot;
     $TextFilesInfo{OutFile}[$Index] = "$OutFile";
     $TextFilesInfo{OutFileExt}[$Index] = "$OutFileExt";

     $TextFilesInfo{ColCount}[$Index] = @ColLabels;
     push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;

     for my $ColNum (0 .. $#ColLabels) {
       my $ColLabel = $ColLabels[$ColNum];
       $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
     }
   }
 }
 752 
 753 # Process option values...
 754 sub ProcessOptions {
 755   my(@SpecifiedColumns, @SpecifiedRowValues);
 756 
 757   %OptionsInfo = ();
 758 
 759   $OptionsInfo{Mode} = $Options{mode};
 760 
 761   $OptionsInfo{ColMode} = $Options{colmode};
 762 
 763   $OptionsInfo{CategoryCol} = defined $Options{categorycol} ? $Options{categorycol} : undef;
 764   $OptionsInfo{SpecifiedCategoryCol} = "";
 765 
 766   if (defined $Options{categorycol}) {
 767     my(@SpecifiedValues) = split ",", $Options{categorycol};
 768     if (@SpecifiedValues != 1) {
 769       die "Error: Invalid number of values, ",scalar(@SpecifiedValues), " using \"--categorycol\" option: Only one value is allowed.\n";
 770     }
 771     $OptionsInfo{SpecifiedCategoryCol} = $SpecifiedValues[0];
 772     if ($Options{colmode} =~ /^colnum$/i) {
 773       if (!IsPositiveInteger($OptionsInfo{SpecifiedCategoryCol})) {
 774         die "Error: Category column value, $OptionsInfo{SpecifiedCategoryCol}, specified using \"--categorycol\" is not valid. Allowed integer values: > 0.\n";
 775       }
 776     }
 777   }
 778 
 779   $OptionsInfo{Columns} = defined $Options{columns} ? $Options{columns} : undef;
 780   @{$OptionsInfo{SpecifiedColumns}} = ();
 781   @SpecifiedColumns = ();
 782 
 783   if (defined $Options{columns}) {
 784     my(@SpecifiedValues) = split ",", $Options{columns};
 785     if ($Options{colmode} =~ /^colnum$/i) {
 786       my($ColValue);
 787       for $ColValue (@SpecifiedValues) {
 788         if (!IsPositiveInteger($ColValue)) {
 789           die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n";
 790         }
 791       }
 792     }
 793     push @SpecifiedColumns, @SpecifiedValues;
 794   }
 795   @{$OptionsInfo{SpecifiedColumns}} = @SpecifiedColumns;
 796 
 797   $OptionsInfo{InDelim} = $Options{indelim};
 798 
 799   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
 800   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 801   $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
 802 
 803   $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef;
 804 
 805   # Process any specified rows values...
 806   @SpecifiedRowValues = ();
 807   @{$OptionsInfo{SpecifiedRowValues}} = ();
 808 
 809   $OptionsInfo{RowsMode} = $Options{rowsmode};
 810   $OptionsInfo{Rows} = defined $Options{rows} ? $Options{rows} : undef;
 811 
 812   $OptionsInfo{SpecifiedRowsMode} = $Options{rowsmode};
 813 
 814   if (defined $Options{rows}) {
 815     (@SpecifiedRowValues) = split ",", $Options{rows};
 816   }
 817   else {
 818     if ($Options{rowsmode} !~ /^rownums$/i) {
 819       die "Error: Specify value for \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n";
 820     }
 821     push @SpecifiedRowValues, "1";
 822   }
 823   @{$OptionsInfo{SpecifiedRowValues}} = @SpecifiedRowValues;
 824 
 825   my($SpecifiedColID, $SpecifiedRowID);
 826   # Make sure specified values are okay...
 827   if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) {
 828     if (@SpecifiedRowValues % 3) {
 829       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain triplets.\n";
 830     }
 831     # Triplet format: colid,value,criteria. Criterion: le,ge,eq
 832     my($Index, $ColID, $Criterion, $Value);
 833     for ($Index = 0; $Index < @SpecifiedRowValues; $Index = $Index + 3) {
 834       $ColID = $SpecifiedRowValues[$Index];
 835       $Value = $SpecifiedRowValues[$Index + 1];
 836       $Criterion = $SpecifiedRowValues[$Index + 2];
 837       if ($Options{colmode} =~ /^colnum$/i) {
 838         if (!IsPositiveInteger($ColID)) {
 839           die "Error: Invalid column id, $ColID, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 840         }
 841       }
 842       if ($Criterion !~ /^(eq|le|ge)$/i) {
 843         die "Error: Invalid criterion value, $Criterion, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed values: le, ge, or eq.\n";
 844       }
 845     }
 846   }
 847   elsif ($Options{rowsmode} =~ /^rowsbycolvaluelist$/i) {
 848     ($SpecifiedColID) = $SpecifiedRowValues[0];
 849     if ($Options{colmode} =~ /^colnum$/i) {
 850       if (!IsPositiveInteger($SpecifiedColID)) {
 851         die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 852       }
 853     }
 854     if (@SpecifiedRowValues == 1) {
 855       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain more than one value\n";
 856     }
 857   }
 858   elsif ($Options{rowsmode} =~ /^rowsbycolvaluerange$/i) {
 859     if (@SpecifiedRowValues != 3) {
 860       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain three values\n";
 861     }
 862     ($SpecifiedColID) = $SpecifiedRowValues[0];
 863     if ($Options{colmode} =~ /^colnum$/i) {
 864       if (!IsPositiveInteger($SpecifiedColID)) {
 865         die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 866       }
 867     }
 868     if ($SpecifiedRowValues[1] >= $SpecifiedRowValues[2]) {
 869       die "Error: Invalid value triplet - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: second value < third value\n";
 870     }
 871   }
 872   elsif ($Options{rowsmode} =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
 873     if (@SpecifiedRowValues != 1) {
 874       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nOnly one value is allowed.\n";
 875     }
 876     ($SpecifiedColID) = $SpecifiedRowValues[0];
 877     if ($Options{colmode} =~ /^colnum$/i) {
 878       if (!IsPositiveInteger($SpecifiedColID)) {
 879         die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 880       }
 881     }
 882   }
 883   elsif ($Options{rowsmode} =~ /^rownums$/i) {
 884     for $SpecifiedRowID (@SpecifiedRowValues) {
 885       if (!IsPositiveInteger($SpecifiedRowID)) {
 886         die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 887       }
 888     }
 889   }
 890   elsif ($Options{rowsmode} =~ /^rownumrange$/i) {
 891     if (@SpecifiedRowValues != 2) {
 892       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain only two values.\n";
 893     }
 894     for $SpecifiedRowID (@SpecifiedRowValues) {
 895       if (!IsPositiveInteger($SpecifiedRowID)) {
 896         die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 897       }
 898     }
 899     if ($SpecifiedRowValues[0] >= $SpecifiedRowValues[1]) {
 900       die "Error: Invalid value pair -  ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: First value < second value\n";
 901     }
 902   }
 903 }
 904 
 905 # Setup script usage  and retrieve command line arguments specified using various options...
 906 sub SetupScriptUsage {
 907 
 908   # Setup default and retrieve all the options...
 909   %Options = ();
 910   $Options{colmode} = "colnum";
 911   $Options{indelim} = "comma";
 912   $Options{mode} = "columns";
 913   $Options{outdelim} = "comma";
 914   $Options{quote} = "yes";
 915   $Options{rowsmode} = "rownums";
 916 
 917   if (!GetOptions(\%Options, "categorycol=s", "columns=s", "colmode|c=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "rows=s", "rowsmode=s", "workingdir|w=s")) {
 918     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 919   }
 920   if ($Options{workingdir}) {
 921     if (! -d $Options{workingdir}) {
 922       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 923     }
 924     chdir $Options{workingdir} || die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 925   }
 926   if ($Options{mode} !~ /^(columns|rows|categories)$/i) {
 927     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: columns, rows or categories \n";
 928   }
 929   if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
 930     die "Error: The value specified, $Options{colmode}, for option \"--colmode\" is not valid. Allowed values: colnum or collabel \n";
 931   }
 932   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 933     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 934   }
 935   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 936     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 937   }
 938   if ($Options{quote} !~ /^(yes|no)$/i) {
 939     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 940   }
 941   if ($Options{rowsmode} !~ /^(rowsbycolvalue|rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue|rownums|rownumrange)$/i) {
 942     die "Error: The value specified, $Options{rowsmode}, for option \"--rowsmode\" is not valid. Allowed values: rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue, rownum, rownumrange\n";
 943   }
 944 }