#!/usr/bin/perl -w
#
# $RCSfile: ExtractFromTextFiles.pl,v $
# $Date: 2015/02/28 20:46:19 $
# $Revision: 1.42 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use FileHandle;
use Benchmark;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

$StartTime = Benchmark->new;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect column information for all the text files...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
RetrieveColumnsAndRowsInfo();

# Generate output files...
my($FileIndex);
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ExtractFromTextFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Extract appropriate data from text file...
#
# Dispatches on $OptionsInfo{Mode}: "categories" and "rows" have dedicated
# handlers; every other value is treated as column extraction.
sub ExtractFromTextFile {
  my($Index) = @_;

  if ($OptionsInfo{Mode} =~ /^categories$/i) {
    ExtractCategoryData($Index);
  }
  elsif ($OptionsInfo{Mode} =~ /^rows$/i){
    ExtractRowsData($Index);
  }
  else {
    ExtractColumnData($Index);
  }
}

# Generate category files...
#
# Groups the data rows of text file $Index by the value found in its category
# column, writes a "Category,Count" summary to the file's OutFile and then one
# file per category holding the column labels plus the rows of that category.
# Rows that are too short to contain the category column are collected under
# the empty category name. Dies when a file can't be opened or closed.
sub ExtractCategoryData {
  my($Index) = @_;
  my($TextFile, $CategoryCol, $NewTextFile, $InDelim, @ColLabels);

  $TextFile = $TextFilesList[$Index];

  $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  $CategoryCol = $TextFilesInfo{CategoryColNum}[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  my($Line, @LineWords, $CategoryName, $CategoryCount, %CategoriesNameToCountMap, %CategoriesNameToLinesMap);

  # Collect category data...
  open my $TextFH, '<', $TextFile or die "Couldn't open $TextFile: $! \n";

  # Skip label line without disturbing the global $_ ...
  my $LabelLine = <$TextFH>;

  while ($Line = GetTextLine($TextFH)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    # $CategoryCol is a 0-based index: it is valid only when strictly less
    # than the number of words on the line...
    $CategoryName = ($CategoryCol < @LineWords && defined $LineWords[$CategoryCol]) ? $LineWords[$CategoryCol] : "";
    $CategoriesNameToCountMap{$CategoryName}++;
    push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line;
  }
  close $TextFH;

  # Write out summary file...
  print "Generating file $NewTextFile...\n";
  open my $NewTextFH, '>', $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Write out column labels...
  @LineWords = ("Category","Count");
  $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFH "$Line\n";

  # Write out the category names and count...
  for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) {
    $CategoryCount = $CategoriesNameToCountMap{$CategoryName};
    @LineWords = ("$CategoryName","$CategoryCount");
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFH "$Line\n";
  }
  close $NewTextFH or die "Couldn't close $NewTextFile: $! \n";

  # Write out a file for each category...
  my($ColLabelLine, $CategoryFile);

  $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print "\nGenerating text files for each category...\n";

  for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) {
    # File name is <CategoryOutFileRoot><CategoryName>.<ext> with spaces removed...
    $CategoryFile = $TextFilesInfo{CategoryOutFileRoot}[$Index] . "$CategoryName" . ".$TextFilesInfo{OutFileExt}[$Index]";
    $CategoryFile =~ s/ //g;

    print "Generating file $CategoryFile...\n";

    # Only one category file is open at any time, so the number of categories
    # isn't limited by the number of available file descriptors...
    open my $CategoryFH, '>', $CategoryFile or die "Couldn't open $CategoryFile: $! \n";
    print $CategoryFH "$ColLabelLine\n";
    for $Line (@{$CategoriesNameToLinesMap{$CategoryName}}) {
      @LineWords = quotewords($InDelim, 0, $Line);
      print $CategoryFH JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";
    }
    close $CategoryFH or die "Couldn't close $CategoryFile: $! \n";
  }
}

# Extract data for specific columns...
#
# Copies the columns listed in $TextFilesInfo{ColNumsToExtract}[$Index]
# (0-based, in the specified order) from text file $Index into its OutFile.
# A column missing from a short data line is written as an empty value.
sub ExtractColumnData {
  my($Index) = @_;
  my($TextFile, @ColNumsToExtract, $NewTextFile, $InDelim);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColNumsToExtract = @{$TextFilesInfo{ColNumsToExtract}[$Index]};

  print "Generating file $NewTextFile...\n";
  open my $TextFH, '<', $TextFile or die "Couldn't open $TextFile: $! \n";
  open my $NewTextFH, '>', $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Skip label line: the labels were already collected into %TextFilesInfo...
  my $LabelLine = <$TextFH>;

  # Write out column labels...
  my($Line, @LineWords, @ColLabels, @ColValues);
  @ColLabels = map { $TextFilesInfo{ColLabels}[$Index][$_] } @ColNumsToExtract;
  print $NewTextFH JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";

  while ($Line = GetTextLine($TextFH)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    @ColValues = map { ($_ < @LineWords && defined $LineWords[$_]) ? $LineWords[$_] : "" } @ColNumsToExtract;
    print $NewTextFH JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";
  }
  close $NewTextFH or die "Couldn't close $NewTextFile: $! \n";
  close $TextFH;
}

# Extract data for specific rows...
#
# Writes the column labels of text file $Index to its OutFile and hands both
# file handles to the row extractor matching $OptionsInfo{SpecifiedRowsMode}.
sub ExtractRowsData {
  my($Index) = @_;
  my($TextFile, $NewTextFile, $SpecifiedRowsMode);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesInfo{OutFile}[$Index];

  $SpecifiedRowsMode = $OptionsInfo{SpecifiedRowsMode};

  print "Generating file $NewTextFile...\n";
  open my $TextFH, '<', $TextFile or die "Couldn't open $TextFile: $! \n";
  open my $NewTextFH, '>', $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Skip the label line in the input and write out column labels...
  my $LabelLine = <$TextFH>;
  my @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
  print $NewTextFH JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";

  # Rows mode names are matched irrespective of their case...
  my %RowsModeToExtractorMap = (
    rowsbycolvalue      => \&ExtractRowsByColValue,
    rowsbycolvaluelist  => \&ExtractRowsByColValueList,
    rowsbycolvaluerange => \&ExtractRowsByColValueRange,
    rowbymincolvalue    => \&ExtractRowByMinOrMaxColValue,
    rowbymaxcolvalue    => \&ExtractRowByMinOrMaxColValue,
    rownums             => \&ExtractRowsByRowNums,
    rownumrange         => \&ExtractRowsByRowNumRange,
  );
  my $Extractor = $RowsModeToExtractorMap{lc $SpecifiedRowsMode};
  if (defined $Extractor) {
    $Extractor->($Index, $TextFH, $NewTextFH);
  }

  close $NewTextFH or die "Couldn't close $NewTextFile: $! \n";
  close $TextFH;
}

# Extract rows by column value...
#
# The criteria for file $Index are stored in $TextFilesInfo{RowValues}[$Index]
# as flattened (column number, reference value, criterion) triplets. A row is
# written to $NewTextFileRef only when it satisfies every triplet; a row too
# short to contain one of the columns is dropped. "le" and "ge" compare
# numerically, "eq" compares as strings.
sub ExtractRowsByColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $Delim = $TextFilesInfo{InDelim}[$Index];
  my $Triplets = $TextFilesInfo{RowValues}[$Index];

  ROW: while (my $Row = GetTextLine($TextFileRef)) {
    my @Fields = quotewords($Delim, 0, $Row);

    my $Pos = 0;
    while ($Pos < @{$Triplets}) {
      my($ColNum, $RefValue, $Criterion) = @{$Triplets}[$Pos, $Pos + 1, $Pos + 2];
      $Pos += 3;

      next ROW if $ColNum > $#Fields;

      my $Value = $Fields[$ColNum];
      next ROW if $Criterion =~ /^le$/i && $Value > $RefValue;
      next ROW if $Criterion =~ /^ge$/i && $Value < $RefValue;
      next ROW if $Criterion =~ /^eq$/i && $Value ne $RefValue;
    }

    # All criteria passed: write it out...
    my $OutLine = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$OutLine\n";
  }
}

# Extract rows by column value list...
#
# $TextFilesInfo{RowValues}[$Index] holds the column number followed by the
# accepted values; rows whose column value is one of them are written out.
sub ExtractRowsByColValueList {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $Delim = $TextFilesInfo{InDelim}[$Index];
  my($KeyColNum, @AcceptedValues) = @{$TextFilesInfo{RowValues}[$Index]};

  # Setup a lookup map of the accepted values...
  my %AcceptedValueMap;
  @AcceptedValueMap{@AcceptedValues} = @AcceptedValues;

  while (my $Row = GetTextLine($TextFileRef)) {
    my @Fields = quotewords($Delim, 0, $Row);

    next if $KeyColNum > $#Fields;
    next unless exists $AcceptedValueMap{$Fields[$KeyColNum]};

    my $OutLine = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$OutLine\n";
  }
}

# Extract row by minimum column value...
#
# Also handles the maximum: $OptionsInfo{SpecifiedRowsMode} selects which one.
# Values are compared numerically and the first row reaching the extreme value
# is the one written out. Rows lacking the column are not considered.
sub ExtractRowByMinOrMaxColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $Delim = $TextFilesInfo{InDelim}[$Index];
  my $KeyColNum = $TextFilesInfo{RowValues}[$Index][0];
  my $WantMax = ($OptionsInfo{SpecifiedRowsMode} =~ /^rowbymaxcolvalue$/i) ? 1 : 0;

  my($BestLine, $BestValue, $HaveCandidate) = ('', '', 0);

  while (my $Row = GetTextLine($TextFileRef)) {
    my @Fields = quotewords($Delim, 0, $Row);
    next if $KeyColNum > $#Fields;

    my $Candidate = $Fields[$KeyColNum];

    my $IsBetter;
    if (!$HaveCandidate) {
      # The first usable row always becomes the initial candidate...
      $IsBetter = 1;
    }
    elsif ($WantMax) {
      $IsBetter = $Candidate > $BestValue;
    }
    else {
      $IsBetter = $Candidate < $BestValue;
    }

    if ($IsBetter) {
      ($BestLine, $BestValue, $HaveCandidate) = ($Row, $Candidate, 1);
    }
  }

  if ($BestLine) {
    my @Fields = quotewords($Delim, 0, $BestLine);
    my $OutLine = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$OutLine\n";
  }
}

# Extract rows by column value range...
#
# $TextFilesInfo{RowValues}[$Index] holds (column number, min value, max value).
# Rows whose column value lies numerically within [min, max] are written out;
# rows too short to contain the column are dropped.
sub ExtractRowsByColValueRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $ColValue, $MinValue, $MaxValue, $InDelim, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($ColNum, $MinValue, $MaxValue) = @{$TextFilesInfo{RowValues}[$Index]}[0 .. 2];

  LINE: while ($Line = GetTextLine($TextFileRef)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    if ($ColNum > $#LineWords) {
      next LINE;
    }
    $ColValue = $LineWords[$ColNum];
    if ($ColValue >= $MinValue && $ColValue <= $MaxValue) {
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$Line\n";
    }
  }
}

# Extract rows by row number range...
#
# $TextFilesInfo{RowValues}[$Index] holds (min row number, max row number).
# The caller has already consumed the label line, which counts as row 1, so
# the first line read here is row 2.
sub ExtractRowsByRowNumRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my($Line, $MinRowNum, $MaxRowNum, $RowCount, $InDelim, @LineWords);
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($MinRowNum, $MaxRowNum) = @{$TextFilesInfo{RowValues}[$Index]}[0, 1];

  $RowCount = 1;
  LINE: while ($Line = GetTextLine($TextFileRef)) {
    $RowCount++;
    if ($RowCount >= $MinRowNum && $RowCount <= $MaxRowNum) {
      @LineWords = quotewords($InDelim, 0, $Line);
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$Line\n";
    }
    elsif ($RowCount > $MaxRowNum) {
      # Nothing left to extract past the end of the range...
      last LINE;
    }
  }
}

# Extract rows by row numbers...
#
# $TextFilesInfo{RowValues}[$Index] lists the wanted row numbers. The caller
# has already consumed the label line, which counts as row 1.
sub ExtractRowsByRowNums {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $RowNum, $MaxRowNum, $RowCount, $InDelim, %RowNumMap, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];

  # Setup a row nums map and track the largest row number so that reading
  # can stop as soon as it has been passed...
  %RowNumMap = ();
  $MaxRowNum = $TextFilesInfo{RowValues}[$Index][0];
  for $RowNum (@{$TextFilesInfo{RowValues}[$Index]}) {
    if ($RowNum > $MaxRowNum) {
      $MaxRowNum = $RowNum;
    }
    $RowNumMap{$RowNum} = $RowNum;
  }

  $RowCount = 1;
  LINE: while ($Line = GetTextLine($TextFileRef)) {
    $RowCount++;
    if (exists $RowNumMap{$RowCount}) {
      @LineWords = quotewords($InDelim, 0, $Line);
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$Line\n";
    }
    elsif ($RowCount > $MaxRowNum) {
      last LINE;
    }
  }
}

# Retrieve text file columns and rows information for specified options...
sub RetrieveColumnsAndRowsInfo {
  ProcessColumnsInfo();
  ProcessRowsInfo();
}

# Make sure the specified columns exists in text files...
#
# For each usable text file, sets up $TextFilesInfo{CategoryColNum} in
# categories mode or $TextFilesInfo{ColNumsToExtract} in columns mode as
# 0-based column numbers. A file for which the category column, or all of the
# specified columns, can't be found is marked as not okay with a warning.
# The first column is used when no column has been specified.
sub ProcessColumnsInfo {
  my($Index, $SpecifiedCategoryCol, $TextFile, @ColNumsToExtract);

  @{$TextFilesInfo{CategoryColNum}} = ();
  @{$TextFilesInfo{ColNumsToExtract}} = ();

  $SpecifiedCategoryCol = $OptionsInfo{SpecifiedCategoryCol};

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{CategoryColNum}[$Index] = 0;
    @{$TextFilesInfo{ColNumsToExtract}[$Index]} = ();

    if (!$TextFilesInfo{FileOkay}[$Index]) {
      next FILELIST;
    }

    if ($OptionsInfo{Mode} =~ /^categories$/i) {
      my($CategoryColNum, $CategoryColValid);

      $CategoryColNum = 0;
      $CategoryColValid = 1;

      # Test the length and not the truth of the value: a column label of
      # "0" is a specified column as well...
      if (defined $SpecifiedCategoryCol && length $SpecifiedCategoryCol) {
        if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
          if ($SpecifiedCategoryCol <= $TextFilesInfo{ColCount}[$Index]) {
            $CategoryColNum = $SpecifiedCategoryCol - 1;
          }
          else {
            $CategoryColValid = 0;
          }
        }
        elsif (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol})) {
          $CategoryColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol};
        }
        else {
          $CategoryColValid = 0;
        }
      }

      if ($CategoryColValid) {
        $TextFilesInfo{CategoryColNum}[$Index] = $CategoryColNum;
      }
      else {
        warn "Warning: Ignoring file $TextFile: Category column specified, $SpecifiedCategoryCol, using \"--categorycol\" option doesn't exist\n";
        $TextFilesInfo{FileOkay}[$Index] = 0;
      }
    }
    elsif ($OptionsInfo{Mode} =~ /^columns$/i) {
      @ColNumsToExtract = ();

      if (!@{$OptionsInfo{SpecifiedColumns}}) {
        # Default to the first column...
        push @ColNumsToExtract, 0;
      }
      elsif ($OptionsInfo{ColMode} =~ /^colnum$/i) {
        # Silently skip column numbers which aren't present in this file...
        @ColNumsToExtract = map { $_ - 1 } grep { $_ >= 1 && $_ <= $TextFilesInfo{ColCount}[$Index] } @{$OptionsInfo{SpecifiedColumns}};
      }
      else {
        # Silently skip column labels which aren't present in this file...
        @ColNumsToExtract = map { $TextFilesInfo{ColLabelToNumMap}[$Index]{$_} } grep { exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$_} } @{$OptionsInfo{SpecifiedColumns}};
      }

      if (@ColNumsToExtract) {
        push @{$TextFilesInfo{ColNumsToExtract}[$Index]}, @ColNumsToExtract;
      }
      else {
        warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist\n";
        $TextFilesInfo{FileOkay}[$Index] = 0;
      }
    }
  }
}

# Map a column id specified using "--rows" option - a column label in collabel
# mode, otherwise a column number starting at 1 - to the 0-based column number
# in text file $Index. Returns undef when the file has no such column.
sub GetRowsColumnNum {
  my($Index, $ColID) = @_;

  if ($OptionsInfo{ColMode} =~ /^collabel$/i) {
    if (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID}) {
      return $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID};
    }
    return undef;
  }
  if ($ColID >= 1 && $ColID <= $TextFilesInfo{ColCount}[$Index]) {
    return $ColID - 1;
  }
  return undef;
}

# Process specified rows info...
#
# In rows mode, translates $OptionsInfo{SpecifiedRowValues} into
# $TextFilesInfo{RowValues}[$Index] for each usable text file by replacing
# column ids with 0-based column numbers:
#
#   rowsbycolvalue: (colnum, value, criterion) triplets; triplets whose
#     column doesn't exist in the file are left out.
#   rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue and
#     rowbymaxcolvalue: colnum followed by the remaining specified values.
#   rownums and rownumrange: the specified row numbers as they are.
#
# A file ending up without any row values is marked as not okay with a warning.
sub ProcessRowsInfo {
  my($Index, $TextFile, $ColID, $Value, $Criterion, $ColNum, @RowValues);

  @{$TextFilesInfo{RowValues}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];
    @{$TextFilesInfo{RowValues}[$Index]} = ();

    if ($OptionsInfo{Mode} !~ /^rows$/i) {
      next FILELIST;
    }
    if (!$TextFilesInfo{FileOkay}[$Index]) {
      next FILELIST;
    }

    @RowValues = ();

    if ($OptionsInfo{RowsMode} =~ /^rowsbycolvalue$/i) {
      my($ValueIndex);
      for ($ValueIndex = 0; $ValueIndex < @{$OptionsInfo{SpecifiedRowValues}}; $ValueIndex = $ValueIndex + 3) {
        ($ColID, $Value, $Criterion) = @{$OptionsInfo{SpecifiedRowValues}}[$ValueIndex .. $ValueIndex + 2];

        $ColNum = GetRowsColumnNum($Index, $ColID);
        if (defined $ColNum) {
          push @RowValues, ($ColNum, $Value, $Criterion);
        }
      }
    }
    elsif ($OptionsInfo{RowsMode} =~ /^(rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue)$/i) {
      # Process column id...
      $ColID = $OptionsInfo{SpecifiedRowValues}[0];

      $ColNum = GetRowsColumnNum($Index, $ColID);
      if (defined $ColNum) {
        # Column number followed by the rest of the specified values. The
        # file loop variable, $Index, is deliberately not reused to walk
        # through the values...
        my @SpecifiedValues = @{$OptionsInfo{SpecifiedRowValues}};
        push @RowValues, $ColNum, @SpecifiedValues[1 .. $#SpecifiedValues];
      }
    }
    elsif ($OptionsInfo{RowsMode} =~ /^(rownums|rownumrange)$/i) {
      push @RowValues, @{$OptionsInfo{SpecifiedRowValues}};
    }

    if (@RowValues) {
      push @{$TextFilesInfo{RowValues}[$Index]}, @RowValues;
    }
    else {
      warn "Warning: Ignoring file $TextFile: Column specified, $ColID, using \"--rows\" option doesn't exist\n";
      $TextFilesInfo{FileOkay}[$Index] = 0;
    }
  }
}

# Retrieve information about input text files...
#
# Initializes %TextFilesInfo and, for each text file, sets up its input
# delimiter, column count, column labels, label to column number map and
# output file names. Files which don't exist, aren't csv or tsv files, can't
# be opened, or whose output file would be the input file itself or an
# existing file without "--overwrite", are left marked as not okay with a
# warning.
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $CategoryOutFileRoot, $OutFile, $ColNum, $ColLabel);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFile}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();
  @{$TextFilesInfo{CategoryOutFileRoot}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFile}[$Index] = "";
    $TextFilesInfo{OutFileExt}[$Index] = "";
    $TextFilesInfo{CategoryOutFileRoot}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Input delimiter: tab for tsv files; comma or semicolon, as specified
    # using "--indelim" option, for csv files...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if (!($OptionsInfo{InDelim} =~ /^(comma|semicolon)$/i)) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    my $TextFH;
    if (!open $TextFH, '<', $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    $Line = GetTextLine($TextFH);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFH;

    # From here on $FileExt is the output file extension...
    $FileExt = "csv";
    if ($OptionsInfo{OutDelim} =~ /^tab$/i) {
      $FileExt = "tsv";
    }

    if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      # Assign instead of append: $OutFileRoot is shared by all iterations...
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
      $OutFileRoot .= ($OptionsInfo{Mode} =~ /^categories$/i) ? "CategoriesSummary" : (($OptionsInfo{Mode} =~ /^rows$/i) ? "ExtractedRows" : "ExtractedColumns");
    }
    $CategoryOutFileRoot = "$FileName" . "Category";

    $OutFile = $OutFileRoot . ".$FileExt";
    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }

    if (!$OptionsInfo{Overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{CategoryOutFileRoot}[$Index] = $CategoryOutFileRoot;
    $TextFilesInfo{OutFile}[$Index] = "$OutFile";
    $TextFilesInfo{OutFileExt}[$Index] = "$FileExt";

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;

    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
}

# Process option values...
#
# Copies the command line options from %Options into %OptionsInfo and
# validates the values specified using "--categorycol", "--columns" and
# "--rows" options against "--colmode" and "--rowsmode". Dies with an error
# message on the first invalid value.
sub ProcessOptions {
  my(@SpecifiedColumns, @SpecifiedRowValues);

  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{ColMode} = $Options{colmode};

  $OptionsInfo{CategoryCol} = $Options{categorycol};
  $OptionsInfo{SpecifiedCategoryCol} = "";

  if (defined $Options{categorycol}) {
    my(@SpecifiedValues) = split ",", $Options{categorycol};
    if (@SpecifiedValues != 1) {
      die "Error: Invalid number of values, ",scalar(@SpecifiedValues), " using \"--categorycol\" option: Only one value is allowed.\n";
    }
    $OptionsInfo{SpecifiedCategoryCol} = $SpecifiedValues[0];
    if ($Options{colmode} =~ /^colnum$/i) {
      if (!IsPositiveInteger($OptionsInfo{SpecifiedCategoryCol})) {
        die "Error: Category column value, $OptionsInfo{SpecifiedCategoryCol}, specified using \"--categorycol\" is not valid. Allowed integer values: > 0.\n";
      }
    }
  }

  $OptionsInfo{Columns} = $Options{columns};
  @SpecifiedColumns = ();

  if (defined $Options{columns}) {
    my(@SpecifiedValues) = split ",", $Options{columns};
    if ($Options{colmode} =~ /^colnum$/i) {
      my($ColValue);
      for $ColValue (@SpecifiedValues) {
        if (!IsPositiveInteger($ColValue)) {
          die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n";
        }
      }
    }
    push @SpecifiedColumns, @SpecifiedValues;
  }
  @{$OptionsInfo{SpecifiedColumns}} = @SpecifiedColumns;

  $OptionsInfo{InDelim} = $Options{indelim};

  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
  $OptionsInfo{Overwrite} = $Options{overwrite};

  $OptionsInfo{OutFileRoot} = $Options{root};

  # Process any specified rows values...
  @SpecifiedRowValues = ();

  $OptionsInfo{RowsMode} = $Options{rowsmode};
  $OptionsInfo{Rows} = $Options{rows};

  $OptionsInfo{SpecifiedRowsMode} = $Options{rowsmode};

  if (defined $Options{rows}) {
    (@SpecifiedRowValues) = split ",", $Options{rows};
  }
  else {
    # Only rownums mode has a default value: the first row...
    if ($Options{rowsmode} !~ /^rownums$/i) {
      die "Error: Specify value for \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n";
    }
    push @SpecifiedRowValues, "1";
  }
  @{$OptionsInfo{SpecifiedRowValues}} = @SpecifiedRowValues;

  my($SpecifiedColID, $SpecifiedRowID);

  # Make sure specified values are okay...
  if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) {
    # At least one complete triplet is needed...
    if (!@SpecifiedRowValues || @SpecifiedRowValues % 3) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain triplets.\n";
    }
    # Triplet format: colid,value,criteria. Criterion: le,ge,eq
    my($Index, $ColID, $Criterion, $Value);
    for ($Index = 0; $Index < @SpecifiedRowValues; $Index = $Index + 3) {
      ($ColID, $Value, $Criterion) = @SpecifiedRowValues[$Index .. $Index + 2];
      if ($Options{colmode} =~ /^colnum$/i) {
        if (!IsPositiveInteger($ColID)) {
          die "Error: Invalid column id, $ColID, specified in triplet, \"$ColID,$Value,$Criterion\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
        }
      }
      if ($Criterion !~ /^(eq|le|ge)$/i) {
        die "Error: Invalid criterion value, $Criterion, specified in triplet, \"$ColID,$Value,$Criterion\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed values: le, ge, or eq.\n";
      }
    }
  }
  elsif ($Options{rowsmode} =~ /^rowsbycolvaluelist$/i) {
    # A column id followed by at least one value. Check the count before
    # looking at the column id, which doesn't exist for an empty list...
    if (@SpecifiedRowValues < 2) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain more than one value\n";
    }
    ($SpecifiedColID) = $SpecifiedRowValues[0];
    if ($Options{colmode} =~ /^colnum$/i) {
      if (!IsPositiveInteger($SpecifiedColID)) {
        die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
  }
  elsif ($Options{rowsmode} =~ /^rowsbycolvaluerange$/i) {
    if (@SpecifiedRowValues != 3) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain three values\n";
    }
    ($SpecifiedColID) = $SpecifiedRowValues[0];
    if ($Options{colmode} =~ /^colnum$/i) {
      if (!IsPositiveInteger($SpecifiedColID)) {
        die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
    if ($SpecifiedRowValues[1] >= $SpecifiedRowValues[2]) {
      die "Error: Invalid value triplet - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: second value < third value\n";
    }
  }
  elsif ($Options{rowsmode} =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
    if (@SpecifiedRowValues != 1) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nOnly one value is allowed.\n";
    }
    ($SpecifiedColID) = $SpecifiedRowValues[0];
    if ($Options{colmode} =~ /^colnum$/i) {
      if (!IsPositiveInteger($SpecifiedColID)) {
        die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
  }
  elsif ($Options{rowsmode} =~ /^rownums$/i) {
    for $SpecifiedRowID (@SpecifiedRowValues) {
      if (!IsPositiveInteger($SpecifiedRowID)) {
        die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
  }
  elsif ($Options{rowsmode} =~ /^rownumrange$/i) {
    if (@SpecifiedRowValues != 2) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain only two values.\n";
    }
    for $SpecifiedRowID (@SpecifiedRowValues) {
      if (!IsPositiveInteger($SpecifiedRowID)) {
        die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
    if ($SpecifiedRowValues[0] >= $SpecifiedRowValues[1]) {
      die "Error: Invalid value pair - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: First value < second value\n";
    }
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with the default values, overrides them from the command
# line using GetOptions, changes to the working directory when one has been
# specified, and dies when the value of an enumerated option is not one of
# its allowed values.
sub SetupScriptUsage {

  # Setup default and retrieve all the options...
  %Options = ();
  $Options{colmode} = "colnum";
  $Options{indelim} = "comma";
  $Options{mode} = "columns";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  $Options{rowsmode} = "rownums";

  if (!GetOptions(\%Options, "categorycol=s", "columns=s", "colmode|c=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "rows=s", "rowsmode=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    # Low precedence "or" is needed here: "||" would bind to the directory
    # name and a failure of chdir would go unnoticed...
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(columns|rows|categories)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: columns, rows or categories \n";
  }
  if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"--colmode\" is not valid. Allowed values: colnum or collabel \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{rowsmode} !~ /^(rowsbycolvalue|rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue|rownums|rownumrange)$/i) {
    die "Error: The value specified, $Options{rowsmode}, for option \"--rowsmode\" is not valid. Allowed values: rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue, rownums, rownumrange\n";
  }
}