Mercurial > repos > deepakjadmin > mayatool3_test3
comparison mayachemtools/bin/ExtractFromTextFiles.pl @ 0:73ae111cf86f draft
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 11:55:01 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:73ae111cf86f |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: ExtractFromTextFiles.pl,v $ | |
| 4 # $Date: 2015/02/28 20:46:19 $ | |
| 5 # $Revision: 1.42 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability of fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use Text::ParseWords; | |
| 34 use FileHandle; | |
| 35 use Benchmark; | |
| 36 use FileUtil; | |
| 37 use TextUtil; | |
| 38 | |
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Turn on autoflush for STDOUT so that progress messages show up right away...
$| = 1;

$StartTime = Benchmark->new;

# Announce the script...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";

# Set up options; print usage and quit on help request or missing file names...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList) = ExpandFileNames(\@ARGV, "csv tsv");

# Validate and store option values in %OptionsInfo...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Gather per file information, including columns and rows to extract, in %TextFilesInfo...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
RetrieveColumnsAndRowsInfo();

# Generate output for each file which passed all the checks...
my($FileIndex);
print "\nProcessing text files...\n" if (@TextFilesList > 1);
for $FileIndex (0 .. $#TextFilesList) {
  next unless ($TextFilesInfo{FileOkay}[$FileIndex]);

  print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  ExtractFromTextFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 86 | |
| 87 ############################################################################### | |
| 88 | |
# Extract data from the text file at the specified index using the extraction
# routine corresponding to the mode option...
#
# Parameters:
#   $Index - Index of the text file in @TextFilesList.
#
sub ExtractFromTextFile {
  my($Index) = @_;
  my($Mode) = $OptionsInfo{Mode};

  return ExtractCategoryData($Index) if ($Mode =~ /^categories$/i);
  return ExtractRowsData($Index) if ($Mode =~ /^rows$/i);

  # Any other mode value leads to extraction of columns...
  return ExtractColumnData($Index);
}
| 103 | |
# Generate category files: a summary file containing each category name along with
# its row count, and one text file per category containing the rows for that category...
#
# Parameters:
#   $Index - Index of the text file in @TextFilesList and in the per file arrays
#            stored in %TextFilesInfo.
#
# Rows which don't have a value in the category column are collected under an
# empty category name. Spaces are removed from category file names.
#
# Dies when the input file can't be opened or an output file can't be opened or closed.
#
sub ExtractCategoryData {
  my($Index) = @_;
  my($TextFile, $CategoryCol, $NewTextFile, $InDelim, @ColLabels);

  $TextFile = $TextFilesList[$Index];

  $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  $CategoryCol = $TextFilesInfo{CategoryColNum}[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  my($TextFileHandle, $Line, @LineWords, $CategoryName, %CategoriesNameToLinesMap);

  # Collect data lines for each category. Three argument open is used to make sure
  # characters in the file name are never interpreted as an open mode...
  open($TextFileHandle, "<", $TextFile) or die "Couldn't open $TextFile: $! \n";

  # Skip label line...
  $Line = <$TextFileHandle>;

  %CategoriesNameToLinesMap = ();

  # Stop on an undefined or empty line, as a plain truth test would, but keep going
  # for a data line consisting of just "0"...
  while (defined($Line = GetTextLine($TextFileHandle)) && length($Line)) {
    @LineWords = quotewords($InDelim, 0, $Line);

    # Valid column indices are 0 .. $#LineWords...
    $CategoryName = ($CategoryCol < @LineWords && defined($LineWords[$CategoryCol])) ? $LineWords[$CategoryCol] : "";
    push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line;
  }
  close $TextFileHandle;

  my(@CategoryNames, $CategoryCount, $NewTextFileHandle);

  @CategoryNames = sort { lc($a) cmp lc($b) } keys %CategoriesNameToLinesMap;

  # Write out summary file...
  print "Generating file $NewTextFile...\n";
  open($NewTextFileHandle, ">", $NewTextFile) or die "Couldn't open $NewTextFile: $! \n";

  # Write out column labels...
  @LineWords = ("Category","Count");
  $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Write out the category names and count...
  for $CategoryName (@CategoryNames) {
    $CategoryCount = scalar @{$CategoriesNameToLinesMap{$CategoryName}};
    @LineWords = ("$CategoryName","$CategoryCount");
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  close($NewTextFileHandle) or die "Couldn't close $NewTextFile: $! \n";

  # Write out a file for each category...
  my($ColLabelLine, $CategoryFile, $CategoryLine);

  $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print "\nGenerating text files for each category...\n";

  for $CategoryName (@CategoryNames) {
    $CategoryFile = $TextFilesInfo{CategoryOutFileRoot}[$Index] . "$CategoryName" . ".$TextFilesInfo{OutFileExt}[$Index]";
    $CategoryFile =~ s/ //g;

    # Only one category file is kept open at a time to avoid running out of file
    # descriptors for data containing a large number of categories...
    my($CategoryFileHandle);

    print "Generating file $CategoryFile...\n";
    open($CategoryFileHandle, ">", $CategoryFile) or die "Couldn't open $CategoryFile: $! \n";

    print $CategoryFileHandle "$ColLabelLine\n";
    for $CategoryLine (@{$CategoriesNameToLinesMap{$CategoryName}}) {
      @LineWords = quotewords($InDelim, 0, $CategoryLine);
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CategoryFileHandle "$Line\n";
    }
    close($CategoryFileHandle) or die "Couldn't close $CategoryFile: $! \n";
  }
}
| 192 | |
# Extract data for specific columns: write out a new text file containing only the
# columns listed in ColNumsToExtract for the text file...
#
# Parameters:
#   $Index - Index of the text file in @TextFilesList and in the per file arrays
#            stored in %TextFilesInfo.
#
# A column which is missing or undefined in a data line is written out as an empty value.
#
# Dies when the input file can't be opened or the output file can't be opened or closed.
#
sub ExtractColumnData {
  my($Index) = @_;
  my($TextFile, @ColNumsToExtract, $NewTextFile, $InDelim, $TextFileHandle, $NewTextFileHandle);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColNumsToExtract = @{$TextFilesInfo{ColNumsToExtract}[$Index]};

  # Three argument open is used to make sure characters in the file names are never
  # interpreted as an open mode...
  print "Generating file $NewTextFile...\n";
  open($TextFileHandle, "<", $TextFile) or die "Couldn't open $TextFile: $! \n";
  open($NewTextFileHandle, ">", $NewTextFile) or die "Couldn't open $NewTextFile: $! \n";

  my($Line, @LineWords, @ColLabels, $ColLabelLine, @ColValues, $ColValuesLine);

  # Skip label line in the input file; labels collected earlier are used instead...
  $Line = <$TextFileHandle>;

  # Write out column labels...
  @ColLabels = map { $TextFilesInfo{ColLabels}[$Index][$_] } @ColNumsToExtract;
  $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$ColLabelLine\n";

  # Stop on an undefined or empty line, as a plain truth test would, but keep going
  # for a data line consisting of just "0"...
  while (defined($Line = GetTextLine($TextFileHandle)) && length($Line)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    @ColValues = map { ($_ < @LineWords && defined($LineWords[$_])) ? $LineWords[$_] : "" } @ColNumsToExtract;
    $ColValuesLine = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$ColValuesLine\n";
  }
  close($NewTextFileHandle) or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}
| 233 | |
# Extract data for specific rows: write out the column labels to a new text file
# and let the routine corresponding to the specified rows mode pick the rows...
#
# Parameters:
#   $Index - Index of the text file in @TextFilesList and in the per file arrays
#            stored in %TextFilesInfo.
#
# No data rows are written out for an unrecognized rows mode value.
#
# Dies when the input file can't be opened or the output file can't be opened or closed.
#
sub ExtractRowsData {
  my($Index) = @_;
  my($TextFile, $NewTextFile, $SpecifiedRowsMode, $TextFileHandle, $NewTextFileHandle);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesInfo{OutFile}[$Index];

  $SpecifiedRowsMode = $OptionsInfo{SpecifiedRowsMode};

  # Three argument open is used to make sure characters in the file names are never
  # interpreted as an open mode...
  print "Generating file $NewTextFile...\n";
  open($TextFileHandle, "<", $TextFile) or die "Couldn't open $TextFile: $! \n";
  open($NewTextFileHandle, ">", $NewTextFile) or die "Couldn't open $NewTextFile: $! \n";

  my($Line, @ColLabels);

  # Skip label line in the input file and write out column labels collected earlier...
  $Line = <$TextFileHandle>;
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Extraction routines read from and write to the file handles passed to them...
  if ($SpecifiedRowsMode =~ /^rowsbycolvalue$/i) {
    ExtractRowsByColValue($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluelist$/i) {
    ExtractRowsByColValueList($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluerange$/i) {
    ExtractRowsByColValueRange($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
    ExtractRowByMinOrMaxColValue($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rownums$/i) {
    ExtractRowsByRowNums($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rownumrange$/i) {
    ExtractRowsByRowNumRange($Index, $TextFileHandle, $NewTextFileHandle);
  }

  close($NewTextFileHandle) or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}
| 279 | |
# Extract rows by column value: write out the rows which satisfy all the column
# number, value and criterion triplets stored in RowValues for the text file...
#
# Parameters:
#   $Index          - Index of the text file in the per file arrays stored in %TextFilesInfo.
#   $TextFileRef    - Input file handle positioned after the label line.
#   $NewTextFileRef - Output file handle.
#
# Criterion le and ge compare values numerically; eq compares values as strings. A
# row without the column used in a triplet is skipped.
#
sub ExtractRowsByColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $ColValue, $Criterion, $Value, $ValueIndex, $InDelim, @LineWords, @RowValues);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @RowValues = @{$TextFilesInfo{RowValues}[$Index]};

  # Stop on an undefined or empty line, as a plain truth test would, but keep going
  # for a data line consisting of just "0"...
  LINE: while (defined($Line = GetTextLine($TextFileRef)) && length($Line)) {
    @LineWords = quotewords($InDelim, 0, $Line);

    for ($ValueIndex = 0; $ValueIndex < @RowValues; $ValueIndex = $ValueIndex + 3) {
      ($ColNum, $ColValue, $Criterion) = @RowValues[$ValueIndex .. $ValueIndex + 2];

      next LINE if ($ColNum > $#LineWords);

      $Value = $LineWords[$ColNum];
      if ($Criterion =~ /^le$/i) {
        next LINE if ($Value > $ColValue);
      }
      elsif ($Criterion =~ /^ge$/i) {
        next LINE if ($Value < $ColValue);
      }
      elsif ($Criterion =~ /^eq$/i) {
        next LINE if ($Value ne $ColValue);
      }
    }

    # Write it out...
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
# Extract rows by column value list: write out the rows whose value in the specified
# column is present in the list of values stored in RowValues for the text file...
#
# Parameters:
#   $Index          - Index of the text file in the per file arrays stored in %TextFilesInfo.
#   $TextFileRef    - Input file handle positioned after the label line.
#   $NewTextFileRef - Output file handle.
#
# The first entry in RowValues is the column number; the rest are the values to
# match as strings.
#
sub ExtractRowsByColValueList {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $ColValue, $InDelim, %ColValueMap, @LineWords, @Values);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($ColNum, @Values) = @{$TextFilesInfo{RowValues}[$Index]};

  # Setup a col value map...
  %ColValueMap = ();
  @ColValueMap{@Values} = @Values;

  # Stop on an undefined or empty line, as a plain truth test would, but keep going
  # for a data line consisting of just "0"...
  LINE: while (defined($Line = GetTextLine($TextFileRef)) && length($Line)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    next LINE if ($ColNum > $#LineWords);

    # An undefined column value is looked up as an empty string, which is the key
    # Perl would use for it anyway, without triggering a warning...
    $ColValue = defined($LineWords[$ColNum]) ? $LineWords[$ColNum] : "";
    if (exists $ColValueMap{$ColValue}) {
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$Line\n";
    }
  }
}
| 345 | |
# Extract row by minimum or maximum column value: write out the single row
# containing the smallest or the largest numerical value in the specified column...
#
# Parameters:
#   $Index          - Index of the text file in the per file arrays stored in %TextFilesInfo.
#   $TextFileRef    - Input file handle positioned after the label line.
#   $NewTextFileRef - Output file handle.
#
# The maximum value is used for rows mode rowbymaxcolvalue; otherwise, the minimum
# value is used. The first row encountered wins for equal values. Rows without the
# specified column are skipped and nothing is written out when no row has it.
#
sub ExtractRowByMinOrMaxColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $ColValue, $FoundValue, $UseMaxValue, $IsBetterValue, $InDelim, @LineWords, @ValueLineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  $ColNum = $TextFilesInfo{RowValues}[$Index][0];

  $UseMaxValue = ($OptionsInfo{SpecifiedRowsMode} =~ /^rowbymaxcolvalue$/i) ? 1 : 0;

  # An explicit flag, instead of the truth of the saved line, tracks whether a row
  # has been selected; otherwise a selected line consisting of just "0" would be dropped...
  $FoundValue = 0; $ColValue = ''; @ValueLineWords = ();

  # Stop on an undefined or empty line, as a plain truth test would, but keep going
  # for a data line consisting of just "0"...
  LINE: while (defined($Line = GetTextLine($TextFileRef)) && length($Line)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    next LINE if ($ColNum > $#LineWords);

    if (!$FoundValue) {
      $IsBetterValue = 1;
    }
    elsif ($UseMaxValue) {
      $IsBetterValue = ($LineWords[$ColNum] > $ColValue) ? 1 : 0;
    }
    else {
      $IsBetterValue = ($LineWords[$ColNum] < $ColValue) ? 1 : 0;
    }

    if ($IsBetterValue) {
      $FoundValue = 1;
      $ColValue = $LineWords[$ColNum];
      @ValueLineWords = @LineWords;
    }
  }

  if ($FoundValue) {
    $Line = JoinWords(\@ValueLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
| 385 | |
# Extract rows by column value range: write out the rows whose numerical value in
# the specified column lies between the minimum and maximum values, both inclusive...
#
# Parameters:
#   $Index          - Index of the text file in the per file arrays stored in %TextFilesInfo.
#   $TextFileRef    - Input file handle positioned after the label line.
#   $NewTextFileRef - Output file handle.
#
# RowValues for the text file holds column number, minimum value and maximum
# value in that order. Rows without the specified column are skipped.
#
sub ExtractRowsByColValueRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $ColValue, $MinValue, $MaxValue, $InDelim, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($ColNum, $MinValue, $MaxValue) = @{$TextFilesInfo{RowValues}[$Index]}[0 .. 2];

  # Stop on an undefined or empty line, as a plain truth test would, but keep going
  # for a data line consisting of just "0"...
  LINE: while (defined($Line = GetTextLine($TextFileRef)) && length($Line)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    next LINE if ($ColNum > $#LineWords);

    $ColValue = $LineWords[$ColNum];
    if ($ColValue >= $MinValue && $ColValue <= $MaxValue) {
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$Line\n";
    }
  }
}
| 408 | |
# Extract rows by row number range: write out the rows whose row number lies
# between the minimum and maximum row numbers, both inclusive...
#
# Parameters:
#   $Index          - Index of the text file in the per file arrays stored in %TextFilesInfo.
#   $TextFileRef    - Input file handle positioned after the label line.
#   $NewTextFileRef - Output file handle.
#
# RowValues for the text file holds minimum and maximum row numbers in that order.
# The first line read from $TextFileRef is counted as row number 2.
#
sub ExtractRowsByRowNumRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $MinRowNum, $MaxRowNum, $RowCount, $InDelim, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($MinRowNum, $MaxRowNum) = @{$TextFilesInfo{RowValues}[$Index]}[0, 1];

  $RowCount = 1;

  # Stop on an undefined or empty line, as a plain truth test would, but keep going
  # for a data line consisting of just "0"...
  LINE: while (defined($Line = GetTextLine($TextFileRef)) && length($Line)) {
    $RowCount++;

    # No need to read any further after going past the last row in the range...
    last LINE if ($RowCount > $MaxRowNum);
    next LINE if ($RowCount < $MinRowNum);

    @LineWords = quotewords($InDelim, 0, $Line);
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
| 431 | |
# Extract rows by row numbers: write out the rows whose row number is present in
# the list of row numbers stored in RowValues for the text file...
#
# Parameters:
#   $Index          - Index of the text file in the per file arrays stored in %TextFilesInfo.
#   $TextFileRef    - Input file handle positioned after the label line.
#   $NewTextFileRef - Output file handle.
#
# The first line read from $TextFileRef is counted as row number 2.
#
sub ExtractRowsByRowNums {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $RowNum, $MaxRowNum, $RowCount, $InDelim, %RowNumMap, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];

  # Setup a row nums map and track the largest row number...
  %RowNumMap = ();
  $MaxRowNum = $TextFilesInfo{RowValues}[$Index][0];
  for $RowNum (@{$TextFilesInfo{RowValues}[$Index]}) {
    $MaxRowNum = $RowNum if ($RowNum > $MaxRowNum);
    $RowNumMap{$RowNum} = $RowNum;
  }

  $RowCount = 1;

  # Stop on an undefined or empty line, as a plain truth test would, but keep going
  # for a data line consisting of just "0"...
  LINE: while (defined($Line = GetTextLine($TextFileRef)) && length($Line)) {
    $RowCount++;
    if (exists $RowNumMap{$RowCount}) {
      @LineWords = quotewords($InDelim, 0, $Line);
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$Line\n";
    }
    elsif ($RowCount > $MaxRowNum) {
      # No need to read any further after going past the largest row number...
      last LINE;
    }
  }
}
| 462 | |
# Set up columns and rows information in %TextFilesInfo for the specified options:
# columns information is processed first, followed by rows information...
sub RetrieveColumnsAndRowsInfo {
  ProcessColumnsInfo();
  return ProcessRowsInfo();
}
| 468 | |
# Map the category column and the columns to extract specified on the command line
# to zero based column numbers for each text file. Store them in CategoryColNum and
# ColNumsToExtract; a file is marked as not okay, along with a warning, when the
# specified category column or all of the specified columns are missing from it...
#
# The first column is used when no category column or columns have been specified.
#
sub ProcessColumnsInfo {
  my($Index, $SpecifiedCategoryCol, $TextFile, @ColNumsToExtract);

  @{$TextFilesInfo{CategoryColNum}} = ();
  @{$TextFilesInfo{ColNumsToExtract}} = ();

  $SpecifiedCategoryCol = $OptionsInfo{SpecifiedCategoryCol};

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Defaults are set for every file, including the ones being ignored...
    $TextFilesInfo{CategoryColNum}[$Index] = 0;
    @{$TextFilesInfo{ColNumsToExtract}[$Index]} = ();

    next FILELIST unless ($TextFilesInfo{FileOkay}[$Index]);

    if ($OptionsInfo{Mode} =~ /^categories$/i) {
      my($CategoryColNum, $CategoryColValid);

      ($CategoryColNum, $CategoryColValid) = (0, 1);
      if ($SpecifiedCategoryCol) {
        if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
          # Specified column numbers start from 1...
          if ($SpecifiedCategoryCol > $TextFilesInfo{ColCount}[$Index]) {
            $CategoryColValid = 0;
          }
          else {
            $CategoryColNum = $SpecifiedCategoryCol - 1;
          }
        }
        elsif (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol})) {
          $CategoryColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol};
        }
        else {
          $CategoryColValid = 0;
        }
      }

      if (!$CategoryColValid) {
        warn "Warning: Ignoring file $TextFile: Category column specified, $SpecifiedCategoryCol, using \"--categorycol\" option doesn't exist\n";
        $TextFilesInfo{FileOkay}[$Index] = 0;
      }
      else {
        $TextFilesInfo{CategoryColNum}[$Index] = $CategoryColNum;
      }
    }
    elsif ($OptionsInfo{Mode} =~ /^columns$/i) {
      if (!@{$OptionsInfo{SpecifiedColumns}}) {
        @ColNumsToExtract = (0);
      }
      elsif ($OptionsInfo{ColMode} =~ /^colnum$/i) {
        # Keep the column numbers present in the file and make them zero based...
        @ColNumsToExtract = map { $_ - 1 } grep { $_ >= 1 && $_ <= $TextFilesInfo{ColCount}[$Index] } @{$OptionsInfo{SpecifiedColumns}};
      }
      else {
        # Keep the column labels present in the file and map them to column numbers...
        @ColNumsToExtract = map { $TextFilesInfo{ColLabelToNumMap}[$Index]{$_} } grep { exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$_}) } @{$OptionsInfo{SpecifiedColumns}};
      }

      if (!@ColNumsToExtract) {
        warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist\n";
        $TextFilesInfo{FileOkay}[$Index] = 0;
      }
      else {
        push @{$TextFilesInfo{ColNumsToExtract}[$Index]}, @ColNumsToExtract;
      }
    }
  }
}
| 554 | |
# Process specified rows info: set up RowValues for each text file, in rows mode,
# with specified column IDs mapped to zero based column numbers. A file is marked
# as not okay, along with a warning, when no row values could be set up for it...
#
# RowValues contents by rows mode:
#   rowsbycolvalue - Column number, value and criterion triplets for valid columns.
#   rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue -
#     Column number followed by the rest of the specified values.
#   rownums, rownumrange - Specified values as is.
#
sub ProcessRowsInfo {
  my($Index, $TextFile, $ColID, $ColIDOkay, $Value, $Criterion, $ColNum, @RowValues, $ResolveColID);

  # Map a specified column ID, label or number starting from 1 depending on the
  # column mode, to a validity flag and a zero based column number for a file...
  $ResolveColID = sub {
    my($FileIndex, $SpecifiedColID) = @_;

    if ($OptionsInfo{ColMode} =~ /^collabel$/i) {
      if (exists $TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$SpecifiedColID}) {
        return (1, $TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$SpecifiedColID});
      }
      return (0);
    }
    if ($SpecifiedColID >=1 && $SpecifiedColID <= $TextFilesInfo{ColCount}[$FileIndex]) {
      return (1, $SpecifiedColID - 1);
    }
    return (0);
  };

  @{$TextFilesInfo{RowValues}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];
    @{$TextFilesInfo{RowValues}[$Index]} = ();

    next FILELIST if ($OptionsInfo{Mode} !~ /^rows$/i);
    next FILELIST if (!$TextFilesInfo{FileOkay}[$Index]);

    @RowValues = ();

    if ($OptionsInfo{RowsMode} =~ /^rowsbycolvalue$/i) {
      my($ValueIndex, $ResolvedColNum);

      # Triplets referring to columns missing from the file are left out...
      for ($ValueIndex = 0; $ValueIndex < @{$OptionsInfo{SpecifiedRowValues}}; $ValueIndex += 3) {
        ($ColID, $Value, $Criterion) = @{$OptionsInfo{SpecifiedRowValues}}[$ValueIndex .. $ValueIndex + 2];

        ($ColIDOkay, $ResolvedColNum) = $ResolveColID->($Index, $ColID);
        if ($ColIDOkay) {
          $ColNum = $ResolvedColNum;
          push @RowValues, ($ColNum, $Value, $Criterion);
        }
      }
    }
    elsif ($OptionsInfo{RowsMode} =~ /^(rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue)$/i) {
      my($ResolvedColNum, @SpecifiedValues);

      # Process column id...
      $ColID = $OptionsInfo{SpecifiedRowValues}[0];

      ($ColIDOkay, $ResolvedColNum) = $ResolveColID->($Index, $ColID);
      if ($ColIDOkay) {
        $ColNum = $ResolvedColNum;

        # Column number is followed by rest of the specified values...
        @SpecifiedValues = @{$OptionsInfo{SpecifiedRowValues}};
        push @RowValues, $ColNum, @SpecifiedValues[1 .. $#SpecifiedValues];
      }
    }
    elsif ($OptionsInfo{RowsMode} =~ /^(rownums|rownumrange)$/i) {
      push @RowValues, @{$OptionsInfo{SpecifiedRowValues}};
    }

    if (!@RowValues) {
      warn "Warning: Ignoring file $TextFile: Column specified, $ColID, using \"--rows\" option doesn't exist\n";
      $TextFilesInfo{FileOkay}[$Index] = 0;
    }
    else {
      push @{$TextFilesInfo{RowValues}[$Index]}, @RowValues;
    }
  }
}
| 639 | |
# Retrieve information about input text files: check each file, figure out its input
# delimiter and output file names, read its column labels, and store everything in
# %TextFilesInfo. FileOkay is set to 1 only for the files which pass all the checks;
# a warning is issued for every file being ignored...
#
# Information stored for each file: FileOkay, ColCount, ColLabels, ColLabelToNumMap,
# InDelim, OutFile, OutFileExt and CategoryOutFileRoot.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $CategoryOutFileRoot, $OutFile, $ColNum, $ColLabel);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFile}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();
  @{$TextFilesInfo{CategoryOutFileRoot}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFile}[$Index] = "";
    $TextFilesInfo{OutFileExt}[$Index] = "";
    $TextFilesInfo{CategoryOutFileRoot}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Input delimiter is always tab for tsv files; it's comma or semicolon for csv files...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if (!($OptionsInfo{InDelim} =~ /^(comma|semicolon)$/i)) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Three argument open is used to make sure characters in the file name are never
    # interpreted as an open mode...
    my($TextFileHandle);
    if (!open($TextFileHandle, "<", $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # Column labels come from the first line returned by GetTextLine; a file without
    # any such line ends up with no column labels...
    $Line = GetTextLine($TextFileHandle);
    @ColLabels = (defined($Line) && length($Line)) ? quotewords($InDelim, 0, $Line) : ();
    close $TextFileHandle;

    # Output file extension goes with the output delimiter. ProcessOptions stores the
    # delimiter character itself in OutDelim, so a tab character is checked in
    # addition to the option value...
    $FileExt = ($OptionsInfo{OutDelim} =~ /^(tab|\t)$/i) ? "tsv" : "csv";

    if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
      $OutFileRoot .= ($OptionsInfo{Mode} =~ /^categories$/i) ? "CategoriesSummary" : (($OptionsInfo{Mode} =~ /^rows$/i) ? "ExtractedRows" : "ExtractedColumns");
    }
    $CategoryOutFileRoot = "$FileName" . "Category";

    $OutFile = $OutFileRoot . ".$FileExt";
    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }

    if (!$OptionsInfo{Overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{CategoryOutFileRoot}[$Index] = $CategoryOutFileRoot;
    $TextFilesInfo{OutFile}[$Index] = "$OutFile";
    $TextFilesInfo{OutFileExt}[$Index] = "$FileExt";

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;

    # Map column labels to zero based column numbers...
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
}
| 752 | |
| 753 # Process option values... | |
| 754 sub ProcessOptions { | |
| 755 my(@SpecifiedColumns, @SpecifiedRowValues); | |
| 756 | |
| 757 %OptionsInfo = (); | |
| 758 | |
| 759 $OptionsInfo{Mode} = $Options{mode}; | |
| 760 | |
| 761 $OptionsInfo{ColMode} = $Options{colmode}; | |
| 762 | |
| 763 $OptionsInfo{CategoryCol} = defined $Options{categorycol} ? $Options{categorycol} : undef; | |
| 764 $OptionsInfo{SpecifiedCategoryCol} = ""; | |
| 765 | |
| 766 if (defined $Options{categorycol}) { | |
| 767 my(@SpecifiedValues) = split ",", $Options{categorycol}; | |
| 768 if (@SpecifiedValues != 1) { | |
| 769 die "Error: Invalid number of values, ",scalar(@SpecifiedValues), " using \"--categorycol\" option: Only one value is allowed.\n"; | |
| 770 } | |
| 771 $OptionsInfo{SpecifiedCategoryCol} = $SpecifiedValues[0]; | |
| 772 if ($Options{colmode} =~ /^colnum$/i) { | |
| 773 if (!IsPositiveInteger($OptionsInfo{SpecifiedCategoryCol})) { | |
| 774 die "Error: Category column value, $OptionsInfo{SpecifiedCategoryCol}, specified using \"--categorycol\" is not valid. Allowed integer values: > 0.\n"; | |
| 775 } | |
| 776 } | |
| 777 } | |
| 778 | |
| 779 $OptionsInfo{Columns} = defined $Options{columns} ? $Options{columns} : undef; | |
| 780 @{$OptionsInfo{SpecifiedColumns}} = (); | |
| 781 @SpecifiedColumns = (); | |
| 782 | |
| 783 if (defined $Options{columns}) { | |
| 784 my(@SpecifiedValues) = split ",", $Options{columns}; | |
| 785 if ($Options{colmode} =~ /^colnum$/i) { | |
| 786 my($ColValue); | |
| 787 for $ColValue (@SpecifiedValues) { | |
| 788 if (!IsPositiveInteger($ColValue)) { | |
| 789 die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n"; | |
| 790 } | |
| 791 } | |
| 792 } | |
| 793 push @SpecifiedColumns, @SpecifiedValues; | |
| 794 } | |
| 795 @{$OptionsInfo{SpecifiedColumns}} = @SpecifiedColumns; | |
| 796 | |
| 797 $OptionsInfo{InDelim} = $Options{indelim}; | |
| 798 | |
| 799 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,"); | |
| 800 $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0; | |
| 801 $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef; | |
| 802 | |
| 803 $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef; | |
| 804 | |
| 805 # Process any specified rows values... | |
| 806 @SpecifiedRowValues = (); | |
| 807 @{$OptionsInfo{SpecifiedRowValues}} = (); | |
| 808 | |
| 809 $OptionsInfo{RowsMode} = $Options{rowsmode}; | |
| 810 $OptionsInfo{Rows} = defined $Options{rows} ? $Options{rows} : undef; | |
| 811 | |
| 812 $OptionsInfo{SpecifiedRowsMode} = $Options{rowsmode}; | |
| 813 | |
| 814 if (defined $Options{rows}) { | |
| 815 (@SpecifiedRowValues) = split ",", $Options{rows}; | |
| 816 } | |
| 817 else { | |
| 818 if ($Options{rowsmode} !~ /^rownums$/i) { | |
| 819 die "Error: Specify value for \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n"; | |
| 820 } | |
| 821 push @SpecifiedRowValues, "1"; | |
| 822 } | |
| 823 @{$OptionsInfo{SpecifiedRowValues}} = @SpecifiedRowValues; | |
| 824 | |
| 825 my($SpecifiedColID, $SpecifiedRowID); | |
| 826 # Make sure specified values are okay... | |
| 827 if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) { | |
| 828 if (@SpecifiedRowValues % 3) { | |
| 829 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain triplets.\n"; | |
| 830 } | |
| 831 # Triplet format: colid,value,criteria. Criterion: le,ge,eq | |
| 832 my($Index, $ColID, $Criterion, $Value); | |
| 833 for ($Index = 0; $Index < @SpecifiedRowValues; $Index = $Index + 3) { | |
| 834 $ColID = $SpecifiedRowValues[$Index]; | |
| 835 $Value = $SpecifiedRowValues[$Index + 1]; | |
| 836 $Criterion = $SpecifiedRowValues[$Index + 2]; | |
| 837 if ($Options{colmode} =~ /^colnum$/i) { | |
| 838 if (!IsPositiveInteger($ColID)) { | |
| 839 die "Error: Invalid column id, $ColID, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; | |
| 840 } | |
| 841 } | |
| 842 if ($Criterion !~ /^(eq|le|ge)$/i) { | |
| 843 die "Error: Invalid criterion value, $Criterion, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed values: le, ge, or eq.\n"; | |
| 844 } | |
| 845 } | |
| 846 } | |
| 847 elsif ($Options{rowsmode} =~ /^rowsbycolvaluelist$/i) { | |
| 848 ($SpecifiedColID) = $SpecifiedRowValues[0]; | |
| 849 if ($Options{colmode} =~ /^colnum$/i) { | |
| 850 if (!IsPositiveInteger($SpecifiedColID)) { | |
| 851 die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; | |
| 852 } | |
| 853 } | |
| 854 if (@SpecifiedRowValues == 1) { | |
| 855 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain more than one value\n"; | |
| 856 } | |
| 857 } | |
| 858 elsif ($Options{rowsmode} =~ /^rowsbycolvaluerange$/i) { | |
| 859 if (@SpecifiedRowValues != 3) { | |
| 860 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain three values\n"; | |
| 861 } | |
| 862 ($SpecifiedColID) = $SpecifiedRowValues[0]; | |
| 863 if ($Options{colmode} =~ /^colnum$/i) { | |
| 864 if (!IsPositiveInteger($SpecifiedColID)) { | |
| 865 die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; | |
| 866 } | |
| 867 } | |
| 868 if ($SpecifiedRowValues[1] >= $SpecifiedRowValues[2]) { | |
| 869 die "Error: Invalid value triplet - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: second value < third value\n"; | |
| 870 } | |
| 871 } | |
| 872 elsif ($Options{rowsmode} =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) { | |
| 873 if (@SpecifiedRowValues != 1) { | |
| 874 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nOnly one value is allowed.\n"; | |
| 875 } | |
| 876 ($SpecifiedColID) = $SpecifiedRowValues[0]; | |
| 877 if ($Options{colmode} =~ /^colnum$/i) { | |
| 878 if (!IsPositiveInteger($SpecifiedColID)) { | |
| 879 die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; | |
| 880 } | |
| 881 } | |
| 882 } | |
| 883 elsif ($Options{rowsmode} =~ /^rownums$/i) { | |
| 884 for $SpecifiedRowID (@SpecifiedRowValues) { | |
| 885 if (!IsPositiveInteger($SpecifiedRowID)) { | |
| 886 die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; | |
| 887 } | |
| 888 } | |
| 889 } | |
| 890 elsif ($Options{rowsmode} =~ /^rownumrange$/i) { | |
| 891 if (@SpecifiedRowValues != 2) { | |
| 892 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain only two values.\n"; | |
| 893 } | |
| 894 for $SpecifiedRowID (@SpecifiedRowValues) { | |
| 895 if (!IsPositiveInteger($SpecifiedRowID)) { | |
| 896 die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; | |
| 897 } | |
| 898 } | |
| 899 if ($SpecifiedRowValues[0] >= $SpecifiedRowValues[1]) { | |
| 900 die "Error: Invalid value pair - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: First value < second value\n"; | |
| 901 } | |
| 902 } | |
| 903 } | |
| 904 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets the file-level %Options hash to its default values, overlays whatever
# GetOptions parses from @ARGV, optionally changes into the directory given by
# "-w, --workingdir", and validates the option values restricted to a fixed set
# of keywords (mode, colmode, indelim, outdelim, quote and rowsmode).
#
# Takes no arguments. The parsed options are left in %Options. Any parsing or
# validation problem is fatal: the script dies with a message describing it.
#
sub SetupScriptUsage {

  # Setup default and retrieve all the options...
  %Options = ();
  $Options{colmode} = "colnum";
  $Options{indelim} = "comma";
  $Options{mode} = "columns";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  $Options{rowsmode} = "rownums";

  if (!GetOptions(\%Options, "categorycol=s", "columns=s", "colmode|c=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "rows=s", "rowsmode=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    # Low precedence "or" is required here: with "||" the die would bind to the
    # directory name (always true at this point) and a failed chdir would go
    # unnoticed.
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values restricted to a fixed set of keywords; matching is
  # case insensitive...
  if ($Options{mode} !~ /^(columns|rows|categories)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: columns, rows or categories \n";
  }
  if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"--colmode\" is not valid. Allowed values: colnum or collabel \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{rowsmode} !~ /^(rowsbycolvalue|rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue|rownums|rownumrange)$/i) {
    # The message lists "rownums" (plural), which is the keyword the check above accepts.
    die "Error: The value specified, $Options{rowsmode}, for option \"--rowsmode\" is not valid. Allowed values: rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue, rownums, rownumrange\n";
  }
}
| 945 __END__ | |
| 946 | |
| 947 | |
| 948 =head1 NAME | |
| 949 | |
| 950 ExtractFromTextFiles.pl - Extract specific data from TextFile(s) | |
| 951 | |
| 952 =head1 SYNOPSIS | |
| 953 | |
| 954 ExtractFromTextFiles.pl TextFile(s)... | |
| 955 | |
| 956 ExtractFromTextFiles.pl [B<-c, --colmode> colnum | collabel] [B<--categorycol > number | string] | |
| 957 [B<--columns> "colnum,[colnum]..." | "collabel,[collabel]..."] [B<-h, --help>] | |
| 958 [B<--indelim> I<comma | semicolon>] [B<-m, --mode > I<columns | rows | categories>] | |
| 959 [B<-o, --overwrite>] [B<--outdelim> I<comma | tab | semicolon>] [B<-q, --quote> I<yes | no>] | |
| 960 [B<--rows> "colid,value,criteria..." | "colid,value..." | "colid,mincolvalue,maxcolvalue" | "rownum,rownum,..." | colid | "minrownum,maxrownum"] | |
| 961 [ B<--rowsmode> rowsbycolvalue | rowsbycolvaluelist | rowsbycolvaluerange | rowbymincolvalue | rowbymaxcolvalue | rownums | rownumrange] | |
| 962 [B<-r, --root> I<rootname>] [B<-w, --workingdir> I<dirname>] TextFile(s)... | |
| 963 | |
| 964 =head1 DESCRIPTION | |
| 965 | |
| 966 Extract column(s)/row(s) data from I<TextFile(s)> identified by column numbers or labels. Or categorize | |
| 967 data using a specified column category. During categorization, a summary text file is | |
| 968 generated containing category name and count; an additional text file, containing data for | |
| 969 each category, is also generated. The file names are separated by space. The | |
| 970 valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited | |
| 971 text files respectively. All other file names are ignored. All the text files in a | |
| 972 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory | |
| 973 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file | |
| 974 which doesn't correspond to the format indicated by B<--indelim> option is ignored. | |
| 975 | |
| 976 =head1 OPTIONS | |
| 977 | |
| 978 =over 4 | |
| 979 | |
| 980 =item B<-c, --colmode> I<colnum | collabel> | |
| 981 | |
| 982 Specify how columns are identified in I<TextFile(s)>: using column number or column | |
| 983 label. Possible values: I<colnum or collabel>. Default value: I<colnum>. | |
| 984 | |
| 985 =item B<--categorycol > I<number | string> | |
| 986 | |
| 987 Column used to categorize data. Default value: First column. | |
| 988 | |
| 989 For I<colnum> value of B<-c, --colmode> option, input value is a column number. | |
| 990 Example: I<1>. | |
| 991 | |
| 992 For I<collabel> value of B<-c, --colmode> option, input value is a column label. | |
| 993 Example: I<Mol_ID>. | |
| 994 | |
| 995 =item B<--columns> I<"colnum,[colnum]..." | "collabel,[collabel]..."> | |
| 996 | |
| 997 List of comma delimited columns to extract. Default value: First column. | |
| 998 | |
| 999 For I<colnum> value of B<-c, --colmode> option, input values format is: | |
| 1000 I<colnum,colnum,...>. Example: I<1,3,5> | |
| 1001 | |
| 1002 For I<collabel> value of B<-c, --colmode> option, input values format is: | |
| 1003 I<collabel,collabel,..>. Example: I<Mol_ID,MolWeight> | |
| 1004 | |
| 1005 =item B<-h, --help> | |
| 1006 | |
| 1007 Print this help message. | |
| 1008 | |
| 1009 =item B<--indelim> I<comma | semicolon> | |
| 1010 | |
| 1011 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. | |
| 1012 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a | |
| 1013 delimiter. | |
| 1014 | |
| 1015 =item B<-m, --mode > I<columns | rows | categories> | |
| 1016 | |
| 1017 Specify what to extract from I<TextFile(s)>. Possible values: I<columns, rows, | |
| 1018 or categories>. Default value: I<columns>. | |
| 1019 | |
| 1020 For I<columns> mode, data for appropriate columns specified by B<--columns> option | |
| 1021 is extracted from I<TextFile(s)> and placed into new text files. | |
| 1022 | |
| 1023 For I<rows> mode, appropriate rows specified in conjunction with B<--rowsmode> and | |
| 1024 B<--rows> options are extracted from I<TextFile(s)> and placed into new text files. | |
| 1025 | |
| 1026 For I<categories> mode, column specified by B<--categorycol> is | |
| 1027 used to categorize data, and a summary text file is generated | |
| 1028 containing category name and count; an additional text file, containing data for | |
| 1029 each category, is also generated. | |
| 1030 | |
| 1031 =item B<-o, --overwrite> | |
| 1032 | |
| 1033 Overwrite existing files. | |
| 1034 | |
| 1035 =item B<--outdelim> I<comma | tab | semicolon>. | |
| 1036 | |
| 1037 Output text file delimiter. Possible values: I<comma, tab, or semicolon>. | |
| 1038 Default value: I<comma> | |
| 1039 | |
| 1040 =item B<-q, --quote> I<yes | no> | |
| 1041 | |
| 1042 Put quotes around column values in output text file. Possible values: I<yes or | |
| 1043 no>. Default value: I<yes>. | |
| 1044 | |
| 1045 =item B<-r, --root> I<rootname> | |
| 1046 | |
| 1047 New file name is generated using the root: <Root>.<Ext>. Default for new file | |
| 1048 names: <TextFile>CategoriesSummary.<Ext>, <TextFile>ExtractedColumns.<Ext>, and | |
| 1049 <TextFile>ExtractedRows.<Ext> for I<categories>, I<columns>, and I<rows> mode | |
| 1050 respectively. And <TextFile>Category<CategoryName>.<Ext> | |
| 1051 for each category retrieved from each text file. The output file type determines <Ext> | |
| 1052 value: csv and tsv for CSV, and TSV files respectively. | |
| 1053 | |
| 1054 This option is ignored for multiple input files. | |
| 1055 | |
| 1056 =item B<--rows> I<"colid,value,criteria..." | "colid,value..." | "colid,mincolvalue,maxcolvalue" | "rownum,rownum,..." | colid | "minrownum,maxrownum"> | |
| 1057 | |
| 1058 This value is B<--rowsmode> specific. In general, it's a list of comma separated column ids and | |
| 1059 associated mode specific values. Whether column ids are column labels or numbers is | |
| 1060 controlled by B<-c, --colmode> option. | |
| 1061 | |
| 1062 First line containing column labels is always written out. And value comparisons assume | |
| 1063 numerical column data. | |
| 1064 | |
| 1065 For I<rowsbycolvalue> mode, input value format contains these triplets: | |
| 1066 I<colid,value, criteria...>. Possible values for criteria: I<le, ge or eq>. | |
| 1067 Examples: | |
| 1068 | |
| 1069 MolWt,450,le | |
| 1070 MolWt,450,le,LogP,5,le,SumNumNO,10,le,SumNHOH,5,le | |
| 1071 | |
| 1072 For I<rowsbycolvaluelist> mode, input value format is: I<colid,value...>. Examples: | |
| 1073 | |
| 1074 Mol_ID,20 | |
| 1075 Mol_ID,20,1002,1115 | |
| 1076 | |
| 1077 For I<rowsbycolvaluerange> mode, input value format is: I<colid,mincolvalue,maxcolvalue>. Examples: | |
| 1078 | |
| 1079 MolWt,100,450 | |
| 1080 | |
| 1081 For I<rowbymincolvalue, rowbymaxcolvalue> modes, input value format is: I<colid>. | |
| 1082 | |
| 1083 For I<rownums> mode, input value format is: I<rownum,rownum,...>. Default value: I<2>. | |
| 1084 | |
| 1085 For I<rownumrange> mode, input value format is: I<minrownum, maxrownum>. Examples: | |
| 1086 | |
| 1087 10,40 | |
| 1088 | |
| 1089 =item B<--rowsmode> I<rowsbycolvalue | rowsbycolvaluelist | rowsbycolvaluerange | rowbymincolvalue | rowbymaxcolvalue | rownums | rownumrange> | |
| 1090 | |
| 1091 Specify how to extract rows from I<TextFile(s)>. Possible values: I<rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, | |
| 1092 rowbymincolvalue, rowbymaxcolvalue, rownums, rownumrange>. Default value: I<rownums>. | |
| 1093 | |
| 1094 Use B<--rows> option to list rows criterion used for extraction of rows from | |
| 1095 I<TextFile(s)>. | |
| 1096 | |
| 1097 =item B<-w, --workingdir> I<dirname> | |
| 1098 | |
| 1099 Location of working directory. Default: current directory. | |
| 1100 | |
| 1101 =back | |
| 1102 | |
| 1103 =head1 EXAMPLES | |
| 1104 | |
| 1105 To extract first column from a text file and generate a new CSV text file NewSample1.csv, | |
| 1106 type: | |
| 1107 | |
| 1108 % ExtractFromTextFiles.pl -r NewSample1 -o Sample1.csv | |
| 1109 | |
| 1110 To extract columns Mol_ID, MolWeight, and NAME from Sample1.csv and generate a new | |
| 1111 textfile NewSample1.tsv with no quotes, type: | |
| 1112 | |
| 1113 % ExtractFromTextFiles.pl -m columns -c collabel --columns "Mol_ID, | |
| 1114 MolWeight,NAME" --outdelim tab --quote no -r NewSample1 | |
| 1115 -o Sample1.csv | |
| 1116 | |
| 1117 To extract rows containing values for MolWeight column of less than 450 from | |
| 1118 Sample1.csv and generate a new textfile NewSample1.csv, type: | |
| 1119 | |
| 1120 % ExtractFromTextFiles.pl -m rows --rowsmode rowsbycolvalue | |
| 1121 -c collabel --rows MolWeight,450,le -r NewSample1 | |
| 1122 -o Sample1.csv | |
| 1123 | |
| 1124 To extract rows containing values for MolWeight column between 450 and 500 from | |
| 1125 Sample1.csv and generate a new textfile NewSample1.csv, type: | |
| 1126 | |
| 1127 % ExtractFromTextFiles.pl -m rows --rowsmode rowsbycolvaluerange | |
| 1128 -c collabel --rows MolWeight,450,500 -r NewSample1 | |
| 1129 -o Sample1.csv | |
| 1130 | |
| 1131 To extract a row containing minimum value for column MolWeight from Sample1.csv and generate | |
| 1132 a new textfile NewSample1.csv, type: | |
| 1133 | |
| 1134 % ExtractFromTextFiles.pl -m rows --rowsmode rowbymincolvalue | |
| 1135 -c collabel --rows MolWeight -r NewSample1 | |
| 1136 -o Sample1.csv | |
| 1137 | |
| 1138 =head1 AUTHOR | |
| 1139 | |
| 1140 Manish Sud <msud@san.rr.com> | |
| 1141 | |
| 1142 =head1 SEE ALSO | |
| 1143 | |
| 1144 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl | |
| 1145 | |
| 1146 =head1 COPYRIGHT | |
| 1147 | |
| 1148 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 1149 | |
| 1150 This file is part of MayaChemTools. | |
| 1151 | |
| 1152 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 1153 the terms of the GNU Lesser General Public License as published by the Free | |
| 1154 Software Foundation; either version 3 of the License, or (at your option) | |
| 1155 any later version. | |
| 1156 | |
| 1157 =cut |
