Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/InfoTextFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: InfoTextFiles.pl,v $ | |
| 4 # $Date: 2015/02/28 20:46:20 $ | |
| 5 # $Revision: 1.30 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability or fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use Text::ParseWords; | |
| 34 use Benchmark; | |
| 35 use FileUtil; | |
| 36 use TextUtil; | |
| 37 | |
# File-scoped state. $ScriptName, %Options, @TextFilesList, %OptionsInfo and
# %TextFilesInfo are read and written by the subroutines defined below, so
# they must stay declared at file scope ahead of those definitions.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of the ambiguous indirect object form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# List information for each file which passed the checks...
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for my $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ListTextFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 85 | |
| 86 ############################################################################### | |
| 87 | |
| 88 # List appropriate information... | |
# List information about the text file at position $Index in @TextFilesList.
#
# The number of lines, number of columns and column labels are always printed,
# followed by the file size and last modification string collected earlier in
# %TextFilesInfo. When $OptionsInfo{ParseLines} is set, each line after the
# column labels line is split using the file's delimiter and, depending on
# $OptionsInfo{CountEmpty}, $OptionsInfo{CheckData} and
# $OptionsInfo{CheckNumericalData}, empty lines, empty column values and
# numerical/non-numerical column values are counted and reported. Messages
# about individual lines are printed for detail level >= 2; the line itself is
# printed for detail level >= 3.
#
# Dies when the file can't be opened.
#
sub ListTextFileInfo {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $LabelsLine, $Line, $InDelim, $LineCount, $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount, $Label, $Value, $ColNum, $EmptyColValueFound, $PrintTextLine, $NonNumericalDataFound, $ReportLineNumbers, $ReportLineText, @ColLabels, @LineWords, %EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as a mode or pipe specification and the handle can't collide
  # with a bareword handle used elsewhere...
  open $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";

  $LineCount = 0;
  $EmptyLinesCount = 0;
  $EmptyColDataLinesCount = 0;
  $GreaterThanMaxColLinesCount = 0;

  %EmptyColValuesCountMap = ();
  %NonEmptyColValuesCountMap = ();
  %SpecifiedNonNumericalColValuesCountMap = ();
  %NonNumericalColValuesCountMap = ();
  %NumericalColValuesCountMap = ();

  # Detail level thresholds don't change while reading the file...
  $ReportLineNumbers = ($OptionsInfo{DetailLevel} >= 2) ? 1 : 0;
  $ReportLineText = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;

  if ($OptionsInfo{ParseLines}) {
    # Skip over column labels line. Definedness is tested, not truth, so that a
    # labels line consisting of a lone "0" isn't mistaken for end of file...
    $LabelsLine = <$TextFileHandle>;
    if (defined $LabelsLine) {
      $LineCount++;
      LINE: while ($Line = <$TextFileHandle>) {
        $LineCount++;
        $PrintTextLine = 0;
        $Line =~ s/(\r\n)|(\r)|\n//g;
        @LineWords = quotewords($InDelim, 0, $Line);

        if ($OptionsInfo{CountEmpty}) {
          # Count lines with no data...
          if (!@LineWords) {
            $EmptyLinesCount++;
            if ($ReportLineNumbers) {
              print "Line number $LineCount is empty...\n";
            }
            next LINE;
          }

          # Count lines with empty data for some columns...
          $EmptyColValueFound = 0;
          VALUE: for $Value (@LineWords) {
            if (!IsNotEmpty($Value)) {
              $EmptyColValueFound = 1;
              # One empty value is enough to flag the line...
              last VALUE;
            }
          }
          if ($EmptyColValueFound) {
            $EmptyColDataLinesCount++;
            if ($ReportLineNumbers) {
              print "Line number $LineCount contains empty column value(s)...\n";
            }
            $PrintTextLine = $ReportLineText;
          }

          # Count lines with columns greater than the column label line...
          if (@LineWords > @ColLabels) {
            $GreaterThanMaxColLinesCount++;
            if ($ReportLineNumbers) {
              print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
            }
            $PrintTextLine = $ReportLineText;
          }

          # Count empty and non-empty values for each column; values beyond the
          # last column label are ignored...
          for $ColNum (0 .. $#LineWords) {
            if ($ColNum < @ColLabels) {
              $Label = $ColLabels[$ColNum];
              if (IsNotEmpty($LineWords[$ColNum])) {
                $NonEmptyColValuesCountMap{$Label}++;
              }
              else {
                $PrintTextLine = $ReportLineText;
                $EmptyColValuesCountMap{$Label}++;
              }
            }
          }
        }

        if ($OptionsInfo{CheckData}) {
          # Count numerical and non-empty non-numerical values for each column;
          # empty values are counted in neither map...
          for $ColNum (0 .. $#LineWords) {
            if ($ColNum < @ColLabels) {
              $Label = $ColLabels[$ColNum];
              if (IsNumerical($LineWords[$ColNum])) {
                $NumericalColValuesCountMap{$Label}++;
              }
              elsif (IsNotEmpty($LineWords[$ColNum])) {
                $NonNumericalColValuesCountMap{$Label}++;
              }
            }
          }
        }

        if ($OptionsInfo{CheckNumericalData}) {
          # Count non-numerical values in the explicitly specified columns...
          $NonNumericalDataFound = 0;
          for $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) {
            if ($ColNum < @LineWords) {
              if (!IsNumerical($LineWords[$ColNum])) {
                $NonNumericalDataFound = 1;
                $Label = $ColLabels[$ColNum];
                $SpecifiedNonNumericalColValuesCountMap{$Label}++;
              }
            }
          }
          if ($NonNumericalDataFound) {
            $PrintTextLine = $ReportLineText;
            if ($ReportLineNumbers) {
              print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
            }
          }
        }

        if ($PrintTextLine) {
          print "Line $LineCount: $Line\n\n";
        }
      }
    }
  }
  else {
    # Just count the lines; read into a lexical instead of the global $_...
    while ($Line = <$TextFileHandle>) {
      $LineCount++;
    }
  }
  close $TextFileHandle;

  print "\nNumber of lines: $LineCount\n";
  print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n";
  print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";

  if ($OptionsInfo{CountEmpty}) {
    print "\nNumber of lines with no data: $EmptyLinesCount\n";
    print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
    print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
    PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
    PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
  }

  if ($OptionsInfo{CheckData}) {
    print "\n";
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
    PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
    print "\n";
  }

  if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) {
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@{$TextFilesInfo{NumericalDataColLabels}[$Index]}, \%SpecifiedNonNumericalColValuesCountMap);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n";
}
| 264 | |
| 265 # Total size of all the files... | |
# Print the combined size of all input files which passed the checks. Nothing
# is printed unless more than one file is okay...
sub ListTotalSizeOfFiles {
  my(@OkayFileNums, $OkayFilesCount, $CombinedSize);

  # Positions in @TextFilesList of files marked okay...
  @OkayFileNums = grep { $TextFilesInfo{FileOkay}[$_] } (0 .. $#TextFilesList);
  $OkayFilesCount = @OkayFileNums;

  $CombinedSize = 0;
  for my $FileNum (@OkayFileNums) {
    $CombinedSize += $TextFilesInfo{FileSize}[$FileNum];
  }

  if ($OkayFilesCount > 1) {
    print "\nTotal size of $OkayFilesCount files: ", FormatFileSize($CombinedSize), "\n";
  }
}
| 282 | |
| 283 # List data information... | |
# Print one line of the form:
#
#   InfoLabel:  "Label1" - Value1, "Label2" - Value2...
#
# Labels are listed in the order given by $DataLabelRef (array reference);
# values come from $DataLabelToValueMapRef (hash reference) and default to 0
# for labels without an entry in the map...
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
  my(@Entries);

  @Entries = map {
    my $Count = exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0;
    " \"$_\" - " . $Count;
  } @{$DataLabelRef};

  print "$InfoLabel: " . join(",", @Entries) . "\n";
}
| 295 | |
| 296 # Retrieve information about input text files... | |
# Retrieve information about each input text file in @TextFilesList and store
# it, by file position, in %TextFilesInfo under these keys: FileOkay, ColCount,
# ColLabels, ColLabelToNumMap, InDelim, FileSize and FileLastModified.
#
# A file is skipped with a warning, and left marked as not okay, when it
# doesn't exist, isn't a csv/tsv file, is a csv file while the "--indelim"
# value isn't comma or semicolon, or can't be opened. Column labels are taken
# from the first text line returned by GetTextLine...
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $TextFileHandle, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString);

  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{FileSize}} = ();
  @{$TextFilesInfo{FileLastModified}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start out with the file marked as not okay and empty information...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{FileSize}[$Index] = 0;
    $TextFilesInfo{FileLastModified}[$Index] = '';
    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Tab for tsv files; comma or semicolon, as specified, for csv files...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or pipe specification. The handle is reset for
    # each file so that open always creates a fresh one...
    $TextFileHandle = undef;
    if (!open $TextFileHandle, "<", $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # NOTE(review): GetTextLine previously received \*TEXTFILE; a lexical handle
    # is also a glob reference - confirm GetTextLine doesn't rely on the name...
    $Line = GetTextLine($TextFileHandle);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFileHandle;

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
    $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
    $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }

}
| 367 | |
| 368 # Make sure specified numerical data columns are okay... | |
# Map the columns specified using "--numericaldatacols" option onto each okay
# input file and store, by file position, the matching zero-based column
# numbers and column labels in %TextFilesInfo under NumericalDataColNums and
# NumericalDataColLabels. Column numbers beyond the file's column count and
# labels not present in the file are silently dropped; files which are not
# okay get empty lists...
sub ProcessColumnsInfo {
  my($InfoKey);

  for $InfoKey (qw(NumericalDataColNums NumericalDataColLabels)) {
    @{$TextFilesInfo{$InfoKey}} = ();
  }

  for my $FileNum (0 .. $#TextFilesList) {
    my(@MatchedColNums, @MatchedColLabels);

    if ($TextFilesInfo{FileOkay}[$FileNum]) {
      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        # Specified column numbers start from 1...
        for my $OneBasedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          next if ($OneBasedColNum > $TextFilesInfo{ColCount}[$FileNum]);
          my $ZeroBasedColNum = $OneBasedColNum - 1;
          push @MatchedColNums, $ZeroBasedColNum;
          push @MatchedColLabels, $TextFilesInfo{ColLabels}[$FileNum][$ZeroBasedColNum];
        }
      }
      else {
        for my $SpecifiedLabel (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          next if (!exists($TextFilesInfo{ColLabelToNumMap}[$FileNum]{$SpecifiedLabel}));
          push @MatchedColNums, $TextFilesInfo{ColLabelToNumMap}[$FileNum]{$SpecifiedLabel};
          push @MatchedColLabels, $SpecifiedLabel;
        }
      }
    }

    @{$TextFilesInfo{NumericalDataColNums}[$FileNum]} = @MatchedColNums;
    @{$TextFilesInfo{NumericalDataColLabels}[$FileNum]} = @MatchedColLabels;
  }
}
| 408 | |
| 409 # Process option values... | |
# Process option values collected in %Options and set up %OptionsInfo.
#
# Besides copies of the individual option values, these derived flags are set:
#
#   ParseLines         - data lines need to be read and split into values
#   CountEmpty         - count empty lines and empty column values
#   CheckData          - count numerical/non-numerical values for each column
#   CheckNumericalData - check specified columns for non-numerical values
#
# Dies when mode is colnum and a value specified using "--numericaldatacols"
# option is not a positive integer...
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{All} = $Options{all} ? $Options{all} : 0;
  $OptionsInfo{Count} = $Options{count} ? $Options{count} : 0;

  $OptionsInfo{DetailLevel} = $Options{detail} ? $Options{detail} : 1;

  $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : 0;

  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{NumericalDataCols} = $Options{numericaldatacols} ? $Options{numericaldatacols} : 0;

  # Every option which inspects column values requires parsing of lines. This
  # includes "--datacheck": without it, specifying "--datacheck" alone would
  # skip parsing and report zero counts for all columns...
  $OptionsInfo{ParseLines} = ($Options{all} || $Options{datacheck} || $Options{empty} || $Options{numericaldatacols}) ? 1 : 0;
  $OptionsInfo{CountEmpty} = ($Options{all} || $Options{empty}) ? 1 : 0;
  $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
  $OptionsInfo{CheckNumericalData} = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;

  # Comma delimited list of column numbers or labels...
  @{$OptionsInfo{SpecifiedNumericalDataCols}} = ();
  if ($Options{numericaldatacols}) {
    @{$OptionsInfo{SpecifiedNumericalDataCols}} = split ",", $Options{numericaldatacols};
    if ($Options{mode} =~ /^colnum$/i) {
      my($ColNum);
      for $ColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        if (!IsPositiveInteger($ColNum)) {
          die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
        }
      }
    }
  }

}
| 444 | |
| 445 # Setup script usage and retrieve command line arguments specified using various options... | |
# Set default option values, retrieve command line options into %Options and
# validate them. When "-w --workingdir" is specified, the current directory is
# changed to it. Dies on unknown options, an invalid working directory, or an
# invalid value for "-m --mode", "--indelim" or "-d --detail"...
sub SetupScriptUsage {
  my(@OptionSpecs, $WorkingDir);

  # Default values...
  %Options = ();
  $Options{detail} = 1;
  $Options{mode} = "colnum";
  $Options{indelim} = "comma";

  @OptionSpecs = ("all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  $WorkingDir = $Options{workingdir};
  if ($WorkingDir) {
    (-d $WorkingDir)
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $WorkingDir or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  ($Options{mode} =~ /^(colnum|collabel)$/i)
    or die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";

  ($Options{indelim} =~ /^(comma|semicolon)$/i)
    or die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";

  IsPositiveInteger($Options{detail})
    or die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
}
| 472 | |
| 473 __END__ | |
| 474 | |
| 475 =head1 NAME | |
| 476 | |
| 477 InfoTextFiles.pl - List information about TextFile(s) | |
| 478 | |
| 479 =head1 SYNOPSIS | |
| 480 | |
| 481 InfoTextFiles.pl TextFile(s)... | |
| 482 | |
| 483 InfoTextFiles.pl [B<-a, --all>] [B<-c, --count>] [B<--datacheck>] [B<-d, --detail> infolevel] [B<-e, --empty>] | |
| 484 [B<-h, --help>] [B<--indelim> comma | semicolon] [B<-m, --mode> colnum | collabel] | |
| 485 [B<-n, --numericaldatacols> colnum,[colnum,...] | collabel,[collabel,...]] | |
| 486 [B<-w, --workingdir> dirname] TextFile(s)... | |
| 487 | |
| 488 =head1 DESCRIPTION | |
| 489 | |
| 490 List information about I<TextFile(s)> contents: number of lines and columns, empty | |
| 491 column values, and so on. The file names are separated by spaces. | |
| 492 The valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited | |
| 493 text files respectively. All other file names are ignored. All the text files in a | |
| 494 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory | |
| 495 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file | |
| 496 which doesn't correspond to the format indicated by B<--indelim> option is ignored. | |
| 497 | |
| 498 =head1 OPTIONS | |
| 499 | |
| 500 =over 4 | |
| 501 | |
| 502 =item B<-a, --all> | |
| 503 | |
| 504 List all the available information. | |
| 505 | |
| 506 =item B<-c, --count> | |
| 507 | |
| 508 List number of rows and columns. This is B<default behavior>. | |
| 509 | |
| 510 =item B<--datacheck> | |
| 511 | |
| 512 List number of numerical and non-numerical values for each column. | |
| 513 | |
| 514 =item B<-d, --detail> I<infolevel> | |
| 515 | |
| 516 Level of information to print about lines being ignored. Default: I<1>. Possible values: | |
| 517 I<1, 2 or 3>. | |
| 518 | |
| 519 =item B<-e, --empty> | |
| 520 | |
| 521 List number of empty row and column values. | |
| 522 | |
| 523 =item B<-h, --help> | |
| 524 | |
| 525 Print this help message. | |
| 526 | |
| 527 =item B<--indelim> I<comma | semicolon> | |
| 528 | |
| 529 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. | |
| 530 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a | |
| 531 delimiter. | |
| 532 | |
| 533 =item B<-m, --mode> I<colnum | collabel> | |
| 534 | |
| 535 Specify how to identify numerical data columns: using column number or column label. | |
| 536 Possible values: I<colnum or collabel>. Default value: I<colnum>. | |
| 537 | |
| 538 =item B<-n, --numericaldatacols> I<colnum,[colnum,...] | collabel,[collabel,...]> | |
| 539 | |
| 540 This value is mode specific. It is a list of column numbers or labels to check for | |
| 541 presence of numerical data only; otherwise, the value is flagged. Default value: I<all;all;...>. | |
| 542 | |
| 543 For I<colnum> mode, input value format is: I<colnum,...;colnum,...;...>. Example: | |
| 544 | |
| 545 1,3,5 | |
| 546 "2,4,6" | |
| 547 | |
| 548 For I<collabel> mode, input value format is: I<collabel,...;collabel,...;...>. Example: | |
| 549 | |
| 550 "MW,SumNO,SumNHOH" | |
| 551 | |
| 552 | |
| 553 =item B<-w, --workingdir> I<dirname> | |
| 554 | |
| 555 Location of working directory. Default: current directory. | |
| 556 | |
| 557 =back | |
| 558 | |
| 559 =head1 EXAMPLES | |
| 560 | |
| 561 To count number of lines and columns in Text file(s), type: | |
| 562 | |
| 563 % InfoTextFiles.pl Sample1.csv | |
| 564 % InfoTextFiles.pl Sample1.csv Sample1.tsv | |
| 565 % InfoTextFiles.pl *.csv *.tsv | |
| 566 | |
| 567 To count number of lines, columns and empty values in Sample1.csv file and print | |
| 568 detailed information, type: | |
| 569 | |
| 570 % InfoTextFiles.pl -d 3 -e Sample1.csv | |
| 571 | |
| 572 To track all available information and non-numerical values for Mol_ID and MolWeight | |
| 573 columns in Sample1.csv file and print detailed information, type: | |
| 574 | |
| 575 % InfoTextFiles.pl -d 3 -a -m collabel -n Mol_ID,MolWeight Sample1.csv | |
| 576 | |
| 577 =head1 AUTHOR | |
| 578 | |
| 579 Manish Sud <msud@san.rr.com> | |
| 580 | |
| 581 =head1 SEE ALSO | |
| 582 | |
| 583 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl | |
| 584 | |
| 585 =head1 COPYRIGHT | |
| 586 | |
| 587 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 588 | |
| 589 This file is part of MayaChemTools. | |
| 590 | |
| 591 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 592 the terms of the GNU Lesser General Public License as published by the Free | |
| 593 Software Foundation; either version 3 of the License, or (at your option) | |
| 594 any later version. | |
| 595 | |
| 596 =cut |
