Mercurial > repos > deepakjadmin > mayatool3_test2
comparison lib/TextUtil.pm @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 package TextUtil; | |
| 2 # | |
| 3 # $RCSfile: TextUtil.pm,v $ | |
| 4 # $Date: 2015/03/22 20:08:26 $ | |
| 5 # $Revision: 1.45 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability of fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use Exporter; | |
| 31 | |
| 32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |
| 33 | |
| 34 @ISA = qw(Exporter); | |
| 35 @EXPORT = qw(AddNumberSuffix ContainsWhiteSpaces GetTextLine GetTextFileDataByUniqueKey GetTextFileDataByNonUniqueKey HashCode IsEmpty IsNumberPowerOfNumber IsInteger IsPositiveInteger IsFloat IsNotEmpty IsNumerical JoinWords SplitWords QuoteAWord RemoveLeadingWhiteSpaces RemoveTrailingWhiteSpaces RemoveLeadingAndTrailingWhiteSpaces WrapText); | |
| 36 @EXPORT_OK = qw(); | |
| 37 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); | |
| 38 | |
| 39 # Add number suffix... | |
| 40 sub AddNumberSuffix { | |
| 41 my($Value) = @_; | |
| 42 my($ValueWithSuffix, $Suffix); | |
| 43 | |
| 44 $ValueWithSuffix = $Value; | |
| 45 if (!IsPositiveInteger($Value)) { | |
| 46 return $ValueWithSuffix; | |
| 47 } | |
| 48 $Suffix = "th"; | |
| 49 if ($Value < 10 || $Value > 20) { | |
| 50 my $Remainder = $Value % 10; | |
| 51 $Suffix = ($Remainder == 1) ? "st" : (($Remainder == 2) ? "nd" : (($Remainder == 3) ? "rd" : "th")); | |
| 52 } | |
| 53 $ValueWithSuffix = "${ValueWithSuffix}${Suffix}"; | |
| 54 return $ValueWithSuffix; | |
| 55 } | |
| 56 | |
| 57 # Check out the string: Doen it contain any white space characters? | |
| 58 sub ContainsWhiteSpaces { | |
| 59 my($TheString) = @_; | |
| 60 my($Status) = 0; | |
| 61 | |
| 62 if (defined($TheString) && length($TheString)) { | |
| 63 $Status = ($TheString =~ /[ \t\r\n\f]/ ) ? 1 : 0; | |
| 64 } | |
| 65 return $Status; | |
| 66 } | |
| 67 | |
| 68 # Read the line, change to UNIX new line char, and chop off new line char as well... | |
| 69 sub GetTextLine { | |
| 70 my($TextFileRef) = @_; | |
| 71 my($Line) = ''; | |
| 72 | |
| 73 # Get the next non empty line... | |
| 74 LINE: while (defined($_ = <$TextFileRef>)) { | |
| 75 # Change Windows and Mac new line char to UNIX... | |
| 76 s/(\r\n)|(\r)/\n/g; | |
| 77 | |
| 78 # Take out any new line char at the end by explicitly removing it instead of using | |
| 79 # chomp, which might not always work correctly on files generated on a system | |
| 80 # with a value of input line separator different from the current system... | |
| 81 s/\n$//g; | |
| 82 | |
| 83 # Doesn't hurt to chomp... | |
| 84 chomp; | |
| 85 | |
| 86 $Line = $_; | |
| 87 if (length $Line) { | |
| 88 last LINE; | |
| 89 } | |
| 90 } | |
| 91 return $Line; | |
| 92 } | |
| 93 | |
| 94 # Load data from a CSV file into the specified hash reference using a specific | |
| 95 # column for unique data key values. | |
| 96 # | |
| 97 # The lines starting with # are treated as comments and ignored. First line | |
| 98 # not starting with # must contain column labels and the number of columns in | |
| 99 # all other data rows must match the number of column labels. | |
| 100 # | |
| 101 # The first column is assumed to contain data key value by default; all other columns | |
| 102 # contain data as indicated in their column labels. | |
| 103 # | |
| 104 # In order to avoid dependence of data access on the specified column labels, the | |
| 105 # column data is loaded into hash with Column<Num> hash keys, where column number | |
| 106 # start from 1. The data key column is not available as Colnum<Num> hash key; | |
| 107 # | |
| 108 # The format of the data structure loaded into a specified hash reference is: | |
| 109 # | |
| 110 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys | |
| 111 # @{$TextDataMapRef->{ColLabels}} - Array of column labels | |
| 112 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs | |
| 113 # $TextDataMapRef->{NumOfCols} - Number of columns | |
| 114 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey> | |
| 115 # %{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair: <DataCol<Num>, DataKey> | |
| 116 # | |
| 117 # Caveats: | |
| 118 # . The column number start from 1. | |
| 119 # . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs. | |
| 120 # | |
| 121 sub GetTextFileDataByUniqueKey { | |
| 122 my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_; | |
| 123 | |
| 124 return _GetTextFileData("UniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim); | |
| 125 } | |
| 126 | |
| 127 # Load data from a CSV file into the specified hash reference using a specific | |
| 128 # column for non-unique data key values. | |
| 129 # | |
| 130 # The lines starting with # are treated as comments and ignored. First line | |
| 131 # not starting with # must contain column labels and the number of columns in | |
| 132 # all other data rows must match the number of column labels. | |
| 133 # | |
| 134 # The first column is assumed to contain data key value by default; all other columns | |
| 135 # contain data as indicated in their column labels. | |
| 136 # | |
| 137 # In order to avoid dependence of data access on the specified column labels, the | |
| 138 # column data is loaded into hash with Column<Num> hash keys, where column number | |
| 139 # start from 1. The data key column is not available as Colnum<Num> hash key; | |
| 140 # | |
| 141 # The format of the data structure loaded into a specified hash reference is: | |
| 142 # | |
| 143 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys | |
| 144 # @{$TextDataMapRef->{ColLabels}} - Array of column labels | |
| 145 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs | |
| 146 # $TextDataMapRef->{NumOfCols} - Number of columns | |
| 147 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey> | |
| 148 # @{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair with data as an array: <DataCol<Num>, DataKey> | |
| 149 # | |
| 150 # Caveats: | |
| 151 # . The column number start from 1. | |
| 152 # . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs. | |
| 153 # | |
| 154 sub GetTextFileDataByNonUniqueKey { | |
| 155 my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_; | |
| 156 | |
| 157 return _GetTextFileData("NonUniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim); | |
| 158 } | |
| 159 | |
| 160 # Loadtext file data using unique or non-uniqye data column key... | |
| 161 # | |
| 162 sub _GetTextFileData { | |
| 163 my($DataKeyMode, $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_; | |
| 164 my($DataKeyColIndex, $LineCount, $IgnoredLineCount, $UniqueDataKeyMode, $DataKey, $Line, $NumOfCols, $ColIndex, $ColNum, $ColID, $ColValue, @LineWords, @ColLabels, @DataColIDs, @DataColNums); | |
| 165 | |
| 166 print "\nProcessing text data file $TextDataFile...\n"; | |
| 167 | |
| 168 $UniqueDataKeyMode = 0; | |
| 169 if ($DataKeyMode =~ /^UniqueKey$/i) { | |
| 170 $UniqueDataKeyMode = 1; | |
| 171 } | |
| 172 | |
| 173 # Setup default values... | |
| 174 $DataKeyColNum = defined $DataKeyColNum ? $DataKeyColNum : 1; | |
| 175 | |
| 176 if ($TextDataFile =~ /^\.tsv$/i) { | |
| 177 $InDelim = "\t"; | |
| 178 } | |
| 179 else { | |
| 180 $InDelim = "\,"; | |
| 181 if ($InDelim =~ /^semicolon$/i) { | |
| 182 $InDelim = "\;"; | |
| 183 } | |
| 184 } | |
| 185 | |
| 186 ($LineCount, $IgnoredLineCount) = (0) x 2; | |
| 187 | |
| 188 open TEXTDATAFILE, "$TextDataFile" or die "Couldn't open $TextDataFile: $! ..."; | |
| 189 | |
| 190 # Skip lines up to column labels... | |
| 191 LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) { | |
| 192 $LineCount++; | |
| 193 if ($Line =~ /^#/) { | |
| 194 $IgnoredLineCount++; | |
| 195 } | |
| 196 else { | |
| 197 last LINE; | |
| 198 } | |
| 199 } | |
| 200 | |
| 201 # Initialize data map... | |
| 202 %{$TextDataMapRef} = (); | |
| 203 @{$TextDataMapRef->{DataKeys}} = (); | |
| 204 @{$TextDataMapRef->{ColLabels}} = (); | |
| 205 @{$TextDataMapRef->{DataColIDs}} = (); | |
| 206 $TextDataMapRef->{NumOfCols} = undef; | |
| 207 | |
| 208 # Process column labels... | |
| 209 @ColLabels= quotewords($InDelim, 0, $Line); | |
| 210 $NumOfCols = @ColLabels; | |
| 211 | |
| 212 if ($DataKeyColNum < 1 || $DataKeyColNum > $NumOfCols) { | |
| 213 warn "Warning: Ignoring text data file $TextDataFile: Invalid data key column number, $DataKeyColNum, specified. It must be > 0 or <= $NumOfCols, number of columns in the text file ..."; | |
| 214 return; | |
| 215 } | |
| 216 $DataKeyColIndex = $DataKeyColNum - 1; | |
| 217 | |
| 218 $TextDataMapRef->{NumOfCols} = $NumOfCols; | |
| 219 push @{$TextDataMapRef->{ColLabels}}, @ColLabels; | |
| 220 | |
| 221 # Set up column data IDs for tracking the data... | |
| 222 @DataColNums = (); | |
| 223 @DataColIDs = (); | |
| 224 COLNUM: for $ColNum (1 .. $NumOfCols) { | |
| 225 if ($ColNum == $DataKeyColNum) { | |
| 226 next COLNUM; | |
| 227 } | |
| 228 push @DataColNums, $ColNum; | |
| 229 $ColID = "DataCol${ColNum}"; | |
| 230 push @DataColIDs, $ColID; | |
| 231 } | |
| 232 push @{$TextDataMapRef->{DataColIDs}}, @DataColIDs; | |
| 233 | |
| 234 # Initialize column data hash... | |
| 235 %{$TextDataMapRef->{DataKey}} = (); | |
| 236 for $ColIndex (0 .. $#DataColNums) { | |
| 237 $ColNum = $DataColNums[$ColIndex]; | |
| 238 $ColID = $DataColIDs[$ColIndex]; | |
| 239 %{$TextDataMapRef->{$ColID}} = (); | |
| 240 } | |
| 241 | |
| 242 LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) { | |
| 243 $LineCount++; | |
| 244 if ($Line =~ /^#/) { | |
| 245 $IgnoredLineCount++; | |
| 246 next LINE; | |
| 247 } | |
| 248 | |
| 249 @LineWords = quotewords($InDelim, 0, $Line); | |
| 250 if (@LineWords != $NumOfCols) { | |
| 251 $IgnoredLineCount++; | |
| 252 warn "Warning: The number of data fields, @LineWords, in $TextDataFile must be $NumOfCols.\nIgnoring line number $LineCount: $Line...\n"; | |
| 253 next LINE; | |
| 254 } | |
| 255 $DataKey = $LineWords[$DataKeyColIndex]; | |
| 256 | |
| 257 if ($UniqueDataKeyMode) { | |
| 258 if (exists $TextDataMapRef->{DataKey}{$DataKey}) { | |
| 259 $IgnoredLineCount++; | |
| 260 warn "Warning: The data key, $DataKey, in data column key number, $DataKeyColNum, is already present.\nIgnoring line number $LineCount: $Line...\n"; | |
| 261 next LINE; | |
| 262 } | |
| 263 push @{$TextDataMapRef->{DataKeys}}, $DataKey; | |
| 264 $TextDataMapRef->{DataKey}{$DataKey} = $DataKey; | |
| 265 } | |
| 266 else { | |
| 267 if (!exists $TextDataMapRef->{DataKey}{$DataKey}) { | |
| 268 push @{$TextDataMapRef->{DataKeys}}, $DataKey; | |
| 269 $TextDataMapRef->{DataKey}{$DataKey} = $DataKey; | |
| 270 | |
| 271 for $ColIndex (0 .. $#DataColNums) { | |
| 272 $ColNum = $DataColNums[$ColIndex]; | |
| 273 $ColID = $DataColIDs[$ColIndex]; | |
| 274 @{$TextDataMapRef->{$ColID}{$DataKey}} = (); | |
| 275 } | |
| 276 } | |
| 277 } | |
| 278 | |
| 279 # Track column data values... | |
| 280 for $ColIndex (0 .. $#DataColNums) { | |
| 281 $ColID = $DataColIDs[$ColIndex]; | |
| 282 | |
| 283 $ColNum = $DataColNums[$ColIndex]; | |
| 284 $ColValue = $LineWords[$ColNum - 1]; | |
| 285 | |
| 286 if ($UniqueDataKeyMode) { | |
| 287 $TextDataMapRef->{$ColID}{$DataKey} = $ColValue; | |
| 288 } | |
| 289 else { | |
| 290 push @{$TextDataMapRef->{$ColID}{$DataKey}}, $ColValue; | |
| 291 } | |
| 292 } | |
| 293 | |
| 294 } | |
| 295 | |
| 296 print "\nTotal number of lines in file $TextDataFile: $LineCount\n"; | |
| 297 print "Total number of lines ignored: $IgnoredLineCount\n"; | |
| 298 | |
| 299 close TEXTDATAFILE; | |
| 300 } | |
| 301 | |
| 302 # Returns a 32 bit integer hash code using One-at-a-time algorithm By Bob Jenkins [Ref 38]. It's also implemented in | |
| 303 # Perl for internal hash keys in hv.h include file. | |
| 304 # | |
| 305 # It's not clear how to force Perl perform unsigned integer arithmetic irrespective of the OS/Platform and | |
| 306 # the value of use64bitint flag used during its compilation. | |
| 307 # | |
| 308 # In order to generate a consistent 32 bit has code across OS/platforms, the following methodology appear | |
| 309 # to work: | |
| 310 # | |
| 311 # o Use MaxHashCodeMask to retrieve appropriate bits after left shifting by bit operators and additions | |
| 312 # o Stay away from "use integer" to avoid signed integer arithmetic for bit operators | |
| 313 # | |
| 314 # | |
| 315 # MaxHashCodeMask (2147483647) corresponds to the maximum value which can be stored in 31 bits | |
| 316 # | |
| 317 my($MaxHashCodeMask); | |
| 318 $MaxHashCodeMask = 2**31 - 1; | |
| 319 | |
| 320 sub HashCode { | |
| 321 my($String) = @_; | |
| 322 my($HashCode, $Value, $ShiftedHashCode); | |
| 323 | |
| 324 $HashCode = 0; | |
| 325 for $Value (unpack('C*', $String)) { | |
| 326 $HashCode += $Value; | |
| 327 | |
| 328 $ShiftedHashCode = $HashCode << 10; | |
| 329 if ($ShiftedHashCode > $MaxHashCodeMask) { | |
| 330 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask; | |
| 331 } | |
| 332 | |
| 333 $HashCode += $ShiftedHashCode; | |
| 334 if ($HashCode > $MaxHashCodeMask) { | |
| 335 $HashCode = $HashCode & $MaxHashCodeMask; | |
| 336 } | |
| 337 | |
| 338 $HashCode ^= ($HashCode >> 6); | |
| 339 } | |
| 340 | |
| 341 $ShiftedHashCode = $HashCode << 3; | |
| 342 if ($ShiftedHashCode > $MaxHashCodeMask) { | |
| 343 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask; | |
| 344 } | |
| 345 | |
| 346 $HashCode += $ShiftedHashCode; | |
| 347 if ($HashCode > $MaxHashCodeMask) { | |
| 348 $HashCode = $HashCode & $MaxHashCodeMask; | |
| 349 } | |
| 350 $HashCode ^= ($HashCode >> 11); | |
| 351 | |
| 352 $ShiftedHashCode = $HashCode << 15; | |
| 353 if ($ShiftedHashCode > $MaxHashCodeMask) { | |
| 354 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask; | |
| 355 } | |
| 356 | |
| 357 $HashCode += $ShiftedHashCode; | |
| 358 if ($HashCode > $MaxHashCodeMask) { | |
| 359 $HashCode = $HashCode & $MaxHashCodeMask; | |
| 360 } | |
| 361 return $HashCode; | |
| 362 } | |
| 363 | |
| 364 # Check out the string: Is it defined and has a non zero length? | |
| 365 sub IsEmpty { | |
| 366 my($TheString) = @_; | |
| 367 my($Status) = 1; | |
| 368 | |
| 369 $Status = (defined($TheString) && length($TheString)) ? 0 : 1; | |
| 370 | |
| 371 return $Status; | |
| 372 } | |
| 373 | |
| 374 # Is first specified number power of second specified number... | |
| 375 sub IsNumberPowerOfNumber { | |
| 376 my($FirstNum, $SecondNum) = @_; | |
| 377 my($PowerValue); | |
| 378 | |
| 379 $PowerValue = log($FirstNum)/log($SecondNum); | |
| 380 | |
| 381 return IsInteger($PowerValue) ? 1 : 0; | |
| 382 } | |
| 383 | |
| 384 # Check out the string: Is it an integer? | |
| 385 sub IsInteger { | |
| 386 my($TheString) = @_; | |
| 387 my($Status) = 0; | |
| 388 | |
| 389 if (defined($TheString) && length($TheString)) { | |
| 390 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString); | |
| 391 $TheString =~ s/^[+-]//; | |
| 392 $Status = ($TheString =~ /[^0-9]/) ? 0 : 1; | |
| 393 } | |
| 394 return $Status; | |
| 395 } | |
| 396 | |
| 397 # Check out the string: Is it an integer with value > 0? | |
| 398 sub IsPositiveInteger { | |
| 399 my($TheString) = @_; | |
| 400 my($Status) = 0; | |
| 401 | |
| 402 $Status = IsInteger($TheString) ? ($TheString > 0 ? 1 : 0) : 0; | |
| 403 | |
| 404 return $Status; | |
| 405 } | |
| 406 | |
| 407 | |
| 408 # Check out the string: Is it a float? | |
| 409 sub IsFloat { | |
| 410 my($TheString) = @_; | |
| 411 my($Status) = 0; | |
| 412 | |
| 413 if (defined($TheString) && length($TheString)) { | |
| 414 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString); | |
| 415 $TheString =~ s/^[+-]//; | |
| 416 $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1); | |
| 417 } | |
| 418 return $Status; | |
| 419 } | |
| 420 | |
| 421 # Check out the string: Is it defined and has a non zero length? | |
| 422 sub IsNotEmpty { | |
| 423 my($TheString) = @_; | |
| 424 my($Status); | |
| 425 | |
| 426 $Status = IsEmpty($TheString) ? 0 : 1; | |
| 427 | |
| 428 return $Status; | |
| 429 } | |
| 430 | |
| 431 # Check out the string: Does it only contain numerical data? | |
| 432 sub IsNumerical { | |
| 433 my($TheString) = @_; | |
| 434 my($Status) = 0; | |
| 435 | |
| 436 if (defined($TheString) && length($TheString)) { | |
| 437 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString); | |
| 438 $TheString =~ s/^[+-]//; | |
| 439 $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1); | |
| 440 } | |
| 441 return $Status; | |
| 442 } | |
| 443 | |
| 444 # Join different words using delimiter and quote parameters. And return as | |
| 445 # a string value. | |
| 446 sub JoinWords { | |
| 447 my($Words, $Delim, $Quote) = @_; | |
| 448 | |
| 449 if (!@$Words) { | |
| 450 return ""; | |
| 451 } | |
| 452 | |
| 453 $Quote = $Quote ? "\"" : ""; | |
| 454 my(@NewWords) = map { (defined($_) && length($_)) ? "${Quote}$_${Quote}" : "${Quote}${Quote}" } @$Words; | |
| 455 | |
| 456 return join $Delim, @NewWords; | |
| 457 } | |
| 458 | |
| 459 # Split string value containing quoted or unquoted words in to an array containing | |
| 460 # unquoted words. | |
| 461 # | |
| 462 # This function is used to split strings generated by JoinWords. | |
| 463 # | |
| 464 sub SplitWords { | |
| 465 my($Line, $Delim) = @_; | |
| 466 | |
| 467 if (!$Line) { | |
| 468 return (); | |
| 469 } | |
| 470 | |
| 471 # Is it a quoted string? | |
| 472 if ($Line =~ /^\"/) { | |
| 473 # Take out first and last quote... | |
| 474 $Line =~ s/^\"//; $Line =~ s/\"$//; | |
| 475 | |
| 476 $Delim = "\"$Delim\""; | |
| 477 } | |
| 478 return split /$Delim/, $Line; | |
| 479 } | |
| 480 | |
| 481 # Based on quote parameter, figure out what to do | |
| 482 sub QuoteAWord { | |
| 483 my($Word, $Quote) = @_; | |
| 484 my($QuotedWord); | |
| 485 | |
| 486 $QuotedWord = ""; | |
| 487 if ($Word) { | |
| 488 $QuotedWord = $Word; | |
| 489 } | |
| 490 if ($Quote) { | |
| 491 $QuotedWord = "\"$QuotedWord\""; | |
| 492 } | |
| 493 return ($QuotedWord); | |
| 494 } | |
| 495 | |
| 496 # Remove leading white space characters from the string... | |
| 497 sub RemoveLeadingWhiteSpaces { | |
| 498 my($InString) = @_; | |
| 499 my($OutString, $TrailingString, $LeadingWhiteSpace); | |
| 500 | |
| 501 $OutString = $InString; | |
| 502 if (length($InString) && ContainsWhiteSpaces($InString)) { | |
| 503 $OutString =~ s/^([ \t\r\n\f]*)(.*?)$/$2/; | |
| 504 } | |
| 505 return $OutString; | |
| 506 } | |
| 507 | |
| 508 # Remove Trailing white space characters from the string... | |
| 509 sub RemoveTrailingWhiteSpaces { | |
| 510 my($InString) = @_; | |
| 511 my($OutString, $LeadingString, $TrailingWhiteSpace); | |
| 512 | |
| 513 $OutString = $InString; | |
| 514 if (length($InString) && ContainsWhiteSpaces($InString)) { | |
| 515 $OutString =~ s/^(.*?)([ \t\r\n\f]*)$/$1/; | |
| 516 } | |
| 517 return $OutString; | |
| 518 } | |
| 519 | |
| 520 # Remove both leading and trailing white space characters from the string... | |
| 521 sub RemoveLeadingAndTrailingWhiteSpaces { | |
| 522 my($InString) = @_; | |
| 523 my($OutString); | |
| 524 | |
| 525 $OutString = $InString; | |
| 526 if (length($InString) && ContainsWhiteSpaces($InString)) { | |
| 527 $OutString =~ s/^([ \t\r\n\f]*)(.*?)([ \t\r\n\f]*)$/$2/; | |
| 528 } | |
| 529 return $OutString; | |
| 530 } | |
| 531 | |
| 532 # Wrap text string... | |
| 533 sub WrapText { | |
| 534 my($InString, $WrapLength, $WrapDelimiter); | |
| 535 my($OutString); | |
| 536 | |
| 537 $WrapLength = 40; | |
| 538 $WrapDelimiter = "\n"; | |
| 539 if (@_ == 3) { | |
| 540 ($InString, $WrapLength, $WrapDelimiter) = @_; | |
| 541 } | |
| 542 elsif (@_ == 2) { | |
| 543 ($InString, $WrapLength) = @_; | |
| 544 } | |
| 545 else { | |
| 546 ($InString, $WrapLength) = @_; | |
| 547 } | |
| 548 $OutString = $InString; | |
| 549 if ($InString && (length($InString) > $WrapLength)) { | |
| 550 $OutString = ""; | |
| 551 my($Index, $Length, $FirstPiece, $StringPiece); | |
| 552 $Index = 0; $Length = length($InString); | |
| 553 $FirstPiece = 1; | |
| 554 for ($Index = 0; $Index < $Length; $Index += $WrapLength) { | |
| 555 if (($Index + $WrapLength) < $Length) { | |
| 556 $StringPiece = substr($InString, $Index, $WrapLength); | |
| 557 } | |
| 558 else { | |
| 559 # Last piece of the string... | |
| 560 $StringPiece = substr($InString, $Index, $WrapLength); | |
| 561 } | |
| 562 if ($FirstPiece) { | |
| 563 $FirstPiece = 0; | |
| 564 $OutString = $StringPiece; | |
| 565 } | |
| 566 else { | |
| 567 $OutString .= "${WrapDelimiter}${StringPiece}"; | |
| 568 } | |
| 569 } | |
| 570 } | |
| 571 return $OutString; | |
| 572 } | |
| 573 | |
| 574 1; | |
| 575 | |
| 576 __END__ | |
| 577 | |
| 578 =head1 NAME | |
| 579 | |
| 580 TextUtil | |
| 581 | |
| 582 =head1 SYNOPSIS | |
| 583 | |
| 584 use TextUtil; | |
| 585 | |
| 586 use TextUtil qw(:all); | |
| 587 | |
| 588 =head1 DESCRIPTION | |
| 589 | |
| 590 B<TextUtil> module provides the following functions: | |
| 591 | |
| 592 AddNumberSuffix, ContainsWhiteSpaces, GetTextFileDataByNonUniqueKey, | |
| 593 GetTextFileDataByUniqueKey, GetTextLine, HashCode, IsEmpty, IsFloat, IsInteger, | |
| 594 IsNotEmpty, IsNumberPowerOfNumber, IsNumerical, IsPositiveInteger, JoinWords, | |
| 595 QuoteAWord, RemoveLeadingAndTrailingWhiteSpaces, RemoveLeadingWhiteSpaces, | |
| 596 RemoveTrailingWhiteSpaces, SplitWords, WrapText | |
| 597 | |
| 598 =head1 FUNCTIONS | |
| 599 | |
| 600 =over 4 | |
| 601 | |
| 602 =item B<AddNumberSuffix> | |
| 603 | |
| 604 $NumberWithSuffix = AddNumberSuffix($IntegerValue); | |
| 605 | |
| 606 Returns number with appropriate suffix: 0, 1st, 2nd, 3rd, 4th, and so on. | |
| 607 | |
| 608 =item B<ContainsWhiteSpaces> | |
| 609 | |
| 610 $Status = ContainsWhiteSpaces($TheString); | |
| 611 | |
| 612 Returns 1 or 0 based on whether the string contains any white spaces. | |
| 613 | |
| 614 =item B<GetTextLine> | |
| 615 | |
| 616 $Line = GetTextLine(\*TEXTFILE); | |
| 617 | |
| 618 Reads next line from an already opened text file, takes out any carriage return, | |
| 619 and returns it as a string. NULL is returned for EOF. | |
| 620 | |
| 621 =item B<GetTextFileDataByNonUniqueKey> | |
| 622 | |
| 623 GetTextFileDataByNonUniqueKey($TextDataFile, $TextDataMapRef, | |
| 624 $DataKeyColNum, $InDelim); | |
| 625 | |
| 626 Load data from a text file into the specified hash reference using a specific | |
| 627 column for non-unique data key values. | |
| 628 | |
| 629 The lines starting with # are treated as comments and ignored. First line | |
| 630 not starting with # must contain column labels and the number of columns in | |
| 631 all other data rows must match the number of column labels. | |
| 632 | |
| 633 The first column is assumed to contain data key value by default; all other columns | |
| 634 contain data as indicated in their column labels. | |
| 635 | |
| 636 In order to avoid dependence of data access on the specified column labels, the | |
| 637 column data is loaded into hash with Column<Num> hash keys, where column number | |
| 638 start from 1. The data key column is not available as Colnum<Num> hash key; | |
| 639 | |
| 640 The format of the data structure loaded into a specified hash reference is: | |
| 641 | |
| 642 @{$TextDataMapRef->{DataKeys}} - Array of unique data keys | |
| 643 @{$TextDataMapRef->{ColLabels}} - Array of column labels | |
| 644 @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs | |
| 645 $TextDataMapRef->{NumOfCols} - Number of columns | |
| 646 %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey> | |
| 647 @{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair with data as an array: | |
| 648 <DataCol<Num>, DataKey> | |
| 649 | |
| 650 =item B<GetTextFileDataByUniqueKey> | |
| 651 | |
| 652 GetTextFileDataByUniqueKey($TextDataFile, $TextDataMapRef, $DataKeyColNum, | |
| 653 $InDelim); | |
| 654 | |
| 655 Load data from a text file into the specified hash reference using a a specific | |
| 656 column for unique data key values. | |
| 657 | |
| 658 The lines starting with # are treated as comments and ignored. First line | |
| 659 not starting with # must contain column labels and the number of columns in | |
| 660 all other data rows must match the number of column labels. | |
| 661 | |
| 662 The first column is assumed to contain data key value by default; all other columns | |
| 663 contain data as indicated in their column labels. | |
| 664 | |
| 665 In order to avoid dependence of data access on the specified column labels, the | |
| 666 column data is loaded into hash with Column<Num> hash keys, where column number | |
| 667 start from 1. The data key column is not available as Colnum<Num> hash key; | |
| 668 | |
| 669 The format of the data structure loaded into a specified hash reference is: | |
| 670 | |
| 671 @{$TextDataMapRef->{DataKeys}} - Array of unique data keys | |
| 672 @{$TextDataMapRef->{ColLabels}} - Array of column labels | |
| 673 @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs | |
| 674 $TextDataMapRef->{NumOfCols} - Number of columns | |
| 675 %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey> | |
| 676 %{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair: <DataCol<Num>, DataKey> | |
| 677 | |
| 678 =item B<HashCode> | |
| 679 | |
| 680 $HashCode = HashCode($TheString); | |
| 681 | |
| 682 Returns a 32 bit integer hash code using One-at-a-time algorithm By Bob Jenkins [Ref 38]. | |
| 683 It's also implemented in Perl for internal hash keys in hv.h include file. | |
| 684 | |
| 685 =item B<IsEmpty> | |
| 686 | |
| 687 $Status = IsEmpty($TheString); | |
| 688 | |
| 689 Returns 1 or 0 based on whether the string is empty. | |
| 690 | |
| 691 =item B<IsInteger> | |
| 692 | |
| 693 $Status = IsInteger($TheString); | |
| 694 | |
| 695 Returns 1 or 0 based on whether the string is a positive integer. | |
| 696 | |
| 697 =item B<IsPositiveInteger> | |
| 698 | |
| 699 $Status = IsPositiveInteger($TheString); | |
| 700 | |
| 701 Returns 1 or 0 based on whether the string is an integer. | |
| 702 | |
| 703 =item B<IsFloat> | |
| 704 | |
| 705 $Status = IsFloat($TheString); | |
| 706 | |
| 707 Returns 1 or 0 based on whether the string is a float. | |
| 708 | |
| 709 =item B<IsNotEmpty> | |
| 710 | |
| 711 $Status = IsNotEmpty($TheString); | |
| 712 | |
| 713 Returns 0 or 1 based on whether the string is empty. | |
| 714 | |
| 715 =item B<IsNumerical> | |
| 716 | |
| 717 $Status = IsNumerical($TheString); | |
| 718 | |
| 719 Returns 1 or 0 based on whether the string is a number. | |
| 720 | |
| 721 =item B<IsNumberPowerOfNumber> | |
| 722 | |
| 723 $Status = IsNumberPowerOfNumber($FirstNum, $SecondNum); | |
| 724 | |
| 725 Returns 1 or 0 based on whether the first number is a power of second number. | |
| 726 | |
| 727 =item B<JoinWords> | |
| 728 | |
| 729 $JoinedWords = JoinWords($Words, $Delim, $Quote); | |
| 730 | |
| 731 Joins different words using delimiter and quote parameters, and returns it | |
| 732 as a string. | |
| 733 | |
| 734 =item B<QuoteAWord> | |
| 735 | |
| 736 $QuotedWord = QuoteAWord($Word, $Quote); | |
| 737 | |
| 738 Returns a quoted string based on I<Quote> value. | |
| 739 | |
| 740 =item B<RemoveLeadingWhiteSpaces> | |
| 741 | |
| 742 $OutString = RemoveLeadingWhiteSpaces($InString); | |
| 743 | |
| 744 Returns a string without any leading and traling white spaces. | |
| 745 | |
| 746 =item B<RemoveTrailingWhiteSpaces> | |
| 747 | |
| 748 $OutString = RemoveTrailingWhiteSpaces($InString); | |
| 749 | |
| 750 Returns a string without any trailing white spaces. | |
| 751 | |
| 752 =item B<RemoveLeadingAndTrailingWhiteSpaces> | |
| 753 | |
| 754 $OutString = RemoveLeadingAndTrailingWhiteSpaces($InString); | |
| 755 | |
| 756 Returns a string without any leading and traling white spaces. | |
| 757 | |
| 758 =item B<SplitWords> | |
| 759 | |
| 760 @Words = SplitWords($Line, $Delimiter); | |
| 761 | |
| 762 Returns an array I<Words> ontaining unquoted words generated after spliting | |
| 763 string value I<Line> containing quoted or unquoted words. | |
| 764 | |
| 765 This function is used to split strings generated by JoinWords as replacement | |
| 766 for Perl's core module funtion Text::ParseWords::quotewords() which dumps core | |
| 767 on very long strings. | |
| 768 | |
| 769 =item B<WrapText> | |
| 770 | |
| 771 $OutString = WrapText($InString, [$WrapLength, $WrapDelimiter]); | |
| 772 | |
| 773 Returns a wrapped string. By default, I<WrapLenght> is I<40> and I<WrapDelimiter> | |
| 774 is Unix new line character. | |
| 775 | |
| 776 =back | |
| 777 | |
| 778 =head1 AUTHOR | |
| 779 | |
| 780 Manish Sud <msud@san.rr.com> | |
| 781 | |
| 782 =head1 SEE ALSO | |
| 783 | |
| 784 FileUtil.pm | |
| 785 | |
| 786 =head1 COPYRIGHT | |
| 787 | |
| 788 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 789 | |
| 790 This file is part of MayaChemTools. | |
| 791 | |
| 792 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 793 the terms of the GNU Lesser General Public License as published by the Free | |
| 794 Software Foundation; either version 3 of the License, or (at your option) | |
| 795 any later version. | |
| 796 | |
| 797 =cut |
