1 package TextUtil; 2 # 3 # $RCSfile: TextUtil.pm,v $ 4 # $Date: 2015/03/22 20:08:26 $ 5 # $Revision: 1.45 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use Exporter; 31 32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 33 34 @ISA = qw(Exporter); 35 @EXPORT = qw(AddNumberSuffix ContainsWhiteSpaces GetTextLine GetTextFileDataByUniqueKey GetTextFileDataByNonUniqueKey HashCode IsEmpty IsNumberPowerOfNumber IsInteger IsPositiveInteger IsFloat IsNotEmpty IsNumerical JoinWords SplitWords QuoteAWord RemoveLeadingWhiteSpaces RemoveTrailingWhiteSpaces RemoveLeadingAndTrailingWhiteSpaces WrapText); 36 @EXPORT_OK = qw(); 37 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 38 39 # Add number suffix... 40 sub AddNumberSuffix { 41 my($Value) = @_; 42 my($ValueWithSuffix, $Suffix); 43 44 $ValueWithSuffix = $Value; 45 if (!IsPositiveInteger($Value)) { 46 return $ValueWithSuffix; 47 } 48 $Suffix = "th"; 49 if ($Value < 10 || $Value > 20) { 50 my $Remainder = $Value % 10; 51 $Suffix = ($Remainder == 1) ? "st" : (($Remainder == 2) ? "nd" : (($Remainder == 3) ? "rd" : "th")); 52 } 53 $ValueWithSuffix = "${ValueWithSuffix}${Suffix}"; 54 return $ValueWithSuffix; 55 } 56 57 # Check out the string: Doen it contain any white space characters? 58 sub ContainsWhiteSpaces { 59 my($TheString) = @_; 60 my($Status) = 0; 61 62 if (defined($TheString) && length($TheString)) { 63 $Status = ($TheString =~ /[ \t\r\n\f]/ ) ? 1 : 0; 64 } 65 return $Status; 66 } 67 68 # Read the line, change to UNIX new line char, and chop off new line char as well... 69 sub GetTextLine { 70 my($TextFileRef) = @_; 71 my($Line) = ''; 72 73 # Get the next non empty line... 74 LINE: while (defined($_ = <$TextFileRef>)) { 75 # Change Windows and Mac new line char to UNIX... 76 s/(\r\n)|(\r)/\n/g; 77 78 # Take out any new line char at the end by explicitly removing it instead of using 79 # chomp, which might not always work correctly on files generated on a system 80 # with a value of input line separator different from the current system... 81 s/\n$//g; 82 83 # Doesn't hurt to chomp... 84 chomp; 85 86 $Line = $_; 87 if (length $Line) { 88 last LINE; 89 } 90 } 91 return $Line; 92 } 93 94 # Load data from a CSV file into the specified hash reference using a specific 95 # column for unique data key values. 96 # 97 # The lines starting with # are treated as comments and ignored. First line 98 # not starting with # must contain column labels and the number of columns in 99 # all other data rows must match the number of column labels. 100 # 101 # The first column is assumed to contain data key value by default; all other columns 102 # contain data as indicated in their column labels. 103 # 104 # In order to avoid dependence of data access on the specified column labels, the 105 # column data is loaded into hash with Column<Num> hash keys, where column number 106 # start from 1. The data key column is not available as Colnum<Num> hash key; 107 # 108 # The format of the data structure loaded into a specified hash reference is: 109 # 110 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys 111 # @{$TextDataMapRef->{ColLabels}} - Array of column labels 112 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs 113 # $TextDataMapRef->{NumOfCols} - Number of columns 114 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey> 115 # %{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair: <DataCol<Num>, DataKey> 116 # 117 # Caveats: 118 # . The column number start from 1. 119 # . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs. 120 # 121 sub GetTextFileDataByUniqueKey { 122 my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_; 123 124 return _GetTextFileData("UniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim); 125 } 126 127 # Load data from a CSV file into the specified hash reference using a specific 128 # column for non-unique data key values. 129 # 130 # The lines starting with # are treated as comments and ignored. First line 131 # not starting with # must contain column labels and the number of columns in 132 # all other data rows must match the number of column labels. 133 # 134 # The first column is assumed to contain data key value by default; all other columns 135 # contain data as indicated in their column labels. 136 # 137 # In order to avoid dependence of data access on the specified column labels, the 138 # column data is loaded into hash with Column<Num> hash keys, where column number 139 # start from 1. The data key column is not available as Colnum<Num> hash key; 140 # 141 # The format of the data structure loaded into a specified hash reference is: 142 # 143 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys 144 # @{$TextDataMapRef->{ColLabels}} - Array of column labels 145 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs 146 # $TextDataMapRef->{NumOfCols} - Number of columns 147 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey> 148 # @{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair with data as an array: <DataCol<Num>, DataKey> 149 # 150 # Caveats: 151 # . The column number start from 1. 152 # . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs. 153 # 154 sub GetTextFileDataByNonUniqueKey { 155 my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_; 156 157 return _GetTextFileData("NonUniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim); 158 } 159 160 # Loadtext file data using unique or non-uniqye data column key... 161 # 162 sub _GetTextFileData { 163 my($DataKeyMode, $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_; 164 my($DataKeyColIndex, $LineCount, $IgnoredLineCount, $UniqueDataKeyMode, $DataKey, $Line, $NumOfCols, $ColIndex, $ColNum, $ColID, $ColValue, @LineWords, @ColLabels, @DataColIDs, @DataColNums); 165 166 print "\nProcessing text data file $TextDataFile...\n"; 167 168 $UniqueDataKeyMode = 0; 169 if ($DataKeyMode =~ /^UniqueKey$/i) { 170 $UniqueDataKeyMode = 1; 171 } 172 173 # Setup default values... 174 $DataKeyColNum = defined $DataKeyColNum ? $DataKeyColNum : 1; 175 176 if ($TextDataFile =~ /^\.tsv$/i) { 177 $InDelim = "\t"; 178 } 179 else { 180 $InDelim = "\,"; 181 if ($InDelim =~ /^semicolon$/i) { 182 $InDelim = "\;"; 183 } 184 } 185 186 ($LineCount, $IgnoredLineCount) = (0) x 2; 187 188 open TEXTDATAFILE, "$TextDataFile" or die "Couldn't open $TextDataFile: $! ..."; 189 190 # Skip lines up to column labels... 191 LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) { 192 $LineCount++; 193 if ($Line =~ /^#/) { 194 $IgnoredLineCount++; 195 } 196 else { 197 last LINE; 198 } 199 } 200 201 # Initialize data map... 202 %{$TextDataMapRef} = (); 203 @{$TextDataMapRef->{DataKeys}} = (); 204 @{$TextDataMapRef->{ColLabels}} = (); 205 @{$TextDataMapRef->{DataColIDs}} = (); 206 $TextDataMapRef->{NumOfCols} = undef; 207 208 # Process column labels... 209 @ColLabels= quotewords($InDelim, 0, $Line); 210 $NumOfCols = @ColLabels; 211 212 if ($DataKeyColNum < 1 || $DataKeyColNum > $NumOfCols) { 213 warn "Warning: Ignoring text data file $TextDataFile: Invalid data key column number, $DataKeyColNum, specified. It must be > 0 or <= $NumOfCols, number of columns in the text file ..."; 214 return; 215 } 216 $DataKeyColIndex = $DataKeyColNum - 1; 217 218 $TextDataMapRef->{NumOfCols} = $NumOfCols; 219 push @{$TextDataMapRef->{ColLabels}}, @ColLabels; 220 221 # Set up column data IDs for tracking the data... 222 @DataColNums = (); 223 @DataColIDs = (); 224 COLNUM: for $ColNum (1 .. $NumOfCols) { 225 if ($ColNum == $DataKeyColNum) { 226 next COLNUM; 227 } 228 push @DataColNums, $ColNum; 229 $ColID = "DataCol${ColNum}"; 230 push @DataColIDs, $ColID; 231 } 232 push @{$TextDataMapRef->{DataColIDs}}, @DataColIDs; 233 234 # Initialize column data hash... 235 %{$TextDataMapRef->{DataKey}} = (); 236 for $ColIndex (0 .. $#DataColNums) { 237 $ColNum = $DataColNums[$ColIndex]; 238 $ColID = $DataColIDs[$ColIndex]; 239 %{$TextDataMapRef->{$ColID}} = (); 240 } 241 242 LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) { 243 $LineCount++; 244 if ($Line =~ /^#/) { 245 $IgnoredLineCount++; 246 next LINE; 247 } 248 249 @LineWords = quotewords($InDelim, 0, $Line); 250 if (@LineWords != $NumOfCols) { 251 $IgnoredLineCount++; 252 warn "Warning: The number of data fields, @LineWords, in $TextDataFile must be $NumOfCols.\nIgnoring line number $LineCount: $Line...\n"; 253 next LINE; 254 } 255 $DataKey = $LineWords[$DataKeyColIndex]; 256 257 if ($UniqueDataKeyMode) { 258 if (exists $TextDataMapRef->{DataKey}{$DataKey}) { 259 $IgnoredLineCount++; 260 warn "Warning: The data key, $DataKey, in data column key number, $DataKeyColNum, is already present.\nIgnoring line number $LineCount: $Line...\n"; 261 next LINE; 262 } 263 push @{$TextDataMapRef->{DataKeys}}, $DataKey; 264 $TextDataMapRef->{DataKey}{$DataKey} = $DataKey; 265 } 266 else { 267 if (!exists $TextDataMapRef->{DataKey}{$DataKey}) { 268 push @{$TextDataMapRef->{DataKeys}}, $DataKey; 269 $TextDataMapRef->{DataKey}{$DataKey} = $DataKey; 270 271 for $ColIndex (0 .. $#DataColNums) { 272 $ColNum = $DataColNums[$ColIndex]; 273 $ColID = $DataColIDs[$ColIndex]; 274 @{$TextDataMapRef->{$ColID}{$DataKey}} = (); 275 } 276 } 277 } 278 279 # Track column data values... 280 for $ColIndex (0 .. $#DataColNums) { 281 $ColID = $DataColIDs[$ColIndex]; 282 283 $ColNum = $DataColNums[$ColIndex]; 284 $ColValue = $LineWords[$ColNum - 1]; 285 286 if ($UniqueDataKeyMode) { 287 $TextDataMapRef->{$ColID}{$DataKey} = $ColValue; 288 } 289 else { 290 push @{$TextDataMapRef->{$ColID}{$DataKey}}, $ColValue; 291 } 292 } 293 294 } 295 296 print "\nTotal number of lines in file $TextDataFile: $LineCount\n"; 297 print "Total number of lines ignored: $IgnoredLineCount\n"; 298 299 close TEXTDATAFILE; 300 } 301 302 # Returns a 32 bit integer hash code using One-at-a-time algorithm By Bob Jenkins [Ref 38]. It's also implemented in 303 # Perl for internal hash keys in hv.h include file. 304 # 305 # It's not clear how to force Perl perform unsigned integer arithmetic irrespective of the OS/Platform and 306 # the value of use64bitint flag used during its compilation. 307 # 308 # In order to generate a consistent 32 bit has code across OS/platforms, the following methodology appear 309 # to work: 310 # 311 # o Use MaxHashCodeMask to retrieve appropriate bits after left shifting by bit operators and additions 312 # o Stay away from "use integer" to avoid signed integer arithmetic for bit operators 313 # 314 # 315 # MaxHashCodeMask (2147483647) corresponds to the maximum value which can be stored in 31 bits 316 # 317 my($MaxHashCodeMask); 318 $MaxHashCodeMask = 2**31 - 1; 319 320 sub HashCode { 321 my($String) = @_; 322 my($HashCode, $Value, $ShiftedHashCode); 323 324 $HashCode = 0; 325 for $Value (unpack('C*', $String)) { 326 $HashCode += $Value; 327 328 $ShiftedHashCode = $HashCode << 10; 329 if ($ShiftedHashCode > $MaxHashCodeMask) { 330 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask; 331 } 332 333 $HashCode += $ShiftedHashCode; 334 if ($HashCode > $MaxHashCodeMask) { 335 $HashCode = $HashCode & $MaxHashCodeMask; 336 } 337 338 $HashCode ^= ($HashCode >> 6); 339 } 340 341 $ShiftedHashCode = $HashCode << 3; 342 if ($ShiftedHashCode > $MaxHashCodeMask) { 343 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask; 344 } 345 346 $HashCode += $ShiftedHashCode; 347 if ($HashCode > $MaxHashCodeMask) { 348 $HashCode = $HashCode & $MaxHashCodeMask; 349 } 350 $HashCode ^= ($HashCode >> 11); 351 352 $ShiftedHashCode = $HashCode << 15; 353 if ($ShiftedHashCode > $MaxHashCodeMask) { 354 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask; 355 } 356 357 $HashCode += $ShiftedHashCode; 358 if ($HashCode > $MaxHashCodeMask) { 359 $HashCode = $HashCode & $MaxHashCodeMask; 360 } 361 return $HashCode; 362 } 363 364 # Check out the string: Is it defined and has a non zero length? 365 sub IsEmpty { 366 my($TheString) = @_; 367 my($Status) = 1; 368 369 $Status = (defined($TheString) && length($TheString)) ? 0 : 1; 370 371 return $Status; 372 } 373 374 # Is first specified number power of second specified number... 375 sub IsNumberPowerOfNumber { 376 my($FirstNum, $SecondNum) = @_; 377 my($PowerValue); 378 379 $PowerValue = log($FirstNum)/log($SecondNum); 380 381 return IsInteger($PowerValue) ? 1 : 0; 382 } 383 384 # Check out the string: Is it an integer? 385 sub IsInteger { 386 my($TheString) = @_; 387 my($Status) = 0; 388 389 if (defined($TheString) && length($TheString)) { 390 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString); 391 $TheString =~ s/^[+-]//; 392 $Status = ($TheString =~ /[^0-9]/) ? 0 : 1; 393 } 394 return $Status; 395 } 396 397 # Check out the string: Is it an integer with value > 0? 398 sub IsPositiveInteger { 399 my($TheString) = @_; 400 my($Status) = 0; 401 402 $Status = IsInteger($TheString) ? ($TheString > 0 ? 1 : 0) : 0; 403 404 return $Status; 405 } 406 407 408 # Check out the string: Is it a float? 409 sub IsFloat { 410 my($TheString) = @_; 411 my($Status) = 0; 412 413 if (defined($TheString) && length($TheString)) { 414 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString); 415 $TheString =~ s/^[+-]//; 416 $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1); 417 } 418 return $Status; 419 } 420 421 # Check out the string: Is it defined and has a non zero length? 422 sub IsNotEmpty { 423 my($TheString) = @_; 424 my($Status); 425 426 $Status = IsEmpty($TheString) ? 0 : 1; 427 428 return $Status; 429 } 430 431 # Check out the string: Does it only contain numerical data? 432 sub IsNumerical { 433 my($TheString) = @_; 434 my($Status) = 0; 435 436 if (defined($TheString) && length($TheString)) { 437 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString); 438 $TheString =~ s/^[+-]//; 439 $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1); 440 } 441 return $Status; 442 } 443 444 # Join different words using delimiter and quote parameters. And return as 445 # a string value. 446 sub JoinWords { 447 my($Words, $Delim, $Quote) = @_; 448 449 if (!@$Words) { 450 return ""; 451 } 452 453 $Quote = $Quote ? "\"" : ""; 454 my(@NewWords) = map { (defined($_) && length($_)) ? "${Quote}$_${Quote}" : "${Quote}${Quote}" } @$Words; 455 456 return join $Delim, @NewWords; 457 } 458 459 # Split string value containing quoted or unquoted words in to an array containing 460 # unquoted words. 461 # 462 # This function is used to split strings generated by JoinWords. 463 # 464 sub SplitWords { 465 my($Line, $Delim) = @_; 466 467 if (!$Line) { 468 return (); 469 } 470 471 # Is it a quoted string? 472 if ($Line =~ /^\"/) { 473 # Take out first and last quote... 474 $Line =~ s/^\"//; $Line =~ s/\"$//; 475 476 $Delim = "\"$Delim\""; 477 } 478 return split /$Delim/, $Line; 479 } 480 481 # Based on quote parameter, figure out what to do 482 sub QuoteAWord { 483 my($Word, $Quote) = @_; 484 my($QuotedWord); 485 486 $QuotedWord = ""; 487 if ($Word) { 488 $QuotedWord = $Word; 489 } 490 if ($Quote) { 491 $QuotedWord = "\"$QuotedWord\""; 492 } 493 return ($QuotedWord); 494 } 495 496 # Remove leading white space characters from the string... 497 sub RemoveLeadingWhiteSpaces { 498 my($InString) = @_; 499 my($OutString, $TrailingString, $LeadingWhiteSpace); 500 501 $OutString = $InString; 502 if (length($InString) && ContainsWhiteSpaces($InString)) { 503 $OutString =~ s/^([ \t\r\n\f]*)(.*?)$/$2/; 504 } 505 return $OutString; 506 } 507 508 # Remove Trailing white space characters from the string... 509 sub RemoveTrailingWhiteSpaces { 510 my($InString) = @_; 511 my($OutString, $LeadingString, $TrailingWhiteSpace); 512 513 $OutString = $InString; 514 if (length($InString) && ContainsWhiteSpaces($InString)) { 515 $OutString =~ s/^(.*?)([ \t\r\n\f]*)$/$1/; 516 } 517 return $OutString; 518 } 519 520 # Remove both leading and trailing white space characters from the string... 521 sub RemoveLeadingAndTrailingWhiteSpaces { 522 my($InString) = @_; 523 my($OutString); 524 525 $OutString = $InString; 526 if (length($InString) && ContainsWhiteSpaces($InString)) { 527 $OutString =~ s/^([ \t\r\n\f]*)(.*?)([ \t\r\n\f]*)$/$2/; 528 } 529 return $OutString; 530 } 531 532 # Wrap text string... 533 sub WrapText { 534 my($InString, $WrapLength, $WrapDelimiter); 535 my($OutString); 536 537 $WrapLength = 40; 538 $WrapDelimiter = "\n"; 539 if (@_ == 3) { 540 ($InString, $WrapLength, $WrapDelimiter) = @_; 541 } 542 elsif (@_ == 2) { 543 ($InString, $WrapLength) = @_; 544 } 545 else { 546 ($InString, $WrapLength) = @_; 547 } 548 $OutString = $InString; 549 if ($InString && (length($InString) > $WrapLength)) { 550 $OutString = ""; 551 my($Index, $Length, $FirstPiece, $StringPiece); 552 $Index = 0; $Length = length($InString); 553 $FirstPiece = 1; 554 for ($Index = 0; $Index < $Length; $Index += $WrapLength) { 555 if (($Index + $WrapLength) < $Length) { 556 $StringPiece = substr($InString, $Index, $WrapLength); 557 } 558 else { 559 # Last piece of the string... 560 $StringPiece = substr($InString, $Index, $WrapLength); 561 } 562 if ($FirstPiece) { 563 $FirstPiece = 0; 564 $OutString = $StringPiece; 565 } 566 else { 567 $OutString .= "${WrapDelimiter}${StringPiece}"; 568 } 569 } 570 } 571 return $OutString; 572 } 573