MayaChemTools

   1 package TextUtil;
   2 #
   3 # $RCSfile: TextUtil.pm,v $
   4 # $Date: 2015/03/22 20:08:26 $
   5 # $Revision: 1.45 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use Exporter;
  31 
  32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  33 
  34 @ISA = qw(Exporter);
  35 @EXPORT = qw(AddNumberSuffix ContainsWhiteSpaces GetTextLine GetTextFileDataByUniqueKey GetTextFileDataByNonUniqueKey HashCode IsEmpty IsNumberPowerOfNumber IsInteger IsPositiveInteger IsFloat IsNotEmpty IsNumerical JoinWords SplitWords  QuoteAWord RemoveLeadingWhiteSpaces RemoveTrailingWhiteSpaces RemoveLeadingAndTrailingWhiteSpaces WrapText);
  36 @EXPORT_OK = qw();
  37 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  38 
  39 # Add number suffix...
  40 sub AddNumberSuffix {
  41   my($Value) = @_;
  42   my($ValueWithSuffix, $Suffix);
  43 
  44   $ValueWithSuffix = $Value;
  45   if (!IsPositiveInteger($Value)) {
  46     return $ValueWithSuffix;
  47   }
  48   $Suffix = "th";
  49   if ($Value < 10 || $Value > 20) {
  50     my $Remainder = $Value % 10;
  51     $Suffix = ($Remainder == 1) ? "st" : (($Remainder == 2) ? "nd" : (($Remainder == 3) ? "rd" : "th"));
  52   }
  53   $ValueWithSuffix = "${ValueWithSuffix}${Suffix}";
  54   return $ValueWithSuffix;
  55 }
  56 
  57 # Check out the string: Doen it contain any white space characters?
  58 sub ContainsWhiteSpaces {
  59   my($TheString) = @_;
  60   my($Status) = 0;
  61 
  62   if (defined($TheString) && length($TheString)) {
  63     $Status = ($TheString =~ /[ \t\r\n\f]/ ) ? 1 : 0;
  64   }
  65   return $Status;
  66 }
  67 
  68 # Read the line, change to UNIX new line char, and chop off new line char as well...
  69 sub GetTextLine {
  70   my($TextFileRef) = @_;
  71   my($Line) = '';
  72 
  73   # Get the next non empty line...
  74   LINE: while (defined($_ = <$TextFileRef>)) {
  75     # Change Windows and Mac new line char to UNIX...
  76     s/(\r\n)|(\r)/\n/g;
  77 
  78     # Take out any new line char at the end by explicitly removing it instead of using
  79     # chomp, which might not always work correctly on files generated on a system
  80     # with a value of input line separator different from the current system...
  81     s/\n$//g;
  82 
  83     # Doesn't hurt to chomp...
  84     chomp;
  85 
  86     $Line = $_;
  87     if (length $Line) {
  88       last LINE;
  89     }
  90   }
  91   return $Line;
  92 }
  93 
  94 # Load data from a CSV file into the specified hash reference using a specific
  95 # column for unique data key values.
  96 #
  97 # The lines starting with # are treated as comments and ignored. First line
  98 # not starting with # must contain column labels and the number of columns in
  99 # all other data rows must match the number of column labels.
 100 #
 101 # The first column is assumed to contain data key value by default; all other columns
 102 # contain data as indicated in their column labels.
 103 #
 104 # In order to avoid dependence of data access on the specified column labels, the
 105 # column data is loaded into hash with Column<Num> hash keys, where column number
 106 # start from 1. The data key column is not available as Colnum<Num> hash key;
 107 #
 108 # The format of the data structure loaded into a specified hash reference is:
 109 #
 110 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys
 111 # @{$TextDataMapRef->{ColLabels}} - Array of column labels
 112 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs
 113 # $TextDataMapRef->{NumOfCols} - Number of columns
 114 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey>
 115 # %{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair: <DataCol<Num>, DataKey>
 116 #
 117 # Caveats:
 118 #   . The column number start from 1.
 119 #   . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs.
 120 #
 121 sub GetTextFileDataByUniqueKey {
 122   my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_;
 123 
 124   return _GetTextFileData("UniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim);
 125 }
 126 
 127 # Load data from a CSV file into the specified hash reference using a specific
 128 # column for non-unique data key values.
 129 #
 130 # The lines starting with # are treated as comments and ignored. First line
 131 # not starting with # must contain column labels and the number of columns in
 132 # all other data rows must match the number of column labels.
 133 #
 134 # The first column is assumed to contain data key value by default; all other columns
 135 # contain data as indicated in their column labels.
 136 #
 137 # In order to avoid dependence of data access on the specified column labels, the
 138 # column data is loaded into hash with Column<Num> hash keys, where column number
 139 # start from 1. The data key column is not available as Colnum<Num> hash key;
 140 #
 141 # The format of the data structure loaded into a specified hash reference is:
 142 #
 143 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys
 144 # @{$TextDataMapRef->{ColLabels}} - Array of column labels
 145 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs
 146 # $TextDataMapRef->{NumOfCols} - Number of columns
 147 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey>
 148 # @{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair with data as an array: <DataCol<Num>, DataKey>
 149 #
 150 # Caveats:
 151 #   . The column number start from 1.
 152 #   . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs.
 153 #
 154 sub GetTextFileDataByNonUniqueKey {
 155   my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_;
 156 
 157   return _GetTextFileData("NonUniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim);
 158 }
 159 
 160 # Loadtext file data using unique or non-uniqye data column key...
 161 #
 162 sub _GetTextFileData {
 163   my($DataKeyMode, $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_;
 164   my($DataKeyColIndex, $LineCount, $IgnoredLineCount, $UniqueDataKeyMode, $DataKey, $Line, $NumOfCols, $ColIndex, $ColNum, $ColID, $ColValue, @LineWords, @ColLabels, @DataColIDs, @DataColNums);
 165 
 166   print "\nProcessing text data file $TextDataFile...\n";
 167 
 168   $UniqueDataKeyMode = 0;
 169   if ($DataKeyMode =~ /^UniqueKey$/i) {
 170     $UniqueDataKeyMode = 1;
 171   }
 172 
 173   # Setup default values...
 174   $DataKeyColNum = defined  $DataKeyColNum ? $DataKeyColNum : 1;
 175 
 176   if ($TextDataFile =~ /^\.tsv$/i) {
 177     $InDelim = "\t";
 178   }
 179   else {
 180     $InDelim = "\,";
 181     if ($InDelim =~ /^semicolon$/i) {
 182       $InDelim = "\;";
 183     }
 184   }
 185 
 186   ($LineCount, $IgnoredLineCount) = (0) x 2;
 187 
 188   open TEXTDATAFILE, "$TextDataFile" or die "Couldn't open $TextDataFile: $! ...";
 189 
 190   # Skip lines up to column labels...
 191   LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) {
 192     $LineCount++;
 193     if ($Line =~ /^#/) {
 194       $IgnoredLineCount++;
 195     }
 196     else {
 197       last LINE;
 198     }
 199   }
 200 
 201   # Initialize data map...
 202   %{$TextDataMapRef} = ();
 203   @{$TextDataMapRef->{DataKeys}} = ();
 204   @{$TextDataMapRef->{ColLabels}} = ();
 205   @{$TextDataMapRef->{DataColIDs}} = ();
 206   $TextDataMapRef->{NumOfCols} = undef;
 207 
 208   # Process column labels...
 209   @ColLabels= quotewords($InDelim, 0, $Line);
 210   $NumOfCols = @ColLabels;
 211 
 212   if ($DataKeyColNum < 1 || $DataKeyColNum > $NumOfCols) {
 213     warn "Warning: Ignoring text data file $TextDataFile: Invalid data key column number, $DataKeyColNum, specified. It must be > 0 or <= $NumOfCols, number of columns in the text file ...";
 214     return;
 215   }
 216   $DataKeyColIndex = $DataKeyColNum - 1;
 217 
 218   $TextDataMapRef->{NumOfCols} = $NumOfCols;
 219   push @{$TextDataMapRef->{ColLabels}}, @ColLabels;
 220 
 221   # Set up column data IDs for tracking the data...
 222   @DataColNums = ();
 223   @DataColIDs = ();
 224   COLNUM: for $ColNum (1 .. $NumOfCols) {
 225     if ($ColNum == $DataKeyColNum) {
 226       next COLNUM;
 227     }
 228     push @DataColNums, $ColNum;
 229     $ColID = "DataCol${ColNum}";
 230     push @DataColIDs, $ColID;
 231   }
 232   push @{$TextDataMapRef->{DataColIDs}}, @DataColIDs;
 233 
 234   # Initialize column data hash...
 235   %{$TextDataMapRef->{DataKey}} = ();
 236   for $ColIndex (0 .. $#DataColNums) {
 237     $ColNum = $DataColNums[$ColIndex];
 238     $ColID = $DataColIDs[$ColIndex];
 239     %{$TextDataMapRef->{$ColID}} = ();
 240   }
 241 
 242   LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) {
 243     $LineCount++;
 244     if ($Line =~ /^#/) {
 245       $IgnoredLineCount++;
 246       next LINE;
 247     }
 248 
 249     @LineWords = quotewords($InDelim, 0, $Line);
 250     if (@LineWords != $NumOfCols) {
 251       $IgnoredLineCount++;
 252       warn "Warning: The number of data fields, @LineWords, in $TextDataFile must be $NumOfCols.\nIgnoring line number $LineCount: $Line...\n";
 253       next LINE;
 254     }
 255     $DataKey = $LineWords[$DataKeyColIndex];
 256 
 257     if ($UniqueDataKeyMode) {
 258       if (exists $TextDataMapRef->{DataKey}{$DataKey}) {
 259         $IgnoredLineCount++;
 260         warn "Warning: The data key, $DataKey, in data column key number, $DataKeyColNum, is already present.\nIgnoring line number $LineCount: $Line...\n";
 261         next LINE;
 262       }
 263       push @{$TextDataMapRef->{DataKeys}}, $DataKey;
 264       $TextDataMapRef->{DataKey}{$DataKey} = $DataKey;
 265     }
 266     else {
 267       if (!exists $TextDataMapRef->{DataKey}{$DataKey}) {
 268         push @{$TextDataMapRef->{DataKeys}}, $DataKey;
 269         $TextDataMapRef->{DataKey}{$DataKey} = $DataKey;
 270 
 271         for $ColIndex (0 .. $#DataColNums) {
 272           $ColNum = $DataColNums[$ColIndex];
 273           $ColID = $DataColIDs[$ColIndex];
 274           @{$TextDataMapRef->{$ColID}{$DataKey}} = ();
 275         }
 276       }
 277     }
 278 
 279     # Track column data values...
 280     for $ColIndex (0 .. $#DataColNums) {
 281       $ColID = $DataColIDs[$ColIndex];
 282 
 283       $ColNum = $DataColNums[$ColIndex];
 284       $ColValue = $LineWords[$ColNum - 1];
 285 
 286       if ($UniqueDataKeyMode) {
 287         $TextDataMapRef->{$ColID}{$DataKey} = $ColValue;
 288       }
 289       else {
 290         push @{$TextDataMapRef->{$ColID}{$DataKey}}, $ColValue;
 291       }
 292     }
 293 
 294   }
 295 
 296   print "\nTotal number of lines in file $TextDataFile: $LineCount\n";
 297   print "Total number of lines ignored: $IgnoredLineCount\n";
 298 
 299   close TEXTDATAFILE;
 300 }
 301 
 302 # Returns a 32 bit integer hash code using One-at-a-time algorithm By Bob Jenkins [Ref 38]. It's also implemented in
 303 # Perl for internal hash keys in hv.h include file.
 304 #
 305 # It's not clear how to force Perl perform unsigned integer arithmetic irrespective of the OS/Platform and
 306 # the value of use64bitint flag used during its compilation.
 307 #
 308 # In order to generate a consistent 32 bit has code across OS/platforms, the following methodology appear
 309 # to work:
 310 #
 311 #    o Use MaxHashCodeMask to retrieve appropriate bits after left shifting by bit operators and additions
 312 #    o Stay away from "use integer" to avoid signed integer arithmetic for bit operators
 313 #
 314 #
 315 #   MaxHashCodeMask (2147483647) corresponds to the maximum value which can be stored in 31 bits
 316 #
 317 my($MaxHashCodeMask);
 318 $MaxHashCodeMask = 2**31 - 1;
 319 
 320 sub HashCode {
 321   my($String) = @_;
 322   my($HashCode, $Value, $ShiftedHashCode);
 323 
 324   $HashCode = 0;
 325   for $Value (unpack('C*', $String)) {
 326     $HashCode += $Value;
 327 
 328     $ShiftedHashCode = $HashCode << 10;
 329     if ($ShiftedHashCode > $MaxHashCodeMask) {
 330       $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask;
 331     }
 332 
 333     $HashCode += $ShiftedHashCode;
 334     if ($HashCode > $MaxHashCodeMask) {
 335       $HashCode = $HashCode & $MaxHashCodeMask;
 336     }
 337 
 338     $HashCode ^= ($HashCode >> 6);
 339   }
 340 
 341   $ShiftedHashCode = $HashCode << 3;
 342   if ($ShiftedHashCode > $MaxHashCodeMask) {
 343     $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask;
 344   }
 345 
 346   $HashCode += $ShiftedHashCode;
 347   if ($HashCode > $MaxHashCodeMask) {
 348     $HashCode = $HashCode & $MaxHashCodeMask;
 349   }
 350   $HashCode ^= ($HashCode >> 11);
 351 
 352   $ShiftedHashCode = $HashCode << 15;
 353   if ($ShiftedHashCode > $MaxHashCodeMask) {
 354     $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask;
 355   }
 356 
 357   $HashCode += $ShiftedHashCode;
 358   if ($HashCode > $MaxHashCodeMask) {
 359     $HashCode = $HashCode & $MaxHashCodeMask;
 360   }
 361   return $HashCode;
 362 }
 363 
 364 # Check out the string: Is it defined and has a non zero length?
 365 sub IsEmpty {
 366   my($TheString) = @_;
 367   my($Status) = 1;
 368 
 369   $Status = (defined($TheString) && length($TheString)) ? 0 : 1;
 370 
 371   return $Status;
 372 }
 373 
 374 # Is first specified number power of second specified number...
 375 sub IsNumberPowerOfNumber {
 376   my($FirstNum, $SecondNum) = @_;
 377   my($PowerValue);
 378 
 379   $PowerValue = log($FirstNum)/log($SecondNum);
 380 
 381   return IsInteger($PowerValue) ? 1 : 0;
 382 }
 383 
 384 # Check out the string: Is it an integer?
 385 sub IsInteger {
 386   my($TheString) = @_;
 387   my($Status) = 0;
 388 
 389   if (defined($TheString) && length($TheString)) {
 390     $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString);
 391     $TheString =~ s/^[+-]//;
 392     $Status = ($TheString =~ /[^0-9]/) ? 0 : 1;
 393   }
 394   return $Status;
 395 }
 396 
 397 # Check out the string: Is it an integer with value > 0?
 398 sub IsPositiveInteger {
 399   my($TheString) = @_;
 400   my($Status) = 0;
 401 
 402   $Status = IsInteger($TheString) ? ($TheString > 0 ? 1 : 0) : 0;
 403 
 404   return $Status;
 405 }
 406 
 407 
 408 # Check out the string: Is it a float?
 409 sub IsFloat {
 410   my($TheString) = @_;
 411   my($Status) = 0;
 412 
 413   if (defined($TheString) && length($TheString)) {
 414     $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString);
 415     $TheString =~ s/^[+-]//;
 416     $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1);
 417   }
 418   return $Status;
 419 }
 420 
 421 # Check out the string: Is it defined and has a non zero length?
 422 sub IsNotEmpty {
 423   my($TheString) = @_;
 424   my($Status);
 425 
 426   $Status = IsEmpty($TheString) ? 0 : 1;
 427 
 428   return $Status;
 429 }
 430 
 431 # Check out the string: Does it only contain numerical data?
 432 sub IsNumerical {
 433   my($TheString) = @_;
 434   my($Status) = 0;
 435 
 436   if (defined($TheString) && length($TheString)) {
 437     $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString);
 438     $TheString =~ s/^[+-]//;
 439     $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1);
 440   }
 441   return $Status;
 442 }
 443 
 444 # Join different words using delimiter and quote parameters. And return as
 445 # a string value.
 446 sub JoinWords {
 447   my($Words, $Delim, $Quote) = @_;
 448 
 449   if (!@$Words) {
 450     return "";
 451   }
 452 
 453   $Quote = $Quote ? "\"" : "";
 454   my(@NewWords) = map { (defined($_) && length($_)) ? "${Quote}$_${Quote}" : "${Quote}${Quote}" } @$Words;
 455 
 456   return join $Delim, @NewWords;
 457 }
 458 
 459 # Split string value containing quoted or unquoted words in to an array containing
 460 # unquoted words.
 461 #
 462 # This function is used to split strings generated by JoinWords.
 463 #
 464 sub SplitWords {
 465   my($Line, $Delim) = @_;
 466 
 467   if (!$Line) {
 468     return ();
 469   }
 470 
 471   # Is it a quoted string?
 472   if ($Line =~ /^\"/) {
 473     # Take out first and last quote...
 474     $Line =~ s/^\"//; $Line =~ s/\"$//;
 475 
 476     $Delim = "\"$Delim\"";
 477   }
 478   return split /$Delim/, $Line;
 479 }
 480 
 481 # Based on quote parameter, figure out what to do
 482 sub QuoteAWord {
 483   my($Word, $Quote) = @_;
 484   my($QuotedWord);
 485 
 486   $QuotedWord = "";
 487   if ($Word) {
 488     $QuotedWord = $Word;
 489   }
 490   if ($Quote) {
 491     $QuotedWord = "\"$QuotedWord\"";
 492   }
 493   return ($QuotedWord);
 494 }
 495 
 496 # Remove leading white space characters from the string...
 497 sub RemoveLeadingWhiteSpaces {
 498   my($InString) = @_;
 499   my($OutString, $TrailingString, $LeadingWhiteSpace);
 500 
 501   $OutString = $InString;
 502   if (length($InString) && ContainsWhiteSpaces($InString)) {
 503     $OutString =~ s/^([ \t\r\n\f]*)(.*?)$/$2/;
 504   }
 505   return $OutString;
 506 }
 507 
 508 # Remove Trailing white space characters from the string...
 509 sub RemoveTrailingWhiteSpaces {
 510   my($InString) = @_;
 511   my($OutString, $LeadingString, $TrailingWhiteSpace);
 512 
 513   $OutString = $InString;
 514   if (length($InString) && ContainsWhiteSpaces($InString)) {
 515     $OutString =~ s/^(.*?)([ \t\r\n\f]*)$/$1/;
 516   }
 517   return $OutString;
 518 }
 519 
 520 # Remove both leading and trailing white space characters from the string...
 521 sub RemoveLeadingAndTrailingWhiteSpaces {
 522   my($InString) = @_;
 523   my($OutString);
 524 
 525   $OutString = $InString;
 526   if (length($InString) && ContainsWhiteSpaces($InString)) {
 527     $OutString =~ s/^([ \t\r\n\f]*)(.*?)([ \t\r\n\f]*)$/$2/;
 528   }
 529   return $OutString;
 530 }
 531 
 532 # Wrap text string...
 533 sub WrapText {
 534   my($InString, $WrapLength, $WrapDelimiter);
 535   my($OutString);
 536 
 537   $WrapLength = 40;
 538   $WrapDelimiter = "\n";
 539   if (@_ == 3) {
 540     ($InString, $WrapLength, $WrapDelimiter) = @_;
 541   }
 542   elsif (@_ == 2) {
 543     ($InString, $WrapLength) = @_;
 544   }
 545   else {
 546     ($InString, $WrapLength) = @_;
 547   }
 548   $OutString = $InString;
 549   if ($InString && (length($InString) > $WrapLength)) {
 550     $OutString = "";
 551     my($Index, $Length, $FirstPiece, $StringPiece);
 552     $Index = 0; $Length = length($InString);
 553     $FirstPiece = 1;
 554     for ($Index = 0; $Index < $Length; $Index += $WrapLength) {
 555       if (($Index + $WrapLength) < $Length) {
 556         $StringPiece = substr($InString, $Index, $WrapLength);
 557       }
 558       else {
 559         # Last piece of the string...
 560         $StringPiece = substr($InString, $Index, $WrapLength);
 561       }
 562       if ($FirstPiece) {
 563         $FirstPiece = 0;
 564         $OutString = $StringPiece;
 565       }
 566       else {
 567         $OutString .= "${WrapDelimiter}${StringPiece}";
 568       }
 569     }
 570   }
 571   return $OutString;
 572 }
 573