0
|
1 package TextUtil;
|
|
2 #
|
|
3 # $RCSfile: TextUtil.pm,v $
|
|
4 # $Date: 2015/03/22 20:08:26 $
|
|
5 # $Revision: 1.45 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability of fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use Exporter;
|
|
31
|
|
32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
|
33
|
|
34 @ISA = qw(Exporter);
|
|
35 @EXPORT = qw(AddNumberSuffix ContainsWhiteSpaces GetTextLine GetTextFileDataByUniqueKey GetTextFileDataByNonUniqueKey HashCode IsEmpty IsNumberPowerOfNumber IsInteger IsPositiveInteger IsFloat IsNotEmpty IsNumerical JoinWords SplitWords QuoteAWord RemoveLeadingWhiteSpaces RemoveTrailingWhiteSpaces RemoveLeadingAndTrailingWhiteSpaces WrapText);
|
|
36 @EXPORT_OK = qw();
|
|
37 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
|
|
38
|
|
39 # Add number suffix...
|
|
40 sub AddNumberSuffix {
|
|
41 my($Value) = @_;
|
|
42 my($ValueWithSuffix, $Suffix);
|
|
43
|
|
44 $ValueWithSuffix = $Value;
|
|
45 if (!IsPositiveInteger($Value)) {
|
|
46 return $ValueWithSuffix;
|
|
47 }
|
|
48 $Suffix = "th";
|
|
49 if ($Value < 10 || $Value > 20) {
|
|
50 my $Remainder = $Value % 10;
|
|
51 $Suffix = ($Remainder == 1) ? "st" : (($Remainder == 2) ? "nd" : (($Remainder == 3) ? "rd" : "th"));
|
|
52 }
|
|
53 $ValueWithSuffix = "${ValueWithSuffix}${Suffix}";
|
|
54 return $ValueWithSuffix;
|
|
55 }
|
|
56
|
|
57 # Check out the string: Doen it contain any white space characters?
|
|
58 sub ContainsWhiteSpaces {
|
|
59 my($TheString) = @_;
|
|
60 my($Status) = 0;
|
|
61
|
|
62 if (defined($TheString) && length($TheString)) {
|
|
63 $Status = ($TheString =~ /[ \t\r\n\f]/ ) ? 1 : 0;
|
|
64 }
|
|
65 return $Status;
|
|
66 }
|
|
67
|
|
68 # Read the line, change to UNIX new line char, and chop off new line char as well...
|
|
69 sub GetTextLine {
|
|
70 my($TextFileRef) = @_;
|
|
71 my($Line) = '';
|
|
72
|
|
73 # Get the next non empty line...
|
|
74 LINE: while (defined($_ = <$TextFileRef>)) {
|
|
75 # Change Windows and Mac new line char to UNIX...
|
|
76 s/(\r\n)|(\r)/\n/g;
|
|
77
|
|
78 # Take out any new line char at the end by explicitly removing it instead of using
|
|
79 # chomp, which might not always work correctly on files generated on a system
|
|
80 # with a value of input line separator different from the current system...
|
|
81 s/\n$//g;
|
|
82
|
|
83 # Doesn't hurt to chomp...
|
|
84 chomp;
|
|
85
|
|
86 $Line = $_;
|
|
87 if (length $Line) {
|
|
88 last LINE;
|
|
89 }
|
|
90 }
|
|
91 return $Line;
|
|
92 }
|
|
93
|
|
94 # Load data from a CSV file into the specified hash reference using a specific
|
|
95 # column for unique data key values.
|
|
96 #
|
|
97 # The lines starting with # are treated as comments and ignored. First line
|
|
98 # not starting with # must contain column labels and the number of columns in
|
|
99 # all other data rows must match the number of column labels.
|
|
100 #
|
|
101 # The first column is assumed to contain data key value by default; all other columns
|
|
102 # contain data as indicated in their column labels.
|
|
103 #
|
|
104 # In order to avoid dependence of data access on the specified column labels, the
|
|
105 # column data is loaded into hash with Column<Num> hash keys, where column number
|
|
106 # start from 1. The data key column is not available as Colnum<Num> hash key;
|
|
107 #
|
|
108 # The format of the data structure loaded into a specified hash reference is:
|
|
109 #
|
|
110 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys
|
|
111 # @{$TextDataMapRef->{ColLabels}} - Array of column labels
|
|
112 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs
|
|
113 # $TextDataMapRef->{NumOfCols} - Number of columns
|
|
114 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey>
|
|
115 # %{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair: <DataCol<Num>, DataKey>
|
|
116 #
|
|
117 # Caveats:
|
|
118 # . The column number start from 1.
|
|
119 # . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs.
|
|
120 #
|
|
121 sub GetTextFileDataByUniqueKey {
|
|
122 my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_;
|
|
123
|
|
124 return _GetTextFileData("UniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim);
|
|
125 }
|
|
126
|
|
127 # Load data from a CSV file into the specified hash reference using a specific
|
|
128 # column for non-unique data key values.
|
|
129 #
|
|
130 # The lines starting with # are treated as comments and ignored. First line
|
|
131 # not starting with # must contain column labels and the number of columns in
|
|
132 # all other data rows must match the number of column labels.
|
|
133 #
|
|
134 # The first column is assumed to contain data key value by default; all other columns
|
|
135 # contain data as indicated in their column labels.
|
|
136 #
|
|
137 # In order to avoid dependence of data access on the specified column labels, the
|
|
138 # column data is loaded into hash with Column<Num> hash keys, where column number
|
|
139 # start from 1. The data key column is not available as Colnum<Num> hash key;
|
|
140 #
|
|
141 # The format of the data structure loaded into a specified hash reference is:
|
|
142 #
|
|
143 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys
|
|
144 # @{$TextDataMapRef->{ColLabels}} - Array of column labels
|
|
145 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs
|
|
146 # $TextDataMapRef->{NumOfCols} - Number of columns
|
|
147 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey>
|
|
148 # @{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair with data as an array: <DataCol<Num>, DataKey>
|
|
149 #
|
|
150 # Caveats:
|
|
151 # . The column number start from 1.
|
|
152 # . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs.
|
|
153 #
|
|
154 sub GetTextFileDataByNonUniqueKey {
|
|
155 my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_;
|
|
156
|
|
157 return _GetTextFileData("NonUniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim);
|
|
158 }
|
|
159
|
|
160 # Loadtext file data using unique or non-uniqye data column key...
|
|
161 #
|
|
162 sub _GetTextFileData {
|
|
163 my($DataKeyMode, $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_;
|
|
164 my($DataKeyColIndex, $LineCount, $IgnoredLineCount, $UniqueDataKeyMode, $DataKey, $Line, $NumOfCols, $ColIndex, $ColNum, $ColID, $ColValue, @LineWords, @ColLabels, @DataColIDs, @DataColNums);
|
|
165
|
|
166 print "\nProcessing text data file $TextDataFile...\n";
|
|
167
|
|
168 $UniqueDataKeyMode = 0;
|
|
169 if ($DataKeyMode =~ /^UniqueKey$/i) {
|
|
170 $UniqueDataKeyMode = 1;
|
|
171 }
|
|
172
|
|
173 # Setup default values...
|
|
174 $DataKeyColNum = defined $DataKeyColNum ? $DataKeyColNum : 1;
|
|
175
|
|
176 if ($TextDataFile =~ /^\.tsv$/i) {
|
|
177 $InDelim = "\t";
|
|
178 }
|
|
179 else {
|
|
180 $InDelim = "\,";
|
|
181 if ($InDelim =~ /^semicolon$/i) {
|
|
182 $InDelim = "\;";
|
|
183 }
|
|
184 }
|
|
185
|
|
186 ($LineCount, $IgnoredLineCount) = (0) x 2;
|
|
187
|
|
188 open TEXTDATAFILE, "$TextDataFile" or die "Couldn't open $TextDataFile: $! ...";
|
|
189
|
|
190 # Skip lines up to column labels...
|
|
191 LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) {
|
|
192 $LineCount++;
|
|
193 if ($Line =~ /^#/) {
|
|
194 $IgnoredLineCount++;
|
|
195 }
|
|
196 else {
|
|
197 last LINE;
|
|
198 }
|
|
199 }
|
|
200
|
|
201 # Initialize data map...
|
|
202 %{$TextDataMapRef} = ();
|
|
203 @{$TextDataMapRef->{DataKeys}} = ();
|
|
204 @{$TextDataMapRef->{ColLabels}} = ();
|
|
205 @{$TextDataMapRef->{DataColIDs}} = ();
|
|
206 $TextDataMapRef->{NumOfCols} = undef;
|
|
207
|
|
208 # Process column labels...
|
|
209 @ColLabels= quotewords($InDelim, 0, $Line);
|
|
210 $NumOfCols = @ColLabels;
|
|
211
|
|
212 if ($DataKeyColNum < 1 || $DataKeyColNum > $NumOfCols) {
|
|
213 warn "Warning: Ignoring text data file $TextDataFile: Invalid data key column number, $DataKeyColNum, specified. It must be > 0 or <= $NumOfCols, number of columns in the text file ...";
|
|
214 return;
|
|
215 }
|
|
216 $DataKeyColIndex = $DataKeyColNum - 1;
|
|
217
|
|
218 $TextDataMapRef->{NumOfCols} = $NumOfCols;
|
|
219 push @{$TextDataMapRef->{ColLabels}}, @ColLabels;
|
|
220
|
|
221 # Set up column data IDs for tracking the data...
|
|
222 @DataColNums = ();
|
|
223 @DataColIDs = ();
|
|
224 COLNUM: for $ColNum (1 .. $NumOfCols) {
|
|
225 if ($ColNum == $DataKeyColNum) {
|
|
226 next COLNUM;
|
|
227 }
|
|
228 push @DataColNums, $ColNum;
|
|
229 $ColID = "DataCol${ColNum}";
|
|
230 push @DataColIDs, $ColID;
|
|
231 }
|
|
232 push @{$TextDataMapRef->{DataColIDs}}, @DataColIDs;
|
|
233
|
|
234 # Initialize column data hash...
|
|
235 %{$TextDataMapRef->{DataKey}} = ();
|
|
236 for $ColIndex (0 .. $#DataColNums) {
|
|
237 $ColNum = $DataColNums[$ColIndex];
|
|
238 $ColID = $DataColIDs[$ColIndex];
|
|
239 %{$TextDataMapRef->{$ColID}} = ();
|
|
240 }
|
|
241
|
|
242 LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) {
|
|
243 $LineCount++;
|
|
244 if ($Line =~ /^#/) {
|
|
245 $IgnoredLineCount++;
|
|
246 next LINE;
|
|
247 }
|
|
248
|
|
249 @LineWords = quotewords($InDelim, 0, $Line);
|
|
250 if (@LineWords != $NumOfCols) {
|
|
251 $IgnoredLineCount++;
|
|
252 warn "Warning: The number of data fields, @LineWords, in $TextDataFile must be $NumOfCols.\nIgnoring line number $LineCount: $Line...\n";
|
|
253 next LINE;
|
|
254 }
|
|
255 $DataKey = $LineWords[$DataKeyColIndex];
|
|
256
|
|
257 if ($UniqueDataKeyMode) {
|
|
258 if (exists $TextDataMapRef->{DataKey}{$DataKey}) {
|
|
259 $IgnoredLineCount++;
|
|
260 warn "Warning: The data key, $DataKey, in data column key number, $DataKeyColNum, is already present.\nIgnoring line number $LineCount: $Line...\n";
|
|
261 next LINE;
|
|
262 }
|
|
263 push @{$TextDataMapRef->{DataKeys}}, $DataKey;
|
|
264 $TextDataMapRef->{DataKey}{$DataKey} = $DataKey;
|
|
265 }
|
|
266 else {
|
|
267 if (!exists $TextDataMapRef->{DataKey}{$DataKey}) {
|
|
268 push @{$TextDataMapRef->{DataKeys}}, $DataKey;
|
|
269 $TextDataMapRef->{DataKey}{$DataKey} = $DataKey;
|
|
270
|
|
271 for $ColIndex (0 .. $#DataColNums) {
|
|
272 $ColNum = $DataColNums[$ColIndex];
|
|
273 $ColID = $DataColIDs[$ColIndex];
|
|
274 @{$TextDataMapRef->{$ColID}{$DataKey}} = ();
|
|
275 }
|
|
276 }
|
|
277 }
|
|
278
|
|
279 # Track column data values...
|
|
280 for $ColIndex (0 .. $#DataColNums) {
|
|
281 $ColID = $DataColIDs[$ColIndex];
|
|
282
|
|
283 $ColNum = $DataColNums[$ColIndex];
|
|
284 $ColValue = $LineWords[$ColNum - 1];
|
|
285
|
|
286 if ($UniqueDataKeyMode) {
|
|
287 $TextDataMapRef->{$ColID}{$DataKey} = $ColValue;
|
|
288 }
|
|
289 else {
|
|
290 push @{$TextDataMapRef->{$ColID}{$DataKey}}, $ColValue;
|
|
291 }
|
|
292 }
|
|
293
|
|
294 }
|
|
295
|
|
296 print "\nTotal number of lines in file $TextDataFile: $LineCount\n";
|
|
297 print "Total number of lines ignored: $IgnoredLineCount\n";
|
|
298
|
|
299 close TEXTDATAFILE;
|
|
300 }
|
|
301
|
|
302 # Returns a 32 bit integer hash code using One-at-a-time algorithm By Bob Jenkins [Ref 38]. It's also implemented in
|
|
303 # Perl for internal hash keys in hv.h include file.
|
|
304 #
|
|
305 # It's not clear how to force Perl perform unsigned integer arithmetic irrespective of the OS/Platform and
|
|
306 # the value of use64bitint flag used during its compilation.
|
|
307 #
|
|
308 # In order to generate a consistent 32 bit has code across OS/platforms, the following methodology appear
|
|
309 # to work:
|
|
310 #
|
|
311 # o Use MaxHashCodeMask to retrieve appropriate bits after left shifting by bit operators and additions
|
|
312 # o Stay away from "use integer" to avoid signed integer arithmetic for bit operators
|
|
313 #
|
|
314 #
|
|
315 # MaxHashCodeMask (2147483647) corresponds to the maximum value which can be stored in 31 bits
|
|
316 #
|
|
317 my($MaxHashCodeMask);
|
|
318 $MaxHashCodeMask = 2**31 - 1;
|
|
319
|
|
320 sub HashCode {
|
|
321 my($String) = @_;
|
|
322 my($HashCode, $Value, $ShiftedHashCode);
|
|
323
|
|
324 $HashCode = 0;
|
|
325 for $Value (unpack('C*', $String)) {
|
|
326 $HashCode += $Value;
|
|
327
|
|
328 $ShiftedHashCode = $HashCode << 10;
|
|
329 if ($ShiftedHashCode > $MaxHashCodeMask) {
|
|
330 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask;
|
|
331 }
|
|
332
|
|
333 $HashCode += $ShiftedHashCode;
|
|
334 if ($HashCode > $MaxHashCodeMask) {
|
|
335 $HashCode = $HashCode & $MaxHashCodeMask;
|
|
336 }
|
|
337
|
|
338 $HashCode ^= ($HashCode >> 6);
|
|
339 }
|
|
340
|
|
341 $ShiftedHashCode = $HashCode << 3;
|
|
342 if ($ShiftedHashCode > $MaxHashCodeMask) {
|
|
343 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask;
|
|
344 }
|
|
345
|
|
346 $HashCode += $ShiftedHashCode;
|
|
347 if ($HashCode > $MaxHashCodeMask) {
|
|
348 $HashCode = $HashCode & $MaxHashCodeMask;
|
|
349 }
|
|
350 $HashCode ^= ($HashCode >> 11);
|
|
351
|
|
352 $ShiftedHashCode = $HashCode << 15;
|
|
353 if ($ShiftedHashCode > $MaxHashCodeMask) {
|
|
354 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask;
|
|
355 }
|
|
356
|
|
357 $HashCode += $ShiftedHashCode;
|
|
358 if ($HashCode > $MaxHashCodeMask) {
|
|
359 $HashCode = $HashCode & $MaxHashCodeMask;
|
|
360 }
|
|
361 return $HashCode;
|
|
362 }
|
|
363
|
|
364 # Check out the string: Is it defined and has a non zero length?
|
|
365 sub IsEmpty {
|
|
366 my($TheString) = @_;
|
|
367 my($Status) = 1;
|
|
368
|
|
369 $Status = (defined($TheString) && length($TheString)) ? 0 : 1;
|
|
370
|
|
371 return $Status;
|
|
372 }
|
|
373
|
|
374 # Is first specified number power of second specified number...
|
|
375 sub IsNumberPowerOfNumber {
|
|
376 my($FirstNum, $SecondNum) = @_;
|
|
377 my($PowerValue);
|
|
378
|
|
379 $PowerValue = log($FirstNum)/log($SecondNum);
|
|
380
|
|
381 return IsInteger($PowerValue) ? 1 : 0;
|
|
382 }
|
|
383
|
|
384 # Check out the string: Is it an integer?
|
|
385 sub IsInteger {
|
|
386 my($TheString) = @_;
|
|
387 my($Status) = 0;
|
|
388
|
|
389 if (defined($TheString) && length($TheString)) {
|
|
390 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString);
|
|
391 $TheString =~ s/^[+-]//;
|
|
392 $Status = ($TheString =~ /[^0-9]/) ? 0 : 1;
|
|
393 }
|
|
394 return $Status;
|
|
395 }
|
|
396
|
|
397 # Check out the string: Is it an integer with value > 0?
|
|
398 sub IsPositiveInteger {
|
|
399 my($TheString) = @_;
|
|
400 my($Status) = 0;
|
|
401
|
|
402 $Status = IsInteger($TheString) ? ($TheString > 0 ? 1 : 0) : 0;
|
|
403
|
|
404 return $Status;
|
|
405 }
|
|
406
|
|
407
|
|
408 # Check out the string: Is it a float?
|
|
409 sub IsFloat {
|
|
410 my($TheString) = @_;
|
|
411 my($Status) = 0;
|
|
412
|
|
413 if (defined($TheString) && length($TheString)) {
|
|
414 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString);
|
|
415 $TheString =~ s/^[+-]//;
|
|
416 $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1);
|
|
417 }
|
|
418 return $Status;
|
|
419 }
|
|
420
|
|
421 # Check out the string: Is it defined and has a non zero length?
|
|
422 sub IsNotEmpty {
|
|
423 my($TheString) = @_;
|
|
424 my($Status);
|
|
425
|
|
426 $Status = IsEmpty($TheString) ? 0 : 1;
|
|
427
|
|
428 return $Status;
|
|
429 }
|
|
430
|
|
431 # Check out the string: Does it only contain numerical data?
|
|
432 sub IsNumerical {
|
|
433 my($TheString) = @_;
|
|
434 my($Status) = 0;
|
|
435
|
|
436 if (defined($TheString) && length($TheString)) {
|
|
437 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString);
|
|
438 $TheString =~ s/^[+-]//;
|
|
439 $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1);
|
|
440 }
|
|
441 return $Status;
|
|
442 }
|
|
443
|
|
444 # Join different words using delimiter and quote parameters. And return as
|
|
445 # a string value.
|
|
446 sub JoinWords {
|
|
447 my($Words, $Delim, $Quote) = @_;
|
|
448
|
|
449 if (!@$Words) {
|
|
450 return "";
|
|
451 }
|
|
452
|
|
453 $Quote = $Quote ? "\"" : "";
|
|
454 my(@NewWords) = map { (defined($_) && length($_)) ? "${Quote}$_${Quote}" : "${Quote}${Quote}" } @$Words;
|
|
455
|
|
456 return join $Delim, @NewWords;
|
|
457 }
|
|
458
|
|
459 # Split string value containing quoted or unquoted words in to an array containing
|
|
460 # unquoted words.
|
|
461 #
|
|
462 # This function is used to split strings generated by JoinWords.
|
|
463 #
|
|
464 sub SplitWords {
|
|
465 my($Line, $Delim) = @_;
|
|
466
|
|
467 if (!$Line) {
|
|
468 return ();
|
|
469 }
|
|
470
|
|
471 # Is it a quoted string?
|
|
472 if ($Line =~ /^\"/) {
|
|
473 # Take out first and last quote...
|
|
474 $Line =~ s/^\"//; $Line =~ s/\"$//;
|
|
475
|
|
476 $Delim = "\"$Delim\"";
|
|
477 }
|
|
478 return split /$Delim/, $Line;
|
|
479 }
|
|
480
|
|
481 # Based on quote parameter, figure out what to do
|
|
482 sub QuoteAWord {
|
|
483 my($Word, $Quote) = @_;
|
|
484 my($QuotedWord);
|
|
485
|
|
486 $QuotedWord = "";
|
|
487 if ($Word) {
|
|
488 $QuotedWord = $Word;
|
|
489 }
|
|
490 if ($Quote) {
|
|
491 $QuotedWord = "\"$QuotedWord\"";
|
|
492 }
|
|
493 return ($QuotedWord);
|
|
494 }
|
|
495
|
|
496 # Remove leading white space characters from the string...
|
|
497 sub RemoveLeadingWhiteSpaces {
|
|
498 my($InString) = @_;
|
|
499 my($OutString, $TrailingString, $LeadingWhiteSpace);
|
|
500
|
|
501 $OutString = $InString;
|
|
502 if (length($InString) && ContainsWhiteSpaces($InString)) {
|
|
503 $OutString =~ s/^([ \t\r\n\f]*)(.*?)$/$2/;
|
|
504 }
|
|
505 return $OutString;
|
|
506 }
|
|
507
|
|
508 # Remove Trailing white space characters from the string...
|
|
509 sub RemoveTrailingWhiteSpaces {
|
|
510 my($InString) = @_;
|
|
511 my($OutString, $LeadingString, $TrailingWhiteSpace);
|
|
512
|
|
513 $OutString = $InString;
|
|
514 if (length($InString) && ContainsWhiteSpaces($InString)) {
|
|
515 $OutString =~ s/^(.*?)([ \t\r\n\f]*)$/$1/;
|
|
516 }
|
|
517 return $OutString;
|
|
518 }
|
|
519
|
|
520 # Remove both leading and trailing white space characters from the string...
|
|
521 sub RemoveLeadingAndTrailingWhiteSpaces {
|
|
522 my($InString) = @_;
|
|
523 my($OutString);
|
|
524
|
|
525 $OutString = $InString;
|
|
526 if (length($InString) && ContainsWhiteSpaces($InString)) {
|
|
527 $OutString =~ s/^([ \t\r\n\f]*)(.*?)([ \t\r\n\f]*)$/$2/;
|
|
528 }
|
|
529 return $OutString;
|
|
530 }
|
|
531
|
|
532 # Wrap text string...
|
|
533 sub WrapText {
|
|
534 my($InString, $WrapLength, $WrapDelimiter);
|
|
535 my($OutString);
|
|
536
|
|
537 $WrapLength = 40;
|
|
538 $WrapDelimiter = "\n";
|
|
539 if (@_ == 3) {
|
|
540 ($InString, $WrapLength, $WrapDelimiter) = @_;
|
|
541 }
|
|
542 elsif (@_ == 2) {
|
|
543 ($InString, $WrapLength) = @_;
|
|
544 }
|
|
545 else {
|
|
546 ($InString, $WrapLength) = @_;
|
|
547 }
|
|
548 $OutString = $InString;
|
|
549 if ($InString && (length($InString) > $WrapLength)) {
|
|
550 $OutString = "";
|
|
551 my($Index, $Length, $FirstPiece, $StringPiece);
|
|
552 $Index = 0; $Length = length($InString);
|
|
553 $FirstPiece = 1;
|
|
554 for ($Index = 0; $Index < $Length; $Index += $WrapLength) {
|
|
555 if (($Index + $WrapLength) < $Length) {
|
|
556 $StringPiece = substr($InString, $Index, $WrapLength);
|
|
557 }
|
|
558 else {
|
|
559 # Last piece of the string...
|
|
560 $StringPiece = substr($InString, $Index, $WrapLength);
|
|
561 }
|
|
562 if ($FirstPiece) {
|
|
563 $FirstPiece = 0;
|
|
564 $OutString = $StringPiece;
|
|
565 }
|
|
566 else {
|
|
567 $OutString .= "${WrapDelimiter}${StringPiece}";
|
|
568 }
|
|
569 }
|
|
570 }
|
|
571 return $OutString;
|
|
572 }
|
|
573
|
|
574 1;
|
|
575
|
|
576 __END__
|
|
577
|
|
578 =head1 NAME
|
|
579
|
|
580 TextUtil
|
|
581
|
|
582 =head1 SYNOPSIS
|
|
583
|
|
584 use TextUtil;
|
|
585
|
|
586 use TextUtil qw(:all);
|
|
587
|
|
588 =head1 DESCRIPTION
|
|
589
|
|
590 B<TextUtil> module provides the following functions:
|
|
591
|
|
592 AddNumberSuffix, ContainsWhiteSpaces, GetTextFileDataByNonUniqueKey,
|
|
593 GetTextFileDataByUniqueKey, GetTextLine, HashCode, IsEmpty, IsFloat, IsInteger,
|
|
594 IsNotEmpty, IsNumberPowerOfNumber, IsNumerical, IsPositiveInteger, JoinWords,
|
|
595 QuoteAWord, RemoveLeadingAndTrailingWhiteSpaces, RemoveLeadingWhiteSpaces,
|
|
596 RemoveTrailingWhiteSpaces, SplitWords, WrapText
|
|
597
|
|
598 =head1 FUNCTIONS
|
|
599
|
|
600 =over 4
|
|
601
|
|
602 =item B<AddNumberSuffix>
|
|
603
|
|
604 $NumberWithSuffix = AddNumberSuffix($IntegerValue);
|
|
605
|
|
606 Returns number with appropriate suffix: 0, 1st, 2nd, 3rd, 4th, and so on.
|
|
607
|
|
608 =item B<ContainsWhiteSpaces>
|
|
609
|
|
610 $Status = ContainsWhiteSpaces($TheString);
|
|
611
|
|
612 Returns 1 or 0 based on whether the string contains any white spaces.
|
|
613
|
|
614 =item B<GetTextLine>
|
|
615
|
|
616 $Line = GetTextLine(\*TEXTFILE);
|
|
617
|
|
618 Reads next line from an already opened text file, takes out any carriage return,
|
|
619 and returns it as a string. NULL is returned for EOF.
|
|
620
|
|
621 =item B<GetTextFileDataByNonUniqueKey>
|
|
622
|
|
623 GetTextFileDataByNonUniqueKey($TextDataFile, $TextDataMapRef,
|
|
624 $DataKeyColNum, $InDelim);
|
|
625
|
|
626 Load data from a text file into the specified hash reference using a specific
|
|
627 column for non-unique data key values.
|
|
628
|
|
629 The lines starting with # are treated as comments and ignored. First line
|
|
630 not starting with # must contain column labels and the number of columns in
|
|
631 all other data rows must match the number of column labels.
|
|
632
|
|
633 The first column is assumed to contain data key value by default; all other columns
|
|
634 contain data as indicated in their column labels.
|
|
635
|
|
636 In order to avoid dependence of data access on the specified column labels, the
|
|
637 column data is loaded into hash with Column<Num> hash keys, where column number
|
|
638 start from 1. The data key column is not available as Colnum<Num> hash key;
|
|
639
|
|
640 The format of the data structure loaded into a specified hash reference is:
|
|
641
|
|
642 @{$TextDataMapRef->{DataKeys}} - Array of unique data keys
|
|
643 @{$TextDataMapRef->{ColLabels}} - Array of column labels
|
|
644 @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs
|
|
645 $TextDataMapRef->{NumOfCols} - Number of columns
|
|
646 %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey>
|
|
647 @{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair with data as an array:
|
|
648 <DataCol<Num>, DataKey>
|
|
649
|
|
650 =item B<GetTextFileDataByUniqueKey>
|
|
651
|
|
652 GetTextFileDataByUniqueKey($TextDataFile, $TextDataMapRef, $DataKeyColNum,
|
|
653 $InDelim);
|
|
654
|
|
655 Load data from a text file into the specified hash reference using a a specific
|
|
656 column for unique data key values.
|
|
657
|
|
658 The lines starting with # are treated as comments and ignored. First line
|
|
659 not starting with # must contain column labels and the number of columns in
|
|
660 all other data rows must match the number of column labels.
|
|
661
|
|
662 The first column is assumed to contain data key value by default; all other columns
|
|
663 contain data as indicated in their column labels.
|
|
664
|
|
665 In order to avoid dependence of data access on the specified column labels, the
|
|
666 column data is loaded into hash with Column<Num> hash keys, where column number
|
|
667 start from 1. The data key column is not available as Colnum<Num> hash key;
|
|
668
|
|
669 The format of the data structure loaded into a specified hash reference is:
|
|
670
|
|
671 @{$TextDataMapRef->{DataKeys}} - Array of unique data keys
|
|
672 @{$TextDataMapRef->{ColLabels}} - Array of column labels
|
|
673 @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs
|
|
674 $TextDataMapRef->{NumOfCols} - Number of columns
|
|
675 %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey>
|
|
676 %{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair: <DataCol<Num>, DataKey>
|
|
677
|
|
678 =item B<HashCode>
|
|
679
|
|
680 $HashCode = HashCode($TheString);
|
|
681
|
|
682 Returns a 32 bit integer hash code using One-at-a-time algorithm By Bob Jenkins [Ref 38].
|
|
683 It's also implemented in Perl for internal hash keys in hv.h include file.
|
|
684
|
|
685 =item B<IsEmpty>
|
|
686
|
|
687 $Status = IsEmpty($TheString);
|
|
688
|
|
689 Returns 1 or 0 based on whether the string is empty.
|
|
690
|
|
691 =item B<IsInteger>
|
|
692
|
|
693 $Status = IsInteger($TheString);
|
|
694
|
|
695 Returns 1 or 0 based on whether the string is a positive integer.
|
|
696
|
|
697 =item B<IsPositiveInteger>
|
|
698
|
|
699 $Status = IsPositiveInteger($TheString);
|
|
700
|
|
701 Returns 1 or 0 based on whether the string is an integer.
|
|
702
|
|
703 =item B<IsFloat>
|
|
704
|
|
705 $Status = IsFloat($TheString);
|
|
706
|
|
707 Returns 1 or 0 based on whether the string is a float.
|
|
708
|
|
709 =item B<IsNotEmpty>
|
|
710
|
|
711 $Status = IsNotEmpty($TheString);
|
|
712
|
|
713 Returns 0 or 1 based on whether the string is empty.
|
|
714
|
|
715 =item B<IsNumerical>
|
|
716
|
|
717 $Status = IsNumerical($TheString);
|
|
718
|
|
719 Returns 1 or 0 based on whether the string is a number.
|
|
720
|
|
721 =item B<IsNumberPowerOfNumber>
|
|
722
|
|
723 $Status = IsNumberPowerOfNumber($FirstNum, $SecondNum);
|
|
724
|
|
725 Returns 1 or 0 based on whether the first number is a power of second number.
|
|
726
|
|
727 =item B<JoinWords>
|
|
728
|
|
729 $JoinedWords = JoinWords($Words, $Delim, $Quote);
|
|
730
|
|
731 Joins different words using delimiter and quote parameters, and returns it
|
|
732 as a string.
|
|
733
|
|
734 =item B<QuoteAWord>
|
|
735
|
|
736 $QuotedWord = QuoteAWord($Word, $Quote);
|
|
737
|
|
738 Returns a quoted string based on I<Quote> value.
|
|
739
|
|
740 =item B<RemoveLeadingWhiteSpaces>
|
|
741
|
|
742 $OutString = RemoveLeadingWhiteSpaces($InString);
|
|
743
|
|
744 Returns a string without any leading and traling white spaces.
|
|
745
|
|
746 =item B<RemoveTrailingWhiteSpaces>
|
|
747
|
|
748 $OutString = RemoveTrailingWhiteSpaces($InString);
|
|
749
|
|
750 Returns a string without any trailing white spaces.
|
|
751
|
|
752 =item B<RemoveLeadingAndTrailingWhiteSpaces>
|
|
753
|
|
754 $OutString = RemoveLeadingAndTrailingWhiteSpaces($InString);
|
|
755
|
|
756 Returns a string without any leading and traling white spaces.
|
|
757
|
|
758 =item B<SplitWords>
|
|
759
|
|
760 @Words = SplitWords($Line, $Delimiter);
|
|
761
|
|
762 Returns an array I<Words> ontaining unquoted words generated after spliting
|
|
763 string value I<Line> containing quoted or unquoted words.
|
|
764
|
|
765 This function is used to split strings generated by JoinWords as replacement
|
|
766 for Perl's core module funtion Text::ParseWords::quotewords() which dumps core
|
|
767 on very long strings.
|
|
768
|
|
769 =item B<WrapText>
|
|
770
|
|
771 $OutString = WrapText($InString, [$WrapLength, $WrapDelimiter]);
|
|
772
|
|
773 Returns a wrapped string. By default, I<WrapLenght> is I<40> and I<WrapDelimiter>
|
|
774 is Unix new line character.
|
|
775
|
|
776 =back
|
|
777
|
|
778 =head1 AUTHOR
|
|
779
|
|
780 Manish Sud <msud@san.rr.com>
|
|
781
|
|
782 =head1 SEE ALSO
|
|
783
|
|
784 FileUtil.pm
|
|
785
|
|
786 =head1 COPYRIGHT
|
|
787
|
|
788 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
789
|
|
790 This file is part of MayaChemTools.
|
|
791
|
|
792 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
793 the terms of the GNU Lesser General Public License as published by the Free
|
|
794 Software Foundation; either version 3 of the License, or (at your option)
|
|
795 any later version.
|
|
796
|
|
797 =cut
|