Mercurial > repos > plus91-technologies-pvt-ltd > ss_test_tool
comparison 2.4/lib/perl5/x86_64-linux-gnu-thread-multi/String/Approx.pm @ 0:00b9898b8510 draft
Uploaded
| author | plus91-technologies-pvt-ltd |
|---|---|
| date | Wed, 04 Jun 2014 03:41:27 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:00b9898b8510 |
|---|---|
| 1 package String::Approx; | |
| 2 | |
| 3 require v5.8.0; | |
| 4 | |
| 5 $VERSION = '3.27'; | |
| 6 | |
| 7 use strict; | |
| 8 local $^W = 1; | |
| 9 | |
| 10 use Carp; | |
| 11 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); | |
| 12 | |
| 13 require Exporter; | |
| 14 require DynaLoader; | |
| 15 | |
| 16 @ISA = qw(Exporter DynaLoader); | |
| 17 | |
| 18 @EXPORT_OK = qw(amatch asubstitute aindex aslice arindex | |
| 19 adist adistr adistword adistrword); | |
| 20 | |
| 21 bootstrap String::Approx $VERSION; | |
| 22 | |
| 23 my $CACHE_MAX = 1000; # high water mark | |
| 24 my $CACHE_PURGE = 0.75; # purge this much of the least used | |
| 25 my $CACHE_N_PURGE; # purge this many of the least used | |
| 26 | |
| 27 sub cache_n_purge () { | |
| 28 $CACHE_N_PURGE = $CACHE_MAX * $CACHE_PURGE; | |
| 29 $CACHE_N_PURGE = 1 if $CACHE_N_PURGE < 1; | |
| 30 return $CACHE_N_PURGE; | |
| 31 } | |
| 32 | |
| 33 cache_n_purge(); | |
| 34 | |
| 35 sub cache_max (;$) { | |
| 36 if (@_ == 0) { | |
| 37 return $CACHE_MAX; | |
| 38 } else { | |
| 39 $CACHE_MAX = shift; | |
| 40 } | |
| 41 $CACHE_MAX = 0 if $CACHE_MAX < 0; | |
| 42 cache_n_purge(); | |
| 43 } | |
| 44 | |
| 45 sub cache_purge (;$) { | |
| 46 if (@_ == 0) { | |
| 47 return $CACHE_PURGE; | |
| 48 } else { | |
| 49 $CACHE_PURGE = shift; | |
| 50 } | |
| 51 if ($CACHE_PURGE < 0) { | |
| 52 $CACHE_PURGE = 0; | |
| 53 } elsif ($CACHE_PURGE > 1) { | |
| 54 $CACHE_PURGE = 1; | |
| 55 } | |
| 56 cache_n_purge(); | |
| 57 } | |
| 58 | |
| 59 my %_simple; | |
| 60 my %_simple_usage_count; | |
| 61 | |
| 62 sub _cf_simple { | |
| 63 my $P = shift; | |
| 64 | |
| 65 my @usage = | |
| 66 sort { $_simple_usage_count{$a} <=> $_simple_usage_count{$b} } | |
| 67 grep { $_ ne $P } | |
| 68 keys %_simple_usage_count; | |
| 69 | |
| 70 # Make room, delete the least used entries. | |
| 71 $#usage = $CACHE_N_PURGE - 1; | |
| 72 | |
| 73 delete @_simple_usage_count{@usage}; | |
| 74 delete @_simple{@usage}; | |
| 75 } | |
| 76 | |
| 77 sub _simple { | |
| 78 my $P = shift; | |
| 79 | |
| 80 my $_simple = new(__PACKAGE__, $P); | |
| 81 | |
| 82 if ($CACHE_MAX) { | |
| 83 $_simple{$P} = $_simple unless exists $_simple{$P}; | |
| 84 | |
| 85 $_simple_usage_count{$P}++; | |
| 86 | |
| 87 if (keys %_simple_usage_count > $CACHE_MAX) { | |
| 88 _cf_simple($P); | |
| 89 } | |
| 90 } | |
| 91 | |
| 92 return ( $_simple ); | |
| 93 } | |
| 94 | |
| 95 sub _parse_param { | |
| 96 use integer; | |
| 97 | |
| 98 my ($n, @param) = @_; | |
| 99 my %param; | |
| 100 | |
| 101 foreach (@param) { | |
| 102 while ($_ ne '') { | |
| 103 s/^\s+//; | |
| 104 if (s/^([IDS]\s*)?(\d+)(\s*%)?//) { | |
| 105 my $k = defined $3 ? (($2-1) * $n) / 100 + ($2 ? 1 : 0) : $2; | |
| 106 | |
| 107 if (defined $1) { | |
| 108 $param{$1} = $k; | |
| 109 } else { | |
| 110 $param{k} = $k; | |
| 111 } | |
| 112 } elsif (s/^initial_position\W+(\d+)\b//) { | |
| 113 $param{'initial_position'} = $1; | |
| 114 } elsif (s/^final_position\W+(\d+)\b//) { | |
| 115 $param{'final_position'} = $1; | |
| 116 } elsif (s/^position_range\W+(\d+)\b//) { | |
| 117 $param{'position_range'} = $1; | |
| 118 } elsif (s/^minimal_distance\b//) { | |
| 119 $param{'minimal_distance'} = 1; | |
| 120 } elsif (s/^i//) { | |
| 121 $param{ i } = 1; | |
| 122 } elsif (s/^g//) { | |
| 123 $param{ g } = 1; | |
| 124 } elsif (s/^\?//) { | |
| 125 $param{'?'} = 1; | |
| 126 } else { | |
| 127 warn "unknown parameter: '$_'\n"; | |
| 128 return; | |
| 129 } | |
| 130 } | |
| 131 } | |
| 132 | |
| 133 return %param; | |
| 134 } | |
| 135 | |
| 136 my %_param_key; | |
| 137 my %_parsed_param; | |
| 138 | |
| 139 my %_complex; | |
| 140 my %_complex_usage_count; | |
| 141 | |
| 142 sub _cf_complex { | |
| 143 my $P = shift; | |
| 144 | |
| 145 my @usage = | |
| 146 sort { $_complex_usage_count{$a} <=> | |
| 147 $_complex_usage_count{$b} } | |
| 148 grep { $_ ne $P } | |
| 149 keys %_complex_usage_count; | |
| 150 | |
| 151 # Make room, delete the least used entries. | |
| 152 $#usage = $CACHE_N_PURGE - 1; | |
| 153 | |
| 154 delete @_complex_usage_count{@usage}; | |
| 155 delete @_complex{@usage}; | |
| 156 } | |
| 157 | |
| 158 sub _complex { | |
| 159 my ($P, @param) = @_; | |
| 160 unshift @param, length $P; | |
| 161 my $param = "@param"; | |
| 162 my $_param_key; | |
| 163 my %param; | |
| 164 my $complex; | |
| 165 my $is_new; | |
| 166 | |
| 167 unless (exists $_param_key{$param}) { | |
| 168 %param = _parse_param(@param); | |
| 169 $_parsed_param{$param} = { %param }; | |
| 170 $_param_key{$param} = join(" ", %param); | |
| 171 } else { | |
| 172 %param = %{ $_parsed_param{$param} }; | |
| 173 } | |
| 174 | |
| 175 $_param_key = $_param_key{$param}; | |
| 176 | |
| 177 if ($CACHE_MAX) { | |
| 178 if (exists $_complex{$P}->{$_param_key}) { | |
| 179 $complex = $_complex{$P}->{$_param_key}; | |
| 180 } | |
| 181 } | |
| 182 | |
| 183 unless (defined $complex) { | |
| 184 if (exists $param{'k'}) { | |
| 185 $complex = new(__PACKAGE__, $P, $param{k}); | |
| 186 } else { | |
| 187 $complex = new(__PACKAGE__, $P); | |
| 188 } | |
| 189 $_complex{$P}->{$_param_key} = $complex if $CACHE_MAX; | |
| 190 $is_new = 1; | |
| 191 } | |
| 192 | |
| 193 if ($is_new) { | |
| 194 $complex->set_greedy unless exists $param{'?'}; | |
| 195 | |
| 196 $complex->set_insertions($param{'I'}) | |
| 197 if exists $param{'I'}; | |
| 198 $complex->set_deletions($param{'D'}) | |
| 199 if exists $param{'D'}; | |
| 200 $complex->set_substitutions($param{'S'}) | |
| 201 if exists $param{'S'}; | |
| 202 | |
| 203 $complex->set_caseignore_slice | |
| 204 if exists $param{'i'}; | |
| 205 | |
| 206 $complex->set_text_initial_position($param{'initial_position'}) | |
| 207 if exists $param{'initial_position'}; | |
| 208 | |
| 209 $complex->set_text_final_position($param{'final_position'}) | |
| 210 if exists $param{'final_position'}; | |
| 211 | |
| 212 $complex->set_text_position_range($param{'position_range'}) | |
| 213 if exists $param{'position_range'}; | |
| 214 | |
| 215 $complex->set_minimal_distance($param{'minimal_distance'}) | |
| 216 if exists $param{'minimal_distance'}; | |
| 217 } | |
| 218 | |
| 219 if ($CACHE_MAX) { | |
| 220 $_complex_usage_count{$P}->{$_param_key}++; | |
| 221 | |
| 222 # If our cache overfloweth. | |
| 223 if (scalar keys %_complex_usage_count > $CACHE_MAX) { | |
| 224 _cf_complex($P); | |
| 225 } | |
| 226 } | |
| 227 | |
| 228 return ( $complex, %param ); | |
| 229 } | |
| 230 | |
| 231 sub cache_disable { | |
| 232 cache_max(0); | |
| 233 } | |
| 234 | |
| 235 sub cache_flush_all { | |
| 236 my $old_purge = cache_purge(); | |
| 237 cache_purge(1); | |
| 238 _cf_simple(''); | |
| 239 _cf_complex(''); | |
| 240 cache_purge($old_purge); | |
| 241 } | |
| 242 | |
| 243 sub amatch { | |
| 244 my $P = shift; | |
| 245 return 1 unless length $P; | |
| 246 my $a = ((@_ && ref $_[0] eq 'ARRAY') ? | |
| 247 _complex($P, @{ shift(@_) }) : _simple($P))[0]; | |
| 248 | |
| 249 if (@_) { | |
| 250 if (wantarray) { | |
| 251 return grep { $a->match($_) } @_; | |
| 252 } else { | |
| 253 foreach (@_) { | |
| 254 return 1 if $a->match($_); | |
| 255 } | |
| 256 return 0; | |
| 257 } | |
| 258 } | |
| 259 if (defined $_) { | |
| 260 if (wantarray) { | |
| 261 return $a->match($_) ? $_ : undef; | |
| 262 } else { | |
| 263 return 1 if $a->match($_); | |
| 264 } | |
| 265 } | |
| 266 return $a->match($_) if defined $_; | |
| 267 | |
| 268 warn "amatch: \$_ is undefined: what are you matching?\n"; | |
| 269 return; | |
| 270 } | |
| 271 | |
| 272 sub _find_substitute { | |
| 273 my ($ri, $rs, $i, $s, $S, $rn) = @_; | |
| 274 | |
| 275 push @{ $ri }, $i; | |
| 276 push @{ $rs }, $s; | |
| 277 | |
| 278 my $pre = substr($_, 0, $i); | |
| 279 my $old = substr($_, $i, $s); | |
| 280 my $suf = substr($_, $i + $s); | |
| 281 my $new = $S; | |
| 282 | |
| 283 $new =~ s/\$\`/$pre/g; | |
| 284 $new =~ s/\$\&/$old/g; | |
| 285 $new =~ s/\$\'/$suf/g; | |
| 286 | |
| 287 push @{ $rn }, $new; | |
| 288 } | |
| 289 | |
| 290 sub _do_substitute { | |
| 291 my ($rn, $ri, $rs, $rS) = @_; | |
| 292 | |
| 293 my $d = 0; | |
| 294 my $n = $_; | |
| 295 | |
| 296 foreach my $i (0..$#$rn) { | |
| 297 substr($n, $ri->[$i] + $d, $rs->[$i]) = $rn->[$i]; | |
| 298 $d += length($rn->[$i]) - $rs->[$i]; | |
| 299 } | |
| 300 | |
| 301 push @{ $rS }, $n; | |
| 302 } | |
| 303 | |
| 304 sub asubstitute { | |
| 305 my $P = shift; | |
| 306 my $S = shift; | |
| 307 my ($a, %p) = | |
| 308 (@_ && ref $_[0] eq 'ARRAY') ? | |
| 309 _complex($P, @{ shift(@_) }) : _simple($P); | |
| 310 | |
| 311 my ($i, $s, @i, @s, @n, @S); | |
| 312 | |
| 313 if (@_) { | |
| 314 if (exists $p{ g }) { | |
| 315 foreach (@_) { | |
| 316 @s = @i = @n = (); | |
| 317 while (($i, $s) = $a->slice_next($_)) { | |
| 318 if (defined $i) { | |
| 319 _find_substitute(\@i, \@s, $i, $s, $S, \@n); | |
| 320 } | |
| 321 } | |
| 322 _do_substitute(\@n, \@i, \@s, \@S) if @n; | |
| 323 } | |
| 324 } else { | |
| 325 foreach (@_) { | |
| 326 @s = @i = @n = (); | |
| 327 ($i, $s) = $a->slice($_); | |
| 328 if (defined $i) { | |
| 329 _find_substitute(\@i, \@s, $i, $s, $S, \@n); | |
| 330 _do_substitute(\@n, \@i, \@s, \@S); | |
| 331 } | |
| 332 } | |
| 333 } | |
| 334 return @S; | |
| 335 } elsif (defined $_) { | |
| 336 if (exists $p{ g }) { | |
| 337 while (($i, $s) = $a->slice_next($_)) { | |
| 338 if (defined $i) { | |
| 339 _find_substitute(\@i, \@s, $i, $s, $S, \@n); | |
| 340 } | |
| 341 } | |
| 342 _do_substitute(\@n, \@i, \@s, \@S) if @n; | |
| 343 } else { | |
| 344 ($i, $s) = $a->slice($_); | |
| 345 if (defined $i) { | |
| 346 _find_substitute(\@i, \@s, $i, $s, $S, \@n); | |
| 347 _do_substitute(\@n, \@i, \@s, \@S); | |
| 348 } | |
| 349 } | |
| 350 return $_ = $n[0]; | |
| 351 } else { | |
| 352 warn "asubstitute: \$_ is undefined: what are you substituting?\n"; | |
| 353 return; | |
| 354 } | |
| 355 } | |
| 356 | |
| 357 sub aindex { | |
| 358 my $P = shift; | |
| 359 return 0 unless length $P; | |
| 360 my $a = ((@_ && ref $_[0] eq 'ARRAY') ? | |
| 361 _complex($P, @{ shift(@_) }) : _simple($P))[0]; | |
| 362 | |
| 363 $a->set_greedy; # The *first* match, thank you. | |
| 364 | |
| 365 if (@_) { | |
| 366 if (wantarray) { | |
| 367 return map { $a->index($_) } @_; | |
| 368 } else { | |
| 369 return $a->index($_[0]); | |
| 370 } | |
| 371 } | |
| 372 return $a->index($_) if defined $_; | |
| 373 | |
| 374 warn "aindex: \$_ is undefined: what are you indexing?\n"; | |
| 375 return; | |
| 376 } | |
| 377 | |
| 378 sub aslice { | |
| 379 my $P = shift; | |
| 380 return (0, 0) unless length $P; | |
| 381 my $a = ((@_ && ref $_[0] eq 'ARRAY') ? | |
| 382 _complex($P, @{ shift(@_) }) : _simple($P))[0]; | |
| 383 | |
| 384 $a->set_greedy; # The *first* match, thank you. | |
| 385 | |
| 386 if (@_) { | |
| 387 return map { [ $a->slice($_) ] } @_; | |
| 388 } | |
| 389 return $a->slice($_) if defined $_; | |
| 390 | |
| 391 warn "aslice: \$_ is undefined: what are you slicing?\n"; | |
| 392 return; | |
| 393 } | |
| 394 | |
| 395 sub _adist { | |
| 396 my $s0 = shift; | |
| 397 my $s1 = shift; | |
| 398 my ($aslice) = aslice($s0, ['minimal_distance', @_], $s1); | |
| 399 my ($index, $size, $distance) = @$aslice; | |
| 400 my ($l0, $l1) = map { length } ($s0, $s1); | |
| 401 return $l0 <= $l1 ? $distance : -$distance; | |
| 402 } | |
| 403 | |
| 404 sub adist { | |
| 405 my $a0 = shift; | |
| 406 my $a1 = shift; | |
| 407 if (length($a0) == 0) { | |
| 408 return length($a1); | |
| 409 } | |
| 410 if (length($a1) == 0) { | |
| 411 return length($a0); | |
| 412 } | |
| 413 my @m = ref $_[0] eq 'ARRAY' ? @{shift()} : (); | |
| 414 if (ref $a0 eq 'ARRAY') { | |
| 415 if (ref $a1 eq 'ARRAY') { | |
| 416 return [ map { adist($a0, $_, @m) } @{$a1} ]; | |
| 417 } else { | |
| 418 return [ map { _adist($_, $a1, @m) } @{$a0} ]; | |
| 419 } | |
| 420 } elsif (ref $a1 eq 'ARRAY') { | |
| 421 return [ map { _adist($a0, $_, @m) } @{$a1} ]; | |
| 422 } else { | |
| 423 if (wantarray) { | |
| 424 return map { _adist($a0, $_, @m) } ($a1, @_); | |
| 425 } else { | |
| 426 return _adist($a0, $a1, @m); | |
| 427 } | |
| 428 } | |
| 429 } | |
| 430 | |
| 431 sub adistr { | |
| 432 my $a0 = shift; | |
| 433 my $a1 = shift; | |
| 434 my @m = ref $_[0] eq 'ARRAY' ? shift : (); | |
| 435 if (ref $a0 eq 'ARRAY') { | |
| 436 if (ref $a1 eq 'ARRAY') { | |
| 437 my $l0 = length(); | |
| 438 return $l0 ? [ map { adist($a0, $_, @m) } | |
| 439 @{$a1} ] : | |
| 440 [ ]; | |
| 441 } else { | |
| 442 return [ map { my $l0 = length(); | |
| 443 $l0 ? _adist($_, $a1, @m) / $l0 : undef | |
| 444 } @{$a0} ]; | |
| 445 } | |
| 446 } elsif (ref $a1 eq 'ARRAY') { | |
| 447 my $l0 = length($a0); | |
| 448 return [] unless $l0; | |
| 449 return [ map { _adist($a0, $_, @m) / $l0 } @{$a1} ]; | |
| 450 } else { | |
| 451 my $l0 = length($a0); | |
| 452 if (wantarray) { | |
| 453 return map { $l0 ? _adist($a0, $_, @m) / $l0 : undef } ($a1, @_); | |
| 454 } else { | |
| 455 return undef unless $l0; | |
| 456 return _adist($a0, $a1, @m) / $l0; | |
| 457 } | |
| 458 } | |
| 459 } | |
| 460 | |
| 461 sub adistword { | |
| 462 return adist($_[0], $_[1], ['position_range=0']); | |
| 463 } | |
| 464 | |
| 465 sub adistrword { | |
| 466 return adistr($_[0], $_[1], ['position_range=0']); | |
| 467 } | |
| 468 | |
| 469 sub arindex { | |
| 470 my $P = shift; | |
| 471 my $l = length $P; | |
| 472 return 0 unless $l; | |
| 473 my $R = reverse $P; | |
| 474 my $a = ((@_ && ref $_[0] eq 'ARRAY') ? | |
| 475 _complex($R, @{ shift(@_) }) : _simple($R))[0]; | |
| 476 | |
| 477 $a->set_greedy; # The *first* match, thank you. | |
| 478 | |
| 479 if (@_) { | |
| 480 if (wantarray) { | |
| 481 return map { | |
| 482 my $aindex = $a->index(scalar reverse()); | |
| 483 $aindex == -1 ? $aindex : (length($_) - $aindex - $l); | |
| 484 } @_; | |
| 485 } else { | |
| 486 my $aindex = $a->index(scalar reverse $_[0]); | |
| 487 return $aindex == -1 ? $aindex : (length($_[0]) - $aindex - $l); | |
| 488 } | |
| 489 } | |
| 490 if (defined $_) { | |
| 491 my $aindex = $a->index(scalar reverse()); | |
| 492 return $aindex == -1 ? $aindex : (length($_) - $aindex - $l); | |
| 493 } | |
| 494 | |
| 495 warn "arindex: \$_ is undefined: what are you indexing?\n"; | |
| 496 return; | |
| 497 } | |
| 498 | |
| 499 1; | |
| 500 __END__ | |
| 501 =pod | |
| 502 | |
| 503 =head1 NAME | |
| 504 | |
| 505 String::Approx - Perl extension for approximate matching (fuzzy matching) | |
| 506 | |
| 507 =head1 SYNOPSIS | |
| 508 | |
| 509 use String::Approx 'amatch'; | |
| 510 | |
| 511 print if amatch("foobar"); | |
| 512 | |
| 513 my @matches = amatch("xyzzy", @inputs); | |
| 514 | |
| 515 my @catches = amatch("plugh", ['2'], @inputs); | |
| 516 | |
| 517 =head1 DESCRIPTION | |
| 518 | |
| 519 String::Approx lets you match and substitute strings approximately. | |
| 520 With this you can emulate errors: typing errorrs, speling errors, | |
| 521 closely related vocabularies (colour color), genetic mutations (GAG | |
| 522 ACT), abbreviations (McScot, MacScot). | |
| 523 | |
| 524 NOTE: String::Approx suits the task of B<string matching>, not | |
| 525 B<string comparison>, and it works for B<strings>, not for B<text>. | |
| 526 | |
| 527 If you want to compare strings for similarity, you probably just want | |
| 528 the Levenshtein edit distance (explained below), the Text::Levenshtein | |
| 529 and Text::LevenshteinXS modules in CPAN. See also Text::WagnerFischer | |
| 530 and Text::PhraseDistance. (There are functions for this in String::Approx, | |
| 531 e.g. adist(), but their results sometimes differ from the bare Levenshtein | |
| 532 et al.) | |
| 533 | |
| 534 If you want to compare things like text or source code, consisting of | |
| 535 B<words> or B<tokens> and B<phrases> and B<sentences>, or | |
| 536 B<expressions> and B<statements>, you should probably use some other | |
| 537 tool than String::Approx, like for example the standard UNIX diff(1) | |
| 538 tool, or the Algorithm::Diff module from CPAN. | |
| 539 | |
| 540 The measure of B<approximateness> is the I<Levenshtein edit distance>. | |
| 541 It is the total number of "edits": insertions, | |
| 542 | |
| 543 word world | |
| 544 | |
| 545 deletions, | |
| 546 | |
| 547 monkey money | |
| 548 | |
| 549 and substitutions | |
| 550 | |
| 551 sun fun | |
| 552 | |
| 553 required to transform a string to another string. For example, to | |
| 554 transform I<"lead"> into I<"gold">, you need three edits: | |
| 555 | |
| 556 lead gead goad gold | |
| 557 | |
| 558 The edit distance of "lead" and "gold" is therefore three, or 75%. | |
| 559 | |
| 560 B<String::Approx> uses the Levenshtein edit distance as its measure, but | |
| 561 String::Approx is not well-suited for comparing strings of different | |
| 562 length, in other words, if you want a "fuzzy eq", see above. | |
| 563 String::Approx is more like regular expressions or index(), it finds | |
| 564 substrings that are close matches.> | |
| 565 | |
| 566 =head1 MATCH | |
| 567 | |
| 568 use String::Approx 'amatch'; | |
| 569 | |
| 570 $matched = amatch("pattern") | |
| 571 $matched = amatch("pattern", [ modifiers ]) | |
| 572 | |
| 573 $any_matched = amatch("pattern", @inputs) | |
| 574 $any_matched = amatch("pattern", [ modifiers ], @inputs) | |
| 575 | |
| 576 @match = amatch("pattern") | |
| 577 @match = amatch("pattern", [ modifiers ]) | |
| 578 | |
| 579 @matches = amatch("pattern", @inputs) | |
| 580 @matches = amatch("pattern", [ modifiers ], @inputs) | |
| 581 | |
| 582 Match B<pattern> approximately. In list context return the matched | |
| 583 B<@inputs>. If no inputs are given, match against the B<$_>. In scalar | |
| 584 context return true if I<any> of the inputs match, false if none match. | |
| 585 | |
| 586 Notice that the pattern is a string. Not a regular expression. None | |
| 587 of the regular expression notations (^, ., *, and so on) work. They | |
| 588 are characters just like the others. Note-on-note: some limited form | |
| 589 of I<"regular expressionism"> is planned in future: for example | |
| 590 character classes ([abc]) and I<any-chars> (.). But that feature will | |
| 591 be turned on by a special I<modifier> (just a guess: "r"), so there | |
| 592 should be no backward compatibility problem. | |
| 593 | |
| 594 Notice also that matching is not symmetric. The inputs are matched | |
| 595 against the pattern, not the other way round. In other words: the | |
| 596 pattern can be a substring, a submatch, of an input element. An input | |
| 597 element is always a superstring of the pattern. | |
| 598 | |
| 599 =head2 MODIFIERS | |
| 600 | |
| 601 With the modifiers you can control the amount of approximateness and | |
| 602 certain other control variables. The modifiers are one or more | |
| 603 strings, for example B<"i">, within a string optionally separated by | |
| 604 whitespace. The modifiers are inside an anonymous array: the B<[ ]> | |
| 605 in the syntax are not notational, they really do mean B<[ ]>, for | |
| 606 example B<[ "i", "2" ]>. B<["2 i"]> would be identical. | |
| 607 | |
| 608 The implicit default approximateness is 10%, rounded up. In other | |
| 609 words: every tenth character in the pattern may be an error, an edit. | |
| 610 You can explicitly set the maximum approximateness by supplying a | |
| 611 modifier like | |
| 612 | |
| 613 number | |
| 614 number% | |
| 615 | |
| 616 Examples: B<"3">, B<"15%">. | |
| 617 | |
| 618 Note that C<0%> is not rounded up, it is equal to C<0>. | |
| 619 | |
| 620 Using a similar syntax you can separately control the maximum number | |
| 621 of insertions, deletions, and substitutions by prefixing the numbers | |
| 622 with I, D, or S, like this: | |
| 623 | |
| 624 Inumber | |
| 625 Inumber% | |
| 626 Dnumber | |
| 627 Dnumber% | |
| 628 Snumber | |
| 629 Snumber% | |
| 630 | |
| 631 Examples: B<"I2">, B<"D20%">, B<"S0">. | |
| 632 | |
| 633 You can ignore case (B<"A"> becames equal to B<"a"> and vice versa) | |
| 634 by adding the B<"i"> modifier. | |
| 635 | |
| 636 For example | |
| 637 | |
| 638 [ "i 25%", "S0" ] | |
| 639 | |
| 640 means I<ignore case>, I<allow every fourth character to be "an edit">, | |
| 641 but allow I<no substitutions>. (See L<NOTES> about disallowing | |
| 642 substitutions or insertions.) | |
| 643 | |
| 644 NOTE: setting C<I0 D0 S0> is not equivalent to using index(). | |
| 645 If you want to use index(), use index(). | |
| 646 | |
| 647 =head1 SUBSTITUTE | |
| 648 | |
| 649 use String::Approx 'asubstitute'; | |
| 650 | |
| 651 @substituted = asubstitute("pattern", "replacement") | |
| 652 @substituted = asubstitute("pattern", "replacement", @inputs) | |
| 653 @substituted = asubstitute("pattern", "replacement", [ modifiers ]) | |
| 654 @substituted = asubstitute("pattern", "replacement", | |
| 655 [ modifiers ], @inputs) | |
| 656 | |
| 657 Substitute approximate B<pattern> with B<replacement> and return as a | |
| 658 list <copies> of B<@inputs>, the substitutions having been made on the | |
| 659 elements that did match the pattern. If no inputs are given, | |
| 660 substitute in the B<$_>. The replacement can contain magic strings | |
| 661 B<$&>, B<$`>, B<$'> that stand for the matched string, the string | |
| 662 before it, and the string after it, respectively. All the other | |
| 663 arguments are as in C<amatch()>, plus one additional modifier, B<"g"> | |
| 664 which means substitute globally (all the matches in an element and not | |
| 665 just the first one, as is the default). | |
| 666 | |
| 667 See L<BAD NEWS> about the unfortunate stinginess of C<asubstitute()>. | |
| 668 | |
| 669 =head1 INDEX | |
| 670 | |
| 671 use String::Approx 'aindex'; | |
| 672 | |
| 673 $index = aindex("pattern") | |
| 674 @indices = aindex("pattern", @inputs) | |
| 675 $index = aindex("pattern", [ modifiers ]) | |
| 676 @indices = aindex("pattern", [ modifiers ], @inputs) | |
| 677 | |
| 678 Like C<amatch()> but returns the index/indices at which the pattern | |
| 679 matches approximately. In list context and if C<@inputs> are used, | |
| 680 returns a list of indices, one index for each input element. | |
| 681 If there's no approximate match, C<-1> is returned as the index. | |
| 682 | |
| 683 NOTE: if there is character repetition (e.g. "aa") either in | |
| 684 the pattern or in the text, the returned index might start | |
| 685 "too early". This is consistent with the goal of the module | |
| 686 of matching "as early as possible", just like regular expressions | |
| 687 (that there might be a "less approximate" match starting later is | |
| 688 of somewhat irrelevant). | |
| 689 | |
| 690 There's also backwards-scanning C<arindex()>. | |
| 691 | |
| 692 =head1 SLICE | |
| 693 | |
| 694 use String::Approx 'aslice'; | |
| 695 | |
| 696 ($index, $size) = aslice("pattern") | |
| 697 ([$i0, $s0], ...) = aslice("pattern", @inputs) | |
| 698 ($index, $size) = aslice("pattern", [ modifiers ]) | |
| 699 ([$i0, $s0], ...) = aslice("pattern", [ modifiers ], @inputs) | |
| 700 | |
| 701 Like C<aindex()> but returns also the size (length) of the match. | |
| 702 If the match fails, returns an empty list (when matching against C<$_>) | |
| 703 or an empty anonymous list corresponding to the particular input. | |
| 704 | |
| 705 NOTE: size of the match will very probably be something you did not | |
| 706 expect (such as longer than the pattern, or a negative number). This | |
| 707 may or may not be fixed in future releases. Also the beginning of the | |
| 708 match may vary from the expected as with aindex(), see above. | |
| 709 | |
| 710 If the modifier | |
| 711 | |
| 712 "minimal_distance" | |
| 713 | |
| 714 is used, the minimal possible edit distance is returned as the | |
| 715 third element: | |
| 716 | |
| 717 ($index, $size, $distance) = aslice("pattern", [ modifiers ]) | |
| 718 ([$i0, $s0, $d0], ...) = aslice("pattern", [ modifiers ], @inputs) | |
| 719 | |
| 720 =head1 DISTANCE | |
| 721 | |
| 722 use String::Approx 'adist'; | |
| 723 | |
| 724 $dist = adist("pattern", $input); | |
| 725 @dist = adist("pattern", @input); | |
| 726 | |
| 727 Return the I<edit distance> or distances between the pattern and the | |
| 728 input or inputs. Zero edit distance means exact match. (Remember | |
| 729 that the match can 'float' in the inputs, the match is a substring | |
| 730 match.) If the pattern is longer than the input or inputs, the | |
| 731 returned distance or distances is or are negative. | |
| 732 | |
| 733 use String::Approx 'adistr'; | |
| 734 | |
| 735 $dist = adistr("pattern", $input); | |
| 736 @dist = adistr("pattern", @inputs); | |
| 737 | |
| 738 Return the B<relative> I<edit distance> or distances between the | |
| 739 pattern and the input or inputs. Zero relative edit distance means | |
| 740 exact match, one means completely different. (Remember that the | |
| 741 match can 'float' in the inputs, the match is a substring match.) If | |
| 742 the pattern is longer than the input or inputs, the returned distance | |
| 743 or distances is or are negative. | |
| 744 | |
| 745 You can use adist() or adistr() to sort the inputs according to their | |
| 746 approximateness: | |
| 747 | |
| 748 my %d; | |
| 749 @d{@inputs} = map { abs } adistr("pattern", @inputs); | |
| 750 my @d = sort { $d{$a} <=> $d{$b} } @inputs; | |
| 751 | |
| 752 Now C<@d> contains the inputs, the most like C<"pattern"> first. | |
| 753 | |
| 754 =head1 CONTROLLING THE CACHE | |
| 755 | |
| 756 C<String::Approx> maintains a LU (least-used) cache that holds the | |
| 757 'matching engines' for each instance of a I<pattern+modifiers>. The | |
| 758 cache is intended to help the case where you match a small set of | |
| 759 patterns against a large set of string. However, the more engines you | |
| 760 cache the more you eat memory. If you have a lot of different | |
| 761 patterns or if you have a lot of memory to burn, you may want to | |
| 762 control the cache yourself. For example, allowing a larger cache | |
| 763 consumes more memory but probably runs a little bit faster since the | |
| 764 cache fills (and needs flushing) less often. | |
| 765 | |
| 766 The cache has two parameters: I<max> and I<purge>. The first one | |
| 767 is the maximum size of the cache and the second one is the cache | |
| 768 flushing ratio: when the number of cache entries exceeds I<max>, | |
| 769 I<max> times I<purge> cache entries are flushed. The default | |
| 770 values are 1000 and 0.75, respectively, which means that when | |
| 771 the 1001st entry would be cached, 750 least used entries will | |
| 772 be removed from the cache. To access the parameters you can | |
| 773 use the calls | |
| 774 | |
| 775 $now_max = String::Approx::cache_max(); | |
| 776 String::Approx::cache_max($new_max); | |
| 777 | |
| 778 $now_purge = String::Approx::cache_purge(); | |
| 779 String::Approx::cache_purge($new_purge); | |
| 780 | |
| 781 $limit = String::Approx::cache_n_purge(); | |
| 782 | |
| 783 To be honest, there are actually B<two> caches: the first one is used | |
| 784 far the patterns with no modifiers, the second one for the patterns | |
| 785 with pattern modifiers. Using the standard parameters you will | |
| 786 therefore actually cache up to 2000 entries. The above calls control | |
| 787 both caches for the same price. | |
| 788 | |
| 789 To disable caching completely use | |
| 790 | |
| 791 String::Approx::cache_disable(); | |
| 792 | |
| 793 Note that this doesn't flush any possibly existing cache entries, | |
| 794 to do that use | |
| 795 | |
| 796 String::Approx::cache_flush_all(); | |
| 797 | |
| 798 =head1 NOTES | |
| 799 | |
| 800 Because matching is by I<substrings>, not by whole strings, insertions | |
| 801 and substitutions produce often very similar results: "abcde" matches | |
| 802 "axbcde" either by insertion B<or> substitution of "x". | |
| 803 | |
| 804 The maximum edit distance is also the maximum number of edits. | |
| 805 That is, the B<"I2"> in | |
| 806 | |
| 807 amatch("abcd", ["I2"]) | |
| 808 | |
| 809 is useless because the maximum edit distance is (implicitly) 1. | |
| 810 You may have meant to say | |
| 811 | |
| 812 amatch("abcd", ["2D1S1"]) | |
| 813 | |
| 814 or something like that. | |
| 815 | |
| 816 If you want to simulate transposes | |
| 817 | |
| 818 feet fete | |
| 819 | |
| 820 you need to allow at least edit distance of two because in terms of | |
| 821 our edit primitives a transpose is first one deletion and then one | |
| 822 insertion. | |
| 823 | |
| 824 =head2 TEXT POSITION | |
| 825 | |
| 826 The starting and ending positions of matching, substituting, indexing, or | |
| 827 slicing can be changed from the beginning and end of the input(s) to | |
| 828 some other positions by using either or both of the modifiers | |
| 829 | |
| 830 "initial_position=24" | |
| 831 "final_position=42" | |
| 832 | |
| 833 or the both the modifiers | |
| 834 | |
| 835 "initial_position=24" | |
| 836 "position_range=10" | |
| 837 | |
| 838 By setting the B<"position_range"> to be zero you can limit | |
| 839 (anchor) the operation to happen only once (if a match is possible) | |
| 840 at the position. | |
| 841 | |
| 842 =head1 VERSION | |
| 843 | |
| 844 Major release 3. | |
| 845 | |
| 846 =head1 CHANGES FROM VERSION 2 | |
| 847 | |
| 848 =head2 GOOD NEWS | |
| 849 | |
| 850 =over 4 | |
| 851 | |
| 852 =item The version 3 is 2-3 times faster than version 2 | |
| 853 | |
| 854 =item No pattern length limitation | |
| 855 | |
| 856 The algorithm is independent on the pattern length: its time | |
| 857 complexity is I<O(kn)>, where I<k> is the number of edits and I<n> the | |
| 858 length of the text (input). The preprocessing of the pattern will of | |
| 859 course take some I<O(m)> (I<m> being the pattern length) time, but | |
| 860 C<amatch()> and C<asubstitute()> cache the result of this | |
| 861 preprocessing so that it is done only once per pattern. | |
| 862 | |
| 863 =back | |
| 864 | |
| 865 =head2 BAD NEWS | |
| 866 | |
| 867 =over 4 | |
| 868 | |
| 869 =item You do need a C compiler to install the module | |
| 870 | |
| 871 Perl's regular expressions are no more used; instead a faster and more | |
| 872 scalable algorithm written in C is used. | |
| 873 | |
| 874 =item C<asubstitute()> is now always stingy | |
| 875 | |
| 876 The string matched and substituted is now always stingy, as short | |
| 877 as possible. It used to be as long as possible. This is an unfortunate | |
| 878 change stemming from switching the matching algorithm. Example: with | |
| 879 edit distance of two and substituting for B<"word"> from B<"cork"> and | |
| 880 B<"wool"> previously did match B<"cork"> and B<"wool">. Now it does | |
| 881 match B<"or"> and B<"wo">. As little as possible, or, in other words, | |
| 882 with as much approximateness, as many edits, as possible. Because | |
| 883 there is no I<need> to match the B<"c"> of B<"cork">, it is not matched. | |
| 884 | |
| 885 =item no more C<aregex()> because regular expressions are no more used | |
| 886 | |
| 887 =item no more C<compat1> for String::Approx version 1 compatibility | |
| 888 | |
| 889 =back | |
| 890 | |
| 891 =head1 ACKNOWLEDGEMENTS | |
| 892 | |
| 893 The following people have provided valuable test cases, documentation | |
| 894 clarifications, and other feedback: | |
| 895 | |
| 896 Jared August, Arthur Bergman, Anirvan Chatterjee, Steve A. Chervitz, | |
| 897 Aldo Calpini, David Curiel, Teun van den Dool, Alberto Fontaneda, | |
| 898 Rob Fugina, Dmitrij Frishman, Lars Gregersen, Kevin Greiner, | |
| 899 B. Elijah Griffin, Mike Hanafey, Mitch Helle, Ricky Houghton, | |
| 900 'idallen', Helmut Jarausch, Damian Keefe, Ben Kennedy, Craig Kelley, | |
| 901 Franz Kirsch, Dag Kristian, Mark Land, J. D. Laub, John P. Linderman, | |
| 902 Tim Maher, Juha Muilu, Sergey Novoselov, Andy Oram, Ji Y Park, | |
| 903 Eric Promislow, Nikolaus Rath, Stefan Ram, Slaven Rezic, | |
| 904 Dag Kristian Rognlien, Stewart Russell, Slaven Rezic, Chris Rosin, | |
| 905 Pasha Sadri, Ilya Sandler, Bob J.A. Schijvenaars, Ross Smith, | |
| 906 Frank Tobin, Greg Ward, Rich Williams, Rick Wise. | |
| 907 | |
| 908 The matching algorithm was developed by Udi Manber, Sun Wu, and Burra | |
| 909 Gopal in the Department of Computer Science, University of Arizona. | |
| 910 | |
| 911 =head1 AUTHOR | |
| 912 | |
| 913 Jarkko Hietaniemi <jhi@iki.fi> | |
| 914 | |
| 915 =head1 COPYRIGHT AND LICENSE | |
| 916 | |
| 917 Copyright 2001-2013 by Jarkko Hietaniemi | |
| 918 | |
| 919 This library is free software; you can redistribute it and/or modify | |
| 920 under either the terms of the Artistic License 2.0, or the GNU Library | |
| 921 General Public License, Version 2. See the files Artistic and LGPL | |
| 922 for more details. | |
| 923 | |
| 924 Furthermore: no warranties or obligations of any kind are given, and | |
| 925 the separate file F<COPYRIGHT> must be included intact in all copies | |
| 926 and derived materials. | |
| 927 | |
| 928 =cut |
