Mercurial > repos > plus91-technologies-pvt-ltd > ss_test_tool
comparison 2.4/man/man3/String::Approx.3pm @ 0:00b9898b8510 draft
Uploaded
| author | plus91-technologies-pvt-ltd |
|---|---|
| date | Wed, 04 Jun 2014 03:41:27 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:00b9898b8510 |
|---|---|
| 1 .\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.16) | |
| 2 .\" | |
| 3 .\" Standard preamble: | |
| 4 .\" ======================================================================== | |
| 5 .de Sp \" Vertical space (when we can't use .PP) | |
| 6 .if t .sp .5v | |
| 7 .if n .sp | |
| 8 .. | |
| 9 .de Vb \" Begin verbatim text | |
| 10 .ft CW | |
| 11 .nf | |
| 12 .ne \\$1 | |
| 13 .. | |
| 14 .de Ve \" End verbatim text | |
| 15 .ft R | |
| 16 .fi | |
| 17 .. | |
| 18 .\" Set up some character translations and predefined strings. \*(-- will | |
| 19 .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
| 20 .\" double quote, and \*(R" will give a right double quote. \*(C+ will | |
| 21 .\" give a nicer C++. Capital omega is used to do unbreakable dashes and | |
| 22 .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, | |
| 23 .\" nothing in troff, for use with C<>. | |
| 24 .tr \(*W- | |
| 25 .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
| 26 .ie n \{\ | |
| 27 . ds -- \(*W- | |
| 28 . ds PI pi | |
| 29 . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
| 30 . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
| 31 . ds L" "" | |
| 32 . ds R" "" | |
| 33 . ds C` "" | |
| 34 . ds C' "" | |
| 35 'br\} | |
| 36 .el\{\ | |
| 37 . ds -- \|\(em\| | |
| 38 . ds PI \(*p | |
| 39 . ds L" `` | |
| 40 . ds R" '' | |
| 41 'br\} | |
| 42 .\" | |
| 43 .\" Escape single quotes in literal strings from groff's Unicode transform. | |
| 44 .ie \n(.g .ds Aq \(aq | |
| 45 .el .ds Aq ' | |
| 46 .\" | |
| 47 .\" If the F register is turned on, we'll generate index entries on stderr for | |
| 48 .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index | |
| 49 .\" entries marked with X<> in POD. Of course, you'll have to process the | |
| 50 .\" output yourself in some meaningful fashion. | |
| 51 .ie \nF \{\ | |
| 52 . de IX | |
| 53 . tm Index:\\$1\t\\n%\t"\\$2" | |
| 54 .. | |
| 55 . nr % 0 | |
| 56 . rr F | |
| 57 .\} | |
| 58 .el \{\ | |
| 59 . de IX | |
| 60 .. | |
| 61 .\} | |
| 62 .\" | |
| 63 .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
| 64 .\" Fear. Run. Save yourself. No user-serviceable parts. | |
| 65 . \" fudge factors for nroff and troff | |
| 66 .if n \{\ | |
| 67 . ds #H 0 | |
| 68 . ds #V .8m | |
| 69 . ds #F .3m | |
| 70 . ds #[ \f1 | |
| 71 . ds #] \fP | |
| 72 .\} | |
| 73 .if t \{\ | |
| 74 . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
| 75 . ds #V .6m | |
| 76 . ds #F 0 | |
| 77 . ds #[ \& | |
| 78 . ds #] \& | |
| 79 .\} | |
| 80 . \" simple accents for nroff and troff | |
| 81 .if n \{\ | |
| 82 . ds ' \& | |
| 83 . ds ` \& | |
| 84 . ds ^ \& | |
| 85 . ds , \& | |
| 86 . ds ~ ~ | |
| 87 . ds / | |
| 88 .\} | |
| 89 .if t \{\ | |
| 90 . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
| 91 . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
| 92 . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
| 93 . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
| 94 . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
| 95 . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
| 96 .\} | |
| 97 . \" troff and (daisy-wheel) nroff accents | |
| 98 .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
| 99 .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
| 100 .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
| 101 .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
| 102 .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
| 103 .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
| 104 .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
| 105 .ds ae a\h'-(\w'a'u*4/10)'e | |
| 106 .ds Ae A\h'-(\w'A'u*4/10)'E | |
| 107 . \" corrections for vroff | |
| 108 .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
| 109 .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
| 110 . \" for low resolution devices (crt and lpr) | |
| 111 .if \n(.H>23 .if \n(.V>19 \ | |
| 112 \{\ | |
| 113 . ds : e | |
| 114 . ds 8 ss | |
| 115 . ds o a | |
| 116 . ds d- d\h'-1'\(ga | |
| 117 . ds D- D\h'-1'\(hy | |
| 118 . ds th \o'bp' | |
| 119 . ds Th \o'LP' | |
| 120 . ds ae ae | |
| 121 . ds Ae AE | |
| 122 .\} | |
| 123 .rm #[ #] #H #V #F C | |
| 124 .\" ======================================================================== | |
| 125 .\" | |
| 126 .IX Title "Approx 3pm" | |
| 127 .TH Approx 3pm "2013-01-22" "perl v5.14.2" "User Contributed Perl Documentation" | |
| 128 .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
| 129 .\" way too many mistakes in technical documents. | |
| 130 .if n .ad l | |
| 131 .nh | |
| 132 .SH "NAME" | |
| 133 String::Approx \- Perl extension for approximate matching (fuzzy matching) | |
| 134 .SH "SYNOPSIS" | |
| 135 .IX Header "SYNOPSIS" | |
| 136 .Vb 1 | |
| 137 \& use String::Approx \*(Aqamatch\*(Aq; | |
| 138 \& | |
| 139 \& print if amatch("foobar"); | |
| 140 \& | |
| 141 \& my @matches = amatch("xyzzy", @inputs); | |
| 142 \& | |
| 143 \& my @catches = amatch("plugh", [\*(Aq2\*(Aq], @inputs); | |
| 144 .Ve | |
| 145 .SH "DESCRIPTION" | |
| 146 .IX Header "DESCRIPTION" | |
| 147 String::Approx lets you match and substitute strings approximately. | |
| 148 With this you can emulate errors: typing errorrs, speling errors, | |
| 149 closely related vocabularies (colour color), genetic mutations (\s-1GAG\s0 | |
| 150 \&\s-1ACT\s0), abbreviations (McScot, MacScot). | |
| 151 .PP | |
| 152 \&\s-1NOTE:\s0 String::Approx suits the task of \fBstring matching\fR, not | |
| 153 \&\fBstring comparison\fR, and it works for \fBstrings\fR, not for \fBtext\fR. | |
| 154 .PP | |
| 155 If you want to compare strings for similarity, you probably just want | |
| 156 the Levenshtein edit distance (explained below), the Text::Levenshtein | |
| 157 and Text::LevenshteinXS modules in \s-1CPAN\s0. See also Text::WagnerFischer | |
| 158 and Text::PhraseDistance. (There are functions for this in String::Approx, | |
| 159 e.g. \fIadist()\fR, but their results sometimes differ from the bare Levenshtein | |
| 160 et al.) | |
| 161 .PP | |
| 162 If you want to compare things like text or source code, consisting of | |
| 163 \&\fBwords\fR or \fBtokens\fR and \fBphrases\fR and \fBsentences\fR, or | |
| 164 \&\fBexpressions\fR and \fBstatements\fR, you should probably use some other | |
| 165 tool than String::Approx, like for example the standard \s-1UNIX\s0 \fIdiff\fR\|(1) | |
| 166 tool, or the Algorithm::Diff module from \s-1CPAN\s0. | |
| 167 .PP | |
| 168 The measure of \fBapproximateness\fR is the \fILevenshtein edit distance\fR. | |
| 169 It is the total number of \*(L"edits\*(R": insertions, | |
| 170 .PP | |
| 171 .Vb 1 | |
| 172 \& word world | |
| 173 .Ve | |
| 174 .PP | |
| 175 deletions, | |
| 176 .PP | |
| 177 .Vb 1 | |
| 178 \& monkey money | |
| 179 .Ve | |
| 180 .PP | |
| 181 and substitutions | |
| 182 .PP | |
| 183 .Vb 1 | |
| 184 \& sun fun | |
| 185 .Ve | |
| 186 .PP | |
| 187 required to transform a string to another string. For example, to | |
| 188 transform \fI\*(L"lead\*(R"\fR into \fI\*(L"gold\*(R"\fR, you need three edits: | |
| 189 .PP | |
| 190 .Vb 1 | |
| 191 \& lead gead goad gold | |
| 192 .Ve | |
| 193 .PP | |
| 194 The edit distance of \*(L"lead\*(R" and \*(L"gold\*(R" is therefore three, or 75%. | |
| 195 .PP | |
| 196 \&\fBString::Approx\fR uses the Levenshtein edit distance as its measure, but | |
| 197 String::Approx is not well-suited for comparing strings of different | |
| 198 length, in other words, if you want a \*(L"fuzzy eq\*(R", see above. | |
| 199 String::Approx is more like regular expressions or \fIindex()\fR, it finds | |
| 200 substrings that are close matches.> | |
| 201 .SH "MATCH" | |
| 202 .IX Header "MATCH" | |
| 203 .Vb 1 | |
| 204 \& use String::Approx \*(Aqamatch\*(Aq; | |
| 205 \& | |
| 206 \& $matched = amatch("pattern") | |
| 207 \& $matched = amatch("pattern", [ modifiers ]) | |
| 208 \& | |
| 209 \& $any_matched = amatch("pattern", @inputs) | |
| 210 \& $any_matched = amatch("pattern", [ modifiers ], @inputs) | |
| 211 \& | |
| 212 \& @match = amatch("pattern") | |
| 213 \& @match = amatch("pattern", [ modifiers ]) | |
| 214 \& | |
| 215 \& @matches = amatch("pattern", @inputs) | |
| 216 \& @matches = amatch("pattern", [ modifiers ], @inputs) | |
| 217 .Ve | |
| 218 .PP | |
| 219 Match \fBpattern\fR approximately. In list context return the matched | |
| 220 \&\fB\f(CB@inputs\fB\fR. If no inputs are given, match against the \fB\f(CB$_\fB\fR. In scalar | |
| 221 context return true if \fIany\fR of the inputs match, false if none match. | |
| 222 .PP | |
| 223 Notice that the pattern is a string. Not a regular expression. None | |
| 224 of the regular expression notations (^, ., *, and so on) work. They | |
| 225 are characters just like the others. Note-on-note: some limited form | |
| 226 of \fI\*(L"regular expressionism\*(R"\fR is planned in future: for example | |
| 227 character classes ([abc]) and \fIany-chars\fR (.). But that feature will | |
| 228 be turned on by a special \fImodifier\fR (just a guess: \*(L"r\*(R"), so there | |
| 229 should be no backward compatibility problem. | |
| 230 .PP | |
| 231 Notice also that matching is not symmetric. The inputs are matched | |
| 232 against the pattern, not the other way round. In other words: the | |
| 233 pattern can be a substring, a submatch, of an input element. An input | |
| 234 element is always a superstring of the pattern. | |
| 235 .SS "\s-1MODIFIERS\s0" | |
| 236 .IX Subsection "MODIFIERS" | |
| 237 With the modifiers you can control the amount of approximateness and | |
| 238 certain other control variables. The modifiers are one or more | |
| 239 strings, for example \fB\*(L"i\*(R"\fR, within a string optionally separated by | |
| 240 whitespace. The modifiers are inside an anonymous array: the \fB[ ]\fR | |
| 241 in the syntax are not notational, they really do mean \fB[ ]\fR, for | |
| 242 example \fB[ \*(L"i\*(R", \*(L"2\*(R" ]\fR. \fB[\*(L"2 i\*(R"]\fR would be identical. | |
| 243 .PP | |
| 244 The implicit default approximateness is 10%, rounded up. In other | |
| 245 words: every tenth character in the pattern may be an error, an edit. | |
| 246 You can explicitly set the maximum approximateness by supplying a | |
| 247 modifier like | |
| 248 .PP | |
| 249 .Vb 2 | |
| 250 \& number | |
| 251 \& number% | |
| 252 .Ve | |
| 253 .PP | |
| 254 Examples: \fB\*(L"3\*(R"\fR, \fB\*(L"15%\*(R"\fR. | |
| 255 .PP | |
| 256 Note that \f(CW\*(C`0%\*(C'\fR is not rounded up, it is equal to \f(CW0\fR. | |
| 257 .PP | |
| 258 Using a similar syntax you can separately control the maximum number | |
| 259 of insertions, deletions, and substitutions by prefixing the numbers | |
| 260 with I, D, or S, like this: | |
| 261 .PP | |
| 262 .Vb 6 | |
| 263 \& Inumber | |
| 264 \& Inumber% | |
| 265 \& Dnumber | |
| 266 \& Dnumber% | |
| 267 \& Snumber | |
| 268 \& Snumber% | |
| 269 .Ve | |
| 270 .PP | |
| 271 Examples: \fB\*(L"I2\*(R"\fR, \fB\*(L"D20%\*(R"\fR, \fB\*(L"S0\*(R"\fR. | |
| 272 .PP | |
| 273 You can ignore case (\fB\*(L"A\*(R"\fR becames equal to \fB\*(L"a\*(R"\fR and vice versa) | |
| 274 by adding the \fB\*(L"i\*(R"\fR modifier. | |
| 275 .PP | |
| 276 For example | |
| 277 .PP | |
| 278 .Vb 1 | |
| 279 \& [ "i 25%", "S0" ] | |
| 280 .Ve | |
| 281 .PP | |
| 282 means \fIignore case\fR, \fIallow every fourth character to be \*(L"an edit\*(R"\fR, | |
| 283 but allow \fIno substitutions\fR. (See \s-1NOTES\s0 about disallowing | |
| 284 substitutions or insertions.) | |
| 285 .PP | |
| 286 \&\s-1NOTE:\s0 setting \f(CW\*(C`I0 D0 S0\*(C'\fR is not equivalent to using \fIindex()\fR. | |
| 287 If you want to use \fIindex()\fR, use \fIindex()\fR. | |
| 288 .SH "SUBSTITUTE" | |
| 289 .IX Header "SUBSTITUTE" | |
| 290 .Vb 1 | |
| 291 \& use String::Approx \*(Aqasubstitute\*(Aq; | |
| 292 \& | |
| 293 \& @substituted = asubstitute("pattern", "replacement") | |
| 294 \& @substituted = asubstitute("pattern", "replacement", @inputs) | |
| 295 \& @substituted = asubstitute("pattern", "replacement", [ modifiers ]) | |
| 296 \& @substituted = asubstitute("pattern", "replacement", | |
| 297 \& [ modifiers ], @inputs) | |
| 298 .Ve | |
| 299 .PP | |
| 300 Substitute approximate \fBpattern\fR with \fBreplacement\fR and return as a | |
| 301 list <copies> of \fB\f(CB@inputs\fB\fR, the substitutions having been made on the | |
| 302 elements that did match the pattern. If no inputs are given, | |
| 303 substitute in the \fB\f(CB$_\fB\fR. The replacement can contain magic strings | |
| 304 \&\fB$&\fR, \fB$`\fR, \fB$'\fR that stand for the matched string, the string | |
| 305 before it, and the string after it, respectively. All the other | |
| 306 arguments are as in \f(CW\*(C`amatch()\*(C'\fR, plus one additional modifier, \fB\*(L"g\*(R"\fR | |
| 307 which means substitute globally (all the matches in an element and not | |
| 308 just the first one, as is the default). | |
| 309 .PP | |
| 310 See \*(L"\s-1BAD\s0 \s-1NEWS\s0\*(R" about the unfortunate stinginess of \f(CW\*(C`asubstitute()\*(C'\fR. | |
| 311 .SH "INDEX" | |
| 312 .IX Header "INDEX" | |
| 313 .Vb 1 | |
| 314 \& use String::Approx \*(Aqaindex\*(Aq; | |
| 315 \& | |
| 316 \& $index = aindex("pattern") | |
| 317 \& @indices = aindex("pattern", @inputs) | |
| 318 \& $index = aindex("pattern", [ modifiers ]) | |
| 319 \& @indices = aindex("pattern", [ modifiers ], @inputs) | |
| 320 .Ve | |
| 321 .PP | |
| 322 Like \f(CW\*(C`amatch()\*(C'\fR but returns the index/indices at which the pattern | |
| 323 matches approximately. In list context and if \f(CW@inputs\fR are used, | |
| 324 returns a list of indices, one index for each input element. | |
| 325 If there's no approximate match, \f(CW\*(C`\-1\*(C'\fR is returned as the index. | |
| 326 .PP | |
| 327 \&\s-1NOTE:\s0 if there is character repetition (e.g. \*(L"aa\*(R") either in | |
| 328 the pattern or in the text, the returned index might start | |
| 329 \&\*(L"too early\*(R". This is consistent with the goal of the module | |
| 330 of matching \*(L"as early as possible\*(R", just like regular expressions | |
| 331 (that there might be a \*(L"less approximate\*(R" match starting later is | |
| 332 of somewhat irrelevant). | |
| 333 .PP | |
| 334 There's also backwards-scanning \f(CW\*(C`arindex()\*(C'\fR. | |
| 335 .SH "SLICE" | |
| 336 .IX Header "SLICE" | |
| 337 .Vb 1 | |
| 338 \& use String::Approx \*(Aqaslice\*(Aq; | |
| 339 \& | |
| 340 \& ($index, $size) = aslice("pattern") | |
| 341 \& ([$i0, $s0], ...) = aslice("pattern", @inputs) | |
| 342 \& ($index, $size) = aslice("pattern", [ modifiers ]) | |
| 343 \& ([$i0, $s0], ...) = aslice("pattern", [ modifiers ], @inputs) | |
| 344 .Ve | |
| 345 .PP | |
| 346 Like \f(CW\*(C`aindex()\*(C'\fR but returns also the size (length) of the match. | |
| 347 If the match fails, returns an empty list (when matching against \f(CW$_\fR) | |
| 348 or an empty anonymous list corresponding to the particular input. | |
| 349 .PP | |
| 350 \&\s-1NOTE:\s0 size of the match will very probably be something you did not | |
| 351 expect (such as longer than the pattern, or a negative number). This | |
| 352 may or may not be fixed in future releases. Also the beginning of the | |
| 353 match may vary from the expected as with \fIaindex()\fR, see above. | |
| 354 .PP | |
| 355 If the modifier | |
| 356 .PP | |
| 357 .Vb 1 | |
| 358 \& "minimal_distance" | |
| 359 .Ve | |
| 360 .PP | |
| 361 is used, the minimal possible edit distance is returned as the | |
| 362 third element: | |
| 363 .PP | |
| 364 .Vb 2 | |
| 365 \& ($index, $size, $distance) = aslice("pattern", [ modifiers ]) | |
| 366 \& ([$i0, $s0, $d0], ...) = aslice("pattern", [ modifiers ], @inputs) | |
| 367 .Ve | |
| 368 .SH "DISTANCE" | |
| 369 .IX Header "DISTANCE" | |
| 370 .Vb 1 | |
| 371 \& use String::Approx \*(Aqadist\*(Aq; | |
| 372 \& | |
| 373 \& $dist = adist("pattern", $input); | |
| 374 \& @dist = adist("pattern", @input); | |
| 375 .Ve | |
| 376 .PP | |
| 377 Return the \fIedit distance\fR or distances between the pattern and the | |
| 378 input or inputs. Zero edit distance means exact match. (Remember | |
| 379 that the match can 'float' in the inputs, the match is a substring | |
| 380 match.) If the pattern is longer than the input or inputs, the | |
| 381 returned distance or distances is or are negative. | |
| 382 .PP | |
| 383 .Vb 1 | |
| 384 \& use String::Approx \*(Aqadistr\*(Aq; | |
| 385 \& | |
| 386 \& $dist = adistr("pattern", $input); | |
| 387 \& @dist = adistr("pattern", @inputs); | |
| 388 .Ve | |
| 389 .PP | |
| 390 Return the \fBrelative\fR \fIedit distance\fR or distances between the | |
| 391 pattern and the input or inputs. Zero relative edit distance means | |
| 392 exact match, one means completely different. (Remember that the | |
| 393 match can 'float' in the inputs, the match is a substring match.) If | |
| 394 the pattern is longer than the input or inputs, the returned distance | |
| 395 or distances is or are negative. | |
| 396 .PP | |
| 397 You can use \fIadist()\fR or \fIadistr()\fR to sort the inputs according to their | |
| 398 approximateness: | |
| 399 .PP | |
| 400 .Vb 3 | |
| 401 \& my %d; | |
| 402 \& @d{@inputs} = map { abs } adistr("pattern", @inputs); | |
| 403 \& my @d = sort { $d{$a} <=> $d{$b} } @inputs; | |
| 404 .Ve | |
| 405 .PP | |
| 406 Now \f(CW@d\fR contains the inputs, the most like \f(CW"pattern"\fR first. | |
| 407 .SH "CONTROLLING THE CACHE" | |
| 408 .IX Header "CONTROLLING THE CACHE" | |
| 409 \&\f(CW\*(C`String::Approx\*(C'\fR maintains a \s-1LU\s0 (least-used) cache that holds the | |
| 410 \&'matching engines' for each instance of a \fIpattern+modifiers\fR. The | |
| 411 cache is intended to help the case where you match a small set of | |
| 412 patterns against a large set of string. However, the more engines you | |
| 413 cache the more you eat memory. If you have a lot of different | |
| 414 patterns or if you have a lot of memory to burn, you may want to | |
| 415 control the cache yourself. For example, allowing a larger cache | |
| 416 consumes more memory but probably runs a little bit faster since the | |
| 417 cache fills (and needs flushing) less often. | |
| 418 .PP | |
| 419 The cache has two parameters: \fImax\fR and \fIpurge\fR. The first one | |
| 420 is the maximum size of the cache and the second one is the cache | |
| 421 flushing ratio: when the number of cache entries exceeds \fImax\fR, | |
| 422 \&\fImax\fR times \fIpurge\fR cache entries are flushed. The default | |
| 423 values are 1000 and 0.75, respectively, which means that when | |
| 424 the 1001st entry would be cached, 750 least used entries will | |
| 425 be removed from the cache. To access the parameters you can | |
| 426 use the calls | |
| 427 .PP | |
| 428 .Vb 2 | |
| 429 \& $now_max = String::Approx::cache_max(); | |
| 430 \& String::Approx::cache_max($new_max); | |
| 431 \& | |
| 432 \& $now_purge = String::Approx::cache_purge(); | |
| 433 \& String::Approx::cache_purge($new_purge); | |
| 434 \& | |
| 435 \& $limit = String::Approx::cache_n_purge(); | |
| 436 .Ve | |
| 437 .PP | |
| 438 To be honest, there are actually \fBtwo\fR caches: the first one is used | |
| 439 far the patterns with no modifiers, the second one for the patterns | |
| 440 with pattern modifiers. Using the standard parameters you will | |
| 441 therefore actually cache up to 2000 entries. The above calls control | |
| 442 both caches for the same price. | |
| 443 .PP | |
| 444 To disable caching completely use | |
| 445 .PP | |
| 446 .Vb 1 | |
| 447 \& String::Approx::cache_disable(); | |
| 448 .Ve | |
| 449 .PP | |
| 450 Note that this doesn't flush any possibly existing cache entries, | |
| 451 to do that use | |
| 452 .PP | |
| 453 .Vb 1 | |
| 454 \& String::Approx::cache_flush_all(); | |
| 455 .Ve | |
| 456 .SH "NOTES" | |
| 457 .IX Header "NOTES" | |
| 458 Because matching is by \fIsubstrings\fR, not by whole strings, insertions | |
| 459 and substitutions produce often very similar results: \*(L"abcde\*(R" matches | |
| 460 \&\*(L"axbcde\*(R" either by insertion \fBor\fR substitution of \*(L"x\*(R". | |
| 461 .PP | |
| 462 The maximum edit distance is also the maximum number of edits. | |
| 463 That is, the \fB\*(L"I2\*(R"\fR in | |
| 464 .PP | |
| 465 .Vb 1 | |
| 466 \& amatch("abcd", ["I2"]) | |
| 467 .Ve | |
| 468 .PP | |
| 469 is useless because the maximum edit distance is (implicitly) 1. | |
| 470 You may have meant to say | |
| 471 .PP | |
| 472 .Vb 1 | |
| 473 \& amatch("abcd", ["2D1S1"]) | |
| 474 .Ve | |
| 475 .PP | |
| 476 or something like that. | |
| 477 .PP | |
| 478 If you want to simulate transposes | |
| 479 .PP | |
| 480 .Vb 1 | |
| 481 \& feet fete | |
| 482 .Ve | |
| 483 .PP | |
| 484 you need to allow at least edit distance of two because in terms of | |
| 485 our edit primitives a transpose is first one deletion and then one | |
| 486 insertion. | |
| 487 .SS "\s-1TEXT\s0 \s-1POSITION\s0" | |
| 488 .IX Subsection "TEXT POSITION" | |
| 489 The starting and ending positions of matching, substituting, indexing, or | |
| 490 slicing can be changed from the beginning and end of the input(s) to | |
| 491 some other positions by using either or both of the modifiers | |
| 492 .PP | |
| 493 .Vb 2 | |
| 494 \& "initial_position=24" | |
| 495 \& "final_position=42" | |
| 496 .Ve | |
| 497 .PP | |
| 498 or the both the modifiers | |
| 499 .PP | |
| 500 .Vb 2 | |
| 501 \& "initial_position=24" | |
| 502 \& "position_range=10" | |
| 503 .Ve | |
| 504 .PP | |
| 505 By setting the \fB\*(L"position_range\*(R"\fR to be zero you can limit | |
| 506 (anchor) the operation to happen only once (if a match is possible) | |
| 507 at the position. | |
| 508 .SH "VERSION" | |
| 509 .IX Header "VERSION" | |
| 510 Major release 3. | |
| 511 .SH "CHANGES FROM VERSION 2" | |
| 512 .IX Header "CHANGES FROM VERSION 2" | |
| 513 .SS "\s-1GOOD\s0 \s-1NEWS\s0" | |
| 514 .IX Subsection "GOOD NEWS" | |
| 515 .IP "The version 3 is 2\-3 times faster than version 2" 4 | |
| 516 .IX Item "The version 3 is 2-3 times faster than version 2" | |
| 517 .PD 0 | |
| 518 .IP "No pattern length limitation" 4 | |
| 519 .IX Item "No pattern length limitation" | |
| 520 .PD | |
| 521 The algorithm is independent on the pattern length: its time | |
| 522 complexity is \fIO(kn)\fR, where \fIk\fR is the number of edits and \fIn\fR the | |
| 523 length of the text (input). The preprocessing of the pattern will of | |
| 524 course take some \fIO(m)\fR (\fIm\fR being the pattern length) time, but | |
| 525 \&\f(CW\*(C`amatch()\*(C'\fR and \f(CW\*(C`asubstitute()\*(C'\fR cache the result of this | |
| 526 preprocessing so that it is done only once per pattern. | |
| 527 .SS "\s-1BAD\s0 \s-1NEWS\s0" | |
| 528 .IX Subsection "BAD NEWS" | |
| 529 .IP "You do need a C compiler to install the module" 4 | |
| 530 .IX Item "You do need a C compiler to install the module" | |
| 531 Perl's regular expressions are no more used; instead a faster and more | |
| 532 scalable algorithm written in C is used. | |
| 533 .ie n .IP """asubstitute()"" is now always stingy" 4 | |
| 534 .el .IP "\f(CWasubstitute()\fR is now always stingy" 4 | |
| 535 .IX Item "asubstitute() is now always stingy" | |
| 536 The string matched and substituted is now always stingy, as short | |
| 537 as possible. It used to be as long as possible. This is an unfortunate | |
| 538 change stemming from switching the matching algorithm. Example: with | |
| 539 edit distance of two and substituting for \fB\*(L"word\*(R"\fR from \fB\*(L"cork\*(R"\fR and | |
| 540 \&\fB\*(L"wool\*(R"\fR previously did match \fB\*(L"cork\*(R"\fR and \fB\*(L"wool\*(R"\fR. Now it does | |
| 541 match \fB\*(L"or\*(R"\fR and \fB\*(L"wo\*(R"\fR. As little as possible, or, in other words, | |
| 542 with as much approximateness, as many edits, as possible. Because | |
| 543 there is no \fIneed\fR to match the \fB\*(L"c\*(R"\fR of \fB\*(L"cork\*(R"\fR, it is not matched. | |
| 544 .ie n .IP "no more ""aregex()"" because regular expressions are no more used" 4 | |
| 545 .el .IP "no more \f(CWaregex()\fR because regular expressions are no more used" 4 | |
| 546 .IX Item "no more aregex() because regular expressions are no more used" | |
| 547 .PD 0 | |
| 548 .ie n .IP "no more ""compat1"" for String::Approx version 1 compatibility" 4 | |
| 549 .el .IP "no more \f(CWcompat1\fR for String::Approx version 1 compatibility" 4 | |
| 550 .IX Item "no more compat1 for String::Approx version 1 compatibility" | |
| 551 .PD | |
| 552 .SH "ACKNOWLEDGEMENTS" | |
| 553 .IX Header "ACKNOWLEDGEMENTS" | |
| 554 The following people have provided valuable test cases, documentation | |
| 555 clarifications, and other feedback: | |
| 556 .PP | |
| 557 Jared August, Arthur Bergman, Anirvan Chatterjee, Steve A. Chervitz, | |
| 558 Aldo Calpini, David Curiel, Teun van den Dool, Alberto Fontaneda, | |
| 559 Rob Fugina, Dmitrij Frishman, Lars Gregersen, Kevin Greiner, | |
| 560 B. Elijah Griffin, Mike Hanafey, Mitch Helle, Ricky Houghton, | |
| 561 \&'idallen', Helmut Jarausch, Damian Keefe, Ben Kennedy, Craig Kelley, | |
| 562 Franz Kirsch, Dag Kristian, Mark Land, J. D. Laub, John P. Linderman, | |
| 563 Tim Maher, Juha Muilu, Sergey Novoselov, Andy Oram, Ji Y Park, | |
| 564 Eric Promislow, Nikolaus Rath, Stefan Ram, Slaven Rezic, | |
| 565 Dag Kristian Rognlien, Stewart Russell, Slaven Rezic, Chris Rosin, | |
| 566 Pasha Sadri, Ilya Sandler, Bob J.A. Schijvenaars, Ross Smith, | |
| 567 Frank Tobin, Greg Ward, Rich Williams, Rick Wise. | |
| 568 .PP | |
| 569 The matching algorithm was developed by Udi Manber, Sun Wu, and Burra | |
| 570 Gopal in the Department of Computer Science, University of Arizona. | |
| 571 .SH "AUTHOR" | |
| 572 .IX Header "AUTHOR" | |
| 573 Jarkko Hietaniemi <jhi@iki.fi> | |
| 574 .SH "COPYRIGHT AND LICENSE" | |
| 575 .IX Header "COPYRIGHT AND LICENSE" | |
| 576 Copyright 2001\-2013 by Jarkko Hietaniemi | |
| 577 .PP | |
| 578 This library is free software; you can redistribute it and/or modify | |
| 579 under either the terms of the Artistic License 2.0, or the \s-1GNU\s0 Library | |
| 580 General Public License, Version 2. See the files Artistic and \s-1LGPL\s0 | |
| 581 for more details. | |
| 582 .PP | |
| 583 Furthermore: no warranties or obligations of any kind are given, and | |
| 584 the separate file \fI\s-1COPYRIGHT\s0\fR must be included intact in all copies | |
| 585 and derived materials. |
