Mercurial > repos > yating-l > jbrowsearchivecreator
comparison test/lib/python2.7/sre_parse.py @ 3:7d1a9a91b989 draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit d583ac16a6c6942730ea536eb59cc37941816030-dirty
| author | yating-l |
|---|---|
| date | Thu, 18 May 2017 18:37:28 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:3e2160197902 | 3:7d1a9a91b989 |
|---|---|
| 1 # | |
| 2 # Secret Labs' Regular Expression Engine | |
| 3 # | |
| 4 # convert re-style regular expression to sre pattern | |
| 5 # | |
| 6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. | |
| 7 # | |
| 8 # See the sre.py file for information on usage and redistribution. | |
| 9 # | |
| 10 | |
| 11 """Internal support module for sre""" | |
| 12 | |
| 13 # XXX: show string offset and offending character for all errors | |
| 14 | |
| 15 import sys | |
| 16 | |
| 17 from sre_constants import * | |
| 18 | |
| 19 SPECIAL_CHARS = ".\\[{()*+?^$|" | |
| 20 REPEAT_CHARS = "*+?{" | |
| 21 | |
| 22 DIGITS = set("0123456789") | |
| 23 | |
| 24 OCTDIGITS = set("01234567") | |
| 25 HEXDIGITS = set("0123456789abcdefABCDEF") | |
| 26 | |
| 27 WHITESPACE = set(" \t\n\r\v\f") | |
| 28 | |
| 29 ESCAPES = { | |
| 30 r"\a": (LITERAL, ord("\a")), | |
| 31 r"\b": (LITERAL, ord("\b")), | |
| 32 r"\f": (LITERAL, ord("\f")), | |
| 33 r"\n": (LITERAL, ord("\n")), | |
| 34 r"\r": (LITERAL, ord("\r")), | |
| 35 r"\t": (LITERAL, ord("\t")), | |
| 36 r"\v": (LITERAL, ord("\v")), | |
| 37 r"\\": (LITERAL, ord("\\")) | |
| 38 } | |
| 39 | |
| 40 CATEGORIES = { | |
| 41 r"\A": (AT, AT_BEGINNING_STRING), # start of string | |
| 42 r"\b": (AT, AT_BOUNDARY), | |
| 43 r"\B": (AT, AT_NON_BOUNDARY), | |
| 44 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), | |
| 45 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), | |
| 46 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), | |
| 47 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), | |
| 48 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), | |
| 49 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), | |
| 50 r"\Z": (AT, AT_END_STRING), # end of string | |
| 51 } | |
| 52 | |
| 53 FLAGS = { | |
| 54 # standard flags | |
| 55 "i": SRE_FLAG_IGNORECASE, | |
| 56 "L": SRE_FLAG_LOCALE, | |
| 57 "m": SRE_FLAG_MULTILINE, | |
| 58 "s": SRE_FLAG_DOTALL, | |
| 59 "x": SRE_FLAG_VERBOSE, | |
| 60 # extensions | |
| 61 "t": SRE_FLAG_TEMPLATE, | |
| 62 "u": SRE_FLAG_UNICODE, | |
| 63 } | |
| 64 | |
| 65 class Pattern: | |
| 66 # master pattern object. keeps track of global attributes | |
| 67 def __init__(self): | |
| 68 self.flags = 0 | |
| 69 self.open = [] | |
| 70 self.groups = 1 | |
| 71 self.groupdict = {} | |
| 72 self.lookbehind = 0 | |
| 73 | |
| 74 def opengroup(self, name=None): | |
| 75 gid = self.groups | |
| 76 self.groups = gid + 1 | |
| 77 if name is not None: | |
| 78 ogid = self.groupdict.get(name, None) | |
| 79 if ogid is not None: | |
| 80 raise error, ("redefinition of group name %s as group %d; " | |
| 81 "was group %d" % (repr(name), gid, ogid)) | |
| 82 self.groupdict[name] = gid | |
| 83 self.open.append(gid) | |
| 84 return gid | |
| 85 def closegroup(self, gid): | |
| 86 self.open.remove(gid) | |
| 87 def checkgroup(self, gid): | |
| 88 return gid < self.groups and gid not in self.open | |
| 89 | |
| 90 class SubPattern: | |
| 91 # a subpattern, in intermediate form | |
| 92 def __init__(self, pattern, data=None): | |
| 93 self.pattern = pattern | |
| 94 if data is None: | |
| 95 data = [] | |
| 96 self.data = data | |
| 97 self.width = None | |
| 98 def dump(self, level=0): | |
| 99 seqtypes = (tuple, list) | |
| 100 for op, av in self.data: | |
| 101 print level*" " + op, | |
| 102 if op == IN: | |
| 103 # member sublanguage | |
| 104 print | |
| 105 for op, a in av: | |
| 106 print (level+1)*" " + op, a | |
| 107 elif op == BRANCH: | |
| 108 print | |
| 109 for i, a in enumerate(av[1]): | |
| 110 if i: | |
| 111 print level*" " + "or" | |
| 112 a.dump(level+1) | |
| 113 elif op == GROUPREF_EXISTS: | |
| 114 condgroup, item_yes, item_no = av | |
| 115 print condgroup | |
| 116 item_yes.dump(level+1) | |
| 117 if item_no: | |
| 118 print level*" " + "else" | |
| 119 item_no.dump(level+1) | |
| 120 elif isinstance(av, seqtypes): | |
| 121 nl = 0 | |
| 122 for a in av: | |
| 123 if isinstance(a, SubPattern): | |
| 124 if not nl: | |
| 125 print | |
| 126 a.dump(level+1) | |
| 127 nl = 1 | |
| 128 else: | |
| 129 print a, | |
| 130 nl = 0 | |
| 131 if not nl: | |
| 132 print | |
| 133 else: | |
| 134 print av | |
| 135 def __repr__(self): | |
| 136 return repr(self.data) | |
| 137 def __len__(self): | |
| 138 return len(self.data) | |
| 139 def __delitem__(self, index): | |
| 140 del self.data[index] | |
| 141 def __getitem__(self, index): | |
| 142 if isinstance(index, slice): | |
| 143 return SubPattern(self.pattern, self.data[index]) | |
| 144 return self.data[index] | |
| 145 def __setitem__(self, index, code): | |
| 146 self.data[index] = code | |
| 147 def insert(self, index, code): | |
| 148 self.data.insert(index, code) | |
| 149 def append(self, code): | |
| 150 self.data.append(code) | |
| 151 def getwidth(self): | |
| 152 # determine the width (min, max) for this subpattern | |
| 153 if self.width: | |
| 154 return self.width | |
| 155 lo = hi = 0 | |
| 156 UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) | |
| 157 REPEATCODES = (MIN_REPEAT, MAX_REPEAT) | |
| 158 for op, av in self.data: | |
| 159 if op is BRANCH: | |
| 160 i = MAXREPEAT - 1 | |
| 161 j = 0 | |
| 162 for av in av[1]: | |
| 163 l, h = av.getwidth() | |
| 164 i = min(i, l) | |
| 165 j = max(j, h) | |
| 166 lo = lo + i | |
| 167 hi = hi + j | |
| 168 elif op is CALL: | |
| 169 i, j = av.getwidth() | |
| 170 lo = lo + i | |
| 171 hi = hi + j | |
| 172 elif op is SUBPATTERN: | |
| 173 i, j = av[1].getwidth() | |
| 174 lo = lo + i | |
| 175 hi = hi + j | |
| 176 elif op in REPEATCODES: | |
| 177 i, j = av[2].getwidth() | |
| 178 lo = lo + i * av[0] | |
| 179 hi = hi + j * av[1] | |
| 180 elif op in UNITCODES: | |
| 181 lo = lo + 1 | |
| 182 hi = hi + 1 | |
| 183 elif op == SUCCESS: | |
| 184 break | |
| 185 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) | |
| 186 return self.width | |
| 187 | |
| 188 class Tokenizer: | |
| 189 def __init__(self, string): | |
| 190 self.string = string | |
| 191 self.index = 0 | |
| 192 self.__next() | |
| 193 def __next(self): | |
| 194 if self.index >= len(self.string): | |
| 195 self.next = None | |
| 196 return | |
| 197 char = self.string[self.index] | |
| 198 if char[0] == "\\": | |
| 199 try: | |
| 200 c = self.string[self.index + 1] | |
| 201 except IndexError: | |
| 202 raise error, "bogus escape (end of line)" | |
| 203 char = char + c | |
| 204 self.index = self.index + len(char) | |
| 205 self.next = char | |
| 206 def match(self, char, skip=1): | |
| 207 if char == self.next: | |
| 208 if skip: | |
| 209 self.__next() | |
| 210 return 1 | |
| 211 return 0 | |
| 212 def get(self): | |
| 213 this = self.next | |
| 214 self.__next() | |
| 215 return this | |
| 216 def tell(self): | |
| 217 return self.index, self.next | |
| 218 def seek(self, index): | |
| 219 self.index, self.next = index | |
| 220 | |
| 221 def isident(char): | |
| 222 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" | |
| 223 | |
| 224 def isdigit(char): | |
| 225 return "0" <= char <= "9" | |
| 226 | |
| 227 def isname(name): | |
| 228 # check that group name is a valid string | |
| 229 if not isident(name[0]): | |
| 230 return False | |
| 231 for char in name[1:]: | |
| 232 if not isident(char) and not isdigit(char): | |
| 233 return False | |
| 234 return True | |
| 235 | |
| 236 def _class_escape(source, escape): | |
| 237 # handle escape code inside character class | |
| 238 code = ESCAPES.get(escape) | |
| 239 if code: | |
| 240 return code | |
| 241 code = CATEGORIES.get(escape) | |
| 242 if code and code[0] == IN: | |
| 243 return code | |
| 244 try: | |
| 245 c = escape[1:2] | |
| 246 if c == "x": | |
| 247 # hexadecimal escape (exactly two digits) | |
| 248 while source.next in HEXDIGITS and len(escape) < 4: | |
| 249 escape = escape + source.get() | |
| 250 escape = escape[2:] | |
| 251 if len(escape) != 2: | |
| 252 raise error, "bogus escape: %s" % repr("\\" + escape) | |
| 253 return LITERAL, int(escape, 16) & 0xff | |
| 254 elif c in OCTDIGITS: | |
| 255 # octal escape (up to three digits) | |
| 256 while source.next in OCTDIGITS and len(escape) < 4: | |
| 257 escape = escape + source.get() | |
| 258 escape = escape[1:] | |
| 259 return LITERAL, int(escape, 8) & 0xff | |
| 260 elif c in DIGITS: | |
| 261 raise error, "bogus escape: %s" % repr(escape) | |
| 262 if len(escape) == 2: | |
| 263 return LITERAL, ord(escape[1]) | |
| 264 except ValueError: | |
| 265 pass | |
| 266 raise error, "bogus escape: %s" % repr(escape) | |
| 267 | |
| 268 def _escape(source, escape, state): | |
| 269 # handle escape code in expression | |
| 270 code = CATEGORIES.get(escape) | |
| 271 if code: | |
| 272 return code | |
| 273 code = ESCAPES.get(escape) | |
| 274 if code: | |
| 275 return code | |
| 276 try: | |
| 277 c = escape[1:2] | |
| 278 if c == "x": | |
| 279 # hexadecimal escape | |
| 280 while source.next in HEXDIGITS and len(escape) < 4: | |
| 281 escape = escape + source.get() | |
| 282 if len(escape) != 4: | |
| 283 raise ValueError | |
| 284 return LITERAL, int(escape[2:], 16) & 0xff | |
| 285 elif c == "0": | |
| 286 # octal escape | |
| 287 while source.next in OCTDIGITS and len(escape) < 4: | |
| 288 escape = escape + source.get() | |
| 289 return LITERAL, int(escape[1:], 8) & 0xff | |
| 290 elif c in DIGITS: | |
| 291 # octal escape *or* decimal group reference (sigh) | |
| 292 if source.next in DIGITS: | |
| 293 escape = escape + source.get() | |
| 294 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and | |
| 295 source.next in OCTDIGITS): | |
| 296 # got three octal digits; this is an octal escape | |
| 297 escape = escape + source.get() | |
| 298 return LITERAL, int(escape[1:], 8) & 0xff | |
| 299 # not an octal escape, so this is a group reference | |
| 300 group = int(escape[1:]) | |
| 301 if group < state.groups: | |
| 302 if not state.checkgroup(group): | |
| 303 raise error, "cannot refer to open group" | |
| 304 if state.lookbehind: | |
| 305 import warnings | |
| 306 warnings.warn('group references in lookbehind ' | |
| 307 'assertions are not supported', | |
| 308 RuntimeWarning) | |
| 309 return GROUPREF, group | |
| 310 raise ValueError | |
| 311 if len(escape) == 2: | |
| 312 return LITERAL, ord(escape[1]) | |
| 313 except ValueError: | |
| 314 pass | |
| 315 raise error, "bogus escape: %s" % repr(escape) | |
| 316 | |
| 317 def _parse_sub(source, state, nested=1): | |
| 318 # parse an alternation: a|b|c | |
| 319 | |
| 320 items = [] | |
| 321 itemsappend = items.append | |
| 322 sourcematch = source.match | |
| 323 while 1: | |
| 324 itemsappend(_parse(source, state)) | |
| 325 if sourcematch("|"): | |
| 326 continue | |
| 327 if not nested: | |
| 328 break | |
| 329 if not source.next or sourcematch(")", 0): | |
| 330 break | |
| 331 else: | |
| 332 raise error, "pattern not properly closed" | |
| 333 | |
| 334 if len(items) == 1: | |
| 335 return items[0] | |
| 336 | |
| 337 subpattern = SubPattern(state) | |
| 338 subpatternappend = subpattern.append | |
| 339 | |
| 340 # check if all items share a common prefix | |
| 341 while 1: | |
| 342 prefix = None | |
| 343 for item in items: | |
| 344 if not item: | |
| 345 break | |
| 346 if prefix is None: | |
| 347 prefix = item[0] | |
| 348 elif item[0] != prefix: | |
| 349 break | |
| 350 else: | |
| 351 # all subitems start with a common "prefix". | |
| 352 # move it out of the branch | |
| 353 for item in items: | |
| 354 del item[0] | |
| 355 subpatternappend(prefix) | |
| 356 continue # check next one | |
| 357 break | |
| 358 | |
| 359 # check if the branch can be replaced by a character set | |
| 360 for item in items: | |
| 361 if len(item) != 1 or item[0][0] != LITERAL: | |
| 362 break | |
| 363 else: | |
| 364 # we can store this as a character set instead of a | |
| 365 # branch (the compiler may optimize this even more) | |
| 366 set = [] | |
| 367 setappend = set.append | |
| 368 for item in items: | |
| 369 setappend(item[0]) | |
| 370 subpatternappend((IN, set)) | |
| 371 return subpattern | |
| 372 | |
| 373 subpattern.append((BRANCH, (None, items))) | |
| 374 return subpattern | |
| 375 | |
| 376 def _parse_sub_cond(source, state, condgroup): | |
| 377 item_yes = _parse(source, state) | |
| 378 if source.match("|"): | |
| 379 item_no = _parse(source, state) | |
| 380 if source.match("|"): | |
| 381 raise error, "conditional backref with more than two branches" | |
| 382 else: | |
| 383 item_no = None | |
| 384 if source.next and not source.match(")", 0): | |
| 385 raise error, "pattern not properly closed" | |
| 386 subpattern = SubPattern(state) | |
| 387 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) | |
| 388 return subpattern | |
| 389 | |
| 390 _PATTERNENDERS = set("|)") | |
| 391 _ASSERTCHARS = set("=!<") | |
| 392 _LOOKBEHINDASSERTCHARS = set("=!") | |
| 393 _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) | |
| 394 | |
| 395 def _parse(source, state): | |
| 396 # parse a simple pattern | |
| 397 subpattern = SubPattern(state) | |
| 398 | |
| 399 # precompute constants into local variables | |
| 400 subpatternappend = subpattern.append | |
| 401 sourceget = source.get | |
| 402 sourcematch = source.match | |
| 403 _len = len | |
| 404 PATTERNENDERS = _PATTERNENDERS | |
| 405 ASSERTCHARS = _ASSERTCHARS | |
| 406 LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS | |
| 407 REPEATCODES = _REPEATCODES | |
| 408 | |
| 409 while 1: | |
| 410 | |
| 411 if source.next in PATTERNENDERS: | |
| 412 break # end of subpattern | |
| 413 this = sourceget() | |
| 414 if this is None: | |
| 415 break # end of pattern | |
| 416 | |
| 417 if state.flags & SRE_FLAG_VERBOSE: | |
| 418 # skip whitespace and comments | |
| 419 if this in WHITESPACE: | |
| 420 continue | |
| 421 if this == "#": | |
| 422 while 1: | |
| 423 this = sourceget() | |
| 424 if this in (None, "\n"): | |
| 425 break | |
| 426 continue | |
| 427 | |
| 428 if this and this[0] not in SPECIAL_CHARS: | |
| 429 subpatternappend((LITERAL, ord(this))) | |
| 430 | |
| 431 elif this == "[": | |
| 432 # character set | |
| 433 set = [] | |
| 434 setappend = set.append | |
| 435 ## if sourcematch(":"): | |
| 436 ## pass # handle character classes | |
| 437 if sourcematch("^"): | |
| 438 setappend((NEGATE, None)) | |
| 439 # check remaining characters | |
| 440 start = set[:] | |
| 441 while 1: | |
| 442 this = sourceget() | |
| 443 if this == "]" and set != start: | |
| 444 break | |
| 445 elif this and this[0] == "\\": | |
| 446 code1 = _class_escape(source, this) | |
| 447 elif this: | |
| 448 code1 = LITERAL, ord(this) | |
| 449 else: | |
| 450 raise error, "unexpected end of regular expression" | |
| 451 if sourcematch("-"): | |
| 452 # potential range | |
| 453 this = sourceget() | |
| 454 if this == "]": | |
| 455 if code1[0] is IN: | |
| 456 code1 = code1[1][0] | |
| 457 setappend(code1) | |
| 458 setappend((LITERAL, ord("-"))) | |
| 459 break | |
| 460 elif this: | |
| 461 if this[0] == "\\": | |
| 462 code2 = _class_escape(source, this) | |
| 463 else: | |
| 464 code2 = LITERAL, ord(this) | |
| 465 if code1[0] != LITERAL or code2[0] != LITERAL: | |
| 466 raise error, "bad character range" | |
| 467 lo = code1[1] | |
| 468 hi = code2[1] | |
| 469 if hi < lo: | |
| 470 raise error, "bad character range" | |
| 471 setappend((RANGE, (lo, hi))) | |
| 472 else: | |
| 473 raise error, "unexpected end of regular expression" | |
| 474 else: | |
| 475 if code1[0] is IN: | |
| 476 code1 = code1[1][0] | |
| 477 setappend(code1) | |
| 478 | |
| 479 # XXX: <fl> should move set optimization to compiler! | |
| 480 if _len(set)==1 and set[0][0] is LITERAL: | |
| 481 subpatternappend(set[0]) # optimization | |
| 482 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: | |
| 483 subpatternappend((NOT_LITERAL, set[1][1])) # optimization | |
| 484 else: | |
| 485 # XXX: <fl> should add charmap optimization here | |
| 486 subpatternappend((IN, set)) | |
| 487 | |
| 488 elif this and this[0] in REPEAT_CHARS: | |
| 489 # repeat previous item | |
| 490 if this == "?": | |
| 491 min, max = 0, 1 | |
| 492 elif this == "*": | |
| 493 min, max = 0, MAXREPEAT | |
| 494 | |
| 495 elif this == "+": | |
| 496 min, max = 1, MAXREPEAT | |
| 497 elif this == "{": | |
| 498 if source.next == "}": | |
| 499 subpatternappend((LITERAL, ord(this))) | |
| 500 continue | |
| 501 here = source.tell() | |
| 502 min, max = 0, MAXREPEAT | |
| 503 lo = hi = "" | |
| 504 while source.next in DIGITS: | |
| 505 lo = lo + source.get() | |
| 506 if sourcematch(","): | |
| 507 while source.next in DIGITS: | |
| 508 hi = hi + sourceget() | |
| 509 else: | |
| 510 hi = lo | |
| 511 if not sourcematch("}"): | |
| 512 subpatternappend((LITERAL, ord(this))) | |
| 513 source.seek(here) | |
| 514 continue | |
| 515 if lo: | |
| 516 min = int(lo) | |
| 517 if min >= MAXREPEAT: | |
| 518 raise OverflowError("the repetition number is too large") | |
| 519 if hi: | |
| 520 max = int(hi) | |
| 521 if max >= MAXREPEAT: | |
| 522 raise OverflowError("the repetition number is too large") | |
| 523 if max < min: | |
| 524 raise error("bad repeat interval") | |
| 525 else: | |
| 526 raise error, "not supported" | |
| 527 # figure out which item to repeat | |
| 528 if subpattern: | |
| 529 item = subpattern[-1:] | |
| 530 else: | |
| 531 item = None | |
| 532 if not item or (_len(item) == 1 and item[0][0] == AT): | |
| 533 raise error, "nothing to repeat" | |
| 534 if item[0][0] in REPEATCODES: | |
| 535 raise error, "multiple repeat" | |
| 536 if sourcematch("?"): | |
| 537 subpattern[-1] = (MIN_REPEAT, (min, max, item)) | |
| 538 else: | |
| 539 subpattern[-1] = (MAX_REPEAT, (min, max, item)) | |
| 540 | |
| 541 elif this == ".": | |
| 542 subpatternappend((ANY, None)) | |
| 543 | |
| 544 elif this == "(": | |
| 545 group = 1 | |
| 546 name = None | |
| 547 condgroup = None | |
| 548 if sourcematch("?"): | |
| 549 group = 0 | |
| 550 # options | |
| 551 if sourcematch("P"): | |
| 552 # python extensions | |
| 553 if sourcematch("<"): | |
| 554 # named group: skip forward to end of name | |
| 555 name = "" | |
| 556 while 1: | |
| 557 char = sourceget() | |
| 558 if char is None: | |
| 559 raise error, "unterminated name" | |
| 560 if char == ">": | |
| 561 break | |
| 562 name = name + char | |
| 563 group = 1 | |
| 564 if not name: | |
| 565 raise error("missing group name") | |
| 566 if not isname(name): | |
| 567 raise error("bad character in group name %r" % | |
| 568 name) | |
| 569 elif sourcematch("="): | |
| 570 # named backreference | |
| 571 name = "" | |
| 572 while 1: | |
| 573 char = sourceget() | |
| 574 if char is None: | |
| 575 raise error, "unterminated name" | |
| 576 if char == ")": | |
| 577 break | |
| 578 name = name + char | |
| 579 if not name: | |
| 580 raise error("missing group name") | |
| 581 if not isname(name): | |
| 582 raise error("bad character in backref group name " | |
| 583 "%r" % name) | |
| 584 gid = state.groupdict.get(name) | |
| 585 if gid is None: | |
| 586 msg = "unknown group name: {0!r}".format(name) | |
| 587 raise error(msg) | |
| 588 if state.lookbehind: | |
| 589 import warnings | |
| 590 warnings.warn('group references in lookbehind ' | |
| 591 'assertions are not supported', | |
| 592 RuntimeWarning) | |
| 593 subpatternappend((GROUPREF, gid)) | |
| 594 continue | |
| 595 else: | |
| 596 char = sourceget() | |
| 597 if char is None: | |
| 598 raise error, "unexpected end of pattern" | |
| 599 raise error, "unknown specifier: ?P%s" % char | |
| 600 elif sourcematch(":"): | |
| 601 # non-capturing group | |
| 602 group = 2 | |
| 603 elif sourcematch("#"): | |
| 604 # comment | |
| 605 while 1: | |
| 606 if source.next is None or source.next == ")": | |
| 607 break | |
| 608 sourceget() | |
| 609 if not sourcematch(")"): | |
| 610 raise error, "unbalanced parenthesis" | |
| 611 continue | |
| 612 elif source.next in ASSERTCHARS: | |
| 613 # lookahead assertions | |
| 614 char = sourceget() | |
| 615 dir = 1 | |
| 616 if char == "<": | |
| 617 if source.next not in LOOKBEHINDASSERTCHARS: | |
| 618 raise error, "syntax error" | |
| 619 dir = -1 # lookbehind | |
| 620 char = sourceget() | |
| 621 state.lookbehind += 1 | |
| 622 p = _parse_sub(source, state) | |
| 623 if dir < 0: | |
| 624 state.lookbehind -= 1 | |
| 625 if not sourcematch(")"): | |
| 626 raise error, "unbalanced parenthesis" | |
| 627 if char == "=": | |
| 628 subpatternappend((ASSERT, (dir, p))) | |
| 629 else: | |
| 630 subpatternappend((ASSERT_NOT, (dir, p))) | |
| 631 continue | |
| 632 elif sourcematch("("): | |
| 633 # conditional backreference group | |
| 634 condname = "" | |
| 635 while 1: | |
| 636 char = sourceget() | |
| 637 if char is None: | |
| 638 raise error, "unterminated name" | |
| 639 if char == ")": | |
| 640 break | |
| 641 condname = condname + char | |
| 642 group = 2 | |
| 643 if not condname: | |
| 644 raise error("missing group name") | |
| 645 if isname(condname): | |
| 646 condgroup = state.groupdict.get(condname) | |
| 647 if condgroup is None: | |
| 648 msg = "unknown group name: {0!r}".format(condname) | |
| 649 raise error(msg) | |
| 650 else: | |
| 651 try: | |
| 652 condgroup = int(condname) | |
| 653 except ValueError: | |
| 654 raise error, "bad character in group name" | |
| 655 if state.lookbehind: | |
| 656 import warnings | |
| 657 warnings.warn('group references in lookbehind ' | |
| 658 'assertions are not supported', | |
| 659 RuntimeWarning) | |
| 660 else: | |
| 661 # flags | |
| 662 if not source.next in FLAGS: | |
| 663 raise error, "unexpected end of pattern" | |
| 664 while source.next in FLAGS: | |
| 665 state.flags = state.flags | FLAGS[sourceget()] | |
| 666 if group: | |
| 667 # parse group contents | |
| 668 if group == 2: | |
| 669 # anonymous group | |
| 670 group = None | |
| 671 else: | |
| 672 group = state.opengroup(name) | |
| 673 if condgroup: | |
| 674 p = _parse_sub_cond(source, state, condgroup) | |
| 675 else: | |
| 676 p = _parse_sub(source, state) | |
| 677 if not sourcematch(")"): | |
| 678 raise error, "unbalanced parenthesis" | |
| 679 if group is not None: | |
| 680 state.closegroup(group) | |
| 681 subpatternappend((SUBPATTERN, (group, p))) | |
| 682 else: | |
| 683 while 1: | |
| 684 char = sourceget() | |
| 685 if char is None: | |
| 686 raise error, "unexpected end of pattern" | |
| 687 if char == ")": | |
| 688 break | |
| 689 raise error, "unknown extension" | |
| 690 | |
| 691 elif this == "^": | |
| 692 subpatternappend((AT, AT_BEGINNING)) | |
| 693 | |
| 694 elif this == "$": | |
| 695 subpattern.append((AT, AT_END)) | |
| 696 | |
| 697 elif this and this[0] == "\\": | |
| 698 code = _escape(source, this, state) | |
| 699 subpatternappend(code) | |
| 700 | |
| 701 else: | |
| 702 raise error, "parser error" | |
| 703 | |
| 704 return subpattern | |
| 705 | |
| 706 def parse(str, flags=0, pattern=None): | |
| 707 # parse 're' pattern into list of (opcode, argument) tuples | |
| 708 | |
| 709 source = Tokenizer(str) | |
| 710 | |
| 711 if pattern is None: | |
| 712 pattern = Pattern() | |
| 713 pattern.flags = flags | |
| 714 pattern.str = str | |
| 715 | |
| 716 p = _parse_sub(source, pattern, 0) | |
| 717 | |
| 718 tail = source.get() | |
| 719 if tail == ")": | |
| 720 raise error, "unbalanced parenthesis" | |
| 721 elif tail: | |
| 722 raise error, "bogus characters at end of regular expression" | |
| 723 | |
| 724 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: | |
| 725 # the VERBOSE flag was switched on inside the pattern. to be | |
| 726 # on the safe side, we'll parse the whole thing again... | |
| 727 return parse(str, p.pattern.flags) | |
| 728 | |
| 729 if flags & SRE_FLAG_DEBUG: | |
| 730 p.dump() | |
| 731 | |
| 732 return p | |
| 733 | |
| 734 def parse_template(source, pattern): | |
| 735 # parse 're' replacement string into list of literals and | |
| 736 # group references | |
| 737 s = Tokenizer(source) | |
| 738 sget = s.get | |
| 739 p = [] | |
| 740 a = p.append | |
| 741 def literal(literal, p=p, pappend=a): | |
| 742 if p and p[-1][0] is LITERAL: | |
| 743 p[-1] = LITERAL, p[-1][1] + literal | |
| 744 else: | |
| 745 pappend((LITERAL, literal)) | |
| 746 sep = source[:0] | |
| 747 if type(sep) is type(""): | |
| 748 makechar = chr | |
| 749 else: | |
| 750 makechar = unichr | |
| 751 while 1: | |
| 752 this = sget() | |
| 753 if this is None: | |
| 754 break # end of replacement string | |
| 755 if this and this[0] == "\\": | |
| 756 # group | |
| 757 c = this[1:2] | |
| 758 if c == "g": | |
| 759 name = "" | |
| 760 if s.match("<"): | |
| 761 while 1: | |
| 762 char = sget() | |
| 763 if char is None: | |
| 764 raise error, "unterminated group name" | |
| 765 if char == ">": | |
| 766 break | |
| 767 name = name + char | |
| 768 if not name: | |
| 769 raise error, "missing group name" | |
| 770 try: | |
| 771 index = int(name) | |
| 772 if index < 0: | |
| 773 raise error, "negative group number" | |
| 774 except ValueError: | |
| 775 if not isname(name): | |
| 776 raise error, "bad character in group name" | |
| 777 try: | |
| 778 index = pattern.groupindex[name] | |
| 779 except KeyError: | |
| 780 msg = "unknown group name: {0!r}".format(name) | |
| 781 raise IndexError(msg) | |
| 782 a((MARK, index)) | |
| 783 elif c == "0": | |
| 784 if s.next in OCTDIGITS: | |
| 785 this = this + sget() | |
| 786 if s.next in OCTDIGITS: | |
| 787 this = this + sget() | |
| 788 literal(makechar(int(this[1:], 8) & 0xff)) | |
| 789 elif c in DIGITS: | |
| 790 isoctal = False | |
| 791 if s.next in DIGITS: | |
| 792 this = this + sget() | |
| 793 if (c in OCTDIGITS and this[2] in OCTDIGITS and | |
| 794 s.next in OCTDIGITS): | |
| 795 this = this + sget() | |
| 796 isoctal = True | |
| 797 literal(makechar(int(this[1:], 8) & 0xff)) | |
| 798 if not isoctal: | |
| 799 a((MARK, int(this[1:]))) | |
| 800 else: | |
| 801 try: | |
| 802 this = makechar(ESCAPES[this][1]) | |
| 803 except KeyError: | |
| 804 pass | |
| 805 literal(this) | |
| 806 else: | |
| 807 literal(this) | |
| 808 # convert template to groups and literals lists | |
| 809 i = 0 | |
| 810 groups = [] | |
| 811 groupsappend = groups.append | |
| 812 literals = [None] * len(p) | |
| 813 for c, s in p: | |
| 814 if c is MARK: | |
| 815 groupsappend((i, s)) | |
| 816 # literal[i] is already None | |
| 817 else: | |
| 818 literals[i] = s | |
| 819 i = i + 1 | |
| 820 return groups, literals | |
| 821 | |
| 822 def expand_template(template, match): | |
| 823 g = match.group | |
| 824 sep = match.string[:0] | |
| 825 groups, literals = template | |
| 826 literals = literals[:] | |
| 827 try: | |
| 828 for index, group in groups: | |
| 829 literals[index] = s = g(group) | |
| 830 if s is None: | |
| 831 raise error, "unmatched group" | |
| 832 except IndexError: | |
| 833 raise error, "invalid group reference" | |
| 834 return sep.join(literals) |
