Mercurial > repos > bcclaywell > argo_navis
comparison venv/lib/python2.7/site-packages/docutils/utils/punctuation_chars.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
| author | bcclaywell |
|---|---|
| date | Mon, 12 Oct 2015 17:43:33 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d67268158946 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 # :Copyright: © 2011 Günter Milde. | |
| 4 # :License: Released under the terms of the `2-Clause BSD license`_, in short: | |
| 5 # | |
| 6 # Copying and distribution of this file, with or without modification, | |
| 7 # are permitted in any medium without royalty provided the copyright | |
| 8 # notice and this notice are preserved. | |
| 9 # This file is offered as-is, without any warranty. | |
| 10 # | |
| 11 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause | |
| 12 | |
| 13 # :Id: $Id: punctuation_chars.py 7668 2013-06-04 12:46:30Z milde $ | |
| 14 | |
| 15 import sys, re | |
| 16 import unicodedata | |
| 17 | |
| 18 # punctuation characters around inline markup | |
| 19 # =========================================== | |
| 20 # | |
| 21 # This module provides the lists of characters for the implementation of | |
| 22 # the `inline markup recognition rules`_ in the reStructuredText parser | |
| 23 # (states.py) | |
| 24 # | |
| 25 # .. _inline markup recognition rules: | |
| 26 # ../../docs/ref/rst/restructuredtext.html#inline-markup | |
| 27 | |
| 28 # Docutils punctuation category sample strings | |
| 29 # -------------------------------------------- | |
| 30 # | |
| 31 # The sample strings are generated by punctuation_samples() and put here | |
| 32 # literal to avoid the time-consuming generation with every Docutils run. | |
| 33 # As the samples are used inside ``[ ]`` in regular expressions, hyphen and | |
| 34 # square brackets are escaped. :: | |
| 35 | |
# Sample strings, one per Docutils punctuation category, generated by
# `punctuation_samples()` below and stored literally to avoid the costly
# re-generation on every Docutils run.  Hyphen, backslash and square
# brackets are escaped because these strings are interpolated into
# ``[...]`` regular-expression character classes.
# `openers` and `closers` keep matching characters at the same index
# position (this pairing is relied upon by `match_chars()`).
# NOTE: u'\u301d' appears twice in `openers` — it serves as the opening
# pendant for both U+301E and U+301F (see the insert in
# punctuation_samples()).
openers = (u'"\'(<\\[{\u0f3a\u0f3c\u169b\u2045\u207d\u208d\u2329\u2768'
           u'\u276a\u276c\u276e\u2770\u2772\u2774\u27c5\u27e6\u27e8\u27ea'
           u'\u27ec\u27ee\u2983\u2985\u2987\u2989\u298b\u298d\u298f\u2991'
           u'\u2993\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26\u2e28'
           u'\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d'
           u'\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41'
           u'\ufe43\ufe47\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff5f\uff62'
           u'\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20'
           u'\u201a\u201e\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d'
           u'\u2e1d\u2e21\u201b\u201f')
closers = (u'"\')>\\]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a\u2769'
           u'\u276b\u276d\u276f\u2771\u2773\u2775\u27c6\u27e7\u27e9\u27eb'
           u'\u27ed\u27ef\u2984\u2986\u2988\u298a\u298c\u298e\u2990\u2992'
           u'\u2994\u2996\u2998\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29'
           u'\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e'
           u'\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42'
           u'\ufe44\ufe48\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff60\uff63'
           u'\xbb\u2019\u201d\u203a\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21'
           u'\u201b\u201f\xab\u2018\u201c\u2039\u2e02\u2e04\u2e09\u2e0c'
           u'\u2e1c\u2e20\u201a\u201e')
# Non-matching punctuation allowed on both sides of inline markup.  Runs
# of consecutive code points are abbreviated as "first-last" intervals
# (generated by mark_intervals() below).
delimiters = (u'\\-/:\u058a\xa1\xb7\xbf\u037e\u0387\u055a-\u055f\u0589'
              u'\u05be\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c'
              u'\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d'
              u'\u07f7-\u07f9\u0830-\u083e\u0964\u0965\u0970\u0df4\u0e4f'
              u'\u0e5a\u0e5b\u0f04-\u0f12\u0f85\u0fd0-\u0fd4\u104a-\u104f'
              u'\u10fb\u1361-\u1368\u1400\u166d\u166e\u16eb-\u16ed\u1735'
              u'\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u180a\u1944\u1945'
              u'\u19de\u19df\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-'
              u'\u1b60\u1c3b-\u1c3f\u1c7e\u1c7f\u1cd3\u2010-\u2017\u2020-'
              u'\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-'
              u'\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2e00'
              u'\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e1b\u2e1e\u2e1f\u2e2a-'
              u'\u2e2e\u2e30\u2e31\u3001-\u3003\u301c\u3030\u303d\u30a0'
              u'\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7'
              u'\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f'
              u'\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uabeb'
              u'\ufe10-\ufe16\ufe19\ufe30-\ufe32\ufe45\ufe46\ufe49-\ufe4c'
              u'\ufe50-\ufe52\ufe54-\ufe58\ufe5f-\ufe61\ufe63\ufe68\ufe6a'
              u'\ufe6b\uff01-\uff03\uff05-\uff07\uff0a\uff0c-\uff0f\uff1a'
              u'\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65')
# Code points above the Basic Multilingual Plane can only be represented
# on a "wide" (UCS-4) Python 2 build:
if sys.maxunicode >= 0x10FFFF: # "wide" build
    delimiters += (u'\U00010100\U00010101\U0001039f\U000103d0\U00010857'
                   u'\U0001091f\U0001093f\U00010a50-\U00010a58\U00010a7f'
                   u'\U00010b39-\U00010b3f\U000110bb\U000110bc\U000110be-'
                   u'\U000110c1\U00012470-\U00012473')
# Non-matching punctuation allowed after inline markup only (all ASCII;
# the backslash is doubled for use inside a regexp character class).
closing_delimiters = u'\\\\.,;!?'
| 82 | |
| 83 | |
| 84 # Matching open/close quotes | |
| 85 # -------------------------- | |
| 86 | |
| 87 # Rule (5) requires determination of matching open/close pairs. However, | |
| 88 # the pairing of open/close quotes is ambiguous due to different typographic | |
| 89 # conventions in different languages. | |
| 90 | |
# Additional matching closers for quotation marks, keyed by the opening
# character.  A value may list several accepted closing characters;
# `match_chars()` consults this table besides the index-based pairing of
# `openers`/`closers`.
quote_pairs = {u'\xbb': u'\xbb', # Swedish
               u'\u2018': u'\u201a', # Greek
               u'\u2019': u'\u2019', # Swedish
               u'\u201a': u'\u2018\u2019', # German, Polish
               u'\u201c': u'\u201e', # German
               u'\u201e': u'\u201c\u201d',
               u'\u201d': u'\u201d', # Swedish
               u'\u203a': u'\u203a', # Swedish
              }
| 100 | |
def match_chars(c1, c2):
    """Test whether `c1` and `c2` form a matching open/close pair.

    `c2` matches when it sits at the same index in `closers` as `c1`
    does in `openers`, or when it is listed as an alternative closer
    for `c1` in `quote_pairs`.  Returns False if `c1` is no opener.
    """
    if c1 not in openers:
        return False
    position = openers.index(c1)
    return c2 == closers[position] or c2 in quote_pairs.get(c1, '')
| 107 | |
| 108 | |
| 109 # Running this file as a standalone module checks the definitions against a | |
| 110 # re-calculation:: | |
| 111 | |
if __name__ == '__main__':


    # Unicode punctuation character categories
    # ----------------------------------------

    # Map Unicode "P*" general-category codes to descriptive names; these
    # keys drive unicode_charlists() below.
    unicode_punctuation_categories = {
        # 'Pc': 'Connector', # not used in Docutils inline markup recognition
        'Pd': 'Dash',
        'Ps': 'Open',
        'Pe': 'Close',
        'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
        'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
        'Po': 'Other'
        }
    """Unicode character categories for punctuation"""
| 129 | |
| 130 # generate character pattern strings | |
| 131 # ================================== | |
| 132 | |
def unicode_charlists(categories, cp_min=0, cp_max=None):
    """Return dictionary of Unicode character lists.

    For each of the `categories`, an item contains a list with all Unicode
    characters with `cp_min` <= code-point <= `cp_max` that belong to
    the category.

    The default values check every code-point supported by Python
    (`sys.maxunicode` is 0x10FFFF in a "wide" build and 0xFFFF in a "narrow"
    build, i.e. ucs4 and ucs2 respectively).

    NOTE: uses the Python 2 builtins `xrange` and `unichr`.
    """
    # Determine highest code point with one of the given categories
    # (may shorten the search time considerably if there are many
    # categories with not too high characters):
    if cp_max is None:
        cp_max = max(x for x in xrange(sys.maxunicode+1)
                     if unicodedata.category(unichr(x)) in categories)
        # print cp_max # => 74867 for unicode_punctuation_categories
    charlists = {}
    for cat in categories:
        # one full pass over the code-point range per category
        charlists[cat] = [unichr(x) for x in xrange(cp_min, cp_max+1)
                          if unicodedata.category(unichr(x)) == cat]
    return charlists
| 156 | |
| 157 | |
| 158 # Character categories in Docutils | |
| 159 # -------------------------------- | |
| 160 | |
def punctuation_samples():

    """Docutils punctuation category sample strings.

    Return list of sample strings for the categories "Open", "Close",
    "Delimiters" and "Closing-Delimiters" used in the `inline markup
    recognition rules`_.

    The local `openers`, `closers`, `delimiters` and `closing_delimiters`
    built here correspond to the module-level literals of the same names.
    """

    # Lists with characters in Unicode punctuation character categories
    cp_min = 160 # ASCII chars have special rules for backwards compatibility
    ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)

    # match opening/closing characters
    # --------------------------------
    # Rearrange the lists to ensure matching characters at the same
    # index position.

    # low quotation marks are also used as closers (e.g. in Greek)
    # move them to category Pi:
    ucharlists['Ps'].remove(u'‚') # 201A SINGLE LOW-9 QUOTATION MARK
    ucharlists['Ps'].remove(u'„') # 201E DOUBLE LOW-9 QUOTATION MARK
    ucharlists['Pi'] += [u'‚', u'„']

    # high-reversed-9 marks are moved from Pi to Pf so they line up as
    # closers of the low quotation marks added above:
    ucharlists['Pi'].remove(u'‛') # 201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
    ucharlists['Pi'].remove(u'‟') # 201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    ucharlists['Pf'] += [u'‛', u'‟']

    # 301F LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
    # re-use 301D at the matching index (so 301D appears twice in "Open"):
    ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')

    # print u''.join(ucharlists['Ps']).encode('utf8')
    # print u''.join(ucharlists['Pe']).encode('utf8')
    # print u''.join(ucharlists['Pi']).encode('utf8')
    # print u''.join(ucharlists['Pf']).encode('utf8')

    # The Docutils character categories
    # ---------------------------------
    #
    # The categorization of ASCII chars is non-standard to reduce
    # both false positives and need for escaping. (see `inline markup
    # recognition rules`_)

    # allowed before markup if there is a matching closer
    openers = [u'"\'(<\\[{']
    for cat in ('Ps', 'Pi', 'Pf'):
        openers.extend(ucharlists[cat])

    # allowed after markup if there is a matching opener
    closers = [u'"\')>\\]}']
    for cat in ('Pe', 'Pf', 'Pi'):
        closers.extend(ucharlists[cat])

    # non-matching, allowed on both sides
    delimiters = [u'\\-/:']
    for cat in ('Pd', 'Po'):
        delimiters.extend(ucharlists[cat])

    # non-matching, after markup
    closing_delimiters = [r'\\.,;!?']

    # # Test open/close matching:
    # for i in range(min(len(openers),len(closers))):
    #     print '%4d %s %s' % (i, openers[i].encode('utf8'),
    #                          closers[i].encode('utf8'))

    return [u''.join(chars) for chars in (openers, closers, delimiters,
                                          closing_delimiters)]
| 229 | |
def separate_wide_chars(s):
    """Return (s1,s2) with characters above 0xFFFF in s2.

    s1 keeps every character representable on a "narrow" (UCS-2) build,
    s2 collects the rest; relative order is preserved in both.
    """
    maxunicode_narrow = 0xFFFF
    narrow, wide = [], []
    for ch in s:
        # route each character by its code point
        (wide if ord(ch) > maxunicode_narrow else narrow).append(ch)
    return ''.join(narrow), ''.join(wide)
| 236 | |
def mark_intervals(s):
    """Return s with shortcut notation for runs of consecutive characters

    Sort string and replace 'cdef' by 'c-f' and similar.
    """
    # Collect sorted code points into runs of consecutive values
    # (each run is a list of ints).
    l =[]
    s = [ord(ch) for ch in s]
    s.sort()
    for n in s:
        try:
            if l[-1][-1]+1 == n: # n extends the current run
                l[-1].append(n)
            else: # gap: start a new run
                l.append([n])
        except IndexError: # l is still empty (first code point)
            l.append([n])

    # Convert runs back to characters; abbreviate runs of three or more
    # as "first-last".  NOTE: `unichr` is a Python 2 builtin (chr in py3).
    l2 = []
    for i in l:
        i = [unichr(n) for n in i]
        if len(i) > 2:
            i = i[0], u'-', i[-1]
        l2.extend(i)

    return ''.join(l2)
| 262 | |
def wrap_string(s, startstring="(", endstring=")", wrap=65):
    """Line-wrap a unicode string literal definition.

    Breaks `s` after `wrap` columns, but only immediately before a
    backslash so that no ``\\uXXXX`` escape sequence is split across
    lines; continuation lines are indented under `startstring` and
    reopened as ``u'...'`` literals.
    """
    # continuation: close the current literal, indent, reopen as u'...'
    continuation = "'\n" + ' ' * len(startstring) + "u'"
    parts = [startstring]
    column = len(startstring)
    for ch in s:
        column += 1
        if ch == '\\' and column > wrap:
            column = len(startstring)
            parts.append(continuation)
        parts.append(ch)
    parts.append(endstring)
    return ''.join(parts)
| 277 | |
| 278 | |
| 279 # print results | |
| 280 # ============= | |
| 281 | |
| 282 # (re) create and compare the samples: | |
| 283 | |
| 284 (o, c, d, cd) = punctuation_samples() | |
| 285 o, o_wide = separate_wide_chars(o) | |
| 286 c, c_wide = separate_wide_chars(c) | |
| 287 d, d_wide = separate_wide_chars(d) | |
| 288 d = d[:5] + mark_intervals(d[5:]) | |
| 289 d_wide = mark_intervals(d_wide) | |
| 290 if sys.maxunicode >= 0x10FFFF: # "wide" build | |
| 291 d += d_wide | |
| 292 if o != openers: | |
| 293 print '- openers = ur"""%s"""' % openers.encode('utf8') | |
| 294 print '+ openers = ur"""%s"""' % o.encode('utf8') | |
| 295 if o_wide: | |
| 296 print '+ openers-wide = ur"""%s"""' % o_wide.encode('utf8') | |
| 297 if c != closers: | |
| 298 print '- closers = ur"""%s"""' % closers.encode('utf8') | |
| 299 print '+ closers = ur"""%s"""' % c.encode('utf8') | |
| 300 if c_wide: | |
| 301 print '+ closers-wide = ur"""%s"""' % c_wide.encode('utf8') | |
| 302 if d != delimiters: | |
| 303 print '- delimiters = ur"%s"' % delimiters.encode('utf8') | |
| 304 print '+ delimiters = ur"%s"' % d.encode('utf8') | |
| 305 if cd != closing_delimiters: | |
| 306 print '- closing_delimiters = ur"%s"' % closing_delimiters.encode('utf8') | |
| 307 print '+ closing_delimiters = ur"%s"' % cd.encode('utf8') | |
| 308 # closing_delimiters are all ASCII characters | |
| 309 | |
| 310 # Print literal code to define the character sets: | |
| 311 | |
| 312 # `openers` and `closers` must be verbose and keep order because they are | |
| 313 # also used in `match_chars()`. | |
| 314 print wrap_string(repr(o), startstring='openers = (') | |
| 315 print wrap_string(repr(c), startstring='closers = (') | |
| 316 # delimiters: sort and use shortcut for intervals (saves ~150 characters): | |
| 317 print wrap_string(repr(d), startstring='delimiters = (') | |
| 318 # add characters in the upper plane only in a "wide" build: | |
| 319 print 'if sys.maxunicode >= 0x10FFFF: # "wide" build' | |
| 320 print wrap_string(repr(d_wide), startstring=' delimiters += (') | |
| 321 print 'closing_delimiters =', repr(cd) | |
| 322 | |
| 323 # test prints | |
| 324 | |
| 325 # print "wide" Unicode characters: | |
| 326 # ucharlists = unicode_charlists(unicode_punctuation_categories) | |
| 327 # for key in ucharlists: | |
| 328 # if key.endswith('wide'): | |
| 329 # print key, ucharlists[key] | |
| 330 | |
| 331 # print 'openers = ', repr(openers) | |
| 332 # print 'closers = ', repr(closers) | |
| 333 # print 'delimiters = ', repr(delimiters) | |
| 334 # print 'closing_delimiters = ', repr(closing_delimiters) | |
| 335 | |
| 336 # ucharlists = unicode_charlists(unicode_punctuation_categories) | |
| 337 # for cat, chars in ucharlists.items(): | |
| 338 # # print cat, chars | |
| 339 # # compact output (visible with a comprehensive font): | |
| 340 # print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8') | |
| 341 | |
| 342 # verbose print | |
| 343 | |
| 344 # print 'openers:' | |
| 345 # for ch in openers: | |
| 346 # print ch.encode('utf8'), unicodedata.name(ch) | |
| 347 # print 'closers:' | |
| 348 # for ch in closers: | |
| 349 # print ch.encode('utf8'), unicodedata.name(ch) | |
| 350 # print 'delimiters:' | |
| 351 # for ch in delimiters: | |
| 352 # print ch.encode('utf8'), unicodedata.name(ch) | |
| 353 # print 'closing_delimiters:' | |
| 354 # for ch in closing_delimiters: | |
| 355 # print ch.encode('utf8'), unicodedata.name(ch) |
