Mercurial > repos > bcclaywell > argo_navis
comparison venv/lib/python2.7/site-packages/docutils/utils/smartquotes.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
| author | bcclaywell |
|---|---|
| date | Mon, 12 Oct 2015 17:43:33 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d67268158946 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 | |
| 4 # :Id: $Id: smartquotes.py 7716 2013-08-21 21:54:57Z milde $ | |
| 5 # :Copyright: © 2010 Günter Milde, | |
| 6 # original `SmartyPants`_: © 2003 John Gruber | |
| 7 # smartypants.py: © 2004, 2007 Chad Miller | |
| 8 # :Maintainer: docutils-develop@lists.sourceforge.net | |
| 9 # :License: Released under the terms of the `2-Clause BSD license`_, in short: | |
| 10 # | |
| 11 # Copying and distribution of this file, with or without modification, | |
| 12 # are permitted in any medium without royalty provided the copyright | |
| 13 # notices and this notice are preserved. | |
| 14 # This file is offered as-is, without any warranty. | |
| 15 # | |
| 16 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause | |
| 17 | |
| 18 | |
| 19 r""" | |
| 20 ======================== | |
| 21 SmartyPants for Docutils | |
| 22 ======================== | |
| 23 | |
| 24 Synopsis | |
| 25 ======== | |
| 26 | |
| 27 Smart-quotes for Docutils. | |
| 28 | |
| 29 The original "SmartyPants" is a free web publishing plug-in for Movable Type, | |
| 30 Blosxom, and BBEdit that easily translates plain ASCII punctuation characters | |
| 31 into "smart" typographic punctuation characters. | |
| 32 | |
| 33 `smartypants.py` endeavours to be a functional port of | |
| 34 SmartyPants to Python, for use with Pyblosxom_. | |
| 35 | |
| 36 `smartquotes.py` is an adaption of Smartypants to Docutils_. By using Unicode | |
| 37 characters instead of HTML entities for typographic quotes, it works for any | |
| 38 output format that supports Unicode. | |
| 39 | |
| 40 Authors | |
| 41 ======= | |
| 42 | |
| 43 `John Gruber`_ did all of the hard work of writing this software in Perl for | |
| 44 `Movable Type`_ and almost all of this useful documentation. `Chad Miller`_ | |
| 45 ported it to Python to use with Pyblosxom_. | |
| 46 Adapted to Docutils_ by Günter Milde | |
| 47 | |
| 48 Additional Credits | |
| 49 ================== | |
| 50 | |
| 51 Portions of the SmartyPants original work are based on Brad Choate's nifty | |
| 52 MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to | |
| 53 this plug-in. Brad Choate is a fine hacker indeed. | |
| 54 | |
| 55 `Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta | |
| 56 testing of the original SmartyPants. | |
| 57 | |
| 58 `Rael Dornfest`_ ported SmartyPants to Blosxom. | |
| 59 | |
| 60 .. _Brad Choate: http://bradchoate.com/ | |
| 61 .. _Jeremy Hedley: http://antipixel.com/ | |
| 62 .. _Charles Wiltgen: http://playbacktime.com/ | |
| 63 .. _Rael Dornfest: http://raelity.org/ | |
| 64 | |
| 65 | |
| 66 Copyright and License | |
| 67 ===================== | |
| 68 | |
| 69 SmartyPants_ license (3-Clause BSD license): | |
| 70 | |
| 71 Copyright (c) 2003 John Gruber (http://daringfireball.net/) | |
| 72 All rights reserved. | |
| 73 | |
| 74 Redistribution and use in source and binary forms, with or without | |
| 75 modification, are permitted provided that the following conditions are | |
| 76 met: | |
| 77 | |
| 78 * Redistributions of source code must retain the above copyright | |
| 79 notice, this list of conditions and the following disclaimer. | |
| 80 | |
| 81 * Redistributions in binary form must reproduce the above copyright | |
| 82 notice, this list of conditions and the following disclaimer in | |
| 83 the documentation and/or other materials provided with the | |
| 84 distribution. | |
| 85 | |
| 86 * Neither the name "SmartyPants" nor the names of its contributors | |
| 87 may be used to endorse or promote products derived from this | |
| 88 software without specific prior written permission. | |
| 89 | |
| 90 This software is provided by the copyright holders and contributors | |
| 91 "as is" and any express or implied warranties, including, but not | |
| 92 limited to, the implied warranties of merchantability and fitness for | |
| 93 a particular purpose are disclaimed. In no event shall the copyright | |
| 94 owner or contributors be liable for any direct, indirect, incidental, | |
| 95 special, exemplary, or consequential damages (including, but not | |
| 96 limited to, procurement of substitute goods or services; loss of use, | |
| 97 data, or profits; or business interruption) however caused and on any | |
| 98 theory of liability, whether in contract, strict liability, or tort | |
| 99 (including negligence or otherwise) arising in any way out of the use | |
| 100 of this software, even if advised of the possibility of such damage. | |
| 101 | |
| 102 smartypants.py license (2-Clause BSD license): | |
| 103 | |
| 104 smartypants.py is a derivative work of SmartyPants. | |
| 105 | |
| 106 Redistribution and use in source and binary forms, with or without | |
| 107 modification, are permitted provided that the following conditions are | |
| 108 met: | |
| 109 | |
| 110 * Redistributions of source code must retain the above copyright | |
| 111 notice, this list of conditions and the following disclaimer. | |
| 112 | |
| 113 * Redistributions in binary form must reproduce the above copyright | |
| 114 notice, this list of conditions and the following disclaimer in | |
| 115 the documentation and/or other materials provided with the | |
| 116 distribution. | |
| 117 | |
| 118 This software is provided by the copyright holders and contributors | |
| 119 "as is" and any express or implied warranties, including, but not | |
| 120 limited to, the implied warranties of merchantability and fitness for | |
| 121 a particular purpose are disclaimed. In no event shall the copyright | |
| 122 owner or contributors be liable for any direct, indirect, incidental, | |
| 123 special, exemplary, or consequential damages (including, but not | |
| 124 limited to, procurement of substitute goods or services; loss of use, | |
| 125 data, or profits; or business interruption) however caused and on any | |
| 126 theory of liability, whether in contract, strict liability, or tort | |
| 127 (including negligence or otherwise) arising in any way out of the use | |
| 128 of this software, even if advised of the possibility of such damage. | |
| 129 | |
| 130 .. _John Gruber: http://daringfireball.net/ | |
| 131 .. _Chad Miller: http://web.chad.org/ | |
| 132 | |
| 133 .. _Pyblosxom: http://pyblosxom.bluesock.org/ | |
| 134 .. _SmartyPants: http://daringfireball.net/projects/smartypants/ | |
| 135 .. _Movable Type: http://www.movabletype.org/ | |
| 136 .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause | |
| 137 .. _Docutils: http://docutils.sf.net/ | |
| 138 | |
| 139 Description | |
| 140 =========== | |
| 141 | |
| 142 SmartyPants can perform the following transformations: | |
| 143 | |
| 144 - Straight quotes ( " and ' ) into "curly" quote characters | |
| 145 - Backticks-style quotes (\`\`like this'') into "curly" quote characters | |
| 146 - Dashes (``--`` and ``---``) into en- and em-dash entities | |
| 147 - Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity | |
| 148 | |
| 149 This means you can write, edit, and save your posts using plain old | |
| 150 ASCII straight quotes, plain dashes, and plain dots, but your published | |
| 151 posts (and final HTML output) will appear with smart quotes, em-dashes, | |
| 152 and proper ellipses. | |
| 153 | |
| 154 SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``, | |
| 155 ``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to | |
| 156 display text where smart quotes and other "smart punctuation" would not be | |
| 157 appropriate, such as source code or example markup. | |
| 158 | |
| 159 | |
| 160 Backslash Escapes | |
| 161 ================= | |
| 162 | |
| 163 If you need to use literal straight quotes (or plain hyphens and | |
| 164 periods), SmartyPants accepts the following backslash escape sequences | |
| 165 to force non-smart punctuation. It does so by transforming the escape | |
| 166 sequence into a character: | |
| 167 | |
| 168 ======== ===== ========= | |
| 169 Escape Value Character | |
| 170 ======== ===== ========= | |
| 171 ``\\\\`` &#92; \\ | |
| 172 \\" &#34; " | |
| 173 \\' &#39; ' | |
| 174 \\. &#46; . | |
| 175 \\- &#45; \- | |
| 176 \\` &#96; \` | |
| 177 ======== ===== ========= | |
| 178 | |
| 179 This is useful, for example, when you want to use straight quotes as | |
| 180 foot and inch marks: 6\\'2\\" tall; a 17\\" iMac. | |
| 181 | |
| 182 Options | |
| 183 ======= | |
| 184 | |
| 185 For Pyblosxom users, the ``smartypants_attributes`` attribute is where you | |
| 186 specify configuration options. | |
| 187 | |
| 188 Numeric values are the easiest way to configure SmartyPants' behavior: | |
| 189 | |
| 190 "0" | |
| 191 Suppress all transformations. (Do nothing.) | |
| 192 "1" | |
| 193 Performs default SmartyPants transformations: quotes (including | |
| 194 \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash) | |
| 195 is used to signify an em-dash; there is no support for en-dashes. | |
| 196 | |
| 197 "2" | |
| 198 Same as smarty_pants="1", except that it uses the old-school typewriter | |
| 199 shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``" | |
| 200 (dash dash dash) | |
| 201 for em-dashes. | |
| 202 | |
| 203 "3" | |
| 204 Same as smarty_pants="2", but inverts the shorthand for dashes: | |
| 205 "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for | |
| 206 en-dashes. | |
| 207 | |
| 208 "-1" | |
| 209 Stupefy mode. Reverses the SmartyPants transformation process, turning | |
| 210 the characters produced by SmartyPants into their ASCII equivalents. | |
| 211 E.g. "“" is turned into a simple double-quote (\"), "—" is | |
| 212 turned into two dashes, etc. | |
| 213 | |
| 214 | |
| 215 The following single-character attribute values can be combined to toggle | |
| 216 individual transformations from within the smarty_pants attribute. For | |
| 217 example, to educate normal quotes and em-dashes, but not ellipses or | |
| 218 \`\`backticks'' -style quotes: | |
| 219 | |
| 220 ``py['smartypants_attributes'] = "1"`` | |
| 221 | |
| 222 "q" | |
| 223 Educates normal quote characters: (") and ('). | |
| 224 | |
| 225 "b" | |
| 226 Educates \`\`backticks'' -style double quotes. | |
| 227 | |
| 228 "B" | |
| 229 Educates \`\`backticks'' -style double quotes and \`single' quotes. | |
| 230 | |
| 231 "d" | |
| 232 Educates em-dashes. | |
| 233 | |
| 234 "D" | |
| 235 Educates em-dashes and en-dashes, using old-school typewriter shorthand: | |
| 236 (dash dash) for en-dashes, (dash dash dash) for em-dashes. | |
| 237 | |
| 238 "i" | |
| 239 Educates em-dashes and en-dashes, using inverted old-school typewriter | |
| 240 shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes. | |
| 241 | |
| 242 "e" | |
| 243 Educates ellipses. | |
| 244 | |
| 245 "w" | |
| 246 Translates any instance of ``&quot;`` into a normal double-quote character. | |
| 247 This should be of no interest to most people, but of particular interest | |
| 248 to anyone who writes their posts using Dreamweaver, as Dreamweaver | |
| 249 inexplicably uses this entity to represent a literal double-quote | |
| 250 character. SmartyPants only educates normal quotes, not entities (because | |
| 251 ordinarily, entities are used for the explicit purpose of representing the | |
| 252 specific character they represent). The "w" option must be used in | |
| 253 conjunction with one (or both) of the other quote options ("q" or "b"). | |
| 254 Thus, if you wish to apply all SmartyPants transformations (quotes, en- | |
| 255 and em-dashes, and ellipses) and also translate ``&quot;`` entities into | |
| 256 regular quotes so SmartyPants can educate them, you should pass the | |
| 257 following to the smarty_pants attribute: | |
| 258 | |
| 259 | |
| 260 Caveats | |
| 261 ======= | |
| 262 | |
| 263 Why You Might Not Want to Use Smart Quotes in Your Weblog | |
| 264 --------------------------------------------------------- | |
| 265 | |
| 266 For one thing, you might not care. | |
| 267 | |
| 268 Most normal, mentally stable individuals do not take notice of proper | |
| 269 typographic punctuation. Many design and typography nerds, however, break | |
| 270 out in a nasty rash when they encounter, say, a restaurant sign that uses | |
| 271 a straight apostrophe to spell "Joe's". | |
| 272 | |
| 273 If you're the sort of person who just doesn't care, you might well want to | |
| 274 continue not caring. Using straight quotes -- and sticking to the 7-bit | |
| 275 ASCII character set in general -- is certainly a simpler way to live. | |
| 276 | |
| 277 Even if you *do* care about accurate typography, you still might want to | |
| 278 think twice before educating the quote characters in your weblog. One side | |
| 279 effect of publishing curly quote characters is that it makes your | |
| 280 weblog a bit harder for others to quote from using copy-and-paste. What | |
| 281 happens is that when someone copies text from your blog, the copied text | |
| 282 contains the 8-bit curly quote characters (as well as the 8-bit characters | |
| 283 for em-dashes and ellipses, if you use these options). These characters | |
| 284 are not standard across different text encoding methods, which is why they | |
| 285 need to be encoded as characters. | |
| 286 | |
| 287 People copying text from your weblog, however, may not notice that you're | |
| 288 using curly quotes, and they'll go ahead and paste the unencoded 8-bit | |
| 289 characters copied from their browser into an email message or their own | |
| 290 weblog. When pasted as raw "smart quotes", these characters are likely to | |
| 291 get mangled beyond recognition. | |
| 292 | |
| 293 That said, my own opinion is that any decent text editor or email client | |
| 294 makes it easy to stupefy smart quote characters into their 7-bit | |
| 295 equivalents, and I don't consider it my problem if you're using an | |
| 296 indecent text editor or email client. | |
| 297 | |
| 298 | |
| 299 Algorithmic Shortcomings | |
| 300 ------------------------ | |
| 301 | |
| 302 One situation in which quotes will get curled the wrong way is when | |
| 303 apostrophes are used at the start of leading contractions. For example: | |
| 304 | |
| 305 ``'Twas the night before Christmas.`` | |
| 306 | |
| 307 In the case above, SmartyPants will turn the apostrophe into an opening | |
| 308 single-quote, when in fact it should be a closing one. I don't think | |
| 309 this problem can be solved in the general case -- every word processor | |
| 310 I've tried gets this wrong as well. In such cases, it's best to use the | |
| 311 proper character for closing single-quotes (``’``) by hand. | |
| 312 | |
| 313 | |
| 314 Version History | |
| 315 =============== | |
| 316 | |
| 317 1.7: 2012-11-19 | |
| 318 - Internationalization: language-dependent quotes. | |
| 319 | |
| 320 1.6.1: 2012-11-06 | |
| 321 - Refactor code, code cleanup, | |
| 322 - `educate_tokens()` generator as interface for Docutils. | |
| 323 | |
| 324 1.6: 2010-08-26 | |
| 325 - Adaption to Docutils: | |
| 326 - Use Unicode instead of HTML entities, | |
| 327 - Remove code special to pyblosxom. | |
| 328 | |
| 329 1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400 | |
| 330 - Fixed bug where blocks of precious unalterable text were instead | |
| 331 interpreted. Thanks to Le Roux and Dirk van Oosterbosch. | |
| 332 | |
| 333 1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400 | |
| 334 - Fix bogus magical quotation when there is no hint that the | |
| 335 user wants it, e.g., in "21st century". Thanks to Nathan Hamblen. | |
| 336 - Be smarter about quotes before terminating numbers in an en-dash'ed | |
| 337 range. | |
| 338 | |
| 339 1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500 | |
| 340 - Fix a date-processing bug, as reported by jacob childress. | |
| 341 - Begin a test-suite for ensuring correct output. | |
| 342 - Removed import of "string", since I didn't really need it. | |
| 343 (This was my first ever Python program. Sue me!) | |
| 344 | |
| 345 1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400 | |
| 346 - Abort processing if the flavour is in forbidden-list. Default of | |
| 347 [ "rss" ] (Idea of Wolfgang SCHNERRING.) | |
| 348 - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING. | |
| 349 | |
| 350 1.5_1.2: Mon, 24 May 2004 08:14:54 -0400 | |
| 351 - Some single quotes weren't replaced properly. Diff-tesuji played | |
| 352 by Benjamin GEIGER. | |
| 353 | |
| 354 1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500 | |
| 355 - Support upcoming pyblosxom 0.9 plugin verification feature. | |
| 356 | |
| 357 1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500 | |
| 358 - Initial release | |
| 359 """ | |
| 360 | |
| 361 default_smartypants_attr = "1" | |
| 362 | |
| 363 | |
| 364 import re | |
| 365 | |
class smartchars(object):
    """Smart quotes, dashes and ellipsis characters for one language.

    The dash and ellipsis characters are class attributes shared by all
    languages.  The four quote characters are instance attributes chosen
    in `__init__()` from the `quotes` table:

    - `opquote`, `cpquote`: opening / closing primary quote,
    - `osquote`, `csquote`: opening / closing secondary quote.
    """

    endash = u'–'    # "–" EN DASH
    emdash = u'—'    # "—" EM DASH
    ellipsis = u'…'  # "…" HORIZONTAL ELLIPSIS

    # Quote characters by language tag.  Each value unpacks into exactly
    # four items (open primary, close primary, open secondary, close
    # secondary): either a 4-character string or a 4-tuple of strings.
    #
    # English smart quotes are:
    #   opquote = u'“'  # "“" LEFT DOUBLE QUOTATION MARK
    #   cpquote = u'”'  # "”" RIGHT DOUBLE QUOTATION MARK
    #   osquote = u'‘'  # "‘" LEFT SINGLE QUOTATION MARK
    #   csquote = u'’'  # "’" RIGHT SINGLE QUOTATION MARK
    # For other languages see:
    #   http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
    #   http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen
    quotes = {'af':           u'“”‘’',
              'af-x-altquot': u'„”‚’',
              'ca':           u'«»“”',
              'ca-x-altquot': u'“”‘’',
              'cs':           u'„“‚‘',
              'cs-x-altquot': u'»«›‹',
              'da':           u'»«‘’',
              'da-x-altquot': u'„“‚‘',
              'de':           u'„“‚‘',
              'de-x-altquot': u'»«›‹',
              'de-CH':        u'«»‹›',
              'el':           u'«»“”',
              'en':           u'“”‘’',
              'en-UK':        u'‘’“”',
              'eo':           u'“”‘’',
              'es':           u'«»“”',
              'es-x-altquot': u'“”‘’',
              # et: no secondary quote listed in the sources above
              # (wikipedia.org)
              'et':           u'„“‚‘',
              'et-x-altquot': u'»«›‹',
              'eu':           u'«»‹›',
              'fi':           u'””’’',
              'fi-x-altquot': u'»»’’',
              # fr: quotes padded with U+202F NARROW NO-BREAK SPACE; written
              # as escapes so the invisible character cannot get lost.
              'fr':           (u'«\u202f', u'\u202f»', u'‹\u202f', u'\u202f›'),
              'fr-x-altquot': u'«»‹›',  # for use with manually set spaces
              # 'fr-x-altquot': (u'“ ', u' ”', u'‘ ', u' ’'), # rarely used
              'fr-CH':        u'«»‹›',
              'gl':           u'«»“”',
              'he':           u'”“»«',
              'he-x-altquot': u'„”‚’',
              'it':           u'«»“”',
              'it-CH':        u'«»‹›',
              'it-x-altquot': u'“”‘’',
              'ja':           u'「」『』',
              'lt':           u'„“‚‘',
              'nl':           u'“”‘’',
              'nl-x-altquot': u'„”‚’',
              'pl':           u'„”«»',
              'pl-x-altquot': u'«»“”',
              'pt':           u'«»“”',
              'pt-BR':        u'“”‘’',
              'ro':           u'„”«»',
              'ro-x-altquot': u'«»„”',
              'ru':           u'«»„“',
              'sk':           u'„“‚‘',
              'sk-x-altquot': u'»«›‹',
              'sv':           u'„“‚‘',
              'sv-x-altquot': u'»«›‹',
              'zh-CN':        u'“”‘’',
              'zh-TW':        u'「」『』',
             }

    def __init__(self, language='en'):
        """Select the quote characters for `language`.

        `language` is looked up verbatim in `quotes`.  For a tag that is
        not listed, the quote attributes fall back to the ASCII straight
        quotes (``"`` and ``'``).
        """
        self.language = language
        try:
            (self.opquote, self.cpquote,
             self.osquote, self.csquote) = self.quotes[language]
        except KeyError:
            self.opquote, self.cpquote, self.osquote, self.csquote = u'""\'\''
| 444 | |
| 445 | |
def smartyPants(text, attr=default_smartypants_attr, language='en'):
    """Main function for "traditional" use.

    Splits `text` with `tokenize()`, passes the tokens through
    `educate_tokens()` with the given `attr` and `language`, and returns
    the educated pieces joined back into one string.
    """
    educated_pieces = educate_tokens(tokenize(text), attr, language)
    return "".join(educated_pieces)
| 451 | |
| 452 | |
def educate_tokens(text_tokens, attr=default_smartypants_attr, language='en'):
    """Return iterator that "educates" the items of `text_tokens`.

    Parameters:
      text_tokens: iterable of ``(ttype, text)`` pairs.  Tokens of type
                   'tag' and empty tokens are yielded unchanged; tokens of
                   type 'literal' are yielded unchanged but their last
                   character is remembered as context; every other token
                   is transformed.
      attr:        string selecting the transformations (see below).
      language:    language tag handed to the quote-educating functions.

    Yields: the (possibly transformed) text of each token, in order.

    Attribute values:
      0 : do nothing
      1 : set all
      2 : set all, using old school en- and em- dash shortcuts
      3 : set all, using inverted old school en and em- dash shortcuts
     -1 : "stupefy" mode

      q : quotes
      b : backtick quotes (``double'' only)
      B : backtick quotes (``double'' and `single')
      d : dashes
      D : old school dashes
      i : inverted old school dashes
      e : ellipses
      w : convert &quot; entities to " for Dreamweaver users
    """

    convert_quot = False  # translate &quot; entities into normal quotes?
    do_dashes = False
    do_backticks = False
    do_quotes = False
    do_ellipses = False
    do_stupefy = False

    if attr == "0":  # Do nothing: hand every token back untouched.
        # (Yielding a not-yet-bound `text` here would raise
        # UnboundLocalError as soon as the generator is advanced.)
        for (ttype, text) in text_tokens:
            yield text
        return
    elif attr == "1":  # Do everything, turn all options on.
        do_quotes = True
        do_backticks = True
        do_dashes = 1
        do_ellipses = True
    elif attr == "2":
        # Do everything, turn all options on, use old school dash shorthand.
        do_quotes = True
        do_backticks = True
        do_dashes = 2
        do_ellipses = True
    elif attr == "3":
        # Do everything, use inverted old school dash shorthand.
        do_quotes = True
        do_backticks = True
        do_dashes = 3
        do_ellipses = True
    elif attr == "-1":  # Special "stupefy" mode.
        do_stupefy = True
    else:
        # Single-character flags; a later test overrides an earlier one
        # (e.g. "B" wins over "b", "i" wins over "D" and "d").
        if "q" in attr: do_quotes = True
        if "b" in attr: do_backticks = True
        if "B" in attr: do_backticks = 2
        if "d" in attr: do_dashes = 1
        if "D" in attr: do_dashes = 2
        if "i" in attr: do_dashes = 3
        if "e" in attr: do_ellipses = True
        if "w" in attr: convert_quot = True

    prev_token_last_char = " "
    # Last character of the previous text token. Used as
    # context to curl leading quote characters correctly.

    for (ttype, text) in text_tokens:

        # skip HTML and/or XML tags as well as empty text tokens
        # without updating the last character
        if ttype == 'tag' or not text:
            yield text
            continue

        # skip literal text (math, literal, raw, ...)
        if ttype == 'literal':
            prev_token_last_char = text[-1:]
            yield text
            continue

        last_char = text[-1:]  # Remember last char before processing.

        # Protect backslash-escaped characters from the transformations.
        text = processEscapes(text)

        if convert_quot:
            text = re.sub('&quot;', '"', text)

        if do_dashes == 1:
            text = educateDashes(text)
        elif do_dashes == 2:
            text = educateDashesOldSchool(text)
        elif do_dashes == 3:
            text = educateDashesOldSchoolInverted(text)

        if do_ellipses:
            text = educateEllipses(text)

        # Note: backticks need to be processed before quotes.
        if do_backticks:
            text = educateBackticks(text, language)

        if do_backticks == 2:
            text = educateSingleBackticks(text, language)

        if do_quotes:
            # Prepend the context character for the quote heuristics and
            # strip the first character of the result again.
            text = educateQuotes(prev_token_last_char+text, language)[1:]

        if do_stupefy:
            text = stupefyEntities(text, language)

        # Remember last char as context for the next token
        prev_token_last_char = last_char

        # Turn the protected escapes back into their literal characters.
        text = processEscapes(text, restore=True)

        yield text
| 564 | |
| 565 | |
| 566 | |
def educateQuotes(text, language='en'):
    """
    Parameter:  - text string (unicode or bytes).
                - language (`BCP 47` language tag.)
    Returns:    The `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”
    """

    smart = smartchars(language)

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break.
    # Close the quotes by brute force:
    # NOTE(review): inside these raw strings `\\B` matches a literal
    # backslash followed by "B", not the regex assertion `\B` that the
    # comment above describes.  Left unchanged -- confirm the intent.
    text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), smart.csquote, text)
    text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", smart.opquote+smart.osquote, text)
    text = re.sub(r"""'"(?=\w)""", smart.osquote+smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    text = re.sub(r"""\b'(?=\d{2}s)""", smart.csquote, text)

    # A character after which a quote is taken to be a closing one.
    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    # En and em dash as they appear in this listing.
    # NOTE(review): the regex comments call this alternative "decimal
    # entities"; the listing shows literal dash characters -- confirm.
    dec_dashes = r"""–|—"""

    # Get most opening single quotes.  Every alternative of the group must
    # be non-empty: an empty branch would match before *any* quote and turn
    # apostrophes such as the one in "Isn't" into opening quotes.
    opening_single_quotes_regex = re.compile(r"""
                    (
                            \s          |   # a whitespace char, or
                            &nbsp;      |   # a non-breaking space entity, or
                            --          |   # dashes, or
                            &[mn]dash;  |   # named dash entities
                            %s          |   # or the dec_dashes alternatives
                            &\#x201[34];    # or hex
                    )
                    '                 # the quote
                    (?=\w)            # followed by a word character
                    """ % (dec_dashes,), re.VERBOSE)
    text = opening_single_quotes_regex.sub(r'\1'+smart.osquote, text)

    # Closing single quote: after a "closing" character and not followed
    # by whitespace, a word-final "s" or a digit ...
    closing_single_quotes_regex = re.compile(r"""
                    (%s)
                    '
                    (?!\s | s\b | \d)
                    """ % (close_class,), re.VERBOSE)
    text = closing_single_quotes_regex.sub(r'\1'+smart.csquote, text)

    # ... or followed by whitespace or a word-final "s" (kept as group 2).
    closing_single_quotes_regex = re.compile(r"""
                    (%s)
                    '
                    (\s | s\b)
                    """ % (close_class,), re.VERBOSE)
    text = closing_single_quotes_regex.sub(r'\1%s\2' % smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = re.sub(r"""'""", smart.osquote, text)

    # Get most opening double quotes (same context rules as above):
    opening_double_quotes_regex = re.compile(r"""
                    (
                            \s          |   # a whitespace char, or
                            &nbsp;      |   # a non-breaking space entity, or
                            --          |   # dashes, or
                            &[mn]dash;  |   # named dash entities
                            %s          |   # or the dec_dashes alternatives
                            &\#x201[34];    # or hex
                    )
                    "                 # the quote
                    (?=\w)            # followed by a word character
                    """ % (dec_dashes,), re.VERBOSE)
    text = opening_double_quotes_regex.sub(r'\1'+smart.opquote, text)

    # Double closing quotes: a quote directly followed by whitespace ...
    closing_double_quotes_regex = re.compile(r"""
                    "
                    (?=\s)
                    """, re.VERBOSE)
    text = closing_double_quotes_regex.sub(smart.cpquote, text)

    # ... or directly preceded by a "closing" character.
    closing_double_quotes_regex = re.compile(r"""
                    (%s)   # character that indicates the quote should be closing
                    "
                    """ % (close_class,), re.VERBOSE)
    text = closing_double_quotes_regex.sub(r'\1'+smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = re.sub(r'"', smart.opquote, text)

    return text
| 664 | |
| 665 | |
def educateBackticks(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with ``backticks'' -style double quotes
                translated into the primary curly quote characters
                of `language`.
    Example input:  ``Isn't this fun?''
    Example output: “Isn't this fun?”
    """
    smart = smartchars(language)

    # Two backticks open a quotation, two straight apostrophes close it.
    for ascii_pair, curly in ((r"""``""", smart.opquote),
                              (r"""''""", smart.cpquote)):
        text = re.sub(ascii_pair, curly, text)
    return text
| 679 | |
| 680 | |
def educateSingleBackticks(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with `backticks' -style single quotes
                translated into the secondary curly quote characters
                of `language`.

    Example input:  `Isn't this fun?'
    Example output: ‘Isn’t this fun?’
    """
    smart = smartchars(language)

    # Every backtick opens, every straight apostrophe closes.
    opened = re.sub(r"""`""", smart.osquote, text)
    return re.sub(r"""'""", smart.csquote, opened)
| 695 | |
| 696 | |
def educateDashes(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "---" translated to
                an en-dash character and each remaining "--" translated
                to an em-dash character.
    """
    # The triple dash is replaced first so that it is not consumed as a
    # double dash plus a stray hyphen.  The mapping (three dashes -> en,
    # two dashes -> em) is intentionally "backwards".
    return re.sub(r"""--""", smartchars.emdash,
                  re.sub(r"""---""", smartchars.endash, text))
| 707 | |
| 708 | |
def educateDashesOldSchool(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "---" translated to
                an em-dash character, and each remaining "--" translated
                to an en-dash character.
    """
    # Order matters: the longer run has to be handled first.
    for dashes, char in ((r"""---""", smartchars.emdash),
                         (r"""--""", smartchars.endash)):
        text = re.sub(dashes, char, text)
    return text
| 720 | |
| 721 | |
def educateDashesOldSchoolInverted(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character, and each "---" translated to
                an en-dash character.

    Two reasons for the inverted shorthand: first, unlike the en- and
    em-dash syntax supported by educateDashesOldSchool(), it is compatible
    with existing entries written before SmartyPants 1.1, back when "--"
    was only used for em-dashes.  Second, em-dashes are more common than
    en-dashes, and so it sort of makes sense that the shortcut should be
    shorter to type.  (Thanks to Aaron Swartz for the idea.)
    """
    triples_done = re.sub(r"""---""", smartchars.endash, text)   # en
    return re.sub(r"""--""", smartchars.emdash, triples_done)    # em
| 739 | |
| 740 | |
| 741 | |
def educateEllipses(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "..." or ". . ."
                translated to an ellipsis character.

    Example input:  Huh...?
    Example output: Huh…?
    """
    # Unspaced dots first, then the spaced variant.
    for dots in (r"""\.\.\.""", r"""\. \. \."""):
        text = re.sub(dots, smartchars.ellipsis, text)
    return text
| 755 | |
| 756 | |
def stupefyEntities(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each SmartyPants character translated to
                its ASCII counterpart: en-dash to "-", em-dash to "--",
                the quote characters of `language` to straight quotes
                and the ellipsis to three dots.

    Example input:  “Hello — world.”
    Example output: "Hello -- world."
    """
    smart = smartchars(language)

    # (smart character, ASCII replacement), applied in this order.
    ascii_equivalents = (
        (smart.endash, "-"),      # en-dash
        (smart.emdash, "--"),     # em-dash
        (smart.osquote, "'"),     # open single quote
        (smart.csquote, "'"),     # close single quote
        (smart.opquote, '"'),     # open double quote
        (smart.cpquote, '"'),     # close double quote
        (smart.ellipsis, '...'),  # ellipsis
    )
    for smart_char, plain in ascii_equivalents:
        text = re.sub(smart_char, plain, text)

    return text
| 780 | |
| 781 | |
def processEscapes(text, restore=False):
    r"""
    Convert backslash escapes to placeholders, or placeholders to characters.

    Parameters: `text`: String (unicode or bytes).
                `restore`: if false (default), replace every escape
                sequence listed below with its placeholder; if true,
                replace every placeholder with the plain character.
    Returns:    The `text` after processing the following backslash
                escape sequences. This is useful if you want to force a
                "dumb" quote or other character to appear.

                Escape  Placeholder  Restored
                ------  -----------  --------
                \\      &#92;        \
                \"      &#34;        "
                \'      &#39;        '
                \.      &#46;        .
                \-      &#45;        -
                \`      &#96;        `
    """
    # Each placeholder is a numeric character reference that contains none
    # of the escapable characters, so an escaped character survives as a
    # placeholder until `restore=True` maps it back.  The backslash pair
    # comes first so that "\\" is consumed before its second backslash
    # could be read as the start of another escape.
    replacements = ((r'\\', r'&#92;'),
                    (r'\"', r'&#34;'),
                    (r"\'", r'&#39;'),
                    (r'\.', r'&#46;'),
                    (r'\-', r'&#45;'),
                    (r'\`', r'&#96;'))
    if restore:
        for (escape, placeholder) in replacements:
            # escape[1] is the character following the backslash.
            text = text.replace(placeholder, escape[1])
    else:
        for (escape, placeholder) in replacements:
            text = text.replace(escape, placeholder)

    return text
| 812 | |
| 813 | |
def tokenize(text):
    """
    Split a string containing HTML markup into tag and text tokens.

    Parameter:  String containing HTML markup.
    Returns:    An iterator that yields the tokens comprising the input
                string.  Each yielded element is a two-element tuple;
                the first item is either 'tag' or 'text', the second is
                the actual value.  Empty text runs are not yielded.

    A tag is matched as "<", any characters other than ">", then ">".
    Nested angle brackets are therefore not recognised: in
    <a href="<MTFoo>"> the tag token ends at the first ">" and the
    remaining '">' is yielded as text.

    Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
    <http://www.bradchoate.com/past/mtregex.php>
    """
    # Group 1: text preceding a tag (possibly empty); group 2: the tag.
    tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")

    previous_end = 0
    for token_match in tag_soup.finditer(text):
        leading_text, tag = token_match.groups()
        if leading_text:
            yield ('text', leading_text)
        yield ('tag', tag)
        previous_end = token_match.end()

    # Whatever follows the last tag (or the whole input, if it contains
    # no tag at all) is one final text token.
    if previous_end < len(text):
        yield ('text', text[previous_end:])
| 853 | |
| 854 | |
| 855 | |
# Script entry point: print this module's docstring rendered as HTML, then
# run the self-tests below.
if __name__ == "__main__":

    import locale

    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        # Best effort only: if the user's locale cannot be set, carry on
        # with the current one.
        pass

    from docutils.core import publish_string
    docstring_html = publish_string(__doc__, writer_name='html')

    # Parenthesised form is valid both as a Python 2 print statement and
    # as a Python 3 function call.
    print(docstring_html)

    # Unit test output goes out stderr.
    import unittest
    sp = smartyPants

    class TestSmartypantsAllAttributes(unittest.TestCase):
        # the default attribute is "1", which means "all".

        def test_dates(self):
            self.assertEqual(sp("1440-80's"), u"1440-80’s")
            self.assertEqual(sp("1440-'80s"), u"1440-‘80s")
            self.assertEqual(sp("1440---'80s"), u"1440–‘80s")
            self.assertEqual(sp("1960s"), "1960s")  # no effect.
            self.assertEqual(sp("1960's"), u"1960’s")
            self.assertEqual(sp("one two '60s"), u"one two ‘60s")
            self.assertEqual(sp("'60s"), u"‘60s")

        def test_ordinal_numbers(self):
            self.assertEqual(sp("21st century"), "21st century")  # no effect.
            self.assertEqual(sp("3rd"), "3rd")  # no effect.

        def test_educated_quotes(self):
            self.assertEqual(sp('''"Isn't this fun?"'''), u'“Isn’t this fun?”')

        def test_html_tags(self):
            # Markup must pass through untouched.
            text = '<a src="foo">more</a>'
            self.assertEqual(sp(text), text)

    unittest.main()
| 899 | |
| 900 | |
| 901 | |
| 902 | |
# Module metadata.  NOTE(review): the values name Chad Miller and pyblosxom,
# so they appear to be carried over from the upstream smartypants.py rather
# than describing the Docutils adaptation -- confirm before relying on them.
__author__ = "Chad Miller <smartypantspy@chad.org>"
__version__ = "1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400"
__url__ = "http://wiki.chad.org/SmartyPantsPy"
__description__ = "Smart-quotes, smart-ellipses, and smart-dashes for weblog entries in pyblosxom"
