| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 # Copyright (c) 2012 Trent Mick.
 | 
| 
 | 
     3 # Copyright (c) 2007-2008 ActiveState Corp.
 | 
| 
 | 
     4 # License: MIT (http://www.opensource.org/licenses/mit-license.php)
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 from __future__ import generators
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 r"""A fast and complete Python implementation of Markdown.
 | 
| 
 | 
     9 
 | 
| 
 | 
    10 [from http://daringfireball.net/projects/markdown/]
 | 
| 
 | 
    11 > Markdown is a text-to-HTML filter; it translates an easy-to-read /
 | 
| 
 | 
    12 > easy-to-write structured text format into HTML.  Markdown's text
 | 
| 
 | 
    13 > format is most similar to that of plain text email, and supports
 | 
| 
 | 
    14 > features such as headers, *emphasis*, code blocks, blockquotes, and
 | 
| 
 | 
    15 > links.
 | 
| 
 | 
    16 >
 | 
| 
 | 
    17 > Markdown's syntax is designed not as a generic markup language, but
 | 
| 
 | 
    18 > specifically to serve as a front-end to (X)HTML. You can use span-level
 | 
| 
 | 
    19 > HTML tags anywhere in a Markdown document, and you can use block level
 | 
| 
 | 
    20 > HTML tags (like <div> and <table> as well).
 | 
| 
 | 
    21 
 | 
| 
 | 
    22 Module usage:
 | 
| 
 | 
    23 
 | 
| 
 | 
    24     >>> import markdown2
 | 
| 
 | 
    25     >>> markdown2.markdown("*boo!*")  # or use `html = markdown_path(PATH)`
 | 
| 
 | 
    26     u'<p><em>boo!</em></p>\n'
 | 
| 
 | 
    27 
 | 
| 
 | 
    28     >>> markdowner = Markdown()
 | 
| 
 | 
    29     >>> markdowner.convert("*boo!*")
 | 
| 
 | 
    30     u'<p><em>boo!</em></p>\n'
 | 
| 
 | 
    31     >>> markdowner.convert("**boom!**")
 | 
| 
 | 
    32     u'<p><strong>boom!</strong></p>\n'
 | 
| 
 | 
    33 
 | 
| 
 | 
    34 This implementation of Markdown implements the full "core" syntax plus a
 | 
| 
 | 
    35 number of extras (e.g., code syntax coloring, footnotes) as described on
 | 
| 
 | 
    36 <https://github.com/trentm/python-markdown2/wiki/Extras>.
 | 
| 
 | 
    37 """
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 cmdln_desc = """A fast and complete Python implementation of Markdown, a
 | 
| 
 | 
    40 text-to-HTML conversion tool for web writers.
 | 
| 
 | 
    41 
 | 
| 
 | 
    42 Supported extra syntax options (see -x|--extras option below and
 | 
| 
 | 
    43 see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
 | 
| 
 | 
    44 
 | 
| 
 | 
    45 * code-friendly: Disable _ and __ for em and strong.
 | 
| 
 | 
    46 * cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
 | 
| 
 | 
    47 * fenced-code-blocks: Allows a code block to not have to be indented
 | 
| 
 | 
    48   by fencing it with '```' on a line before and after. Based on
 | 
| 
 | 
    49   <http://github.github.com/github-flavored-markdown/> with support for
 | 
| 
 | 
    50   syntax highlighting.
 | 
| 
 | 
    51 * footnotes: Support footnotes as in use on daringfireball.net and
 | 
| 
 | 
    52   implemented in other Markdown processors (tho not in Markdown.pl v1.0.1).
 | 
| 
 | 
    53 * header-ids: Adds "id" attributes to headers. The id value is a slug of
 | 
| 
 | 
    54   the header text.
 | 
| 
 | 
    55 * html-classes: Takes a dict mapping html tag names (lowercase) to a
 | 
| 
 | 
    56   string to use for a "class" tag attribute. Currently only supports
 | 
| 
 | 
    57   "pre" and "code" tags. Add an issue if you require this for other tags.
 | 
| 
 | 
    58 * markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
 | 
| 
 | 
    59   have markdown processing be done on its contents. Similar to
 | 
| 
 | 
    60   <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
 | 
| 
 | 
    61   some limitations.
 | 
| 
 | 
    62 * metadata: Extract metadata from a leading '---'-fenced block.
 | 
| 
 | 
    63   See <https://github.com/trentm/python-markdown2/issues/77> for details.
 | 
| 
 | 
    64 * nofollow: Add `rel="nofollow"` to add `<a>` tags with an href. See
 | 
| 
 | 
    65   <http://en.wikipedia.org/wiki/Nofollow>.
 | 
| 
 | 
    66 * pyshell: Treats unindented Python interactive shell sessions as <code>
 | 
| 
 | 
    67   blocks.
 | 
| 
 | 
    68 * link-patterns: Auto-link given regex patterns in text (e.g. bug number
 | 
| 
 | 
    69   references, revision number references).
 | 
| 
 | 
    70 * smarty-pants: Replaces ' and " with curly quotation marks or curly
 | 
| 
 | 
    71   apostrophes.  Replaces --, ---, ..., and . . . with en dashes, em dashes,
 | 
| 
 | 
    72   and ellipses.
 | 
| 
 | 
    73 * toc: The returned HTML string gets a new "toc_html" attribute which is
 | 
| 
 | 
    74   a Table of Contents for the document. (experimental)
 | 
| 
 | 
    75 * xml: Passes one-liner processing instructions and namespaced XML tags.
 | 
| 
 | 
    76 * tables: Tables using the same format as GFM
 | 
| 
 | 
    77   <https://help.github.com/articles/github-flavored-markdown#tables> and
 | 
| 
 | 
    78   PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
 | 
| 
 | 
    79 * wiki-tables: Google Code Wiki-style tables. See
 | 
| 
 | 
    80   <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
 | 
| 
 | 
    81 """
 | 
| 
 | 
    82 
 | 
| 
 | 
    83 # Dev Notes:
 | 
| 
 | 
    84 # - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
 | 
| 
 | 
     85 #   not yet sure if there are implications with this. Compare 'pydoc sre'
 | 
| 
 | 
    86 #   and 'perldoc perlre'.
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 __version_info__ = (2, 3, 1)
 | 
| 
 | 
    89 __version__ = '.'.join(map(str, __version_info__))
 | 
| 
 | 
    90 __author__ = "Trent Mick"
 | 
| 
 | 
    91 
 | 
| 
 | 
    92 import os
 | 
| 
 | 
    93 import sys
 | 
| 
 | 
    94 from pprint import pprint, pformat
 | 
| 
 | 
    95 import re
 | 
| 
 | 
    96 import logging
 | 
| 
 | 
    97 try:
 | 
| 
 | 
    98     from hashlib import md5
 | 
| 
 | 
    99 except ImportError:
 | 
| 
 | 
   100     from md5 import md5
 | 
| 
 | 
   101 import optparse
 | 
| 
 | 
   102 from random import random, randint
 | 
| 
 | 
   103 import codecs
 | 
| 
 | 
   104 
 | 
| 
 | 
   105 
 | 
| 
 | 
   106 #---- Python version compat
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 try:
 | 
| 
 | 
   109     from urllib.parse import quote # python3
 | 
| 
 | 
   110 except ImportError:
 | 
| 
 | 
   111     from urllib import quote # python2
 | 
| 
 | 
   112 
 | 
| 
 | 
   113 if sys.version_info[:2] < (2,4):
 | 
| 
 | 
   114     from sets import Set as set
 | 
| 
 | 
   115     def reversed(sequence):
 | 
| 
 | 
   116         for i in sequence[::-1]:
 | 
| 
 | 
   117             yield i
 | 
| 
 | 
   118 
 | 
| 
 | 
   119 # Use `bytes` for byte strings and `unicode` for unicode strings (str in Py3).
 | 
| 
 | 
   120 if sys.version_info[0] <= 2:
 | 
| 
 | 
   121     py3 = False
 | 
| 
 | 
   122     try:
 | 
| 
 | 
   123         bytes
 | 
| 
 | 
   124     except NameError:
 | 
| 
 | 
   125         bytes = str
 | 
| 
 | 
   126     base_string_type = basestring
 | 
| 
 | 
   127 elif sys.version_info[0] >= 3:
 | 
| 
 | 
   128     py3 = True
 | 
| 
 | 
   129     unicode = str
 | 
| 
 | 
   130     base_string_type = str
 | 
| 
 | 
   131 
 | 
| 
 | 
   132 
 | 
| 
 | 
   133 
 | 
| 
 | 
   134 #---- globals
 | 
| 
 | 
   135 
 | 
| 
 | 
   136 DEBUG = False
 | 
| 
 | 
   137 log = logging.getLogger("markdown")
 | 
| 
 | 
   138 
 | 
| 
 | 
   139 DEFAULT_TAB_WIDTH = 4
 | 
| 
 | 
   140 
 | 
| 
 | 
   141 
 | 
| 
 | 
   142 SECRET_SALT = bytes(randint(0, 1000000))
 | 
| 
 | 
   143 def _hash_text(s):
 | 
| 
 | 
   144     return 'md5-' + md5(SECRET_SALT + s.encode("utf-8")).hexdigest()
 | 
| 
 | 
   145 
 | 
| 
 | 
   146 # Table of hash values for escaped characters:
 | 
| 
 | 
   147 g_escape_table = dict([(ch, _hash_text(ch))
 | 
| 
 | 
   148     for ch in '\\`*_{}[]()>#+-.!'])
 | 
| 
 | 
   149 
 | 
| 
 | 
   150 
 | 
| 
 | 
   151 
 | 
| 
 | 
   152 #---- exceptions
 | 
| 
 | 
   153 
 | 
| 
 | 
   154 class MarkdownError(Exception):
 | 
| 
 | 
   155     pass
 | 
| 
 | 
   156 
 | 
| 
 | 
   157 
 | 
| 
 | 
   158 
 | 
| 
 | 
   159 #---- public api
 | 
| 
 | 
   160 
 | 
| 
 | 
   161 def markdown_path(path, encoding="utf-8",
 | 
| 
 | 
   162                   html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
 | 
| 
 | 
   163                   safe_mode=None, extras=None, link_patterns=None,
 | 
| 
 | 
   164                   use_file_vars=False):
 | 
| 
 | 
   165     fp = codecs.open(path, 'r', encoding)
 | 
| 
 | 
   166     text = fp.read()
 | 
| 
 | 
   167     fp.close()
 | 
| 
 | 
   168     return Markdown(html4tags=html4tags, tab_width=tab_width,
 | 
| 
 | 
   169                     safe_mode=safe_mode, extras=extras,
 | 
| 
 | 
   170                     link_patterns=link_patterns,
 | 
| 
 | 
   171                     use_file_vars=use_file_vars).convert(text)
 | 
| 
 | 
   172 
 | 
| 
 | 
   173 def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
 | 
| 
 | 
   174              safe_mode=None, extras=None, link_patterns=None,
 | 
| 
 | 
   175              use_file_vars=False):
 | 
| 
 | 
   176     return Markdown(html4tags=html4tags, tab_width=tab_width,
 | 
| 
 | 
   177                     safe_mode=safe_mode, extras=extras,
 | 
| 
 | 
   178                     link_patterns=link_patterns,
 | 
| 
 | 
   179                     use_file_vars=use_file_vars).convert(text)
 | 
| 
 | 
   180 
 | 
| 
 | 
   181 class Markdown(object):
 | 
| 
 | 
    # The dict of "extras" to enable in processing -- a mapping of
    # extra name to argument for the extra. Most extras do not have an
    # argument, in which case the value is None.
    #
    # This can be set via (a) subclassing and (b) the constructor
    # "extras" argument.
    extras = None

    # Per-conversion state. These class-level defaults are placeholders:
    # reset() rebinds each of them to a fresh empty dict on the instance.
    urls = None
    titles = None
    html_blocks = None
    html_spans = None
    html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py

    # Used to track when we're inside an ordered or unordered list
    # (see _ProcessListItems() for details):
    # reset() sets this back to 0.
    list_level = 0

    # Matches a line made up solely of spaces and/or tabs; convert() uses it
    # to blank out such lines.
    _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
 | 
| 
 | 
   201 
 | 
| 
 | 
   202     def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
 | 
| 
 | 
   203                  extras=None, link_patterns=None, use_file_vars=False):
 | 
| 
 | 
   204         if html4tags:
 | 
| 
 | 
   205             self.empty_element_suffix = ">"
 | 
| 
 | 
   206         else:
 | 
| 
 | 
   207             self.empty_element_suffix = " />"
 | 
| 
 | 
   208         self.tab_width = tab_width
 | 
| 
 | 
   209 
 | 
| 
 | 
   210         # For compatibility with earlier markdown2.py and with
 | 
| 
 | 
   211         # markdown.py's safe_mode being a boolean,
 | 
| 
 | 
   212         #   safe_mode == True -> "replace"
 | 
| 
 | 
   213         if safe_mode is True:
 | 
| 
 | 
   214             self.safe_mode = "replace"
 | 
| 
 | 
   215         else:
 | 
| 
 | 
   216             self.safe_mode = safe_mode
 | 
| 
 | 
   217 
 | 
| 
 | 
   218         # Massaging and building the "extras" info.
 | 
| 
 | 
   219         if self.extras is None:
 | 
| 
 | 
   220             self.extras = {}
 | 
| 
 | 
   221         elif not isinstance(self.extras, dict):
 | 
| 
 | 
   222             self.extras = dict([(e, None) for e in self.extras])
 | 
| 
 | 
   223         if extras:
 | 
| 
 | 
   224             if not isinstance(extras, dict):
 | 
| 
 | 
   225                 extras = dict([(e, None) for e in extras])
 | 
| 
 | 
   226             self.extras.update(extras)
 | 
| 
 | 
   227         assert isinstance(self.extras, dict)
 | 
| 
 | 
   228         if "toc" in self.extras and not "header-ids" in self.extras:
 | 
| 
 | 
   229             self.extras["header-ids"] = None   # "toc" implies "header-ids"
 | 
| 
 | 
   230         self._instance_extras = self.extras.copy()
 | 
| 
 | 
   231 
 | 
| 
 | 
   232         self.link_patterns = link_patterns
 | 
| 
 | 
   233         self.use_file_vars = use_file_vars
 | 
| 
 | 
   234         self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)
 | 
| 
 | 
   235 
 | 
| 
 | 
   236         self._escape_table = g_escape_table.copy()
 | 
| 
 | 
   237         if "smarty-pants" in self.extras:
 | 
| 
 | 
   238             self._escape_table['"'] = _hash_text('"')
 | 
| 
 | 
   239             self._escape_table["'"] = _hash_text("'")
 | 
| 
 | 
   240 
 | 
| 
 | 
   241     def reset(self):
 | 
| 
 | 
   242         self.urls = {}
 | 
| 
 | 
   243         self.titles = {}
 | 
| 
 | 
   244         self.html_blocks = {}
 | 
| 
 | 
   245         self.html_spans = {}
 | 
| 
 | 
   246         self.list_level = 0
 | 
| 
 | 
   247         self.extras = self._instance_extras.copy()
 | 
| 
 | 
   248         if "footnotes" in self.extras:
 | 
| 
 | 
   249             self.footnotes = {}
 | 
| 
 | 
   250             self.footnote_ids = []
 | 
| 
 | 
   251         if "header-ids" in self.extras:
 | 
| 
 | 
   252             self._count_from_header_id = {} # no `defaultdict` in Python 2.4
 | 
| 
 | 
   253         if "metadata" in self.extras:
 | 
| 
 | 
   254             self.metadata = {}
 | 
| 
 | 
   255 
 | 
| 
 | 
    # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
    # should only be used in <a> tags with an "href" attribute.
    #
    # Group 1 is the tag name ("a", any case); group 2 is everything from
    # just after the tag name up to and including "href=". convert() uses
    # the two groups to splice ` rel="nofollow"` in between them.
    _a_nofollow = re.compile(r"<(a)([^>]*href=)", re.IGNORECASE)
 | 
| 
 | 
   259 
 | 
| 
 | 
    def convert(self, text):
        """Convert the given text.

        `text` may be a unicode string or a byte string; a byte string is
        decoded as UTF-8. Per-document state is cleared via reset() first,
        so one instance can be used for several conversions.

        Returns a `UnicodeWithAttrs` built from the generated HTML. When
        the "toc" extra is enabled its `_toc` attribute is set, and when
        the "metadata" extra is enabled its `metadata` attribute is set.
        """
        # Main function. The order in which other subs are called here is
        # essential. Link and image substitutions need to happen before
        # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
        # and <img> tags get encoded.

        # Clear the global hashes. If we don't clear these, you get conflicts
        # from other articles when generating a page which contains more than
        # one article (e.g. an index page that shows the N most recent
        # articles):
        self.reset()

        if not isinstance(text, unicode):
            #TODO: perhaps shouldn't presume UTF-8 for string input?
            text = unicode(text, 'utf-8')

        if self.use_file_vars:
            # Look for emacs-style file variable hints.
            emacs_vars = self._get_emacs_vars(text)
            if "markdown-extras" in emacs_vars:
                # The value is a space/comma separated list of extras, each
                # either "name" or "name=arg"; an integer-looking arg is
                # converted to int, anything else is kept as a string.
                # NOTE(review): these extras are added after reset() has
                # already run, so the per-extra state that reset() creates
                # (e.g. self.footnotes, self.metadata) is not initialised
                # for extras enabled only here -- confirm whether that
                # matters.
                splitter = re.compile("[ ,]+")
                for e in splitter.split(emacs_vars["markdown-extras"]):
                    if '=' in e:
                        ename, earg = e.split('=', 1)
                        try:
                            earg = int(earg)
                        except ValueError:
                            pass
                    else:
                        ename, earg = e, None
                    self.extras[ename] = earg

        # Standardize line endings:
        text = re.sub("\r\n|\r", "\n", text)

        # Make sure $text ends with a couple of newlines:
        text += "\n\n"

        # Convert all tabs to spaces.
        text = self._detab(text)

        # Strip any lines consisting only of spaces and tabs.
        # This makes subsequent regexen easier to write, because we can
        # match consecutive blank lines with /\n+/ instead of something
        # contorted like /[ \t]*\n+/ .
        text = self._ws_only_line_re.sub("", text)

        # strip metadata from head and extract
        if "metadata" in self.extras:
            text = self._extract_metadata(text)

        # Subclass hook; a no-op in this class.
        text = self.preprocess(text)

        # Fenced code blocks are handled here when NOT in safe mode, and
        # only after the HTML blocks have been hashed when in safe mode
        # (see the second call below).
        if "fenced-code-blocks" in self.extras and not self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        if self.safe_mode:
            text = self._hash_html_spans(text)

        # Turn block-level HTML blocks into hash entries
        text = self._hash_html_blocks(text, raw=True)

        if "fenced-code-blocks" in self.extras and self.safe_mode:
            text = self._do_fenced_code_blocks(text)

        # Strip link definitions, store in hashes.
        if "footnotes" in self.extras:
            # Must do footnotes first because an unlucky footnote defn
            # looks like a link defn:
            #   [^4]: this "looks like a link defn"
            text = self._strip_footnote_definitions(text)
        text = self._strip_link_definitions(text)

        text = self._run_block_gamut(text)

        if "footnotes" in self.extras:
            text = self._add_footnotes(text)

        # Subclass hook; a no-op in this class. Runs before special chars
        # are unescaped and before raw HTML spans are unhashed.
        text = self.postprocess(text)

        text = self._unescape_special_chars(text)

        if self.safe_mode:
            text = self._unhash_html_spans(text)

        if "nofollow" in self.extras:
            # Insert rel="nofollow" right after the tag name of every <a>
            # tag that carries an href attribute.
            text = self._a_nofollow.sub(r'<\1 rel="nofollow"\2', text)

        text += "\n"

        # Wrap the result so extra attributes can be attached to it.
        rv = UnicodeWithAttrs(text)
        if "toc" in self.extras:
            rv._toc = self._toc
        if "metadata" in self.extras:
            rv.metadata = self.metadata
        return rv
 | 
| 
 | 
   357 
 | 
| 
 | 
   358     def postprocess(self, text):
 | 
| 
 | 
   359         """A hook for subclasses to do some postprocessing of the html, if
 | 
| 
 | 
   360         desired. This is called before unescaping of special chars and
 | 
| 
 | 
   361         unhashing of raw HTML spans.
 | 
| 
 | 
   362         """
 | 
| 
 | 
   363         return text
 | 
| 
 | 
   364 
 | 
| 
 | 
   365     def preprocess(self, text):
 | 
| 
 | 
   366         """A hook for subclasses to do some preprocessing of the Markdown, if
 | 
| 
 | 
   367         desired. This is called after basic formatting of the text, but prior
 | 
| 
 | 
   368         to any extras, safe mode, etc. processing.
 | 
| 
 | 
   369         """
 | 
| 
 | 
   370         return text
 | 
| 
 | 
   371 
 | 
| 
 | 
    # Is metadata if the content starts with '---'-fenced `key: value`
    # pairs. E.g. (indented for presentation):
    #   ---
    #   foo: bar
    #   another-var: blah blah
    #   ---
    #
    # Group 1 captures everything between the two fence lines.
    # NOTE(review): the key part `[^ \t:]+` does not exclude "\n", so a
    # "key" can span several lines and group 1 may therefore contain lines
    # that have no ':' in them.
    _metadata_pat = re.compile("""^---[ \t]*\n((?:[ \t]*[^ \t:]+[ \t]*:[^\n]*\n)+)---[ \t]*\n""")
 | 
| 
 | 
   379 
 | 
| 
 | 
   380     def _extract_metadata(self, text):
 | 
| 
 | 
   381         # fast test
 | 
| 
 | 
   382         if not text.startswith("---"):
 | 
| 
 | 
   383             return text
 | 
| 
 | 
   384         match = self._metadata_pat.match(text)
 | 
| 
 | 
   385         if not match:
 | 
| 
 | 
   386             return text
 | 
| 
 | 
   387 
 | 
| 
 | 
   388         tail = text[len(match.group(0)):]
 | 
| 
 | 
   389         metadata_str = match.group(1).strip()
 | 
| 
 | 
   390         for line in metadata_str.split('\n'):
 | 
| 
 | 
   391             key, value = line.split(':', 1)
 | 
| 
 | 
   392             self.metadata[key.strip()] = value.strip()
 | 
| 
 | 
   393 
 | 
| 
 | 
   394         return tail
 | 
| 
 | 
   395 
 | 
| 
 | 
   396 
 | 
| 
 | 
    # Matches a one-line "-*- ... -*-" file-variables marker; group 1 is the
    # text between the two markers with surrounding whitespace excluded.
    _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
    # This regular expression is intended to match blocks like this:
    #    PREFIX Local Variables: SUFFIX
    #    PREFIX mode: Tcl SUFFIX
    #    PREFIX End: SUFFIX
    # Some notes:
    # - "[ \t]" is used instead of "\s" to specifically exclude newlines
    # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
    #   not like anything other than Unix-style line terminators.
    # - The named groups are "prefix", "suffix" and "content"; "content"
    #   runs from the line after "Local Variables:" through "PREFIX End:".
    # NOTE(review): "[^\r\n|\n|\r]" in the prefix group is a character
    # class, not an alternation, so besides CR and LF it also excludes a
    # literal '|' from the prefix -- confirm whether that is intended.
    _emacs_local_vars_pat = re.compile(r"""^
        (?P<prefix>(?:[^\r\n|\n|\r])*?)
        [\ \t]*Local\ Variables:[\ \t]*
        (?P<suffix>.*?)(?:\r\n|\n|\r)
        (?P<content>.*?\1End:)
        """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
 | 
| 
 | 
   412 
 | 
| 
 | 
   413     def _get_emacs_vars(self, text):
 | 
| 
 | 
   414         """Return a dictionary of emacs-style local variables.
 | 
| 
 | 
   415 
 | 
| 
 | 
   416         Parsing is done loosely according to this spec (and according to
 | 
| 
 | 
   417         some in-practice deviations from this):
 | 
| 
 | 
   418         http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
 | 
| 
 | 
   419         """
 | 
| 
 | 
   420         emacs_vars = {}
 | 
| 
 | 
   421         SIZE = pow(2, 13) # 8kB
 | 
| 
 | 
   422 
 | 
| 
 | 
   423         # Search near the start for a '-*-'-style one-liner of variables.
 | 
| 
 | 
   424         head = text[:SIZE]
 | 
| 
 | 
   425         if "-*-" in head:
 | 
| 
 | 
   426             match = self._emacs_oneliner_vars_pat.search(head)
 | 
| 
 | 
   427             if match:
 | 
| 
 | 
   428                 emacs_vars_str = match.group(1)
 | 
| 
 | 
   429                 assert '\n' not in emacs_vars_str
 | 
| 
 | 
   430                 emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
 | 
| 
 | 
   431                                   if s.strip()]
 | 
| 
 | 
   432                 if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
 | 
| 
 | 
   433                     # While not in the spec, this form is allowed by emacs:
 | 
| 
 | 
   434                     #   -*- Tcl -*-
 | 
| 
 | 
   435                     # where the implied "variable" is "mode". This form
 | 
| 
 | 
   436                     # is only allowed if there are no other variables.
 | 
| 
 | 
   437                     emacs_vars["mode"] = emacs_var_strs[0].strip()
 | 
| 
 | 
   438                 else:
 | 
| 
 | 
   439                     for emacs_var_str in emacs_var_strs:
 | 
| 
 | 
   440                         try:
 | 
| 
 | 
   441                             variable, value = emacs_var_str.strip().split(':', 1)
 | 
| 
 | 
   442                         except ValueError:
 | 
| 
 | 
   443                             log.debug("emacs variables error: malformed -*- "
 | 
| 
 | 
   444                                       "line: %r", emacs_var_str)
 | 
| 
 | 
   445                             continue
 | 
| 
 | 
   446                         # Lowercase the variable name because Emacs allows "Mode"
 | 
| 
 | 
   447                         # or "mode" or "MoDe", etc.
 | 
| 
 | 
   448                         emacs_vars[variable.lower()] = value.strip()
 | 
| 
 | 
   449 
 | 
| 
 | 
   450         tail = text[-SIZE:]
 | 
| 
 | 
   451         if "Local Variables" in tail:
 | 
| 
 | 
   452             match = self._emacs_local_vars_pat.search(tail)
 | 
| 
 | 
   453             if match:
 | 
| 
 | 
   454                 prefix = match.group("prefix")
 | 
| 
 | 
   455                 suffix = match.group("suffix")
 | 
| 
 | 
   456                 lines = match.group("content").splitlines(0)
 | 
| 
 | 
   457                 #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
 | 
| 
 | 
   458                 #      % (prefix, suffix, match.group("content"), lines)
 | 
| 
 | 
   459 
 | 
| 
 | 
   460                 # Validate the Local Variables block: proper prefix and suffix
 | 
| 
 | 
   461                 # usage.
 | 
| 
 | 
   462                 for i, line in enumerate(lines):
 | 
| 
 | 
   463                     if not line.startswith(prefix):
 | 
| 
 | 
   464                         log.debug("emacs variables error: line '%s' "
 | 
| 
 | 
   465                                   "does not use proper prefix '%s'"
 | 
| 
 | 
   466                                   % (line, prefix))
 | 
| 
 | 
   467                         return {}
 | 
| 
 | 
   468                     # Don't validate suffix on last line. Emacs doesn't care,
 | 
| 
 | 
   469                     # neither should we.
 | 
| 
 | 
   470                     if i != len(lines)-1 and not line.endswith(suffix):
 | 
| 
 | 
   471                         log.debug("emacs variables error: line '%s' "
 | 
| 
 | 
   472                                   "does not use proper suffix '%s'"
 | 
| 
 | 
   473                                   % (line, suffix))
 | 
| 
 | 
   474                         return {}
 | 
| 
 | 
   475 
 | 
| 
 | 
   476                 # Parse out one emacs var per line.
 | 
| 
 | 
   477                 continued_for = None
 | 
| 
 | 
   478                 for line in lines[:-1]: # no var on the last line ("PREFIX End:")
 | 
| 
 | 
   479                     if prefix: line = line[len(prefix):] # strip prefix
 | 
| 
 | 
   480                     if suffix: line = line[:-len(suffix)] # strip suffix
 | 
| 
 | 
   481                     line = line.strip()
 | 
| 
 | 
   482                     if continued_for:
 | 
| 
 | 
   483                         variable = continued_for
 | 
| 
 | 
   484                         if line.endswith('\\'):
 | 
| 
 | 
   485                             line = line[:-1].rstrip()
 | 
| 
 | 
   486                         else:
 | 
| 
 | 
   487                             continued_for = None
 | 
| 
 | 
   488                         emacs_vars[variable] += ' ' + line
 | 
| 
 | 
   489                     else:
 | 
| 
 | 
   490                         try:
 | 
| 
 | 
   491                             variable, value = line.split(':', 1)
 | 
| 
 | 
   492                         except ValueError:
 | 
| 
 | 
   493                             log.debug("local variables error: missing colon "
 | 
| 
 | 
   494                                       "in local variables entry: '%s'" % line)
 | 
| 
 | 
   495                             continue
 | 
| 
 | 
   496                         # Do NOT lowercase the variable name, because Emacs only
 | 
| 
 | 
   497                         # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
 | 
| 
 | 
   498                         value = value.strip()
 | 
| 
 | 
   499                         if value.endswith('\\'):
 | 
| 
 | 
   500                             value = value[:-1].rstrip()
 | 
| 
 | 
   501                             continued_for = variable
 | 
| 
 | 
   502                         else:
 | 
| 
 | 
   503                             continued_for = None
 | 
| 
 | 
   504                         emacs_vars[variable] = value
 | 
| 
 | 
   505 
 | 
| 
 | 
   506         # Unquote values.
 | 
| 
 | 
   507         for var, val in list(emacs_vars.items()):
 | 
| 
 | 
   508             if len(val) > 1 and (val.startswith('"') and val.endswith('"')
 | 
| 
 | 
   509                or val.startswith('"') and val.endswith('"')):
 | 
| 
 | 
   510                 emacs_vars[var] = val[1:-1]
 | 
| 
 | 
   511 
 | 
| 
 | 
   512         return emacs_vars
 | 
| 
 | 
   513 
 | 
| 
 | 
    # Cribbed from a post by Bart Lateur:
    # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
    #
    # Matches the shortest run of characters (never crossing a newline)
    # followed by one tab; group 1 is the text in front of the tab.
    _detab_re = re.compile(r'(.*?)\t', re.M)
 | 
| 
 | 
   517     def _detab_sub(self, match):
 | 
| 
 | 
   518         g1 = match.group(1)
 | 
| 
 | 
   519         return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
 | 
| 
 | 
   520     def _detab(self, text):
 | 
| 
 | 
   521         r"""Remove (leading?) tabs from a file.
 | 
| 
 | 
   522 
 | 
| 
 | 
   523             >>> m = Markdown()
 | 
| 
 | 
   524             >>> m._detab("\tfoo")
 | 
| 
 | 
   525             '    foo'
 | 
| 
 | 
   526             >>> m._detab("  \tfoo")
 | 
| 
 | 
   527             '    foo'
 | 
| 
 | 
   528             >>> m._detab("\t  foo")
 | 
| 
 | 
   529             '      foo'
 | 
| 
 | 
   530             >>> m._detab("  foo")
 | 
| 
 | 
   531             '  foo'
 | 
| 
 | 
   532             >>> m._detab("  foo\n\tbar\tblam")
 | 
| 
 | 
   533             '  foo\n    bar blam'
 | 
| 
 | 
   534         """
 | 
| 
 | 
   535         if '\t' not in text:
 | 
| 
 | 
   536             return text
 | 
| 
 | 
   537         return self._detab_re.subn(self._detab_sub, text)[0]
 | 
| 
 | 
   538 
 | 
| 
 | 
    # I broke out the html5 tags here and add them to _block_tags_a and
    # _block_tags_b.  This way html5 tags are easy to keep track of.
    # (The leading '|' lets the string be appended directly to the
    # alternations below.)
    _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'

    # Tag alternation for the "strict" pattern; unlike _block_tags_b it
    # also contains "ins" and "del".
    _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
    _block_tags_a += _html5tags

    # Strict form: the opening tag must start a line and the matching
    # closing tag must itself start a line.
    _strict_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_a,
        re.X | re.M)

    # Tag alternation for the "liberal" pattern (no "ins"/"del").
    _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
    _block_tags_b += _html5tags

    # Liberal form: same as the strict one, except that the closing tag may
    # be preceded by other text on its line (".*</\2>").
    _liberal_tag_block_re = re.compile(r"""
        (                       # save in \1
            ^                   # start of line  (with re.M)
            <(%s)               # start tag = \2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
        """ % _block_tags_b,
        re.X | re.M)

    # Matches a markdown="1" / markdown='1' attribute together with the
    # whitespace in front of it.
    _html_markdown_attr_re = re.compile(
        r'''\s+markdown=("1"|'1')''')
 | 
| 
 | 
   577     def _hash_html_block_sub(self, match, raw=False):
 | 
| 
 | 
   578         html = match.group(1)
 | 
| 
 | 
   579         if raw and self.safe_mode:
 | 
| 
 | 
   580             html = self._sanitize_html(html)
 | 
| 
 | 
   581         elif 'markdown-in-html' in self.extras and 'markdown=' in html:
 | 
| 
 | 
   582             first_line = html.split('\n', 1)[0]
 | 
| 
 | 
   583             m = self._html_markdown_attr_re.search(first_line)
 | 
| 
 | 
   584             if m:
 | 
| 
 | 
   585                 lines = html.split('\n')
 | 
| 
 | 
   586                 middle = '\n'.join(lines[1:-1])
 | 
| 
 | 
   587                 last_line = lines[-1]
 | 
| 
 | 
   588                 first_line = first_line[:m.start()] + first_line[m.end():]
 | 
| 
 | 
   589                 f_key = _hash_text(first_line)
 | 
| 
 | 
   590                 self.html_blocks[f_key] = first_line
 | 
| 
 | 
   591                 l_key = _hash_text(last_line)
 | 
| 
 | 
   592                 self.html_blocks[l_key] = last_line
 | 
| 
 | 
   593                 return ''.join(["\n\n", f_key,
 | 
| 
 | 
   594                     "\n\n", middle, "\n\n",
 | 
| 
 | 
   595                     l_key, "\n\n"])
 | 
| 
 | 
   596         key = _hash_text(html)
 | 
| 
 | 
   597         self.html_blocks[key] = html
 | 
| 
 | 
   598         return "\n\n" + key + "\n\n"
 | 
| 
 | 
   599 
 | 
| 
 | 
    def _hash_html_blocks(self, text, raw=False):
        """Hashify HTML blocks

        We only want to do this for block-level HTML tags, such as headers,
        lists, and tables. That's because we still want to wrap <p>s around
        "paragraphs" that are wrapped in non-block-level tags, such as anchors,
        phrase emphasis, and spans. The list of tags we're looking for is
        hard-coded.

        Each block found is stored in `self.html_blocks` and replaced in the
        text by its hash key surrounded by blank lines. Four kinds of blocks
        are handled, in this order: block-level tags (strict, then liberal
        matching), <hr> tags, standalone HTML comments and, when the "xml"
        extra is enabled, XML processing instructions and namespaced
        one-liner tags.

        @param text {str} the text to scan.
        @param raw {boolean} indicates if these are raw HTML blocks in
            the original source. It makes a difference in "safe" mode.
        @returns {str} the text with the HTML blocks replaced by hash keys.
        """
        # Fast path: without a '<' there cannot be any HTML block.
        if '<' not in text:
            return text

        # Pass `raw` value into our calls to self._hash_html_block_sub.
        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)

        # First, look for nested blocks, e.g.:
        #   <div>
        #       <div>
        #       tags for inner block must be indented.
        #       </div>
        #   </div>
        #
        # The outermost tags must start at the left margin for this to match, and
        # the inner nested divs must be indented.
        # We need to do this before the next, more liberal match, because the next
        # match will start at the first `<div>` and stop at the first `</div>`.
        text = self._strict_tag_block_re.sub(hash_html_block_sub, text)

        # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
        text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)

        # Special case just for <hr />. It was easier to make a special
        # case than to make the other regex more complicated.
        if "<hr" in text:
            _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
            text = _hr_tag_re.sub(hash_html_block_sub, text)

        # Special case for standalone HTML comments:
        if "<!--" in text:
            # `start` is the offset from which the next "<!--" is searched.
            start = 0
            while True:
                # Delimiters for next comment block.
                try:
                    start_idx = text.index("<!--", start)
                except ValueError:
                    break
                try:
                    # `end_idx` points just past the closing "-->".
                    end_idx = text.index("-->", start_idx) + 3
                except ValueError:
                    break

                # Start position for next comment block search.
                start = end_idx

                # Validate whitespace before comment.
                if start_idx:
                    # - Up to `tab_width - 1` spaces before start_idx.
                    #   `start_idx` is moved back over those spaces so that
                    #   they become part of the hashed block.
                    for i in range(self.tab_width - 1):
                        if text[start_idx - 1] != ' ':
                            break
                        start_idx -= 1
                        if start_idx == 0:
                            break
                    # - Must be preceded by 2 newlines or hit the start of
                    #   the document.
                    if start_idx == 0:
                        pass
                    elif start_idx == 1 and text[0] == '\n':
                        start_idx = 0  # to match minute detail of Markdown.pl regex
                    elif text[start_idx-2:start_idx] == '\n\n':
                        pass
                    else:
                        # NOTE(review): this `break` ends the whole comment
                        # scan, so any later comments in the text are not
                        # examined -- confirm this is intended (the
                        # trailing-whitespace check below uses `continue`).
                        break

                # Validate whitespace after comment.
                # - Any number of spaces and tabs.
                while end_idx < len(text):
                    if text[end_idx] not in ' \t':
                        break
                    end_idx += 1
                # - Must be following by 2 newlines or hit end of text.
                if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
                    continue

                # Escape and hash (must match `_hash_html_block_sub`).
                html = text[start_idx:end_idx]
                if raw and self.safe_mode:
                    html = self._sanitize_html(html)
                key = _hash_text(html)
                self.html_blocks[key] = html
                # NOTE(review): `start` was computed against the text as it
                # was before this replacement and is not adjusted for the
                # change in length.
                text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]

        if "xml" in self.extras:
            # Treat XML processing instructions and namespaced one-liner
            # tags as if they were block HTML tags. E.g., if standalone
            # (i.e. are their own paragraph), the following do not get
            # wrapped in a <p> tag:
            #    <?foo bar?>
            #
            #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
            _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
            text = _xml_oneliner_re.sub(hash_html_block_sub, text)

        return text
 | 
| 
 | 
   707 
 | 
| 
 | 
   708     def _strip_link_definitions(self, text):
 | 
| 
 | 
   709         # Strips link definitions from text, stores the URLs and titles in
 | 
| 
 | 
   710         # hash references.
 | 
| 
 | 
   711         less_than_tab = self.tab_width - 1
 | 
| 
 | 
   712 
 | 
| 
 | 
   713         # Link defs are in the form:
 | 
| 
 | 
   714         #   [id]: url "optional title"
 | 
| 
 | 
   715         _link_def_re = re.compile(r"""
 | 
| 
 | 
   716             ^[ ]{0,%d}\[(.+)\]: # id = \1
 | 
| 
 | 
   717               [ \t]*
 | 
| 
 | 
   718               \n?               # maybe *one* newline
 | 
| 
 | 
   719               [ \t]*
 | 
| 
 | 
   720             <?(.+?)>?           # url = \2
 | 
| 
 | 
   721               [ \t]*
 | 
| 
 | 
   722             (?:
 | 
| 
 | 
   723                 \n?             # maybe one newline
 | 
| 
 | 
   724                 [ \t]*
 | 
| 
 | 
   725                 (?<=\s)         # lookbehind for whitespace
 | 
| 
 | 
   726                 ['"(]
 | 
| 
 | 
   727                 ([^\n]*)        # title = \3
 | 
| 
 | 
   728                 ['")]
 | 
| 
 | 
   729                 [ \t]*
 | 
| 
 | 
   730             )?  # title is optional
 | 
| 
 | 
   731             (?:\n+|\Z)
 | 
| 
 | 
   732             """ % less_than_tab, re.X | re.M | re.U)
 | 
| 
 | 
   733         return _link_def_re.sub(self._extract_link_def_sub, text)
 | 
| 
 | 
   734 
 | 
| 
 | 
   735     def _extract_link_def_sub(self, match):
 | 
| 
 | 
   736         id, url, title = match.groups()
 | 
| 
 | 
   737         key = id.lower()    # Link IDs are case-insensitive
 | 
| 
 | 
   738         self.urls[key] = self._encode_amps_and_angles(url)
 | 
| 
 | 
   739         if title:
 | 
| 
 | 
   740             self.titles[key] = title
 | 
| 
 | 
   741         return ""
 | 
| 
 | 
   742 
 | 
| 
 | 
   743     def _extract_footnote_def_sub(self, match):
 | 
| 
 | 
   744         id, text = match.groups()
 | 
| 
 | 
   745         text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
 | 
| 
 | 
   746         normed_id = re.sub(r'\W', '-', id)
 | 
| 
 | 
   747         # Ensure footnote text ends with a couple newlines (for some
 | 
| 
 | 
   748         # block gamut matches).
 | 
| 
 | 
   749         self.footnotes[normed_id] = text + "\n\n"
 | 
| 
 | 
   750         return ""
 | 
| 
 | 
   751 
 | 
| 
 | 
   752     def _strip_footnote_definitions(self, text):
 | 
| 
 | 
   753         """A footnote definition looks like this:
 | 
| 
 | 
   754 
 | 
| 
 | 
   755             [^note-id]: Text of the note.
 | 
| 
 | 
   756 
 | 
| 
 | 
   757                 May include one or more indented paragraphs.
 | 
| 
 | 
   758 
 | 
| 
 | 
   759         Where,
 | 
| 
 | 
   760         - The 'note-id' can be pretty much anything, though typically it
 | 
| 
 | 
   761           is the number of the footnote.
 | 
| 
 | 
   762         - The first paragraph may start on the next line, like so:
 | 
| 
 | 
   763 
 | 
| 
 | 
   764             [^note-id]:
 | 
| 
 | 
   765                 Text of the note.
 | 
| 
 | 
   766         """
 | 
| 
 | 
   767         less_than_tab = self.tab_width - 1
 | 
| 
 | 
   768         footnote_def_re = re.compile(r'''
 | 
| 
 | 
   769             ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
 | 
| 
 | 
   770             [ \t]*
 | 
| 
 | 
   771             (                       # footnote text = \2
 | 
| 
 | 
   772               # First line need not start with the spaces.
 | 
| 
 | 
   773               (?:\s*.*\n+)
 | 
| 
 | 
   774               (?:
 | 
| 
 | 
   775                 (?:[ ]{%d} | \t)  # Subsequent lines must be indented.
 | 
| 
 | 
   776                 .*\n+
 | 
| 
 | 
   777               )*
 | 
| 
 | 
   778             )
 | 
| 
 | 
   779             # Lookahead for non-space at line-start, or end of doc.
 | 
| 
 | 
   780             (?:(?=^[ ]{0,%d}\S)|\Z)
 | 
| 
 | 
   781             ''' % (less_than_tab, self.tab_width, self.tab_width),
 | 
| 
 | 
   782             re.X | re.M)
 | 
| 
 | 
   783         return footnote_def_re.sub(self._extract_footnote_def_sub, text)
 | 
| 
 | 
   784 
 | 
| 
 | 
    # Horizontal rule: a whole line made of up to three leading spaces and
    # then three or more '-', '_' or '*' characters, each optionally
    # followed by up to two spaces.
    _hr_re = re.compile(r'^[ ]{0,3}([-_*][ ]{0,2}){3,}$', re.M)
 | 
| 
 | 
   786 
 | 
| 
 | 
   787     def _run_block_gamut(self, text):
 | 
| 
 | 
   788         # These are all the transformations that form block-level
 | 
| 
 | 
   789         # tags like paragraphs, headers, and list items.
 | 
| 
 | 
   790 
 | 
| 
 | 
   791         if "fenced-code-blocks" in self.extras:
 | 
| 
 | 
   792             text = self._do_fenced_code_blocks(text)
 | 
| 
 | 
   793 
 | 
| 
 | 
   794         text = self._do_headers(text)
 | 
| 
 | 
   795 
 | 
| 
 | 
   796         # Do Horizontal Rules:
 | 
| 
 | 
   797         # On the number of spaces in horizontal rules: The spec is fuzzy: "If
 | 
| 
 | 
   798         # you wish, you may use spaces between the hyphens or asterisks."
 | 
| 
 | 
   799         # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
 | 
| 
 | 
   800         # hr chars to one or two. We'll reproduce that limit here.
 | 
| 
 | 
   801         hr = "\n<hr"+self.empty_element_suffix+"\n"
 | 
| 
 | 
   802         text = re.sub(self._hr_re, hr, text)
 | 
| 
 | 
   803 
 | 
| 
 | 
   804         text = self._do_lists(text)
 | 
| 
 | 
   805 
 | 
| 
 | 
   806         if "pyshell" in self.extras:
 | 
| 
 | 
   807             text = self._prepare_pyshell_blocks(text)
 | 
| 
 | 
   808         if "wiki-tables" in self.extras:
 | 
| 
 | 
   809             text = self._do_wiki_tables(text)
 | 
| 
 | 
   810         if "tables" in self.extras:
 | 
| 
 | 
   811             text = self._do_tables(text)
 | 
| 
 | 
   812 
 | 
| 
 | 
   813         text = self._do_code_blocks(text)
 | 
| 
 | 
   814 
 | 
| 
 | 
   815         text = self._do_block_quotes(text)
 | 
| 
 | 
   816 
 | 
| 
 | 
   817         # We already ran _HashHTMLBlocks() before, in Markdown(), but that
 | 
| 
 | 
   818         # was to escape raw HTML in the original Markdown source. This time,
 | 
| 
 | 
   819         # we're escaping the markup we've just created, so that we don't wrap
 | 
| 
 | 
   820         # <p> tags around block-level tags.
 | 
| 
 | 
   821         text = self._hash_html_blocks(text)
 | 
| 
 | 
   822 
 | 
| 
 | 
   823         text = self._form_paragraphs(text)
 | 
| 
 | 
   824 
 | 
| 
 | 
   825         return text
 | 
| 
 | 
   826 
 | 
| 
 | 
   827     def _pyshell_block_sub(self, match):
 | 
| 
 | 
   828         lines = match.group(0).splitlines(0)
 | 
| 
 | 
   829         _dedentlines(lines)
 | 
| 
 | 
   830         indent = ' ' * self.tab_width
 | 
| 
 | 
   831         s = ('\n' # separate from possible cuddled paragraph
 | 
| 
 | 
   832              + indent + ('\n'+indent).join(lines)
 | 
| 
 | 
   833              + '\n\n')
 | 
| 
 | 
   834         return s
 | 
| 
 | 
   835 
 | 
| 
 | 
   836     def _prepare_pyshell_blocks(self, text):
 | 
| 
 | 
   837         """Ensure that Python interactive shell sessions are put in
 | 
| 
 | 
   838         code blocks -- even if not properly indented.
 | 
| 
 | 
   839         """
 | 
| 
 | 
   840         if ">>>" not in text:
 | 
| 
 | 
   841             return text
 | 
| 
 | 
   842 
 | 
| 
 | 
   843         less_than_tab = self.tab_width - 1
 | 
| 
 | 
   844         _pyshell_block_re = re.compile(r"""
 | 
| 
 | 
   845             ^([ ]{0,%d})>>>[ ].*\n   # first line
 | 
| 
 | 
   846             ^(\1.*\S+.*\n)*         # any number of subsequent lines
 | 
| 
 | 
   847             ^\n                     # ends with a blank line
 | 
| 
 | 
   848             """ % less_than_tab, re.M | re.X)
 | 
| 
 | 
   849 
 | 
| 
 | 
   850         return _pyshell_block_re.sub(self._pyshell_block_sub, text)
 | 
| 
 | 
   851 
 | 
| 
 | 
   852     def _table_sub(self, match):
 | 
| 
 | 
   853         head, underline, body = match.groups()
 | 
| 
 | 
   854 
 | 
| 
 | 
   855         # Determine aligns for columns.
 | 
| 
 | 
   856         cols = [cell.strip() for cell in underline.strip('| \t\n').split('|')]
 | 
| 
 | 
   857         align_from_col_idx = {}
 | 
| 
 | 
   858         for col_idx, col in enumerate(cols):
 | 
| 
 | 
   859             if col[0] == ':' and col[-1] == ':':
 | 
| 
 | 
   860                 align_from_col_idx[col_idx] = ' align="center"'
 | 
| 
 | 
   861             elif col[0] == ':':
 | 
| 
 | 
   862                 align_from_col_idx[col_idx] = ' align="left"'
 | 
| 
 | 
   863             elif col[-1] == ':':
 | 
| 
 | 
   864                 align_from_col_idx[col_idx] = ' align="right"'
 | 
| 
 | 
   865 
 | 
| 
 | 
   866         # thead
 | 
| 
 | 
   867         hlines = ['<table>', '<thead>', '<tr>']
 | 
| 
 | 
   868         cols = [cell.strip() for cell in head.strip('| \t\n').split('|')]
 | 
| 
 | 
   869         for col_idx, col in enumerate(cols):
 | 
| 
 | 
   870             hlines.append('  <th%s>%s</th>' % (
 | 
| 
 | 
   871                 align_from_col_idx.get(col_idx, ''),
 | 
| 
 | 
   872                 self._run_span_gamut(col)
 | 
| 
 | 
   873             ))
 | 
| 
 | 
   874         hlines.append('</tr>')
 | 
| 
 | 
   875         hlines.append('</thead>')
 | 
| 
 | 
   876 
 | 
| 
 | 
   877         # tbody
 | 
| 
 | 
   878         hlines.append('<tbody>')
 | 
| 
 | 
   879         for line in body.strip('\n').split('\n'):
 | 
| 
 | 
   880             hlines.append('<tr>')
 | 
| 
 | 
   881             cols = [cell.strip() for cell in line.strip('| \t\n').split('|')]
 | 
| 
 | 
   882             for col_idx, col in enumerate(cols):
 | 
| 
 | 
   883                 hlines.append('  <td%s>%s</td>' % (
 | 
| 
 | 
   884                     align_from_col_idx.get(col_idx, ''),
 | 
| 
 | 
   885                     self._run_span_gamut(col)
 | 
| 
 | 
   886                 ))
 | 
| 
 | 
   887             hlines.append('</tr>')
 | 
| 
 | 
   888         hlines.append('</tbody>')
 | 
| 
 | 
   889         hlines.append('</table>')
 | 
| 
 | 
   890 
 | 
| 
 | 
   891         return '\n'.join(hlines) + '\n'
 | 
| 
 | 
   892 
 | 
| 
 | 
   893     def _do_tables(self, text):
 | 
| 
 | 
   894         """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
 | 
| 
 | 
   895         https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
 | 
| 
 | 
   896         """
 | 
| 
 | 
   897         less_than_tab = self.tab_width - 1
 | 
| 
 | 
   898         table_re = re.compile(r'''
 | 
| 
 | 
   899                 (?:(?<=\n\n)|\A\n?)             # leading blank line
 | 
| 
 | 
   900 
 | 
| 
 | 
   901                 ^[ ]{0,%d}                      # allowed whitespace
 | 
| 
 | 
   902                 (.*[|].*)  \n                   # $1: header row (at least one pipe)
 | 
| 
 | 
   903 
 | 
| 
 | 
   904                 ^[ ]{0,%d}                      # allowed whitespace
 | 
| 
 | 
   905                 (                               # $2: underline row
 | 
| 
 | 
   906                     # underline row with leading bar
 | 
| 
 | 
   907                     (?:  \|\ *:?-+:?\ *  )+  \|?  \n
 | 
| 
 | 
   908                     |
 | 
| 
 | 
   909                     # or, underline row without leading bar
 | 
| 
 | 
   910                     (?:  \ *:?-+:?\ *\|  )+  (?:  \ *:?-+:?\ *  )?  \n
 | 
| 
 | 
   911                 )
 | 
| 
 | 
   912 
 | 
| 
 | 
   913                 (                               # $3: data rows
 | 
| 
 | 
   914                     (?:
 | 
| 
 | 
   915                         ^[ ]{0,%d}(?!\ )         # ensure line begins with 0 to less_than_tab spaces
 | 
| 
 | 
   916                         .*\|.*  \n
 | 
| 
 | 
   917                     )+
 | 
| 
 | 
   918                 )
 | 
| 
 | 
   919             ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
 | 
| 
 | 
   920         return table_re.sub(self._table_sub, text)
 | 
| 
 | 
   921 
 | 
| 
 | 
   922     def _wiki_table_sub(self, match):
 | 
| 
 | 
   923         ttext = match.group(0).strip()
 | 
| 
 | 
   924         #print 'wiki table: %r' % match.group(0)
 | 
| 
 | 
   925         rows = []
 | 
| 
 | 
   926         for line in ttext.splitlines(0):
 | 
| 
 | 
   927             line = line.strip()[2:-2].strip()
 | 
| 
 | 
   928             row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
 | 
| 
 | 
   929             rows.append(row)
 | 
| 
 | 
   930         #pprint(rows)
 | 
| 
 | 
   931         hlines = ['<table>', '<tbody>']
 | 
| 
 | 
   932         for row in rows:
 | 
| 
 | 
   933             hrow = ['<tr>']
 | 
| 
 | 
   934             for cell in row:
 | 
| 
 | 
   935                 hrow.append('<td>')
 | 
| 
 | 
   936                 hrow.append(self._run_span_gamut(cell))
 | 
| 
 | 
   937                 hrow.append('</td>')
 | 
| 
 | 
   938             hrow.append('</tr>')
 | 
| 
 | 
   939             hlines.append(''.join(hrow))
 | 
| 
 | 
   940         hlines += ['</tbody>', '</table>']
 | 
| 
 | 
   941         return '\n'.join(hlines) + '\n'
 | 
| 
 | 
   942 
 | 
| 
 | 
   943     def _do_wiki_tables(self, text):
 | 
| 
 | 
   944         # Optimization.
 | 
| 
 | 
   945         if "||" not in text:
 | 
| 
 | 
   946             return text
 | 
| 
 | 
   947 
 | 
| 
 | 
   948         less_than_tab = self.tab_width - 1
 | 
| 
 | 
   949         wiki_table_re = re.compile(r'''
 | 
| 
 | 
   950             (?:(?<=\n\n)|\A\n?)            # leading blank line
 | 
| 
 | 
   951             ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n  # first line
 | 
| 
 | 
   952             (^\1\|\|.+?\|\|\n)*        # any number of subsequent lines
 | 
| 
 | 
   953             ''' % less_than_tab, re.M | re.X)
 | 
| 
 | 
   954         return wiki_table_re.sub(self._wiki_table_sub, text)
 | 
| 
 | 
   955 
 | 
| 
 | 
   956     def _run_span_gamut(self, text):
 | 
| 
 | 
   957         # These are all the transformations that occur *within* block-level
 | 
| 
 | 
   958         # tags like paragraphs, headers, and list items.
 | 
| 
 | 
   959 
 | 
| 
 | 
   960         text = self._do_code_spans(text)
 | 
| 
 | 
   961 
 | 
| 
 | 
   962         text = self._escape_special_chars(text)
 | 
| 
 | 
   963 
 | 
| 
 | 
   964         # Process anchor and image tags.
 | 
| 
 | 
   965         text = self._do_links(text)
 | 
| 
 | 
   966 
 | 
| 
 | 
   967         # Make links out of things like `<http://example.com/>`
 | 
| 
 | 
   968         # Must come after _do_links(), because you can use < and >
 | 
| 
 | 
   969         # delimiters in inline links like [this](<url>).
 | 
| 
 | 
   970         text = self._do_auto_links(text)
 | 
| 
 | 
   971 
 | 
| 
 | 
   972         if "link-patterns" in self.extras:
 | 
| 
 | 
   973             text = self._do_link_patterns(text)
 | 
| 
 | 
   974 
 | 
| 
 | 
   975         text = self._encode_amps_and_angles(text)
 | 
| 
 | 
   976 
 | 
| 
 | 
   977         text = self._do_italics_and_bold(text)
 | 
| 
 | 
   978 
 | 
| 
 | 
   979         if "smarty-pants" in self.extras:
 | 
| 
 | 
   980             text = self._do_smart_punctuation(text)
 | 
| 
 | 
   981 
 | 
| 
 | 
   982         # Do hard breaks:
 | 
| 
 | 
   983         if "break-on-newline" in self.extras:
 | 
| 
 | 
   984             text = re.sub(r" *\n", "<br%s\n" % self.empty_element_suffix, text)
 | 
| 
 | 
   985         else:
 | 
| 
 | 
   986             text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
 | 
| 
 | 
   987 
 | 
| 
 | 
   988         return text
 | 
| 
 | 
   989 
 | 
| 
 | 
    # Tokenizer used to separate HTML-like markup from plain text.
    # "Sorta" because auto-links are identified as "tag" tokens.
    # The whole pattern is a single capturing group, so `split()` returns
    # plain text and markup tokens alternately, starting with plain text.
    # Four kinds of markup are recognized: tags (with optional quoted
    # attributes), auto-links, comments and processing instructions.
    _sorta_html_tokenize_re = re.compile(r"""
        (
            # tag
            </?
            (?:\w+)                                     # tag name
            (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
            \s*/?>
            |
            # auto-link (e.g., <http://www.activestate.com/>)
            <\w+[^>]*>
            |
            <!--.*?-->      # comment
            |
            <\?.*?\?>       # processing instruction
        )
        """, re.X)
 | 
| 
 | 
  1007 
 | 
| 
 | 
  1008     def _escape_special_chars(self, text):
 | 
| 
 | 
  1009         # Python markdown note: the HTML tokenization here differs from
 | 
| 
 | 
  1010         # that in Markdown.pl, hence the behaviour for subtle cases can
 | 
| 
 | 
  1011         # differ (I believe the tokenizer here does a better job because
 | 
| 
 | 
  1012         # it isn't susceptible to unmatched '<' and '>' in HTML tags).
 | 
| 
 | 
  1013         # Note, however, that '>' is not allowed in an auto-link URL
 | 
| 
 | 
  1014         # here.
 | 
| 
 | 
  1015         escaped = []
 | 
| 
 | 
  1016         is_html_markup = False
 | 
| 
 | 
  1017         for token in self._sorta_html_tokenize_re.split(text):
 | 
| 
 | 
  1018             if is_html_markup:
 | 
| 
 | 
  1019                 # Within tags/HTML-comments/auto-links, encode * and _
 | 
| 
 | 
  1020                 # so they don't conflict with their use in Markdown for
 | 
| 
 | 
  1021                 # italics and strong.  We're replacing each such
 | 
| 
 | 
  1022                 # character with its corresponding MD5 checksum value;
 | 
| 
 | 
  1023                 # this is likely overkill, but it should prevent us from
 | 
| 
 | 
  1024                 # colliding with the escape values by accident.
 | 
| 
 | 
  1025                 escaped.append(token.replace('*', self._escape_table['*'])
 | 
| 
 | 
  1026                                     .replace('_', self._escape_table['_']))
 | 
| 
 | 
  1027             else:
 | 
| 
 | 
  1028                 escaped.append(self._encode_backslash_escapes(token))
 | 
| 
 | 
  1029             is_html_markup = not is_html_markup
 | 
| 
 | 
  1030         return ''.join(escaped)
 | 
| 
 | 
  1031 
 | 
| 
 | 
  1032     def _hash_html_spans(self, text):
 | 
| 
 | 
  1033         # Used for safe_mode.
 | 
| 
 | 
  1034 
 | 
| 
 | 
  1035         def _is_auto_link(s):
 | 
| 
 | 
  1036             if ':' in s and self._auto_link_re.match(s):
 | 
| 
 | 
  1037                 return True
 | 
| 
 | 
  1038             elif '@' in s and self._auto_email_link_re.match(s):
 | 
| 
 | 
  1039                 return True
 | 
| 
 | 
  1040             return False
 | 
| 
 | 
  1041 
 | 
| 
 | 
  1042         tokens = []
 | 
| 
 | 
  1043         is_html_markup = False
 | 
| 
 | 
  1044         for token in self._sorta_html_tokenize_re.split(text):
 | 
| 
 | 
  1045             if is_html_markup and not _is_auto_link(token):
 | 
| 
 | 
  1046                 sanitized = self._sanitize_html(token)
 | 
| 
 | 
  1047                 key = _hash_text(sanitized)
 | 
| 
 | 
  1048                 self.html_spans[key] = sanitized
 | 
| 
 | 
  1049                 tokens.append(key)
 | 
| 
 | 
  1050             else:
 | 
| 
 | 
  1051                 tokens.append(token)
 | 
| 
 | 
  1052             is_html_markup = not is_html_markup
 | 
| 
 | 
  1053         return ''.join(tokens)
 | 
| 
 | 
  1054 
 | 
| 
 | 
  1055     def _unhash_html_spans(self, text):
 | 
| 
 | 
  1056         for key, sanitized in list(self.html_spans.items()):
 | 
| 
 | 
  1057             text = text.replace(key, sanitized)
 | 
| 
 | 
  1058         return text
 | 
| 
 | 
  1059 
 | 
| 
 | 
  1060     def _sanitize_html(self, s):
 | 
| 
 | 
  1061         if self.safe_mode == "replace":
 | 
| 
 | 
  1062             return self.html_removed_text
 | 
| 
 | 
  1063         elif self.safe_mode == "escape":
 | 
| 
 | 
  1064             replacements = [
 | 
| 
 | 
  1065                 ('&', '&'),
 | 
| 
 | 
  1066                 ('<', '<'),
 | 
| 
 | 
  1067                 ('>', '>'),
 | 
| 
 | 
  1068             ]
 | 
| 
 | 
  1069             for before, after in replacements:
 | 
| 
 | 
  1070                 s = s.replace(before, after)
 | 
| 
 | 
  1071             return s
 | 
| 
 | 
  1072         else:
 | 
| 
 | 
  1073             raise MarkdownError("invalid value for 'safe_mode': %r (must be "
 | 
| 
 | 
  1074                                 "'escape' or 'replace')" % self.safe_mode)
 | 
| 
 | 
  1075 
 | 
| 
 | 
    # Matches the end of an inline link: an optional quoted title (preceded
    # by spaces/tabs, available as the named group 'title') followed by the
    # closing ')' at the end of the searched region.
    _inline_link_title = re.compile(r'''
            (                   # \1
              [ \t]+
              (['"])            # quote char = \2
              (?P<title>.*?)
              \2
            )?                  # title is optional
          \)$
        ''', re.X | re.S)
    # Matches the `[id]` tail of a reference link; the id is available as
    # the named group 'id'.
    _tail_of_reference_link_re = re.compile(r'''
          # Match tail of: [text][id]
          [ ]?          # one optional space
          (?:\n[ ]*)?   # one optional newline followed by spaces
          \[
            (?P<id>.*?)
          \]
        ''', re.X | re.S)

    # Matches a (possibly empty) run of whitespace.
    _whitespace = re.compile(r'\s*')

    # Captures the text between '<' and the last '>' in group 1; anything
    # after that '>' is matched as well.
    _strip_anglebrackets = re.compile(r'<(.*)>.*')
 | 
| 
 | 
  1097 
 | 
| 
 | 
  1098     def _find_non_whitespace(self, text, start):
 | 
| 
 | 
  1099         """Returns the index of the first non-whitespace character in text
 | 
| 
 | 
  1100         after (and including) start
 | 
| 
 | 
  1101         """
 | 
| 
 | 
  1102         match = self._whitespace.match(text, start)
 | 
| 
 | 
  1103         return match.end()
 | 
| 
 | 
  1104 
 | 
| 
 | 
  1105     def _find_balanced(self, text, start, open_c, close_c):
 | 
| 
 | 
  1106         """Returns the index where the open_c and close_c characters balance
 | 
| 
 | 
  1107         out - the same number of open_c and close_c are encountered - or the
 | 
| 
 | 
  1108         end of string if it's reached before the balance point is found.
 | 
| 
 | 
  1109         """
 | 
| 
 | 
  1110         i = start
 | 
| 
 | 
  1111         l = len(text)
 | 
| 
 | 
  1112         count = 1
 | 
| 
 | 
  1113         while count > 0 and i < l:
 | 
| 
 | 
  1114             if text[i] == open_c:
 | 
| 
 | 
  1115                 count += 1
 | 
| 
 | 
  1116             elif text[i] == close_c:
 | 
| 
 | 
  1117                 count -= 1
 | 
| 
 | 
  1118             i += 1
 | 
| 
 | 
  1119         return i
 | 
| 
 | 
  1120 
 | 
| 
 | 
  1121     def _extract_url_and_title(self, text, start):
 | 
| 
 | 
  1122         """Extracts the url and (optional) title from the tail of a link"""
 | 
| 
 | 
  1123         # text[start] equals the opening parenthesis
 | 
| 
 | 
  1124         idx = self._find_non_whitespace(text, start+1)
 | 
| 
 | 
  1125         if idx == len(text):
 | 
| 
 | 
  1126             return None, None, None
 | 
| 
 | 
  1127         end_idx = idx
 | 
| 
 | 
  1128         has_anglebrackets = text[idx] == "<"
 | 
| 
 | 
  1129         if has_anglebrackets:
 | 
| 
 | 
  1130             end_idx = self._find_balanced(text, end_idx+1, "<", ">")
 | 
| 
 | 
  1131         end_idx = self._find_balanced(text, end_idx, "(", ")")
 | 
| 
 | 
  1132         match = self._inline_link_title.search(text, idx, end_idx)
 | 
| 
 | 
  1133         if not match:
 | 
| 
 | 
  1134             return None, None, None
 | 
| 
 | 
  1135         url, title = text[idx:match.start()], match.group("title")
 | 
| 
 | 
  1136         if has_anglebrackets:
 | 
| 
 | 
  1137             url = self._strip_anglebrackets.sub(r'\1', url)
 | 
| 
 | 
  1138         return url, title, end_idx
 | 
| 
 | 
  1139 
 | 
| 
 | 
  1140     def _do_links(self, text):
 | 
| 
 | 
  1141         """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
 | 
| 
 | 
  1142 
 | 
| 
 | 
  1143         This is a combination of Markdown.pl's _DoAnchors() and
 | 
| 
 | 
  1144         _DoImages(). They are done together because that simplified the
 | 
| 
 | 
  1145         approach. It was necessary to use a different approach than
 | 
| 
 | 
  1146         Markdown.pl because of the lack of atomic matching support in
 | 
| 
 | 
  1147         Python's regex engine used in $g_nested_brackets.
 | 
| 
 | 
  1148         """
 | 
| 
 | 
  1149         MAX_LINK_TEXT_SENTINEL = 3000  # markdown2 issue 24
 | 
| 
 | 
  1150 
 | 
| 
 | 
  1151         # `anchor_allowed_pos` is used to support img links inside
 | 
| 
 | 
  1152         # anchors, but not anchors inside anchors. An anchor's start
 | 
| 
 | 
  1153         # pos must be `>= anchor_allowed_pos`.
 | 
| 
 | 
  1154         anchor_allowed_pos = 0
 | 
| 
 | 
  1155 
 | 
| 
 | 
  1156         curr_pos = 0
 | 
| 
 | 
  1157         while True: # Handle the next link.
 | 
| 
 | 
  1158             # The next '[' is the start of:
 | 
| 
 | 
  1159             # - an inline anchor:   [text](url "title")
 | 
| 
 | 
  1160             # - a reference anchor: [text][id]
 | 
| 
 | 
  1161             # - an inline img:      
 | 
| 
 | 
  1162             # - a reference img:    ![text][id]
 | 
| 
 | 
  1163             # - a footnote ref:     [^id]
 | 
| 
 | 
  1164             #   (Only if 'footnotes' extra enabled)
 | 
| 
 | 
  1165             # - a footnote defn:    [^id]: ...
 | 
| 
 | 
  1166             #   (Only if 'footnotes' extra enabled) These have already
 | 
| 
 | 
  1167             #   been stripped in _strip_footnote_definitions() so no
 | 
| 
 | 
  1168             #   need to watch for them.
 | 
| 
 | 
  1169             # - a link definition:  [id]: url "title"
 | 
| 
 | 
  1170             #   These have already been stripped in
 | 
| 
 | 
  1171             #   _strip_link_definitions() so no need to watch for them.
 | 
| 
 | 
  1172             # - not markup:         [...anything else...
 | 
| 
 | 
  1173             try:
 | 
| 
 | 
  1174                 start_idx = text.index('[', curr_pos)
 | 
| 
 | 
  1175             except ValueError:
 | 
| 
 | 
  1176                 break
 | 
| 
 | 
  1177             text_length = len(text)
 | 
| 
 | 
  1178 
 | 
| 
 | 
  1179             # Find the matching closing ']'.
 | 
| 
 | 
  1180             # Markdown.pl allows *matching* brackets in link text so we
 | 
| 
 | 
  1181             # will here too. Markdown.pl *doesn't* currently allow
 | 
| 
 | 
  1182             # matching brackets in img alt text -- we'll differ in that
 | 
| 
 | 
  1183             # regard.
 | 
| 
 | 
  1184             bracket_depth = 0
 | 
| 
 | 
  1185             for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
 | 
| 
 | 
  1186                                             text_length)):
 | 
| 
 | 
  1187                 ch = text[p]
 | 
| 
 | 
  1188                 if ch == ']':
 | 
| 
 | 
  1189                     bracket_depth -= 1
 | 
| 
 | 
  1190                     if bracket_depth < 0:
 | 
| 
 | 
  1191                         break
 | 
| 
 | 
  1192                 elif ch == '[':
 | 
| 
 | 
  1193                     bracket_depth += 1
 | 
| 
 | 
  1194             else:
 | 
| 
 | 
  1195                 # Closing bracket not found within sentinel length.
 | 
| 
 | 
  1196                 # This isn't markup.
 | 
| 
 | 
  1197                 curr_pos = start_idx + 1
 | 
| 
 | 
  1198                 continue
 | 
| 
 | 
  1199             link_text = text[start_idx+1:p]
 | 
| 
 | 
  1200 
 | 
| 
 | 
  1201             # Possibly a footnote ref?
 | 
| 
 | 
  1202             if "footnotes" in self.extras and link_text.startswith("^"):
 | 
| 
 | 
  1203                 normed_id = re.sub(r'\W', '-', link_text[1:])
 | 
| 
 | 
  1204                 if normed_id in self.footnotes:
 | 
| 
 | 
  1205                     self.footnote_ids.append(normed_id)
 | 
| 
 | 
  1206                     result = '<sup class="footnote-ref" id="fnref-%s">' \
 | 
| 
 | 
  1207                              '<a href="#fn-%s">%s</a></sup>' \
 | 
| 
 | 
  1208                              % (normed_id, normed_id, len(self.footnote_ids))
 | 
| 
 | 
  1209                     text = text[:start_idx] + result + text[p+1:]
 | 
| 
 | 
  1210                 else:
 | 
| 
 | 
  1211                     # This id isn't defined, leave the markup alone.
 | 
| 
 | 
  1212                     curr_pos = p+1
 | 
| 
 | 
  1213                 continue
 | 
| 
 | 
  1214 
 | 
| 
 | 
  1215             # Now determine what this is by the remainder.
 | 
| 
 | 
  1216             p += 1
 | 
| 
 | 
  1217             if p == text_length:
 | 
| 
 | 
  1218                 return text
 | 
| 
 | 
  1219 
 | 
| 
 | 
  1220             # Inline anchor or img?
 | 
| 
 | 
  1221             if text[p] == '(': # attempt at perf improvement
 | 
| 
 | 
  1222                 url, title, url_end_idx = self._extract_url_and_title(text, p)
 | 
| 
 | 
  1223                 if url is not None:
 | 
| 
 | 
  1224                     # Handle an inline anchor or img.
 | 
| 
 | 
  1225                     is_img = start_idx > 0 and text[start_idx-1] == "!"
 | 
| 
 | 
  1226                     if is_img:
 | 
| 
 | 
  1227                         start_idx -= 1
 | 
| 
 | 
  1228 
 | 
| 
 | 
  1229                     # We've got to encode these to avoid conflicting
 | 
| 
 | 
  1230                     # with italics/bold.
 | 
| 
 | 
  1231                     url = url.replace('*', self._escape_table['*']) \
 | 
| 
 | 
  1232                              .replace('_', self._escape_table['_'])
 | 
| 
 | 
  1233                     if title:
 | 
| 
 | 
  1234                         title_str = ' title="%s"' % (
 | 
| 
 | 
  1235                             _xml_escape_attr(title)
 | 
| 
 | 
  1236                                 .replace('*', self._escape_table['*'])
 | 
| 
 | 
  1237                                 .replace('_', self._escape_table['_']))
 | 
| 
 | 
  1238                     else:
 | 
| 
 | 
  1239                         title_str = ''
 | 
| 
 | 
  1240                     if is_img:
 | 
| 
 | 
  1241                         img_class_str = self._html_class_str_from_tag("img")
 | 
| 
 | 
  1242                         result = '<img src="%s" alt="%s"%s%s%s' \
 | 
| 
 | 
  1243                             % (url.replace('"', '"'),
 | 
| 
 | 
  1244                                _xml_escape_attr(link_text),
 | 
| 
 | 
  1245                                title_str, img_class_str, self.empty_element_suffix)
 | 
| 
 | 
  1246                         if "smarty-pants" in self.extras:
 | 
| 
 | 
  1247                             result = result.replace('"', self._escape_table['"'])
 | 
| 
 | 
  1248                         curr_pos = start_idx + len(result)
 | 
| 
 | 
  1249                         text = text[:start_idx] + result + text[url_end_idx:]
 | 
| 
 | 
  1250                     elif start_idx >= anchor_allowed_pos:
 | 
| 
 | 
  1251                         result_head = '<a href="%s"%s>' % (url, title_str)
 | 
| 
 | 
  1252                         result = '%s%s</a>' % (result_head, link_text)
 | 
| 
 | 
  1253                         if "smarty-pants" in self.extras:
 | 
| 
 | 
  1254                             result = result.replace('"', self._escape_table['"'])
 | 
| 
 | 
  1255                         # <img> allowed from curr_pos on, <a> from
 | 
| 
 | 
  1256                         # anchor_allowed_pos on.
 | 
| 
 | 
  1257                         curr_pos = start_idx + len(result_head)
 | 
| 
 | 
  1258                         anchor_allowed_pos = start_idx + len(result)
 | 
| 
 | 
  1259                         text = text[:start_idx] + result + text[url_end_idx:]
 | 
| 
 | 
  1260                     else:
 | 
| 
 | 
  1261                         # Anchor not allowed here.
 | 
| 
 | 
  1262                         curr_pos = start_idx + 1
 | 
| 
 | 
  1263                     continue
 | 
| 
 | 
  1264 
 | 
| 
 | 
  1265             # Reference anchor or img?
 | 
| 
 | 
  1266             else:
 | 
| 
 | 
  1267                 match = self._tail_of_reference_link_re.match(text, p)
 | 
| 
 | 
  1268                 if match:
 | 
| 
 | 
  1269                     # Handle a reference-style anchor or img.
 | 
| 
 | 
  1270                     is_img = start_idx > 0 and text[start_idx-1] == "!"
 | 
| 
 | 
  1271                     if is_img:
 | 
| 
 | 
  1272                         start_idx -= 1
 | 
| 
 | 
  1273                     link_id = match.group("id").lower()
 | 
| 
 | 
  1274                     if not link_id:
 | 
| 
 | 
  1275                         link_id = link_text.lower()  # for links like [this][]
 | 
| 
 | 
  1276                     if link_id in self.urls:
 | 
| 
 | 
  1277                         url = self.urls[link_id]
 | 
| 
 | 
  1278                         # We've got to encode these to avoid conflicting
 | 
| 
 | 
  1279                         # with italics/bold.
 | 
| 
 | 
  1280                         url = url.replace('*', self._escape_table['*']) \
 | 
| 
 | 
  1281                                  .replace('_', self._escape_table['_'])
 | 
| 
 | 
  1282                         title = self.titles.get(link_id)
 | 
| 
 | 
  1283                         if title:
 | 
| 
 | 
  1284                             before = title
 | 
| 
 | 
  1285                             title = _xml_escape_attr(title) \
 | 
| 
 | 
  1286                                 .replace('*', self._escape_table['*']) \
 | 
| 
 | 
  1287                                 .replace('_', self._escape_table['_'])
 | 
| 
 | 
  1288                             title_str = ' title="%s"' % title
 | 
| 
 | 
  1289                         else:
 | 
| 
 | 
  1290                             title_str = ''
 | 
| 
 | 
  1291                         if is_img:
 | 
| 
 | 
  1292                             img_class_str = self._html_class_str_from_tag("img")
 | 
| 
 | 
  1293                             result = '<img src="%s" alt="%s"%s%s%s' \
 | 
| 
 | 
  1294                                 % (url.replace('"', '"'),
 | 
| 
 | 
  1295                                    link_text.replace('"', '"'),
 | 
| 
 | 
  1296                                    title_str, img_class_str, self.empty_element_suffix)
 | 
| 
 | 
  1297                             if "smarty-pants" in self.extras:
 | 
| 
 | 
  1298                                 result = result.replace('"', self._escape_table['"'])
 | 
| 
 | 
  1299                             curr_pos = start_idx + len(result)
 | 
| 
 | 
  1300                             text = text[:start_idx] + result + text[match.end():]
 | 
| 
 | 
  1301                         elif start_idx >= anchor_allowed_pos:
 | 
| 
 | 
  1302                             result = '<a href="%s"%s>%s</a>' \
 | 
| 
 | 
  1303                                 % (url, title_str, link_text)
 | 
| 
 | 
  1304                             result_head = '<a href="%s"%s>' % (url, title_str)
 | 
| 
 | 
  1305                             result = '%s%s</a>' % (result_head, link_text)
 | 
| 
 | 
  1306                             if "smarty-pants" in self.extras:
 | 
| 
 | 
  1307                                 result = result.replace('"', self._escape_table['"'])
 | 
| 
 | 
  1308                             # <img> allowed from curr_pos on, <a> from
 | 
| 
 | 
  1309                             # anchor_allowed_pos on.
 | 
| 
 | 
  1310                             curr_pos = start_idx + len(result_head)
 | 
| 
 | 
  1311                             anchor_allowed_pos = start_idx + len(result)
 | 
| 
 | 
  1312                             text = text[:start_idx] + result + text[match.end():]
 | 
| 
 | 
  1313                         else:
 | 
| 
 | 
  1314                             # Anchor not allowed here.
 | 
| 
 | 
  1315                             curr_pos = start_idx + 1
 | 
| 
 | 
  1316                     else:
 | 
| 
 | 
  1317                         # This id isn't defined, leave the markup alone.
 | 
| 
 | 
  1318                         curr_pos = match.end()
 | 
| 
 | 
  1319                     continue
 | 
| 
 | 
  1320 
 | 
| 
 | 
  1321             # Otherwise, it isn't markup.
 | 
| 
 | 
  1322             curr_pos = start_idx + 1
 | 
| 
 | 
  1323 
 | 
| 
 | 
  1324         return text
 | 
| 
 | 
  1325 
 | 
| 
 | 
  1326     def header_id_from_text(self, text, prefix, n):
 | 
| 
 | 
  1327         """Generate a header id attribute value from the given header
 | 
| 
 | 
  1328         HTML content.
 | 
| 
 | 
  1329 
 | 
| 
 | 
  1330         This is only called if the "header-ids" extra is enabled.
 | 
| 
 | 
  1331         Subclasses may override this for different header ids.
 | 
| 
 | 
  1332 
 | 
| 
 | 
  1333         @param text {str} The text of the header tag
 | 
| 
 | 
  1334         @param prefix {str} The requested prefix for header ids. This is the
 | 
| 
 | 
  1335             value of the "header-ids" extra key, if any. Otherwise, None.
 | 
| 
 | 
  1336         @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
 | 
| 
 | 
  1337         @returns {str} The value for the header tag's "id" attribute. Return
 | 
| 
 | 
  1338             None to not have an id attribute and to exclude this header from
 | 
| 
 | 
  1339             the TOC (if the "toc" extra is specified).
 | 
| 
 | 
  1340         """
 | 
| 
 | 
  1341         header_id = _slugify(text)
 | 
| 
 | 
  1342         if prefix and isinstance(prefix, base_string_type):
 | 
| 
 | 
  1343             header_id = prefix + '-' + header_id
 | 
| 
 | 
  1344         if header_id in self._count_from_header_id:
 | 
| 
 | 
  1345             self._count_from_header_id[header_id] += 1
 | 
| 
 | 
  1346             header_id += '-%s' % self._count_from_header_id[header_id]
 | 
| 
 | 
  1347         else:
 | 
| 
 | 
  1348             self._count_from_header_id[header_id] = 1
 | 
| 
 | 
  1349         return header_id
 | 
| 
 | 
  1350 
 | 
| 
 | 
  1351     _toc = None
 | 
| 
 | 
  1352     def _toc_add_entry(self, level, id, name):
 | 
| 
 | 
  1353         if self._toc is None:
 | 
| 
 | 
  1354             self._toc = []
 | 
| 
 | 
  1355         self._toc.append((level, id, self._unescape_special_chars(name)))
 | 
| 
 | 
  1356 
 | 
| 
 | 
  1357     _h_re_base = r'''
 | 
| 
 | 
  1358         (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
 | 
| 
 | 
  1359         |
 | 
| 
 | 
  1360         (^(\#{1,6})  # \1 = string of #'s
 | 
| 
 | 
  1361         [ \t]%s
 | 
| 
 | 
  1362         (.+?)       # \2 = Header text
 | 
| 
 | 
  1363         [ \t]*
 | 
| 
 | 
  1364         (?<!\\)     # ensure not an escaped trailing '#'
 | 
| 
 | 
  1365         \#*         # optional closing #'s (not counted)
 | 
| 
 | 
  1366         \n+
 | 
| 
 | 
  1367         )
 | 
| 
 | 
  1368         '''
 | 
| 
 | 
  1369 
 | 
| 
 | 
  1370     _h_re = re.compile(_h_re_base % '*', re.X | re.M)
 | 
| 
 | 
  1371     _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
 | 
| 
 | 
  1372 
 | 
| 
 | 
  1373     def _h_sub(self, match):
 | 
| 
 | 
  1374         if match.group(1) is not None:
 | 
| 
 | 
  1375             # Setext header
 | 
| 
 | 
  1376             n = {"=": 1, "-": 2}[match.group(3)[0]]
 | 
| 
 | 
  1377             header_group = match.group(2)
 | 
| 
 | 
  1378         else:
 | 
| 
 | 
  1379             # atx header
 | 
| 
 | 
  1380             n = len(match.group(5))
 | 
| 
 | 
  1381             header_group = match.group(6)
 | 
| 
 | 
  1382 
 | 
| 
 | 
  1383         demote_headers = self.extras.get("demote-headers")
 | 
| 
 | 
  1384         if demote_headers:
 | 
| 
 | 
  1385             n = min(n + demote_headers, 6)
 | 
| 
 | 
  1386         header_id_attr = ""
 | 
| 
 | 
  1387         if "header-ids" in self.extras:
 | 
| 
 | 
  1388             header_id = self.header_id_from_text(header_group,
 | 
| 
 | 
  1389                 self.extras["header-ids"], n)
 | 
| 
 | 
  1390             if header_id:
 | 
| 
 | 
  1391                 header_id_attr = ' id="%s"' % header_id
 | 
| 
 | 
  1392         html = self._run_span_gamut(header_group)
 | 
| 
 | 
  1393         if "toc" in self.extras and header_id:
 | 
| 
 | 
  1394             self._toc_add_entry(n, header_id, html)
 | 
| 
 | 
  1395         return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
 | 
| 
 | 
  1396 
 | 
| 
 | 
  1397     def _do_headers(self, text):
 | 
| 
 | 
  1398         # Setext-style headers:
 | 
| 
 | 
  1399         #     Header 1
 | 
| 
 | 
  1400         #     ========
 | 
| 
 | 
  1401         #
 | 
| 
 | 
  1402         #     Header 2
 | 
| 
 | 
  1403         #     --------
 | 
| 
 | 
  1404 
 | 
| 
 | 
  1405         # atx-style headers:
 | 
| 
 | 
  1406         #   # Header 1
 | 
| 
 | 
  1407         #   ## Header 2
 | 
| 
 | 
  1408         #   ## Header 2 with closing hashes ##
 | 
| 
 | 
  1409         #   ...
 | 
| 
 | 
  1410         #   ###### Header 6
 | 
| 
 | 
  1411 
 | 
| 
 | 
  1412         if 'tag-friendly' in self.extras:
 | 
| 
 | 
  1413             return self._h_re_tag_friendly.sub(self._h_sub, text)
 | 
| 
 | 
  1414         return self._h_re.sub(self._h_sub, text)
 | 
| 
 | 
  1415 
 | 
| 
 | 
  1416     _marker_ul_chars  = '*+-'
 | 
| 
 | 
  1417     _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
 | 
| 
 | 
  1418     _marker_ul = '(?:[%s])' % _marker_ul_chars
 | 
| 
 | 
  1419     _marker_ol = r'(?:\d+\.)'
 | 
| 
 | 
  1420 
 | 
| 
 | 
  1421     def _list_sub(self, match):
 | 
| 
 | 
  1422         lst = match.group(1)
 | 
| 
 | 
  1423         lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
 | 
| 
 | 
  1424         result = self._process_list_items(lst)
 | 
| 
 | 
  1425         if self.list_level:
 | 
| 
 | 
  1426             return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
 | 
| 
 | 
  1427         else:
 | 
| 
 | 
  1428             return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
 | 
| 
 | 
  1429 
 | 
| 
 | 
  1430     def _do_lists(self, text):
 | 
| 
 | 
  1431         # Form HTML ordered (numbered) and unordered (bulleted) lists.
 | 
| 
 | 
  1432 
 | 
| 
 | 
  1433         # Iterate over each *non-overlapping* list match.
 | 
| 
 | 
  1434         pos = 0
 | 
| 
 | 
  1435         while True:
 | 
| 
 | 
  1436             # Find the *first* hit for either list style (ul or ol). We
 | 
| 
 | 
  1437             # match ul and ol separately to avoid adjacent lists of different
 | 
| 
 | 
  1438             # types running into each other (see issue #16).
 | 
| 
 | 
  1439             hits = []
 | 
| 
 | 
  1440             for marker_pat in (self._marker_ul, self._marker_ol):
 | 
| 
 | 
  1441                 less_than_tab = self.tab_width - 1
 | 
| 
 | 
  1442                 whole_list = r'''
 | 
| 
 | 
  1443                     (                   # \1 = whole list
 | 
| 
 | 
  1444                       (                 # \2
 | 
| 
 | 
  1445                         [ ]{0,%d}
 | 
| 
 | 
  1446                         (%s)            # \3 = first list item marker
 | 
| 
 | 
  1447                         [ \t]+
 | 
| 
 | 
  1448                         (?!\ *\3\ )     # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
 | 
| 
 | 
  1449                       )
 | 
| 
 | 
  1450                       (?:.+?)
 | 
| 
 | 
  1451                       (                 # \4
 | 
| 
 | 
  1452                           \Z
 | 
| 
 | 
  1453                         |
 | 
| 
 | 
  1454                           \n{2,}
 | 
| 
 | 
  1455                           (?=\S)
 | 
| 
 | 
  1456                           (?!           # Negative lookahead for another list item marker
 | 
| 
 | 
  1457                             [ \t]*
 | 
| 
 | 
  1458                             %s[ \t]+
 | 
| 
 | 
  1459                           )
 | 
| 
 | 
  1460                       )
 | 
| 
 | 
  1461                     )
 | 
| 
 | 
  1462                 ''' % (less_than_tab, marker_pat, marker_pat)
 | 
| 
 | 
  1463                 if self.list_level:  # sub-list
 | 
| 
 | 
  1464                     list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
 | 
| 
 | 
  1465                 else:
 | 
| 
 | 
  1466                     list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
 | 
| 
 | 
  1467                                          re.X | re.M | re.S)
 | 
| 
 | 
  1468                 match = list_re.search(text, pos)
 | 
| 
 | 
  1469                 if match:
 | 
| 
 | 
  1470                     hits.append((match.start(), match))
 | 
| 
 | 
  1471             if not hits:
 | 
| 
 | 
  1472                 break
 | 
| 
 | 
  1473             hits.sort()
 | 
| 
 | 
  1474             match = hits[0][1]
 | 
| 
 | 
  1475             start, end = match.span()
 | 
| 
 | 
  1476             middle = self._list_sub(match)
 | 
| 
 | 
  1477             text = text[:start] + middle + text[end:]
 | 
| 
 | 
  1478             pos = start + len(middle) # start pos for next attempted match
 | 
| 
 | 
  1479 
 | 
| 
 | 
  1480         return text
 | 
| 
 | 
  1481 
 | 
| 
 | 
  1482     _list_item_re = re.compile(r'''
 | 
| 
 | 
  1483         (\n)?                   # leading line = \1
 | 
| 
 | 
  1484         (^[ \t]*)               # leading whitespace = \2
 | 
| 
 | 
  1485         (?P<marker>%s) [ \t]+   # list marker = \3
 | 
| 
 | 
  1486         ((?:.+?)                # list item text = \4
 | 
| 
 | 
  1487          (\n{1,2}))             # eols = \5
 | 
| 
 | 
  1488         (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
 | 
| 
 | 
  1489         ''' % (_marker_any, _marker_any),
 | 
| 
 | 
  1490         re.M | re.X | re.S)
 | 
| 
 | 
  1491 
 | 
| 
 | 
  1492     _last_li_endswith_two_eols = False
 | 
| 
 | 
  1493     def _list_item_sub(self, match):
 | 
| 
 | 
  1494         item = match.group(4)
 | 
| 
 | 
  1495         leading_line = match.group(1)
 | 
| 
 | 
  1496         leading_space = match.group(2)
 | 
| 
 | 
  1497         if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
 | 
| 
 | 
  1498             item = self._run_block_gamut(self._outdent(item))
 | 
| 
 | 
  1499         else:
 | 
| 
 | 
  1500             # Recursion for sub-lists:
 | 
| 
 | 
  1501             item = self._do_lists(self._outdent(item))
 | 
| 
 | 
  1502             if item.endswith('\n'):
 | 
| 
 | 
  1503                 item = item[:-1]
 | 
| 
 | 
  1504             item = self._run_span_gamut(item)
 | 
| 
 | 
  1505         self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
 | 
| 
 | 
  1506         return "<li>%s</li>\n" % item
 | 
| 
 | 
  1507 
 | 
| 
 | 
  1508     def _process_list_items(self, list_str):
 | 
| 
 | 
  1509         # Process the contents of a single ordered or unordered list,
 | 
| 
 | 
  1510         # splitting it into individual list items.
 | 
| 
 | 
  1511 
 | 
| 
 | 
  1512         # The $g_list_level global keeps track of when we're inside a list.
 | 
| 
 | 
  1513         # Each time we enter a list, we increment it; when we leave a list,
 | 
| 
 | 
  1514         # we decrement. If it's zero, we're not in a list anymore.
 | 
| 
 | 
  1515         #
 | 
| 
 | 
  1516         # We do this because when we're not inside a list, we want to treat
 | 
| 
 | 
  1517         # something like this:
 | 
| 
 | 
  1518         #
 | 
| 
 | 
  1519         #       I recommend upgrading to version
 | 
| 
 | 
  1520         #       8. Oops, now this line is treated
 | 
| 
 | 
  1521         #       as a sub-list.
 | 
| 
 | 
  1522         #
 | 
| 
 | 
  1523         # As a single paragraph, despite the fact that the second line starts
 | 
| 
 | 
  1524         # with a digit-period-space sequence.
 | 
| 
 | 
  1525         #
 | 
| 
 | 
  1526         # Whereas when we're inside a list (or sub-list), that line will be
 | 
| 
 | 
  1527         # treated as the start of a sub-list. What a kludge, huh? This is
 | 
| 
 | 
  1528         # an aspect of Markdown's syntax that's hard to parse perfectly
 | 
| 
 | 
  1529         # without resorting to mind-reading. Perhaps the solution is to
 | 
| 
 | 
  1530         # change the syntax rules such that sub-lists must start with a
 | 
| 
 | 
  1531         # starting cardinal number; e.g. "1." or "a.".
 | 
| 
 | 
  1532         self.list_level += 1
 | 
| 
 | 
  1533         self._last_li_endswith_two_eols = False
 | 
| 
 | 
  1534         list_str = list_str.rstrip('\n') + '\n'
 | 
| 
 | 
  1535         list_str = self._list_item_re.sub(self._list_item_sub, list_str)
 | 
| 
 | 
  1536         self.list_level -= 1
 | 
| 
 | 
  1537         return list_str
 | 
| 
 | 
  1538 
 | 
| 
 | 
  1539     def _get_pygments_lexer(self, lexer_name):
 | 
| 
 | 
  1540         try:
 | 
| 
 | 
  1541             from pygments import lexers, util
 | 
| 
 | 
  1542         except ImportError:
 | 
| 
 | 
  1543             return None
 | 
| 
 | 
  1544         try:
 | 
| 
 | 
  1545             return lexers.get_lexer_by_name(lexer_name)
 | 
| 
 | 
  1546         except util.ClassNotFound:
 | 
| 
 | 
  1547             return None
 | 
| 
 | 
  1548 
 | 
| 
 | 
  1549     def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
 | 
| 
 | 
  1550         import pygments
 | 
| 
 | 
  1551         import pygments.formatters
 | 
| 
 | 
  1552 
 | 
| 
 | 
  1553         class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
 | 
| 
 | 
  1554             def _wrap_code(self, inner):
 | 
| 
 | 
  1555                 """A function for use in a Pygments Formatter which
 | 
| 
 | 
  1556                 wraps in <code> tags.
 | 
| 
 | 
  1557                 """
 | 
| 
 | 
  1558                 yield 0, "<code>"
 | 
| 
 | 
  1559                 for tup in inner:
 | 
| 
 | 
  1560                     yield tup
 | 
| 
 | 
  1561                 yield 0, "</code>"
 | 
| 
 | 
  1562 
 | 
| 
 | 
  1563             def wrap(self, source, outfile):
 | 
| 
 | 
  1564                 """Return the source with a code, pre, and div."""
 | 
| 
 | 
  1565                 return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
 | 
| 
 | 
  1566 
 | 
| 
 | 
  1567         formatter_opts.setdefault("cssclass", "codehilite")
 | 
| 
 | 
  1568         formatter = HtmlCodeFormatter(**formatter_opts)
 | 
| 
 | 
  1569         return pygments.highlight(codeblock, lexer, formatter)
 | 
| 
 | 
  1570 
 | 
| 
 | 
  1571     def _code_block_sub(self, match, is_fenced_code_block=False):
 | 
| 
 | 
  1572         lexer_name = None
 | 
| 
 | 
  1573         if is_fenced_code_block:
 | 
| 
 | 
  1574             lexer_name = match.group(1)
 | 
| 
 | 
  1575             if lexer_name:
 | 
| 
 | 
  1576                 formatter_opts = self.extras['fenced-code-blocks'] or {}
 | 
| 
 | 
  1577             codeblock = match.group(2)
 | 
| 
 | 
  1578             codeblock = codeblock[:-1]  # drop one trailing newline
 | 
| 
 | 
  1579         else:
 | 
| 
 | 
  1580             codeblock = match.group(1)
 | 
| 
 | 
  1581             codeblock = self._outdent(codeblock)
 | 
| 
 | 
  1582             codeblock = self._detab(codeblock)
 | 
| 
 | 
  1583             codeblock = codeblock.lstrip('\n')  # trim leading newlines
 | 
| 
 | 
  1584             codeblock = codeblock.rstrip()      # trim trailing whitespace
 | 
| 
 | 
  1585 
 | 
| 
 | 
  1586             # Note: "code-color" extra is DEPRECATED.
 | 
| 
 | 
  1587             if "code-color" in self.extras and codeblock.startswith(":::"):
 | 
| 
 | 
  1588                 lexer_name, rest = codeblock.split('\n', 1)
 | 
| 
 | 
  1589                 lexer_name = lexer_name[3:].strip()
 | 
| 
 | 
  1590                 codeblock = rest.lstrip("\n")   # Remove lexer declaration line.
 | 
| 
 | 
  1591                 formatter_opts = self.extras['code-color'] or {}
 | 
| 
 | 
  1592 
 | 
| 
 | 
  1593         if lexer_name:
 | 
| 
 | 
  1594             def unhash_code( codeblock ):
 | 
| 
 | 
  1595                 for key, sanitized in list(self.html_spans.items()):
 | 
| 
 | 
  1596                     codeblock = codeblock.replace(key, sanitized)
 | 
| 
 | 
  1597                 replacements = [
 | 
| 
 | 
  1598                     ("&", "&"),
 | 
| 
 | 
  1599                     ("<", "<"),
 | 
| 
 | 
  1600                     (">", ">")
 | 
| 
 | 
  1601                 ]
 | 
| 
 | 
  1602                 for old, new in replacements:
 | 
| 
 | 
  1603                     codeblock = codeblock.replace(old, new)
 | 
| 
 | 
  1604                 return codeblock
 | 
| 
 | 
  1605             lexer = self._get_pygments_lexer(lexer_name)
 | 
| 
 | 
  1606             if lexer:
 | 
| 
 | 
  1607                 codeblock = unhash_code( codeblock )
 | 
| 
 | 
  1608                 colored = self._color_with_pygments(codeblock, lexer,
 | 
| 
 | 
  1609                                                     **formatter_opts)
 | 
| 
 | 
  1610                 return "\n\n%s\n\n" % colored
 | 
| 
 | 
  1611 
 | 
| 
 | 
  1612         codeblock = self._encode_code(codeblock)
 | 
| 
 | 
  1613         pre_class_str = self._html_class_str_from_tag("pre")
 | 
| 
 | 
  1614         code_class_str = self._html_class_str_from_tag("code")
 | 
| 
 | 
  1615         return "\n\n<pre%s><code%s>%s\n</code></pre>\n\n" % (
 | 
| 
 | 
  1616             pre_class_str, code_class_str, codeblock)
 | 
| 
 | 
  1617 
 | 
| 
 | 
  1618     def _html_class_str_from_tag(self, tag):
 | 
| 
 | 
  1619         """Get the appropriate ' class="..."' string (note the leading
 | 
| 
 | 
  1620         space), if any, for the given tag.
 | 
| 
 | 
  1621         """
 | 
| 
 | 
  1622         if "html-classes" not in self.extras:
 | 
| 
 | 
  1623             return ""
 | 
| 
 | 
  1624         try:
 | 
| 
 | 
  1625             html_classes_from_tag = self.extras["html-classes"]
 | 
| 
 | 
  1626         except TypeError:
 | 
| 
 | 
  1627             return ""
 | 
| 
 | 
  1628         else:
 | 
| 
 | 
  1629             if tag in html_classes_from_tag:
 | 
| 
 | 
  1630                 return ' class="%s"' % html_classes_from_tag[tag]
 | 
| 
 | 
  1631         return ""
 | 
| 
 | 
  1632 
 | 
| 
 | 
  1633     def _do_code_blocks(self, text):
 | 
| 
 | 
  1634         """Process Markdown `<pre><code>` blocks."""
 | 
| 
 | 
  1635         code_block_re = re.compile(r'''
 | 
| 
 | 
  1636             (?:\n\n|\A\n?)
 | 
| 
 | 
  1637             (               # $1 = the code block -- one or more lines, starting with a space/tab
 | 
| 
 | 
  1638               (?:
 | 
| 
 | 
  1639                 (?:[ ]{%d} | \t)  # Lines must start with a tab or a tab-width of spaces
 | 
| 
 | 
  1640                 .*\n+
 | 
| 
 | 
  1641               )+
 | 
| 
 | 
  1642             )
 | 
| 
 | 
  1643             ((?=^[ ]{0,%d}\S)|\Z)   # Lookahead for non-space at line-start, or end of doc
 | 
| 
 | 
  1644             # Lookahead to make sure this block isn't already in a code block.
 | 
| 
 | 
  1645             # Needed when syntax highlighting is being used.
 | 
| 
 | 
  1646             (?![^<]*\</code\>)
 | 
| 
 | 
  1647             ''' % (self.tab_width, self.tab_width),
 | 
| 
 | 
  1648             re.M | re.X)
 | 
| 
 | 
  1649         return code_block_re.sub(self._code_block_sub, text)
 | 
| 
 | 
  1650 
 | 
| 
 | 
  1651     _fenced_code_block_re = re.compile(r'''
 | 
| 
 | 
  1652         (?:\n\n|\A\n?)
 | 
| 
 | 
  1653         ^```([\w+-]+)?[ \t]*\n      # opening fence, $1 = optional lang
 | 
| 
 | 
  1654         (.*?)                       # $2 = code block content
 | 
| 
 | 
  1655         ^```[ \t]*\n                # closing fence
 | 
| 
 | 
  1656         ''', re.M | re.X | re.S)
 | 
| 
 | 
  1657 
 | 
| 
 | 
  1658     def _fenced_code_block_sub(self, match):
 | 
| 
 | 
  1659         return self._code_block_sub(match, is_fenced_code_block=True);
 | 
| 
 | 
  1660 
 | 
| 
 | 
  1661     def _do_fenced_code_blocks(self, text):
 | 
| 
 | 
  1662         """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
 | 
| 
 | 
  1663         return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)
 | 
| 
 | 
  1664 
 | 
| 
 | 
  1665     # Rules for a code span:
 | 
| 
 | 
  1666     # - backslash escapes are not interpreted in a code span
 | 
| 
 | 
    # - to include one backtick or a run of several, the delimiters must
 | 
| 
 | 
  1668     #   be a longer run of backticks
 | 
| 
 | 
  1669     # - cannot start or end a code span with a backtick; pad with a
 | 
| 
 | 
  1670     #   space and that space will be removed in the emitted HTML
 | 
| 
 | 
  1671     # See `test/tm-cases/escapes.text` for a number of edge-case
 | 
| 
 | 
  1672     # examples.
 | 
| 
 | 
  1673     _code_span_re = re.compile(r'''
 | 
| 
 | 
  1674             (?<!\\)
 | 
| 
 | 
  1675             (`+)        # \1 = Opening run of `
 | 
| 
 | 
  1676             (?!`)       # See Note A test/tm-cases/escapes.text
 | 
| 
 | 
  1677             (.+?)       # \2 = The code block
 | 
| 
 | 
  1678             (?<!`)
 | 
| 
 | 
  1679             \1          # Matching closer
 | 
| 
 | 
  1680             (?!`)
 | 
| 
 | 
  1681         ''', re.X | re.S)
 | 
| 
 | 
  1682 
 | 
| 
 | 
  1683     def _code_span_sub(self, match):
 | 
| 
 | 
  1684         c = match.group(2).strip(" \t")
 | 
| 
 | 
  1685         c = self._encode_code(c)
 | 
| 
 | 
  1686         return "<code>%s</code>" % c
 | 
| 
 | 
  1687 
 | 
| 
 | 
  1688     def _do_code_spans(self, text):
 | 
| 
 | 
  1689         #   *   Backtick quotes are used for <code></code> spans.
 | 
| 
 | 
  1690         #
 | 
| 
 | 
  1691         #   *   You can use multiple backticks as the delimiters if you want to
 | 
| 
 | 
  1692         #       include literal backticks in the code span. So, this input:
 | 
| 
 | 
  1693         #
 | 
| 
 | 
  1694         #         Just type ``foo `bar` baz`` at the prompt.
 | 
| 
 | 
  1695         #
 | 
| 
 | 
  1696         #       Will translate to:
 | 
| 
 | 
  1697         #
 | 
| 
 | 
  1698         #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
 | 
| 
 | 
  1699         #
 | 
| 
 | 
  1700         #       There's no arbitrary limit to the number of backticks you
 | 
| 
 | 
  1701         #       can use as delimters. If you need three consecutive backticks
 | 
| 
 | 
  1702         #       in your code, use four for delimiters, etc.
 | 
| 
 | 
  1703         #
 | 
| 
 | 
  1704         #   *   You can use spaces to get literal backticks at the edges:
 | 
| 
 | 
  1705         #
 | 
| 
 | 
  1706         #         ... type `` `bar` `` ...
 | 
| 
 | 
  1707         #
 | 
| 
 | 
  1708         #       Turns to:
 | 
| 
 | 
  1709         #
 | 
| 
 | 
  1710         #         ... type <code>`bar`</code> ...
 | 
| 
 | 
  1711         return self._code_span_re.sub(self._code_span_sub, text)
 | 
| 
 | 
  1712 
 | 
| 
 | 
  1713     def _encode_code(self, text):
 | 
| 
 | 
  1714         """Encode/escape certain characters inside Markdown code runs.
 | 
| 
 | 
  1715         The point is that in code, these characters are literals,
 | 
| 
 | 
  1716         and lose their special Markdown meanings.
 | 
| 
 | 
  1717         """
 | 
| 
 | 
  1718         replacements = [
 | 
| 
 | 
  1719             # Encode all ampersands; HTML entities are not
 | 
| 
 | 
  1720             # entities within a Markdown code span.
 | 
| 
 | 
  1721             ('&', '&'),
 | 
| 
 | 
  1722             # Do the angle bracket song and dance:
 | 
| 
 | 
  1723             ('<', '<'),
 | 
| 
 | 
  1724             ('>', '>'),
 | 
| 
 | 
  1725         ]
 | 
| 
 | 
  1726         for before, after in replacements:
 | 
| 
 | 
  1727             text = text.replace(before, after)
 | 
| 
 | 
  1728         hashed = _hash_text(text)
 | 
| 
 | 
  1729         self._escape_table[text] = hashed
 | 
| 
 | 
  1730         return hashed
 | 
| 
 | 
  1731 
 | 
| 
 | 
  1732     _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
 | 
| 
 | 
  1733     _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
 | 
| 
 | 
  1734     _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
 | 
| 
 | 
  1735     _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
 | 
| 
 | 
  1736     def _do_italics_and_bold(self, text):
 | 
| 
 | 
  1737         # <strong> must go first:
 | 
| 
 | 
  1738         if "code-friendly" in self.extras:
 | 
| 
 | 
  1739             text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
 | 
| 
 | 
  1740             text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
 | 
| 
 | 
  1741         else:
 | 
| 
 | 
  1742             text = self._strong_re.sub(r"<strong>\2</strong>", text)
 | 
| 
 | 
  1743             text = self._em_re.sub(r"<em>\2</em>", text)
 | 
| 
 | 
  1744         return text
 | 
| 
 | 
  1745 
 | 
| 
 | 
  1746     # "smarty-pants" extra: Very liberal in interpreting a single prime as an
 | 
| 
 | 
  1747     # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
 | 
| 
 | 
  1748     # "twixt" can be written without an initial apostrophe. This is fine because
 | 
| 
 | 
  1749     # using scare quotes (single quotation marks) is rare.
 | 
| 
 | 
  1750     _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
 | 
| 
 | 
  1751     _contractions = ["tis", "twas", "twer", "neath", "o", "n",
 | 
| 
 | 
  1752         "round", "bout", "twixt", "nuff", "fraid", "sup"]
 | 
| 
 | 
  1753     def _do_smart_contractions(self, text):
 | 
| 
 | 
  1754         text = self._apostrophe_year_re.sub(r"’\1", text)
 | 
| 
 | 
  1755         for c in self._contractions:
 | 
| 
 | 
  1756             text = text.replace("'%s" % c, "’%s" % c)
 | 
| 
 | 
  1757             text = text.replace("'%s" % c.capitalize(),
 | 
| 
 | 
  1758                 "’%s" % c.capitalize())
 | 
| 
 | 
  1759         return text
 | 
| 
 | 
  1760 
 | 
| 
 | 
  1761     # Substitute double-quotes before single-quotes.
 | 
| 
 | 
  1762     _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
 | 
| 
 | 
  1763     _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
 | 
| 
 | 
  1764     _closing_single_quote_re = re.compile(r"(?<=\S)'")
 | 
| 
 | 
  1765     _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
 | 
| 
 | 
  1766     def _do_smart_punctuation(self, text):
 | 
| 
 | 
  1767         """Fancifies 'single quotes', "double quotes", and apostrophes.
 | 
| 
 | 
  1768         Converts --, ---, and ... into en dashes, em dashes, and ellipses.
 | 
| 
 | 
  1769 
 | 
| 
 | 
  1770         Inspiration is: <http://daringfireball.net/projects/smartypants/>
 | 
| 
 | 
  1771         See "test/tm-cases/smarty_pants.text" for a full discussion of the
 | 
| 
 | 
  1772         support here and
 | 
| 
 | 
  1773         <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
 | 
| 
 | 
  1774         discussion of some diversion from the original SmartyPants.
 | 
| 
 | 
  1775         """
 | 
| 
 | 
  1776         if "'" in text: # guard for perf
 | 
| 
 | 
  1777             text = self._do_smart_contractions(text)
 | 
| 
 | 
  1778             text = self._opening_single_quote_re.sub("‘", text)
 | 
| 
 | 
  1779             text = self._closing_single_quote_re.sub("’", text)
 | 
| 
 | 
  1780 
 | 
| 
 | 
  1781         if '"' in text: # guard for perf
 | 
| 
 | 
  1782             text = self._opening_double_quote_re.sub("“", text)
 | 
| 
 | 
  1783             text = self._closing_double_quote_re.sub("”", text)
 | 
| 
 | 
  1784 
 | 
| 
 | 
  1785         text = text.replace("---", "—")
 | 
| 
 | 
  1786         text = text.replace("--", "–")
 | 
| 
 | 
  1787         text = text.replace("...", "…")
 | 
| 
 | 
  1788         text = text.replace(" . . . ", "…")
 | 
| 
 | 
  1789         text = text.replace(". . .", "…")
 | 
| 
 | 
  1790         return text
 | 
| 
 | 
  1791 
 | 
| 
 | 
  1792     _block_quote_re = re.compile(r'''
 | 
| 
 | 
  1793         (                           # Wrap whole match in \1
 | 
| 
 | 
  1794           (
 | 
| 
 | 
  1795             ^[ \t]*>[ \t]?          # '>' at the start of a line
 | 
| 
 | 
  1796               .+\n                  # rest of the first line
 | 
| 
 | 
  1797             (.+\n)*                 # subsequent consecutive lines
 | 
| 
 | 
  1798             \n*                     # blanks
 | 
| 
 | 
  1799           )+
 | 
| 
 | 
  1800         )
 | 
| 
 | 
  1801         ''', re.M | re.X)
 | 
| 
 | 
  1802     _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
 | 
| 
 | 
  1803 
 | 
| 
 | 
  1804     _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
 | 
| 
 | 
  1805     def _dedent_two_spaces_sub(self, match):
 | 
| 
 | 
  1806         return re.sub(r'(?m)^  ', '', match.group(1))
 | 
| 
 | 
  1807 
 | 
| 
 | 
  1808     def _block_quote_sub(self, match):
 | 
| 
 | 
  1809         bq = match.group(1)
 | 
| 
 | 
  1810         bq = self._bq_one_level_re.sub('', bq)  # trim one level of quoting
 | 
| 
 | 
  1811         bq = self._ws_only_line_re.sub('', bq)  # trim whitespace-only lines
 | 
| 
 | 
  1812         bq = self._run_block_gamut(bq)          # recurse
 | 
| 
 | 
  1813 
 | 
| 
 | 
  1814         bq = re.sub('(?m)^', '  ', bq)
 | 
| 
 | 
  1815         # These leading spaces screw with <pre> content, so we need to fix that:
 | 
| 
 | 
  1816         bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
 | 
| 
 | 
  1817 
 | 
| 
 | 
  1818         return "<blockquote>\n%s\n</blockquote>\n\n" % bq
 | 
| 
 | 
  1819 
 | 
| 
 | 
  1820     def _do_block_quotes(self, text):
 | 
| 
 | 
  1821         if '>' not in text:
 | 
| 
 | 
  1822             return text
 | 
| 
 | 
  1823         return self._block_quote_re.sub(self._block_quote_sub, text)
 | 
| 
 | 
  1824 
 | 
| 
 | 
  1825     def _form_paragraphs(self, text):
 | 
| 
 | 
  1826         # Strip leading and trailing lines:
 | 
| 
 | 
  1827         text = text.strip('\n')
 | 
| 
 | 
  1828 
 | 
| 
 | 
  1829         # Wrap <p> tags.
 | 
| 
 | 
  1830         grafs = []
 | 
| 
 | 
  1831         for i, graf in enumerate(re.split(r"\n{2,}", text)):
 | 
| 
 | 
  1832             if graf in self.html_blocks:
 | 
| 
 | 
  1833                 # Unhashify HTML blocks
 | 
| 
 | 
  1834                 grafs.append(self.html_blocks[graf])
 | 
| 
 | 
  1835             else:
 | 
| 
 | 
  1836                 cuddled_list = None
 | 
| 
 | 
  1837                 if "cuddled-lists" in self.extras:
 | 
| 
 | 
  1838                     # Need to put back trailing '\n' for `_list_item_re`
 | 
| 
 | 
  1839                     # match at the end of the paragraph.
 | 
| 
 | 
  1840                     li = self._list_item_re.search(graf + '\n')
 | 
| 
 | 
  1841                     # Two of the same list marker in this paragraph: a likely
 | 
| 
 | 
  1842                     # candidate for a list cuddled to preceding paragraph
 | 
| 
 | 
  1843                     # text (issue 33). Note the `[-1]` is a quick way to
 | 
| 
 | 
  1844                     # consider numeric bullets (e.g. "1." and "2.") to be
 | 
| 
 | 
  1845                     # equal.
 | 
| 
 | 
  1846                     if (li and len(li.group(2)) <= 3 and li.group("next_marker")
 | 
| 
 | 
  1847                         and li.group("marker")[-1] == li.group("next_marker")[-1]):
 | 
| 
 | 
  1848                         start = li.start()
 | 
| 
 | 
  1849                         cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
 | 
| 
 | 
  1850                         assert cuddled_list.startswith("<ul>") or cuddled_list.startswith("<ol>")
 | 
| 
 | 
  1851                         graf = graf[:start]
 | 
| 
 | 
  1852 
 | 
| 
 | 
  1853                 # Wrap <p> tags.
 | 
| 
 | 
  1854                 graf = self._run_span_gamut(graf)
 | 
| 
 | 
  1855                 grafs.append("<p>" + graf.lstrip(" \t") + "</p>")
 | 
| 
 | 
  1856 
 | 
| 
 | 
  1857                 if cuddled_list:
 | 
| 
 | 
  1858                     grafs.append(cuddled_list)
 | 
| 
 | 
  1859 
 | 
| 
 | 
  1860         return "\n\n".join(grafs)
 | 
| 
 | 
  1861 
 | 
| 
 | 
  1862     def _add_footnotes(self, text):
 | 
| 
 | 
  1863         if self.footnotes:
 | 
| 
 | 
  1864             footer = [
 | 
| 
 | 
  1865                 '<div class="footnotes">',
 | 
| 
 | 
  1866                 '<hr' + self.empty_element_suffix,
 | 
| 
 | 
  1867                 '<ol>',
 | 
| 
 | 
  1868             ]
 | 
| 
 | 
  1869             for i, id in enumerate(self.footnote_ids):
 | 
| 
 | 
  1870                 if i != 0:
 | 
| 
 | 
  1871                     footer.append('')
 | 
| 
 | 
  1872                 footer.append('<li id="fn-%s">' % id)
 | 
| 
 | 
  1873                 footer.append(self._run_block_gamut(self.footnotes[id]))
 | 
| 
 | 
  1874                 backlink = ('<a href="#fnref-%s" '
 | 
| 
 | 
  1875                     'class="footnoteBackLink" '
 | 
| 
 | 
  1876                     'title="Jump back to footnote %d in the text.">'
 | 
| 
 | 
  1877                     '↩</a>' % (id, i+1))
 | 
| 
 | 
  1878                 if footer[-1].endswith("</p>"):
 | 
| 
 | 
  1879                     footer[-1] = footer[-1][:-len("</p>")] \
 | 
| 
 | 
  1880                         + ' ' + backlink + "</p>"
 | 
| 
 | 
  1881                 else:
 | 
| 
 | 
  1882                     footer.append("\n<p>%s</p>" % backlink)
 | 
| 
 | 
  1883                 footer.append('</li>')
 | 
| 
 | 
  1884             footer.append('</ol>')
 | 
| 
 | 
  1885             footer.append('</div>')
 | 
| 
 | 
  1886             return text + '\n\n' + '\n'.join(footer)
 | 
| 
 | 
  1887         else:
 | 
| 
 | 
  1888             return text
 | 
| 
 | 
  1889 
 | 
| 
 | 
  1890     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
 | 
| 
 | 
  1891     #   http://bumppo.net/projects/amputator/
 | 
| 
 | 
  1892     _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
 | 
| 
 | 
  1893     _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
 | 
| 
 | 
  1894     _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)
 | 
| 
 | 
  1895 
 | 
| 
 | 
  1896     def _encode_amps_and_angles(self, text):
 | 
| 
 | 
  1897         # Smart processing for ampersands and angle brackets that need
 | 
| 
 | 
  1898         # to be encoded.
 | 
| 
 | 
  1899         text = self._ampersand_re.sub('&', text)
 | 
| 
 | 
  1900 
 | 
| 
 | 
  1901         # Encode naked <'s
 | 
| 
 | 
  1902         text = self._naked_lt_re.sub('<', text)
 | 
| 
 | 
  1903 
 | 
| 
 | 
  1904         # Encode naked >'s
 | 
| 
 | 
  1905         # Note: Other markdown implementations (e.g. Markdown.pl, PHP
 | 
| 
 | 
  1906         # Markdown) don't do this.
 | 
| 
 | 
  1907         text = self._naked_gt_re.sub('>', text)
 | 
| 
 | 
  1908         return text
 | 
| 
 | 
  1909 
 | 
| 
 | 
  1910     def _encode_backslash_escapes(self, text):
 | 
| 
 | 
  1911         for ch, escape in list(self._escape_table.items()):
 | 
| 
 | 
  1912             text = text.replace("\\"+ch, escape)
 | 
| 
 | 
  1913         return text
 | 
| 
 | 
  1914 
 | 
| 
 | 
  1915     _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
 | 
| 
 | 
  1916     def _auto_link_sub(self, match):
 | 
| 
 | 
  1917         g1 = match.group(1)
 | 
| 
 | 
  1918         return '<a href="%s">%s</a>' % (g1, g1)
 | 
| 
 | 
  1919 
 | 
| 
 | 
  1920     _auto_email_link_re = re.compile(r"""
 | 
| 
 | 
  1921           <
 | 
| 
 | 
  1922            (?:mailto:)?
 | 
| 
 | 
  1923           (
 | 
| 
 | 
  1924               [-.\w]+
 | 
| 
 | 
  1925               \@
 | 
| 
 | 
  1926               [-\w]+(\.[-\w]+)*\.[a-z]+
 | 
| 
 | 
  1927           )
 | 
| 
 | 
  1928           >
 | 
| 
 | 
  1929         """, re.I | re.X | re.U)
 | 
| 
 | 
  1930     def _auto_email_link_sub(self, match):
 | 
| 
 | 
  1931         return self._encode_email_address(
 | 
| 
 | 
  1932             self._unescape_special_chars(match.group(1)))
 | 
| 
 | 
  1933 
 | 
| 
 | 
  1934     def _do_auto_links(self, text):
 | 
| 
 | 
  1935         text = self._auto_link_re.sub(self._auto_link_sub, text)
 | 
| 
 | 
  1936         text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
 | 
| 
 | 
  1937         return text
 | 
| 
 | 
  1938 
 | 
| 
 | 
  1939     def _encode_email_address(self, addr):
 | 
| 
 | 
  1940         #  Input: an email address, e.g. "foo@example.com"
 | 
| 
 | 
  1941         #
 | 
| 
 | 
  1942         #  Output: the email address as a mailto link, with each character
 | 
| 
 | 
  1943         #      of the address encoded as either a decimal or hex entity, in
 | 
| 
 | 
  1944         #      the hopes of foiling most address harvesting spam bots. E.g.:
 | 
| 
 | 
  1945         #
 | 
| 
 | 
  1946         #    <a href="mailto:foo@e
 | 
| 
 | 
  1947         #       xample.com">foo
 | 
| 
 | 
  1948         #       @example.com</a>
 | 
| 
 | 
  1949         #
 | 
| 
 | 
  1950         #  Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
 | 
| 
 | 
  1951         #  mailing list: <http://tinyurl.com/yu7ue>
 | 
| 
 | 
  1952         chars = [_xml_encode_email_char_at_random(ch)
 | 
| 
 | 
  1953                  for ch in "mailto:" + addr]
 | 
| 
 | 
  1954         # Strip the mailto: from the visible part.
 | 
| 
 | 
  1955         addr = '<a href="%s">%s</a>' \
 | 
| 
 | 
  1956                % (''.join(chars), ''.join(chars[7:]))
 | 
| 
 | 
  1957         return addr
 | 
| 
 | 
  1958 
 | 
| 
 | 
  1959     def _do_link_patterns(self, text):
 | 
| 
 | 
  1960         """Caveat emptor: there isn't much guarding against link
 | 
| 
 | 
  1961         patterns being formed inside other standard Markdown links, e.g.
 | 
| 
 | 
  1962         inside a [link def][like this].
 | 
| 
 | 
  1963 
 | 
| 
 | 
  1964         Dev Notes: *Could* consider prefixing regexes with a negative
 | 
| 
 | 
  1965         lookbehind assertion to attempt to guard against this.
 | 
| 
 | 
  1966         """
 | 
| 
 | 
  1967         link_from_hash = {}
 | 
| 
 | 
  1968         for regex, repl in self.link_patterns:
 | 
| 
 | 
  1969             replacements = []
 | 
| 
 | 
  1970             for match in regex.finditer(text):
 | 
| 
 | 
  1971                 if hasattr(repl, "__call__"):
 | 
| 
 | 
  1972                     href = repl(match)
 | 
| 
 | 
  1973                 else:
 | 
| 
 | 
  1974                     href = match.expand(repl)
 | 
| 
 | 
  1975                 replacements.append((match.span(), href))
 | 
| 
 | 
  1976             for (start, end), href in reversed(replacements):
 | 
| 
 | 
  1977                 escaped_href = (
 | 
| 
 | 
  1978                     href.replace('"', '"')  # b/c of attr quote
 | 
| 
 | 
  1979                         # To avoid markdown <em> and <strong>:
 | 
| 
 | 
  1980                         .replace('*', self._escape_table['*'])
 | 
| 
 | 
  1981                         .replace('_', self._escape_table['_']))
 | 
| 
 | 
  1982                 link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
 | 
| 
 | 
  1983                 hash = _hash_text(link)
 | 
| 
 | 
  1984                 link_from_hash[hash] = link
 | 
| 
 | 
  1985                 text = text[:start] + hash + text[end:]
 | 
| 
 | 
  1986         for hash, link in list(link_from_hash.items()):
 | 
| 
 | 
  1987             text = text.replace(hash, link)
 | 
| 
 | 
  1988         return text
 | 
| 
 | 
  1989 
 | 
| 
 | 
  1990     def _unescape_special_chars(self, text):
 | 
| 
 | 
  1991         # Swap back in all the special characters we've hidden.
 | 
| 
 | 
  1992         for ch, hash in list(self._escape_table.items()):
 | 
| 
 | 
  1993             text = text.replace(hash, ch)
 | 
| 
 | 
  1994         return text
 | 
| 
 | 
  1995 
 | 
| 
 | 
  1996     def _outdent(self, text):
 | 
| 
 | 
  1997         # Remove one level of line-leading tabs or spaces
 | 
| 
 | 
  1998         return self._outdent_re.sub('', text)
 | 
| 
 | 
  1999 
 | 
| 
 | 
  2000 
 | 
| 
 | 
class MarkdownWithExtras(Markdown):
    """A markdowner class that enables most extras:

    - footnotes
    - code-color (only has effect if 'pygments' Python module on path)

    These are not included:
    - pyshell (specific to Python-related documenting)
    - code-friendly (because it *disables* part of the syntax)
    - link-patterns (because you need to specify some actual
      link-patterns anyway)
    """
    # Names of the extras this subclass switches on.
    extras = ["footnotes", "code-color"]
 | 
| 
 | 
  2014 
 | 
| 
 | 
  2015 
 | 
| 
 | 
  2016 #---- internal support functions
 | 
| 
 | 
  2017 
 | 
| 
 | 
  2018 class UnicodeWithAttrs(unicode):
 | 
| 
 | 
  2019     """A subclass of unicode used for the return value of conversion to
 | 
| 
 | 
  2020     possibly attach some attributes. E.g. the "toc_html" attribute when
 | 
| 
 | 
  2021     the "toc" extra is used.
 | 
| 
 | 
  2022     """
 | 
| 
 | 
  2023     metadata = None
 | 
| 
 | 
  2024     _toc = None
 | 
| 
 | 
  2025     def toc_html(self):
 | 
| 
 | 
  2026         """Return the HTML for the current TOC.
 | 
| 
 | 
  2027 
 | 
| 
 | 
  2028         This expects the `_toc` attribute to have been set on this instance.
 | 
| 
 | 
  2029         """
 | 
| 
 | 
  2030         if self._toc is None:
 | 
| 
 | 
  2031             return None
 | 
| 
 | 
  2032 
 | 
| 
 | 
  2033         def indent():
 | 
| 
 | 
  2034             return '  ' * (len(h_stack) - 1)
 | 
| 
 | 
  2035         lines = []
 | 
| 
 | 
  2036         h_stack = [0]   # stack of header-level numbers
 | 
| 
 | 
  2037         for level, id, name in self._toc:
 | 
| 
 | 
  2038             if level > h_stack[-1]:
 | 
| 
 | 
  2039                 lines.append("%s<ul>" % indent())
 | 
| 
 | 
  2040                 h_stack.append(level)
 | 
| 
 | 
  2041             elif level == h_stack[-1]:
 | 
| 
 | 
  2042                 lines[-1] += "</li>"
 | 
| 
 | 
  2043             else:
 | 
| 
 | 
  2044                 while level < h_stack[-1]:
 | 
| 
 | 
  2045                     h_stack.pop()
 | 
| 
 | 
  2046                     if not lines[-1].endswith("</li>"):
 | 
| 
 | 
  2047                         lines[-1] += "</li>"
 | 
| 
 | 
  2048                     lines.append("%s</ul></li>" % indent())
 | 
| 
 | 
  2049             lines.append('%s<li><a href="#%s">%s</a>' % (
 | 
| 
 | 
  2050                 indent(), id, name))
 | 
| 
 | 
  2051         while len(h_stack) > 1:
 | 
| 
 | 
  2052             h_stack.pop()
 | 
| 
 | 
  2053             if not lines[-1].endswith("</li>"):
 | 
| 
 | 
  2054                 lines[-1] += "</li>"
 | 
| 
 | 
  2055             lines.append("%s</ul>" % indent())
 | 
| 
 | 
  2056         return '\n'.join(lines) + '\n'
 | 
| 
 | 
  2057     toc_html = property(toc_html)
 | 
| 
 | 
  2058 
 | 
| 
 | 
  2059 ## {{{ http://code.activestate.com/recipes/577257/ (r1)
 | 
| 
 | 
  2060 _slugify_strip_re = re.compile(r'[^\w\s-]')
 | 
| 
 | 
  2061 _slugify_hyphenate_re = re.compile(r'[-\s]+')
 | 
| 
 | 
  2062 def _slugify(value):
 | 
| 
 | 
  2063     """
 | 
| 
 | 
  2064     Normalizes string, converts to lowercase, removes non-alpha characters,
 | 
| 
 | 
  2065     and converts spaces to hyphens.
 | 
| 
 | 
  2066 
 | 
| 
 | 
  2067     From Django's "django/template/defaultfilters.py".
 | 
| 
 | 
  2068     """
 | 
| 
 | 
  2069     import unicodedata
 | 
| 
 | 
  2070     value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
 | 
| 
 | 
  2071     value = _slugify_strip_re.sub('', value).strip().lower()
 | 
| 
 | 
  2072     return _slugify_hyphenate_re.sub('-', value)
 | 
| 
 | 
  2073 ## end of http://code.activestate.com/recipes/577257/ }}}
 | 
| 
 | 
  2074 
 | 
| 
 | 
  2075 
 | 
| 
 | 
  2076 # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
 | 
| 
 | 
  2077 def _curry(*args, **kwargs):
 | 
| 
 | 
  2078     function, args = args[0], args[1:]
 | 
| 
 | 
  2079     def result(*rest, **kwrest):
 | 
| 
 | 
  2080         combined = kwargs.copy()
 | 
| 
 | 
  2081         combined.update(kwrest)
 | 
| 
 | 
  2082         return function(*args + rest, **combined)
 | 
| 
 | 
  2083     return result
 | 
| 
 | 
  2084 
 | 
| 
 | 
  2085 # Recipe: regex_from_encoded_pattern (1.0)
 | 
| 
 | 
  2086 def _regex_from_encoded_pattern(s):
 | 
| 
 | 
  2087     """'foo'    -> re.compile(re.escape('foo'))
 | 
| 
 | 
  2088        '/foo/'  -> re.compile('foo')
 | 
| 
 | 
  2089        '/foo/i' -> re.compile('foo', re.I)
 | 
| 
 | 
  2090     """
 | 
| 
 | 
  2091     if s.startswith('/') and s.rfind('/') != 0:
 | 
| 
 | 
  2092         # Parse it: /PATTERN/FLAGS
 | 
| 
 | 
  2093         idx = s.rfind('/')
 | 
| 
 | 
  2094         pattern, flags_str = s[1:idx], s[idx+1:]
 | 
| 
 | 
  2095         flag_from_char = {
 | 
| 
 | 
  2096             "i": re.IGNORECASE,
 | 
| 
 | 
  2097             "l": re.LOCALE,
 | 
| 
 | 
  2098             "s": re.DOTALL,
 | 
| 
 | 
  2099             "m": re.MULTILINE,
 | 
| 
 | 
  2100             "u": re.UNICODE,
 | 
| 
 | 
  2101         }
 | 
| 
 | 
  2102         flags = 0
 | 
| 
 | 
  2103         for char in flags_str:
 | 
| 
 | 
  2104             try:
 | 
| 
 | 
  2105                 flags |= flag_from_char[char]
 | 
| 
 | 
  2106             except KeyError:
 | 
| 
 | 
  2107                 raise ValueError("unsupported regex flag: '%s' in '%s' "
 | 
| 
 | 
  2108                                  "(must be one of '%s')"
 | 
| 
 | 
  2109                                  % (char, s, ''.join(list(flag_from_char.keys()))))
 | 
| 
 | 
  2110         return re.compile(s[1:idx], flags)
 | 
| 
 | 
  2111     else: # not an encoded regex
 | 
| 
 | 
  2112         return re.compile(re.escape(s))
 | 
| 
 | 
  2113 
 | 
| 
 | 
  2114 # Recipe: dedent (0.1.2)
 | 
| 
 | 
  2115 def _dedentlines(lines, tabsize=8, skip_first_line=False):
 | 
| 
 | 
  2116     """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
 | 
| 
 | 
  2117 
 | 
| 
 | 
  2118         "lines" is a list of lines to dedent.
 | 
| 
 | 
  2119         "tabsize" is the tab width to use for indent width calculations.
 | 
| 
 | 
  2120         "skip_first_line" is a boolean indicating if the first line should
 | 
| 
 | 
  2121             be skipped for calculating the indent width and for dedenting.
 | 
| 
 | 
  2122             This is sometimes useful for docstrings and similar.
 | 
| 
 | 
  2123 
 | 
| 
 | 
  2124     Same as dedent() except operates on a sequence of lines. Note: the
 | 
| 
 | 
  2125     lines list is modified **in-place**.
 | 
| 
 | 
  2126     """
 | 
| 
 | 
  2127     DEBUG = False
 | 
| 
 | 
  2128     if DEBUG:
 | 
| 
 | 
  2129         print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
 | 
| 
 | 
  2130               % (tabsize, skip_first_line))
 | 
| 
 | 
  2131     indents = []
 | 
| 
 | 
  2132     margin = None
 | 
| 
 | 
  2133     for i, line in enumerate(lines):
 | 
| 
 | 
  2134         if i == 0 and skip_first_line: continue
 | 
| 
 | 
  2135         indent = 0
 | 
| 
 | 
  2136         for ch in line:
 | 
| 
 | 
  2137             if ch == ' ':
 | 
| 
 | 
  2138                 indent += 1
 | 
| 
 | 
  2139             elif ch == '\t':
 | 
| 
 | 
  2140                 indent += tabsize - (indent % tabsize)
 | 
| 
 | 
  2141             elif ch in '\r\n':
 | 
| 
 | 
  2142                 continue # skip all-whitespace lines
 | 
| 
 | 
  2143             else:
 | 
| 
 | 
  2144                 break
 | 
| 
 | 
  2145         else:
 | 
| 
 | 
  2146             continue # skip all-whitespace lines
 | 
| 
 | 
  2147         if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
 | 
| 
 | 
  2148         if margin is None:
 | 
| 
 | 
  2149             margin = indent
 | 
| 
 | 
  2150         else:
 | 
| 
 | 
  2151             margin = min(margin, indent)
 | 
| 
 | 
  2152     if DEBUG: print("dedent: margin=%r" % margin)
 | 
| 
 | 
  2153 
 | 
| 
 | 
  2154     if margin is not None and margin > 0:
 | 
| 
 | 
  2155         for i, line in enumerate(lines):
 | 
| 
 | 
  2156             if i == 0 and skip_first_line: continue
 | 
| 
 | 
  2157             removed = 0
 | 
| 
 | 
  2158             for j, ch in enumerate(line):
 | 
| 
 | 
  2159                 if ch == ' ':
 | 
| 
 | 
  2160                     removed += 1
 | 
| 
 | 
  2161                 elif ch == '\t':
 | 
| 
 | 
  2162                     removed += tabsize - (removed % tabsize)
 | 
| 
 | 
  2163                 elif ch in '\r\n':
 | 
| 
 | 
  2164                     if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
 | 
| 
 | 
  2165                     lines[i] = lines[i][j:]
 | 
| 
 | 
  2166                     break
 | 
| 
 | 
  2167                 else:
 | 
| 
 | 
  2168                     raise ValueError("unexpected non-whitespace char %r in "
 | 
| 
 | 
  2169                                      "line %r while removing %d-space margin"
 | 
| 
 | 
  2170                                      % (ch, line, margin))
 | 
| 
 | 
  2171                 if DEBUG:
 | 
| 
 | 
  2172                     print("dedent: %r: %r -> removed %d/%d"\
 | 
| 
 | 
  2173                           % (line, ch, removed, margin))
 | 
| 
 | 
  2174                 if removed == margin:
 | 
| 
 | 
  2175                     lines[i] = lines[i][j+1:]
 | 
| 
 | 
  2176                     break
 | 
| 
 | 
  2177                 elif removed > margin:
 | 
| 
 | 
  2178                     lines[i] = ' '*(removed-margin) + lines[i][j+1:]
 | 
| 
 | 
  2179                     break
 | 
| 
 | 
  2180             else:
 | 
| 
 | 
  2181                 if removed:
 | 
| 
 | 
  2182                     lines[i] = lines[i][removed:]
 | 
| 
 | 
  2183     return lines
 | 
| 
 | 
  2184 
 | 
| 
 | 
  2185 def _dedent(text, tabsize=8, skip_first_line=False):
 | 
| 
 | 
  2186     """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
 | 
| 
 | 
  2187 
 | 
| 
 | 
  2188         "text" is the text to dedent.
 | 
| 
 | 
  2189         "tabsize" is the tab width to use for indent width calculations.
 | 
| 
 | 
  2190         "skip_first_line" is a boolean indicating if the first line should
 | 
| 
 | 
  2191             be skipped for calculating the indent width and for dedenting.
 | 
| 
 | 
  2192             This is sometimes useful for docstrings and similar.
 | 
| 
 | 
  2193 
 | 
| 
 | 
  2194     textwrap.dedent(s), but don't expand tabs to spaces
 | 
| 
 | 
  2195     """
 | 
| 
 | 
  2196     lines = text.splitlines(1)
 | 
| 
 | 
  2197     _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
 | 
| 
 | 
  2198     return ''.join(lines)
 | 
| 
 | 
  2199 
 | 
| 
 | 
  2200 
 | 
| 
 | 
  2201 class _memoized(object):
 | 
| 
 | 
  2202    """Decorator that caches a function's return value each time it is called.
 | 
| 
 | 
  2203    If called later with the same arguments, the cached value is returned, and
 | 
| 
 | 
  2204    not re-evaluated.
 | 
| 
 | 
  2205 
 | 
| 
 | 
  2206    http://wiki.python.org/moin/PythonDecoratorLibrary
 | 
| 
 | 
  2207    """
 | 
| 
 | 
  2208    def __init__(self, func):
 | 
| 
 | 
  2209       self.func = func
 | 
| 
 | 
  2210       self.cache = {}
 | 
| 
 | 
  2211    def __call__(self, *args):
 | 
| 
 | 
  2212       try:
 | 
| 
 | 
  2213          return self.cache[args]
 | 
| 
 | 
  2214       except KeyError:
 | 
| 
 | 
  2215          self.cache[args] = value = self.func(*args)
 | 
| 
 | 
  2216          return value
 | 
| 
 | 
  2217       except TypeError:
 | 
| 
 | 
  2218          # uncachable -- for instance, passing a list as an argument.
 | 
| 
 | 
  2219          # Better to not cache than to blow up entirely.
 | 
| 
 | 
  2220          return self.func(*args)
 | 
| 
 | 
  2221    def __repr__(self):
 | 
| 
 | 
  2222       """Return the function's docstring."""
 | 
| 
 | 
  2223       return self.func.__doc__
 | 
| 
 | 
  2224 
 | 
| 
 | 
  2225 
 | 
| 
 | 
  2226 def _xml_oneliner_re_from_tab_width(tab_width):
 | 
| 
 | 
  2227     """Standalone XML processing instruction regex."""
 | 
| 
 | 
  2228     return re.compile(r"""
 | 
| 
 | 
  2229         (?:
 | 
| 
 | 
  2230             (?<=\n\n)       # Starting after a blank line
 | 
| 
 | 
  2231             |               # or
 | 
| 
 | 
  2232             \A\n?           # the beginning of the doc
 | 
| 
 | 
  2233         )
 | 
| 
 | 
  2234         (                           # save in $1
 | 
| 
 | 
  2235             [ ]{0,%d}
 | 
| 
 | 
  2236             (?:
 | 
| 
 | 
  2237                 <\?\w+\b\s+.*?\?>   # XML processing instruction
 | 
| 
 | 
  2238                 |
 | 
| 
 | 
  2239                 <\w+:\w+\b\s+.*?/>  # namespaced single tag
 | 
| 
 | 
  2240             )
 | 
| 
 | 
  2241             [ \t]*
 | 
| 
 | 
  2242             (?=\n{2,}|\Z)       # followed by a blank line or end of document
 | 
| 
 | 
  2243         )
 | 
| 
 | 
  2244         """ % (tab_width - 1), re.X)
 | 
| 
 | 
# Rebind the name to a `_memoized` wrapper so the regex is compiled only
# once per distinct tab width and reused on later calls.
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
 | 
| 
 | 
  2246 
 | 
| 
 | 
  2247 def _hr_tag_re_from_tab_width(tab_width):
 | 
| 
 | 
  2248      return re.compile(r"""
 | 
| 
 | 
  2249         (?:
 | 
| 
 | 
  2250             (?<=\n\n)       # Starting after a blank line
 | 
| 
 | 
  2251             |               # or
 | 
| 
 | 
  2252             \A\n?           # the beginning of the doc
 | 
| 
 | 
  2253         )
 | 
| 
 | 
  2254         (                       # save in \1
 | 
| 
 | 
  2255             [ ]{0,%d}
 | 
| 
 | 
  2256             <(hr)               # start tag = \2
 | 
| 
 | 
  2257             \b                  # word break
 | 
| 
 | 
  2258             ([^<>])*?           #
 | 
| 
 | 
  2259             /?>                 # the matching end tag
 | 
| 
 | 
  2260             [ \t]*
 | 
| 
 | 
  2261             (?=\n{2,}|\Z)       # followed by a blank line or end of document
 | 
| 
 | 
  2262         )
 | 
| 
 | 
  2263         """ % (tab_width - 1), re.X)
 | 
| 
 | 
# Rebind the name to a `_memoized` wrapper so the regex is compiled only
# once per distinct tab width and reused on later calls.
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
 | 
| 
 | 
  2265 
 | 
| 
 | 
  2266 
 | 
| 
 | 
  2267 def _xml_escape_attr(attr, skip_single_quote=True):
 | 
| 
 | 
  2268     """Escape the given string for use in an HTML/XML tag attribute.
 | 
| 
 | 
  2269 
 | 
| 
 | 
  2270     By default this doesn't bother with escaping `'` to `'`, presuming that
 | 
| 
 | 
  2271     the tag attribute is surrounded by double quotes.
 | 
| 
 | 
  2272     """
 | 
| 
 | 
  2273     escaped = (attr
 | 
| 
 | 
  2274         .replace('&', '&')
 | 
| 
 | 
  2275         .replace('"', '"')
 | 
| 
 | 
  2276         .replace('<', '<')
 | 
| 
 | 
  2277         .replace('>', '>'))
 | 
| 
 | 
  2278     if not skip_single_quote:
 | 
| 
 | 
  2279         escaped = escaped.replace("'", "'")
 | 
| 
 | 
  2280     return escaped
 | 
| 
 | 
  2281 
 | 
| 
 | 
  2282 
 | 
| 
 | 
  2283 def _xml_encode_email_char_at_random(ch):
 | 
| 
 | 
  2284     r = random()
 | 
| 
 | 
  2285     # Roughly 10% raw, 45% hex, 45% dec.
 | 
| 
 | 
  2286     # '@' *must* be encoded. I [John Gruber] insist.
 | 
| 
 | 
  2287     # Issue 26: '_' must be encoded.
 | 
| 
 | 
  2288     if r > 0.9 and ch not in "@_":
 | 
| 
 | 
  2289         return ch
 | 
| 
 | 
  2290     elif r < 0.45:
 | 
| 
 | 
  2291         # The [1:] is to drop leading '0': 0x63 -> x63
 | 
| 
 | 
  2292         return '&#%s;' % hex(ord(ch))[1:]
 | 
| 
 | 
  2293     else:
 | 
| 
 | 
  2294         return '&#%s;' % ord(ch)
 | 
| 
 | 
  2295 
 | 
| 
 | 
  2296 
 | 
| 
 | 
  2297 
 | 
| 
 | 
  2298 #---- mainline
 | 
| 
 | 
  2299 
 | 
| 
 | 
  2300 class _NoReflowFormatter(optparse.IndentedHelpFormatter):
 | 
| 
 | 
  2301     """An optparse formatter that does NOT reflow the description."""
 | 
| 
 | 
  2302     def format_description(self, description):
 | 
| 
 | 
  2303         return description or ""
 | 
| 
 | 
  2304 
 | 
| 
 | 
  2305 def _test():
 | 
| 
 | 
  2306     import doctest
 | 
| 
 | 
  2307     doctest.testmod()
 | 
| 
 | 
  2308 
 | 
| 
 | 
  2309 def main(argv=None):
 | 
| 
 | 
  2310     if argv is None:
 | 
| 
 | 
  2311         argv = sys.argv
 | 
| 
 | 
  2312     if not logging.root.handlers:
 | 
| 
 | 
  2313         logging.basicConfig()
 | 
| 
 | 
  2314 
 | 
| 
 | 
  2315     usage = "usage: %prog [PATHS...]"
 | 
| 
 | 
  2316     version = "%prog "+__version__
 | 
| 
 | 
  2317     parser = optparse.OptionParser(prog="markdown2", usage=usage,
 | 
| 
 | 
  2318         version=version, description=cmdln_desc,
 | 
| 
 | 
  2319         formatter=_NoReflowFormatter())
 | 
| 
 | 
  2320     parser.add_option("-v", "--verbose", dest="log_level",
 | 
| 
 | 
  2321                       action="store_const", const=logging.DEBUG,
 | 
| 
 | 
  2322                       help="more verbose output")
 | 
| 
 | 
  2323     parser.add_option("--encoding",
 | 
| 
 | 
  2324                       help="specify encoding of text content")
 | 
| 
 | 
  2325     parser.add_option("--html4tags", action="store_true", default=False,
 | 
| 
 | 
  2326                       help="use HTML 4 style for empty element tags")
 | 
| 
 | 
  2327     parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
 | 
| 
 | 
  2328                       help="sanitize literal HTML: 'escape' escapes "
 | 
| 
 | 
  2329                            "HTML meta chars, 'replace' replaces with an "
 | 
| 
 | 
  2330                            "[HTML_REMOVED] note")
 | 
| 
 | 
  2331     parser.add_option("-x", "--extras", action="append",
 | 
| 
 | 
  2332                       help="Turn on specific extra features (not part of "
 | 
| 
 | 
  2333                            "the core Markdown spec). See above.")
 | 
| 
 | 
  2334     parser.add_option("--use-file-vars",
 | 
| 
 | 
  2335                       help="Look for and use Emacs-style 'markdown-extras' "
 | 
| 
 | 
  2336                            "file var to turn on extras. See "
 | 
| 
 | 
  2337                            "<https://github.com/trentm/python-markdown2/wiki/Extras>")
 | 
| 
 | 
  2338     parser.add_option("--link-patterns-file",
 | 
| 
 | 
  2339                       help="path to a link pattern file")
 | 
| 
 | 
  2340     parser.add_option("--self-test", action="store_true",
 | 
| 
 | 
  2341                       help="run internal self-tests (some doctests)")
 | 
| 
 | 
  2342     parser.add_option("--compare", action="store_true",
 | 
| 
 | 
  2343                       help="run against Markdown.pl as well (for testing)")
 | 
| 
 | 
  2344     parser.set_defaults(log_level=logging.INFO, compare=False,
 | 
| 
 | 
  2345                         encoding="utf-8", safe_mode=None, use_file_vars=False)
 | 
| 
 | 
  2346     opts, paths = parser.parse_args()
 | 
| 
 | 
  2347     log.setLevel(opts.log_level)
 | 
| 
 | 
  2348 
 | 
| 
 | 
  2349     if opts.self_test:
 | 
| 
 | 
  2350         return _test()
 | 
| 
 | 
  2351 
 | 
| 
 | 
  2352     if opts.extras:
 | 
| 
 | 
  2353         extras = {}
 | 
| 
 | 
  2354         for s in opts.extras:
 | 
| 
 | 
  2355             splitter = re.compile("[,;: ]+")
 | 
| 
 | 
  2356             for e in splitter.split(s):
 | 
| 
 | 
  2357                 if '=' in e:
 | 
| 
 | 
  2358                     ename, earg = e.split('=', 1)
 | 
| 
 | 
  2359                     try:
 | 
| 
 | 
  2360                         earg = int(earg)
 | 
| 
 | 
  2361                     except ValueError:
 | 
| 
 | 
  2362                         pass
 | 
| 
 | 
  2363                 else:
 | 
| 
 | 
  2364                     ename, earg = e, None
 | 
| 
 | 
  2365                 extras[ename] = earg
 | 
| 
 | 
  2366     else:
 | 
| 
 | 
  2367         extras = None
 | 
| 
 | 
  2368 
 | 
| 
 | 
  2369     if opts.link_patterns_file:
 | 
| 
 | 
  2370         link_patterns = []
 | 
| 
 | 
  2371         f = open(opts.link_patterns_file)
 | 
| 
 | 
  2372         try:
 | 
| 
 | 
  2373             for i, line in enumerate(f.readlines()):
 | 
| 
 | 
  2374                 if not line.strip(): continue
 | 
| 
 | 
  2375                 if line.lstrip().startswith("#"): continue
 | 
| 
 | 
  2376                 try:
 | 
| 
 | 
  2377                     pat, href = line.rstrip().rsplit(None, 1)
 | 
| 
 | 
  2378                 except ValueError:
 | 
| 
 | 
  2379                     raise MarkdownError("%s:%d: invalid link pattern line: %r"
 | 
| 
 | 
  2380                                         % (opts.link_patterns_file, i+1, line))
 | 
| 
 | 
  2381                 link_patterns.append(
 | 
| 
 | 
  2382                     (_regex_from_encoded_pattern(pat), href))
 | 
| 
 | 
  2383         finally:
 | 
| 
 | 
  2384             f.close()
 | 
| 
 | 
  2385     else:
 | 
| 
 | 
  2386         link_patterns = None
 | 
| 
 | 
  2387 
 | 
| 
 | 
  2388     from os.path import join, dirname, abspath, exists
 | 
| 
 | 
  2389     markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
 | 
| 
 | 
  2390                        "Markdown.pl")
 | 
| 
 | 
  2391     if not paths:
 | 
| 
 | 
  2392         paths = ['-']
 | 
| 
 | 
  2393     for path in paths:
 | 
| 
 | 
  2394         if path == '-':
 | 
| 
 | 
  2395             text = sys.stdin.read()
 | 
| 
 | 
  2396         else:
 | 
| 
 | 
  2397             fp = codecs.open(path, 'r', opts.encoding)
 | 
| 
 | 
  2398             text = fp.read()
 | 
| 
 | 
  2399             fp.close()
 | 
| 
 | 
  2400         if opts.compare:
 | 
| 
 | 
  2401             from subprocess import Popen, PIPE
 | 
| 
 | 
  2402             print("==== Markdown.pl ====")
 | 
| 
 | 
  2403             p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
 | 
| 
 | 
  2404             p.stdin.write(text.encode('utf-8'))
 | 
| 
 | 
  2405             p.stdin.close()
 | 
| 
 | 
  2406             perl_html = p.stdout.read().decode('utf-8')
 | 
| 
 | 
  2407             if py3:
 | 
| 
 | 
  2408                 sys.stdout.write(perl_html)
 | 
| 
 | 
  2409             else:
 | 
| 
 | 
  2410                 sys.stdout.write(perl_html.encode(
 | 
| 
 | 
  2411                     sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
 | 
| 
 | 
  2412             print("==== markdown2.py ====")
 | 
| 
 | 
  2413         html = markdown(text,
 | 
| 
 | 
  2414             html4tags=opts.html4tags,
 | 
| 
 | 
  2415             safe_mode=opts.safe_mode,
 | 
| 
 | 
  2416             extras=extras, link_patterns=link_patterns,
 | 
| 
 | 
  2417             use_file_vars=opts.use_file_vars)
 | 
| 
 | 
  2418         if py3:
 | 
| 
 | 
  2419             sys.stdout.write(html)
 | 
| 
 | 
  2420         else:
 | 
| 
 | 
  2421             sys.stdout.write(html.encode(
 | 
| 
 | 
  2422                 sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
 | 
| 
 | 
  2423         if extras and "toc" in extras:
 | 
| 
 | 
  2424             log.debug("toc_html: " +
 | 
| 
 | 
  2425                 html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
 | 
| 
 | 
  2426         if opts.compare:
 | 
| 
 | 
  2427             test_dir = join(dirname(dirname(abspath(__file__))), "test")
 | 
| 
 | 
  2428             if exists(join(test_dir, "test_markdown2.py")):
 | 
| 
 | 
  2429                 sys.path.insert(0, test_dir)
 | 
| 
 | 
  2430                 from test_markdown2 import norm_html_from_html
 | 
| 
 | 
  2431                 norm_html = norm_html_from_html(html)
 | 
| 
 | 
  2432                 norm_perl_html = norm_html_from_html(perl_html)
 | 
| 
 | 
  2433             else:
 | 
| 
 | 
  2434                 norm_html = html
 | 
| 
 | 
  2435                 norm_perl_html = perl_html
 | 
| 
 | 
  2436             print("==== match? %r ====" % (norm_perl_html == norm_html))
 | 
| 
 | 
  2437 
 | 
| 
 | 
  2438 
 | 
| 
 | 
  2439 if __name__ == "__main__":
 | 
| 
 | 
  2440     sys.exit( main(sys.argv) )
 |