From a8bb59e37af177dff070d5d27a43eea1d5d34ed9 Mon Sep 17 00:00:00 2001
From: Waylan Limberg
Date: Fri, 9 Feb 2024 11:19:02 -0500
Subject: [PATCH 01/17] Refactor TOC sanitation

- All postprocessors are run on heading content (not just
  `RawHtmlPostprocessor`).
- Footnote references are stripped from heading content. Fixes #660.
- A more robust `striptags` is provided to convert headings to plain text.
  Unlike markupsafe's implementation, HTML entities are not unescaped.
- Both the plain text `name` and rich `html` are saved to `toc_tokens`,
  which means users can now access the full rich text content of the
  headings directly from the `toc_tokens`.
- `data-toc-label` is sanitized separately from heading content.
- An `html.unescape` call is added to `slugify` and `slugify_unicode`,
  which ensures `slugify` operates on Unicode characters rather than HTML
  entities. By including the call in the functions, users can override
  them with their own slugify functions if they desire.

Note that this first commit includes only minimal changes to the tests, to
show that there is very little change in behavior (mostly the new `html`
attribute of the `toc_tokens` was added). A refactoring of the tests will
follow in a separate commit.
---
 markdown/extensions/toc.py                  | 133 +++++++++++++-------
 tests/test_extensions.py                    |  39 +++---
 tests/test_syntax/extensions/test_smarty.py |   1 +
 tests/test_syntax/extensions/test_toc.py    |  77 ++++++++++++
 4 files changed, 190 insertions(+), 60 deletions(-)

diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
index a17d7241c..e7cde03e7 100644
--- a/markdown/extensions/toc.py
+++ b/markdown/extensions/toc.py
@@ -21,11 +21,14 @@
 from . import Extension
 from ..treeprocessors import Treeprocessor
-from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString
+from ..util import parseBoolValue, AMP_SUBSTITUTE
 from ..treeprocessors import UnescapeTreeprocessor
+from ..serializers import RE_AMP
 import re
 import html
 import unicodedata
+from copy import deepcopy
+from html import unescape as html_unescape
 import xml.etree.ElementTree as etree
 from typing import TYPE_CHECKING, Any, Iterator, MutableSet
@@ -35,6 +38,8 @@
 
 def slugify(value: str, separator: str, unicode: bool = False) -> str:
     """ Slugify a string, to make it URL friendly. """
+    # First convert HTML entities to Unicode characters
+    value = html_unescape(value)
     if not unicode:
         # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty`
         value = unicodedata.normalize('NFKD', value)
@@ -63,41 +68,81 @@ def unique(id: str, ids: MutableSet[str]) -> str:
     return id
 
 
-def get_name(el: etree.Element) -> str:
-    """Get title name."""
-
-    text = []
-    for c in el.itertext():
-        if isinstance(c, AtomicString):
-            text.append(html.unescape(c))
-        else:
-            text.append(c)
-    return ''.join(text).strip()
-
-
-def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
-    """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """
-    def _html_sub(m: re.Match[str]) -> str:
-        """ Substitute raw html with plain text. """
-        try:
-            raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))]
-        except (IndexError, TypeError):  # pragma: no cover
-            return m.group(0)
-        # Strip out tags and/or entities - leaving text
-        res = re.sub(r'(<[^>]+>)', '', raw)
-        if strip_entities:
-            res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res)
-        return res
-
-    return HTML_PLACEHOLDER_RE.sub(_html_sub, text)
-
-
-def unescape(text: str) -> str:
-    """ Unescape escaped text. """
""" +def md_unescape(text: str) -> str: + """ Unescape Markdown backslash escaped text. """ c = UnescapeTreeprocessor() return c.unescape(text) +def strip_tags(text: str) -> str: + """ Strip HTML tags and return plain text. Note: HTML entities are unaffected. """ + # A comment could contain a tag, so strip comments first + while (start := text.find('', start)) != -1: + text = f'{text[:start]}{text[end + 3:]}' + + while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1: + text = f'{text[:start]}{text[end + 1:]}' + + # Collapse whitespace + text = ' '.join(text.split()) + return text + + +def escape_cdata(text: str) -> str: + """ Escape character data. """ + if "&" in text: + # Only replace & when not part of an entity + text = RE_AMP.sub('&', text) + if "<" in text: + text = text.replace("<", "<") + if ">" in text: + text = text.replace(">", ">") + return text + + +def run_postprocessors(text: str, md: Markdown) -> str: + """ Run postprocessors from Markdown instance on text. """ + for pp in md.postprocessors: + text = pp.run(text) + return text.strip() + + +def render_inner_html(el: etree.Element, md: Markdown) -> str: + """ Fully render inner html of an etree element as a string. """ + # The UnescapeTreeprocessor runs after TOC so run here. + text = md_unescape(md.serializer(el)) + + # strip parent tag + start = text.index('>') + 1 + end = text.rindex('<') + text = text[start:end].strip() + + return run_postprocessors(text, md) + + +def copy_element(el: etree.Element, exclude_fnrefs=True) -> etree.Element: + """ Return a deep copy of an etree element, optionally with footnote references removed. """ + el = deepcopy(el) + # Remove footnote references, which look like this: `...`. + if exclude_fnrefs: + for sup in el.findall('sup'): + id = sup.get('id', '') + if id.startswith('fnref'): + # We have a footnote reference. Remove it. + parent = el.find(f'.//sup[@id="{id}"]..') + if sup.tail: + # Preserve the tail text + siblings = list(parent) + pos = siblings.index(sup) + if pos == 0: + parent.text = f'{parent.text or ""}{sup.tail}' + else: + sibling = siblings[pos - 1] + sibling.tail = f'{sibling.tail or ""}{sup.tail}' + parent.remove(sup) + return el + + def nest_toc_tokens(toc_list): """Given an unsorted list with errors and skips, return a nested one. 
@@ -300,27 +345,29 @@ def run(self, doc: etree.Element) -> None: for el in doc.iter(): if isinstance(el.tag, str) and self.header_rgx.match(el.tag): self.set_level(el) - text = get_name(el) + html = render_inner_html(copy_element(el), self.md) + text = strip_tags(html) # Do not override pre-existing ids if "id" not in el.attrib: - innertext = unescape(stashedHTML2text(text, self.md)) - el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids) + el.attrib["id"] = unique(self.slugify(text, self.sep), used_ids) + + if 'data-toc-label' in el.attrib: + text = md_unescape(el.attrib['data-toc-label']) + text = run_postprocessors(text, self.md) + text = strip_tags(text) + text = escape_cdata(text) + # Remove the data-toc-label attribute as it is no longer needed + del el.attrib['data-toc-label'] if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom: toc_tokens.append({ 'level': int(el.tag[-1]), 'id': el.attrib["id"], - 'name': unescape(stashedHTML2text( - code_escape(el.attrib.get('data-toc-label', text)), - self.md, strip_entities=False - )) + 'name': text, + 'html': html }) - # Remove the data-toc-label attribute as it is no longer needed - if 'data-toc-label' in el.attrib: - del el.attrib['data-toc-label'] - if self.use_anchors: self.add_anchor(el, el.attrib["id"]) if self.use_permalinks not in [False, None]: diff --git a/tests/test_extensions.py b/tests/test_extensions.py index a9e789f19..4ebe4eca5 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -420,9 +420,9 @@ def testUniqueIds(self): '\n' ) self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'header', 'name': 'Header', 'children': []}, - {'level': 1, 'id': 'header_1', 'name': 'Header', 'children': []}, - {'level': 1, 'id': 'header_2', 'name': 'Header', 'children': []}, + {'level': 1, 'id': 'header', 'name': 'Header', 'html': 'Header', 'children': []}, + {'level': 1, 'id': 'header_1', 'name': 'Header', 'html': 'Header', 'children': []}, + {'level': 1, 'id': 'header_2', 'name': 'Header', 'html': 'Header', 'children': []}, ]) def testHtmlEntities(self): @@ -441,7 +441,7 @@ def testHtmlEntities(self): '\n' ) self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar', 'name': 'Foo & bar', 'children': []}, + {'level': 1, 'id': 'foo-bar', 'name': 'Foo & bar', 'html': 'Foo & bar', 'children': []}, ]) def testHtmlSpecialChars(self): @@ -460,7 +460,7 @@ def testHtmlSpecialChars(self): '\n' ) self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar', 'name': 'Foo > & bar', 'children': []}, + {'level': 1, 'id': 'foo-bar', 'name': 'Foo > & bar', 'html': 'Foo > & bar', 'children': []}, ]) def testRawHtml(self): @@ -479,7 +479,7 @@ def testRawHtml(self): '\n' ) self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'children': []}, + {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'html': 'Foo Bar Baz.', 'children': []}, ]) def testBaseLevel(self): @@ -508,9 +508,9 @@ def testBaseLevel(self): '\n' ) self.assertEqual(md.toc_tokens, [ - {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'children': [ - {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'children': []}, - {'level': 6, 'id': 'too-high', 'name': 'Too High', 'children': []}, + {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'html': 'Some Header', 'children': [ + {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'html': 'Next Level', 'children': []}, + {'level': 6, 'id': 'too-high', 'name': 'Too High', 'html': 'Too High', 'children': 
[]}, ]}, ]) @@ -532,9 +532,13 @@ def testHeaderInlineMarkup(self): '\n' # noqa '\n' ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'children': []}, - ]) + self.assertEqual(self.md.toc_tokens, [{ + 'level': 1, + 'id': 'some-header-with-markup', + 'name': 'Some Header with markup.', + 'html': 'Some Header with markup.', + 'children': [] + }]) def testTitle(self): """ Test TOC Title. """ @@ -549,6 +553,7 @@ def testTitle(self): def testWithAttrList(self): """ Test TOC with `attr_list` Extension. """ + self.maxDiff = None md = markdown.Markdown(extensions=['toc', 'attr_list']) text = ('# Header 1\n\n' '## Header 2 { #foo }\n\n' @@ -580,12 +585,12 @@ def testWithAttrList(self): '\n' ) self.assertEqual(md.toc_tokens, [ - {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'children': [ - {'level': 2, 'id': 'foo', 'name': 'Header 2', 'children': []}, - {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'children': []} + {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', 'children': [ + {'level': 2, 'id': 'foo', 'name': 'Header 2', 'html': 'Header 2', 'children': []}, + {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'html': 'Header 3', 'children': []} ]}, - {'level': 1, 'id': 'header-4', 'name': 'Foo > Baz', 'children': []}, - {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'children': []}, + {'level': 1, 'id': 'header-4', 'name': 'Foo > Baz', 'html': 'Header 4', 'children': []}, + {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'html': 'Header 5', 'children': []}, ]) def testUniqueFunc(self): diff --git a/tests/test_syntax/extensions/test_smarty.py b/tests/test_syntax/extensions/test_smarty.py index 8a1767459..d2d9b094b 100644 --- a/tests/test_syntax/extensions/test_smarty.py +++ b/tests/test_syntax/extensions/test_smarty.py @@ -216,6 +216,7 @@ def test_smarty_and_toc(self): 'level': 1, 'id': 'foo-bar', 'name': 'Foo — bar', + 'html': 'Foobar', 'children': [], }, ], diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 797643643..ff5a1774d 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -140,11 +140,13 @@ def testMinMaxLevel(self): 'level': 3, 'id': 'header-3', 'name': 'Header 3', + 'html': 'Header 3', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', + 'html': 'Header 4', 'children': [] } ] @@ -189,11 +191,13 @@ def testMaxLevel(self): 'level': 1, 'id': 'header-1', 'name': 'Header 1', + 'html': 'Header 1', 'children': [ { 'level': 2, 'id': 'header-2', 'name': 'Header 2', + 'html': 'Header 2', 'children': [] } ] @@ -245,11 +249,13 @@ def testMinMaxLevelwithAnchorLink(self): 'level': 3, 'id': 'header-3', 'name': 'Header 3', + 'html': 'Header 3', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', + 'html': 'Header 4', 'children': [] } ] @@ -301,11 +307,13 @@ def testMinMaxLevelwithPermalink(self): 'level': 3, 'id': 'header-3', 'name': 'Header 3', + 'html': 'Header 3', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', + 'html': 'Header 4', 'children': [] } ] @@ -353,11 +361,13 @@ def testMinMaxLevelwithBaseLevel(self): 'level': 4, 'id': 'second-level', 'name': 'Second Level', + 'html': 'Second Level', 'children': [ { 'level': 5, 'id': 'third-level', 'name': 'Third Level', + 'html': 'Third Level', 'children': [] } ] @@ -402,11 +412,13 @@ def testMaxLevelwithBaseLevel(self): 'level': 2, 'id': 'some-header', 'name': 'Some Header', + 'html': 'Some 
Header', 'children': [ { 'level': 3, 'id': 'next-level', 'name': 'Next Level', + 'html': 'Next Level', 'children': [] } ] @@ -455,6 +467,7 @@ def test_escaped_char_in_id(self): 'level': 1, 'id': 'escaped_character', 'name': 'escaped_character', + 'html': 'escaped_character', 'children': [] } ] @@ -671,3 +684,67 @@ def testTOCWithCustomTitleClass(self): ), extensions=[TocExtension(title_class="tocname", title='ToC')] ) + + def testHeadingRemoveFootnoteRef(self): + + self.assertMarkdownRenders( + self.dedent( + ''' + # Header 1[^1] + # Header[^1] 2 + # Header *subelement*[^1] 3 + + [^1]: footnote + ''' + ), + self.dedent( + ''' +

Header 11

+

Header1 2

+

Header subelement1 3

+
+
+
    +
  1. +

    footnote 

    +
  2. +
+
+ ''' + ), + expected_attrs={ + 'toc': ( + '
\n' + '\n' # noqa + '
\n' # noqa + ), + 'toc_tokens': [ + { + 'level': 1, + 'id': 'header-1', + 'name': 'Header 1', + 'html': 'Header 1', + 'children': [] + }, + { + 'level': 1, + 'id': 'header-2', + 'name': 'Header 2', + 'html': 'Header 2', + 'children': [] + }, + { + 'level': 1, + 'id': 'header-subelement-3', + 'name': 'Header subelement 3', + 'html': 'Header subelement 3', + 'children': [] + } + ] + }, + extensions=[TocExtension(), 'footnotes'] + ) From 4ef4ad6f57e15bbe8c87830369d5ce02865246e2 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Fri, 9 Feb 2024 14:55:27 -0500 Subject: [PATCH 02/17] lint cleanup --- markdown/extensions/toc.py | 15 +++++------ tests/test_syntax/extensions/test_toc.py | 33 ++++++++++++++---------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index e7cde03e7..3dc5b962e 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -28,7 +28,6 @@ import html import unicodedata from copy import deepcopy -from html import unescape as html_unescape import xml.etree.ElementTree as etree from typing import TYPE_CHECKING, Any, Iterator, MutableSet @@ -39,7 +38,7 @@ def slugify(value: str, separator: str, unicode: bool = False) -> str: """ Slugify a string, to make it URL friendly. """ # First convert HTML entities to Unicode characters - value = html_unescape(value) + value = html.unescape(value) if not unicode: # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty` value = unicodedata.normalize('NFKD', value) @@ -108,8 +107,8 @@ def run_postprocessors(text: str, md: Markdown) -> str: def render_inner_html(el: etree.Element, md: Markdown) -> str: - """ Fully render inner html of an etree element as a string. """ - # The UnescapeTreeprocessor runs after TOC so run here. + """ Fully render inner html of an `etree` element as a string. """ + # The `UnescapeTreeprocessor` runs after `toc` extension so run here. text = md_unescape(md.serializer(el)) # strip parent tag @@ -121,7 +120,7 @@ def render_inner_html(el: etree.Element, md: Markdown) -> str: def copy_element(el: etree.Element, exclude_fnrefs=True) -> etree.Element: - """ Return a deep copy of an etree element, optionally with footnote references removed. """ + """ Return a deep copy of an `etree` element, optionally with footnote references removed. """ el = deepcopy(el) # Remove footnote references, which look like this: `...`. if exclude_fnrefs: @@ -345,8 +344,8 @@ def run(self, doc: etree.Element) -> None: for el in doc.iter(): if isinstance(el.tag, str) and self.header_rgx.match(el.tag): self.set_level(el) - html = render_inner_html(copy_element(el), self.md) - text = strip_tags(html) + innerhtml = render_inner_html(copy_element(el), self.md) + text = strip_tags(innerhtml) # Do not override pre-existing ids if "id" not in el.attrib: @@ -365,7 +364,7 @@ def run(self, doc: etree.Element) -> None: 'level': int(el.tag[-1]), 'id': el.attrib["id"], 'name': text, - 'html': html + 'html': innerhtml }) if self.use_anchors: diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index ff5a1774d..9c5535e7c 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -697,20 +697,25 @@ def testHeadingRemoveFootnoteRef(self): [^1]: footnote ''' ), - self.dedent( - ''' -

Header 11

-

Header1 2

-

Header subelement1 3

-
-
-
    -
  1. -

    footnote 

    -
  2. -
-
- ''' + ( + '

Header 11

\n' + '

Header1 2

\n' + '

' + 'Header subelement1 3' + '

\n' + '
\n' + '
\n' + '
    \n' + '
  1. \n' + '

    ' + 'footnote ' + '' + '' + '' + '

    \n' + '
  2. \n' + '
\n' + '
' ), expected_attrs={ 'toc': ( From dbbd038582dc138e55bb0b6c7f16864a3d2e18e2 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Fri, 9 Feb 2024 15:26:45 -0500 Subject: [PATCH 03/17] Restore and deprecate deleted functions --- markdown/extensions/toc.py | 39 ++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index 3dc5b962e..51b20be4d 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -21,7 +21,7 @@ from . import Extension from ..treeprocessors import Treeprocessor -from ..util import parseBoolValue, AMP_SUBSTITUTE +from ..util import parseBoolValue, AMP_SUBSTITUTE, deprecated, HTML_PLACEHOLDER_RE, AtomicString from ..treeprocessors import UnescapeTreeprocessor from ..serializers import RE_AMP import re @@ -67,7 +67,38 @@ def unique(id: str, ids: MutableSet[str]) -> str: return id -def md_unescape(text: str) -> str: +@deprecated('Use `render_inner_html` and `striptags` instead.') +def get_name(el: etree.Element) -> str: + """Get title name.""" + + text = [] + for c in el.itertext(): + if isinstance(c, AtomicString): + text.append(html.unescape(c)) + else: + text.append(c) + return ''.join(text).strip() + + +@deprecated('Use `run_postprocessors`, `render_inner_html` and/or `striptags` instead.') +def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str: + """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """ + def _html_sub(m: re.Match[str]) -> str: + """ Substitute raw html with plain text. """ + try: + raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))] + except (IndexError, TypeError): # pragma: no cover + return m.group(0) + # Strip out tags and/or entities - leaving text + res = re.sub(r'(<[^>]+>)', '', raw) + if strip_entities: + res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res) + return res + + return HTML_PLACEHOLDER_RE.sub(_html_sub, text) + + +def unescape(text: str) -> str: """ Unescape Markdown backslash escaped text. """ c = UnescapeTreeprocessor() return c.unescape(text) @@ -109,7 +140,7 @@ def run_postprocessors(text: str, md: Markdown) -> str: def render_inner_html(el: etree.Element, md: Markdown) -> str: """ Fully render inner html of an `etree` element as a string. """ # The `UnescapeTreeprocessor` runs after `toc` extension so run here. - text = md_unescape(md.serializer(el)) + text = unescape(md.serializer(el)) # strip parent tag start = text.index('>') + 1 @@ -352,7 +383,7 @@ def run(self, doc: etree.Element) -> None: el.attrib["id"] = unique(self.slugify(text, self.sep), used_ids) if 'data-toc-label' in el.attrib: - text = md_unescape(el.attrib['data-toc-label']) + text = unescape(el.attrib['data-toc-label']) text = run_postprocessors(text, self.md) text = strip_tags(text) text = escape_cdata(text) From a5617893df8b55558238c72451a5fad7d79c87f7 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 12 Feb 2024 09:03:11 -0500 Subject: [PATCH 04/17] unescape before calling slugify for compat w/ 3rd parties --- markdown/extensions/toc.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index 51b20be4d..6c0cf1213 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -37,8 +37,6 @@ def slugify(value: str, separator: str, unicode: bool = False) -> str: """ Slugify a string, to make it URL friendly. 
""" - # First convert HTML entities to Unicode characters - value = html.unescape(value) if not unicode: # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty` value = unicodedata.normalize('NFKD', value) @@ -380,7 +378,7 @@ def run(self, doc: etree.Element) -> None: # Do not override pre-existing ids if "id" not in el.attrib: - el.attrib["id"] = unique(self.slugify(text, self.sep), used_ids) + el.attrib["id"] = unique(self.slugify(html.unescape(text), self.sep), used_ids) if 'data-toc-label' in el.attrib: text = unescape(el.attrib['data-toc-label']) From 6cb1fd767591a92953c06ba643e0ce92302bf96d Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 12 Feb 2024 09:24:07 -0500 Subject: [PATCH 05/17] Add test for email auto links Thanks to @vedranmiletic for test case. --- tests/test_syntax/extensions/test_toc.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 9c5535e7c..8afc52e3e 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -475,6 +475,30 @@ def test_escaped_char_in_id(self): extensions=['toc'] ) + + def testAutoLinkEmail(self): + self.assertMarkdownRenders( + '## ', + '

foo@exa' + 'mple.org

', + expected_attrs={ + 'toc_tokens': [ + { + 'level': 2, + 'id': 'fooexampleorg', + 'name': 'foo@exam' + 'ple.org', + 'html': 'foo' + '@example.org', + 'children': [] + } + ] + }, + extensions=['toc'] + ) + def testAnchorLinkWithCustomClass(self): self.assertMarkdownRenders( self.dedent( From 5377d2bae619f3fa96ffda9c19ef8630ae9d5f87 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 12 Feb 2024 09:28:03 -0500 Subject: [PATCH 06/17] lint cleanup --- tests/test_syntax/extensions/test_toc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 8afc52e3e..a6dba5582 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -475,7 +475,6 @@ def test_escaped_char_in_id(self): extensions=['toc'] ) - def testAutoLinkEmail(self): self.assertMarkdownRenders( '## ', From caf4f2efbb1f7f8b26fbd5d0b9bcba96d3e0a593 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 12 Feb 2024 11:02:11 -0500 Subject: [PATCH 07/17] refactor testTocWithAttrList for full coverage --- tests/test_extensions.py | 42 ------------ tests/test_syntax/extensions/test_toc.py | 82 ++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 42 deletions(-) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 4ebe4eca5..5af05e69e 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -551,48 +551,6 @@ def testTitle(self): md.toc ) - def testWithAttrList(self): - """ Test TOC with `attr_list` Extension. """ - self.maxDiff = None - md = markdown.Markdown(extensions=['toc', 'attr_list']) - text = ('# Header 1\n\n' - '## Header 2 { #foo }\n\n' - '## Header 3 { data-toc-label="Foo Bar" }\n\n' - '# Header 4 { data-toc-label="Foo > Baz" }\n\n' - '# Header 5 { data-toc-label="Foo Quux" }') - - self.assertEqual( - md.convert(text), - '

Header 1

\n' - '

Header 2

\n' - '

Header 3

\n' - '

Header 4

\n' - '

Header 5

' - ) - self.assertEqual( - md.toc, - '
\n' - '\n' # noqa - '
\n' - ) - self.assertEqual(md.toc_tokens, [ - {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', 'children': [ - {'level': 2, 'id': 'foo', 'name': 'Header 2', 'html': 'Header 2', 'children': []}, - {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'html': 'Header 3', 'children': []} - ]}, - {'level': 1, 'id': 'header-4', 'name': 'Foo > Baz', 'html': 'Header 4', 'children': []}, - {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'html': 'Header 5', 'children': []}, - ]) - def testUniqueFunc(self): """ Test 'unique' function. """ from markdown.extensions.toc import unique diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index a6dba5582..f0a3b79ba 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -708,6 +708,88 @@ def testTOCWithCustomTitleClass(self): extensions=[TocExtension(title_class="tocname", title='ToC')] ) + def testTocWithAttrList(self): + + self.assertMarkdownRenders( + self.dedent( + ''' + # Header 1 + + ## Header 2 { #foo } + + ## Header 3 { data-toc-label="Foo Bar" } + + # Header 4 { data-toc-label="Foo > & < Baz" } + + # Header 5 { data-toc-label="Foo Quux" } + ''' + ), + self.dedent( + ''' +

Header 1

+

Header 2

+

Header 3

+

Header 4

+

Header 5

+ ''' + ), + expected_attrs={ + 'toc': ( + '
\n' + '\n' # noqa + '
\n' + ), + 'toc_tokens': [ + { + 'level': 1, + 'id': 'header-1', + 'name': 'Header 1', + 'html': 'Header 1', + 'children': [ + { + 'level': 2, + 'id': 'foo', + 'name': 'Header 2', + 'html': 'Header 2', + 'children': [] + }, + { + 'level': 2, + 'id': 'header-3', + 'name': 'Foo Bar', + 'html': 'Header 3', + 'children': [] + } + ] + }, + { + 'level': 1, + 'id': 'header-4', + 'name': 'Foo > & < Baz', + 'html': 'Header 4', + 'children': [] + }, + { + 'level': 1, + 'id': 'header-5', + 'name': 'Foo Quux', + 'html': 'Header 5', + 'children': [] + }, + ] + }, + extensions=[TocExtension(), 'attr_list'] + ) + def testHeadingRemoveFootnoteRef(self): self.assertMarkdownRenders( From d7bff9b3d26cfdf64d950895fdfa505435cf7eda Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 12 Feb 2024 14:19:13 -0500 Subject: [PATCH 08/17] Add comprehensive strip_tag tests --- tests/test_syntax/extensions/test_toc.py | 71 +++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index f0a3b79ba..07f06b1d3 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -20,7 +20,7 @@ """ from markdown.test_tools import TestCase -from markdown.extensions.toc import TocExtension +from markdown.extensions.toc import TocExtension, strip_tags from markdown.extensions.nl2br import Nl2BrExtension @@ -858,3 +858,72 @@ def testHeadingRemoveFootnoteRef(self): }, extensions=[TocExtension(), 'footnotes'] ) + + +class testStripTags(TestCase): + + def testStripElement(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripOpenElement(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripEmptyElement(self): + self.assertEqual( + strip_tags('foo
bar'), + 'foo bar' + ) + + def testDontStripOpenBracket(self): + self.assertEqual( + strip_tags('foo < bar'), + 'foo < bar' + ) + + def testDontStripCloseBracket(self): + self.assertEqual( + strip_tags('foo > bar'), + 'foo > bar' + ) + + def testStripCollapseWhitespace(self): + self.assertEqual( + strip_tags('foo \tbar\t'), + 'foo bar' + ) + + def testStripElementWithNewlines(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripComment(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripCommentWithInnerTags(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripCommentInElement(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testDontStripHTMLEntities(self): + self.assertEqual( + strip_tags('foo < & < bar'), + 'foo < & < bar' + ) From 1355826414a8e18a237e232ef2201d256c031497 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 22 Feb 2024 09:37:41 -0500 Subject: [PATCH 09/17] Add a test case for double footnotes --- tests/test_syntax/extensions/test_toc.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 07f06b1d3..20deed327 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -798,6 +798,7 @@ def testHeadingRemoveFootnoteRef(self): # Header 1[^1] # Header[^1] 2 # Header *subelement*[^1] 3 + # Header[^1] double[^1] 4 [^1]: footnote ''' @@ -808,6 +809,7 @@ def testHeadingRemoveFootnoteRef(self): '

' 'Header subelement1 3' '

\n' + '

Header1 double1 4

\n' '
\n' '
\n' '
    \n' @@ -817,6 +819,8 @@ def testHeadingRemoveFootnoteRef(self): '' '' '' + '' + '' '

    \n' '\n' '
\n' @@ -829,6 +833,7 @@ def testHeadingRemoveFootnoteRef(self): '
  • Header 1
  • \n' # noqa '
  • Header 2
  • \n' # noqa '
  • Header subelement 3
  • \n' # noqa + '
  • Header double 4
  • \n' # noqa '\n' # noqa '
    \n' # noqa ), @@ -853,6 +858,13 @@ def testHeadingRemoveFootnoteRef(self): 'name': 'Header subelement 3', 'html': 'Header subelement 3', 'children': [] + }, + { + 'level': 1, + 'id': 'header-double-4', + 'name': 'Header double 4', + 'html': 'Header double 4', + 'children': [] } ] }, From 7e4117b43c9dca51788bd4de61e9c97b43db55c6 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 22 Feb 2024 09:51:01 -0500 Subject: [PATCH 10/17] lint cleanup --- tests/test_syntax/extensions/test_toc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 20deed327..017fe0d36 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -809,7 +809,10 @@ def testHeadingRemoveFootnoteRef(self): '

    ' 'Header subelement1 3' '

    \n' - '

    Header1 double1 4

    \n' + '

    ' + 'Header1 double' + '1 4' + '

    \n' '
    \n' '
    \n' '
      \n' From b4dbb0a603ee77dea8e99f780f16b9a80b2d0382 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 22 Feb 2024 13:20:15 -0500 Subject: [PATCH 11/17] Add data-toc-label to toc_token --- markdown/extensions/toc.py | 17 ++-- tests/test_extensions.py | 92 +++++++++++++++++---- tests/test_syntax/extensions/test_smarty.py | 1 + tests/test_syntax/extensions/test_toc.py | 23 ++++++ 4 files changed, 107 insertions(+), 26 deletions(-) diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index 6c0cf1213..f58fd8870 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -374,17 +374,17 @@ def run(self, doc: etree.Element) -> None: if isinstance(el.tag, str) and self.header_rgx.match(el.tag): self.set_level(el) innerhtml = render_inner_html(copy_element(el), self.md) - text = strip_tags(innerhtml) + name = strip_tags(innerhtml) # Do not override pre-existing ids if "id" not in el.attrib: - el.attrib["id"] = unique(self.slugify(html.unescape(text), self.sep), used_ids) + el.attrib["id"] = unique(self.slugify(html.unescape(name), self.sep), used_ids) + data_toc_label = '' if 'data-toc-label' in el.attrib: - text = unescape(el.attrib['data-toc-label']) - text = run_postprocessors(text, self.md) - text = strip_tags(text) - text = escape_cdata(text) + data_toc_label = run_postprocessors(unescape(el.attrib['data-toc-label']), self.md) + # Overwrite name with sanitized value of `data-toc-label`. + name = escape_cdata(strip_tags(data_toc_label)) # Remove the data-toc-label attribute as it is no longer needed del el.attrib['data-toc-label'] @@ -392,8 +392,9 @@ def run(self, doc: etree.Element) -> None: toc_tokens.append({ 'level': int(el.tag[-1]), 'id': el.attrib["id"], - 'name': text, - 'html': innerhtml + 'name': name, + 'html': innerhtml, + 'data-toc-label': data_toc_label }) if self.use_anchors: diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 5af05e69e..73732a0db 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -420,9 +420,30 @@ def testUniqueIds(self): '
    \n' ) self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'header', 'name': 'Header', 'html': 'Header', 'children': []}, - {'level': 1, 'id': 'header_1', 'name': 'Header', 'html': 'Header', 'children': []}, - {'level': 1, 'id': 'header_2', 'name': 'Header', 'html': 'Header', 'children': []}, + { + 'level': 1, + 'id': 'header', + 'name': 'Header', + 'html': 'Header', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 1, + 'id': 'header_1', + 'name': 'Header', + 'html': 'Header', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 1, + 'id': 'header_2', + 'name': 'Header', + 'html': 'Header', + 'data-toc-label': '', + 'children': [] + }, ]) def testHtmlEntities(self): @@ -440,9 +461,14 @@ def testHtmlEntities(self): '\n' # noqa '\n' ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar', 'name': 'Foo & bar', 'html': 'Foo & bar', 'children': []}, - ]) + self.assertEqual(self.md.toc_tokens, [{ + 'level': 1, + 'id': 'foo-bar', + 'name': 'Foo & bar', + 'html': 'Foo & bar', + 'data-toc-label': '', + 'children': [] + }]) def testHtmlSpecialChars(self): """ Test Headers with HTML special characters. """ @@ -459,9 +485,14 @@ def testHtmlSpecialChars(self): '\n' # noqa '\n' ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar', 'name': 'Foo > & bar', 'html': 'Foo > & bar', 'children': []}, - ]) + self.assertEqual(self.md.toc_tokens, [{ + 'level': 1, + 'id': 'foo-bar', + 'name': 'Foo > & bar', + 'html': 'Foo > & bar', + 'data-toc-label': '', + 'children': [] + }]) def testRawHtml(self): """ Test Headers with raw HTML. """ @@ -478,9 +509,14 @@ def testRawHtml(self): '\n' # noqa '\n' ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'html': 'Foo Bar Baz.', 'children': []}, - ]) + self.assertEqual(self.md.toc_tokens, [{ + 'level': 1, + 'id': 'foo-bar-baz', + 'name': 'Foo Bar Baz.', + 'html': 'Foo Bar Baz.', + 'data-toc-label': '', + 'children': [] + }]) def testBaseLevel(self): """ Test Header Base Level. """ @@ -507,12 +543,31 @@ def testBaseLevel(self): '\n' # noqa '\n' ) - self.assertEqual(md.toc_tokens, [ - {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'html': 'Some Header', 'children': [ - {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'html': 'Next Level', 'children': []}, - {'level': 6, 'id': 'too-high', 'name': 'Too High', 'html': 'Too High', 'children': []}, - ]}, - ]) + self.assertEqual(md.toc_tokens, [{ + 'level': 5, + 'id': 'some-header', + 'name': 'Some Header', + 'html': 'Some Header', + 'data-toc-label': '', + 'children': [ + { + 'level': 6, + 'id': 'next-level', + 'name': 'Next Level', + 'html': 'Next Level', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 6, + 'id': 'too-high', + 'name': 'Too High', + 'html': 'Too High', + 'data-toc-label': '', + 'children': [] + } + ] + }]) def testHeaderInlineMarkup(self): """ Test Headers with inline markup. 
""" @@ -537,6 +592,7 @@ def testHeaderInlineMarkup(self): 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'html': 'Some Header with markup.', + 'data-toc-label': '', 'children': [] }]) diff --git a/tests/test_syntax/extensions/test_smarty.py b/tests/test_syntax/extensions/test_smarty.py index d2d9b094b..0228ddf02 100644 --- a/tests/test_syntax/extensions/test_smarty.py +++ b/tests/test_syntax/extensions/test_smarty.py @@ -217,6 +217,7 @@ def test_smarty_and_toc(self): 'id': 'foo-bar', 'name': 'Foo — bar', 'html': 'Foobar', + 'data-toc-label': '', 'children': [], }, ], diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 017fe0d36..e67d36657 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -141,12 +141,14 @@ def testMinMaxLevel(self): 'id': 'header-3', 'name': 'Header 3', 'html': 'Header 3', + 'data-toc-label': '', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', 'html': 'Header 4', + 'data-toc-label': '', 'children': [] } ] @@ -192,12 +194,14 @@ def testMaxLevel(self): 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', + 'data-toc-label': '', 'children': [ { 'level': 2, 'id': 'header-2', 'name': 'Header 2', 'html': 'Header 2', + 'data-toc-label': '', 'children': [] } ] @@ -250,12 +254,14 @@ def testMinMaxLevelwithAnchorLink(self): 'id': 'header-3', 'name': 'Header 3', 'html': 'Header 3', + 'data-toc-label': '', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', 'html': 'Header 4', + 'data-toc-label': '', 'children': [] } ] @@ -308,12 +314,14 @@ def testMinMaxLevelwithPermalink(self): 'id': 'header-3', 'name': 'Header 3', 'html': 'Header 3', + 'data-toc-label': '', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', 'html': 'Header 4', + 'data-toc-label': '', 'children': [] } ] @@ -362,12 +370,14 @@ def testMinMaxLevelwithBaseLevel(self): 'id': 'second-level', 'name': 'Second Level', 'html': 'Second Level', + 'data-toc-label': '', 'children': [ { 'level': 5, 'id': 'third-level', 'name': 'Third Level', 'html': 'Third Level', + 'data-toc-label': '', 'children': [] } ] @@ -413,12 +423,14 @@ def testMaxLevelwithBaseLevel(self): 'id': 'some-header', 'name': 'Some Header', 'html': 'Some Header', + 'data-toc-label': '', 'children': [ { 'level': 3, 'id': 'next-level', 'name': 'Next Level', 'html': 'Next Level', + 'data-toc-label': '', 'children': [] } ] @@ -468,6 +480,7 @@ def test_escaped_char_in_id(self): 'id': 'escaped_character', 'name': 'escaped_character', 'html': 'escaped_character', + 'data-toc-label': '', 'children': [] } ] @@ -491,6 +504,7 @@ def testAutoLinkEmail(self): 'html': 'foo' '@example.org', + 'data-toc-label': '', 'children': [] } ] @@ -754,12 +768,14 @@ def testTocWithAttrList(self): 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', + 'data-toc-label': '', 'children': [ { 'level': 2, 'id': 'foo', 'name': 'Header 2', 'html': 'Header 2', + 'data-toc-label': '', 'children': [] }, { @@ -767,6 +783,7 @@ def testTocWithAttrList(self): 'id': 'header-3', 'name': 'Foo Bar', 'html': 'Header 3', + 'data-toc-label': 'Foo Bar', 'children': [] } ] @@ -776,6 +793,7 @@ def testTocWithAttrList(self): 'id': 'header-4', 'name': 'Foo > & < Baz', 'html': 'Header 4', + 'data-toc-label': 'Foo > & < Baz', 'children': [] }, { @@ -783,6 +801,7 @@ def testTocWithAttrList(self): 'id': 'header-5', 'name': 'Foo Quux', 'html': 'Header 5', + 'data-toc-label': 'Foo Quux', 'children': [] }, ] @@ -846,6 +865,7 @@ def 
testHeadingRemoveFootnoteRef(self): 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', + 'data-toc-label': '', 'children': [] }, { @@ -853,6 +873,7 @@ def testHeadingRemoveFootnoteRef(self): 'id': 'header-2', 'name': 'Header 2', 'html': 'Header 2', + 'data-toc-label': '', 'children': [] }, { @@ -860,6 +881,7 @@ def testHeadingRemoveFootnoteRef(self): 'id': 'header-subelement-3', 'name': 'Header subelement 3', 'html': 'Header subelement 3', + 'data-toc-label': '', 'children': [] }, { @@ -867,6 +889,7 @@ def testHeadingRemoveFootnoteRef(self): 'id': 'header-double-4', 'name': 'Header double 4', 'html': 'Header double 4', + 'data-toc-label': '', 'children': [] } ] From a7f1554806d70cbd7a3cf0012c7e7fc31611ed7c Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 6 Mar 2024 16:18:55 -0500 Subject: [PATCH 12/17] Apply suggestions from code review Co-authored-by: Oleh Prypin --- markdown/extensions/toc.py | 42 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index f58fd8870..bdea6b48a 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -148,27 +148,27 @@ def render_inner_html(el: etree.Element, md: Markdown) -> str: return run_postprocessors(text, md) -def copy_element(el: etree.Element, exclude_fnrefs=True) -> etree.Element: - """ Return a deep copy of an `etree` element, optionally with footnote references removed. """ - el = deepcopy(el) +def remove_fnrefs(root: etree.Element) -> etree.Element: + """ Remove footnote references from a copy of the element, if any are present. """ # Remove footnote references, which look like this: `...`. - if exclude_fnrefs: - for sup in el.findall('sup'): - id = sup.get('id', '') - if id.startswith('fnref'): - # We have a footnote reference. Remove it. - parent = el.find(f'.//sup[@id="{id}"]..') - if sup.tail: - # Preserve the tail text - siblings = list(parent) - pos = siblings.index(sup) - if pos == 0: - parent.text = f'{parent.text or ""}{sup.tail}' - else: - sibling = siblings[pos - 1] - sibling.tail = f'{sibling.tail or ""}{sup.tail}' - parent.remove(sup) - return el + # If there are no `sup` elements, then nothing to do. + if next(root.iter('sup'), None) is None: + return root + root = deepcopy(root) + # Find parent elements that contain `sup` elements. + for parent in root.findall('.//sup/..'): + carry_text = "" + for child in reversed(parent): # Reversed for the ability to mutate during iteration. + # Remove matching footnote references but carry any `tail` text to preceding elements. 
+ if child.tag == 'sup' and child.get('id', '').startswith('fnref'): + carry_text = f'{child.tail or ""}{carry_text}' + parent.remove(child) + elif carry_text: + child.tail = f'{child.tail or ""}{carry_text}' + carry_text = "" + if carry_text: + parent.text = f'{parent.text or ""}{carry_text}' + return root def nest_toc_tokens(toc_list): @@ -373,7 +373,7 @@ def run(self, doc: etree.Element) -> None: for el in doc.iter(): if isinstance(el.tag, str) and self.header_rgx.match(el.tag): self.set_level(el) - innerhtml = render_inner_html(copy_element(el), self.md) + innerhtml = render_inner_html(remove_fnrefs(el), self.md) name = strip_tags(innerhtml) # Do not override pre-existing ids From b1021cb69612804c3a3840288b0c623adcce4d9b Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 7 Mar 2024 10:22:13 -0500 Subject: [PATCH 13/17] Update all TOC tests to new framework --- tests/test_extensions.py | 431 -------------------- tests/test_syntax/extensions/test_toc.py | 491 ++++++++++++++++++++++- 2 files changed, 489 insertions(+), 433 deletions(-) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 4dcfc763a..b8bc3c81c 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -29,17 +29,6 @@ import markdown -class TestCaseWithAssertStartsWith(unittest.TestCase): - - def assertStartsWith(self, expectedPrefix, text, msg=None): - if not text.startswith(expectedPrefix): - if len(expectedPrefix) + 5 < len(text): - text = text[:len(expectedPrefix) + 5] + '...' - standardMsg = '{} not found at the start of {}'.format(repr(expectedPrefix), - repr(text)) - self.fail(self._formatMessage(msg, standardMsg)) - - class TestExtensionClass(unittest.TestCase): """ Test markdown.extensions.Extension. """ @@ -281,426 +270,6 @@ def testRE(self): self.assertEqual(RE.match(test).groups(), expected) -class TestTOC(TestCaseWithAssertStartsWith): - """ Test TOC Extension. """ - - def setUp(self): - self.md = markdown.Markdown(extensions=['toc']) - - def testMarker(self): - """ Test TOC with a Marker. """ - text = '[TOC]\n\n# Header 1\n\n## Header 2' - self.assertEqual( - self.md.convert(text), - '
    \n' - '
      \n' # noqa - '
    • Header 1' # noqa - '\n' # noqa - '
    • \n' # noqa - '
    \n' # noqa - '
    \n' - '

    Header 1

    \n' - '

    Header 2

    ' - ) - - def testNoMarker(self): - """ Test TOC without a Marker. """ - text = '# Header 1\n\n## Header 2' - self.assertEqual( - self.md.convert(text), - '

    Header 1

    \n' - '

    Header 2

    ' - ) - self.assertEqual( - self.md.toc, - '
    \n' - '
      \n' # noqa - '
    • Header 1' # noqa - '\n' # noqa - '
    • \n' # noqa - '
    \n' # noqa - '
    \n' - ) - - def testAlternateMarker(self): - """ Test TOC with user defined marker. """ - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension(marker='{{marker}}')] - ) - text = '{{marker}}\n\n# Header 1\n\n## Header 2' - self.assertEqual( - md.convert(text), - '
    \n' - '
      \n' # noqa - '
    • Header 1' # noqa - '\n' # noqa - '
    • \n' # noqa - '
    \n' # noqa - '
    \n' - '

    Header 1

    \n' - '

    Header 2

    ' - ) - - def testDisabledMarker(self): - """ Test TOC with disabled marker. """ - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension(marker='')] - ) - text = '[TOC]\n\n# Header 1\n\n## Header 2' - self.assertEqual( - md.convert(text), - '

    [TOC]

    \n' - '

    Header 1

    \n' - '

    Header 2

    ' - ) - self.assertStartsWith('
    ', md.toc) - - def testReset(self): - """ Test TOC Reset. """ - self.assertEqual(self.md.toc, '') - self.md.convert('# Header 1\n\n## Header 2') - self.assertStartsWith('
    ', self.md.toc) - self.md.reset() - self.assertEqual(self.md.toc, '') - self.assertEqual(self.md.toc_tokens, []) - - def testUniqueIds(self): - """ Test Unique IDs. """ - - text = '#Header\n#Header\n#Header' - self.assertEqual( - self.md.convert(text), - '

    Header

    \n' - '

    Header

    \n' - '

    Header

    ' - ) - self.assertEqual( - self.md.toc, - '
    \n' - '\n' # noqa - '
    \n' - ) - self.assertEqual(self.md.toc_tokens, [ - { - 'level': 1, - 'id': 'header', - 'name': 'Header', - 'html': 'Header', - 'data-toc-label': '', - 'children': [] - }, - { - 'level': 1, - 'id': 'header_1', - 'name': 'Header', - 'html': 'Header', - 'data-toc-label': '', - 'children': [] - }, - { - 'level': 1, - 'id': 'header_2', - 'name': 'Header', - 'html': 'Header', - 'data-toc-label': '', - 'children': [] - }, - ]) - - def testHtmlEntities(self): - """ Test Headers with HTML Entities. """ - text = '# Foo & bar' - self.assertEqual( - self.md.convert(text), - '

    Foo & bar

    ' - ) - self.assertEqual( - self.md.toc, - '
    \n' - '\n' # noqa - '
    \n' - ) - self.assertEqual(self.md.toc_tokens, [{ - 'level': 1, - 'id': 'foo-bar', - 'name': 'Foo & bar', - 'html': 'Foo & bar', - 'data-toc-label': '', - 'children': [] - }]) - - def testHtmlSpecialChars(self): - """ Test Headers with HTML special characters. """ - text = '# Foo > & bar' - self.assertEqual( - self.md.convert(text), - '

    Foo > & bar

    ' - ) - self.assertEqual( - self.md.toc, - '
    \n' - '\n' # noqa - '
    \n' - ) - self.assertEqual(self.md.toc_tokens, [{ - 'level': 1, - 'id': 'foo-bar', - 'name': 'Foo > & bar', - 'html': 'Foo > & bar', - 'data-toc-label': '', - 'children': [] - }]) - - def testRawHtml(self): - """ Test Headers with raw HTML. """ - text = '# Foo Bar Baz.' - self.assertEqual( - self.md.convert(text), - '

    Foo Bar Baz.

    ' - ) - self.assertEqual( - self.md.toc, - '
    \n' - '\n' # noqa - '
    \n' - ) - self.assertEqual(self.md.toc_tokens, [{ - 'level': 1, - 'id': 'foo-bar-baz', - 'name': 'Foo Bar Baz.', - 'html': 'Foo Bar Baz.', - 'data-toc-label': '', - 'children': [] - }]) - - def testBaseLevel(self): - """ Test Header Base Level. """ - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension(baselevel=5)] - ) - text = '# Some Header\n\n## Next Level\n\n### Too High' - self.assertEqual( - md.convert(text), - '
    Some Header
    \n' - '
    Next Level
    \n' - '
    Too High
    ' - ) - self.assertEqual( - md.toc, - '
    \n' - '\n' # noqa - '
    \n' - ) - self.assertEqual(md.toc_tokens, [{ - 'level': 5, - 'id': 'some-header', - 'name': 'Some Header', - 'html': 'Some Header', - 'data-toc-label': '', - 'children': [ - { - 'level': 6, - 'id': 'next-level', - 'name': 'Next Level', - 'html': 'Next Level', - 'data-toc-label': '', - 'children': [] - }, - { - 'level': 6, - 'id': 'too-high', - 'name': 'Too High', - 'html': 'Too High', - 'data-toc-label': '', - 'children': [] - } - ] - }]) - - def testHeaderInlineMarkup(self): - """ Test Headers with inline markup. """ - - text = '#Some *Header* with [markup](http://example.com).' - self.assertEqual( - self.md.convert(text), - '

    Some Header with ' - 'markup.

    ' - ) - self.assertEqual( - self.md.toc, - '
    \n' - '\n' # noqa - '
    \n' - ) - self.assertEqual(self.md.toc_tokens, [{ - 'level': 1, - 'id': 'some-header-with-markup', - 'name': 'Some Header with markup.', - 'html': 'Some Header with markup.', - 'data-toc-label': '', - 'children': [] - }]) - - def testTitle(self): - """ Test TOC Title. """ - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension(title='Table of Contents')] - ) - md.convert('# Header 1\n\n## Header 2') - self.assertStartsWith( - '
    Table of Contents
      ', - md.toc - ) - - def testUniqueFunc(self): - """ Test 'unique' function. """ - from markdown.extensions.toc import unique - ids = {'foo'} - self.assertEqual(unique('foo', ids), 'foo_1') - self.assertEqual(ids, {'foo', 'foo_1'}) - - def testTocInHeaders(self): - - text = '[TOC]\n#[TOC]' - self.assertEqual( - self.md.convert(text), - '
      \n' # noqa - '
        \n' # noqa - '
      • [TOC]
      • \n' # noqa - '
      \n' # noqa - '
      \n' # noqa - '

      [TOC]

      ' # noqa - ) - - text = '#[TOC]\n[TOC]' - self.assertEqual( - self.md.convert(text), - '

      [TOC]

      \n' # noqa - '
      \n' # noqa - '
        \n' # noqa - '
      • [TOC]
      • \n' # noqa - '
      \n' # noqa - '
      ' # noqa - ) - - text = '[TOC]\n# *[TOC]*' - self.assertEqual( - self.md.convert(text), - '
      \n' # noqa - '
        \n' # noqa - '
      • [TOC]
      • \n' # noqa - '
      \n' # noqa - '
      \n' # noqa - '

      [TOC]

      ' # noqa - ) - - def testPermalink(self): - """ Test TOC `permalink` feature. """ - text = '# Hd 1\n\n## Hd 2' - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension( - permalink=True, permalink_title="PL")] - ) - self.assertEqual( - md.convert(text), - '

      ' - 'Hd 1' # noqa - '' # noqa - '¶' # noqa - '' # noqa - '

      \n' - '

      ' - 'Hd 2' # noqa - '' # noqa - '¶' # noqa - '' # noqa - '

      ' - ) - - def testPermalinkLeading(self): - """ Test TOC `permalink` with `permalink_leading` option. """ - text = '# Hd 1\n\n## Hd 2' - md = markdown.Markdown(extensions=[ - markdown.extensions.toc.TocExtension( - permalink=True, permalink_title="PL", permalink_leading=True)] - ) - self.assertEqual( - md.convert(text), - '

      ' - '' # noqa - '¶' # noqa - '' # noqa - 'Hd 1' # noqa - '

      \n' - '

      ' - '' # noqa - '¶' # noqa - '' # noqa - 'Hd 2' # noqa - '

      ' - ) - - def testInlineMarkupPermalink(self): - """ Test TOC `permalink` with headers containing markup. """ - text = '# Code `in` hd' - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension( - permalink=True, permalink_title="PL")] - ) - self.assertEqual( - md.convert(text), - '

      ' - 'Code in hd' # noqa - '' # noqa - '¶' # noqa - '' # noqa - '

      ' - ) - - def testInlineMarkupPermalinkLeading(self): - """ Test TOC `permalink_leading` with headers containing markup. """ - text = '# Code `in` hd' - md = markdown.Markdown(extensions=[ - markdown.extensions.toc.TocExtension( - permalink=True, permalink_title="PL", permalink_leading=True)] - ) - self.assertEqual( - md.convert(text), - '

      ' - '' # noqa - '¶' # noqa - '' # noqa - 'Code in hd' # noqa - '

      ' - ) - - class TestSmarty(unittest.TestCase): def setUp(self): config = { diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index e67d36657..01347608f 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -20,14 +20,501 @@ """ from markdown.test_tools import TestCase -from markdown.extensions.toc import TocExtension, strip_tags +from markdown import Markdown +from markdown.extensions.toc import TocExtension, strip_tags, unique from markdown.extensions.nl2br import Nl2BrExtension class TestTOC(TestCase): maxDiff = None + default_kwargs = { + 'extensions': [TocExtension()] + } + + def testTOCMarker(self): + self.assertMarkdownRenders( + self.dedent( + ''' + [TOC] + + # Header 1 + + ## Header 2 + ''' + ), + '
      \n' + '
        \n' # noqa + '
      • Header 1' # noqa + '\n' # noqa + '
      • \n' # noqa + '
      \n' # noqa + '
      \n' + '

      Header 1

      \n' + '

      Header 2

      ' + ) + + def testNoTOCMarker(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Header 1 + + ## Header 2 + ''' + ), + self.dedent( + ''' +

      Header 1

      +

      Header 2

      + ''' + ), + expected_attrs={ + 'toc': ( + '
      \n' + '
        \n' # noqa + '
      • Header 1' # noqa + '\n' # noqa + '
      • \n' # noqa + '
      \n' # noqa + '
      \n' + ) + } + ) + + def testAlternateTOCMarker(self): + self.assertMarkdownRenders( + self.dedent( + ''' + {{marker}} + + # Header 1 + + ## Header 2 + ''' + ), + '
      \n' + '
        \n' # noqa + '
      • Header 1' # noqa + '\n' # noqa + '
      • \n' # noqa + '
      \n' # noqa + '
      \n' + '

      Header 1

      \n' + '

      Header 2

      ', + extensions=[TocExtension(marker='{{marker}}')] + ) + + def testDisabledTOCMarker(self): + self.assertMarkdownRenders( + self.dedent( + ''' + [TOC] + + # Header 1 + + ## Header 2 + ''' + ), + self.dedent( + ''' +

      [TOC]

      +

      Header 1

      +

      Header 2

      + ''' + ), + expected_attrs={ + 'toc': ( + '
      \n' + '
        \n' # noqa + '
      • Header 1' # noqa + '\n' # noqa + '
      • \n' # noqa + '
      \n' # noqa + '
      \n' + ) + }, + extensions=[TocExtension(marker='')] + ) + + def testTOCReset(self): + md = Markdown(extensions=[TocExtension()]) + self.assertEqual(md.toc, '') + self.assertEqual(md.toc_tokens, []) + md.convert('# Header 1') + self.assertEqual('
      ', md.toc[:17]) + self.assertEqual(len(md.toc_tokens), 1) + md.reset() + self.assertEqual(md.toc, '') + self.assertEqual(md.toc_tokens, []) + + def testUniqueIds(self): + self.assertMarkdownRenders( + self.dedent( + ''' + #Header + #Header + #Header + ''' + ), + self.dedent( + ''' +

      Header

      +

      Header

      +

      Header

      + ''' + ), + expected_attrs={ + 'toc': ( + '
      \n' + '\n' # noqa + '
      \n' + ), + 'toc_tokens': [ + { + 'level': 1, + 'id': 'header', + 'name': 'Header', + 'html': 'Header', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 1, + 'id': 'header_1', + 'name': 'Header', + 'html': 'Header', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 1, + 'id': 'header_2', + 'name': 'Header', + 'html': 'Header', + 'data-toc-label': '', + 'children': [] + }, + ] + } + ) + + def testHtmlEntitiesInTOC(self): + self.assertMarkdownRenders( + '# Foo & bar', + '

      Foo & bar

      ', + expected_attrs={ + 'toc': ( + '
      \n' + '\n' # noqa + '
      \n' + ), + 'toc_tokens': [{ + 'level': 1, + 'id': 'foo-bar', + 'name': 'Foo & bar', + 'html': 'Foo & bar', + 'data-toc-label': '', + 'children': [] + }] + } + ) + + def testHtmlSpecialCharsInTOC(self): + self.assertMarkdownRenders( + '# Foo > & bar', + '

      Foo > & bar

      ', + expected_attrs={ + 'toc': ( + '
      \n' + '\n' # noqa + '
      \n' + ), + 'toc_tokens': [{ + 'level': 1, + 'id': 'foo-bar', + 'name': 'Foo > & bar', + 'html': 'Foo > & bar', + 'data-toc-label': '', + 'children': [] + }] + } + ) + + def testRawHtmlInTOC(self): + self.assertMarkdownRenders( + '# Foo Bar Baz.', + '

      Foo Bar Baz.

      ', + expected_attrs={ + 'toc': ( + '
      \n' + '\n' # noqa + '
      \n' + ), + 'toc_tokens': [{ + 'level': 1, + 'id': 'foo-bar-baz', + 'name': 'Foo Bar Baz.', + 'html': 'Foo Bar Baz.', + 'data-toc-label': '', + 'children': [] + }] + } + ) + + def testTOCBaseLevel(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Some Header + ## Next Level + ### Too High + ''' + ), + self.dedent( + ''' +
      Some Header
      +
      Next Level
      +
      Too High
      + ''' + ), + expected_attrs={ + 'toc': ( + '
      \n' + '\n' # noqa + '
      \n' + ), + 'toc_tokens': [{ + 'level': 5, + 'id': 'some-header', + 'name': 'Some Header', + 'html': 'Some Header', + 'data-toc-label': '', + 'children': [ + { + 'level': 6, + 'id': 'next-level', + 'name': 'Next Level', + 'html': 'Next Level', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 6, + 'id': 'too-high', + 'name': 'Too High', + 'html': 'Too High', + 'data-toc-label': '', + 'children': [] + } + ] + }] + }, + extensions=[TocExtension(baselevel=5)] + ) + + def testHeaderInlineMarkup(self): + self.assertMarkdownRenders( + '#Some *Header* with [markup](http://example.com).', + '

      Some Header with ' + 'markup.

      ', + expected_attrs={ + 'toc': ( + '
      \n' + '\n' # noqa + '
      \n' + ), + 'toc_tokens': [{ + 'level': 1, + 'id': 'some-header-with-markup', + 'name': 'Some Header with markup.', + 'html': 'Some Header with markup.', + 'data-toc-label': '', + 'children': [] + }] + } + ) + + def testTOCTitle(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Header 1 + + ## Header 2 + ''' + ), + self.dedent( + ''' +

      Header 1

      +

      Header 2

      + ''' + ), + expected_attrs={ + 'toc': ( + '
      Table of Contents' + '
        \n' # noqa + '
      • Header 1' # noqa + '\n' # noqa + '
      • \n' # noqa + '
      \n' # noqa + '
      \n' + ) + }, + extensions=[TocExtension(title='Table of Contents')] + ) + + def testTOCUniqueFunc(self): + ids = {'foo'} + self.assertEqual(unique('foo', ids), 'foo_1') + self.assertEqual(ids, {'foo', 'foo_1'}) + + def testTocInHeaders(self): + self.assertMarkdownRenders( + self.dedent( + ''' + [TOC] + #[TOC] + ''' + ), + '
      \n' # noqa + '
        \n' # noqa + '
      • [TOC]
      • \n' # noqa + '
      \n' # noqa + '
      \n' # noqa + '

      [TOC]

      ' # noqa + ) + + self.assertMarkdownRenders( + self.dedent( + ''' + #[TOC] + [TOC] + ''' + ), + '

      [TOC]

      \n' # noqa + '
      \n' # noqa + '
        \n' # noqa + '
      • [TOC]
      • \n' # noqa + '
      \n' # noqa + '
      ' # noqa + ) + + self.assertMarkdownRenders( + self.dedent( + ''' + [TOC] + # *[TOC]* + ''' + ), + '
      \n' # noqa + '
        \n' # noqa + '
      • [TOC]
      • \n' # noqa + '
      \n' # noqa + '
      \n' # noqa + '

      [TOC]

      ' # noqa + ) + + def testTOCPermalink(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Hd 1 + + ## Hd 2 + ''' + ), + '

      ' + 'Hd 1' # noqa + '' # noqa + '¶' # noqa + '' # noqa + '

      \n' + '

      ' + 'Hd 2' # noqa + '' # noqa + '¶' # noqa + '' # noqa + '

      ', + extensions=[TocExtension(permalink=True, permalink_title="PL")] + ) + + def testTOCPermalinkLeading(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Hd 1 + ## Hd 2 + ''' + ), + '

      ' + '' # noqa + '¶' # noqa + '' # noqa + 'Hd 1' # noqa + '

      \n' + '

      ' + '' # noqa + '¶' # noqa + '' # noqa + 'Hd 2' # noqa + '

      ', + extensions=[TocExtension(permalink=True, permalink_title="PL", permalink_leading=True)] + ) + + def testTOCInlineMarkupPermalink(self): + self.assertMarkdownRenders( + '# Code `in` hd', + '

      ' + 'Code in hd' # noqa + '' # noqa + '¶' # noqa + '' # noqa + '

      ', + extensions=[TocExtension(permalink=True, permalink_title="PL")] + ) + + def testTOCInlineMarkupPermalinkLeading(self): + self.assertMarkdownRenders( + '# Code `in` hd', + '

      ' + '' # noqa + '¶' # noqa + '' # noqa + 'Code in hd' # noqa + '

            ',
            extensions=[TocExtension(permalink=True, permalink_title="PL", permalink_leading=True)]
        )
-    # TODO: Move the rest of the TOC tests here.
     def testAnchorLink(self):
         self.assertMarkdownRenders(

From 9d5d81305bed0ee3d55d738e48d85b8c03bbc95c Mon Sep 17 00:00:00 2001
From: Waylan Limberg
Date: Thu, 7 Mar 2024 10:26:42 -0500
Subject: [PATCH 14/17] cleanup

---
 tests/test_syntax/extensions/test_toc.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py
index 01347608f..9902072a2 100644
--- a/tests/test_syntax/extensions/test_toc.py
+++ b/tests/test_syntax/extensions/test_toc.py
@@ -96,17 +96,17 @@ def testAlternateTOCMarker(self):
                 ## Header 2
                 '''
             ),
-            '
      \n' - '
        \n' # noqa - '
      • Header 1' # noqa - '\n' # noqa - '
      • \n' # noqa - '
      \n' # noqa - '
      \n' - '

      Header 1

      \n' - '

      Header 2

      ', + '
      \n' + '
        \n' # noqa + '
      • Header 1' # noqa + '\n' # noqa + '
      • \n' # noqa + '
      \n' # noqa + '
      \n' + '

      Header 1

      \n' + '

      Header 2

            ',
            extensions=[TocExtension(marker='{{marker}}')]
        )
@@ -515,7 +515,6 @@ def testTOCInlineMarkupPermalinkLeading(self):
             extensions=[TocExtension(permalink=True, permalink_title="PL", permalink_leading=True)]
         )
-
     def testAnchorLink(self):
         self.assertMarkdownRenders(
             self.dedent(
                 '''

From c4a139ff3183acae4e726f05064355268db099d1 Mon Sep 17 00:00:00 2001
From: Waylan Limberg
Date: Thu, 7 Mar 2024 13:24:21 -0500
Subject: [PATCH 15/17] update docs

---
 docs/changelog.md      | 19 +++++++++++++++++++
 docs/extensions/toc.md | 11 +++++++++++
 2 files changed, 30 insertions(+)

diff --git a/docs/changelog.md b/docs/changelog.md
index 8deaefd28..0d8c38dfb 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -10,6 +10,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [unreleased]

+### Changed
+
+#### Refactor TOC Sanitation
+
+* All postprocessors are run on heading content.
+* Footnote references are stripped from heading content. Fixes #660.
+* A more robust `striptags` is provided to convert headings to plain text.
+  Unlike, markupsafe's implementation, HTML entities are not unescaped.
+* The plain text `name`, rich `html` and unescaped raw `data-toc-label` are
+  saved to `toc_tokens`, allowing users to access the full rich text content of
+  the headings directly from `toc_tokens`.
+* `data-toc-label` is sanitized separate from heading content.
+* A `html.unescape` call is made just prior to calling `slugify` so that
+  `slugify` only operates on Unicode characters. Note that `html.unescape` is
+  not run on the `name` or `html`.
+* The `get_name` and `stashedHTML2text` functions defined in the `toc` extension
+  are both **deprecated**. Instead, use some combination of `run_postprocessors`,
+  `render_inner_html` and `striptags`.
+
 ### Fixed

 * Include `scripts/*.py` in the generated source tarballs (#1430).

diff --git a/docs/extensions/toc.md b/docs/extensions/toc.md
index 1f80c7ea6..d1c64a9de 100644
--- a/docs/extensions/toc.md
+++ b/docs/extensions/toc.md
@@ -80,6 +80,8 @@ the following object at `md.toc_tokens`:
         'level': 1,
         'id': 'header-1',
         'name': 'Header 1',
+        'html': 'Header 1',
+        'data-toc-label': '',
         'children': [
             {'level': 2, 'id': 'header-2', 'name': 'Header 2', 'children':[]}
         ]
@@ -91,6 +93,11 @@
 Note that the `level` refers to the `hn` level. In other words, `<h1>` is level
 `1` and `<h2>` is level `2`, etc. Be aware that improperly nested levels in the
 input may result in odd nesting of the output.

+`name` is the sanitized value which would also be used as a label for the HTML
+version of the Table of Contents. `html` contains the fully rendered HTML
+content of the heading and has not been sanitized in any way. This may be used
+with your own custom sanitation to create a custom table of contents.
+
 ### Custom Labels

 In most cases, the text label in the Table of Contents should match the text of

@@ -131,6 +138,10 @@ attribute list to provide a cleaner URL when linking to the header. If the ID is
 not manually defined, it is always derived from the text of the header, never
 from the `data-toc-label` attribute.

+The value of the `data-toc-label` attribute is sanitized and stripped of any HTML
+tags. However, `toc_tokens` will contain the raw content under
+`data-toc-label`.
+
 Usage
 -----

From 6a39af08e2c95c0c67d743067037cdd3e0837dde Mon Sep 17 00:00:00 2001
From: Waylan Limberg
Date: Thu, 7 Mar 2024 13:28:50 -0500
Subject: [PATCH 16/17] cleanup

---
 docs/changelog.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/changelog.md b/docs/changelog.md
index 0d8c38dfb..a71dbf259 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -17,7 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * All postprocessors are run on heading content.
 * Footnote references are stripped from heading content. Fixes #660.
 * A more robust `striptags` is provided to convert headings to plain text.
-  Unlike, markupsafe's implementation, HTML entities are not unescaped.
+  Unlike the `markupsafe` implementation, HTML entities are not unescaped.
 * The plain text `name`, rich `html` and unescaped raw `data-toc-label` are
   saved to `toc_tokens`, allowing users to access the full rich text content of
   the headings directly from `toc_tokens`.

From e02941b7d12745db28731bf0f5f06c570ee7d285 Mon Sep 17 00:00:00 2001
From: Waylan Limberg
Date: Thu, 7 Mar 2024 14:01:38 -0500
Subject: [PATCH 17/17] update copyright

---
 markdown/extensions/toc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
index bdea6b48a..5462a906c 100644
--- a/markdown/extensions/toc.py
+++ b/markdown/extensions/toc.py
@@ -6,7 +6,7 @@
 # Original code Copyright 2008 [Jack Miller](https://codezen.org/)
-# All changes Copyright 2008-2014 The Python Markdown Project
+# All changes Copyright 2008-2024 The Python Markdown Project
 # License: [BSD](https://opensource.org/licenses/bsd-license.php)
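
As a minimal, illustrative sketch of the `toc_tokens` fields documented above — not part of the patch itself — the snippet below walks `md.toc_tokens` and builds a simple custom table of contents from the rich `html` labels, falling back to the plain text `name`. The sample input and the `make_custom_toc` helper are hypothetical and exist only for demonstration; any real sanitation of the unsanitized `html` values is left to the caller.

```python
import markdown

# Convert some input so that `md.toc_tokens` is populated.
md = markdown.Markdown(extensions=['toc'])
md.convert('# Some *Header* with [markup](http://example.com)\n\n## A `code` subheading')


def make_custom_toc(tokens, depth=0):
    """Render `toc_tokens` as an indented list of links with rich HTML labels."""
    lines = []
    for token in tokens:
        # `html` is the fully rendered, unsanitized heading content;
        # `name` is the sanitized plain-text fallback.
        label = token['html'] or token['name']
        lines.append(f'{"  " * depth}<a href="#{token["id"]}">{label}</a>')
        # Recurse into nested headings.
        lines.extend(make_custom_toc(token['children'], depth + 1))
    return lines


print('\n'.join(make_custom_toc(md.toc_tokens)))
```

Because `html` is deliberately left unsanitized, a consumer that does not want raw inline HTML in its navigation can swap in `name` (or its own `striptags`-style cleanup) at the `label =` line without touching the rest of the traversal.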