Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 23 additions & 26 deletions ietf/utils/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""Utilities for working with HTML."""


import bleach
import nh3
import html2text

import debug # pyflakes:ignore
Expand All @@ -15,63 +15,60 @@
from ietf.utils.mime import get_mime_type


# Allow the protocols/tags/attributes we specifically want, plus anything that bleach declares
# to be safe. As of 2025-01-27, the explicit lists for protocols and tags are a strict superset
# of bleach's defaults.
acceptable_protocols = bleach.sanitizer.ALLOWED_PROTOCOLS.union(
# Allow the protocols/tags/attributes we specifically want, plus anything that nh3 declares
# to be safe.

acceptable_protocols = nh3.ALLOWED_URL_SCHEMES.union(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: check this. It is more permissive than before, when our explicit list was a strict superset of bleach's defaults. However, I'm unsure whether it adds extra vulnerabilities, since nh3 uses a better-maintained parser: https://github.com/rust-ammonia/ammonia

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going to try to get a few other eyes on this

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think adding tel: and ftp: (really?) is fine from a security perspective.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't really need ftp, but tel might be useful.

{"http", "https", "mailto", "ftp", "xmpp"}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
{"http", "https", "mailto", "ftp", "xmpp"}
{"ftp", "http", "https", "mailto", "tel", "xmpp"}

Sort. Add "tel".

(If performance is dictated by order, move "https" to the top.)

)
acceptable_tags = bleach.sanitizer.ALLOWED_TAGS.union(
acceptable_tags = nh3.ALLOWED_TAGS.union(
{
# fmt: off
# fmt: off
"a", "abbr", "acronym", "address", "b", "big",
"blockquote", "body", "br", "caption", "center", "cite", "code", "col",
"colgroup", "dd", "del", "dfn", "dir", "div", "dl", "dt", "em", "font",
"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "ins", "kbd",
"li", "ol", "p", "pre", "q", "s", "samp", "small", "span", "strike", "style",
"strong", "sub", "sup", "table", "title", "tbody", "td", "tfoot", "th", "thead",
"tr", "tt", "u", "ul", "var"
"li", "ol", "p", "pre", "q", "s", "samp", "small", "span", "strike",
"strong", "sub", "sup", "table", "title", "tbody", "td", "tfoot", "th", "thead", "tr", "tt", "u", "ul", "var"
# fmt: on
}
)
acceptable_attributes = bleach.sanitizer.ALLOWED_ATTRIBUTES | {
"*": ["id"],
"ol": ["start"],
acceptable_attributes = nh3.ALLOWED_ATTRIBUTES | {
"*": {"id"},
"ol": {"start"},
}


# Instantiate sanitizer classes
_bleach_cleaner = bleach.sanitizer.Cleaner(
_nh3_cleaner = nh3.Cleaner(
tags=acceptable_tags,
attributes=acceptable_attributes,
protocols=acceptable_protocols,
strip=True,
url_schemes=acceptable_protocols,
link_rel=None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't seem right to me. The default value is safer. Are there cases where links opened will need window.opener? I can't imagine that being necessary for user-generated content.

Same below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cool, changed it here as there were no rel attributes before

)


_liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
tags=acceptable_tags.union({"img", "figure", "figcaption"}),
attributes=acceptable_attributes | {"img": ["src", "alt"]},
protocols=acceptable_protocols,
strip=True,
_liberal_nh3_cleaner = nh3.Cleaner(
tags=acceptable_tags.union({"mg", "figure", "figcaption"}),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo?

Suggested change
tags=acceptable_tags.union({"mg", "figure", "figcaption"}),
tags=acceptable_tags.union({"img", "figure", "figcaption"}),

attributes=acceptable_attributes | {"img": {"src", "alt"}},
url_schemes=acceptable_protocols,
link_rel=None
)


def clean_html(text: str):
"""Clean the HTML in a string"""
return _bleach_cleaner.clean(text)
return _nh3_cleaner.clean(text)


def liberal_clean_html(text: str):
"""More permissively clean the HTML in a string"""
return _liberal_bleach_cleaner.clean(text)
return _liberal_nh3_cleaner.clean(text)


@keep_lazy(str)
def remove_tags(html, tags):
"""Returns the given HTML sanitized, and with the given tags removed."""
allowed = acceptable_tags - set(t.lower() for t in tags)
return bleach.clean(html, tags=allowed, strip=True)
return nh3.clean(html, tags=allowed)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might want to audit invocations of this function.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if @martinthomson did so himself, but it looks to me like the only use is via the removetags filter in htmlfilters.py, which is not used anywhere (I also checked the DBTemplates)

If so, we can just lose this method entirely.

If not, I think this changes the behavior from removing the given tags to escaping them (i.e. remove_... effectively becomes escape_...), because nh3 doesn't have a strip option (but that's just based on reading the docs).



# ----------------------------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ beautifulsoup4>=4.13.4 # Only used in tests
bibtexparser>=1.4.3 # Only used in tests
bleach>=6.2.0 # project is deprecated but supported
types-bleach>=6.2.0
nh3>=0.3.2
boto3>=1.39.15
boto3-stubs[s3]>=1.39.15
botocore>=1.39.15
Expand Down
Loading