improve striptags performance (#413)

davidism · web-flow · commit a24df39e1bb0 · 2024-01-19T13:39:29.000-08:00
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -3,6 +3,9 @@ Version 2.1.4
 
 Unreleased
 
+-   Don't use regular expressions for ``striptags``, avoiding a performance
+    issue. :pr:`413`
+
 
 Version 2.1.3
 -------------
diff --git a/src/markupsafe/__init__.py b/src/markupsafe/__init__.py
@@ -1,5 +1,4 @@
 import functools
-import re
 import string
 import sys
 import typing as t
@@ -16,9 +15,6 @@ def __html__(self) -> str:
 
 __version__ = "2.1.4.dev"
 
-_strip_comments_re = re.compile(r"<!--.*?-->", re.DOTALL)
-_strip_tags_re = re.compile(r"<.*?>", re.DOTALL)
-
 
 def _simple_escaping_wrapper(func: "t.Callable[_P, str]") -> "t.Callable[_P, Markup]":
     @functools.wraps(func)
@@ -162,10 +158,41 @@ def striptags(self) -> str:
         >>> Markup("Main &raquo;\t<em>About</em>").striptags()
         'Main » About'
         """
-        # Use two regexes to avoid ambiguous matches.
-        value = _strip_comments_re.sub("", self)
-        value = _strip_tags_re.sub("", value)
-        value = " ".join(value.split())
+        # collapse spaces
+        value = " ".join(self.split())
+
+        # Look for comments then tags separately. Otherwise, a comment that
+        # contains a tag would end early, leaving some of the comment behind.
+
+        while True:
+            # keep finding comment start marks
+            start = value.find("<!--")
+
+            if start == -1:
+                break
+
+            # find a comment end mark beyond the start, otherwise stop
+            end = value.find("-->", start)
+
+            if end == -1:
+                break
+
+            value = f"{value[:start]}{value[end + 3:]}"
+
+        # remove tags using the same method
+        while True:
+            start = value.find("<")
+
+            if start == -1:
+                break
+
+            end = value.find(">", start)
+
+            if end == -1:
+                break
+
+            value = f"{value[:start]}{value[end + 1:]}"
+
         return self.__class__(value).unescape()
 
     @classmethod
diff --git a/tox.ini b/tox.ini
@@ -1,6 +1,6 @@
 [tox]
 envlist =
-    py3{12,11,10,9,8,7}
+    py3{12,11,10,9,8}
     pypy310
     style
     typing