Merged
40 changes: 40 additions & 0 deletions Lib/test/test_textwrap.py
@@ -769,6 +769,46 @@ def assertUnchanged(self, text):
        """assert that dedent() has no effect on 'text'"""
        self.assertEqual(text, dedent(text))

    def test_dedent_only_whitespace(self):
        # The empty string.
        text = ""
        self.assertUnchanged(text)

        # Only spaces.
        text = " "
        expect = ""
        self.assertEqual(expect, dedent(text))

        # Only tabs.
        text = "\t\t\t\t"
        expect = ""
        self.assertEqual(expect, dedent(text))

        # A mixture.
        text = " \t \t\t \t "
        expect = ""
        self.assertEqual(expect, dedent(text))

        # ASCII whitespace.
        text = "\f\n\r\t\v "
        expect = "\n"
        self.assertEqual(expect, dedent(text))

        # One newline.
        text = "\n"
        expect = "\n"
        self.assertEqual(expect, dedent(text))

        # Windows-style newlines.
        text = "\r\n"
        expect = "\n"
        self.assertEqual(expect, dedent(text))

        # Whitespace mixture.
        text = " \n\t\n \n\t\t\n\n\n "
        expect = "\n\n\n\n\n\n"
        self.assertEqual(expect, dedent(text))

    def test_dedent_nomargin(self):
        # No lines indented.
        text = "Hello there.\nHow are you?\nOh good, I'm glad."
54 changes: 17 additions & 37 deletions Lib/textwrap.py
@@ -413,9 +413,6 @@ def shorten(text, width, **kwargs):

# -- Loosely related functionality -------------------------------------

_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

@@ -429,42 +426,25 @@ def dedent(text):

    Entirely blank lines are normalized to a newline character.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Find the largest common whitespace between current line and previous
        # winner.
        else:
            for i, (x, y) in enumerate(zip(margin, indent)):
                if x != y:
                    margin = margin[:i]
                    break
    if not text:
        return text

    # If the input is entirely whitespace, return normalized lines.
    if text.isspace():
        return '\n' * text.count('\n')

Contributor commented:

The paths are here because otherwise non_blank_lines would be empty (not for performance, I would say; those cases are rare).

It is possible, though, to get rid of these cases by writing:

def dedent(text):
    lines = text.split('\n')

    # Get length of leading whitespace, inspired by ``os.path.commonprefix()``.
    non_blank_lines = [l for l in lines if l and not l.isspace()]
    l1 = min(non_blank_lines, default='hello')
    l2 = max(non_blank_lines, default='world')
    for margin, c in enumerate(l1):
        if c != l2[margin] or c not in ' \t':
            break

    return '\n'.join([l[margin:] if not l.isspace() else ''
                      for l in lines])

Not sure whether this makes the code nice and small, or part of some obfuscated code contest :-)
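
A side note for readers following the trick above: min()/max() work here for the same reason they do in os.path.commonprefix() -- the lexicographically smallest and largest non-blank lines bound every other line, so any leading whitespace they share is shared by all lines. A tiny illustration (the sample lines are made up, not from the PR):

# Illustration only: compute the common leading whitespace of some sample
# lines via the lexicographic min/max, exactly as the proposed dedent() does.
lines = ["    def f():", "        return 1", "    # done"]
l1, l2 = min(lines), max(lines)      # "        return 1", "    def f():"
margin = 0
for margin, c in enumerate(l1):
    if c != l2[margin] or c not in ' \t':
        break
print(repr(l1[:margin]))             # '    ' -> the four-space common margin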

AA-Turner (Member, Author) commented on Mar 30, 2025:

Using:

def dedent(text):
    lines = text.split('\n')
    non_blank_lines = [l for l in lines if l and not l.isspace()]
    l1 = min(non_blank_lines, default='')
    l2 = max(non_blank_lines, default='')
    margin = 0
    for margin, c in enumerate(l1):
        if c != l2[margin] or c not in ' \t':
            break

    return '\n'.join([l[margin:] if not l.isspace() else '' for l in lines])

I get the following benchmark results:

Benchmark                                    | textwrap | with whitespace checks | proposed (no branches)
---------------------------------------------|----------|------------------------|-----------------------
raw_text: no prefix                          | 12.1 ms  | 4.81 ms: 2.52x faster  | 4.71 ms: 2.58x faster
raw_text: "abc \t"                           | 12.7 ms  | 4.81 ms: 2.63x faster  | 4.50 ms: 2.81x faster
raw_text: " "                                | 19.4 ms  | 5.77 ms: 3.37x faster  | 5.67 ms: 3.43x faster
raw_text: "\t"                               | 18.4 ms  | 5.66 ms: 3.25x faster  | 5.54 ms: 3.32x faster
raw_text: " \t abc"                          | 21.0 ms  | 5.75 ms: 3.65x faster  | 5.42 ms: 3.88x faster
raw_text: " \t abc \t "                      | 24.1 ms  | 6.12 ms: 3.94x faster  | 5.76 ms: 4.18x faster
raw_text: 1000 spaces                        | 338 ms   | 43.5 ms: 7.76x faster  | 42.7 ms: 7.91x faster
Basic indented text with empty lines         | 4.66 us  | 2.92 us: 1.60x faster  | 3.08 us: 1.52x faster
Text with mixed indentation and blank lines  | 4.28 us  | 2.99 us: 1.43x faster  | 3.07 us: 1.39x faster
No indentation (edge case)                   | 2.66 us  | 1.95 us: 1.37x faster  | 2.05 us: 1.30x faster
Only blank lines                             | 1.09 us  | 299 ns: 3.64x faster   | 1.56 us: 1.43x slower
Edge case: No common prefix to remove        | 1.79 us  | 1.71 us: 1.05x faster  | not significant
Edge case: Single indented line              | 2.90 us  | 1.67 us: 1.74x faster  | 1.68 us: 1.73x faster
Edge case: Single indented line only         | 704 ns   | 260 ns: 2.71x faster   | 1.00 us: 1.42x slower
Edge case: Empty text                        | 478 ns   | 130 ns: 3.67x faster   | 1.01 us: 2.11x slower
no_indent                                    | 10.4 us  | 3.68 us: 2.81x faster  | 3.97 us: 2.61x faster
spaces                                       | 18.2 us  | 4.76 us: 3.81x faster  | 4.57 us: 3.98x faster
mixed                                        | 27.5 us  | 18.0 us: 1.53x faster  | 16.9 us: 1.63x faster
large_text                                   | 379 us   | 109 us: 3.47x faster   | 117 us: 3.23x faster
whitespace_only                              | 1.23 us  | 299 ns: 4.12x faster   | 1.40 us: 1.14x slower
Geometric mean                               | (ref)    | 2.69x faster           | 1.94x faster

Notably, the synthetic benchmarks contain quite a lot of whitespace-only cases: "Only blank lines", "Edge case: Single indented line only", "Edge case: Empty text", and "whitespace_only". The text-heavy benchmarks don't seem to change significantly, and some show slight improvements.
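
For context, output in this style typically comes from pyperf; a minimal sketch of how one of these cases could be timed (the script name and sample inputs below are assumptions, not the actual benchmark suite used above):

# bench_dedent.py -- hypothetical micro-benchmark, not the suite used above.
import pyperf

runner = pyperf.Runner()
# Time dedent() on a whitespace-only input and on ordinary indented text.
runner.timeit("whitespace_only",
              stmt="dedent(' ' * 1000)",
              setup="from textwrap import dedent")
runner.timeit("basic_indented_text",
              stmt="dedent('    alpha\\n    beta\\n\\n    gamma\\n')",
              setup="from textwrap import dedent")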

Contributor commented:

This is looking really good! I am having a lot of difficulty improving this further. The only other optimizations I can think of are folding the min/max computation into the pass that builds non_blank_lines, and caching the result of not l.isspace(), since that is computed twice. I tried to implement something like that, but did not have much success, since the list comprehension is C-optimized, as are the min and max operations.

Member commented:

Caching would only make sense for lines that are very, very long and that we need to iterate over multiple times. For caching, we could either use a custom str class to cache the call, or cache the results themselves in another list and use it as a mask (as with NumPy arrays), but I don't know whether we gain anything (we would need to test that).

isspace = [l.isspace() for l in lines]
non_blank_lines = [lines[i] for i, v in enumerate(isspace) if lines[i] and not v]
...
return '\n'.join([l[margin:] if not isspace[i] else '' for i, l in enumerate(lines)])

I'm almost certain we can compute isspace and non_blank_lines simultaneously using some itertools recipes, but I don't know how much we would gain. In the worst case, we need to iterate over all the lines at least 3 + 3 times (min/max plus computing the margin). Currently, we already iterate over the lines at least 1 + 2 + 1 + 1 times (1 for non_blank_lines, 2 for min/max, 1 for margin, and 1 for the final join), so we're already doing a lot of passes.
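
Filling in the "..." above, a complete version of the mask idea might look like this (a sketch based on the discussion, not code from the PR):

def dedent(text):
    if not text:
        return text
    if text.isspace():
        return '\n' * text.count('\n')

    lines = text.split('\n')
    # Compute the "is this line blank?" result once and reuse it below.
    blank = [not l or l.isspace() for l in lines]
    non_blank_lines = [l for l, b in zip(lines, blank) if not b]

    # Longest common leading whitespace, as in os.path.commonprefix().
    l1 = min(non_blank_lines)
    l2 = max(non_blank_lines)
    margin = 0
    for margin, c in enumerate(l1):
        if c != l2[margin] or c not in ' \t':
            break

    return '\n'.join([l[margin:] if not b else ''
                      for l, b in zip(lines, blank)])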

AA-Turner (Member, Author) commented:

I benchmarked a version using itertools.compress, and found it was slower overall (1.1x):

    # Get length of leading whitespace, inspired by ``os.path.commonprefix()``.
    non_blank_idx = [l and not l.isspace() for l in lines]
    non_blank_lines = list(compress(lines, non_blank_idx))
    l1 = min(non_blank_lines, default='')
    l2 = max(non_blank_lines, default='')
    margin = 0
    for margin, c in enumerate(l1):
        if c != l2[margin] or c not in ' \t':
            break
    return '\n'.join([l[margin:] if non_blank_idx[i] else '' for i, l in enumerate(lines)])

I suspect that the isspace() calls are quite cheap, as I assume they exit on the first non-space character. I also prefer the simplicity of the current implementation. What would help is a C-level min-max function that returns both the min and max from one iteration, but that's out of scope for this PR.
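
As a rough illustration of what such a helper would do, a pure-Python single-pass min-max (hypothetical; the stdlib has no such builtin today) could look like:

def min_max(iterable):
    """Return (min, max) of *iterable* in a single pass.

    Hypothetical helper, sketching the C-level function discussed above.
    """
    it = iter(iterable)
    try:
        lo = hi = next(it)
    except StopIteration:
        raise ValueError("min_max() arg is an empty sequence") from None
    for item in it:
        if item < lo:
            lo = item
        elif item > hi:
            hi = item
    return lo, hi

# dedent() would then need only one pass over non_blank_lines:
# l1, l2 = min_max(non_blank_lines)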

Member commented:

> they exit on the first non-space character

Yes.

Contributor commented:

> What would help is a C-level min-max function that returns both the min and max from one iteration, but that's out of scope for this PR.

Agreed! I was just thinking about potentially creating this function. If I were to try this, do you happen to know where the min and max implementations are in this repo? It should be pretty simple to implement the min-max version from that.


    lines = text.split('\n')

    # sanity check (testing/debugging only)
    if 0 and margin:
        for line in text.split("\n"):
            assert not line or line.startswith(margin), \
                   "line = %r, margin = %r" % (line, margin)
    # Get length of leading whitespace, inspired by ``os.path.commonprefix()``.
    non_blank_lines = [l for l in lines if l and not l.isspace()]
    l1 = min(non_blank_lines)
    l2 = max(non_blank_lines)
    for margin, c in enumerate(l1):
        if c != l2[margin] or c not in ' \t':
            break

    if margin:
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
    return '\n'.join([l[margin:] if l and not l.isspace() else ''
                      for l in lines])


def indent(text, prefix, predicate=None):
@@ -0,0 +1,2 @@
Improved performance of :func:`textwrap.dedent` by ~2.4x.
Patch by Adam Turner and Marius Juston.