Skip to content

Commit f3e6ec1

Browse files
committed
[BigString] Fix accidentally quadratic BigString.init
When ingesting a `String` instance, `BigString` assumes that the input string has a reasonably efficient UTF-8 view. Unfortunately, that is very much not the case when the input happens to be backed by a bridged NSString object — it appears that in this case, the ingester loop invokes some operation(s) with linear complexity in the size of the entire input, rendering the ingester’s overall complexity quadratic. The BigString ingester is only expected to operate within a single chunk at a time. It’s unclear precisely which operation triggers the quadratic behavior; ideally we should figure it out and resolve it with a more targeted fix. In the meantime, a blunt stopgap fix is to force-transcode the input string to UTF-8 at the time the ingester is initialized. This unnecessarily wastes some (temporary) memory on holding the transcoded string, but it avoids the quadratic cliff.
1 parent 3d2dc41 commit f3e6ec1

File tree

3 files changed

+118
-0
lines changed

3 files changed

+118
-0
lines changed
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift Collections open source project
4+
//
5+
// Copyright (c) 2024 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
import CollectionsBenchmark
13+
import _RopeModule
14+
import Foundation
15+
16+
/// The pool of Unicode scalars that the random input generators draw from.
///
/// The pool is the concatenation, in ascending order, of several half-open
/// code point ranges; each range's upper bound is excluded.
let someLatinSymbols: [UnicodeScalar] = {
  let codePointRanges: [Range<UInt32>] = [
    0x20 ..< 0x7f,
    0xa1 ..< 0xad,
    0xae ..< 0x2af,
    0x300 ..< 0x370,
    0x1e00 ..< 0x1eff,
  ]
  var scalars: [UnicodeScalar] = []
  for codePoints in codePointRanges {
    for value in codePoints {
      // Every value listed above is a valid scalar value (none falls in the
      // surrogate range), so the failable initializer always succeeds.
      scalars.append(UnicodeScalar(value)!)
    }
  }
  return scalars
}()
25+
26+
extension UnicodeScalar {
  /// Returns one scalar chosen at random from `someLatinSymbols`.
  ///
  /// - Parameter rng: The random number generator used to make the choice.
  /// - Returns: A randomly selected element of `someLatinSymbols`.
  static func randomLatin(
    using rng: inout some RandomNumberGenerator
  ) -> Self {
    // `randomElement` only returns nil for an empty collection; the pool is a
    // non-empty constant, so hitting the failure path is a programmer error.
    guard let scalar = someLatinSymbols.randomElement(using: &rng) else {
      fatalError("someLatinSymbols is empty")
    }
    return scalar
  }
}
33+
34+
extension String.UnicodeScalarView {
  /// Builds a scalar view of `runeCount` scalars, each drawn independently
  /// with `UnicodeScalar.randomLatin(using:)`.
  ///
  /// - Parameters:
  ///   - runeCount: The number of scalars to generate.
  ///   - rng: The random number generator to draw from.
  /// - Returns: A view containing exactly `runeCount` scalars.
  static func randomLatin(
    runeCount: Int, using rng: inout some RandomNumberGenerator
  ) -> Self {
    let scalars = (0 ..< runeCount).map { _ in
      UnicodeScalar.randomLatin(using: &rng)
    }
    return Self(scalars)
  }
}
45+
46+
extension String {
  /// Returns a string whose scalars come from
  /// `String.UnicodeScalarView.randomLatin(runeCount:using:)`.
  ///
  /// - Parameters:
  ///   - runeCount: The number of Unicode scalars in the result.
  ///   - rng: The random number generator to draw from.
  static func randomLatin(
    runeCount: Int, using rng: inout some RandomNumberGenerator
  ) -> Self {
    Self(UnicodeScalarView.randomLatin(runeCount: runeCount, using: &rng))
  }
}
55+
56+
/// Benchmark input holding a random string produced by
/// `String.randomLatin(runeCount:using:)`, without any Foundation bridging.
struct NativeStringInput {
  /// The generated string handed to the benchmark body.
  let value: String

  /// Generates a random string of `runeCount` Unicode scalars.
  ///
  /// - Parameters:
  ///   - runeCount: The number of scalars to generate.
  ///   - rng: The random number generator to draw from.
  init(runeCount: Int, using rng: inout some RandomNumberGenerator) {
    value = .randomLatin(runeCount: runeCount, using: &rng)
  }
}
63+
64+
/// Benchmark input holding a `String` obtained by bridging an `NSString`
/// that was created from UTF-16 code units.
struct BridgedStringInput {
  /// The bridged string handed to the benchmark body.
  let value: String

  /// Generates a random string of `runeCount` Unicode scalars, copies its
  /// UTF-16 code units into an `NSString`, and stores that object bridged
  /// back to `String`.
  ///
  /// - Parameters:
  ///   - runeCount: The number of scalars to generate.
  ///   - rng: The random number generator to draw from.
  init(runeCount: Int, using rng: inout some RandomNumberGenerator) {
    let codeUnits = Array(
      String.randomLatin(runeCount: runeCount, using: &rng).utf16)
    let bridged: NSString = codeUnits.withUnsafeBufferPointer { buffer in
      NSString(characters: buffer.baseAddress!, length: buffer.count)
    }
    value = bridged as String
  }
}
76+
77+
78+
extension Benchmark {
  /// Registers the input generators and benchmark tasks that measure
  /// `BigString` initialization from native and from bridged strings.
  ///
  /// Does nothing on OS versions older than the availability check below.
  public mutating func addBigStringBenchmarks() {
    guard #available(macOS 13.3, iOS 16.4, watchOS 9.4, tvOS 16.4, *) else {
      return
    }

    // Input generators: the benchmark size is used as the scalar count.
    registerInputGenerator(for: NativeStringInput.self) { size in
      var generator = SystemRandomNumberGenerator()
      return NativeStringInput(runeCount: size, using: &generator)
    }

    registerInputGenerator(for: BridgedStringInput.self) { size in
      var generator = SystemRandomNumberGenerator()
      return BridgedStringInput(runeCount: size, using: &generator)
    }

    // Tasks: build a BigString and pass it to `blackHole`.
    addSimple(
      title: "BigString init from native string",
      input: NativeStringInput.self
    ) { native in
      let big = BigString(native.value)
      blackHole(big)
    }

    addSimple(
      title: "BigString init from bridged string",
      input: BridgedStringInput.self
    ) { bridged in
      let big = BigString(bridged.value)
      blackHole(big)
    }
  }
}

Benchmarks/Sources/benchmark-tool/main.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ benchmark.addHeapBenchmarks()
3737
benchmark.addBitSetBenchmarks()
3838
benchmark.addTreeSetBenchmarks()
3939
benchmark.addCppBenchmarks()
40+
benchmark.addBigStringBenchmarks()
4041
#if os(macOS) || os(iOS) || os(watchOS) || os(tvOS)
4142
benchmark.addFoundationBenchmarks()
4243
#endif

Sources/RopeModule/BigString/Basics/BigString+Ingester.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,21 @@ extension BigString {
4040

4141
/// Creates an ingester over `input`, starting from a fresh
/// `_CharacterRecognizer` state.
///
/// - Parameter input: The text to ingest.
init(_ input: Substring) {
  self.input = input
  // Prevent accidentally quadratic operation by ensuring that we have
  // a native UTF-8 string. If the input is not already contiguous UTF-8
  // (e.g. a bridged NSString), this replaces `self.input` with a copy.
  self.input.makeContiguousUTF8()
  // Take the start position from the stored substring, not from the
  // parameter: if a copy was made above, indices of the original `input`
  // refer to different storage and must not be used with `self.input`.
  self.start = self.input.startIndex
  self.state = _CharacterRecognizer()
}
4647

4748
/// Creates an ingester over `input` that continues from an existing
/// character recognizer state.
///
/// - Parameters:
///   - input: The text to ingest.
///   - startState: The recognizer state to start from.
init(_ input: Substring, startState: __owned _CharacterRecognizer) {
  self.input = input
  // Prevent accidentally quadratic operation by ensuring that we have
  // a native UTF-8 string.
  // FIXME: This is wasteful: if `input` happens to be a bridged
  // FIXME: NSString instance, then it temporarily allocates a full
  // FIXME: copy of the (transcoded) input string, only to then copy
  // FIXME: its pieces into the tree later.
  // FIXME: We should have a direct ingester path for native UTF-16 data.
  self.input.makeContiguousUTF8()
  // Take the start position from the stored substring, not from the
  // parameter: if a copy was made above, indices of the original `input`
  // refer to different storage and must not be used with `self.input`.
  self.start = self.input.startIndex
  self.state = startState
}

0 commit comments

Comments
 (0)