Skip to content

Commit 4a58679

Browse files
committed
Fix character class range matching
Previously we performed a lexicographic comparison with the bounds of a character class range. However this produced surprising results, and our implementation didn't properly handle case sensitivity. Update the logic to instead only allow single scalar NFC bounds. The input is then converted to NFC in grapheme semantic mode, and checked against the range. In scalar semantic mode, the input scalar is checked on its own. Additionally, fix the case sensitivity handling such that we check both the lowercase and uppercase version of the input against the range.
1 parent 2d13a10 commit 4a58679

File tree

9 files changed

+278
-74
lines changed

9 files changed

+278
-74
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -816,8 +816,16 @@ extension AST.Atom {
816816
/// Whether this atom is valid as the operand of a custom character class
817817
/// range.
818818
public var isValidCharacterClassRangeBound: Bool {
819-
// If we have a literal character value for this, it can be used as a bound.
820-
if literalCharacterValue != nil { return true }
819+
if let c = literalCharacterValue {
820+
// We only match character range bounds that are single scalars under NFC.
821+
// FIXME: Update to use the stdlib's data. We'll need to decide whether or
822+
// not the `hasExactlyOneScalar` check should remain. In grapheme semantic
823+
// mode, we ought to be able to convert multi-scalar characters to
824+
// single-scalar NFC, but in scalar semantic mode it seems like we ought
825+
// to preserve the scalar as-written, making multi-scalar cases invalid.
826+
// As such there may be value in maintaining this restriction.
827+
return c.hasExactlyOneScalar && !c.unicodeScalars.first!.expandsUnderNFC
828+
}
821829
switch kind {
822830
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
823831
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:

Sources/_RegexParser/Utility/Misc.swift

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,40 @@ extension Substring {
1919
var string: String { String(self) }
2020
}
2121

22+
extension Character {
23+
/// Whether this character is made up of exactly one Unicode scalar value.
24+
public var hasExactlyOneScalar: Bool {
25+
let scalars = unicodeScalars
26+
return scalars.index(after: scalars.startIndex) == scalars.endIndex
27+
}
28+
}
29+
30+
extension UnicodeScalar {
31+
/// Whether a given scalar expands when NFC is applied. This is a stop-gap
32+
/// until we're able to use the normalization data provided by the stdlib.
33+
var expandsUnderNFC: Bool {
34+
if isASCII { return false }
35+
// As of Unicode 14, there are 85 code points that expand under NFC.
36+
// FIXME: This should use the stdlib data, but we don't have access to it
37+
// from _RegexParser.
38+
switch value {
39+
case 0x344, 0x958, 0x959, 0x95A, 0x95B, 0x95C, 0x95D, 0x95E, 0x95F, 0x9DC,
40+
0x9DD, 0x9DF, 0xA33, 0xA36, 0xA59, 0xA5A, 0xA5B, 0xA5E, 0xB5C, 0xB5D,
41+
0xF43, 0xF4D, 0xF52, 0xF57, 0xF5C, 0xF69, 0xF73, 0xF75, 0xF76, 0xF78,
42+
0xF81, 0xF93, 0xF9D, 0xFA2, 0xFA7, 0xFAC, 0xFB9, 0x2ADC, 0xFB1D, 0xFB1F,
43+
0xFB2A, 0xFB2B, 0xFB2C, 0xFB2D, 0xFB2E, 0xFB2F, 0xFB30, 0xFB31, 0xFB32,
44+
0xFB33, 0xFB34, 0xFB35, 0xFB36, 0xFB38, 0xFB39, 0xFB3A, 0xFB3B, 0xFB3C,
45+
0xFB3E, 0xFB40, 0xFB41, 0xFB43, 0xFB44, 0xFB46, 0xFB47, 0xFB48, 0xFB49,
46+
0xFB4A, 0xFB4B, 0xFB4C, 0xFB4D, 0xFB4E, 0x1D15E, 0x1D15F, 0x1D160,
47+
0x1D161, 0x1D162, 0x1D163, 0x1D164, 0x1D1BB, 0x1D1BC, 0x1D1BD, 0x1D1BE,
48+
0x1D1BF, 0x1D1C0:
49+
return true
50+
default:
51+
return false
52+
}
53+
}
54+
}
55+
2256
extension CustomStringConvertible {
2357
@_alwaysEmitIntoClient
2458
public var halfWidthCornerQuoted: String {

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -342,38 +342,60 @@ extension DSLTree.CustomCharacterClass.Member {
342342
}
343343
return c
344344
case let .range(low, high):
345-
// TODO:
346-
guard let lhs = low.literalCharacterValue else {
345+
guard let lhs = low.literalCharacterValue?.singleNFCScalar else {
347346
throw Unsupported("\(low) in range")
348347
}
349-
guard let rhs = high.literalCharacterValue else {
348+
guard let rhs = high.literalCharacterValue?.singleNFCScalar else {
350349
throw Unsupported("\(high) in range")
351350
}
351+
guard lhs <= rhs else {
352+
throw Unsupported("Invalid range \(low)-\(high)")
353+
}
352354

353-
if opts.isCaseInsensitive {
354-
let lhsLower = lhs.lowercased()
355-
let rhsLower = rhs.lowercased()
356-
guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
357-
return { input, bounds in
358-
// TODO: check for out of bounds?
359-
let curIdx = bounds.lowerBound
360-
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
361-
// TODO: semantic level
362-
return input.index(after: curIdx)
363-
}
364-
return nil
355+
let isCaseInsensitive = opts.isCaseInsensitive
356+
let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
357+
358+
return { input, bounds in
359+
let curIdx = bounds.lowerBound
360+
let nextIndex = isCharacterSemantic
361+
? input.index(after: curIdx)
362+
: input.unicodeScalars.index(after: curIdx)
363+
364+
// Under grapheme semantics, we compare based on single NFC scalars. If
365+
// such a character is not single scalar under NFC, the match fails. In
366+
// scalar semantics, we compare the exact scalar value to the NFC
367+
// bounds.
368+
let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar
369+
: input.unicodeScalars[curIdx]
370+
guard let scalar = scalar else { return nil }
371+
let scalarRange = lhs ... rhs
372+
if scalarRange.contains(scalar) {
373+
return nextIndex
365374
}
366-
} else {
367-
guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
368-
return { input, bounds in
369-
// TODO: check for out of bounds?
370-
let curIdx = bounds.lowerBound
371-
if (lhs...rhs).contains(input[curIdx]) {
372-
// TODO: semantic level
373-
return input.index(after: curIdx)
375+
376+
// Check for case insensitive matches.
377+
func matchesCased(
378+
_ cased: (UnicodeScalar.Properties) -> String
379+
) -> Bool {
380+
let casedStr = cased(scalar.properties)
381+
// In character semantic mode, we need to map to NFC. In scalar
382+
// semantics, we should have an exact scalar.
383+
let mapped = isCharacterSemantic ? casedStr.singleNFCScalar
384+
: casedStr.singleScalar
385+
guard let mapped = mapped else { return false }
386+
return scalarRange.contains(mapped)
387+
}
388+
if isCaseInsensitive {
389+
if scalar.properties.changesWhenLowercased,
390+
matchesCased(\.lowercaseMapping) {
391+
return nextIndex
392+
}
393+
if scalar.properties.changesWhenUppercased,
394+
matchesCased(\.uppercaseMapping) {
395+
return nextIndex
374396
}
375-
return nil
376397
}
398+
return nil
377399
}
378400

379401
case let .custom(ccc):

Sources/_StringProcessing/Unicode/CharacterProps.swift

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,3 @@
1111

1212

1313
// TODO
14-
15-
extension Character {
16-
/// Whether this character is made up of exactly one Unicode scalar value.
17-
var hasExactlyOneScalar: Bool {
18-
unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
19-
}
20-
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
@_spi(_Unicode)
13+
import Swift
14+
15+
extension Character {
16+
/// If the given character consists of a single NFC scalar, returns it. If
17+
/// there are multiple NFC scalars, returns `nil`.
18+
var singleNFCScalar: UnicodeScalar? {
19+
// SwiftStdlib is always >= 5.7 for a shipped StringProcessing.
20+
guard #available(SwiftStdlib 5.7, *) else { return nil }
21+
var nfcIter = String(self)._nfc.makeIterator()
22+
guard let scalar = nfcIter.next(), nfcIter.next() == nil else { return nil }
23+
return scalar
24+
}
25+
}
26+
27+
extension String {
28+
/// If the given string consists of a single NFC scalar, returns it. If none
29+
/// or multiple NFC scalars are present, returns `nil`.
30+
var singleNFCScalar: UnicodeScalar? {
31+
guard !isEmpty && index(after: startIndex) == endIndex else { return nil }
32+
return first!.singleNFCScalar
33+
}
34+
35+
/// If the given string contains a single scalar, returns it. If none or
36+
/// multiple scalars are present, returns `nil`.
37+
var singleScalar: UnicodeScalar? {
38+
let scalars = unicodeScalars
39+
guard !scalars.isEmpty &&
40+
scalars.index(after: scalars.startIndex) == scalars.endIndex
41+
else { return nil }
42+
return scalars.first!
43+
}
44+
}

Tests/RegexBuilderTests/RegexDSLTests.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ class RegexDSLTests: XCTestCase {
7171
}
7272

7373
func testCharacterClasses() throws {
74+
// Must have new stdlib for character class ranges.
75+
guard ensureNewStdlib() else { return }
76+
7477
try _testDSLCaptures(
7578
("a c", ("a c", " ", "c")),
7679
matchType: (Substring, Substring, Substring).self, ==)
@@ -114,6 +117,9 @@ class RegexDSLTests: XCTestCase {
114117
}
115118

116119
func testCharacterClassOperations() throws {
120+
// Must have new stdlib for character class ranges.
121+
guard ensureNewStdlib() else { return }
122+
117123
try _testDSLCaptures(
118124
("bcdefn1a", "bcdefn1a"),
119125
("nbcdef1a", nil), // fails symmetric difference lookahead
@@ -345,6 +351,9 @@ class RegexDSLTests: XCTestCase {
345351
}
346352

347353
func testQuantificationBehavior() throws {
354+
// Must have new stdlib for character class ranges.
355+
guard ensureNewStdlib() else { return }
356+
348357
// Eager by default
349358
try _testDSLCaptures(
350359
("abc1def2", ("abc1def2", "2")),

0 commit comments

Comments
 (0)