From 148ccbcd91fd818021b23f8a104b448c2f41e995 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 19 Jul 2022 11:41:55 +0100 Subject: [PATCH 1/3] Guard against testing with older stdlibs Replace a couple of `#if os(Linux)` checks with a check to see if we have a newer stdlib available. This lets us emit an expected failure in the case where we're testing on an older stdlib. --- Package.swift | 6 ++-- Sources/TestSupport/TestSupport.swift | 33 +++++++++++++++++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 5 +--- Tests/RegexTests/MatchTests.swift | 27 +++++++++-------- 4 files changed, 52 insertions(+), 19 deletions(-) create mode 100644 Sources/TestSupport/TestSupport.swift diff --git a/Package.swift b/Package.swift index 4e9f1c24d..18764fcaf 100644 --- a/Package.swift +++ b/Package.swift @@ -75,15 +75,17 @@ let package = Package( name: "RegexBuilder", dependencies: ["_StringProcessing", "_RegexParser"], swiftSettings: publicStdlibSettings), + .target(name: "TestSupport", + swiftSettings: [availabilityDefinition]), .testTarget( name: "RegexTests", - dependencies: ["_StringProcessing"], + dependencies: ["_StringProcessing", "TestSupport"], swiftSettings: [ .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), ]), .testTarget( name: "RegexBuilderTests", - dependencies: ["_StringProcessing", "RegexBuilder"], + dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"], swiftSettings: [ .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) ]), diff --git a/Sources/TestSupport/TestSupport.swift b/Sources/TestSupport/TestSupport.swift new file mode 100644 index 000000000..b60adb63f --- /dev/null +++ b/Sources/TestSupport/TestSupport.swift @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import XCTest + +// We need to split this out of the test files, as it needs to be compiled +// *without* `-disable-availability-checking` to ensure the #available check is +// not compiled into a no-op. + +#if os(Linux) +public func XCTExpectFailure( + _ message: String? = nil, body: () throws -> Void +) rethrows {} +#endif + +/// Guards certain tests to make sure we have a new stdlib available. +public func ensureNewStdlib( + file: StaticString = #file, line: UInt = #line +) -> Bool { + guard #available(SwiftStdlib 5.7, *) else { + XCTExpectFailure { XCTFail("Unsupported stdlib", file: file, line: line) } + return false + } + return true +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 05375a1f7..d95d4ce2c 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -12,10 +12,7 @@ import XCTest import _StringProcessing import RegexBuilder - -#if os(Linux) -func XCTExpectFailure(_ message: String? = nil, body: () throws -> Void) rethrows {} -#endif +import TestSupport class RegexDSLTests: XCTestCase { func _testDSLCaptures( diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c087974a7..62c1e1efe 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -12,6 +12,7 @@ import XCTest @testable import _RegexParser @testable import _StringProcessing +import TestSupport struct MatchError: Error { var message: String @@ -1037,6 +1038,9 @@ extension RegexTests { } func testMatchAnchors() throws { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // MARK: Anchors firstMatchTests( #"^\d+"#, @@ -1085,8 +1089,6 @@ extension RegexTests { (" 123\n456\n", nil), ("123 456", "456")) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) firstMatchTests( #"\d+\b"#, ("123", "123"), @@ -1104,7 +1106,6 @@ extension RegexTests { ("123", "23"), (" 123", "23"), ("123 456", "23")) -#endif // TODO: \G and \K do { @@ -1135,9 +1136,10 @@ extension RegexTests { ("Sol Cafe", nil), xfail: true) } - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) func testLevel2WordBoundaries() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Level 2 Word Boundaries firstMatchTest(#"\b😊\b"#, input: "πŸ”₯πŸ˜ŠπŸ‘", match: "😊") firstMatchTest(#"\bπŸ‘¨πŸ½\b"#, input: "πŸ‘©πŸ»πŸ‘ΆπŸΏπŸ‘¨πŸ½πŸ§‘πŸΎπŸ‘©πŸΌ", match: "πŸ‘¨πŸ½") @@ -1153,8 +1155,7 @@ extension RegexTests { firstMatchTest(#"can\B\'\Bt"#, input: "I can't do that.", match: "can't") firstMatchTest(#"\bΓ·\b"#, input: "3 Γ· 3 = 1", match: "Γ·") } -#endif - + func testMatchGroups() { // MARK: Groups @@ -1379,6 +1380,9 @@ extension RegexTests { } func testMatchExamples() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // Backreferences matchTest( #"(sens|respons)e and \1ibility"#, @@ -1428,8 +1432,6 @@ extension RegexTests { xfail: true ) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) // HTML tags matchTest( #"<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>.*?"#, @@ -1447,7 +1449,6 @@ extension RegexTests { ("pass me the the kettle", ["the"]), ("this doesn't have any", nil) ) -#endif // Floats flatCaptureTest( @@ -1527,6 +1528,9 @@ extension RegexTests { } func testASCIIClasses() { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // 'D' ASCII-only digits matchTest( #"\d+"#, @@ -1555,8 +1559,6 @@ extension RegexTests { ("aeiou", true), ("Γ₯e\u{301}ïôú", false)) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) matchTest( #"abcd\b.+"#, ("abcd ef", true), @@ -1572,7 +1574,6 @@ extension RegexTests { ("abcd ef", true), ("abcdef", false), ("abcdΓ©f", false)) -#endif // 'S' ASCII-only spaces matchTest( From 8d6445075241f70984f014e67d4e44f552c9b92d Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 19 Jul 2022 11:41:56 +0100 Subject: [PATCH 2/3] Add some extra character class newline matching tests --- Tests/RegexTests/MatchTests.swift | 70 ++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 62c1e1efe..f464fbf56 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -320,8 +320,6 @@ extension RegexTests { input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t") - firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n") - // MARK: Quotes firstMatchTest( @@ -1464,6 +1462,74 @@ extension RegexTests { firstMatchTest(#".+"#, input: "a\nb", match: "a") firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") } + + func testMatchNewlines() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + + for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + firstMatchTest( + #"\r\n"#, input: "\r\n", match: "\r\n", + semanticLevel: semantics + ) + firstMatchTest( + #"\r\n"#, input: "\n", match: nil, semanticLevel: semantics) + firstMatchTest( + #"\r\n"#, input: "\r", match: nil, semanticLevel: semantics) + + // \r\n is not treated as ASCII. 
+ firstMatchTest( + #"^\p{ASCII}$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^\r$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\r]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^\n$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\n]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\u{0}-\u{7F}]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + + let scalarSemantics = semantics == .unicodeScalar + firstMatchTest( + #"\p{ASCII}"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"\r"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\r]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"\n"#, input: "\r\n", match: scalarSemantics ? "\n" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\n]"#, input: "\r\n", match: scalarSemantics ? "\n" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\u{0}-\u{7F}]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + } + } func testCaseSensitivity() { matchTest( From cd5cc37819460800e4c7cb5030eccbb62398accf Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 19 Jul 2022 11:41:57 +0100 Subject: [PATCH 3/3] Fix character class range matching Previously we performed a lexicographic comparison with the bounds of a character class range. However this produced surprising results, and our implementation didn't properly handle case sensitivity. Update the logic to instead only allow single scalar NFC bounds. The input is then converted to NFC in grapheme semantic mode, and checked against the range. In scalar semantic mode, the input scalar is checked on its own. 
Additionally, fix the case sensitivity handling such that we check both the lowercase and uppercase version of the input against the range. --- Sources/_RegexParser/Regex/AST/Atom.swift | 6 +- Sources/_RegexParser/Utility/Misc.swift | 15 ++ .../_StringProcessing/ConsumerInterface.swift | 70 +++++--- .../Unicode/CharacterProps.swift | 7 - Sources/_StringProcessing/Unicode/NFC.swift | 55 ++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 9 + Tests/RegexTests/MatchTests.swift | 164 +++++++++++++----- Tests/RegexTests/ParseTests.swift | 41 ++++- Tests/RegexTests/UTS18Tests.swift | 7 + 9 files changed, 299 insertions(+), 75 deletions(-) create mode 100644 Sources/_StringProcessing/Unicode/NFC.swift diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index b03ce8c39..8706327f7 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -755,8 +755,10 @@ extension AST.Atom { /// Whether this atom is valid as the operand of a custom character class /// range. public var isValidCharacterClassRangeBound: Bool { - // If we have a literal character value for this, it can be used as a bound. - if literalCharacterValue != nil { return true } + if let c = literalCharacterValue { + // We only match character range bounds that are single scalar NFC. + return c.hasExactlyOneScalar && c.isNFC + } switch kind { // \cx, \C-x, \M-x, \M-\C-x, \N{...} case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: diff --git a/Sources/_RegexParser/Utility/Misc.swift b/Sources/_RegexParser/Utility/Misc.swift index d37dfbd4a..70dc7a7d5 100644 --- a/Sources/_RegexParser/Utility/Misc.swift +++ b/Sources/_RegexParser/Utility/Misc.swift @@ -19,6 +19,21 @@ extension Substring { var string: String { String(self) } } +extension Character { + /// Whether this character is made up of exactly one Unicode scalar value. 
+ public var hasExactlyOneScalar: Bool { + let scalars = unicodeScalars + return scalars.index(after: scalars.startIndex) == scalars.endIndex + } + + /// Whether the given character is in NFC form. + internal var isNFC: Bool { + if isASCII { return true } + let str = String(self) + return str._nfcCodeUnits.elementsEqual(str.utf8) + } +} + extension CustomStringConvertible { @_alwaysEmitIntoClient public var halfWidthCornerQuoted: String { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 668d16eb6..c96775500 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -361,38 +361,60 @@ extension DSLTree.CustomCharacterClass.Member { } return c case let .range(low, high): - // TODO: - guard let lhs = low.literalCharacterValue else { + guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else { throw Unsupported("\(low) in range") } - guard let rhs = high.literalCharacterValue else { + guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else { throw Unsupported("\(high) in range") } + guard lhs <= rhs else { + throw Unsupported("Invalid range \(low)-\(high)") + } - if opts.isCaseInsensitive { - let lhsLower = lhs.lowercased() - let rhsLower = rhs.lowercased() - guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { - // TODO: semantic level - return input.index(after: curIdx) - } - return nil + let isCaseInsensitive = opts.isCaseInsensitive + let isCharacterSemantic = opts.semanticLevel == .graphemeCluster + + return { input, bounds in + let curIdx = bounds.lowerBound + let nextIndex = isCharacterSemantic + ? 
input.index(after: curIdx) + : input.unicodeScalars.index(after: curIdx) + + // Under grapheme semantics, we compare based on single NFC scalars. If + // such a character is not single scalar under NFC, the match fails. In + // scalar semantics, we compare the exact scalar value to the NFC + // bounds. + let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar + : input.unicodeScalars[curIdx] + guard let scalar = scalar else { return nil } + let scalarRange = lhs ... rhs + if scalarRange.contains(scalar) { + return nextIndex } - } else { - guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhs...rhs).contains(input[curIdx]) { - // TODO: semantic level - return input.index(after: curIdx) + + // Check for case insensitive matches. + func matchesCased( + _ cased: (UnicodeScalar.Properties) -> String + ) -> Bool { + let casedStr = cased(scalar.properties) + // In character semantic mode, we need to map to NFC. In scalar + // semantics, we should have an exact scalar. + let mapped = isCharacterSemantic ? casedStr.singleNFCScalar + : casedStr.singleScalar + guard let mapped = mapped else { return false } + return scalarRange.contains(mapped) + } + if isCaseInsensitive { + if scalar.properties.changesWhenLowercased, + matchesCased(\.lowercaseMapping) { + return nextIndex + } + if scalar.properties.changesWhenUppercased, + matchesCased(\.uppercaseMapping) { + return nextIndex } - return nil } + return nil } case let .custom(ccc): diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift index 80f6819a6..e0be4e386 100644 --- a/Sources/_StringProcessing/Unicode/CharacterProps.swift +++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift @@ -11,10 +11,3 @@ // TODO - -extension Character { - /// Whether this character is made up of exactly one Unicode scalar value. 
- var hasExactlyOneScalar: Bool { - unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex - } -} diff --git a/Sources/_StringProcessing/Unicode/NFC.swift b/Sources/_StringProcessing/Unicode/NFC.swift new file mode 100644 index 000000000..5c2c4aa48 --- /dev/null +++ b/Sources/_StringProcessing/Unicode/NFC.swift @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +@_spi(_Unicode) +import Swift + +extension UnicodeScalar { + /// Checks whether the scalar is in NFC form. + var isNFC: Bool { Character(self).singleNFCScalar == self } +} + +extension Character { + /// If the given character consists of a single NFC scalar, returns it. If + /// there are multiple NFC scalars, returns `nil`. + var singleNFCScalar: UnicodeScalar? { + // SwiftStdlib is always >= 5.7 for a shipped StringProcessing. + guard #available(SwiftStdlib 5.7, *) else { return nil } + var nfcIter = String(self)._nfc.makeIterator() + guard let scalar = nfcIter.next(), nfcIter.next() == nil else { return nil } + return scalar + } + + /// If the given character contains a single scalar, returns it. If none or + /// multiple scalars are present, returns `nil`. + var singleScalar: UnicodeScalar? { + hasExactlyOneScalar ? unicodeScalars.first! : nil + } +} + +extension String { + /// If the given string consists of a single NFC scalar, returns it. If none + /// or multiple NFC scalars are present, returns `nil`. + var singleNFCScalar: UnicodeScalar? 
{ + guard !isEmpty && index(after: startIndex) == endIndex else { return nil } + return first!.singleNFCScalar + } + + /// If the given string contains a single scalar, returns it. If none or + /// multiple scalars are present, returns `nil`. + var singleScalar: UnicodeScalar? { + let scalars = unicodeScalars + guard !scalars.isEmpty && + scalars.index(after: scalars.startIndex) == scalars.endIndex + else { return nil } + return scalars.first! + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index d95d4ce2c..1d186e0bc 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -74,6 +74,9 @@ class RegexDSLTests: XCTestCase { let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" func testCharacterClasses() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + try _testDSLCaptures( ("a c", ("a c", " ", "c")), matchType: (Substring, Substring, Substring).self, ==) @@ -248,6 +251,9 @@ class RegexDSLTests: XCTestCase { } func testCharacterClassOperations() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + try _testDSLCaptures( ("bcdefn1a", "bcdefn1a"), ("nbcdef1a", nil), // fails symmetric difference lookahead @@ -591,6 +597,9 @@ class RegexDSLTests: XCTestCase { } func testQuantificationBehavior() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + // Eager by default try _testDSLCaptures( ("abc1def2", ("abc1def2", "2")), diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f464fbf56..f24ae89d9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -570,6 +570,9 @@ extension RegexTests { } func testMatchCharacterClasses() { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // MARK: Character classes firstMatchTest(#"abc\d"#, input: "xyzabc123", match: "abc1") @@ -766,10 +769,14 @@ extension RegexTests { } firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}") - // FIXME: This produces a different result with and without optimizations. - firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil, xfail: true) - firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil, - validateOptimizations: false) + firstMatchTest(#"[12]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣") + firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil) + firstMatchTest("[0-2&&1-3]", input: "1️⃣", match: nil) + firstMatchTest("[1-2e\u{301}]", input: "1️⃣", match: nil) + + firstMatchTest(#"[\u{3A9}-\u{3A9}]"#, input: "\u{3A9}", match: "\u{3A9}") // Currently not supported in the matching engine. for c: UnicodeScalar in ["a", "b", "c"] { @@ -824,6 +831,35 @@ extension RegexTests { firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc", syntax: .experimental) firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#) + + for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + // Case sensitivity and ranges. 
+ for ch in "abcD" { + firstMatchTest("[a-cD]", input: String(ch), match: String(ch)) + } + for ch in "ABCd" { + firstMatchTest("[a-cD]", input: String(ch), match: nil) + } + for ch in "abcABCdD" { + let input = String(ch) + firstMatchTest( + "(?i)[a-cd]", input: input, match: input, semanticLevel: semantics) + firstMatchTest( + "(?i)[A-CD]", input: input, match: input, semanticLevel: semantics) + } + for ch in "XYZ[\\]^_`abcd" { + let input = String(ch) + firstMatchTest( + "[X-cd]", input: input, match: input, semanticLevel: semantics) + } + for ch in "XYZ[\\]^_`abcxyzABCdD" { + let input = String(ch) + firstMatchTest( + "(?i)[X-cd]", input: input, match: input, semanticLevel: semantics) + firstMatchTest( + "(?i)[X-cD]", input: input, match: input, semanticLevel: semantics) + } + } } func testCharacterProperties() { @@ -1155,6 +1191,9 @@ extension RegexTests { } func testMatchGroups() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Groups // Named captures @@ -1532,6 +1571,9 @@ extension RegexTests { } func testCaseSensitivity() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + matchTest( #"c..e"#, ("cafe", true), @@ -1765,6 +1807,9 @@ extension RegexTests { var eComposed: String { "Γ©" } var eDecomposed: String { "e\u{301}" } + var eComposedUpper: String { "Γ‰" } + var eDecomposedUpper: String { "E\u{301}" } + func testIndividualScalars() { // Expectation: A standalone Unicode scalar value in a regex literal // can match either that specific scalar value or participate in matching @@ -1851,31 +1896,62 @@ extension RegexTests { } func testCanonicalEquivalenceCustomCharacterClass() throws { - // Expectation: Concatenations with custom character classes should be able - // to match within a grapheme cluster. 
That is, a regex should be able to - // match the scalar values that comprise a grapheme cluster in separate, - // or repeated, custom character classes. - + // Expectation: Custom character class matches do not cross grapheme + // character boundaries by default. When matching with Unicode scalar + // semantics, grapheme cluster boundaries are ignored, so matching + // sequences of custom character classes can succeed. + + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + matchTest( #"[Ñéíóú]$"#, (eComposed, true), (eDecomposed, true)) - // FIXME: Custom char classes don't use canonical equivalence with composed characters - firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) + for input in [eDecomposed, eComposed] { + // Unicode scalar semantics means that only the decomposed version can + // match here. + let match = input.unicodeScalars.count == 2 ? input : nil + firstMatchTest( + #"e[\u{301}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"e[\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[e][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[e-e][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[a-z][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + } + for input in [eComposed, eDecomposed] { + // Grapheme cluster semantics means that we can't match the 'e' separately + // from the accent. 
+ firstMatchTest(#"e[\u{301}]$"#, input: input, match: nil) + firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[e-e][\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: input, match: nil) + + // A range that covers é (U+E9). Inputs are mapped to NFC, so match. + firstMatchTest(#"[\u{E8}-\u{EA}]"#, input: input, match: input) + } - // FIXME: Custom char classes don't match decomposed characters - firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) + // A range that covers É (U+C9). Inputs are mapped to NFC, so match. + for input in [eComposedUpper, eDecomposedUpper] { + firstMatchTest(#"[\u{C8}-\u{CA}]"#, input: input, match: input) + firstMatchTest(#"[\u{C9}-\u{C9}]"#, input: input, match: input) + } + // Case insensitive matching of É (U+C9). 
+ for input in [eComposed, eDecomposed, eComposedUpper, eDecomposedUpper] { + firstMatchTest(#"(?i)[\u{C8}-\u{CA}]"#, input: input, match: input) + firstMatchTest(#"(?i)[\u{C9}-\u{C9}]"#, input: input, match: input) + } let flag = "πŸ‡°πŸ‡·" firstMatchTest(#"πŸ‡°πŸ‡·"#, input: flag, match: flag) @@ -1884,27 +1960,33 @@ extension RegexTests { firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag) // First Unicode scalar followed by CCC of regional indicators - firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, - xfail: true) - - // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character + firstMatchTest( + #"^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) + // A CCC of regional indicators followed by the second Unicode scalar + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) // A CCC of regional indicators x 2 - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag, - xfail: true) + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) + // A CCC of N regional indicators + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]+$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) - // FIXME: A single CCC of regional indicators matches the whole flag character - // A CCC of regional indicators followed by the second Unicode scalar - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag, - xfail: true) // A single CCC of regional indicators - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil, - xfail: true) - - // A single CCC of actual flag emojis / combined regional indicators - firstMatchTest(#"[πŸ‡¦πŸ‡«-πŸ‡ΏπŸ‡Ό]"#, input: flag, match: flag) - // This succeeds (correctly) because \u{1F1F0} is lexicographically - // within the CCC range - firstMatchTest(#"[πŸ‡¦πŸ‡«-πŸ‡ΏπŸ‡Ό]"#, 
input: "\u{1F1F0}abc", match: "\u{1F1F0}") + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil) + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil, + semanticLevel: .unicodeScalar + ) } func testAnyChar() throws { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 52a272915..f5e93c2bd 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -374,10 +374,21 @@ extension RegexTests { // MARK: Allowed combining characters - parseTest("e\u{301}", "e\u{301}") parseTest("1\u{358}", "1\u{358}") parseTest(#"\ \#u{361}"#, " \u{361}") + parseTest("e\u{301}", "e\u{301}") + parseTest("[e\u{301}]", charClass("e\u{301}")) + parseTest("\u{E9}", "e\u{301}") + parseTest("[\u{E9}]", charClass("e\u{301}")) + + parseTest( + "\\e\u{301}", "e\u{301}", throwsError: .invalidEscape("e\u{301}")) + parseTest( + "[\\e\u{301}]", charClass("e\u{301}"), + throwsError: .invalidEscape("e\u{301}") + ) + // MARK: Alternations parseTest( @@ -2885,6 +2896,34 @@ extension RegexTests { diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence")) diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence")) + diagnosticTest(#"|([πŸ‡¦πŸ‡«-πŸ‡ΏπŸ‡Ό])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"|([πŸ‘¨β€πŸ‘©β€πŸ‘¦-πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§])?"#, .invalidCharacterClassRangeOperand) + + // Not single-scalar NFC. + diagnosticTest("[e\u{301}-e\u{302}]", .invalidCharacterClassRangeOperand) + + // These scalar values expand under NFC. 
+ let nfcExpandingScalars: [UInt32] = [ + 0x344, 0x958, 0x959, 0x95A, 0x95B, 0x95C, 0x95D, 0x95E, 0x95F, 0x9DC, + 0x9DD, 0x9DF, 0xA33, 0xA36, 0xA59, 0xA5A, 0xA5B, 0xA5E, 0xB5C, 0xB5D, + 0xF43, 0xF4D, 0xF52, 0xF57, 0xF5C, 0xF69, 0xF73, 0xF75, 0xF76, 0xF78, + 0xF81, 0xF93, 0xF9D, 0xFA2, 0xFA7, 0xFAC, 0xFB9, 0x2ADC, 0xFB1D, 0xFB1F, + 0xFB2A, 0xFB2B, 0xFB2C, 0xFB2D, 0xFB2E, 0xFB2F, 0xFB30, 0xFB31, 0xFB32, + 0xFB33, 0xFB34, 0xFB35, 0xFB36, 0xFB38, 0xFB39, 0xFB3A, 0xFB3B, 0xFB3C, + 0xFB3E, 0xFB40, 0xFB41, 0xFB43, 0xFB44, 0xFB46, 0xFB47, 0xFB48, 0xFB49, + 0xFB4A, 0xFB4B, 0xFB4C, 0xFB4D, 0xFB4E, 0x1D15E, 0x1D15F, 0x1D160, + 0x1D161, 0x1D162, 0x1D163, 0x1D164, 0x1D1BB, 0x1D1BC, 0x1D1BD, 0x1D1BE, + 0x1D1BF, 0x1D1C0 + ] + for scalar in nfcExpandingScalars { + let hex = String(scalar, radix: 16) + diagnosticTest( + #"[\u{\#(hex)}-\u{\#(hex)}]"#, .invalidCharacterClassRangeOperand) + } + + // The NFC form of U+2126 is U+3A9. + diagnosticTest(#"[\u{2126}-\u{2126}]"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index aa3639ea6..11479bfb6 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -21,6 +21,7 @@ import XCTest @testable // for internal `matches(of:)` import _StringProcessing +import TestSupport extension UnicodeScalar { var value4Digits: String { @@ -316,6 +317,9 @@ extension UTS18Tests { // surrogate followed by a trailing surrogate shall be handled as a single // code point in matching. func testSupplementaryCodePoints() { + // Must have new stdlib for character class ranges. 
+ guard ensureNewStdlib() else { return } + XCTAssertTrue("πŸ‘".contains(regex(#"\u{1F44D}"#))) XCTAssertTrue("πŸ‘".contains(regex(#"[\u{1F440}-\u{1F44F}]"#))) XCTAssertTrue("πŸ‘πŸ‘Ž".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#))) @@ -388,6 +392,9 @@ extension UTS18Tests { } func testCharacterClassesWithStrings() { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + let regex = regex(#"[a-zπŸ§πŸ‡§πŸ‡ͺπŸ‡§πŸ‡«πŸ‡§πŸ‡¬]"#) XCTAssertEqual("🧐", "🧐".wholeMatch(of: regex)?.0) XCTAssertEqual("πŸ‡§πŸ‡«", "πŸ‡§πŸ‡«".wholeMatch(of: regex)?.0)