From 60456b1c33d5e06748d47f440b88495b4cbfb53a Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 6 May 2022 11:34:35 -0500 Subject: [PATCH 1/8] Implement \R, \v, \h for character/scalar modes --- .../Unicode/ScalarProps.swift | 16 +++++++++++ .../_CharacterClassModel.swift | 28 +++++++++++++------ Tests/RegexTests/UTS18Tests.swift | 14 +++++++++- 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/Sources/_StringProcessing/Unicode/ScalarProps.swift b/Sources/_StringProcessing/Unicode/ScalarProps.swift index 52a870357..edab08c21 100644 --- a/Sources/_StringProcessing/Unicode/ScalarProps.swift +++ b/Sources/_StringProcessing/Unicode/ScalarProps.swift @@ -46,3 +46,19 @@ extension Unicode.Script { return result } } + +extension UnicodeScalar { + var isHorizontalWhitespace: Bool { + value == 0x09 || properties.generalCategory == .spaceSeparator + } + + var isVerticalWhitespace: Bool { + switch value { + case 0x000A...0x000D /* LF ... CR */: return true + case 0x0085 /* NEXT LINE (NEL) */: return true + case 0x2028 /* LINE SEPARATOR */: return true + case 0x2029 /* PARAGRAPH SEPARATOR */: return true + default: return false + } + } +} diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index fc3fd5741..d61e13822 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -178,15 +178,18 @@ public struct _CharacterClassModel: Hashable { matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: fatalError("Not implemented") - case .newlineSequence: - matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) - case .verticalWhitespace: fatalError("Not implemented") + case .horizontalWhitespace: + matched = c.unicodeScalars.first?.isHorizontalWhitespace == true + && (c.isASCII || !options.usesASCIISpaces) + case .newlineSequence, .verticalWhitespace: + matched = c.unicodeScalars.first?.isVerticalWhitespace == true + && (c.isASCII || !options.usesASCIISpaces) case .whitespace: matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): matched = set.any { $0.matches(c, with: options) } + case .custom(let set): + matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -206,14 +209,21 @@ public struct _CharacterClassModel: Hashable { matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: fatalError("Not implemented") - case .newlineSequence: fatalError("Not implemented") - case .verticalWhitespace: fatalError("Not implemented") + case .horizontalWhitespace: + matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces) + case .verticalWhitespace: + matched = c.isVerticalWhitespace && (c.isASCII || !options.usesASCIISpaces) + case .newlineSequence: + matched = c.isVerticalWhitespace && (c.isASCII || !options.usesASCIISpaces) + if c == "\r" && nextIndex != str.endIndex && str.unicodeScalars[nextIndex] == "\n" { + str.unicodeScalars.formIndex(after: &nextIndex) + } case .whitespace: matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) - case .custom: fatalError("Not supported") + case .custom(let set): + matched = set.any { $0.matches(Character(c), with: options) } } if isInverted { matched.toggle() diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index eff9f9b4e..262c798d5 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -268,7 +268,10 @@ extension UTS18Tests { var lines = lineInput.matches(of: regex(#"\d{2}"#)) XCTAssertEqual(lines.count, 11) // Test \R - newline sequence - lines = lineInput.matches(of: regex(#"\d{2}\R"#)) + lines = lineInput.matches(of: regex(#"\d{2}\R^"#).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Test \v - vertical space + lines = lineInput.matches(of: regex(#"\d{2}\v^"#).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 11) // Test anchors as line boundaries lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings()) @@ -277,6 +280,15 @@ extension UTS18Tests { lines = lineInput.matches(of: regex(#".+"#)) XCTAssertEqual(lines.count, 11) + // Unicode scalar semantics - \R still matches all, including \r\n sequence + lines = lineInput.matches( + of: regex(#"\d{2}\R^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Unicode scalar semantics - \v matches all except for \r\n sequence + lines = lineInput.matches( + of: regex(#"\d{2}\v^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 10) + // Does not contain an empty line XCTAssertFalse(lineInput.contains(regex(#"^$"#))) // Does contain an empty line (between \n and \r, which are reversed here) From 42414d454b6aff3d9b0c5de579d85bad57c83179 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 6 May 2022 16:13:45 -0500 Subject: [PATCH 2/8] Extra \R vs \v testing --- Tests/RegexTests/UTS18Tests.swift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 262c798d5..7872e1b84 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -284,10 +284,14 @@ extension UTS18Tests { lines = lineInput.matches( of: regex(#"\d{2}\R^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 11) + XCTAssertNotNil(lineInput.firstMatch( + of: regex(#"08\R^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())) // Unicode scalar semantics - \v matches all except for \r\n sequence lines = lineInput.matches( of: regex(#"\d{2}\v^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 10) + XCTAssertNil(lineInput.firstMatch( + of: regex(#"08\v^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())) // Does not contain an empty line XCTAssertFalse(lineInput.contains(regex(#"^$"#))) From e71ac8421e984ee525814dc83f6dcfe34376e7e8 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 9 May 2022 10:58:54 -0500 Subject: [PATCH 3/8] print("here") --- Tests/RegexTests/UTS18Tests.swift | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 7872e1b84..027965ded 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -22,6 +22,14 @@ import XCTest @testable // for internal `matches(of:)` import _StringProcessing +extension UnicodeScalar { + var value4Digits: String { + let valueString = String(value, radix: 16, uppercase: true) + if valueString.count >= 4 { return valueString } + return String(repeating: "0", count: 4 - valueString.count) + valueString + } +} + class UTS18Tests: XCTestCase { var input: String { "ABCdefghîøu\u{308}\u{FFF0} -–—[]123" @@ -293,6 +301,16 @@ extension UTS18Tests { XCTAssertNil(lineInput.firstMatch( of: regex(#"08\v^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())) + XCTAssertNotNil(lineInput.firstMatch(of: regex(#"08\u{d}\u{a}"#).matchingSemantics(.unicodeScalar))) + XCTAssertNotNil(lineInput.firstMatch( + of: regex(#"08..09"#).matchingSemantics(.unicodeScalar).dotMatchesNewlines())) + + for _ in 0..<10 { print("---") } + for (i, s) in lineInput.unicodeScalars.enumerated() { + print("\(i): scalar U+\(s.value4Digits)") + } + for _ in 0..<10 { print("---") } + // Does not contain an empty line XCTAssertFalse(lineInput.contains(regex(#"^$"#))) // Does contain an empty line (between \n and \r, which are reversed here) From 6034a90a76edd7dd8bfb7e1200f79b3a50887e94 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 9 May 2022 13:56:21 -0500 Subject: [PATCH 4/8] print("here, really") --- Tests/RegexTests/UTS18Tests.swift | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 027965ded..7ed7a6819 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -305,11 +305,15 @@ extension UTS18Tests { XCTAssertNotNil(lineInput.firstMatch( of: regex(#"08..09"#).matchingSemantics(.unicodeScalar).dotMatchesNewlines())) - for _ in 0..<10 { print("---") } + for _ in 0..<5 { print("---") } for (i, s) in lineInput.unicodeScalars.enumerated() { print("\(i): scalar U+\(s.value4Digits)") } - for _ in 0..<10 { print("---") } + for _ in 0..<5 { print("---") } + for match in lineInput.matches(of: regex(#"\v"#).matchingSemantics(.unicodeScalar)) { + print(lineInput.unicodeScalars.offsets(of: match.0.startIndex.. Date: Mon, 9 May 2022 14:06:52 -0500 Subject: [PATCH 5/8] print("1") --- Tests/RegexTests/UTS18Tests.swift | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 7ed7a6819..a9154c240 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -270,11 +270,11 @@ extension UTS18Tests { 09\u{85}\ 10\u{2028}\ 11\u{2029}\ - + 12 """ // Check the input counts var lines = lineInput.matches(of: regex(#"\d{2}"#)) - XCTAssertEqual(lines.count, 11) + XCTAssertEqual(lines.count, 12) // Test \R - newline sequence lines = lineInput.matches(of: regex(#"\d{2}\R^"#).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 11) @@ -283,20 +283,35 @@ extension UTS18Tests { XCTAssertEqual(lines.count, 11) // Test anchors as line boundaries lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings()) - XCTAssertEqual(lines.count, 11) + XCTAssertEqual(lines.count, 12) // Test that dot does not match line endings lines = lineInput.matches(of: regex(#".+"#)) - XCTAssertEqual(lines.count, 11) + XCTAssertEqual(lines.count, 12) // Unicode scalar semantics - \R still matches all, including \r\n sequence lines = lineInput.matches( of: regex(#"\d{2}\R^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 11) + lines = lineInput.matches( + of: regex(#"\d{2}\R(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) XCTAssertNotNil(lineInput.firstMatch( of: regex(#"08\R^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())) // Unicode scalar semantics - \v matches all except for \r\n sequence + print("\n\n\n-------", #line, "\n\n\n") lines = lineInput.matches( of: regex(#"\d{2}\v^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + + print(lines.map { + "\n\n```\($0.0)```\n\n" + + + "```\($0.0.unicodeScalars.map { $0.value4Digits })```" + }.joined()) + + XCTAssertEqual(lines.count, 10) + print("\n\n\n-------", #line, "\n\n\n") + lines = lineInput.matches( + of: regex(#"\d{2}\v(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 10) XCTAssertNil(lineInput.firstMatch( of: regex(#"08\v^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())) From ab30b49ca14f40119e42d49de90930a5559ad378 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 9 May 2022 17:46:01 -0500 Subject: [PATCH 6/8] Calm the line break tests back down --- Tests/RegexTests/UTS18Tests.swift | 33 ------------------------------- 1 file changed, 33 deletions(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index a9154c240..e53e4ed40 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -289,46 +289,13 @@ extension UTS18Tests { XCTAssertEqual(lines.count, 12) // Unicode scalar semantics - \R still matches all, including \r\n sequence - lines = lineInput.matches( - of: regex(#"\d{2}\R^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) - XCTAssertEqual(lines.count, 11) lines = lineInput.matches( of: regex(#"\d{2}\R(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 11) - XCTAssertNotNil(lineInput.firstMatch( - of: regex(#"08\R^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())) // Unicode scalar semantics - \v matches all except for \r\n sequence - print("\n\n\n-------", #line, "\n\n\n") - lines = lineInput.matches( - of: regex(#"\d{2}\v^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) - - print(lines.map { - "\n\n```\($0.0)```\n\n" - + - "```\($0.0.unicodeScalars.map { $0.value4Digits })```" - }.joined()) - - XCTAssertEqual(lines.count, 10) - print("\n\n\n-------", #line, "\n\n\n") lines = lineInput.matches( of: regex(#"\d{2}\v(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 10) - XCTAssertNil(lineInput.firstMatch( - of: regex(#"08\v^"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())) - - XCTAssertNotNil(lineInput.firstMatch(of: regex(#"08\u{d}\u{a}"#).matchingSemantics(.unicodeScalar))) - XCTAssertNotNil(lineInput.firstMatch( - of: regex(#"08..09"#).matchingSemantics(.unicodeScalar).dotMatchesNewlines())) - - for _ in 0..<5 { print("---") } - for (i, s) in lineInput.unicodeScalars.enumerated() { - print("\(i): scalar U+\(s.value4Digits)") - } - for _ in 0..<5 { print("---") } - for match in lineInput.matches(of: regex(#"\v"#).matchingSemantics(.unicodeScalar)) { - print(lineInput.unicodeScalars.offsets(of: match.0.startIndex.. Date: Mon, 9 May 2022 17:46:23 -0500 Subject: [PATCH 7/8] Audit assertions for semantic level --- Sources/_StringProcessing/ByteCodeGen.swift | 32 +++++++++++++++---- .../Unicode/ScalarProps.swift | 2 +- .../_CharacterClassModel.swift | 6 ++-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 2131d1eb5..d30cab209 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -80,10 +80,16 @@ extension Compiler.ByteCodeGen { } case .endOfSubjectBeforeNewline: - builder.buildAssert { (input, pos, bounds) in + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in if pos == input.endIndex { return true } - return input.index(after: pos) == input.endIndex - && input[pos].isNewline + switch semanticLevel { + case .graphemeCluster: + return input.index(after: pos) == input.endIndex + && input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars.index(after: pos) == input.endIndex + && input.unicodeScalars[pos].isNewline + } } case .endOfSubject: @@ -115,8 +121,14 @@ extension Compiler.ByteCodeGen { case .startOfLine: if options.anchorsMatchNewlines { - builder.buildAssert { (input, pos, bounds) in - pos == input.startIndex || input[input.index(before: pos)].isNewline + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in + if pos == input.startIndex { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: pos)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline + } } } else { builder.buildAssert { (input, pos, bounds) in @@ -126,8 +138,14 @@ extension Compiler.ByteCodeGen { case .endOfLine: if options.anchorsMatchNewlines { - builder.buildAssert { (input, pos, bounds) in - pos == input.endIndex || input[pos].isNewline + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in + if pos == input.endIndex { return true } + switch semanticLevel { + case .graphemeCluster: + return input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars[pos].isNewline + } } } else { builder.buildAssert { (input, pos, bounds) in diff --git a/Sources/_StringProcessing/Unicode/ScalarProps.swift b/Sources/_StringProcessing/Unicode/ScalarProps.swift index edab08c21..0894fa572 100644 --- a/Sources/_StringProcessing/Unicode/ScalarProps.swift +++ b/Sources/_StringProcessing/Unicode/ScalarProps.swift @@ -52,7 +52,7 @@ extension UnicodeScalar { value == 0x09 || properties.generalCategory == .spaceSeparator } - var isVerticalWhitespace: Bool { + var isNewline: Bool { switch value { case 0x000A...0x000D /* LF ... CR */: return true case 0x0085 /* NEXT LINE (NEL) */: return true diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index d61e13822..21e471192 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -182,7 +182,7 @@ public struct _CharacterClassModel: Hashable { matched = c.unicodeScalars.first?.isHorizontalWhitespace == true && (c.isASCII || !options.usesASCIISpaces) case .newlineSequence, .verticalWhitespace: - matched = c.unicodeScalars.first?.isVerticalWhitespace == true + matched = c.unicodeScalars.first?.isNewline == true && (c.isASCII || !options.usesASCIISpaces) case .whitespace: matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) @@ -212,9 +212,9 @@ public struct _CharacterClassModel: Hashable { case .horizontalWhitespace: matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces) case .verticalWhitespace: - matched = c.isVerticalWhitespace && (c.isASCII || !options.usesASCIISpaces) + matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) case .newlineSequence: - matched = c.isVerticalWhitespace && (c.isASCII || !options.usesASCIISpaces) + matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) if c == "\r" && nextIndex != str.endIndex && str.unicodeScalars[nextIndex] == "\n" { str.unicodeScalars.formIndex(after: &nextIndex) } From cfc7aed70b999722496e4ab8847c1909e963cf9b Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 9 May 2022 17:56:44 -0500 Subject: [PATCH 8/8] Allow \v and \V in validation --- Sources/_RegexParser/Regex/Parse/Sema.swift | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index f9f2b996a..263902a8e 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -182,7 +182,7 @@ extension RegexValidator { _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation ) throws { switch esc { - case .resetStartOfMatch, .singleDataUnit, .verticalTab, .notVerticalTab, + case .resetStartOfMatch, .singleDataUnit, // '\N' needs to be emitted using 'emitAny'. .notNewline: throw error(.unsupported("'\\\(esc.character)'"), at: loc) @@ -190,7 +190,8 @@ extension RegexValidator { // Character classes. case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar, - .horizontalWhitespace, .notHorizontalWhitespace: + .horizontalWhitespace, .notHorizontalWhitespace, + .verticalTab, .notVerticalTab: break case .newlineSequence: