diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index f9f2b996a..263902a8e 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -182,7 +182,7 @@ extension RegexValidator { _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation ) throws { switch esc { - case .resetStartOfMatch, .singleDataUnit, .verticalTab, .notVerticalTab, + case .resetStartOfMatch, .singleDataUnit, // '\N' needs to be emitted using 'emitAny'. .notNewline: throw error(.unsupported("'\\\(esc.character)'"), at: loc) @@ -190,7 +190,8 @@ extension RegexValidator { // Character classes. case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar, - .horizontalWhitespace, .notHorizontalWhitespace: + .horizontalWhitespace, .notHorizontalWhitespace, + .verticalTab, .notVerticalTab: break case .newlineSequence: diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 2131d1eb5..d30cab209 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -80,10 +80,16 @@ extension Compiler.ByteCodeGen { } case .endOfSubjectBeforeNewline: - builder.buildAssert { (input, pos, bounds) in + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in if pos == input.endIndex { return true } - return input.index(after: pos) == input.endIndex - && input[pos].isNewline + switch semanticLevel { + case .graphemeCluster: + return input.index(after: pos) == input.endIndex + && input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars.index(after: pos) == input.endIndex + && input.unicodeScalars[pos].isNewline + } } case .endOfSubject: @@ -115,8 +121,14 @@ extension Compiler.ByteCodeGen { case .startOfLine: if options.anchorsMatchNewlines { - builder.buildAssert { (input, pos, bounds) in - pos == input.startIndex || input[input.index(before: pos)].isNewline + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in + if pos == input.startIndex { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: pos)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline + } } } else { builder.buildAssert { (input, pos, bounds) in @@ -126,8 +138,14 @@ extension Compiler.ByteCodeGen { case .endOfLine: if options.anchorsMatchNewlines { - builder.buildAssert { (input, pos, bounds) in - pos == input.endIndex || input[pos].isNewline + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in + if pos == input.endIndex { return true } + switch semanticLevel { + case .graphemeCluster: + return input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars[pos].isNewline + } } } else { builder.buildAssert { (input, pos, bounds) in diff --git a/Sources/_StringProcessing/Unicode/ScalarProps.swift b/Sources/_StringProcessing/Unicode/ScalarProps.swift index 52a870357..0894fa572 100644 --- a/Sources/_StringProcessing/Unicode/ScalarProps.swift +++ b/Sources/_StringProcessing/Unicode/ScalarProps.swift @@ -46,3 +46,19 @@ extension Unicode.Script { return result } } + +extension UnicodeScalar { + var isHorizontalWhitespace: Bool { + value == 0x09 || properties.generalCategory == .spaceSeparator + } + + var isNewline: Bool { + switch value { + case 0x000A...0x000D /* LF ... CR */: return true + case 0x0085 /* NEXT LINE (NEL) */: return true + case 0x2028 /* LINE SEPARATOR */: return true + case 0x2029 /* PARAGRAPH SEPARATOR */: return true + default: return false + } + } +} diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 27a24cf46..85dd1ca37 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -178,15 +178,18 @@ public struct _CharacterClassModel: Hashable { matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: fatalError("Not implemented") - case .newlineSequence: - matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) - case .verticalWhitespace: fatalError("Not implemented") + case .horizontalWhitespace: + matched = c.unicodeScalars.first?.isHorizontalWhitespace == true + && (c.isASCII || !options.usesASCIISpaces) + case .newlineSequence, .verticalWhitespace: + matched = c.unicodeScalars.first?.isNewline == true + && (c.isASCII || !options.usesASCIISpaces) case .whitespace: matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): matched = set.any { $0.matches(c, with: options) } + case .custom(let set): + matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -206,14 +209,21 @@ public struct _CharacterClassModel: Hashable { matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: fatalError("Not implemented") - case .newlineSequence: fatalError("Not implemented") - case .verticalWhitespace: fatalError("Not implemented") + case .horizontalWhitespace: + matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces) + case .verticalWhitespace: + matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) + case .newlineSequence: + matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) + if c == "\r" && nextIndex != str.endIndex && str.unicodeScalars[nextIndex] == "\n" { + str.unicodeScalars.formIndex(after: &nextIndex) + } case .whitespace: matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) - case .custom: fatalError("Not supported") + case .custom(let set): + matched = set.any { $0.matches(Character(c), with: options) } } if isInverted { matched.toggle() diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 145087ee7..d13b47b8d 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -22,6 +22,14 @@ import XCTest @testable // for internal `matches(of:)` import _StringProcessing +extension UnicodeScalar { + var value4Digits: String { + let valueString = String(value, radix: 16, uppercase: true) + if valueString.count >= 4 { return valueString } + return String(repeating: "0", count: 4 - valueString.count) + valueString + } +} + class UTS18Tests: XCTestCase { var input: String { "ABCdefghîøu\u{308}\u{FFF0} -–—[]123" @@ -262,21 +270,33 @@ extension UTS18Tests { 09\u{85}\ 10\u{2028}\ 11\u{2029}\ - + 12 """ // Check the input counts var lines = lineInput.matches(of: regex(#"\d{2}"#)) - XCTAssertEqual(lines.count, 11) + XCTAssertEqual(lines.count, 12) // Test \R - newline sequence - lines = lineInput.matches(of: regex(#"\d{2}\R"#)) + lines = lineInput.matches(of: regex(#"\d{2}\R^"#).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Test \v - vertical space + lines = lineInput.matches(of: regex(#"\d{2}\v^"#).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 11) // Test anchors as line boundaries lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings()) - XCTAssertEqual(lines.count, 11) + XCTAssertEqual(lines.count, 12) // Test that dot does not match line endings lines = lineInput.matches(of: regex(#".+"#)) - XCTAssertEqual(lines.count, 11) + XCTAssertEqual(lines.count, 12) + // Unicode scalar semantics - \R still matches all, including \r\n sequence + lines = lineInput.matches( + of: regex(#"\d{2}\R(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Unicode scalar semantics - \v matches all except for \r\n sequence + lines = lineInput.matches( + of: regex(#"\d{2}\v(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 10) + // Does not contain an empty line XCTAssertFalse(lineInput.contains(regex(#"^$"#))) // Does contain an empty line (between \n and \r, which are reversed here)