diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index bdf076e68..f63cd435e 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -342,8 +342,8 @@ extension Source { }.value } - /// Eat a scalar off the front, starting from after the - /// backslash and base character (e.g. `\u` or `\x`). + /// Try to eat a scalar off the front, starting just after the backslash, at + /// the base character (e.g. `u` or `x`). Returns `nil` if no form matches. /// /// UniScalar -> 'u{' UniScalarSequence '}' /// | 'u' HexDigit{4} @@ -353,60 +353,60 @@ extension Source { /// | 'o{' OctalDigit{1...} '}' /// | '0' OctalDigit{0...3} /// - mutating func expectUnicodeScalar( - escapedCharacter base: Character - ) throws -> AST.Atom.Kind { + mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? { try recordLoc { src in + try src.tryEating { src in - func nullScalar() -> AST.Atom.Kind { - let pos = src.currentPosition - return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos))) - } - - // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. - switch base { - // Hex numbers. - case "u" where src.tryEat("{"): - return try src.expectUnicodeScalarSequence(eating: "}") - - case "x" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .hex)) - - case "x": - // \x expects *up to* 2 digits. - guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit) - else { - // In PCRE, \x without any valid hex digits is \u{0}. - // TODO: This doesn't appear to be followed by ICU or Oniguruma, so - // could be changed to throw an error if we had a parsing mode for - // them. 
- return nullScalar() + func nullScalar() -> AST.Atom.Kind { + let pos = src.currentPosition + return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos))) } - return .scalar(try Source.validateUnicodeScalar(digits, .hex)) - case "u": - return .scalar(try src.expectUnicodeScalar(numDigits: 4)) - case "U": - return .scalar(try src.expectUnicodeScalar(numDigits: 8)) + // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. + switch src.tryEat() { + // Hex numbers. + case "u" where src.tryEat("{"): + return try src.expectUnicodeScalarSequence(eating: "}") + + case "x" where src.tryEat("{"): + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .hex)) + + case "x": + // \x expects *up to* 2 digits. + guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit) + else { + // In PCRE, \x without any valid hex digits is \u{0}. + // TODO: This doesn't appear to be followed by ICU or Oniguruma, so + // could be changed to throw an error if we had a parsing mode for + // them. + return nullScalar() + } + return .scalar(try Source.validateUnicodeScalar(digits, .hex)) + + case "u": + return .scalar(try src.expectUnicodeScalar(numDigits: 4)) + case "U": + return .scalar(try src.expectUnicodeScalar(numDigits: 8)) + + // Octal numbers. + case "o" where src.tryEat("{"): + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .octal)) + + case "0": + // We can read *up to* 3 more octal digits. + // FIXME: PCRE can only read up to 2 octal digits, if we get a strict + // PCRE mode, we should limit it here. + guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit) + else { + return nullScalar() + } + return .scalar(try Source.validateUnicodeScalar(digits, .octal)) - // Octal numbers. 
- case "o" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .octal)) - - case "0": - // We can read *up to* 3 more octal digits. - // FIXME: PCRE can only read up to 2 octal digits, if we get a strict - // PCRE mode, we should limit it here. - guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit) - else { - return nullScalar() + default: + return nil } - return .scalar(try Source.validateUnicodeScalar(digits, .octal)) - - default: - fatalError("Unexpected scalar start") } }.value } @@ -802,6 +802,11 @@ extension Source { mutating func lexMatchingOptionSequence( context: ParsingContext ) throws -> AST.MatchingOptionSequence? { + // PCRE accepts '(?)' + // TODO: This is a no-op, should we warn? + if peek() == ")" { + return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: []) + } let ateCaret = recordLoc { $0.tryEat("^") } // TODO: Warn on duplicate options, and options appearing in both adding @@ -1707,6 +1712,11 @@ extension Source { return ref } + // Hexadecimal and octal unicode scalars. + if let scalar = try src.lexUnicodeScalar() { + return scalar + } + guard let char = src.tryEat() else { throw ParseError.expectedEscape } @@ -1718,14 +1728,6 @@ extension Source { return .escaped(builtin) } - switch char { - // Hexadecimal and octal unicode scalars. - case "u", "x", "U", "o", "0": - return try src.expectUnicodeScalar(escapedCharacter: char) - default: - break - } - // We only allow unknown escape sequences for non-letter non-number ASCII, // and non-ASCII whitespace. // TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`. 
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index 958c53c26..e28c72514 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -61,41 +61,6 @@ extension RegexTests { _ = try src.lexNumber() } - func diagnoseUniScalarOverflow(_ input: String, base: Character) { - let scalars = input.first == "{" - ? String(input.dropFirst().dropLast()) - : input - diagnose( - input, - expecting: .numberOverflow(scalars) - ) { src in - _ = try src.expectUnicodeScalar(escapedCharacter: base) - } - } - func diagnoseUniScalar( - _ input: String, - base: Character, - expectedDigits numDigits: Int - ) { - let scalars = input.first == "{" - ? String(input.dropFirst().dropLast()) - : input - diagnose( - input, - expecting: .expectedNumDigits(scalars, numDigits) - ) { src in - _ = try src.expectUnicodeScalar(escapedCharacter: base) - } - _ = scalars - } - - diagnoseUniScalar( - "12", base: "u", expectedDigits: 4) - diagnoseUniScalar( - "12", base: "U", expectedDigits: 8) - diagnoseUniScalarOverflow("{123456789}", base: "u") - diagnoseUniScalarOverflow("{123456789}", base: "x") - // TODO: want to dummy print out source ranges, etc, test that. } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 493b6e1c6..425f44f48 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1002,6 +1002,9 @@ extension RegexTests { concat("a", atomicScriptRun("b"), "c"), throwsError: .unsupported) // Matching option changing groups. 
+ parseTest("(?)", changeMatchingOptions( + matchingOptions() + )) parseTest("(?-)", changeMatchingOptions( matchingOptions() )) @@ -2666,6 +2669,8 @@ extension RegexTests { diagnosticTest("\\", .expectedEscape) + diagnosticTest(#"\o"#, .invalidEscape("o")) + // TODO: Custom diagnostic for control sequence diagnosticTest(#"\c"#, .unexpectedEndOfInput) @@ -2877,6 +2882,11 @@ extension RegexTests { diagnosticTest(#"[\d--\u{a b}]"#, .unsupported("scalar sequence in custom character class")) diagnosticTest(#"[\d--[\u{a b}]]"#, .unsupported("scalar sequence in custom character class")) + diagnosticTest(#"\u12"#, .expectedNumDigits("12", 4)) + diagnosticTest(#"\U12"#, .expectedNumDigits("12", 8)) + diagnosticTest(#"\u{123456789}"#, .numberOverflow("123456789")) + diagnosticTest(#"\x{123456789}"#, .numberOverflow("123456789")) + // MARK: Matching options diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret)