diff --git a/Sources/_RegexParser/Regex/AST/AST.swift b/Sources/_RegexParser/Regex/AST/AST.swift index a7dcd2015..c7eaa4a9e 100644 --- a/Sources/_RegexParser/Regex/AST/AST.swift +++ b/Sources/_RegexParser/Regex/AST/AST.swift @@ -53,6 +53,9 @@ extension AST { /// Comments, non-semantic whitespace, etc case trivia(Trivia) + /// Intepolation `<{...}>`, currently reserved for future use. + case interpolation(Interpolation) + case atom(Atom) case customCharacterClass(CustomCharacterClass) @@ -78,6 +81,7 @@ extension AST.Node { case let .quantification(v): return v case let .quote(v): return v case let .trivia(v): return v + case let .interpolation(v): return v case let .atom(v): return v case let .customCharacterClass(v): return v case let .empty(v): return v @@ -128,7 +132,7 @@ extension AST.Node { case .group, .conditional, .customCharacterClass, .absentFunction: return true case .alternation, .concatenation, .quantification, .quote, .trivia, - .empty: + .empty, .interpolation: return false } } @@ -192,6 +196,16 @@ extension AST { } } + public struct Interpolation: Hashable, _ASTNode { + public let contents: String + public let location: SourceLocation + + public init(_ contents: String, _ location: SourceLocation) { + self.contents = contents + self.location = location + } + } + public struct Empty: Hashable, _ASTNode { public let location: SourceLocation diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index ff75260e8..24fa4a5e1 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -822,7 +822,7 @@ extension AST.Node { case .alternation, .concatenation, .group, .conditional, .quantification, .quote, .trivia, .customCharacterClass, .empty, - .absentFunction: + .absentFunction, .interpolation: return nil } } diff --git a/Sources/_RegexParser/Regex/Parse/CaptureList.swift b/Sources/_RegexParser/Regex/Parse/CaptureList.swift index 0287e7337..c8f3af561 100644 --- a/Sources/_RegexParser/Regex/Parse/CaptureList.swift +++ b/Sources/_RegexParser/Regex/Parse/CaptureList.swift @@ -103,7 +103,7 @@ extension AST.Node { break } - case .quote, .trivia, .atom, .customCharacterClass, .empty: + case .quote, .trivia, .atom, .customCharacterClass, .empty, .interpolation: break } } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index d87fba918..67a492787 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -42,6 +42,7 @@ enum ParseError: Error, Hashable { case expectedNonEmptyContents case expectedEscape case invalidEscape(Character) + case confusableCharacter(Character) case cannotReferToWholePattern @@ -128,6 +129,8 @@ extension ParseError: CustomStringConvertible { return "expected escape sequence" case .invalidEscape(let c): return "invalid escape sequence '\\\(c)'" + case .confusableCharacter(let c): + return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .quantifierRequiresOperand(let q): diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index e8783dc86..cbc9d5eb2 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -589,6 +589,26 @@ extension Source { return AST.Quote(str.value, str.location) } + /// Try to consume an interpolation sequence. + /// + /// Interpolation -> '<{' String '}>' + /// + mutating func lexInterpolation() throws -> AST.Interpolation? { + let contents = try recordLoc { src -> String? in + try src.tryEating { src in + guard src.tryEat(sequence: "<{") else { return nil } + _ = src.lexUntil { $0.isEmpty || $0.starts(with: "}>") } + guard src.tryEat(sequence: "}>") else { return nil } + + // Not currently supported. We error here instead of during Sema to + // get a better error for something like `(<{)}>`. + throw ParseError.unsupported("interpolation") + } + } + guard let contents = contents else { return nil } + return .init(contents.value, contents.location) + } + /// Try to consume a comment /// /// Comment -> '(?#' [^')']* ')' @@ -1674,9 +1694,10 @@ extension Source { break } - // We only allow unknown escape sequences for non-letter ASCII, and - // non-ASCII whitespace. - guard (char.isASCII && !char.isLetter) || + // We only allow unknown escape sequences for non-letter non-number ASCII, + // and non-ASCII whitespace. + // TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`. + guard (char.isASCII && !char.isLetter && !char.isNumber) || (!char.isASCII && char.isWhitespace) else { throw ParseError.invalidEscape(char) @@ -1981,10 +2002,21 @@ extension Source { case "]": assert(!customCC, "parser should have prevented this") - fallthrough + break - default: return .char(char) + default: + // Reject non-letter non-number non-`\r\n` ASCII characters that have + // multiple scalars. These may be confusable for metacharacters, e.g + // `[\u{301}]` wouldn't be interpreted as a custom character class due + // to the combining accent (assuming it is literal, not `\u{...}`). + let scalars = char.unicodeScalars + if scalars.count > 1 && scalars.first!.isASCII && char != "\r\n" && + !char.isLetter && !char.isNumber { + throw ParseError.confusableCharacter(char) + } + break } + return .char(char) } guard let kind = kind else { return nil } return AST.Atom(kind.value, kind.location) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 112f32358..8f3a722d6 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -222,6 +222,13 @@ extension Parser { result.append(.quote(quote)) continue } + + // Interpolation -> `lexInterpolation` + if let interpolation = try source.lexInterpolation() { + result.append(.interpolation(interpolation)) + continue + } + // Quantification -> QuantOperand Quantifier? if let operand = try parseQuantifierOperand() { if let (amt, kind, trivia) = diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 9d5ae4576..459e57e94 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -395,6 +395,11 @@ extension RegexValidator { // These are Oniguruma specific. throw error(.unsupported("absent function"), at: a.location) + case .interpolation(let i): + // This is currently rejected in the parser for better diagnostics, but + // reject here too until we get runtime support. + throw error(.unsupported("interpolation"), at: i.location) + case .quote, .trivia, .empty: break } diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index b8937d518..110b7b480 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -101,6 +101,10 @@ extension AST.Trivia { } } +extension AST.Interpolation { + public var _dumpBase: String { "interpolation <\(contents)>" } +} + extension AST.Empty { public var _dumpBase: String { "" } } diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index 59c0cc04a..ac553a115 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -97,6 +97,9 @@ extension PrettyPrinter { case let .trivia(t): output(t._canonicalBase) + case let .interpolation(i): + output(i._canonicalBase) + case let .atom(a): output(a._canonicalBase) @@ -178,6 +181,12 @@ extension AST.Quote { } } +extension AST.Interpolation { + var _canonicalBase: String { + "<{\(contents)}>" + } +} + extension AST.Group.Kind { var _canonicalBase: String { switch self { diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 79a515033..a76a19607 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -137,6 +137,9 @@ extension AST.Node { case let .trivia(v): return .trivia(v.contents) + case .interpolation: + throw Unsupported("TODO: interpolation") + case let .atom(v): switch v.kind { case .scalarSequence(let seq): diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 36056e85a..ce6b8f273 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -218,6 +218,12 @@ extension RegexTests { firstMatchTest( #"abc\d"#, input: "xyzabc123", match: "abc1") + // MARK: Allowed combining characters + + firstMatchTest("e\u{301}", input: "e\u{301}", match: "e\u{301}") + firstMatchTest("1\u{358}", input: "1\u{358}", match: "1\u{358}") + firstMatchTest(#"\ \#u{361}"#, input: " \u{361}", match: " \u{361}") + // MARK: Alternations firstMatchTest( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 8163c1359..236edb561 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -394,6 +394,12 @@ extension RegexTests { #"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) + // MARK: Allowed combining characters + + parseTest("e\u{301}", "e\u{301}") + parseTest("1\u{358}", "1\u{358}") + parseTest(#"\ \#u{361}"#, " \u{361}") + // MARK: Alternations parseTest( @@ -466,14 +472,6 @@ extension RegexTests { parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8")) parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}"))) - // TODO: These are treated as octal sequences by PCRE, we should warn and - // suggest user prefix with 0. - parseTest(#"[\1]"#, charClass("1")) - parseTest(#"[\123]"#, charClass("1", "2", "3")) - parseTest(#"[\101]"#, charClass("1", "0", "1")) - parseTest(#"[\7777]"#, charClass("7", "7", "7", "7")) - parseTest(#"[\181]"#, charClass("1", "8", "1")) - // We take *up to* the first two valid digits for \x. No valid digits is 0. parseTest(#"\x"#, scalar("\u{0}")) parseTest(#"\x5"#, scalar("\u{5}")) @@ -484,6 +482,8 @@ extension RegexTests { parseTest(#"\u{ a }"#, scalar("\u{A}")) parseTest(#"\u{ a }\u{ B }"#, concat(scalar("\u{A}"), scalar("\u{B}"))) + parseTest(#"[\u{301}]"#, charClass(scalar_m("\u{301}"))) + // MARK: Scalar sequences parseTest(#"\u{A bC}"#, scalarSeq("\u{A}", "\u{BC}")) @@ -804,6 +804,20 @@ extension RegexTests { #"a(?#. comment)b"#, concat("a", "b")) + // MARK: Interpolation + + // These are literal as there's no closing '}>' + parseTest("<{", concat("<", "{")) + parseTest("<{a", concat("<", "{", "a")) + parseTest("<{a}", concat("<", "{", "a", "}")) + parseTest("<{<{}", concat("<", "{", "<", "{", "}")) + + // Literal as escaped + parseTest(#"\<{}>"#, concat("<", "{", "}", ">")) + + // A quantification + parseTest(#"<{2}"#, exactly(2, of: "<")) + // MARK: Quantification parseTest("a*", zeroOrMore(of: "a")) @@ -1267,10 +1281,6 @@ extension RegexTests { parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported) parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported) - // Backreferences are not valid in custom character classes. - parseTest(#"[\8]"#, charClass("8")) - parseTest(#"[\9]"#, charClass("9")) - // These are valid references. parseTest(#"()\1"#, concat( capture(empty()), backreference(.absolute(1)) @@ -2547,6 +2557,17 @@ extension RegexTests { // TODO: Custom diagnostic for missing '\Q' diagnosticTest(#"\E"#, .invalidEscape("E")) + // PCRE treats these as octal, but we require a `0` prefix. + diagnosticTest(#"[\1]"#, .invalidEscape("1")) + diagnosticTest(#"[\123]"#, .invalidEscape("1")) + diagnosticTest(#"[\101]"#, .invalidEscape("1")) + diagnosticTest(#"[\7777]"#, .invalidEscape("7")) + diagnosticTest(#"[\181]"#, .invalidEscape("1")) + + // Backreferences are not valid in custom character classes. + diagnosticTest(#"[\8]"#, .invalidEscape("8")) + diagnosticTest(#"[\9]"#, .invalidEscape("9")) + // Non-ASCII non-whitespace cases. diagnosticTest(#"\🔥"#, .invalidEscape("🔥")) diagnosticTest(#"\🇩🇰"#, .invalidEscape("🇩🇰")) @@ -2555,6 +2576,27 @@ extension RegexTests { diagnosticTest(#"\˂"#, .invalidEscape("˂")) diagnosticTest(#"\d\#u{301}"#, .invalidEscape("d\u{301}")) + // MARK: Confusable characters + + diagnosticTest("[\u{301}]", .confusableCharacter("[\u{301}")) + diagnosticTest("(\u{358})", .confusableCharacter("(\u{358}")) + diagnosticTest("{\u{35B}}", .confusableCharacter("{\u{35B}")) + diagnosticTest(#"\\#u{35C}"#, .confusableCharacter(#"\\#u{35C}"#)) + diagnosticTest("^\u{35D}", .confusableCharacter("^\u{35D}")) + diagnosticTest("$\u{35E}", .confusableCharacter("$\u{35E}")) + diagnosticTest(".\u{35F}", .confusableCharacter(".\u{35F}")) + diagnosticTest("|\u{360}", .confusableCharacter("|\u{360}")) + diagnosticTest(" \u{361}", .confusableCharacter(" \u{361}")) + + // MARK: Interpolation (currently unsupported) + + diagnosticTest("<{}>", .unsupported("interpolation")) + diagnosticTest("<{...}>", .unsupported("interpolation")) + diagnosticTest("<{)}>", .unsupported("interpolation")) + diagnosticTest("<{}}>", .unsupported("interpolation")) + diagnosticTest("<{<{}>", .unsupported("interpolation")) + diagnosticTest("(<{)}>", .unsupported("interpolation")) + // MARK: Character properties diagnosticTest(#"\p{Lx}"#, .unknownProperty(key: nil, value: "Lx"))