From a79767801587a3630e4a0101f7f3a595b9eb2d58 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 28 Jun 2022 15:00:50 +0100 Subject: [PATCH 1/3] Small `parseCustomCharacterClass` cleanup We can do the semantic members check up-front. --- Sources/_RegexParser/Regex/Parse/Parse.swift | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 3e20ae8c0..87afb0e11 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -502,6 +502,12 @@ extension Parser { var members: Array = [] try parseCCCMembers(into: &members) + // Make sure we have at least one semantic member. + if members.none(\.isSemantic) { + throw Source.LocatedError( + ParseError.expectedCustomCharacterClassMembers, start.location) + } + // If we have a binary set operator, parse it and the next members. Note // that this means we left associate for a chain of operators. // TODO: We may want to diagnose and require users to disambiguate, at least @@ -511,16 +517,12 @@ extension Parser { var rhs: Array = [] try parseCCCMembers(into: &rhs) - if members.none(\.isSemantic) || rhs.none(\.isSemantic) { + if rhs.none(\.isSemantic) { throw Source.LocatedError( ParseError.expectedCustomCharacterClassMembers, start.location) } members = [.setOperation(members, binOp, rhs)] } - if members.none(\.isSemantic) { - throw Source.LocatedError( - ParseError.expectedCustomCharacterClassMembers, start.location) - } try source.expect("]") return CustomCC(start, members, loc(start.location.start)) } From a722b20cea94f2e474c8b9513a6ac70cac08240c Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 28 Jun 2022 15:00:51 +0100 Subject: [PATCH 2/3] Factor out parsePotentialCCRange --- Sources/_RegexParser/Regex/Parse/Parse.swift | 78 +++++++++++--------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 87afb0e11..f8ec982e7 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -552,47 +552,53 @@ extension Parser { return nil } + /// Attempt to parse a custom character class range into `members`, or regular + /// members if a range cannot be formed. + mutating func parsePotentialCCRange( + into members: inout [CustomCC.Member] + ) throws { + guard case .atom(let lhs)? = members.last else { return } + + // Try and see if we can parse a character class range. Each time we parse + // a component of the range, we append to `members` in case it ends up not + // being a range, and we bail. If we succeed in parsing, we remove the + // intermediate members. + let membersBeforeRange = members.count - 1 + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } + guard let dash = source.lexCustomCharacterClassRangeOperator() else { + return + } + + // If we can't parse a range, '-' becomes literal, e.g `[6-]`. + members.append(.atom(.init(.char("-"), dash))) + + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } + guard let rhs = try parseCCCMember() else { return } + members.append(rhs) + + guard case let .atom(rhs) = rhs else { return } + + // We've successfully parsed an atom LHS and RHS, so form a range, + // collecting the trivia we've parsed, and replacing the members that + // would have otherwise been added to the custom character class. + let rangeMemberCount = members.count - membersBeforeRange + let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia) + members.removeLast(rangeMemberCount) + members.append(.range(.init(lhs, dash, rhs, trivia: trivia))) + } + mutating func parseCCCMembers( into members: inout Array ) throws { - // Parse members until we see the end of the custom char class or an - // operator. + // Parse members and ranges until we see the end of the custom char class + // or an operator. while let member = try parseCCCMember() { members.append(member) - - // If we have an atom, we can try to parse a character class range. Each - // time we parse a component of the range, we append to `members` in case - // it ends up not being a range, and we bail. If we succeed in parsing, we - // remove the intermediate members. - if case .atom(let lhs) = member { - let membersBeforeRange = members.count - 1 - - while let t = try source.lexTrivia(context: context) { - members.append(.trivia(t)) - } - - guard let dash = source.lexCustomCharacterClassRangeOperator() else { - continue - } - // If we can't parse a range, '-' becomes literal, e.g `[6-]`. - members.append(.atom(.init(.char("-"), dash))) - - while let t = try source.lexTrivia(context: context) { - members.append(.trivia(t)) - } - guard let rhs = try parseCCCMember() else { continue } - members.append(rhs) - - guard case let .atom(rhs) = rhs else { continue } - - // We've successfully parsed an atom LHS and RHS, so form a range, - // collecting the trivia we've parsed, and replacing the members that - // would have otherwise been added to the custom character class. - let rangeMemberCount = members.count - membersBeforeRange - let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia) - members.removeLast(rangeMemberCount) - members.append(.range(.init(lhs, dash, rhs, trivia: trivia))) - } + try parsePotentialCCRange(into: &members) } } } From dd333f3d71f5cb158b354fcff0d700dc5cf786bb Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 28 Jun 2022 15:00:51 +0100 Subject: [PATCH 3/3] Reject .NET subtraction and quote range operands Tighten up validation of character class range operands such that we reject quotes and custom character classes. This includes rejecting syntax that would be a subtraction in .NET. We throw a custom error that suggests using `--` instead. --- .../Regex/AST/CustomCharClass.swift | 23 +++++++ .../Regex/Parse/Diagnostics.swift | 5 +- .../Regex/Parse/LexicalAnalysis.swift | 19 ++++++ Sources/_RegexParser/Regex/Parse/Parse.swift | 40 +++++++++++- Tests/RegexTests/ParseTests.swift | 65 ++++++++++++++++++- 5 files changed, 147 insertions(+), 5 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift index 4602b55d3..7e6e8c285 100644 --- a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift +++ b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift @@ -62,6 +62,10 @@ extension AST { self.rhs = rhs self.trivia = trivia } + + public var location: SourceLocation { + lhs.location.union(with: rhs.location) + } } public enum SetOp: String, Hashable { case subtraction = "--" @@ -108,6 +112,25 @@ extension CustomCC.Member { public var isSemantic: Bool { !isTrivia } + + public var location: SourceLocation { + switch self { + case let .custom(c): return c.location + case let .range(r): return r.location + case let .atom(a): return a.location + case let .quote(q): return q.location + case let .trivia(t): return t.location + case let .setOperation(lhs, dash, rhs): + var loc = dash.location + if let lhs = lhs.first { + loc = loc.union(with: lhs.location) + } + if let rhs = rhs.last { + loc = loc.union(with: rhs.location) + } + return loc + } + } } extension AST.CustomCharacterClass { diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 618ae2412..5bca2ad13 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -94,6 +94,7 @@ enum ParseError: Error, Hashable { case invalidNamedReference(String) case duplicateNamedCapture(String) case invalidCharacterClassRangeOperand + case unsupportedDotNetSubtraction case invalidQuantifierRange(Int, Int) case invalidCharacterRange(from: Character, to: Character) case notQuantifiable @@ -174,7 +175,9 @@ extension ParseError: CustomStringConvertible { case .expectedCustomCharacterClassMembers: return "expected custom character class members" case .invalidCharacterClassRangeOperand: - return "invalid character class range" + return "invalid bound for character class range" + case .unsupportedDotNetSubtraction: + return "subtraction with '-' is unsupported; use '--' instead" case .emptyProperty: return "empty property" case .unknownProperty(let key, let value): diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 41b744234..be6f13fc7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1245,6 +1245,25 @@ extension Source { return nil } + /// Check to see if we can lex a .NET subtraction. Returns the + /// location of the `-`. + /// + /// DotNetSubtraction -> Trivia* '-' Trivia* CustomCharClass + /// + func canLexDotNetCharClassSubtraction( + context: ParsingContext + ) -> SourceLocation? { + lookahead { src in + // We can lex '-' as a .NET subtraction if it precedes a custom character + // class. + while (try? src.lexTrivia(context: context)) != nil {} + guard let dashLoc = src.tryEatWithLoc("-") else { return nil } + while (try? src.lexTrivia(context: context)) != nil {} + guard src.lexCustomCCStart() != nil else { return nil } + return dashLoc + } + } + private mutating func lexPOSIXCharacterProperty( ) throws -> Located? { try recordLoc { src in diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index f8ec982e7..52861f23d 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -557,7 +557,7 @@ extension Parser { mutating func parsePotentialCCRange( into members: inout [CustomCC.Member] ) throws { - guard case .atom(let lhs)? = members.last else { return } + guard let lhs = members.last, lhs.isSemantic else { return } // Try and see if we can parse a character class range. Each time we parse // a component of the range, we append to `members` in case it ends up not @@ -580,7 +580,33 @@ extension Parser { guard let rhs = try parseCCCMember() else { return } members.append(rhs) - guard case let .atom(rhs) = rhs else { return } + func makeOperand(_ m: CustomCC.Member, isLHS: Bool) throws -> AST.Atom { + switch m { + case .atom(let a): + return a + case .custom: + // Not supported. While .NET allows `x-[...]` to spell subtraction, we + // require `x--[...]`. We also ban `[...]-x` for consistency. + if isLHS { + throw Source.LocatedError( + ParseError.invalidCharacterClassRangeOperand, m.location) + } else { + throw Source.LocatedError( + ParseError.unsupportedDotNetSubtraction, m.location) + } + case .quote: + // Currently unsupported, we need to figure out what the semantics + // would be for grapheme/scalar modes. + throw Source.LocatedError( + ParseError.unsupported("range with quoted sequence"), m.location) + case .trivia: + throw Unreachable("Should have been lexed separately") + case .range, .setOperation: + throw Unreachable("Parsed later") + } + } + let lhsOp = try makeOperand(lhs, isLHS: true) + let rhsOp = try makeOperand(rhs, isLHS: false) // We've successfully parsed an atom LHS and RHS, so form a range, // collecting the trivia we've parsed, and replacing the members that @@ -588,7 +614,15 @@ extension Parser { let rangeMemberCount = members.count - membersBeforeRange let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia) members.removeLast(rangeMemberCount) - members.append(.range(.init(lhs, dash, rhs, trivia: trivia))) + members.append(.range(.init(lhsOp, dash, rhsOp, trivia: trivia))) + + // We need to specially check if we can lex a .NET character class + // subtraction here as e.g `[a-c-[...]]` is allowed in .NET. Otherwise we'd + // treat the second `-` as literal. + if let dashLoc = source.canLexDotNetCharClassSubtraction(context: context) { + throw Source.LocatedError( + ParseError.unsupportedDotNetSubtraction, dashLoc) + } } mutating func parseCCCMembers( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 5beb67448..f23878142 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -515,10 +515,36 @@ extension RegexTests { parseTest( "[a-b-c]", charClass(range_m("a", "b"), "-", "c")) + parseTest( + "[a-b-c-d]", charClass(range_m("a", "b"), "-", range_m("c", "d"))) + + parseTest("[a-c---]", charClass( + setOp(range_m("a", "c"), op: .subtraction, "-") + )) + + parseTest("(?x)[a-c -- -]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(setOp(range_m("a", "c"), op: .subtraction, "-")) + )) + + parseTest("(?x)[a-c - - -]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(range_m("a", "c"), range_m("-", "-")) + )) parseTest("[-a-]", charClass("-", "a", "-")) parseTest("[[a]-]", charClass(charClass("a"), "-")) - parseTest("[[a]-b]", charClass(charClass("a"), "-", "b")) + parseTest("[-[a]]", charClass("-", charClass("a"))) + + parseTest(#"(?x)[ -[b]]"#, concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("-", charClass("b")) + )) + + parseTest(#"[ - [ ]]"#, charClass(range_m(" ", " "), charClass(" "))) + parseTest(#"[ - [ ] ]"#, charClass(range_m(" ", " "), charClass(" "), " ")) + + parseTest(#"[a-c-\Qd\E]"#, charClass(range_m("a", "c"), "-", quote_m("d"))) parseTest("[a-z]", charClass(range_m("a", "z"))) parseTest("[a-a]", charClass(range_m("a", "a"))) @@ -2688,6 +2714,32 @@ extension RegexTests { diagnosticTest("[[:=:]]", .emptyProperty) diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest("[[a]-b]", .invalidCharacterClassRangeOperand) + + // .NET subtraction is banned, we require explicit '--'. + diagnosticTest("[a-[b]]", .unsupportedDotNetSubtraction) + diagnosticTest(#"[abc-[def]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"[abc-[^def]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"[\d\u{0}[a]-[b-[c]]]"#, .unsupportedDotNetSubtraction) + diagnosticTest("[a-z-[d-w-[m-o]]]", .unsupportedDotNetSubtraction) + diagnosticTest(#"[a-[:b]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"[[a]-[b]]"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"[ -[ ]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"(?x)[a - [b] ]"#, .unsupportedDotNetSubtraction) + + diagnosticTest(#"[a-[]]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"[-[]]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"(?x)[ - [ ] ]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"(?x)[a-[ ] ]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"[a-[:digit:]]"#, .invalidCharacterClassRangeOperand) + + diagnosticTest("[--]", .expectedCustomCharacterClassMembers) + diagnosticTest("[---]", .expectedCustomCharacterClassMembers) + diagnosticTest("[----]", .expectedCustomCharacterClassMembers) + + // Quoted sequences aren't currently supported as range operands. + diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence")) + diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence")) diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) @@ -2873,6 +2925,17 @@ extension RegexTests { /# """#, .quoteMayNotSpanMultipleLines) + // .NET subtraction + diagnosticWithDelimitersTest(#""" + #/ + [ + a # interesting + - #a + [ b] # comment + ] + /# + """#, .unsupportedDotNetSubtraction) + // MARK: Group specifiers diagnosticTest(#"(*"#, .unknownGroupKind("*"))