diff --git a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift index 4602b55d3..7e6e8c285 100644 --- a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift +++ b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift @@ -62,6 +62,10 @@ extension AST { self.rhs = rhs self.trivia = trivia } + + public var location: SourceLocation { + lhs.location.union(with: rhs.location) + } } public enum SetOp: String, Hashable { case subtraction = "--" @@ -108,6 +112,25 @@ extension CustomCC.Member { public var isSemantic: Bool { !isTrivia } + + public var location: SourceLocation { + switch self { + case let .custom(c): return c.location + case let .range(r): return r.location + case let .atom(a): return a.location + case let .quote(q): return q.location + case let .trivia(t): return t.location + case let .setOperation(lhs, dash, rhs): + var loc = dash.location + if let lhs = lhs.first { + loc = loc.union(with: lhs.location) + } + if let rhs = rhs.last { + loc = loc.union(with: rhs.location) + } + return loc + } + } } extension AST.CustomCharacterClass { diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 618ae2412..5bca2ad13 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -94,6 +94,7 @@ enum ParseError: Error, Hashable { case invalidNamedReference(String) case duplicateNamedCapture(String) case invalidCharacterClassRangeOperand + case unsupportedDotNetSubtraction case invalidQuantifierRange(Int, Int) case invalidCharacterRange(from: Character, to: Character) case notQuantifiable @@ -174,7 +175,9 @@ extension ParseError: CustomStringConvertible { case .expectedCustomCharacterClassMembers: return "expected custom character class members" case .invalidCharacterClassRangeOperand: - return "invalid character class range" + return "invalid bound for character class range" + case .unsupportedDotNetSubtraction: + return "subtraction with '-' is unsupported; use '--' instead" case .emptyProperty: return "empty property" case .unknownProperty(let key, let value): diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 41b744234..be6f13fc7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1245,6 +1245,25 @@ extension Source { return nil } + /// Check to see if we can lex a .NET subtraction. Returns the + /// location of the `-`. + /// + /// DotNetSubtraction -> Trivia* '-' Trivia* CustomCharClass + /// + func canLexDotNetCharClassSubtraction( + context: ParsingContext + ) -> SourceLocation? { + lookahead { src in + // We can lex '-' as a .NET subtraction if it precedes a custom character + // class. + while (try? src.lexTrivia(context: context)) != nil {} + guard let dashLoc = src.tryEatWithLoc("-") else { return nil } + while (try? src.lexTrivia(context: context)) != nil {} + guard src.lexCustomCCStart() != nil else { return nil } + return dashLoc + } + } + private mutating func lexPOSIXCharacterProperty( ) throws -> Located? { try recordLoc { src in diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 3e20ae8c0..52861f23d 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -502,6 +502,12 @@ extension Parser { var members: Array = [] try parseCCCMembers(into: &members) + // Make sure we have at least one semantic member. + if members.none(\.isSemantic) { + throw Source.LocatedError( + ParseError.expectedCustomCharacterClassMembers, start.location) + } + // If we have a binary set operator, parse it and the next members. Note // that this means we left associate for a chain of operators. // TODO: We may want to diagnose and require users to disambiguate, at least @@ -511,16 +517,12 @@ extension Parser { var rhs: Array = [] try parseCCCMembers(into: &rhs) - if members.none(\.isSemantic) || rhs.none(\.isSemantic) { + if rhs.none(\.isSemantic) { throw Source.LocatedError( ParseError.expectedCustomCharacterClassMembers, start.location) } members = [.setOperation(members, binOp, rhs)] } - if members.none(\.isSemantic) { - throw Source.LocatedError( - ParseError.expectedCustomCharacterClassMembers, start.location) - } try source.expect("]") return CustomCC(start, members, loc(start.location.start)) } @@ -550,48 +552,88 @@ extension Parser { return nil } - mutating func parseCCCMembers( - into members: inout Array + /// Attempt to parse a custom character class range into `members`, or regular + /// members if a range cannot be formed. + mutating func parsePotentialCCRange( + into members: inout [CustomCC.Member] ) throws { - // Parse members until we see the end of the custom char class or an - // operator. - while let member = try parseCCCMember() { - members.append(member) - - // If we have an atom, we can try to parse a character class range. Each - // time we parse a component of the range, we append to `members` in case - // it ends up not being a range, and we bail. If we succeed in parsing, we - // remove the intermediate members. - if case .atom(let lhs) = member { - let membersBeforeRange = members.count - 1 - - while let t = try source.lexTrivia(context: context) { - members.append(.trivia(t)) - } + guard let lhs = members.last, lhs.isSemantic else { return } + + // Try and see if we can parse a character class range. Each time we parse + // a component of the range, we append to `members` in case it ends up not + // being a range, and we bail. If we succeed in parsing, we remove the + // intermediate members. + let membersBeforeRange = members.count - 1 + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } + guard let dash = source.lexCustomCharacterClassRangeOperator() else { + return + } - guard let dash = source.lexCustomCharacterClassRangeOperator() else { - continue - } - // If we can't parse a range, '-' becomes literal, e.g `[6-]`. - members.append(.atom(.init(.char("-"), dash))) + // If we can't parse a range, '-' becomes literal, e.g `[6-]`. + members.append(.atom(.init(.char("-"), dash))) - while let t = try source.lexTrivia(context: context) { - members.append(.trivia(t)) + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } + guard let rhs = try parseCCCMember() else { return } + members.append(rhs) + + func makeOperand(_ m: CustomCC.Member, isLHS: Bool) throws -> AST.Atom { + switch m { + case .atom(let a): + return a + case .custom: + // Not supported. While .NET allows `x-[...]` to spell subtraction, we + // require `x--[...]`. We also ban `[...]-x` for consistency. + if isLHS { + throw Source.LocatedError( + ParseError.invalidCharacterClassRangeOperand, m.location) + } else { + throw Source.LocatedError( + ParseError.unsupportedDotNetSubtraction, m.location) } - guard let rhs = try parseCCCMember() else { continue } - members.append(rhs) - - guard case let .atom(rhs) = rhs else { continue } - - // We've successfully parsed an atom LHS and RHS, so form a range, - // collecting the trivia we've parsed, and replacing the members that - // would have otherwise been added to the custom character class. - let rangeMemberCount = members.count - membersBeforeRange - let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia) - members.removeLast(rangeMemberCount) - members.append(.range(.init(lhs, dash, rhs, trivia: trivia))) + case .quote: + // Currently unsupported, we need to figure out what the semantics + // would be for grapheme/scalar modes. + throw Source.LocatedError( + ParseError.unsupported("range with quoted sequence"), m.location) + case .trivia: + throw Unreachable("Should have been lexed separately") + case .range, .setOperation: + throw Unreachable("Parsed later") } } + let lhsOp = try makeOperand(lhs, isLHS: true) + let rhsOp = try makeOperand(rhs, isLHS: false) + + // We've successfully parsed an atom LHS and RHS, so form a range, + // collecting the trivia we've parsed, and replacing the members that + // would have otherwise been added to the custom character class. + let rangeMemberCount = members.count - membersBeforeRange + let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia) + members.removeLast(rangeMemberCount) + members.append(.range(.init(lhsOp, dash, rhsOp, trivia: trivia))) + + // We need to specially check if we can lex a .NET character class + // subtraction here as e.g `[a-c-[...]]` is allowed in .NET. Otherwise we'd + // treat the second `-` as literal. + if let dashLoc = source.canLexDotNetCharClassSubtraction(context: context) { + throw Source.LocatedError( + ParseError.unsupportedDotNetSubtraction, dashLoc) + } + } + + mutating func parseCCCMembers( + into members: inout Array + ) throws { + // Parse members and ranges until we see the end of the custom char class + // or an operator. + while let member = try parseCCCMember() { + members.append(member) + try parsePotentialCCRange(into: &members) + } } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 5beb67448..f23878142 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -515,10 +515,36 @@ extension RegexTests { parseTest( "[a-b-c]", charClass(range_m("a", "b"), "-", "c")) + parseTest( + "[a-b-c-d]", charClass(range_m("a", "b"), "-", range_m("c", "d"))) + + parseTest("[a-c---]", charClass( + setOp(range_m("a", "c"), op: .subtraction, "-") + )) + + parseTest("(?x)[a-c -- -]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(setOp(range_m("a", "c"), op: .subtraction, "-")) + )) + + parseTest("(?x)[a-c - - -]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(range_m("a", "c"), range_m("-", "-")) + )) parseTest("[-a-]", charClass("-", "a", "-")) parseTest("[[a]-]", charClass(charClass("a"), "-")) - parseTest("[[a]-b]", charClass(charClass("a"), "-", "b")) + parseTest("[-[a]]", charClass("-", charClass("a"))) + + parseTest(#"(?x)[ -[b]]"#, concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("-", charClass("b")) + )) + + parseTest(#"[ - [ ]]"#, charClass(range_m(" ", " "), charClass(" "))) + parseTest(#"[ - [ ] ]"#, charClass(range_m(" ", " "), charClass(" "), " ")) + + parseTest(#"[a-c-\Qd\E]"#, charClass(range_m("a", "c"), "-", quote_m("d"))) parseTest("[a-z]", charClass(range_m("a", "z"))) parseTest("[a-a]", charClass(range_m("a", "a"))) @@ -2688,6 +2714,32 @@ extension RegexTests { diagnosticTest("[[:=:]]", .emptyProperty) diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest("[[a]-b]", .invalidCharacterClassRangeOperand) + + // .NET subtraction is banned, we require explicit '--'. + diagnosticTest("[a-[b]]", .unsupportedDotNetSubtraction) + diagnosticTest(#"[abc-[def]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"[abc-[^def]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"[\d\u{0}[a]-[b-[c]]]"#, .unsupportedDotNetSubtraction) + diagnosticTest("[a-z-[d-w-[m-o]]]", .unsupportedDotNetSubtraction) + diagnosticTest(#"[a-[:b]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"[[a]-[b]]"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"[ -[ ]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"(?x)[a - [b] ]"#, .unsupportedDotNetSubtraction) + + diagnosticTest(#"[a-[]]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"[-[]]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"(?x)[ - [ ] ]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"(?x)[a-[ ] ]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"[a-[:digit:]]"#, .invalidCharacterClassRangeOperand) + + diagnosticTest("[--]", .expectedCustomCharacterClassMembers) + diagnosticTest("[---]", .expectedCustomCharacterClassMembers) + diagnosticTest("[----]", .expectedCustomCharacterClassMembers) + + // Quoted sequences aren't currently supported as range operands. + diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence")) + diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence")) diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) @@ -2873,6 +2925,17 @@ extension RegexTests { /# """#, .quoteMayNotSpanMultipleLines) + // .NET subtraction + diagnosticWithDelimitersTest(#""" + #/ + [ + a # interesting + - #a + [ b] # comment + ] + /# + """#, .unsupportedDotNetSubtraction) + // MARK: Group specifiers diagnosticTest(#"(*"#, .unknownGroupKind("*"))