From 9740416c5e724c2b9d412e19d3cb84c55114f452 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 9 May 2022 17:15:42 +0100 Subject: [PATCH 1/4] Introduce ASTStage parameter to `parse` This allows specifying whether or not to perform semantic checks on the AST. Some clients, e.g syntax coloring, only care about the syntactic structure. But other clients want errors to be emitted for e.g unsupported constructs. --- .../PatternConverter/PatternConverter.swift | 2 +- .../Regex/Parse/CompilerInterface.swift | 2 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 18 +++++++++++++++--- Sources/_StringProcessing/Compiler.swift | 2 +- .../Regex/AnyRegexOutput.swift | 4 ++-- Sources/_StringProcessing/Regex/Core.swift | 4 ++-- Tests/RegexTests/CaptureTests.swift | 2 +- Tests/RegexTests/DiagnosticTests.swift | 6 +++--- Tests/RegexTests/ParseTests.swift | 10 +++++----- 9 files changed, 31 insertions(+), 19 deletions(-) diff --git a/Sources/PatternConverter/PatternConverter.swift b/Sources/PatternConverter/PatternConverter.swift index a10698526..497d54506 100644 --- a/Sources/PatternConverter/PatternConverter.swift +++ b/Sources/PatternConverter/PatternConverter.swift @@ -50,7 +50,7 @@ struct PatternConverter: ParsableCommand { print("Converting '\(delim)\(regex)\(delim)'") let ast = try _RegexParser.parse( - regex, + regex, .semantic, experimentalSyntax ? 
.experimental : .traditional) // Show rendered source ranges diff --git a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift index 0856361d8..4ae518dcd 100644 --- a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift +++ b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift @@ -96,7 +96,7 @@ public func swiftCompilerParseRegexLiteral( _ input: String, captureBufferOut: UnsafeMutableRawBufferPointer ) throws -> (regexToEmit: String, version: Int) { do { - let ast = try parseWithDelimiters(input) + let ast = try parseWithDelimiters(input, .semantic) // Serialize the capture structure for later type inference. assert(captureBufferOut.count >= input.utf8.count) ast.captureStructure.encode(to: captureBufferOut) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 54e46948a..168adf4a2 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -558,8 +558,19 @@ extension Parser { } } +public enum ASTStage { + /// The regex is parsed, and a syntactically valid AST is returned. Otherwise + /// an error is thrown. This is useful for e.g syntax coloring. + case syntactic + + /// The regex is parsed, and a syntactically and semantically valid AST is + /// returned. Otherwise an error is thrown. A semantically valid AST has been + /// checked for e.g unsupported constructs and invalid backreferences. + case semantic +} + public func parse<S: StringProtocol>( - _ regex: S, _ syntax: SyntaxOptions + _ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions ) throws -> AST where S.SubSequence == Substring { let source = Source(String(regex)) @@ -591,11 +602,12 @@ fileprivate func defaultSyntaxOptions( /// Parses a given regex string with delimiters, inferring the syntax options /// from the delimiters used.
public func parseWithDelimiters<S: StringProtocol>( - _ regex: S + _ regex: S, _ stage: ASTStage ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) do { - return try parse(contents, defaultSyntaxOptions(delim, contents: contents)) + let syntax = defaultSyntaxOptions(delim, contents: contents) + return try parse(contents, stage, syntax) } catch let error as LocatedErrorProtocol { // Convert the range in 'contents' to the range in 'regex'. let delimCount = delim.opening.count diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 47faa23ed..1c20761c8 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -38,7 +38,7 @@ class Compiler { func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { - let ast = try parse(regex, syntax) + let ast = try parse(regex, .semantic, syntax) let program = try Compiler(ast: ast).emit() return Executor(program: program) } diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 00fc2e952..6dd8e17b6 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -17,7 +17,7 @@ extension Regex where Output == AnyRegexOutput { /// /// - Parameter pattern: The regular expression.
public init(_ pattern: String) throws { - self.init(ast: try parse(pattern, .traditional)) + self.init(ast: try parse(pattern, .semantic, .traditional)) } } @@ -31,7 +31,7 @@ extension Regex { _ pattern: String, as: Output.Type = Output.self ) throws { - self.init(ast: try parse(pattern, .traditional)) + self.init(ast: try parse(pattern, .semantic, .traditional)) } } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 1f9a35dad..29d2267b2 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -44,7 +44,7 @@ public struct Regex: RegexComponent { // Compiler interface. Do not change independently. @usableFromInline init(_regexString pattern: String) { - self.init(ast: try! parse(pattern, .traditional)) + self.init(ast: try! parse(pattern, .semantic, .traditional)) } // Compiler interface. Do not change independently. @@ -53,7 +53,7 @@ public struct Regex: RegexComponent { assert(version == currentRegexLiteralFormatVersion) // The version argument is passed by the compiler using the value defined // in libswiftParseRegexLiteral. - self.init(ast: try! parseWithDelimiters(pattern)) + self.init(ast: try! parseWithDelimiters(pattern, .semantic)) } public var regex: Regex { diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index b48e1f0a5..ad78cd5b5 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -150,7 +150,7 @@ func captureTest( file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(regex, .traditional) + let ast = try! 
parse(regex, .semantic, .traditional) let capList = ast.root._captureList guard capList == expected else { XCTFail(""" diff --git a/Tests/RegexTests/DiagnosticTests.swift b/Tests/RegexTests/DiagnosticTests.swift index 428020b80..0100a3a86 100644 --- a/Tests/RegexTests/DiagnosticTests.swift +++ b/Tests/RegexTests/DiagnosticTests.swift @@ -20,7 +20,7 @@ extension RegexTests { XCTAssert(SourceLocation.fake.isFake) XCTAssert(group(.capture, "a").location.isFake) - let ast = try! parse("(a)", .traditional).root + let ast = try! parse("(a)", .semantic, .traditional).root XCTAssert(ast.location.isReal) } @@ -31,7 +31,7 @@ extension RegexTests { // // Input should be a concatenation or alternation func flatTest(_ str: String, _ expected: [String]) { - guard let ast = try? parse(str, .traditional).root else { + guard let ast = try? parse(str, .semantic, .traditional).root else { XCTFail("Fail to parse: \(str)") return } @@ -54,7 +54,7 @@ extension RegexTests { func renderTest(_ str: String, _ expected: [String]) { let lines = try! parse( - str, .traditional + str, .semantic, .traditional )._render(in: str) func fail() { XCTFail(""" diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 0ef021442..eed96becc 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -170,8 +170,8 @@ func parseNotEqualTest( syntax: SyntaxOptions = .traditional, file: StaticString = #file, line: UInt = #line ) { - let lhsAST = try! parse(lhs, syntax) - let rhsAST = try! parse(rhs, syntax) + let lhsAST = try! parse(lhs, .syntactic, syntax) + let rhsAST = try! parse(rhs, .syntactic, syntax) if lhsAST == rhsAST || lhsAST._dump() == rhsAST._dump() { XCTFail(""" AST: \(lhsAST._dump()) @@ -187,7 +187,7 @@ func rangeTest( at locFn: (AST.Node) -> SourceLocation = \.location, file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(input, syntax).root + let ast = try! 
parse(input, .syntactic, syntax).root let range = input.offsets(of: locFn(ast).range) let expected = expectedRange(input) @@ -207,7 +207,7 @@ func diagnosticTest( file: StaticString = #file, line: UInt = #line ) { do { - let ast = try parse(input, syntax) + let ast = try parse(input, .semantic, syntax) XCTFail(""" Passed \(ast) @@ -236,7 +236,7 @@ func diagnosticWithDelimitersTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) do { - let orig = try parseWithDelimiters(literal) + let orig = try parseWithDelimiters(literal, .semantic) let ast = orig.root XCTFail(""" From 4b31736f109fe4815696e00880b228947f66fea8 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 9 May 2022 17:15:43 +0100 Subject: [PATCH 2/4] Implement semantic diagnostics Start emitting errors for unsupported constructs, and other semantic errors such as duplicate group names. Once we start emitting bytecode for regex at compile time, these errors could potentially be subsumed into the bytecode generator. But for now, implement them as a separate pass. --- .../Regex/Parse/Diagnostics.swift | 26 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 14 +- Sources/_RegexParser/Regex/Parse/Sema.swift | 384 ++++++++++++ .../_StringProcessing/ConsumerInterface.swift | 5 +- .../_CharacterClassModel.swift | 6 +- Tests/RegexTests/CaptureTests.swift | 4 +- Tests/RegexTests/MatchTests.swift | 6 +- Tests/RegexTests/ParseTests.swift | 589 +++++++++++------- Tests/RegexTests/UTS18Tests.swift | 2 +- 9 files changed, 798 insertions(+), 238 deletions(-) create mode 100644 Sources/_RegexParser/Regex/Parse/Sema.swift diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index c3d74c30b..7a8dfe771 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -15,6 +15,8 @@ enum ParseError: Error, Hashable { // TODO: I wonder if it makes sense to store the string. 
// This can make equality weird. + // MARK: Syntactic Errors + case numberOverflow(String) case expectedNumDigits(String, Int) case expectedNumber(String, kind: RadixKind) @@ -55,7 +57,6 @@ enum ParseError: Error, Hashable { case cannotRemoveMatchingOptionsAfterCaret case expectedCustomCharacterClassMembers - case invalidCharacterClassRangeOperand case emptyProperty case unknownProperty(key: String?, value: String) @@ -73,6 +74,15 @@ enum ParseError: Error, Hashable { case cannotRemoveExtendedSyntaxInMultilineMode case expectedCalloutArgument + + // MARK: Semantic Errors + + case unsupported(String) + case deprecatedUnicode(String) + case invalidReference(Int) + case duplicateNamedCapture(String) + case invalidCharacterClassRangeOperand + case invalidQuantifierRange(Int, Int) } extension IdentifierKind { @@ -88,6 +98,7 @@ extension IdentifierKind { extension ParseError: CustomStringConvertible { var description: String { switch self { + // MARK: Syntactic Errors case let .numberOverflow(s): return "number overflow: \(s)" case let .expectedNumDigits(s, i): @@ -167,6 +178,19 @@ extension ParseError: CustomStringConvertible { return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" + + // MARK: Semantic Errors + + case let .unsupported(kind): + return "\(kind) is not currently supported" + case let .deprecatedUnicode(kind): + return "\(kind) is a deprecated Unicode property, and is not supported" + case let .invalidReference(i): + return "no capture numbered \(i)" + case let .duplicateNamedCapture(str): + return "group named '\(str)' already exists" + case let .invalidQuantifierRange(lhs, rhs): + return "range lower bound '\(lhs)' must be less than or equal to upper bound '\(rhs)'" } } } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 168adf4a2..2d33e4d7e 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ 
b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -543,11 +543,6 @@ extension Parser { // Range between atoms. if let (dashLoc, rhs) = try source.lexCustomCharClassRangeEnd(context: context) { - guard atom.isValidCharacterClassRangeBound && - rhs.isValidCharacterClassRangeBound else { - throw ParseError.invalidCharacterClassRangeOperand - } - // TODO: Validate lower <= upper? members.append(.range(.init(atom, dashLoc, rhs))) continue } @@ -575,7 +570,14 @@ public func parse( { let source = Source(String(regex)) var parser = Parser(source, syntax: syntax) - return try parser.parse() + let ast = try parser.parse() + switch stage { + case .syntactic: + break + case .semantic: + try validate(ast) + } + return ast } /// Retrieve the default set of syntax options that a delimiter and literal diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift new file mode 100644 index 000000000..32859812c --- /dev/null +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -0,0 +1,384 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +/// Validate a regex AST for semantic validity. Once bytecode is emitted at +/// compile time, this could potentially be subsumed by the bytecode generator. 
+fileprivate struct RegexValidator { + let ast: AST + let captures: CaptureList + + init(_ ast: AST) { + self.ast = ast + self.captures = ast.captureList + } + + func error(_ kind: ParseError, at loc: SourceLocation) -> Error { + Source.LocatedError(kind, loc) + } +} + +extension String { + fileprivate var quoted: String { "'\(self)'" } +} + +extension RegexValidator { + func validate() throws { + for opt in ast.globalOptions?.options ?? [] { + try validateGlobalMatchingOption(opt) + } + try validateNode(ast.root) + } + + func validateGlobalMatchingOption(_ opt: AST.GlobalMatchingOption) throws { + switch opt.kind { + case .limitDepth, .limitHeap, .limitMatch, .notEmpty, .notEmptyAtStart, + .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, + .unicodeProperties: + // These are PCRE specific, and not something we're likely to ever + // support. + throw error(.unsupported("global matching option"), at: opt.location) + + case .newlineMatching: + // We have implemented the correct behavior for multi-line literals, but + // these should also affect '.' and '\N' matching, which we haven't + // implemented. + throw error(.unsupported("newline matching mode"), at: opt.location) + + case .newlineSequenceMatching: + // We haven't yet implemented the '\R' matching specifics of these. + throw error( + .unsupported("newline sequence matching mode"), at: opt.location) + } + } + + func validateReference(_ ref: AST.Reference) throws { + switch ref.kind { + case .absolute(let i): + guard i <= captures.captures.count else { + throw error(.invalidReference(i), at: ref.innerLoc) + } + case .relative: + throw error(.unsupported("relative capture reference"), at: ref.innerLoc) + case .named: + // TODO: This could be implemented by querying the capture list for an + // index. 
+ throw error(.unsupported("named capture reference"), at: ref.innerLoc) + } + if let recLevel = ref.recursionLevel { + throw error(.unsupported("recursion level"), at: recLevel.location) + } + } + + func validateMatchingOption(_ opt: AST.MatchingOption) throws { + let loc = opt.location + switch opt.kind { + case .allowDuplicateGroupNames: + // Not currently supported as we need to figure out what to do with + // the capture type. + throw error(.unsupported("duplicate group naming"), at: loc) + + case .unicodeWordBoundaries: + throw error(.unsupported("unicode word boundary mode"), at: loc) + + case .textSegmentWordMode, .textSegmentGraphemeMode: + throw error(.unsupported("text segment mode"), at: loc) + + case .byteSemantics: + throw error(.unsupported("byte semantic mode"), at: loc) + + case .caseInsensitive, .possessiveByDefault, .reluctantByDefault, + .unicodeScalarSemantics, .graphemeClusterSemantics, + .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended, + .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps: + break + } + } + + func validateMatchingOptions(_ opts: AST.MatchingOptionSequence) throws { + for opt in opts.adding { + try validateMatchingOption(opt) + } + for opt in opts.removing { + try validateMatchingOption(opt) + } + } + + func validateBinaryProperty( + _ prop: Unicode.BinaryProperty, at loc: SourceLocation + ) throws { + switch prop { + case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable, + .changesWhenCasefolded, .changesWhenCasemapped, + .changesWhenNFKCCasefolded, .changesWhenLowercased, + .changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated, + .defaultIgnorableCodePoint, .diacratic, .extender, + .fullCompositionExclusion, .graphemeBase, .graphemeExtended, .hexDigit, + .idContinue, .ideographic, .idStart, .idsBinaryOperator, + .idsTrinaryOperator, .joinControl, .logicalOrderException, .lowercase, + .math, .noncharacterCodePoint, .patternSyntax, .patternWhitespace, + 
.quotationMark, .radical, .regionalIndicator, .softDotted, + .sentenceTerminal, .terminalPunctuation, .unifiedIdiograph, .uppercase, + .variationSelector, .whitespace, .xidContinue, .xidStart: + break + + case .emojiModifierBase, .emojiModifier, .emoji, .emojiPresentation: + // These are available on macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1. + // TODO: We should ideally check deployment target for such conditionally + // available properties. + break + + case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: + throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) + + case .bidiControl, .compositionExclusion, .emojiComponent, + .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic, + .otherDefaultIgnorableCodePoint, .otherGraphemeExtended, + .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath, + .otherUppercase, .prependedConcatenationMark: + throw error(.unsupported(prop.rawValue.quoted), at: loc) + } + } + + func validateCharacterProperty( + _ prop: AST.Atom.CharacterProperty, at loc: SourceLocation + ) throws { + // TODO: We could re-add the .other case to diagnose unknown properties + // here instead of in the parser. + // TODO: Should we store an 'inner location' for the contents of `\p{...}`? + switch prop.kind { + case .binary(let b, _): + try validateBinaryProperty(b, at: loc) + case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, + .scriptExtension: + break + case .pcreSpecial: + throw error(.unsupported("PCRE property"), at: loc) + case .onigurumaSpecial: + throw error(.unsupported("Unicode block property"), at: loc) + } + } + + func validateEscaped( + _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation + ) throws { + switch esc { + case .resetStartOfMatch, .singleDataUnit, .horizontalWhitespace, + .notHorizontalWhitespace, .verticalTab, .notVerticalTab, + // '\N' needs to be emitted using 'emitAny'. 
+ .notNewline: + throw error(.unsupported("'\\\(esc.character)'"), at: loc) + + // Character classes. + case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, + .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar: + // TODO: What about scalar matching mode for .graphemeCluster? We + // currently crash at runtime. + break + + case .newlineSequence: + break + + // Assertions. + case .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, .textSegment, + .notTextSegment, .firstMatchingPositionInSubject: + break + + // Literal escapes. + case .alarm, .backspace, .escape, .formfeed, .newline, .carriageReturn, + .tab: + break + } + } + + func validateAtom(_ atom: AST.Atom) throws { + switch atom.kind { + case .escaped(let esc): + try validateEscaped(esc, at: atom.location) + + case .keyboardControl, .keyboardMeta, .keyboardMetaControl: + // We need to implement the scalar computations for these. + throw error(.unsupported("control sequence"), at: atom.location) + + case .property(let p): + try validateCharacterProperty(p, at: atom.location) + + case .backreference(let r): + try validateReference(r) + + case .subpattern: + throw error(.unsupported("subpattern"), at: atom.location) + + case .callout: + // These are PCRE and Oniguruma specific, supporting them is future work. + throw error(.unsupported("callout"), at: atom.location) + + case .backtrackingDirective: + // These are PCRE-specific, and are unlikely to be fully supported. + throw error(.unsupported("backtracking directive"), at: atom.location) + + case .changeMatchingOptions(let opts): + try validateMatchingOptions(opts) + + case .namedCharacter: + // TODO: We should error on unknown Unicode scalar names. 
+ break + + case .char, .scalar, .startOfLine, .endOfLine, .any: + break + } + } + + func validateCustomCharacterClass(_ c: AST.CustomCharacterClass) throws { + for member in c.members { + try validateCharacterClassMember(member) + } + } + + func validateCharacterClassRange( + _ range: AST.CustomCharacterClass.Range + ) throws { + let lhs = range.lhs + let rhs = range.rhs + + try validateAtom(lhs) + try validateAtom(rhs) + + guard lhs.isValidCharacterClassRangeBound else { + throw error(.invalidCharacterClassRangeOperand, at: lhs.location) + } + guard rhs.isValidCharacterClassRangeBound else { + throw error(.invalidCharacterClassRangeOperand, at: rhs.location) + } + + guard lhs.literalCharacterValue != nil else { + throw error( + .unsupported("character class range operand"), at: lhs.location) + } + + guard rhs.literalCharacterValue != nil else { + throw error( + .unsupported("character class range operand"), at: rhs.location) + } + + // TODO: Validate lhs <= rhs? That may require knowledge of case + // insensitivity though. + } + + func validateCharacterClassMember( + _ member: AST.CustomCharacterClass.Member + ) throws { + switch member { + case .custom(let c): + try validateCustomCharacterClass(c) + + case .range(let r): + try validateCharacterClassRange(r) + + case .atom(let a): + try validateAtom(a) + + case .setOperation(let lhs, _, let rhs): + for lh in lhs { try validateCharacterClassMember(lh) } + for rh in rhs { try validateCharacterClassMember(rh) } + + case .quote, .trivia: + break + } + } + + func validateGroup(_ group: AST.Group) throws { + let kind = group.kind + switch kind.value { + case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead: + break + + case .balancedCapture: + // These are .NET specific, and kinda niche. + throw error(.unsupported("balanced capture"), at: kind.location) + + case .nonCaptureReset: + // We need to figure out how these interact with typed captures. 
+ throw error(.unsupported("branch reset group"), at: kind.location) + + case .atomicNonCapturing: + throw error(.unsupported("atomic group"), at: kind.location) + + case .nonAtomicLookahead: + throw error(.unsupported("non-atomic lookahead"), at: kind.location) + + case .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: + throw error(.unsupported("lookbehind"), at: kind.location) + + case .scriptRun, .atomicScriptRun: + throw error(.unsupported("script run"), at: kind.location) + + case .changeMatchingOptions(let opts): + try validateMatchingOptions(opts) + } + try validateNode(group.child) + } + + func validateQuantification(_ quant: AST.Quantification) throws { + try validateNode(quant.child) + switch quant.amount.value { + case .range(let lhs, let rhs): + guard lhs.value <= rhs.value else { + throw error( + .invalidQuantifierRange(lhs.value, rhs.value), at: quant.location) + } + case .zeroOrMore, .oneOrMore, .zeroOrOne, .exactly, .nOrMore, .upToN: + break + } + } + + func validateNode(_ node: AST.Node) throws { + switch node { + case .alternation(let a): + for branch in a.children { + try validateNode(branch) + } + case .concatenation(let c): + for child in c.children { + try validateNode(child) + } + + case .group(let g): + try validateGroup(g) + + case .conditional(let c): + // Note even once we get runtime support for this, we need to change the + // parsing to incorporate what is specified in the syntax proposal. + throw error(.unsupported("conditional"), at: c.location) + + case .quantification(let q): + try validateQuantification(q) + + case .atom(let a): + try validateAtom(a) + + case .customCharacterClass(let c): + try validateCustomCharacterClass(c) + + case .absentFunction(let a): + // These are Oniguruma specific. + throw error(.unsupported("absent function"), at: a.location) + + case .quote, .trivia, .empty: + break + } + } +} + +/// Check a regex AST for semantic validity. 
+public func validate(_ ast: AST) throws { + try RegexValidator(ast).validate() +} diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 637b1a37a..9c0c3522c 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -513,7 +513,10 @@ extension Unicode.BinaryProperty { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { let consume = consumeFunction(for: opts) - + + // Note if you implement support for any of the below, you need to adjust + // the switch in Sema.swift to not have it be diagnosed as unsupported + // (potentially guarded on deployment version). switch self { case .asciiHexDigit: return consume(propertyScalarPredicate { diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index fc3fd5741..27a24cf46 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -454,9 +454,13 @@ extension AST.Atom.EscapedBuiltin { case .notHorizontalWhitespace: return .horizontalWhitespace.inverted - case .notNewline: return .newlineSequence.inverted case .newlineSequence: return .newlineSequence + // FIXME: This is more like '.' than inverted '\R', as it is affected + // by e.g (*CR). We should therefore really be emitting it through + // emitAny(). For now we treat it as semantically invalid. 
+ case .notNewline: return .newlineSequence.inverted + case .whitespace: return .whitespace case .notWhitespace: return .whitespace.inverted diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index ad78cd5b5..45be547db 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -38,8 +38,8 @@ extension CaptureList.Capture { return Self(optionalDepth: 6) } - static func named(_ name: String) -> Self { - return Self(name: name, optionalDepth: 0) + static func named(_ name: String, opt: Int = 0) -> Self { + return Self(name: name, optionalDepth: opt) } } extension CaptureList { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 83b73fe35..3b7def90b 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -717,7 +717,7 @@ extension RegexTests { firstMatchTest( #"\N{ASTERISK}+"#, input: "123***xyz", match: "***") firstMatchTest( - #"\N {2}"#, input: "123 xyz", match: "3 ") + #"\N {2}"#, input: "123 xyz", match: "3 ", xfail: true) firstMatchTest(#"\N{U+2C}"#, input: "123,xyz", match: ",") firstMatchTest(#"\N{U+1F4BF}"#, input: "123💿xyz", match: "💿") @@ -1014,7 +1014,7 @@ extension RegexTests { firstMatchTest( #"a(?:b)c"#, input: "123abcxyz", match: "abc") firstMatchTest( - "(?|(a)|(b)|(c))", input: "123abcxyz", match: "a") + "(?|(a)|(b)|(c))", input: "123abcxyz", match: "a", xfail: true) firstMatchTest( #"(?:a|.b)c"#, input: "123abcacxyz", match: "abc") @@ -1130,6 +1130,8 @@ extension RegexTests { firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true) firstMatchTest(#"(?<a>.)(.)\k<a>"#, input: "abac", match: "aba", xfail: true) firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true) + + firstMatchTest(#"\1(.)"#, input: "112", match: nil) } func testMatchExamples() { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index eed96becc..0ff96fa0b 100644 ---
a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -33,30 +33,56 @@ extension AST.CustomCharacterClass.Member: ExpressibleByExtendedGraphemeClusterL } } +enum SemanticErrorKind { + case unsupported, invalid +} class RegexTests: XCTestCase {} func parseTest( _ input: String, _ expectedAST: AST.Node, + throwsError errorKind: SemanticErrorKind? = nil, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { parseTest( - input, .init(expectedAST, globalOptions: nil), syntax: syntax, - captures: expectedCaptures, file: file, line: line + input, .init(expectedAST, globalOptions: nil), throwsError: errorKind, + syntax: syntax, captures: expectedCaptures, file: file, line: line ) } func parseTest( _ input: String, _ expectedAST: AST, + throwsError errorKind: SemanticErrorKind? = nil, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(input, syntax) + let ast: AST + do { + ast = try parse(input, errorKind != nil ? .syntactic : .semantic, syntax) + } catch { + XCTFail("unexpected error: \(error)", file: file, line: line) + return + } + if let errorKind = errorKind { + do { + _ = try parse(input, .semantic, syntax) + XCTFail("expected semantically invalid AST", file: file, line: line) + } catch let e as Source.LocatedError { + switch e.error { + case .unsupported: + XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) + default: + XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) + } + } catch { + XCTFail("Error without source location: \(error)", file: file, line: line) + } + } guard ast == expectedAST || ast._dump() == expectedAST._dump() // EQ workaround else { @@ -143,15 +169,37 @@ func delimiterLexingTest( /// true, there may be additional characters that follow the literal that are /// not considered part of it. 
func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false, - file: StaticString = #file, line: UInt = #line + _ input: String, _ expecting: AST.Node, + throwsError errorKind: SemanticErrorKind? = nil, + ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line ) { // First try lexing. let literal = delimiterLexingTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let orig = try! parseWithDelimiters(literal) - let ast = orig.root + let ast: AST.Node + do { + ast = try parseWithDelimiters( + literal, errorKind != nil ? .syntactic : .semantic).root + } catch { + XCTFail("unexpected error: \(error)", file: file, line: line) + return + } + if let errorKind = errorKind { + do { + _ = try parseWithDelimiters(input, .semantic) + XCTFail("expected semantically invalid AST", file: file, line: line) + } catch let e as Source.LocatedError { + switch e.error { + case .unsupported: + XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) + default: + XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) + } + } catch { + XCTFail("Error without source location: \(error)", file: file, line: line) + } + } guard ast == expecting || ast._dump() == expecting._dump() // EQ workaround else { @@ -437,6 +485,12 @@ extension RegexTests { parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) + // FIXME: '\N' should be emitted through 'emitAny', not through the + // _CharacterClassModel model. 
+ parseTest(#"\N"#, escaped(.notNewline), throwsError: .unsupported) + + parseTest(#"\R"#, escaped(.newlineSequence)) + parseTest( "[-|$^:?+*())(*-+-]", charClass( @@ -595,10 +649,12 @@ extension RegexTests { range_m(.keyboardControl("A"), .keyboardControl("B")), range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), range_m(.keyboardMeta("A"), .keyboardMeta("B")) - )) + ), throwsError: .unsupported) - parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( - range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE")))) + parseTest( + #"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( + range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))), + throwsError: .unsupported) // MARK: Operators @@ -691,13 +747,13 @@ extension RegexTests { parseTest(#"\\#u{3000}"#, "\u{3000}") // Control and meta controls. - parseTest(#"\c "#, atom(.keyboardControl(" "))) - parseTest(#"\c!"#, atom(.keyboardControl("!"))) - parseTest(#"\c~"#, atom(.keyboardControl("~"))) - parseTest(#"\C--"#, atom(.keyboardControl("-"))) - parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a"))) - parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-"))) - parseTest(#"\M-a"#, atom(.keyboardMeta("a"))) + parseTest(#"\c "#, atom(.keyboardControl(" ")), throwsError: .unsupported) + parseTest(#"\c!"#, atom(.keyboardControl("!")), throwsError: .unsupported) + parseTest(#"\c~"#, atom(.keyboardControl("~")), throwsError: .unsupported) + parseTest(#"\C--"#, atom(.keyboardControl("-")), throwsError: .unsupported) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")), throwsError: .unsupported) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")), throwsError: .unsupported) + parseTest(#"\M-a"#, atom(.keyboardMeta("a")), throwsError: .unsupported) // MARK: Comments @@ -734,6 +790,9 @@ extension RegexTests { parseTest( #"a{0,0}"#, quantRange(0...0, of: "a")) + parseTest( + #"a{1,1}"#, + quantRange(1...1, of: "a")) // Make sure ranges get treated as literal if invalid. 
parseTest("{", "{") @@ -786,11 +845,42 @@ extension RegexTests { // Balanced captures parseTest(#"(?<a-c>)"#, balancedCapture(name: "a", priorName: "c", empty()), - captures: [.named("a")]) + throwsError: .unsupported, captures: [.named("a")]) parseTest(#"(?<-c>)"#, balancedCapture(name: nil, priorName: "c", empty()), - captures: [.cap]) + throwsError: .unsupported, captures: [.cap]) parseTest(#"(?'a-b'c)"#, balancedCapture(name: "a", priorName: "b", "c"), - captures: [.named("a")]) + throwsError: .unsupported, captures: [.named("a")]) + + // Capture resets. + // FIXME: The captures in each branch should be unified. For now, we don't + // treat any capture reset as semantically valid. + parseTest( + "(?|(a)|(b))", + nonCaptureReset(alt(capture("a"), capture("b"))), + throwsError: .unsupported, captures: [.opt, .opt] + ) + parseTest( + "(?|(?<x>a)|(b))", + nonCaptureReset(alt(namedCapture("x", "a"), capture("b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .opt] + ) + parseTest( + "(?|(a)|(?<x>b))", + nonCaptureReset(alt(capture("a"), namedCapture("x", "b"))), + throwsError: .unsupported, captures: [.opt, .named("x", opt: 1)] + ) + parseTest( + "(?|(?<x>a)|(?<x>b))", + nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("x", "b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .named("x", opt: 1)] + ) + + // TODO: Reject mismatched names? 
+ parseTest( + "(?|(?a)|(?b))", + nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("y", "b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .named("y", opt: 1)] + ) // Other groups parseTest( @@ -798,13 +888,13 @@ extension RegexTests { concat("a", nonCapture("b"), "c")) parseTest( #"a(?|b)c"#, - concat("a", nonCaptureReset("b"), "c")) + concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported) parseTest( #"a(?>b)c"#, - concat("a", atomicNonCapturing("b"), "c")) + concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) parseTest( "a(*atomic:b)c", - concat("a", atomicNonCapturing("b"), "c")) + concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) parseTest("a(?=b)c", concat("a", lookahead("b"), "c")) parseTest("a(*pla:b)c", concat("a", lookahead("b"), "c")) @@ -815,31 +905,42 @@ extension RegexTests { parseTest("a(*negative_lookahead:b)c", concat("a", negativeLookahead("b"), "c")) - parseTest("a(?<=b)c", concat("a", lookbehind("b"), "c")) - parseTest("a(*plb:b)c", concat("a", lookbehind("b"), "c")) - parseTest("a(*positive_lookbehind:b)c", concat("a", lookbehind("b"), "c")) - - parseTest("a(?"#, backreference(.relative(4))) - parseTest(#"\k<2>"#, backreference(.absolute(2))) - parseTest(#"\k'-3'"#, backreference(.relative(-3))) - parseTest(#"\k'1'"#, backreference(.absolute(1))) - - parseTest(#"\k{a0}"#, backreference(.named("a0"))) - parseTest(#"\k"#, backreference(.named("bc"))) - parseTest(#"\g{abc}"#, backreference(.named("abc"))) - parseTest(#"(?P=abc)"#, backreference(.named("abc"))) + parseTest(#"\113"#, backreference(.absolute(113)), throwsError: .invalid) + parseTest(#"\377"#, backreference(.absolute(377)), throwsError: .invalid) + parseTest(#"\81"#, backreference(.absolute(81)), throwsError: .invalid) + + parseTest(#"\g1"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g001"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g52"#, 
backreference(.absolute(52)), throwsError: .invalid) + parseTest(#"\g-01"#, backreference(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g+30"#, backreference(.relative(30)), throwsError: .unsupported) + + parseTest(#"\g{1}"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g{001}"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g{52}"#, backreference(.absolute(52)), throwsError: .invalid) + parseTest(#"\g{-01}"#, backreference(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g{+30}"#, backreference(.relative(30)), throwsError: .unsupported) + parseTest(#"\k<+4>"#, backreference(.relative(4)), throwsError: .unsupported) + parseTest(#"\k<2>"#, backreference(.absolute(2)), throwsError: .invalid) + parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported) + parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid) + + parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("bc")), throwsError: .unsupported) + parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .unsupported) + parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .unsupported) // Oniguruma recursion levels. 
- parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0)) - parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0)) - parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1)) - parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8)) - parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8)) - parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8)) - parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8)) - parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8)) - - parseTest(#"(?R)"#, subpattern(.recurseWholePattern)) - parseTest(#"(?0)"#, subpattern(.recurseWholePattern)) - parseTest(#"(?1)"#, subpattern(.absolute(1))) - parseTest(#"(?+12)"#, subpattern(.relative(12))) - parseTest(#"(?-2)"#, subpattern(.relative(-2))) - parseTest(#"(?&hello)"#, subpattern(.named("hello"))) - parseTest(#"(?P>P)"#, subpattern(.named("P"))) + parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported) + parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .invalid) + parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .invalid) + parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8), throwsError: .unsupported) + + parseTest(#"(?R)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) + parseTest(#"(?0)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) + parseTest(#"(?1)"#, subpattern(.absolute(1)), throwsError: .unsupported) + 
parseTest(#"(?+12)"#, subpattern(.relative(12)), throwsError: .unsupported) + parseTest(#"(?-2)"#, subpattern(.relative(-2)), throwsError: .unsupported) + parseTest(#"(?&hello)"#, subpattern(.named("hello")), throwsError: .unsupported) + parseTest(#"(?P>P)"#, subpattern(.named("P")), throwsError: .unsupported) parseTest(#"[(?R)]"#, charClass("(", "?", "R", ")")) parseTest(#"[(?&a)]"#, charClass("(", "?", "&", "a", ")")) parseTest(#"[(?1)]"#, charClass("(", "?", "1", ")")) - parseTest(#"\g<1>"#, subpattern(.absolute(1))) - parseTest(#"\g<001>"#, subpattern(.absolute(1))) - parseTest(#"\g'52'"#, subpattern(.absolute(52))) - parseTest(#"\g'-01'"#, subpattern(.relative(-1))) - parseTest(#"\g'+30'"#, subpattern(.relative(30))) - parseTest(#"\g'abc'"#, subpattern(.named("abc"))) + parseTest(#"\g<1>"#, subpattern(.absolute(1)), throwsError: .unsupported) + parseTest(#"\g<001>"#, subpattern(.absolute(1)), throwsError: .unsupported) + parseTest(#"\g'52'"#, subpattern(.absolute(52)), throwsError: .unsupported) + parseTest(#"\g'-01'"#, subpattern(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported) + parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported) // Backreferences are not valid in custom character classes. parseTest(#"[\8]"#, charClass("8")) parseTest(#"[\9]"#, charClass("9")) + // These are valid references. + parseTest(#"()\1"#, concat( + capture(empty()), backreference(.absolute(1)) + ), captures: [.cap]) + parseTest(#"\1()"#, concat( + backreference(.absolute(1)), capture(empty()) + ), captures: [.cap]) + parseTest(#"()()\2"#, concat( + capture(empty()), capture(empty()), backreference(.absolute(2)) + ), captures: [.cap, .cap]) + parseTest(#"()\2()"#, concat( + capture(empty()), backreference(.absolute(2)), capture(empty()) + ), captures: [.cap, .cap]) + // MARK: Character names. 
parseTest(#"\N{abc}"#, atom(.namedCharacter("abc"))) @@ -1137,7 +1252,7 @@ extension RegexTests { parseTest(#"\N{abc}+"#, oneOrMore(of: atom(.namedCharacter("abc")))) parseTest( #"\N {2}"#, - concat(atom(.escaped(.notNewline)), exactly(2, of: " ")) + concat(atom(.escaped(.notNewline)), exactly(2, of: " ")), throwsError: .unsupported ) parseTest(#"\N{AA}"#, atom(.namedCharacter("AA"))) @@ -1203,13 +1318,13 @@ extension RegexTests { parseTest(#"\p{isAlphabetic}"#, prop(.binary(.alphabetic))) parseTest(#"\p{isAlpha=isFalse}"#, prop(.binary(.alphabetic, value: false))) - parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic))) + parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic)), throwsError: .unsupported) - parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric))) - parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace))) - parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace))) - parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed))) - parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord))) + parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), throwsError: .unsupported) + parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), throwsError: .unsupported) + parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace)), throwsError: .unsupported) + parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed)), throwsError: .unsupported) + parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord)), throwsError: .unsupported) parseTest(#"\p{alnum}"#, prop(.posix(.alnum))) parseTest(#"\p{is_alnum}"#, prop(.posix(.alnum))) @@ -1229,45 +1344,45 @@ extension RegexTests { // MARK: Conditionals parseTest(#"(?(1))"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)|)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), 
throwsError: .unsupported) parseTest(#"(?(1)a)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)a|)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b")) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(1)a|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(1)(a|b|c)|d)"#, conditional( .groupMatched(ref(1)), trueBranch: capture(alt("a", "b", "c")), falseBranch: "d" - ), captures: [.opt]) + ), throwsError: .unsupported, captures: [.opt]) parseTest(#"(?(+3))"#, conditional( - .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(-21))"#, conditional( - .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) // Oniguruma recursion levels. 
parseTest(#"(?(1+1))"#, conditional( .groupMatched(ref(1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(-1+1))"#, conditional( .groupMatched(ref(minus: 1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(1-3))"#, conditional( .groupMatched(ref(1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(+1-3))"#, conditional( .groupMatched(ref(plus: 1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest( #"(?<a>)(?(a+5))"#, @@ -1275,7 +1390,7 @@ extension RegexTests { .groupMatched(ref("a", recursionLevel: 5)), trueBranch: empty(), falseBranch: empty() )), - captures: [.named("a")] + throwsError: .unsupported, captures: [.named("a")] ) parseTest( #"(?<a1>)(?(a1-5))"#, @@ -1283,50 +1398,50 @@ extension RegexTests { .groupMatched(ref("a1", recursionLevel: -5)), trueBranch: empty(), falseBranch: empty() )), - captures: [.named("a1")] + throwsError: .unsupported, captures: [.named("a1")] ) parseTest(#"(?(1))?"#, zeroOrOne(of: conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()))) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())), throwsError: .unsupported) parseTest(#"(?(R)a|b)"#, conditional( - .recursionCheck, trueBranch: "a", falseBranch: "b")) + .recursionCheck, trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(R1))"#, conditional( - .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(R&abc)a|b)"#, conditional( - .groupRecursionCheck(ref("abc")), 
trueBranch: "a", falseBranch: "b")) + .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?()a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?('abc')a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(abc)a|b)"#, conditional( groupCondition(.capture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), captures: [.cap]) + ), throwsError: .unsupported, captures: [.cap]) parseTest(#"(?(?:abc)a|b)"#, conditional( groupCondition(.nonCapture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?=abc)a|b)"#, conditional( groupCondition(.lookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?!abc)a|b)"#, conditional( groupCondition(.negativeLookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?<=abc)a|b)"#, conditional( groupCondition(.lookbehind, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?y)(?(xxx)a|b)"#, concat( namedCapture("xxx", "y"), conditional(.groupMatched(ref("xxx")), trueBranch: "a", falseBranch: "b") - ), captures: [.named("xxx")]) + ), throwsError: .unsupported, captures: [.named("xxx")]) parseTest(#"(?(1)(?(2)(?(3)))|a)"#, conditional( .groupMatched(ref(1)), @@ -1356,115 +1471,119 @@ extension RegexTests { trueBranch: empty(), falseBranch: empty()), falseBranch: empty()), - falseBranch: "a")) + falseBranch: "a"), throwsError: .unsupported) parseTest(#"(?(DEFINE))"#, conditional( - .defineGroup, trueBranch: empty(), falseBranch: empty())) + .defineGroup, 
trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(VERSION>=3.1))"#, conditional( pcreVersionCheck(.greaterThanOrEqual, 3, 1), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(VERSION=0.1))"#, conditional( pcreVersionCheck(.equal, 0, 1), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) // MARK: Callouts // PCRE callouts - parseTest(#"(?C)"#, pcreCallout(.number(0))) - parseTest(#"(?C0)"#, pcreCallout(.number(0))) - parseTest(#"(?C20)"#, pcreCallout(.number(20))) - parseTest("(?C{abc})", pcreCallout(.string("abc"))) + parseTest(#"(?C)"#, pcreCallout(.number(0)), throwsError: .unsupported) + parseTest(#"(?C0)"#, pcreCallout(.number(0)), throwsError: .unsupported) + parseTest(#"(?C20)"#, pcreCallout(.number(20)), throwsError: .unsupported) + parseTest("(?C{abc})", pcreCallout(.string("abc")), throwsError: .unsupported) for delim in ["`", "'", "\"", "^", "%", "#", "$"] { - parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello"))) + parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello")), + throwsError: .unsupported) } // Oniguruma named callouts - parseTest("(*X)", onigurumaNamedCallout("X")) - parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t")) - parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b")) - parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b")) - parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c")) - parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c")) - parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>")) - parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c")) + parseTest("(*X)", onigurumaNamedCallout("X"), throwsError: .unsupported) + parseTest("(*foo[t])", 
onigurumaNamedCallout("foo", tag: "t"), throwsError: .unsupported) + parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b"), throwsError: .unsupported) + parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b"), throwsError: .unsupported) + parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c"), throwsError: .unsupported) + parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c"), throwsError: .unsupported) + parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>"), throwsError: .unsupported) + parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c"), throwsError: .unsupported) // Oniguruma 'of contents' callouts - parseTest("(?{x})", onigurumaCalloutOfContents("x")) - parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y")) - parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x")) - parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag")) - parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction)) - parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both)) - parseTest("(?{x}>)", onigurumaCalloutOfContents("x")) - parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x")) - parseTest("(?{\\})", onigurumaCalloutOfContents("\\")) + parseTest("(?{x})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y"), throwsError: .unsupported) + parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag"), throwsError: .unsupported) + parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction), throwsError: .unsupported) + parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both), throwsError: .unsupported) + parseTest("(?{x}>)", onigurumaCalloutOfContents("x"), 
throwsError: .unsupported) + parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x"), throwsError: .unsupported) + parseTest("(?{\\})", onigurumaCalloutOfContents("\\"), throwsError: .unsupported) // MARK: Backtracking directives - parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept))) + parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept)), throwsError: .unsupported) parseTest( "(*ACCEPT:a)??", - zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")) + zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")), + throwsError: .unsupported ) - parseTest("(*:a)", backtrackingDirective(.mark, name: "a")) - parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a")) - parseTest("(*F)", backtrackingDirective(.fail)) - parseTest("(*COMMIT)", backtrackingDirective(.commit)) - parseTest("(*SKIP)", backtrackingDirective(.skip)) - parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP")) - parseTest("(*PRUNE)", backtrackingDirective(.prune)) - parseTest("(*THEN)", backtrackingDirective(.then)) + parseTest("(*:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) + parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) + parseTest("(*F)", backtrackingDirective(.fail), throwsError: .unsupported) + parseTest("(*COMMIT)", backtrackingDirective(.commit), throwsError: .unsupported) + parseTest("(*SKIP)", backtrackingDirective(.skip), throwsError: .unsupported) + parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP"), throwsError: .unsupported) + parseTest("(*PRUNE)", backtrackingDirective(.prune), throwsError: .unsupported) + parseTest("(*THEN)", backtrackingDirective(.then), throwsError: .unsupported) // MARK: Oniguruma absent functions - parseTest("(?~)", absentRepeater(empty())) - parseTest("(?~abc)", absentRepeater(concat("a", "b", "c"))) - parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a"))) - parseTest("(?~~)", absentRepeater("~")) 
- parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c"))) - parseTest("(?~(a))", absentRepeater(capture("a")), captures: []) - parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty()))) - - parseTest("(?~|abc)", absentStopper(concat("a", "b", "c"))) - parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a"))) - parseTest("(?~|~)", absentStopper("~")) - parseTest("(?~|(a))", absentStopper(capture("a")), captures: []) - parseTest("(?~|a){2}", exactly(2, of: absentStopper("a"))) - - parseTest("(?~|a|b)", absentExpression("a", "b")) - parseTest("(?~|~|~)", absentExpression("~", "~")) + parseTest("(?~)", absentRepeater(empty()), throwsError: .unsupported) + parseTest("(?~abc)", absentRepeater(concat("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a")), throwsError: .unsupported) + parseTest("(?~~)", absentRepeater("~"), throwsError: .unsupported) + parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~(a))", absentRepeater(capture("a")), throwsError: .unsupported, captures: []) + parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty())), throwsError: .unsupported) + + parseTest("(?~|abc)", absentStopper(concat("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a")), throwsError: .unsupported) + parseTest("(?~|~)", absentStopper("~"), throwsError: .unsupported) + parseTest("(?~|(a))", absentStopper(capture("a")), throwsError: .unsupported, captures: []) + parseTest("(?~|a){2}", exactly(2, of: absentStopper("a")), throwsError: .unsupported) + + parseTest("(?~|a|b)", absentExpression("a", "b"), throwsError: .unsupported) + parseTest("(?~|~|~)", absentExpression("~", "~"), throwsError: .unsupported) parseTest("(?~|(a)|(?:b))", absentExpression(capture("a"), nonCapture("b")), - captures: []) + throwsError: .unsupported, captures: []) parseTest("(?~|(a)|(?:(b)|c))", absentExpression( capture("a"), nonCapture(alt(capture("b"), "c")) - 
), captures: [.opt]) - parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b"))) + ), throwsError: .unsupported, captures: [.opt]) + parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b")), throwsError: .unsupported) - parseTest("(?~|)", absentRangeClear()) + parseTest("(?~|)", absentRangeClear(), throwsError: .unsupported) // TODO: It's not really clear what this means, but Oniguruma parses it... // Maybe we should diagnose it? - parseTest("(?~|)+", oneOrMore(of: absentRangeClear())) + parseTest("(?~|)+", oneOrMore(of: absentRangeClear()), throwsError: .unsupported) // MARK: Global matching options parseTest("(*CR)(*UTF)(*LIMIT_DEPTH=3)", ast( empty(), opts: .newlineMatching(.carriageReturnOnly), .utfMode, .limitDepth(.init(faking: 3)) - )) + ), throwsError: .unsupported) parseTest( - "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode))) + "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode)), + throwsError: .unsupported) parseTest( "(*BSR_ANYCRLF)", ast( - empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed))) + empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed)), + throwsError: .unsupported) // TODO: Diagnose on multiple line matching modes? 
parseTest( @@ -1472,7 +1591,7 @@ extension RegexTests { ast(empty(), opts: [ .carriageReturnOnly, .linefeedOnly, .carriageAndLinefeedOnly, .anyCarriageReturnOrLinefeed, .anyUnicode, .nulCharacter - ].map { .newlineMatching($0) })) + ].map { .newlineMatching($0) }), throwsError: .unsupported) parseTest( """ @@ -1485,7 +1604,7 @@ extension RegexTests { .limitMatch(.init(faking: 2)), .notEmpty, .notEmptyAtStart, .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, .unicodeProperties - ) + ), throwsError: .unsupported ) parseTest("[(*CR)]", charClass("(", "*", "C", "R", ")")) @@ -1699,7 +1818,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageReturnOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1710,7 +1829,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1721,7 +1840,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1732,7 +1851,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1743,7 +1862,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1754,7 +1873,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1765,7 +1884,7 @@ extension RegexTests { # h """, 
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1776,7 +1895,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1787,7 +1906,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1808,7 +1927,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1819,7 +1938,7 @@ extension RegexTests { # h """, ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1830,7 +1949,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1844,7 +1963,7 @@ extension RegexTests { opts: .newlineMatching(.carriageReturnOnly), .newlineMatching(.nulCharacter) ), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) // MARK: Parse with delimiters @@ -1947,30 +2066,37 @@ extension RegexTests { #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) parseWithDelimitersTest( #"re'(?'a_bcA0-c1A'x*)'"#, - balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")), + throwsError: .unsupported) parseWithDelimitersTest( #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) parseWithDelimitersTest( 
#"re'(?('a_bcA0')x|y)'"#, conditional( - .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) + .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"), + throwsError: .unsupported + ) parseWithDelimitersTest( #"re'(?('+20')\')'"#, conditional( - .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty())) - + .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()), + throwsError: .unsupported + ) parseWithDelimitersTest( - #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A")))) + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .unsupported) parseWithDelimitersTest( - #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1)) + #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1), + throwsError: .unsupported + ) parseWithDelimitersTest( - #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A")))) + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), throwsError: .unsupported) parseWithDelimitersTest( - #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'")) + #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"), throwsError: .unsupported) parseWithDelimitersTest( - #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#))) + #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)), + throwsError: .unsupported) // Fine, because we don't end up skipping. 
delimiterLexingTest(#"re'(?'"#) @@ -2314,6 +2440,8 @@ extension RegexTests { diagnosticTest("[[::]]", .emptyProperty) diagnosticTest("[[:=:]]", .emptyProperty) + diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) @@ -2419,6 +2547,7 @@ extension RegexTests { diagnosticTest("*?", .quantifierRequiresOperand("*?")) diagnosticTest("{5}", .quantifierRequiresOperand("{5}")) diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) + diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2)) // MARK: Unicode scalars @@ -2458,6 +2587,16 @@ extension RegexTests { diagnosticTest(#"\k"#, .expectedNumber("", kind: .decimal)) diagnosticTest(#"\k<1+>"#, .expectedNumber("", kind: .decimal)) + diagnosticTest(#"()\k<1+1>"#, .unsupported("recursion level")) + diagnosticTest(#"()\k<1-1>"#, .unsupported("recursion level")) + + diagnosticTest(#"\k<0>"#, .cannotReferToWholePattern) + diagnosticTest(#"\1"#, .invalidReference(1)) + diagnosticTest(#"(?:)\1"#, .invalidReference(1)) + diagnosticTest(#"()\2"#, .invalidReference(2)) + diagnosticTest(#"\2()"#, .invalidReference(2)) + diagnosticTest(#"(?:)()\2"#, .invalidReference(2)) + diagnosticTest(#"(?:)(?:)\2"#, .invalidReference(2)) // MARK: Conditionals @@ -2560,5 +2699,7 @@ extension RegexTests { func testCompilerInterfaceDiagnostics() { compilerInterfaceDiagnosticMessageTest( "#/[x*/#", "cannot parse regular expression: expected ']'") + compilerInterfaceDiagnosticMessageTest( + "/a{3,2}/", "cannot parse regular expression: range lower bound '3' must be less than or equal to upper bound '2'") } } diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index eff9f9b4e..145087ee7 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -444,7 +444,7 @@ extension UTS18Tests { func testIndividuallyNamedCharacters_XFail() { XCTExpectFailure("Need to support named chars in custom character classes") { - 
XCTFail("\(regex(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#))") + XCTFail(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#) // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) } From 466b375a82627c51a02d1d1c30ef15f9b0aeaf34 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 9 May 2022 17:15:44 +0100 Subject: [PATCH 3/4] Validate capture lists Begin storing source location on capture lists, and start erroring on duplicate named captures. --- .../Regex/Parse/CaptureList.swift | 15 +++++++---- Sources/_RegexParser/Regex/Parse/Sema.swift | 12 +++++++++ Sources/_StringProcessing/Regex/DSLTree.swift | 2 +- Tests/RegexTests/CaptureTests.swift | 26 ++++++++++++------- Tests/RegexTests/ParseTests.swift | 10 +++++-- 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/CaptureList.swift b/Sources/_RegexParser/Regex/Parse/CaptureList.swift index d112b2010..0287e7337 100644 --- a/Sources/_RegexParser/Regex/Parse/CaptureList.swift +++ b/Sources/_RegexParser/Regex/Parse/CaptureList.swift @@ -26,15 +26,18 @@ extension CaptureList { public var name: String? public var type: Any.Type? public var optionalDepth: Int + public var location: SourceLocation public init( name: String? = nil, type: Any.Type? 
= nil, - optionalDepth: Int + optionalDepth: Int, + _ location: SourceLocation ) { self.name = name self.type = type self.optionalDepth = optionalDepth + self.location = location } } } @@ -61,13 +64,14 @@ extension AST.Node { case let .group(g): switch g.kind.value { case .capture: - list.append(.init(optionalDepth: nesting)) + list.append(.init(optionalDepth: nesting, g.location)) case .namedCapture(let name): - list.append(.init(name: name.value, optionalDepth: nesting)) + list.append(.init(name: name.value, optionalDepth: nesting, g.location)) case .balancedCapture(let b): - list.append(.init(name: b.name?.value, optionalDepth: nesting)) + list.append(.init(name: b.name?.value, optionalDepth: nesting, + g.location)) default: break } @@ -124,7 +128,8 @@ extension CaptureList.Capture: Equatable { public static func == (lhs: Self, rhs: Self) -> Bool { lhs.name == rhs.name && lhs.optionalDepth == rhs.optionalDepth && - lhs.type == rhs.type + lhs.type == rhs.type && + lhs.location == rhs.location } } extension CaptureList: Equatable {} diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 32859812c..9d3f037f7 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -34,6 +34,7 @@ extension RegexValidator { for opt in ast.globalOptions?.options ?? [] { try validateGlobalMatchingOption(opt) } + try validateCaptures() try validateNode(ast.root) } @@ -59,6 +60,17 @@ extension RegexValidator { } } + func validateCaptures() throws { + // TODO: Should this be validated when creating the capture list? 
+ var usedNames = Set() + for capture in captures.captures { + guard let name = capture.name else { continue } + guard usedNames.insert(name).inserted else { + throw error(.duplicateNamedCapture(name), at: capture.location) + } + } + } + func validateReference(_ ref: AST.Reference) throws { switch ref.kind { case .absolute(let i): diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index b279c08e4..ff057f2ee 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -472,7 +472,7 @@ extension DSLTree.Node { list.append(.init( name: name, type: child.valueCaptureType?.base, - optionalDepth: nesting)) + optionalDepth: nesting, .fake)) child._addCaptures(to: &list, optionalNesting: nesting) case let .nonCapturingGroup(kind, child): diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 45be547db..9efbf2f76 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -16,36 +16,44 @@ import XCTest extension CaptureList.Capture { static var cap: Self { - return Self(optionalDepth: 0) + return Self(optionalDepth: 0, .fake) } static var opt: Self { - return Self(optionalDepth: 1) + return Self(optionalDepth: 1, .fake) } static var opt_opt: Self { - return Self(optionalDepth: 2) + return Self(optionalDepth: 2, .fake) } static var opt_opt_opt: Self { - return Self(optionalDepth: 3) + return Self(optionalDepth: 3, .fake) } static var opt_opt_opt_opt: Self { - return Self(optionalDepth: 4) + return Self(optionalDepth: 4, .fake) } static var opt_opt_opt_opt_opt: Self { - return Self(optionalDepth: 5) + return Self(optionalDepth: 5, .fake) } static var opt_opt_opt_opt_opt_opt: Self { - return Self(optionalDepth: 6) + return Self(optionalDepth: 6, .fake) } static func named(_ name: String, opt: Int = 0) -> Self { - return Self(name: name, optionalDepth: opt) + return Self(name: name, optionalDepth: 
opt, .fake) } } extension CaptureList { static func caps(count: Int) -> Self { Self(Array(repeating: .cap, count: count)) } + + var withoutLocs: Self { + var copy = self + for idx in copy.captures.indices { + copy.captures[idx].location = .fake + } + return copy + } } extension StructuredCapture { @@ -151,7 +159,7 @@ func captureTest( line: UInt = #line ) { let ast = try! parse(regex, .semantic, .traditional) - let capList = ast.root._captureList + let capList = ast.root._captureList.withoutLocs guard capList == expected else { XCTFail(""" Expected: diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 0ff96fa0b..b6decf437 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -94,7 +94,7 @@ func parseTest( file: file, line: line) return } - let captures = ast.captureList + let captures = ast.captureList.withoutLocs guard captures == expectedCaptures else { XCTFail(""" @@ -872,7 +872,7 @@ extension RegexTests { parseTest( "(?|(?a)|(?b))", nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("x", "b"))), - throwsError: .unsupported, captures: [.named("x", opt: 1), .named("x", opt: 1)] + throwsError: .invalid, captures: [.named("x", opt: 1), .named("x", opt: 1)] ) // TODO: Reject mismatched names? @@ -2539,6 +2539,12 @@ extension RegexTests { diagnosticTest("(?x)(? : )", .unknownGroupKind("? 
")) + diagnosticTest("(?)(?)", .duplicateNamedCapture("x")) + diagnosticTest("(?)|(?)", .duplicateNamedCapture("x")) + diagnosticTest("((?))(?)", .duplicateNamedCapture("x")) + diagnosticTest("(|(?))(?)", .duplicateNamedCapture("x")) + diagnosticTest("(?)(?)(?)", .duplicateNamedCapture("x")) + // MARK: Quantifiers diagnosticTest("*", .quantifierRequiresOperand("*")) From c95e8621dc9bfd3aadde0867ed7646b9335ec9a1 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 9 May 2022 20:45:06 +0100 Subject: [PATCH 4/4] Address review feedback - Make `\h` and `\H` supported for now - Check character class ranges - Diagnose unquantifiable escape sequences --- Sources/_RegexParser/Regex/AST/Atom.swift | 19 +++++++++++ .../Regex/Parse/Diagnostics.swift | 9 ++++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 3 -- Sources/_RegexParser/Regex/Parse/Sema.swift | 21 ++++++------ Tests/RegexTests/ParseTests.swift | 32 +++++++++++++++---- 5 files changed, 62 insertions(+), 22 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index d6062115a..9b0f1cb2e 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -668,6 +668,23 @@ extension AST.Atom.EscapedBuiltin { return nil } } + + public var isQuantifiable: Bool { + switch self { + case .alarm, .escape, .formfeed, .newline, .carriageReturn, .tab, + .singleDataUnit, .decimalDigit, .notDecimalDigit, .horizontalWhitespace, + .notHorizontalWhitespace, .notNewline, .newlineSequence, .whitespace, + .notWhitespace, .verticalTab, .notVerticalTab, .wordCharacter, + .notWordCharacter, .backspace, .graphemeCluster, .trueAnychar: + return true + + case .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .textSegment, + .notTextSegment: + return false + } + } } extension AST.Atom { @@ -749,6 +766,8 @@ extension AST.Atom { case 
.changeMatchingOptions: return false // TODO: Are callouts quantifiable? + case .escaped(let esc): + return esc.isQuantifiable default: return true } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 7a8dfe771..0054ae6b6 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -45,7 +45,6 @@ enum ParseError: Error, Hashable { case cannotReferToWholePattern - case notQuantifiable case quantifierRequiresOperand(String) case backtrackingDirectiveMustHaveName(String) @@ -83,6 +82,8 @@ enum ParseError: Error, Hashable { case duplicateNamedCapture(String) case invalidCharacterClassRangeOperand case invalidQuantifierRange(Int, Int) + case invalidCharacterRange(from: Character, to: Character) + case notQuantifiable } extension IdentifierKind { @@ -125,8 +126,6 @@ extension ParseError: CustomStringConvertible { return "invalid escape sequence '\\\(c)'" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" - case .notQuantifiable: - return "expression is not quantifiable" case .quantifierRequiresOperand(let q): return "quantifier '\(q)' must appear after expression" case .backtrackingDirectiveMustHaveName(let b): @@ -191,6 +190,10 @@ extension ParseError: CustomStringConvertible { return "group named '\(str)' already exists" case let .invalidQuantifierRange(lhs, rhs): return "range lower bound '\(lhs)' must be less than or equal to upper bound '\(rhs)'" + case let .invalidCharacterRange(from: lhs, to: rhs): + return "character '\(lhs)' must compare less than or equal to '\(rhs)'" + case .notQuantifiable: + return "expression is not quantifiable" } } } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 2d33e4d7e..112f32358 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -227,9 +227,6 @@ 
extension Parser { if let (amt, kind, trivia) = try source.lexQuantifier(context: context) { let location = loc(_start) - guard operand.isQuantifiable else { - throw Source.LocatedError(ParseError.notQuantifiable, location) - } result.append(.quantification( .init(amt, kind, operand, location, trivia: trivia))) } else { diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 9d3f037f7..f9f2b996a 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -182,17 +182,15 @@ extension RegexValidator { _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation ) throws { switch esc { - case .resetStartOfMatch, .singleDataUnit, .horizontalWhitespace, - .notHorizontalWhitespace, .verticalTab, .notVerticalTab, + case .resetStartOfMatch, .singleDataUnit, .verticalTab, .notVerticalTab, // '\N' needs to be emitted using 'emitAny'. .notNewline: throw error(.unsupported("'\\\(esc.character)'"), at: loc) // Character classes. case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, - .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar: - // TODO: What about scalar matching mode for .graphemeCluster? We - // currently crash at runtime. + .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar, + .horizontalWhitespace, .notHorizontalWhitespace: break case .newlineSequence: @@ -271,18 +269,20 @@ extension RegexValidator { throw error(.invalidCharacterClassRangeOperand, at: rhs.location) } - guard lhs.literalCharacterValue != nil else { + guard let lhsChar = lhs.literalCharacterValue else { throw error( .unsupported("character class range operand"), at: lhs.location) } - guard rhs.literalCharacterValue != nil else { + guard let rhsChar = rhs.literalCharacterValue else { throw error( .unsupported("character class range operand"), at: rhs.location) } - // TODO: Validate lhs <= rhs? That may require knowledge of case - // insensitivity though. 
+ guard lhsChar <= rhsChar else { + throw error( + .invalidCharacterRange(from: lhsChar, to: rhsChar), at: range.dashLoc) + } } func validateCharacterClassMember( @@ -341,6 +341,9 @@ extension RegexValidator { func validateQuantification(_ quant: AST.Quantification) throws { try validateNode(quant.child) + guard quant.child.isQuantifiable else { + throw error(.notQuantifiable, at: quant.child.location) + } switch quant.amount.value { case .range(let lhs, let rhs): guard lhs.value <= rhs.value else { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b6decf437..9dfcff99e 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -503,6 +503,8 @@ extension RegexTests { parseTest("[-a-]", charClass("-", "a", "-")) parseTest("[a-z]", charClass(range_m("a", "z"))) + parseTest("[a-a]", charClass(range_m("a", "a"))) + parseTest("[B-a]", charClass(range_m("B", "a"))) // FIXME: AST builder helpers for custom char class types parseTest("[a-d--a-c]", charClass( @@ -2442,6 +2444,11 @@ extension RegexTests { diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) + diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) + diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) + diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) @@ -2555,6 +2562,17 @@ extension RegexTests { diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2)) + // These are not quantifiable. 
+ diagnosticTest(#"\b?"#, .notQuantifiable) + diagnosticTest(#"\B*"#, .notQuantifiable) + diagnosticTest(#"\A+"#, .notQuantifiable) + diagnosticTest(#"\Z??"#, .notQuantifiable) + diagnosticTest(#"\G*?"#, .notQuantifiable) + diagnosticTest(#"\z+?"#, .notQuantifiable) + diagnosticTest(#"\K{1}"#, .unsupported(#"'\K'"#)) + diagnosticTest(#"\y{2,5}"#, .notQuantifiable) + diagnosticTest(#"\Y{3,}"#, .notQuantifiable) + // MARK: Unicode scalars diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex)) @@ -2641,13 +2659,13 @@ extension RegexTests { diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK")) diagnosticTest("(*:)", .expectedNonEmptyContents) - diagnosticTest("(*MARK:a)?", .notQuantifiable) - diagnosticTest("(*FAIL)+", .notQuantifiable) - diagnosticTest("(*COMMIT:b)*", .notQuantifiable) - diagnosticTest("(*PRUNE:a)??", .notQuantifiable) - diagnosticTest("(*SKIP:a)*?", .notQuantifiable) - diagnosticTest("(*F)+?", .notQuantifiable) - diagnosticTest("(*:a){2}", .notQuantifiable) + diagnosticTest("(*MARK:a)?", .unsupported("backtracking directive")) + diagnosticTest("(*FAIL)+", .unsupported("backtracking directive")) + diagnosticTest("(*COMMIT:b)*", .unsupported("backtracking directive")) + diagnosticTest("(*PRUNE:a)??", .unsupported("backtracking directive")) + diagnosticTest("(*SKIP:a)*?", .unsupported("backtracking directive")) + diagnosticTest("(*F)+?", .unsupported("backtracking directive")) + diagnosticTest("(*:a){2}", .unsupported("backtracking directive")) // MARK: Oniguruma absent functions