diff --git a/Sources/PatternConverter/PatternConverter.swift b/Sources/PatternConverter/PatternConverter.swift index a10698526..497d54506 100644 --- a/Sources/PatternConverter/PatternConverter.swift +++ b/Sources/PatternConverter/PatternConverter.swift @@ -50,7 +50,7 @@ struct PatternConverter: ParsableCommand { print("Converting '\(delim)\(regex)\(delim)'") let ast = try _RegexParser.parse( - regex, + regex, .semantic, experimentalSyntax ? .experimental : .traditional) // Show rendered source ranges diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index d6062115a..9b0f1cb2e 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -668,6 +668,23 @@ extension AST.Atom.EscapedBuiltin { return nil } } + + public var isQuantifiable: Bool { + switch self { + case .alarm, .escape, .formfeed, .newline, .carriageReturn, .tab, + .singleDataUnit, .decimalDigit, .notDecimalDigit, .horizontalWhitespace, + .notHorizontalWhitespace, .notNewline, .newlineSequence, .whitespace, + .notWhitespace, .verticalTab, .notVerticalTab, .wordCharacter, + .notWordCharacter, .backspace, .graphemeCluster, .trueAnychar: + return true + + case .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .textSegment, + .notTextSegment: + return false + } + } } extension AST.Atom { @@ -749,6 +766,8 @@ extension AST.Atom { case .changeMatchingOptions: return false // TODO: Are callouts quantifiable? + case .escaped(let esc): + return esc.isQuantifiable default: return true } diff --git a/Sources/_RegexParser/Regex/Parse/CaptureList.swift b/Sources/_RegexParser/Regex/Parse/CaptureList.swift index d112b2010..0287e7337 100644 --- a/Sources/_RegexParser/Regex/Parse/CaptureList.swift +++ b/Sources/_RegexParser/Regex/Parse/CaptureList.swift @@ -26,15 +26,18 @@ extension CaptureList { public var name: String? 
public var type: Any.Type? public var optionalDepth: Int + public var location: SourceLocation public init( name: String? = nil, type: Any.Type? = nil, - optionalDepth: Int + optionalDepth: Int, + _ location: SourceLocation ) { self.name = name self.type = type self.optionalDepth = optionalDepth + self.location = location } } } @@ -61,13 +64,14 @@ extension AST.Node { case let .group(g): switch g.kind.value { case .capture: - list.append(.init(optionalDepth: nesting)) + list.append(.init(optionalDepth: nesting, g.location)) case .namedCapture(let name): - list.append(.init(name: name.value, optionalDepth: nesting)) + list.append(.init(name: name.value, optionalDepth: nesting, g.location)) case .balancedCapture(let b): - list.append(.init(name: b.name?.value, optionalDepth: nesting)) + list.append(.init(name: b.name?.value, optionalDepth: nesting, + g.location)) default: break } @@ -124,7 +128,8 @@ extension CaptureList.Capture: Equatable { public static func == (lhs: Self, rhs: Self) -> Bool { lhs.name == rhs.name && lhs.optionalDepth == rhs.optionalDepth && - lhs.type == rhs.type + lhs.type == rhs.type && + lhs.location == rhs.location } } extension CaptureList: Equatable {} diff --git a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift index 0856361d8..4ae518dcd 100644 --- a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift +++ b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift @@ -96,7 +96,7 @@ public func swiftCompilerParseRegexLiteral( _ input: String, captureBufferOut: UnsafeMutableRawBufferPointer ) throws -> (regexToEmit: String, version: Int) { do { - let ast = try parseWithDelimiters(input) + let ast = try parseWithDelimiters(input, .semantic) // Serialize the capture structure for later type inference. 
assert(captureBufferOut.count >= input.utf8.count) ast.captureStructure.encode(to: captureBufferOut) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index c3d74c30b..0054ae6b6 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -15,6 +15,8 @@ enum ParseError: Error, Hashable { // TODO: I wonder if it makes sense to store the string. // This can make equality weird. + // MARK: Syntactic Errors + case numberOverflow(String) case expectedNumDigits(String, Int) case expectedNumber(String, kind: RadixKind) @@ -43,7 +45,6 @@ enum ParseError: Error, Hashable { case cannotReferToWholePattern - case notQuantifiable case quantifierRequiresOperand(String) case backtrackingDirectiveMustHaveName(String) @@ -55,7 +56,6 @@ enum ParseError: Error, Hashable { case cannotRemoveMatchingOptionsAfterCaret case expectedCustomCharacterClassMembers - case invalidCharacterClassRangeOperand case emptyProperty case unknownProperty(key: String?, value: String) @@ -73,6 +73,17 @@ enum ParseError: Error, Hashable { case cannotRemoveExtendedSyntaxInMultilineMode case expectedCalloutArgument + + // MARK: Semantic Errors + + case unsupported(String) + case deprecatedUnicode(String) + case invalidReference(Int) + case duplicateNamedCapture(String) + case invalidCharacterClassRangeOperand + case invalidQuantifierRange(Int, Int) + case invalidCharacterRange(from: Character, to: Character) + case notQuantifiable } extension IdentifierKind { @@ -88,6 +99,7 @@ extension IdentifierKind { extension ParseError: CustomStringConvertible { var description: String { switch self { + // MARK: Syntactic Errors case let .numberOverflow(s): return "number overflow: \(s)" case let .expectedNumDigits(s, i): @@ -114,8 +126,6 @@ extension ParseError: CustomStringConvertible { return "invalid escape sequence '\\\(c)'" case .cannotReferToWholePattern: return "cannot refer to whole 
pattern here" - case .notQuantifiable: - return "expression is not quantifiable" case .quantifierRequiresOperand(let q): return "quantifier '\(q)' must appear after expression" case .backtrackingDirectiveMustHaveName(let b): @@ -167,6 +177,23 @@ extension ParseError: CustomStringConvertible { return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" + + // MARK: Semantic Errors + + case let .unsupported(kind): + return "\(kind) is not currently supported" + case let .deprecatedUnicode(kind): + return "\(kind) is a deprecated Unicode property, and is not supported" + case let .invalidReference(i): + return "no capture numbered \(i)" + case let .duplicateNamedCapture(str): + return "group named '\(str)' already exists" + case let .invalidQuantifierRange(lhs, rhs): + return "range lower bound '\(lhs)' must be less than or equal to upper bound '\(rhs)'" + case let .invalidCharacterRange(from: lhs, to: rhs): + return "character '\(lhs)' must compare less than or equal to '\(rhs)'" + case .notQuantifiable: + return "expression is not quantifiable" } } } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 54e46948a..112f32358 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -227,9 +227,6 @@ extension Parser { if let (amt, kind, trivia) = try source.lexQuantifier(context: context) { let location = loc(_start) - guard operand.isQuantifiable else { - throw Source.LocatedError(ParseError.notQuantifiable, location) - } result.append(.quantification( .init(amt, kind, operand, location, trivia: trivia))) } else { @@ -543,11 +540,6 @@ extension Parser { // Range between atoms. 
if let (dashLoc, rhs) = try source.lexCustomCharClassRangeEnd(context: context) { - guard atom.isValidCharacterClassRangeBound && - rhs.isValidCharacterClassRangeBound else { - throw ParseError.invalidCharacterClassRangeOperand - } - // TODO: Validate lower <= upper? members.append(.range(.init(atom, dashLoc, rhs))) continue } @@ -558,13 +550,31 @@ extension Parser { } } +public enum ASTStage { + /// The regex is parsed, and a syntactically valid AST is returned. Otherwise + /// an error is thrown. This is useful for e.g syntax coloring. + case syntactic + + /// The regex is parsed, and a syntactically and semantically valid AST is + /// returned. Otherwise an error is thrown. A semantically valid AST has been + /// checked for e.g unsupported constructs and invalid backreferences. + case semantic +} + public func parse( - _ regex: S, _ syntax: SyntaxOptions + _ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions ) throws -> AST where S.SubSequence == Substring { let source = Source(String(regex)) var parser = Parser(source, syntax: syntax) - return try parser.parse() + let ast = try parser.parse() + switch stage { + case .syntactic: + break + case .semantic: + try validate(ast) + } + return ast } /// Retrieve the default set of syntax options that a delimiter and literal @@ -591,11 +601,12 @@ fileprivate func defaultSyntaxOptions( /// Parses a given regex string with delimiters, inferring the syntax options /// from the delimiters used. public func parseWithDelimiters( - _ regex: S + _ regex: S, _ stage: ASTStage ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) do { - return try parse(contents, defaultSyntaxOptions(delim, contents: contents)) + let syntax = defaultSyntaxOptions(delim, contents: contents) + return try parse(contents, stage, syntax) } catch let error as LocatedErrorProtocol { // Convert the range in 'contents' to the range in 'regex'. 
let delimCount = delim.opening.count diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift new file mode 100644 index 000000000..f9f2b996a --- /dev/null +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -0,0 +1,399 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +/// Validate a regex AST for semantic validity. Once bytecode is emitted at +/// compile time, this could potentially be subsumed by the bytecode generator. +fileprivate struct RegexValidator { + let ast: AST + let captures: CaptureList + + init(_ ast: AST) { + self.ast = ast + self.captures = ast.captureList + } + + func error(_ kind: ParseError, at loc: SourceLocation) -> Error { + Source.LocatedError(kind, loc) + } +} + +extension String { + fileprivate var quoted: String { "'\(self)'" } +} + +extension RegexValidator { + func validate() throws { + for opt in ast.globalOptions?.options ?? [] { + try validateGlobalMatchingOption(opt) + } + try validateCaptures() + try validateNode(ast.root) + } + + func validateGlobalMatchingOption(_ opt: AST.GlobalMatchingOption) throws { + switch opt.kind { + case .limitDepth, .limitHeap, .limitMatch, .notEmpty, .notEmptyAtStart, + .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, + .unicodeProperties: + // These are PCRE specific, and not something we're likely to ever + // support. + throw error(.unsupported("global matching option"), at: opt.location) + + case .newlineMatching: + // We have implemented the correct behavior for multi-line literals, but + // these should also affect '.' 
and '\N' matching, which we haven't + // implemented. + throw error(.unsupported("newline matching mode"), at: opt.location) + + case .newlineSequenceMatching: + // We haven't yet implemented the '\R' matching specifics of these. + throw error( + .unsupported("newline sequence matching mode"), at: opt.location) + } + } + + func validateCaptures() throws { + // TODO: Should this be validated when creating the capture list? + var usedNames = Set<String>() + for capture in captures.captures { + guard let name = capture.name else { continue } + guard usedNames.insert(name).inserted else { + throw error(.duplicateNamedCapture(name), at: capture.location) + } + } + } + + func validateReference(_ ref: AST.Reference) throws { + switch ref.kind { + case .absolute(let i): + guard i <= captures.captures.count else { + throw error(.invalidReference(i), at: ref.innerLoc) + } + case .relative: + throw error(.unsupported("relative capture reference"), at: ref.innerLoc) + case .named: + // TODO: This could be implemented by querying the capture list for an + // index. + throw error(.unsupported("named capture reference"), at: ref.innerLoc) + } + if let recLevel = ref.recursionLevel { + throw error(.unsupported("recursion level"), at: recLevel.location) + } + } + + func validateMatchingOption(_ opt: AST.MatchingOption) throws { + let loc = opt.location + switch opt.kind { + case .allowDuplicateGroupNames: + // Not currently supported as we need to figure out what to do with + // the capture type. 
+ throw error(.unsupported("duplicate group naming"), at: loc) + + case .unicodeWordBoundaries: + throw error(.unsupported("unicode word boundary mode"), at: loc) + + case .textSegmentWordMode, .textSegmentGraphemeMode: + throw error(.unsupported("text segment mode"), at: loc) + + case .byteSemantics: + throw error(.unsupported("byte semantic mode"), at: loc) + + case .caseInsensitive, .possessiveByDefault, .reluctantByDefault, + .unicodeScalarSemantics, .graphemeClusterSemantics, + .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended, + .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps: + break + } + } + + func validateMatchingOptions(_ opts: AST.MatchingOptionSequence) throws { + for opt in opts.adding { + try validateMatchingOption(opt) + } + for opt in opts.removing { + try validateMatchingOption(opt) + } + } + + func validateBinaryProperty( + _ prop: Unicode.BinaryProperty, at loc: SourceLocation + ) throws { + switch prop { + case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable, + .changesWhenCasefolded, .changesWhenCasemapped, + .changesWhenNFKCCasefolded, .changesWhenLowercased, + .changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated, + .defaultIgnorableCodePoint, .diacratic, .extender, + .fullCompositionExclusion, .graphemeBase, .graphemeExtended, .hexDigit, + .idContinue, .ideographic, .idStart, .idsBinaryOperator, + .idsTrinaryOperator, .joinControl, .logicalOrderException, .lowercase, + .math, .noncharacterCodePoint, .patternSyntax, .patternWhitespace, + .quotationMark, .radical, .regionalIndicator, .softDotted, + .sentenceTerminal, .terminalPunctuation, .unifiedIdiograph, .uppercase, + .variationSelector, .whitespace, .xidContinue, .xidStart: + break + + case .emojiModifierBase, .emojiModifier, .emoji, .emojiPresentation: + // These are available on macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1. 
+ // TODO: We should ideally check deployment target for such conditionally + // available properties. + break + + case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: + throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) + + case .bidiControl, .compositionExclusion, .emojiComponent, + .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic, + .otherDefaultIgnorableCodePoint, .otherGraphemeExtended, + .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath, + .otherUppercase, .prependedConcatenationMark: + throw error(.unsupported(prop.rawValue.quoted), at: loc) + } + } + + func validateCharacterProperty( + _ prop: AST.Atom.CharacterProperty, at loc: SourceLocation + ) throws { + // TODO: We could re-add the .other case to diagnose unknown properties + // here instead of in the parser. + // TODO: Should we store an 'inner location' for the contents of `\p{...}`? + switch prop.kind { + case .binary(let b, _): + try validateBinaryProperty(b, at: loc) + case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, + .scriptExtension: + break + case .pcreSpecial: + throw error(.unsupported("PCRE property"), at: loc) + case .onigurumaSpecial: + throw error(.unsupported("Unicode block property"), at: loc) + } + } + + func validateEscaped( + _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation + ) throws { + switch esc { + case .resetStartOfMatch, .singleDataUnit, .verticalTab, .notVerticalTab, + // '\N' needs to be emitted using 'emitAny'. + .notNewline: + throw error(.unsupported("'\\\(esc.character)'"), at: loc) + + // Character classes. + case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, + .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar, + .horizontalWhitespace, .notHorizontalWhitespace: + break + + case .newlineSequence: + break + + // Assertions. 
+ case .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, .textSegment, + .notTextSegment, .firstMatchingPositionInSubject: + break + + // Literal escapes. + case .alarm, .backspace, .escape, .formfeed, .newline, .carriageReturn, + .tab: + break + } + } + + func validateAtom(_ atom: AST.Atom) throws { + switch atom.kind { + case .escaped(let esc): + try validateEscaped(esc, at: atom.location) + + case .keyboardControl, .keyboardMeta, .keyboardMetaControl: + // We need to implement the scalar computations for these. + throw error(.unsupported("control sequence"), at: atom.location) + + case .property(let p): + try validateCharacterProperty(p, at: atom.location) + + case .backreference(let r): + try validateReference(r) + + case .subpattern: + throw error(.unsupported("subpattern"), at: atom.location) + + case .callout: + // These are PCRE and Oniguruma specific, supporting them is future work. + throw error(.unsupported("callout"), at: atom.location) + + case .backtrackingDirective: + // These are PCRE-specific, and are unlikely to be fully supported. + throw error(.unsupported("backtracking directive"), at: atom.location) + + case .changeMatchingOptions(let opts): + try validateMatchingOptions(opts) + + case .namedCharacter: + // TODO: We should error on unknown Unicode scalar names. 
+ break + + case .char, .scalar, .startOfLine, .endOfLine, .any: + break + } + } + + func validateCustomCharacterClass(_ c: AST.CustomCharacterClass) throws { + for member in c.members { + try validateCharacterClassMember(member) + } + } + + func validateCharacterClassRange( + _ range: AST.CustomCharacterClass.Range + ) throws { + let lhs = range.lhs + let rhs = range.rhs + + try validateAtom(lhs) + try validateAtom(rhs) + + guard lhs.isValidCharacterClassRangeBound else { + throw error(.invalidCharacterClassRangeOperand, at: lhs.location) + } + guard rhs.isValidCharacterClassRangeBound else { + throw error(.invalidCharacterClassRangeOperand, at: rhs.location) + } + + guard let lhsChar = lhs.literalCharacterValue else { + throw error( + .unsupported("character class range operand"), at: lhs.location) + } + + guard let rhsChar = rhs.literalCharacterValue else { + throw error( + .unsupported("character class range operand"), at: rhs.location) + } + + guard lhsChar <= rhsChar else { + throw error( + .invalidCharacterRange(from: lhsChar, to: rhsChar), at: range.dashLoc) + } + } + + func validateCharacterClassMember( + _ member: AST.CustomCharacterClass.Member + ) throws { + switch member { + case .custom(let c): + try validateCustomCharacterClass(c) + + case .range(let r): + try validateCharacterClassRange(r) + + case .atom(let a): + try validateAtom(a) + + case .setOperation(let lhs, _, let rhs): + for lh in lhs { try validateCharacterClassMember(lh) } + for rh in rhs { try validateCharacterClassMember(rh) } + + case .quote, .trivia: + break + } + } + + func validateGroup(_ group: AST.Group) throws { + let kind = group.kind + switch kind.value { + case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead: + break + + case .balancedCapture: + // These are .NET specific, and kinda niche. + throw error(.unsupported("balanced capture"), at: kind.location) + + case .nonCaptureReset: + // We need to figure out how these interact with typed captures. 
+ throw error(.unsupported("branch reset group"), at: kind.location) + + case .atomicNonCapturing: + throw error(.unsupported("atomic group"), at: kind.location) + + case .nonAtomicLookahead: + throw error(.unsupported("non-atomic lookahead"), at: kind.location) + + case .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: + throw error(.unsupported("lookbehind"), at: kind.location) + + case .scriptRun, .atomicScriptRun: + throw error(.unsupported("script run"), at: kind.location) + + case .changeMatchingOptions(let opts): + try validateMatchingOptions(opts) + } + try validateNode(group.child) + } + + func validateQuantification(_ quant: AST.Quantification) throws { + try validateNode(quant.child) + guard quant.child.isQuantifiable else { + throw error(.notQuantifiable, at: quant.child.location) + } + switch quant.amount.value { + case .range(let lhs, let rhs): + guard lhs.value <= rhs.value else { + throw error( + .invalidQuantifierRange(lhs.value, rhs.value), at: quant.location) + } + case .zeroOrMore, .oneOrMore, .zeroOrOne, .exactly, .nOrMore, .upToN: + break + } + } + + func validateNode(_ node: AST.Node) throws { + switch node { + case .alternation(let a): + for branch in a.children { + try validateNode(branch) + } + case .concatenation(let c): + for child in c.children { + try validateNode(child) + } + + case .group(let g): + try validateGroup(g) + + case .conditional(let c): + // Note even once we get runtime support for this, we need to change the + // parsing to incorporate what is specified in the syntax proposal. + throw error(.unsupported("conditional"), at: c.location) + + case .quantification(let q): + try validateQuantification(q) + + case .atom(let a): + try validateAtom(a) + + case .customCharacterClass(let c): + try validateCustomCharacterClass(c) + + case .absentFunction(let a): + // These are Oniguruma specific. 
+ throw error(.unsupported("absent function"), at: a.location) + + case .quote, .trivia, .empty: + break + } + } +} + +/// Check a regex AST for semantic validity. +public func validate(_ ast: AST) throws { + try RegexValidator(ast).validate() +} diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 47faa23ed..1c20761c8 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -38,7 +38,7 @@ class Compiler { func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { - let ast = try parse(regex, syntax) + let ast = try parse(regex, .semantic, syntax) let program = try Compiler(ast: ast).emit() return Executor(program: program) } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 637b1a37a..9c0c3522c 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -513,7 +513,10 @@ extension Unicode.BinaryProperty { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { let consume = consumeFunction(for: opts) - + + // Note if you implement support for any of the below, you need to adjust + // the switch in Sema.swift to not have it be diagnosed as unsupported + // (potentially guarded on deployment version). switch self { case .asciiHexDigit: return consume(propertyScalarPredicate { diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 00fc2e952..6dd8e17b6 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -17,7 +17,7 @@ extension Regex where Output == AnyRegexOutput { /// /// - Parameter pattern: The regular expression. 
public init(_ pattern: String) throws { - self.init(ast: try parse(pattern, .traditional)) + self.init(ast: try parse(pattern, .semantic, .traditional)) } } @@ -31,7 +31,7 @@ extension Regex { _ pattern: String, as: Output.Type = Output.self ) throws { - self.init(ast: try parse(pattern, .traditional)) + self.init(ast: try parse(pattern, .semantic, .traditional)) } } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 1f9a35dad..29d2267b2 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -44,7 +44,7 @@ public struct Regex: RegexComponent { // Compiler interface. Do not change independently. @usableFromInline init(_regexString pattern: String) { - self.init(ast: try! parse(pattern, .traditional)) + self.init(ast: try! parse(pattern, .semantic, .traditional)) } // Compiler interface. Do not change independently. @@ -53,7 +53,7 @@ public struct Regex: RegexComponent { assert(version == currentRegexLiteralFormatVersion) // The version argument is passed by the compiler using the value defined // in libswiftParseRegexLiteral. - self.init(ast: try! parseWithDelimiters(pattern)) + self.init(ast: try! 
parseWithDelimiters(pattern, .semantic)) } public var regex: Regex { diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index b279c08e4..ff057f2ee 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -472,7 +472,7 @@ extension DSLTree.Node { list.append(.init( name: name, type: child.valueCaptureType?.base, - optionalDepth: nesting)) + optionalDepth: nesting, .fake)) child._addCaptures(to: &list, optionalNesting: nesting) case let .nonCapturingGroup(kind, child): diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index fc3fd5741..27a24cf46 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -454,9 +454,13 @@ extension AST.Atom.EscapedBuiltin { case .notHorizontalWhitespace: return .horizontalWhitespace.inverted - case .notNewline: return .newlineSequence.inverted case .newlineSequence: return .newlineSequence + // FIXME: This is more like '.' than inverted '\R', as it is affected + // by e.g (*CR). We should therefore really be emitting it through + // emitAny(). For now we treat it as semantically invalid. 
+ case .notNewline: return .newlineSequence.inverted + case .whitespace: return .whitespace case .notWhitespace: return .whitespace.inverted diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index b48e1f0a5..9efbf2f76 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -16,36 +16,44 @@ import XCTest extension CaptureList.Capture { static var cap: Self { - return Self(optionalDepth: 0) + return Self(optionalDepth: 0, .fake) } static var opt: Self { - return Self(optionalDepth: 1) + return Self(optionalDepth: 1, .fake) } static var opt_opt: Self { - return Self(optionalDepth: 2) + return Self(optionalDepth: 2, .fake) } static var opt_opt_opt: Self { - return Self(optionalDepth: 3) + return Self(optionalDepth: 3, .fake) } static var opt_opt_opt_opt: Self { - return Self(optionalDepth: 4) + return Self(optionalDepth: 4, .fake) } static var opt_opt_opt_opt_opt: Self { - return Self(optionalDepth: 5) + return Self(optionalDepth: 5, .fake) } static var opt_opt_opt_opt_opt_opt: Self { - return Self(optionalDepth: 6) + return Self(optionalDepth: 6, .fake) } - static func named(_ name: String) -> Self { - return Self(name: name, optionalDepth: 0) + static func named(_ name: String, opt: Int = 0) -> Self { + return Self(name: name, optionalDepth: opt, .fake) } } extension CaptureList { static func caps(count: Int) -> Self { Self(Array(repeating: .cap, count: count)) } + + var withoutLocs: Self { + var copy = self + for idx in copy.captures.indices { + copy.captures[idx].location = .fake + } + return copy + } } extension StructuredCapture { @@ -150,8 +158,8 @@ func captureTest( file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(regex, .traditional) - let capList = ast.root._captureList + let ast = try! 
parse(regex, .semantic, .traditional) + let capList = ast.root._captureList.withoutLocs guard capList == expected else { XCTFail(""" Expected: diff --git a/Tests/RegexTests/DiagnosticTests.swift b/Tests/RegexTests/DiagnosticTests.swift index 428020b80..0100a3a86 100644 --- a/Tests/RegexTests/DiagnosticTests.swift +++ b/Tests/RegexTests/DiagnosticTests.swift @@ -20,7 +20,7 @@ extension RegexTests { XCTAssert(SourceLocation.fake.isFake) XCTAssert(group(.capture, "a").location.isFake) - let ast = try! parse("(a)", .traditional).root + let ast = try! parse("(a)", .semantic, .traditional).root XCTAssert(ast.location.isReal) } @@ -31,7 +31,7 @@ extension RegexTests { // // Input should be a concatenation or alternation func flatTest(_ str: String, _ expected: [String]) { - guard let ast = try? parse(str, .traditional).root else { + guard let ast = try? parse(str, .semantic, .traditional).root else { XCTFail("Fail to parse: \(str)") return } @@ -54,7 +54,7 @@ extension RegexTests { func renderTest(_ str: String, _ expected: [String]) { let lines = try! 
parse( - str, .traditional + str, .semantic, .traditional )._render(in: str) func fail() { XCTFail(""" diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 83b73fe35..3b7def90b 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -717,7 +717,7 @@ extension RegexTests { firstMatchTest( #"\N{ASTERISK}+"#, input: "123***xyz", match: "***") firstMatchTest( - #"\N {2}"#, input: "123 xyz", match: "3 ") + #"\N {2}"#, input: "123 xyz", match: "3 ", xfail: true) firstMatchTest(#"\N{U+2C}"#, input: "123,xyz", match: ",") firstMatchTest(#"\N{U+1F4BF}"#, input: "123💿xyz", match: "💿") @@ -1014,7 +1014,7 @@ extension RegexTests { firstMatchTest( #"a(?:b)c"#, input: "123abcxyz", match: "abc") firstMatchTest( - "(?|(a)|(b)|(c))", input: "123abcxyz", match: "a") + "(?|(a)|(b)|(c))", input: "123abcxyz", match: "a", xfail: true) firstMatchTest( #"(?:a|.b)c"#, input: "123abcacxyz", match: "abc") @@ -1130,6 +1130,8 @@ extension RegexTests { firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true) firstMatchTest(#"(?.)(.)\k"#, input: "abac", match: "aba", xfail: true) firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true) + + firstMatchTest(#"\1(.)"#, input: "112", match: nil) } func testMatchExamples() { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 0ef021442..9dfcff99e 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -33,30 +33,56 @@ extension AST.CustomCharacterClass.Member: ExpressibleByExtendedGraphemeClusterL } } +enum SemanticErrorKind { + case unsupported, invalid +} class RegexTests: XCTestCase {} func parseTest( _ input: String, _ expectedAST: AST.Node, + throwsError errorKind: SemanticErrorKind? 
= nil, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { parseTest( - input, .init(expectedAST, globalOptions: nil), syntax: syntax, - captures: expectedCaptures, file: file, line: line + input, .init(expectedAST, globalOptions: nil), throwsError: errorKind, + syntax: syntax, captures: expectedCaptures, file: file, line: line ) } func parseTest( _ input: String, _ expectedAST: AST, + throwsError errorKind: SemanticErrorKind? = nil, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(input, syntax) + let ast: AST + do { + ast = try parse(input, errorKind != nil ? .syntactic : .semantic, syntax) + } catch { + XCTFail("unexpected error: \(error)", file: file, line: line) + return + } + if let errorKind = errorKind { + do { + _ = try parse(input, .semantic, syntax) + XCTFail("expected semantically invalid AST", file: file, line: line) + } catch let e as Source.LocatedError { + switch e.error { + case .unsupported: + XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) + default: + XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) + } + } catch { + XCTFail("Error without source location: \(error)", file: file, line: line) + } + } guard ast == expectedAST || ast._dump() == expectedAST._dump() // EQ workaround else { @@ -68,7 +94,7 @@ func parseTest( file: file, line: line) return } - let captures = ast.captureList + let captures = ast.captureList.withoutLocs guard captures == expectedCaptures else { XCTFail(""" @@ -143,15 +169,37 @@ func delimiterLexingTest( /// true, there may be additional characters that follow the literal that are /// not considered part of it. 
func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false, - file: StaticString = #file, line: UInt = #line + _ input: String, _ expecting: AST.Node, + throwsError errorKind: SemanticErrorKind? = nil, + ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line ) { // First try lexing. let literal = delimiterLexingTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let orig = try! parseWithDelimiters(literal) - let ast = orig.root + let ast: AST.Node + do { + ast = try parseWithDelimiters( + literal, errorKind != nil ? .syntactic : .semantic).root + } catch { + XCTFail("unexpected error: \(error)", file: file, line: line) + return + } + if let errorKind = errorKind { + do { + _ = try parseWithDelimiters(literal, .semantic) + XCTFail("expected semantically invalid AST", file: file, line: line) + } catch let e as Source.LocatedError { + switch e.error { + case .unsupported: + XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) + default: + XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) + } + } catch { + XCTFail("Error without source location: \(error)", file: file, line: line) + } + } guard ast == expecting || ast._dump() == expecting._dump() // EQ workaround else { @@ -170,8 +218,8 @@ func parseNotEqualTest( syntax: SyntaxOptions = .traditional, file: StaticString = #file, line: UInt = #line ) { - let lhsAST = try! parse(lhs, syntax) - let rhsAST = try! parse(rhs, syntax) + let lhsAST = try! parse(lhs, .syntactic, syntax) + let rhsAST = try! parse(rhs, .syntactic, syntax) if lhsAST == rhsAST || lhsAST._dump() == rhsAST._dump() { XCTFail(""" AST: \(lhsAST._dump()) @@ -187,7 +235,7 @@ func rangeTest( at locFn: (AST.Node) -> SourceLocation = \.location, file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(input, syntax).root + let ast = try! 
parse(input, .syntactic, syntax).root let range = input.offsets(of: locFn(ast).range) let expected = expectedRange(input) @@ -207,7 +255,7 @@ func diagnosticTest( file: StaticString = #file, line: UInt = #line ) { do { - let ast = try parse(input, syntax) + let ast = try parse(input, .semantic, syntax) XCTFail(""" Passed \(ast) @@ -236,7 +284,7 @@ func diagnosticWithDelimitersTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) do { - let orig = try parseWithDelimiters(literal) + let orig = try parseWithDelimiters(literal, .semantic) let ast = orig.root XCTFail(""" @@ -437,6 +485,12 @@ extension RegexTests { parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) + // FIXME: '\N' should be emitted through 'emitAny', not through the + // _CharacterClassModel model. + parseTest(#"\N"#, escaped(.notNewline), throwsError: .unsupported) + + parseTest(#"\R"#, escaped(.newlineSequence)) + parseTest( "[-|$^:?+*())(*-+-]", charClass( @@ -449,6 +503,8 @@ extension RegexTests { parseTest("[-a-]", charClass("-", "a", "-")) parseTest("[a-z]", charClass(range_m("a", "z"))) + parseTest("[a-a]", charClass(range_m("a", "a"))) + parseTest("[B-a]", charClass(range_m("B", "a"))) // FIXME: AST builder helpers for custom char class types parseTest("[a-d--a-c]", charClass( @@ -595,10 +651,12 @@ extension RegexTests { range_m(.keyboardControl("A"), .keyboardControl("B")), range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), range_m(.keyboardMeta("A"), .keyboardMeta("B")) - )) + ), throwsError: .unsupported) - parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( - range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE")))) + parseTest( + #"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( + range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))), + throwsError: .unsupported) // MARK: Operators @@ -691,13 +749,13 @@ extension RegexTests { parseTest(#"\\#u{3000}"#, "\u{3000}") // Control and meta controls. 
- parseTest(#"\c "#, atom(.keyboardControl(" "))) - parseTest(#"\c!"#, atom(.keyboardControl("!"))) - parseTest(#"\c~"#, atom(.keyboardControl("~"))) - parseTest(#"\C--"#, atom(.keyboardControl("-"))) - parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a"))) - parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-"))) - parseTest(#"\M-a"#, atom(.keyboardMeta("a"))) + parseTest(#"\c "#, atom(.keyboardControl(" ")), throwsError: .unsupported) + parseTest(#"\c!"#, atom(.keyboardControl("!")), throwsError: .unsupported) + parseTest(#"\c~"#, atom(.keyboardControl("~")), throwsError: .unsupported) + parseTest(#"\C--"#, atom(.keyboardControl("-")), throwsError: .unsupported) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")), throwsError: .unsupported) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")), throwsError: .unsupported) + parseTest(#"\M-a"#, atom(.keyboardMeta("a")), throwsError: .unsupported) // MARK: Comments @@ -734,6 +792,9 @@ extension RegexTests { parseTest( #"a{0,0}"#, quantRange(0...0, of: "a")) + parseTest( + #"a{1,1}"#, + quantRange(1...1, of: "a")) // Make sure ranges get treated as literal if invalid. parseTest("{", "{") @@ -786,11 +847,42 @@ extension RegexTests { // Balanced captures parseTest(#"(?)"#, balancedCapture(name: "a", priorName: "c", empty()), - captures: [.named("a")]) + throwsError: .unsupported, captures: [.named("a")]) parseTest(#"(?<-c>)"#, balancedCapture(name: nil, priorName: "c", empty()), - captures: [.cap]) + throwsError: .unsupported, captures: [.cap]) parseTest(#"(?'a-b'c)"#, balancedCapture(name: "a", priorName: "b", "c"), - captures: [.named("a")]) + throwsError: .unsupported, captures: [.named("a")]) + + // Capture resets. + // FIXME: The captures in each branch should be unified. For now, we don't + // treat any capture reset as semantically valid. 
+ parseTest( + "(?|(a)|(b))", + nonCaptureReset(alt(capture("a"), capture("b"))), + throwsError: .unsupported, captures: [.opt, .opt] + ) + parseTest( + "(?|(?a)|(b))", + nonCaptureReset(alt(namedCapture("x", "a"), capture("b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .opt] + ) + parseTest( + "(?|(a)|(?b))", + nonCaptureReset(alt(capture("a"), namedCapture("x", "b"))), + throwsError: .unsupported, captures: [.opt, .named("x", opt: 1)] + ) + parseTest( + "(?|(?a)|(?b))", + nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("x", "b"))), + throwsError: .invalid, captures: [.named("x", opt: 1), .named("x", opt: 1)] + ) + + // TODO: Reject mismatched names? + parseTest( + "(?|(?a)|(?b))", + nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("y", "b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .named("y", opt: 1)] + ) // Other groups parseTest( @@ -798,13 +890,13 @@ extension RegexTests { concat("a", nonCapture("b"), "c")) parseTest( #"a(?|b)c"#, - concat("a", nonCaptureReset("b"), "c")) + concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported) parseTest( #"a(?>b)c"#, - concat("a", atomicNonCapturing("b"), "c")) + concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) parseTest( "a(*atomic:b)c", - concat("a", atomicNonCapturing("b"), "c")) + concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) parseTest("a(?=b)c", concat("a", lookahead("b"), "c")) parseTest("a(*pla:b)c", concat("a", lookahead("b"), "c")) @@ -815,31 +907,42 @@ extension RegexTests { parseTest("a(*negative_lookahead:b)c", concat("a", negativeLookahead("b"), "c")) - parseTest("a(?<=b)c", concat("a", lookbehind("b"), "c")) - parseTest("a(*plb:b)c", concat("a", lookbehind("b"), "c")) - parseTest("a(*positive_lookbehind:b)c", concat("a", lookbehind("b"), "c")) - - parseTest("a(?"#, backreference(.relative(4))) - parseTest(#"\k<2>"#, backreference(.absolute(2))) - parseTest(#"\k'-3'"#, 
backreference(.relative(-3))) - parseTest(#"\k'1'"#, backreference(.absolute(1))) - - parseTest(#"\k{a0}"#, backreference(.named("a0"))) - parseTest(#"\k"#, backreference(.named("bc"))) - parseTest(#"\g{abc}"#, backreference(.named("abc"))) - parseTest(#"(?P=abc)"#, backreference(.named("abc"))) + parseTest(#"\113"#, backreference(.absolute(113)), throwsError: .invalid) + parseTest(#"\377"#, backreference(.absolute(377)), throwsError: .invalid) + parseTest(#"\81"#, backreference(.absolute(81)), throwsError: .invalid) + + parseTest(#"\g1"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g001"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g52"#, backreference(.absolute(52)), throwsError: .invalid) + parseTest(#"\g-01"#, backreference(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g+30"#, backreference(.relative(30)), throwsError: .unsupported) + + parseTest(#"\g{1}"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g{001}"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g{52}"#, backreference(.absolute(52)), throwsError: .invalid) + parseTest(#"\g{-01}"#, backreference(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g{+30}"#, backreference(.relative(30)), throwsError: .unsupported) + parseTest(#"\k<+4>"#, backreference(.relative(4)), throwsError: .unsupported) + parseTest(#"\k<2>"#, backreference(.absolute(2)), throwsError: .invalid) + parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported) + parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid) + + parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("bc")), throwsError: .unsupported) + parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .unsupported) + parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .unsupported) // Oniguruma recursion levels. 
- parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0)) - parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0)) - parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1)) - parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8)) - parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8)) - parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8)) - parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8)) - parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8)) - - parseTest(#"(?R)"#, subpattern(.recurseWholePattern)) - parseTest(#"(?0)"#, subpattern(.recurseWholePattern)) - parseTest(#"(?1)"#, subpattern(.absolute(1))) - parseTest(#"(?+12)"#, subpattern(.relative(12))) - parseTest(#"(?-2)"#, subpattern(.relative(-2))) - parseTest(#"(?&hello)"#, subpattern(.named("hello"))) - parseTest(#"(?P>P)"#, subpattern(.named("P"))) + parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported) + parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .invalid) + parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .invalid) + parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8), throwsError: .unsupported) + + parseTest(#"(?R)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) + parseTest(#"(?0)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) + parseTest(#"(?1)"#, subpattern(.absolute(1)), throwsError: .unsupported) + 
parseTest(#"(?+12)"#, subpattern(.relative(12)), throwsError: .unsupported) + parseTest(#"(?-2)"#, subpattern(.relative(-2)), throwsError: .unsupported) + parseTest(#"(?&hello)"#, subpattern(.named("hello")), throwsError: .unsupported) + parseTest(#"(?P>P)"#, subpattern(.named("P")), throwsError: .unsupported) parseTest(#"[(?R)]"#, charClass("(", "?", "R", ")")) parseTest(#"[(?&a)]"#, charClass("(", "?", "&", "a", ")")) parseTest(#"[(?1)]"#, charClass("(", "?", "1", ")")) - parseTest(#"\g<1>"#, subpattern(.absolute(1))) - parseTest(#"\g<001>"#, subpattern(.absolute(1))) - parseTest(#"\g'52'"#, subpattern(.absolute(52))) - parseTest(#"\g'-01'"#, subpattern(.relative(-1))) - parseTest(#"\g'+30'"#, subpattern(.relative(30))) - parseTest(#"\g'abc'"#, subpattern(.named("abc"))) + parseTest(#"\g<1>"#, subpattern(.absolute(1)), throwsError: .unsupported) + parseTest(#"\g<001>"#, subpattern(.absolute(1)), throwsError: .unsupported) + parseTest(#"\g'52'"#, subpattern(.absolute(52)), throwsError: .unsupported) + parseTest(#"\g'-01'"#, subpattern(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported) + parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported) // Backreferences are not valid in custom character classes. parseTest(#"[\8]"#, charClass("8")) parseTest(#"[\9]"#, charClass("9")) + // These are valid references. + parseTest(#"()\1"#, concat( + capture(empty()), backreference(.absolute(1)) + ), captures: [.cap]) + parseTest(#"\1()"#, concat( + backreference(.absolute(1)), capture(empty()) + ), captures: [.cap]) + parseTest(#"()()\2"#, concat( + capture(empty()), capture(empty()), backreference(.absolute(2)) + ), captures: [.cap, .cap]) + parseTest(#"()\2()"#, concat( + capture(empty()), backreference(.absolute(2)), capture(empty()) + ), captures: [.cap, .cap]) + // MARK: Character names. 
parseTest(#"\N{abc}"#, atom(.namedCharacter("abc"))) @@ -1137,7 +1254,7 @@ extension RegexTests { parseTest(#"\N{abc}+"#, oneOrMore(of: atom(.namedCharacter("abc")))) parseTest( #"\N {2}"#, - concat(atom(.escaped(.notNewline)), exactly(2, of: " ")) + concat(atom(.escaped(.notNewline)), exactly(2, of: " ")), throwsError: .unsupported ) parseTest(#"\N{AA}"#, atom(.namedCharacter("AA"))) @@ -1203,13 +1320,13 @@ extension RegexTests { parseTest(#"\p{isAlphabetic}"#, prop(.binary(.alphabetic))) parseTest(#"\p{isAlpha=isFalse}"#, prop(.binary(.alphabetic, value: false))) - parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic))) + parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic)), throwsError: .unsupported) - parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric))) - parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace))) - parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace))) - parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed))) - parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord))) + parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), throwsError: .unsupported) + parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), throwsError: .unsupported) + parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace)), throwsError: .unsupported) + parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed)), throwsError: .unsupported) + parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord)), throwsError: .unsupported) parseTest(#"\p{alnum}"#, prop(.posix(.alnum))) parseTest(#"\p{is_alnum}"#, prop(.posix(.alnum))) @@ -1229,45 +1346,45 @@ extension RegexTests { // MARK: Conditionals parseTest(#"(?(1))"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)|)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), 
throwsError: .unsupported) parseTest(#"(?(1)a)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)a|)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b")) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(1)a|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(1)(a|b|c)|d)"#, conditional( .groupMatched(ref(1)), trueBranch: capture(alt("a", "b", "c")), falseBranch: "d" - ), captures: [.opt]) + ), throwsError: .unsupported, captures: [.opt]) parseTest(#"(?(+3))"#, conditional( - .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(-21))"#, conditional( - .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) // Oniguruma recursion levels. 
parseTest(#"(?(1+1))"#, conditional( .groupMatched(ref(1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(-1+1))"#, conditional( .groupMatched(ref(minus: 1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(1-3))"#, conditional( .groupMatched(ref(1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(+1-3))"#, conditional( .groupMatched(ref(plus: 1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest( #"(?)(?(a+5))"#, @@ -1275,7 +1392,7 @@ extension RegexTests { .groupMatched(ref("a", recursionLevel: 5)), trueBranch: empty(), falseBranch: empty() )), - captures: [.named("a")] + throwsError: .unsupported, captures: [.named("a")] ) parseTest( #"(?)(?(a1-5))"#, @@ -1283,50 +1400,50 @@ extension RegexTests { .groupMatched(ref("a1", recursionLevel: -5)), trueBranch: empty(), falseBranch: empty() )), - captures: [.named("a1")] + throwsError: .unsupported, captures: [.named("a1")] ) parseTest(#"(?(1))?"#, zeroOrOne(of: conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()))) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())), throwsError: .unsupported) parseTest(#"(?(R)a|b)"#, conditional( - .recursionCheck, trueBranch: "a", falseBranch: "b")) + .recursionCheck, trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(R1))"#, conditional( - .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(R&abc)a|b)"#, conditional( - .groupRecursionCheck(ref("abc")), 
trueBranch: "a", falseBranch: "b")) + .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?()a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?('abc')a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(abc)a|b)"#, conditional( groupCondition(.capture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), captures: [.cap]) + ), throwsError: .unsupported, captures: [.cap]) parseTest(#"(?(?:abc)a|b)"#, conditional( groupCondition(.nonCapture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?=abc)a|b)"#, conditional( groupCondition(.lookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?!abc)a|b)"#, conditional( groupCondition(.negativeLookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?<=abc)a|b)"#, conditional( groupCondition(.lookbehind, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?y)(?(xxx)a|b)"#, concat( namedCapture("xxx", "y"), conditional(.groupMatched(ref("xxx")), trueBranch: "a", falseBranch: "b") - ), captures: [.named("xxx")]) + ), throwsError: .unsupported, captures: [.named("xxx")]) parseTest(#"(?(1)(?(2)(?(3)))|a)"#, conditional( .groupMatched(ref(1)), @@ -1356,115 +1473,119 @@ extension RegexTests { trueBranch: empty(), falseBranch: empty()), falseBranch: empty()), - falseBranch: "a")) + falseBranch: "a"), throwsError: .unsupported) parseTest(#"(?(DEFINE))"#, conditional( - .defineGroup, trueBranch: empty(), falseBranch: empty())) + .defineGroup, 
trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(VERSION>=3.1))"#, conditional( pcreVersionCheck(.greaterThanOrEqual, 3, 1), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(VERSION=0.1))"#, conditional( pcreVersionCheck(.equal, 0, 1), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) // MARK: Callouts // PCRE callouts - parseTest(#"(?C)"#, pcreCallout(.number(0))) - parseTest(#"(?C0)"#, pcreCallout(.number(0))) - parseTest(#"(?C20)"#, pcreCallout(.number(20))) - parseTest("(?C{abc})", pcreCallout(.string("abc"))) + parseTest(#"(?C)"#, pcreCallout(.number(0)), throwsError: .unsupported) + parseTest(#"(?C0)"#, pcreCallout(.number(0)), throwsError: .unsupported) + parseTest(#"(?C20)"#, pcreCallout(.number(20)), throwsError: .unsupported) + parseTest("(?C{abc})", pcreCallout(.string("abc")), throwsError: .unsupported) for delim in ["`", "'", "\"", "^", "%", "#", "$"] { - parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello"))) + parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello")), + throwsError: .unsupported) } // Oniguruma named callouts - parseTest("(*X)", onigurumaNamedCallout("X")) - parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t")) - parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b")) - parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b")) - parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c")) - parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c")) - parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>")) - parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c")) + parseTest("(*X)", onigurumaNamedCallout("X"), throwsError: .unsupported) + parseTest("(*foo[t])", 
onigurumaNamedCallout("foo", tag: "t"), throwsError: .unsupported) + parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b"), throwsError: .unsupported) + parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b"), throwsError: .unsupported) + parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c"), throwsError: .unsupported) + parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c"), throwsError: .unsupported) + parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>"), throwsError: .unsupported) + parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c"), throwsError: .unsupported) // Oniguruma 'of contents' callouts - parseTest("(?{x})", onigurumaCalloutOfContents("x")) - parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y")) - parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x")) - parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag")) - parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction)) - parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both)) - parseTest("(?{x}>)", onigurumaCalloutOfContents("x")) - parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x")) - parseTest("(?{\\})", onigurumaCalloutOfContents("\\")) + parseTest("(?{x})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y"), throwsError: .unsupported) + parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag"), throwsError: .unsupported) + parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction), throwsError: .unsupported) + parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both), throwsError: .unsupported) + parseTest("(?{x}>)", onigurumaCalloutOfContents("x"), 
throwsError: .unsupported) + parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x"), throwsError: .unsupported) + parseTest("(?{\\})", onigurumaCalloutOfContents("\\"), throwsError: .unsupported) // MARK: Backtracking directives - parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept))) + parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept)), throwsError: .unsupported) parseTest( "(*ACCEPT:a)??", - zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")) + zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")), + throwsError: .unsupported ) - parseTest("(*:a)", backtrackingDirective(.mark, name: "a")) - parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a")) - parseTest("(*F)", backtrackingDirective(.fail)) - parseTest("(*COMMIT)", backtrackingDirective(.commit)) - parseTest("(*SKIP)", backtrackingDirective(.skip)) - parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP")) - parseTest("(*PRUNE)", backtrackingDirective(.prune)) - parseTest("(*THEN)", backtrackingDirective(.then)) + parseTest("(*:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) + parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) + parseTest("(*F)", backtrackingDirective(.fail), throwsError: .unsupported) + parseTest("(*COMMIT)", backtrackingDirective(.commit), throwsError: .unsupported) + parseTest("(*SKIP)", backtrackingDirective(.skip), throwsError: .unsupported) + parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP"), throwsError: .unsupported) + parseTest("(*PRUNE)", backtrackingDirective(.prune), throwsError: .unsupported) + parseTest("(*THEN)", backtrackingDirective(.then), throwsError: .unsupported) // MARK: Oniguruma absent functions - parseTest("(?~)", absentRepeater(empty())) - parseTest("(?~abc)", absentRepeater(concat("a", "b", "c"))) - parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a"))) - parseTest("(?~~)", absentRepeater("~")) 
- parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c"))) - parseTest("(?~(a))", absentRepeater(capture("a")), captures: []) - parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty()))) - - parseTest("(?~|abc)", absentStopper(concat("a", "b", "c"))) - parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a"))) - parseTest("(?~|~)", absentStopper("~")) - parseTest("(?~|(a))", absentStopper(capture("a")), captures: []) - parseTest("(?~|a){2}", exactly(2, of: absentStopper("a"))) - - parseTest("(?~|a|b)", absentExpression("a", "b")) - parseTest("(?~|~|~)", absentExpression("~", "~")) + parseTest("(?~)", absentRepeater(empty()), throwsError: .unsupported) + parseTest("(?~abc)", absentRepeater(concat("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a")), throwsError: .unsupported) + parseTest("(?~~)", absentRepeater("~"), throwsError: .unsupported) + parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~(a))", absentRepeater(capture("a")), throwsError: .unsupported, captures: []) + parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty())), throwsError: .unsupported) + + parseTest("(?~|abc)", absentStopper(concat("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a")), throwsError: .unsupported) + parseTest("(?~|~)", absentStopper("~"), throwsError: .unsupported) + parseTest("(?~|(a))", absentStopper(capture("a")), throwsError: .unsupported, captures: []) + parseTest("(?~|a){2}", exactly(2, of: absentStopper("a")), throwsError: .unsupported) + + parseTest("(?~|a|b)", absentExpression("a", "b"), throwsError: .unsupported) + parseTest("(?~|~|~)", absentExpression("~", "~"), throwsError: .unsupported) parseTest("(?~|(a)|(?:b))", absentExpression(capture("a"), nonCapture("b")), - captures: []) + throwsError: .unsupported, captures: []) parseTest("(?~|(a)|(?:(b)|c))", absentExpression( capture("a"), nonCapture(alt(capture("b"), "c")) - 
), captures: [.opt]) - parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b"))) + ), throwsError: .unsupported, captures: [.opt]) + parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b")), throwsError: .unsupported) - parseTest("(?~|)", absentRangeClear()) + parseTest("(?~|)", absentRangeClear(), throwsError: .unsupported) // TODO: It's not really clear what this means, but Oniguruma parses it... // Maybe we should diagnose it? - parseTest("(?~|)+", oneOrMore(of: absentRangeClear())) + parseTest("(?~|)+", oneOrMore(of: absentRangeClear()), throwsError: .unsupported) // MARK: Global matching options parseTest("(*CR)(*UTF)(*LIMIT_DEPTH=3)", ast( empty(), opts: .newlineMatching(.carriageReturnOnly), .utfMode, .limitDepth(.init(faking: 3)) - )) + ), throwsError: .unsupported) parseTest( - "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode))) + "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode)), + throwsError: .unsupported) parseTest( "(*BSR_ANYCRLF)", ast( - empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed))) + empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed)), + throwsError: .unsupported) // TODO: Diagnose on multiple line matching modes? 
parseTest( @@ -1472,7 +1593,7 @@ extension RegexTests { ast(empty(), opts: [ .carriageReturnOnly, .linefeedOnly, .carriageAndLinefeedOnly, .anyCarriageReturnOrLinefeed, .anyUnicode, .nulCharacter - ].map { .newlineMatching($0) })) + ].map { .newlineMatching($0) }), throwsError: .unsupported) parseTest( """ @@ -1485,7 +1606,7 @@ extension RegexTests { .limitMatch(.init(faking: 2)), .notEmpty, .notEmptyAtStart, .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, .unicodeProperties - ) + ), throwsError: .unsupported ) parseTest("[(*CR)]", charClass("(", "*", "C", "R", ")")) @@ -1699,7 +1820,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageReturnOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1710,7 +1831,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1721,7 +1842,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1732,7 +1853,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1743,7 +1864,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1754,7 +1875,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1765,7 +1886,7 @@ extension RegexTests { # h """, 
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1776,7 +1897,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1787,7 +1908,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1808,7 +1929,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1819,7 +1940,7 @@ extension RegexTests { # h """, ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1830,7 +1951,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1844,7 +1965,7 @@ extension RegexTests { opts: .newlineMatching(.carriageReturnOnly), .newlineMatching(.nulCharacter) ), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) // MARK: Parse with delimiters @@ -1947,30 +2068,37 @@ extension RegexTests { #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) parseWithDelimitersTest( #"re'(?'a_bcA0-c1A'x*)'"#, - balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")), + throwsError: .unsupported) parseWithDelimitersTest( #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) parseWithDelimitersTest( 
#"re'(?('a_bcA0')x|y)'"#, conditional( - .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) + .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"), + throwsError: .unsupported + ) parseWithDelimitersTest( #"re'(?('+20')\')'"#, conditional( - .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty())) - + .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()), + throwsError: .unsupported + ) parseWithDelimitersTest( - #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A")))) + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .unsupported) parseWithDelimitersTest( - #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1)) + #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1), + throwsError: .unsupported + ) parseWithDelimitersTest( - #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A")))) + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), throwsError: .unsupported) parseWithDelimitersTest( - #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'")) + #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"), throwsError: .unsupported) parseWithDelimitersTest( - #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#))) + #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)), + throwsError: .unsupported) // Fine, because we don't end up skipping. 
delimiterLexingTest(#"re'(?'"#) @@ -2314,6 +2442,13 @@ extension RegexTests { diagnosticTest("[[::]]", .emptyProperty) diagnosticTest("[[:=:]]", .emptyProperty) + diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand) + + diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) + diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) + diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) + diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) @@ -2411,6 +2546,12 @@ extension RegexTests { diagnosticTest("(?x)(? : )", .unknownGroupKind("? ")) + diagnosticTest("(?<x>)(?<x>)", .duplicateNamedCapture("x")) + diagnosticTest("(?<x>)|(?<x>)", .duplicateNamedCapture("x")) + diagnosticTest("((?<x>))(?<x>)", .duplicateNamedCapture("x")) + diagnosticTest("(|(?<x>))(?<x>)", .duplicateNamedCapture("x")) + diagnosticTest("(?<x>)(?<y>)(?<x>)", .duplicateNamedCapture("x")) + // MARK: Quantifiers diagnosticTest("*", .quantifierRequiresOperand("*")) @@ -2419,6 +2560,18 @@ extension RegexTests { diagnosticTest("*?", .quantifierRequiresOperand("*?")) diagnosticTest("{5}", .quantifierRequiresOperand("{5}")) diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) + diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2)) + + // These are not quantifiable. 
+ diagnosticTest(#"\b?"#, .notQuantifiable) + diagnosticTest(#"\B*"#, .notQuantifiable) + diagnosticTest(#"\A+"#, .notQuantifiable) + diagnosticTest(#"\Z??"#, .notQuantifiable) + diagnosticTest(#"\G*?"#, .notQuantifiable) + diagnosticTest(#"\z+?"#, .notQuantifiable) + diagnosticTest(#"\K{1}"#, .unsupported(#"'\K'"#)) + diagnosticTest(#"\y{2,5}"#, .notQuantifiable) + diagnosticTest(#"\Y{3,}"#, .notQuantifiable) // MARK: Unicode scalars @@ -2458,6 +2611,16 @@ extension RegexTests { diagnosticTest(#"\k"#, .expectedNumber("", kind: .decimal)) diagnosticTest(#"\k<1+>"#, .expectedNumber("", kind: .decimal)) + diagnosticTest(#"()\k<1+1>"#, .unsupported("recursion level")) + diagnosticTest(#"()\k<1-1>"#, .unsupported("recursion level")) + + diagnosticTest(#"\k<0>"#, .cannotReferToWholePattern) + diagnosticTest(#"\1"#, .invalidReference(1)) + diagnosticTest(#"(?:)\1"#, .invalidReference(1)) + diagnosticTest(#"()\2"#, .invalidReference(2)) + diagnosticTest(#"\2()"#, .invalidReference(2)) + diagnosticTest(#"(?:)()\2"#, .invalidReference(2)) + diagnosticTest(#"(?:)(?:)\2"#, .invalidReference(2)) // MARK: Conditionals @@ -2496,13 +2659,13 @@ extension RegexTests { diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK")) diagnosticTest("(*:)", .expectedNonEmptyContents) - diagnosticTest("(*MARK:a)?", .notQuantifiable) - diagnosticTest("(*FAIL)+", .notQuantifiable) - diagnosticTest("(*COMMIT:b)*", .notQuantifiable) - diagnosticTest("(*PRUNE:a)??", .notQuantifiable) - diagnosticTest("(*SKIP:a)*?", .notQuantifiable) - diagnosticTest("(*F)+?", .notQuantifiable) - diagnosticTest("(*:a){2}", .notQuantifiable) + diagnosticTest("(*MARK:a)?", .unsupported("backtracking directive")) + diagnosticTest("(*FAIL)+", .unsupported("backtracking directive")) + diagnosticTest("(*COMMIT:b)*", .unsupported("backtracking directive")) + diagnosticTest("(*PRUNE:a)??", .unsupported("backtracking directive")) + diagnosticTest("(*SKIP:a)*?", .unsupported("backtracking directive")) + 
diagnosticTest("(*F)+?", .unsupported("backtracking directive")) + diagnosticTest("(*:a){2}", .unsupported("backtracking directive")) // MARK: Oniguruma absent functions @@ -2560,5 +2723,7 @@ extension RegexTests { func testCompilerInterfaceDiagnostics() { compilerInterfaceDiagnosticMessageTest( "#/[x*/#", "cannot parse regular expression: expected ']'") + compilerInterfaceDiagnosticMessageTest( + "/a{3,2}/", "cannot parse regular expression: range lower bound '3' must be less than or equal to upper bound '2'") } } diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index eff9f9b4e..145087ee7 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -444,7 +444,7 @@ extension UTS18Tests { func testIndividuallyNamedCharacters_XFail() { XCTExpectFailure("Need to support named chars in custom character classes") { - XCTFail("\(regex(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#))") + XCTFail(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#) // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) }