diff --git a/.gitignore b/.gitignore index a7e7e4d09..ff85b9fa3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ .DS_Store +# The current toolchain is dumping files in the package root, rude +*.emit-module.* + # Xcode # # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore diff --git a/Documentation/Evolution/ProposalOverview.md b/Documentation/Evolution/ProposalOverview.md index 7656526a6..5f526f963 100644 --- a/Documentation/Evolution/ProposalOverview.md +++ b/Documentation/Evolution/ProposalOverview.md @@ -3,6 +3,7 @@ ## Regex Type and Overview +- [Second review](https://forums.swift.org/t/se-0350-second-review-regex-type-and-overview/56886) - [Proposal](https://github.com/apple/swift-evolution/blob/main/proposals/0350-regex-type-overview.md), [Thread](https://forums.swift.org/t/se-0350-regex-type-and-overview/56530) - [Pitch thread](https://forums.swift.org/t/pitch-regex-type-and-overview/56029) diff --git a/Sources/PatternConverter/PatternConverter.swift b/Sources/PatternConverter/PatternConverter.swift index f66204884..a10698526 100644 --- a/Sources/PatternConverter/PatternConverter.swift +++ b/Sources/PatternConverter/PatternConverter.swift @@ -30,9 +30,6 @@ struct PatternConverter: ParsableCommand { @Flag(help: "Whether to show canonical regex literal") var showCanonical: Bool = false - @Flag(help: "Whether to show capture structure") - var showCaptureStructure: Bool = false - @Flag(help: "Whether to skip result builder DSL") var skipDSL: Bool = false @@ -71,13 +68,6 @@ struct PatternConverter: ParsableCommand { print() } - if showCaptureStructure { - print("Capture structure:") - print() - print(ast.captureStructure) - print() - } - print() if !skipDSL { let render = ast.renderAsBuilderDSL( diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index e8cd4ac54..ae66310af 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -12,6 
+12,12 @@ @_implementationOnly import _RegexParser @_spi(RegexBuilder) import _StringProcessing +/// A regex component that matches a specific condition at a particular position +/// in an input string. +/// +/// You can use anchors to guarantee that a match only occurs at certain points +/// in an input string, such as at the beginning of the string or at the end of +/// a line. @available(SwiftStdlib 5.7, *) public struct Anchor { internal enum Kind { @@ -53,14 +59,24 @@ extension Anchor: RegexComponent { @available(SwiftStdlib 5.7, *) extension Anchor { + /// An anchor that matches at the start of the input string. + /// + /// This anchor is equivalent to `\A` in regex syntax. public static var startOfSubject: Anchor { Anchor(kind: .startOfSubject) } - + + /// An anchor that matches at the end of the input string or at the end of + /// the line immediately before the end of the string. + /// + /// This anchor is equivalent to `\Z` in regex syntax. public static var endOfSubjectBeforeNewline: Anchor { Anchor(kind: .endOfSubjectBeforeNewline) } - + + /// An anchor that matches at the end of the input string. + /// + /// This anchor is equivalent to `\z` in regex syntax. public static var endOfSubject: Anchor { Anchor(kind: .endOfSubject) } @@ -70,26 +86,53 @@ extension Anchor { // Anchor(kind: resetStartOfMatch) // } + /// An anchor that matches at the first position of a match in the input + /// string. public static var firstMatchingPositionInSubject: Anchor { Anchor(kind: .firstMatchingPositionInSubject) } + /// An anchor that matches at a grapheme cluster boundary. + /// + /// This anchor is equivalent to `\y` in regex syntax. public static var textSegmentBoundary: Anchor { Anchor(kind: .textSegmentBoundary) } + /// An anchor that matches at the start of a line, including the start of + /// the input string.
+ /// + /// This anchor is equivalent to `^` in regex syntax when the `m` option + /// has been enabled or `anchorsMatchLineEndings(true)` has been called. public static var startOfLine: Anchor { Anchor(kind: .startOfLine) } + /// An anchor that matches at the end of a line, including at the end of + /// the input string. + /// + /// This anchor is equivalent to `$` in regex syntax when the `m` option + /// has been enabled or `anchorsMatchLineEndings(true)` has been called. public static var endOfLine: Anchor { Anchor(kind: .endOfLine) } + /// An anchor that matches at a word boundary. + /// + /// Word boundaries are identified using the Unicode default word boundary + /// algorithm by default. To specify a different word boundary algorithm, + /// see the `RegexComponent.wordBoundaryKind(_:)` method. + /// + /// This anchor is equivalent to `\b` in regex syntax. public static var wordBoundary: Anchor { Anchor(kind: .wordBoundary) } + /// The inverse of this anchor, which matches at every position that this + /// anchor does not. + /// + /// For the `wordBoundary` and `textSegmentBoundary` anchors, the inverted + /// version corresponds to `\B` and `\Y`, respectively. public var inverted: Anchor { var result = self result.isInverted.toggle() @@ -97,6 +140,13 @@ extension Anchor { } } +/// A regex component that allows a match to continue only if its contents +/// match at the given location. +/// +/// A lookahead is a zero-length assertion that its included regex matches at +/// a particular position. Lookaheads do not advance the overall matching +/// position in the input string — once a lookahead succeeds, matching continues +/// in the regex from the same position. @available(SwiftStdlib 5.7, *) public struct Lookahead: _BuiltinRegexComponent { public var regex: Regex @@ -105,19 +155,48 @@ public struct Lookahead: _BuiltinRegexComponent { self.regex = regex } + /// Creates a lookahead from the given regex component. 
public init( - _ component: R, - negative: Bool = false + _ component: R ) where R.RegexOutput == Output { - self.init(node: .nonCapturingGroup( - negative ? .negativeLookahead : .lookahead, component.regex.root)) + self.init(node: .nonCapturingGroup(.lookahead, component.regex.root)) } + + /// Creates a lookahead from the regex generated by the given builder closure. + public init( + @RegexComponentBuilder _ component: () -> R + ) where R.RegexOutput == Output { + self.init(node: .nonCapturingGroup(.lookahead, component().regex.root)) + } +} +/// A regex component that allows a match to continue only if its contents +/// do not match at the given location. +/// +/// A negative lookahead is a zero-length assertion that its included regex +/// does not match at a particular position. Lookaheads do not advance the +/// overall matching position in the input string — once a lookahead succeeds, +/// matching continues in the regex from the same position. +@available(SwiftStdlib 5.7, *) +public struct NegativeLookahead: _BuiltinRegexComponent { + public var regex: Regex + + init(_ regex: Regex) { + self.regex = regex + } + + /// Creates a negative lookahead from the given regex component. + public init( + _ component: R + ) where R.RegexOutput == Output { + self.init(node: .nonCapturingGroup(.negativeLookahead, component.regex.root)) + } + + /// Creates a negative lookahead from the regex generated by the given builder + /// closure. public init( - negative: Bool = false, @RegexComponentBuilder _ component: () -> R ) where R.RegexOutput == Output { - self.init(node: .nonCapturingGroup( - negative ? 
.negativeLookahead : .lookahead, component().regex.root)) + self.init(node: .nonCapturingGroup(.negativeLookahead, component().regex.root)) } } diff --git a/Sources/_RegexParser/Regex/AST/AST.swift b/Sources/_RegexParser/Regex/AST/AST.swift index eae393289..a7dcd2015 100644 --- a/Sources/_RegexParser/Regex/AST/AST.swift +++ b/Sources/_RegexParser/Regex/AST/AST.swift @@ -25,12 +25,6 @@ public struct AST: Hashable { extension AST { /// Whether this AST tree contains at least one capture nested inside of it. public var hasCapture: Bool { root.hasCapture } - - /// The capture structure of this AST tree. - public var captureStructure: CaptureStructure { - var constructor = CaptureStructure.Constructor(.flatten) - return root._captureStructure(&constructor) - } } extension AST { diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index e779c39fb..d3dbc1666 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -17,7 +17,7 @@ extension AST { case caseInsensitive // i case allowDuplicateGroupNames // J case multiline // m - case noAutoCapture // n + case namedCapturesOnly // n case singleLine // s case reluctantByDefault // U case extended // x diff --git a/Sources/_RegexParser/Regex/Parse/CaptureList.swift b/Sources/_RegexParser/Regex/Parse/CaptureList.swift new file mode 100644 index 000000000..d112b2010 --- /dev/null +++ b/Sources/_RegexParser/Regex/Parse/CaptureList.swift @@ -0,0 +1,154 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +public struct CaptureList { + public var captures: [Capture] + + public init(_ s: S) where S.Element == Capture { + captures = Array(s) + } + + public mutating func append(_ c: Capture) { + captures.append(c) + } +} + +extension CaptureList { + public struct Capture { + public var name: String? + public var type: Any.Type? + public var optionalDepth: Int + + public init( + name: String? = nil, + type: Any.Type? = nil, + optionalDepth: Int + ) { + self.name = name + self.type = type + self.optionalDepth = optionalDepth + } + } +} + +// MARK: Generating from AST + +extension AST.Node { + public func _addCaptures( + to list: inout CaptureList, + optionalNesting nesting: Int + ) { + let addOptional = nesting+1 + switch self { + case let .alternation(a): + for child in a.children { + child._addCaptures(to: &list, optionalNesting: addOptional) + } + + case let .concatenation(c): + for child in c.children { + child._addCaptures(to: &list, optionalNesting: nesting) + } + + case let .group(g): + switch g.kind.value { + case .capture: + list.append(.init(optionalDepth: nesting)) + + case .namedCapture(let name): + list.append(.init(name: name.value, optionalDepth: nesting)) + + case .balancedCapture(let b): + list.append(.init(name: b.name?.value, optionalDepth: nesting)) + + default: break + } + g.child._addCaptures(to: &list, optionalNesting: nesting) + + case .conditional(let c): + switch c.condition.kind { + case .group(let g): + AST.Node.group(g)._addCaptures(to: &list, optionalNesting: nesting) + default: + break + } + + c.trueBranch._addCaptures(to: &list, optionalNesting: addOptional) + c.falseBranch._addCaptures(to: &list, optionalNesting: addOptional) + + case .quantification(let q): + var optNesting = nesting + if 
q.amount.value.bounds.atLeast == 0 { + optNesting += 1 + } + q.child._addCaptures(to: &list, optionalNesting: optNesting) + + case .absentFunction(let abs): + switch abs.kind { + case .expression(_, _, let child): + child._addCaptures(to: &list, optionalNesting: nesting) + case .clearer, .repeater, .stopper: + break + } + + case .quote, .trivia, .atom, .customCharacterClass, .empty: + break + } + } + + public var _captureList: CaptureList { + var caps = CaptureList() + self._addCaptures(to: &caps, optionalNesting: 0) + return caps + } +} + +extension AST { + /// Get the capture list for this AST + public var captureList: CaptureList { + root._captureList + } +} + +// MARK: Convenience for testing and inspection + +extension CaptureList.Capture: Equatable { + public static func == (lhs: Self, rhs: Self) -> Bool { + lhs.name == rhs.name && + lhs.optionalDepth == rhs.optionalDepth && + lhs.type == rhs.type + } +} +extension CaptureList: Equatable {} + +extension CaptureList.Capture: CustomStringConvertible { + public var description: String { + let typeStr: String + if let ty = type { + typeStr = "\(ty)" + } else { + typeStr = "Substring" + } + let suffix = String(repeating: "?", count: optionalDepth) + return typeStr + suffix + } +} +extension CaptureList: CustomStringConvertible { + public var description: String { + "(" + captures.map(\.description).joined(separator: ", ") + ")" + } +} + +extension CaptureList: ExpressibleByArrayLiteral { + public init(arrayLiteral elements: Capture...) { + self.init(elements) + } +} diff --git a/Sources/_RegexParser/Regex/Parse/CaptureStructure.swift b/Sources/_RegexParser/Regex/Parse/CaptureStructure.swift index 9cb31c7d9..6cd8001ba 100644 --- a/Sources/_RegexParser/Regex/Parse/CaptureStructure.swift +++ b/Sources/_RegexParser/Regex/Parse/CaptureStructure.swift @@ -9,258 +9,35 @@ // //===----------------------------------------------------------------------===// -// A tree representing the type of some captures. 
-public enum CaptureStructure: Equatable { +// TODO: Remove and directly serialize CaptureList instead + +// A tree representing the type of some captures, used for communication +// with the compiler. +enum CaptureStructure: Equatable { case atom(name: String? = nil, type: AnyType? = nil) indirect case optional(CaptureStructure) indirect case tuple([CaptureStructure]) - public static func tuple(_ children: CaptureStructure...) -> Self { + static func tuple(_ children: CaptureStructure...) -> Self { tuple(children) } - public static var empty: Self { + static var empty: Self { .tuple([]) } } -// TODO: Below are all flattening constructors. Instead create -// a builder/visitor that can store the structuralization -// approach - -extension CaptureStructure { - public struct Constructor { - var strategy: Strategy - - public init(_ strategy: Strategy = .flatten) { - guard strategy == .flatten else { - fatalError("TODO: adjust creator methods") - } - self.strategy = strategy - } - } -} - -extension CaptureStructure.Constructor { - public mutating func alternating( - _ children: C - ) -> CaptureStructure where C.Element: _TreeNode { - return children.map { - $0._captureStructure(&self) - }.reduce(.empty, +) - .map(CaptureStructure.optional) - } - public mutating func concatenating( - _ children: C - ) -> CaptureStructure where C.Element: _TreeNode { - return children.map { - $0._captureStructure(&self) - }.reduce(.empty, +) - } - - public mutating func grouping( - _ child: T, - as kind: AST.Group.Kind - ) -> CaptureStructure { - switch kind { - case .capture: - return capturing(child) - case .namedCapture(let name): - return capturing(name: name.value, child) - case .balancedCapture(let b): - return capturing(name: b.name?.value, child) - default: - precondition(!kind.isCapturing) - return child._captureStructure(&self) - } - } - - public mutating func capturing( - name: String? = nil, - _ child: T, - withType type: AnyType? 
= nil - ) -> CaptureStructure { - .atom(name: name, type: type) - + child._captureStructure(&self) - } - - // TODO: We'll likely want/need a generalization of - // conditional's condition kind. - public mutating func condition( - _ condition: AST.Conditional.Condition.Kind, - trueBranch: T, - falseBranch: T - ) -> CaptureStructure { - // A conditional's capture structure is effectively that of an alternation - // between the true and false branches. However the condition may also - // have captures in the case of a group condition. - var captures = CaptureStructure.empty - switch condition { - case .group(let g): - captures = captures + AST.Node.group(g)._captureStructure(&self) - default: - break - } - let branchCaptures = trueBranch._captureStructure(&self) + - falseBranch._captureStructure(&self) - return captures + branchCaptures.map(CaptureStructure.optional) - } - - public mutating func quantifying( - _ child: T, amount: AST.Quantification.Amount - ) -> CaptureStructure { - let result = child._captureStructure(&self) - return amount.bounds.atLeast == 0 - ? result.map(CaptureStructure.optional) : result - } - - // TODO: Will need to adjust for DSLTree support, and - // "absent" isn't the best name for these. - public mutating func absent( - _ kind: AST.AbsentFunction.Kind - ) -> CaptureStructure { - // Only the child of an expression absent function is relevant, as the - // other expressions don't actually get matched against. - switch kind { - case .expression(_, _, let child): - return child._captureStructure(&self) - case .clearer, .repeater, .stopper: - return .empty - } - } - -} - -extension AST.Node { - public func _captureStructure( - _ constructor: inout CaptureStructure.Constructor - ) -> CaptureStructure { - guard constructor.strategy == .flatten else { - fatalError("TODO") - } - - // Note: This implementation could be more optimized. 
- switch self { - case let .alternation(a): - return constructor.alternating(a.children) - - case let .concatenation(c): - return constructor.concatenating(c.children) - - case let .group(g): - return constructor.grouping(g.child, as: g.kind.value) - - case .conditional(let c): - return constructor.condition( - c.condition.kind, - trueBranch: c.trueBranch, - falseBranch: c.falseBranch) - - case .quantification(let q): - return constructor.quantifying( - q.child, amount: q.amount.value) - - case .absentFunction(let abs): - return constructor.absent(abs.kind) - - case .quote, .trivia, .atom, .customCharacterClass, .empty: - return .empty - } - } -} - -// MARK: - Combination and transformation - -extension CaptureStructure { - /// Returns a capture structure by concatenating any tuples in `self` and - /// `other`. - func concatenating(with other: CaptureStructure) -> CaptureStructure { - switch (self, other) { - // (T...) + (U...) ==> (T..., U...) - case let (.tuple(lhs), .tuple(rhs)): - return .tuple(lhs + rhs) - // T + () ==> T - case (_, .tuple(let rhs)) where rhs.isEmpty: - return self - // () + T ==> T - case (.tuple(let lhs), _) where lhs.isEmpty: - return other - // (T...) + U ==> (T..., U) - case let (.tuple(lhs), _): - return .tuple(lhs + [other]) - // T + (U...) ==> (T, U...) - case let (_, .tuple(rhs)): - return .tuple([self] + rhs) - // T + U ==> (T, U) - default: - return .tuple([self, other]) - } - } - - static func + ( - lhs: CaptureStructure, rhs: CaptureStructure - ) -> CaptureStructure { - lhs.concatenating(with: rhs) - } - - /// Returns a capture structure by transforming any tuple element of `self` - /// or transforming `self` directly if it is not a tuple. 
- func map( - _ transform: (CaptureStructure) -> CaptureStructure - ) -> CaptureStructure { - if case .tuple(let children) = self { - return .tuple(children.map(transform)) - } - return transform(self) - } -} - // MARK: - Common properties extension CaptureStructure { /// Returns a Boolean indicating whether the structure does not contain any /// captures. - public var isEmpty: Bool { + private var isEmpty: Bool { if case .tuple(let elements) = self, elements.isEmpty { return true } return false } - - public func type(withAtomType atomType: Any.Type) -> Any.Type { - switch self { - case .atom(_, type: nil): - return atomType - case .atom(_, type: let type?): - return type.base - case .optional(let child): - return TypeConstruction.optionalType(of: child.type(withAtomType: atomType)) - case .tuple(let children): - return TypeConstruction.tupleType(of: children.map { - $0.type(withAtomType: atomType) - }) - } - } - - public typealias DefaultAtomType = Substring - - public var type: Any.Type { - type(withAtomType: DefaultAtomType.self) - } - - public var atomType: AnyType { - switch self { - case .atom(_, type: nil): - return .init(Substring.self) - case .atom(_, type: let type?): - return type - case .optional(let child): - return child.atomType - case .tuple: - fatalError("Recursive nesting has no single atom type") - } - - } } // MARK: - Serialization @@ -280,7 +57,7 @@ extension CaptureStructure { private typealias SerializationVersion = UInt16 private static let currentSerializationVersion: SerializationVersion = 1 - public static func serializationBufferSize( + static func serializationBufferSize( forInputUTF8CodeUnitCount inputUTF8CodeUnitCount: Int ) -> Int { MemoryLayout.stride + inputUTF8CodeUnitCount + 1 @@ -302,7 +79,7 @@ extension CaptureStructure { /// /// - Parameter buffer: A buffer whose byte count is at least the byte count /// of the regular expression string that produced this capture structure. 
- public func encode(to buffer: UnsafeMutableRawBufferPointer) { + func encode(to buffer: UnsafeMutableRawBufferPointer) { assert(!buffer.isEmpty, "Buffer must not be empty") assert( buffer.count >= @@ -361,7 +138,7 @@ extension CaptureStructure { /// Creates a capture structure by decoding a serialized representation from /// the given buffer. - public init?(decoding buffer: UnsafeRawBufferPointer) { + init?(decoding buffer: UnsafeRawBufferPointer) { var scopes: [[CaptureStructure]] = [[]] var currentScope: [CaptureStructure] { get { scopes[scopes.endIndex - 1] } @@ -415,13 +192,13 @@ extension CaptureStructure { } extension CaptureStructure: CustomStringConvertible { - public var description: String { + var description: String { var printer = PrettyPrinter() _print(&printer) return printer.finish() } - private func _print(_ printer: inout PrettyPrinter) { + func _print(_ printer: inout PrettyPrinter) { switch self { case let .atom(name, type): let name = name ?? "" @@ -445,10 +222,41 @@ extension CaptureStructure: CustomStringConvertible { } } -extension CaptureStructure.Constructor { - public enum Strategy { - case flatten - case nest - // case drop(after: Int)... +extension AST { + /// The capture structure of this AST for compiler communication. 
+ var captureStructure: CaptureStructure { + root._captureList._captureStructure(nestOptionals: true) + } +} + +// MARK: Convert CaptureList into CaptureStructure + +extension CaptureList { + func _captureStructure(nestOptionals: Bool) -> CaptureStructure { + if captures.isEmpty { return .empty } + if captures.count == 1 { + return captures.first!._captureStructure(nestOptionals: nestOptionals) + } + return .tuple(captures.map { + $0._captureStructure(nestOptionals: nestOptionals) + }) + } +} + +extension CaptureList.Capture { + func _captureStructure(nestOptionals: Bool) -> CaptureStructure { + if optionalDepth == 0 { + if let ty = type { + return .atom(name: name, type: .init(ty)) + } + return .atom(name: name) + } + var copy = self + copy.optionalDepth = 0 + var base = copy._captureStructure(nestOptionals: false) + for _ in 0..<(nestOptionals ? optionalDepth : 1) { + base = .optional(base) + } + return base } } diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 911312121..5cc920063 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -32,8 +32,8 @@ extension Source { static private func classifyGeneralCategory( _ str: String ) -> Unicode.ExtendedGeneralCategory? { - // This uses the aliases defined in - // https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt. + // This uses the aliases defined in https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt. + // Additionally, uses the `L& = Lc` alias defined by PCRE. 
withNormalizedForms(str) { str in switch str { case "c", "other": return .other @@ -43,7 +43,7 @@ extension Source { case "co", "privateuse": return .privateUse case "cs", "surrogate": return .surrogate case "l", "letter": return .letter - case "lc", "casedletter": return .casedLetter + case "lc", "l&", "casedletter": return .casedLetter case "ll", "lowercaseletter": return .lowercaseLetter case "lm", "modifierletter": return .modifierLetter case "lo", "otherletter": return .otherLetter diff --git a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift new file mode 100644 index 000000000..0856361d8 --- /dev/null +++ b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift @@ -0,0 +1,115 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +// The version number for the regex. This gets emitted as an argument to the +// Regex(_regexString:version:) initializer and should be bumped if the format +// of the regex string needs to be changed in such a way that requires the runtime +// to be updated. +public let currentRegexLiteralFormatVersion = 1 + +@_spi(CompilerInterface) +public struct CompilerLexError: Error { + public var message: String + public var location: UnsafeRawPointer + public var completelyErroneous: Bool +} + +/// Interface for the Swift compiler. +/// +/// Attempt to lex a regex literal string. +/// +/// - Parameters: +/// - start: The pointer at which to start lexing the literal. +/// - bufferEnd: A pointer to the end of the buffer, which should not be lexed +/// past.
+/// - mustBeRegex: Whether we expect a regex literal to be lexed here. If +/// `false`, a regex literal will only be lexed if it does not +/// produce an error. +/// +/// - Returns: If a regex literal was lexed, `resumePtr` specifies where to +/// resume lexing and `error` specifies a lexing error to emit. If +/// a regex literal was not lexed, `nil` is returned. +/// +@_spi(CompilerInterface) +public func swiftCompilerLexRegexLiteral( + start: UnsafeRawPointer, bufferEnd: UnsafeRawPointer, mustBeRegex: Bool +) -> (resumePtr: UnsafeRawPointer, error: CompilerLexError?)? { + do { + let (_, _, endPtr) = try lexRegex(start: start, end: bufferEnd) + return (resumePtr: endPtr, error: nil) + } catch let error as DelimiterLexError { + if !mustBeRegex { + // This token can be something else. Let the client fallback. + return nil + } + let completelyErroneous: Bool + switch error.kind { + case .unterminated, .multilineClosingNotOnNewline: + // These can be recovered from. + completelyErroneous = false + case .unprintableASCII, .invalidUTF8: + // We don't currently have good recovery behavior for these. + completelyErroneous = true + case .unknownDelimiter: + // An unknown delimiter should be recovered from, as we may want to try + // lex something else. + return nil + } + // For now every lexer error is emitted at the starting delimiter. + let compilerError = CompilerLexError( + message: "\(error)", location: start, + completelyErroneous: completelyErroneous + ) + return (error.resumePtr, compilerError) + } catch { + fatalError("Should be a DelimiterLexError") + } +} + +@_spi(CompilerInterface) +public struct CompilerParseError: Error { + public var message: String + public var location: String.Index? +} + +/// Interface for the Swift compiler. +/// +/// Attempt to parse a regex literal string. +/// +/// - Parameters: +/// - input: The regex input string, including delimiters. 
+/// - captureBufferOut: A buffer into which the captures of the regex will +/// be encoded upon a successful parse. +/// +/// - Returns: The string to emit along with its version number. +/// - Throws: `CompilerParseError` if there was a parsing error. +@_spi(CompilerInterface) +public func swiftCompilerParseRegexLiteral( + _ input: String, captureBufferOut: UnsafeMutableRawBufferPointer +) throws -> (regexToEmit: String, version: Int) { + do { + let ast = try parseWithDelimiters(input) + // Serialize the capture structure for later type inference. + assert(captureBufferOut.count >= input.utf8.count) + ast.captureStructure.encode(to: captureBufferOut) + + // For now we just return the input as the regex to emit. This could be + // changed in the future if we need to back-deploy syntax to something already + // known to the matching engine, or otherwise change the format. Note + // however that it will need plumbing through on the compiler side. + return (regexToEmit: input, version: currentRegexLiteralFormatVersion) + } catch { + throw CompilerParseError( + message: "cannot parse regular expression: \(String(describing: error))", + location: (error as?
LocatedErrorProtocol)?.location.start + ) + } +} diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 9633b607e..c2cce67e8 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -616,7 +616,7 @@ extension Source { case "i": return advanceAndReturn(.caseInsensitive) case "J": return advanceAndReturn(.allowDuplicateGroupNames) case "m": return advanceAndReturn(.multiline) - case "n": return advanceAndReturn(.noAutoCapture) + case "n": return advanceAndReturn(.namedCapturesOnly) case "s": return advanceAndReturn(.singleLine) case "U": return advanceAndReturn(.reluctantByDefault) case "x": @@ -914,6 +914,10 @@ extension Source { } // TODO: (name:) + // If (?n) is set, a bare (...) group is non-capturing. + if context.syntax.contains(.namedCapturesOnly) { + return .nonCapture + } return .capture } } diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift deleted file mode 100644 index 56294e2d3..000000000 --- a/Sources/_RegexParser/Regex/Parse/Mocking.swift +++ /dev/null @@ -1,128 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -@available(*, deprecated, message: "moving to SwiftCompilerModules") -private func copyCString(_ str: String) -> UnsafePointer { - let count = str.utf8.count + 1 - return str.withCString { - assert($0[count-1] == 0) - let ptr = UnsafeMutablePointer.allocate(capacity: count) - ptr.initialize(from: $0, count: count) - return UnsafePointer(ptr) - } -} - -/// Interface for libswift. -/// -/// Attempt to lex a regex literal string. -/// -/// - Parameters: -/// - CurPtrPtr: A pointer to the current pointer of lexer, which should be -/// the start of the literal. This will be advanced to the point -/// at which the lexer should resume, or will remain the same if -/// this is not a regex literal. -/// - BufferEnd: A pointer to the end of the buffer, which should not be lexed -/// past. -/// - ErrorOut: If an error is encountered, this will be set to the error -/// string. -/// -/// - Returns: A bool indicating whether lexing was completely erroneous, and -/// cannot be recovered from, or false if there either was no error, -/// or there was a recoverable error. -@available(*, deprecated, message: "moving to SwiftCompilerModules") -func libswiftLexRegexLiteral( - _ curPtrPtr: UnsafeMutablePointer?>?, - _ bufferEndPtr: UnsafePointer?, - _ errOut: UnsafeMutablePointer?>? 
-) -> /*CompletelyErroneous*/ CBool { - guard let curPtrPtr = curPtrPtr, let inputPtr = curPtrPtr.pointee, - let bufferEndPtr = bufferEndPtr - else { - fatalError("Expected lexing pointers") - } - guard let errOut = errOut else { fatalError("Expected error out param") } - - do { - let (_, _, endPtr) = try lexRegex(start: inputPtr, end: bufferEndPtr) - curPtrPtr.pointee = endPtr.assumingMemoryBound(to: CChar.self) - return false - } catch let error as DelimiterLexError { - if error.kind == .unknownDelimiter { - // An unknown delimiter should be recovered from, as we may want to try - // lex something else. - return false - } - errOut.pointee = copyCString("\(error)") - curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) - - switch error.kind { - case .unterminated, .multilineClosingNotOnNewline: - // These can be recovered from. - return false - case .unprintableASCII, .invalidUTF8: - // We don't currently have good recovery behavior for these. - return true - case .unknownDelimiter: - fatalError("Already handled") - } - } catch { - fatalError("Should be a DelimiterLexError") - } -} - -// The version number for the regex. This gets emitted as an argument to the -// Regex(_regexString:version:) initializer and should be bumped if the format -// of the regex string needs to be changed in such a that requires the runtime -// to updated. -public let currentRegexLiteralFormatVersion: CUnsignedInt = 1 - -/// Interface for libswift. -/// -/// - Parameters: -/// - inputPtr: A null-terminated C string. -/// - errOut: A buffer accepting an error string upon error. -/// - versionOut: A buffer accepting a regex literal format -/// version. -/// - captureStructureOut: A buffer accepting a byte sequence representing the -/// capture structure. -/// - captureStructureSize: The size of the capture structure buffer. Must be -/// greater than or equal to `strlen(inputPtr)`. 
-@available(*, deprecated, message: "moving to SwiftCompilerModules") -func libswiftParseRegexLiteral( - _ inputPtr: UnsafePointer?, - _ errOut: UnsafeMutablePointer?>?, - _ versionOut: UnsafeMutablePointer?, - _ captureStructureOut: UnsafeMutableRawPointer?, - _ captureStructureSize: CUnsignedInt -) { - guard let s = inputPtr else { fatalError("Expected input param") } - guard let errOut = errOut else { fatalError("Expected error out param") } - guard let versionOut = versionOut else { - fatalError("Expected version out param") - } - - versionOut.pointee = currentRegexLiteralFormatVersion - - let str = String(cString: s) - do { - let ast = try parseWithDelimiters(str) - // Serialize the capture structure for later type inference. - if let captureStructureOut = captureStructureOut { - assert(captureStructureSize >= str.utf8.count) - let buffer = UnsafeMutableRawBufferPointer( - start: captureStructureOut, count: Int(captureStructureSize)) - ast.captureStructure.encode(to: buffer) - } - } catch { - errOut.pointee = copyCString( - "cannot parse regular expression: \(String(describing: error))") - } -} diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index ec6e1c26c..54e46948a 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -287,23 +287,34 @@ extension Parser { private mutating func applySyntaxOptions( of opts: AST.MatchingOptionSequence ) { - // We skip this for multi-line, as extended syntax is always enabled there. 
- if context.syntax.contains(.multilineExtendedSyntax) { return } + func mapOption(_ option: SyntaxOptions, + _ pred: (AST.MatchingOption) -> Bool) { + if opts.resetsCurrentOptions { + context.syntax.remove(option) + } + if opts.adding.contains(where: pred) { + context.syntax.insert(option) + } + if opts.removing.contains(where: pred) { + context.syntax.remove(option) + } + } + func mapOption(_ option: SyntaxOptions, _ kind: AST.MatchingOption.Kind) { + mapOption(option, { $0.kind == kind }) + } + + // (?n) + mapOption(.namedCapturesOnly, .namedCapturesOnly) - // Check if we're introducing or removing extended syntax. + // (?x), (?xx) + // We skip this for multi-line, as extended syntax is always enabled there. // TODO: PCRE differentiates between (?x) and (?xx) where only the latter // handles non-semantic whitespace in a custom character class. Other // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, // treat (?x) and (?xx) as the same option here. If we ever get a strict // PCRE mode, we will need to change this to handle that. 
- if opts.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if opts.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if opts.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) + if !context.syntax.contains(.multilineExtendedSyntax) { + mapOption(.extendedSyntax, \.isAnyExtended) } } diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift index 0a6270f1b..dbfe5f2d6 100644 --- a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift +++ b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift @@ -63,6 +63,9 @@ public struct SyntaxOptions: OptionSet { return [Self(1 << 6), .extendedSyntax] } + /// `(?n)` + public static var namedCapturesOnly: Self { Self(1 << 7) } + /* /// `*` == `[[:digit:]]*` == `\d*` diff --git a/Sources/_RegexParser/Regex/Printing/PrettyPrinter.swift b/Sources/_RegexParser/Regex/Printing/PrettyPrinter.swift index bf379fc14..8ddcd73c7 100644 --- a/Sources/_RegexParser/Regex/Printing/PrettyPrinter.swift +++ b/Sources/_RegexParser/Regex/Printing/PrettyPrinter.swift @@ -40,6 +40,9 @@ public struct PrettyPrinter { // The indentation level fileprivate var indentLevel = 0 + + // The current default quantification behavior + public var quantificationBehavior: AST.Quantification.Kind = .eager } // MARK: - Raw interface diff --git a/Sources/_RegexParser/Regex/TreeProtocols.swift b/Sources/_RegexParser/Regex/TreeProtocols.swift index c14db65ce..7f1ccb5f7 100644 --- a/Sources/_RegexParser/Regex/TreeProtocols.swift +++ b/Sources/_RegexParser/Regex/TreeProtocols.swift @@ -2,10 +2,6 @@ public protocol _TreeNode { var children: [Self]? 
{ get } - - func _captureStructure( - _: inout CaptureStructure.Constructor - ) -> CaptureStructure } extension _TreeNode { diff --git a/Sources/_RegexParser/Utility/Misc.swift b/Sources/_RegexParser/Utility/Misc.swift index bd9bc665e..d37dfbd4a 100644 --- a/Sources/_RegexParser/Utility/Misc.swift +++ b/Sources/_RegexParser/Utility/Misc.swift @@ -167,7 +167,7 @@ extension BinaryInteger { } /// A wrapper of an existential metatype, equatable and hashable by reference. -public struct AnyType: Equatable, Hashable { +public struct AnyType: Hashable { public var base: Any.Type public init(_ type: Any.Type) { @@ -182,3 +182,5 @@ public struct AnyType: Equatable, Hashable { hasher.combine(ObjectIdentifier(base)) } } + + diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 96476f42b..47faa23ed 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -28,7 +28,7 @@ class Compiler { __consuming func emit() throws -> Program { // TODO: Handle global options var codegen = ByteCodeGen(options: options) - codegen.builder.captureStructure = tree.captureStructure + codegen.builder.captureList = tree.root._captureList try codegen.emitNode(tree.root) let program = try codegen.finish() return program diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 356b7cc4b..a44c2c876 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -691,8 +691,9 @@ extension Unicode.ExtendedGeneralCategory { ]) case .casedLetter: - throw Unsupported( - "TODO: cased letter? 
not the property?") + return consumeScalarGCs([ + .uppercaseLetter, .lowercaseLetter, .titlecaseLetter + ]) case .control: return consumeScalarGC(.control) diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 2b38ace0a..cae8194bd 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -38,9 +38,7 @@ extension MEProgram where Input.Element: Hashable { // Special addresses or instructions var failAddressToken: AddressToken? = nil - // TODO: Should we have better API for building this up - // as we compile? - var captureStructure: CaptureStructure = .empty + var captureList = CaptureList() // Symbolic reference resolution var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:] @@ -353,7 +351,7 @@ extension MEProgram.Builder { staticTransformFunctions: transformFunctions, staticMatcherFunctions: matcherFunctions, registerInfo: regInfo, - captureStructure: captureStructure, + captureList: captureList, referencedCaptureOffsets: referencedCaptureOffsets, namedCaptureOffsets: namedCaptureOffsets) } diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index 807598637..e3a542c1e 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -142,7 +142,7 @@ extension Processor._StoredCapture: CustomStringConvertible { } } -struct CaptureList { +struct MECaptureList { var values: Array._StoredCapture> var referencedCaptureOffsets: [ReferenceID: Int] var namedCaptureOffsets: [String: Int] diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index 0bfa0ecba..8f1c721b0 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -34,7 +34,7 @@ struct MEProgram where Input.Element: Equatable { var 
enableTracing: Bool = false - let captureStructure: CaptureStructure + let captureList: CaptureList let referencedCaptureOffsets: [ReferenceID: Int] let namedCaptureOffsets: [String: Int] } diff --git a/Sources/_StringProcessing/Engine/Structuralize.swift b/Sources/_StringProcessing/Engine/Structuralize.swift index 12d2e1242..a8cfeb20c 100644 --- a/Sources/_StringProcessing/Engine/Structuralize.swift +++ b/Sources/_StringProcessing/Engine/Structuralize.swift @@ -1,78 +1,21 @@ @_implementationOnly import _RegexParser -extension CaptureStructure { - var optionalCount: Int { - switch self { - case .atom: return 0 - case .optional(let o): - return 1 + o.optionalCount - case .tuple: - // FIXME: Separate CaptureStructure and a component - fatalError("Recursive nesting") - @unknown default: - fatalError("Unknown default") - } - } - - // FIXME: Do it all in one pass, no need for all these - // intermediary arrays +extension CaptureList { func structuralize( - _ list: CaptureList, + _ list: MECaptureList, _ input: String - ) throws -> [StructuredCapture] { - - func mapCap( - _ cap: CaptureStructure, - _ storedCap: Processor._StoredCapture - ) -> StructuredCapture { - // TODO: CaptureList perhaps should store a - // metatype or relevant info... - let optCount = cap.optionalCount - - if cap.atomType.base == Substring.self { - // FIXME: What if a typed capture is Substring? 
- assert(!storedCap.hasValues) - - if let r = storedCap.latest { - return StructuredCapture( - optionalCount: optCount, - storedCapture: StoredCapture(range: r)) - } + ) -> [StructuredCapture] { + assert(list.values.count == captures.count) - return StructuredCapture( - optionalCount: optCount, - storedCapture: nil) - } + var result = [StructuredCapture]() + for (cap, meStored) in zip(self.captures, list.values) { + let stored = StoredCapture( + range: meStored.latest, value: meStored.latestValue) - guard (storedCap.isEmpty || storedCap.hasValues) else { - print(storedCap) - fatalError() - } - // TODO: assert types are the same, under all the - // optionals - - if let v = storedCap.latestValue { - return StructuredCapture( - optionalCount: optCount, - storedCapture: StoredCapture(range: storedCap.latest, value: v)) - } - return StructuredCapture( - optionalCount: optCount, - storedCapture: nil) - } - - switch self { - case let .tuple(values): - assert(list.values.count == values.count) - var result = Array() - for (cap, storedCap) in zip(values, list.values) { - result.append(mapCap(cap, storedCap)) - } - return result - - default: - assert(list.values.count == 1) - return [mapCap(self, list.values.first!)] + result.append(.init( + optionalCount: cap.optionalDepth, storedCapture: stored)) } + return result } } + diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 6ebb93f5c..e44b110e5 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -35,15 +35,13 @@ struct Executor { return nil } - let capList = CaptureList( + let capList = MECaptureList( values: cpu.storedCaptures, referencedCaptureOffsets: engine.program.referencedCaptureOffsets, namedCaptureOffsets: engine.program.namedCaptureOffsets) - let capStruct = engine.program.captureStructure let range = inputRange.lowerBound.. 
0 { + printer.output(",") + } + + printer.terminateLine() + } + + for (i, member) in nonCharMembers.enumerated() { + printer.printAsPattern(member) + + if i != nonCharMembers.count - 1 { + printer.output(",") + } + + printer.terminateLine() + } + } + + if terminateLine { + print(")") + } else { + indent() + output(")") } } @@ -194,73 +328,324 @@ extension PrettyPrinter { ) { switch member { case let .custom(ccc): - printAsPattern(ccc) + printAsPattern(ccc, terminateLine: false) + case let .range(lhs, rhs): - if case let .char(lhs) = lhs, - case let .char(rhs) = rhs { + if let lhs = lhs._patternBase(&self), let rhs = rhs._patternBase(&self) { indent() - output(String(lhs)._quoted) + output("(") + output(lhs) output("...") - output(String(rhs)._quoted) - terminateLine() - } else { - print("// TODO: Range \(lhs) to \(rhs)") + output(rhs) + output(")") } + case let .atom(a): - if case let .char(c) = a { - print(String(c)._quoted) + indent() + switch a { + case let .char(c): + output(".anyOf(\(String(c)._quoted))") + case let .scalar(s): + output(".anyOf(\"\\u{\(String(s.value, radix: 16))}\")") + case let .unconverted(a): + output(a.ast._patternBase) + default: + print(" // TODO: Atom \(a)") + } + + case .quotedLiteral(let s): + output(".anyOf(\(s._quoted))") + + case .trivia(_): + // We never print trivia + break + + case .intersection(let first, let second): + printAsPattern(first) + printIndented { printer in + printer.indent() + printer.output(".intersection(") + printer.printAsPattern(second, terminateLine: false) + printer.output(")") + } + + case .subtraction(let first, let second): + printAsPattern(first) + printIndented { printer in + printer.indent() + printer.output(".subtracting(") + printer.printAsPattern(second, terminateLine: false) + printer.output(")") + } + + case .symmetricDifference(let first, let second): + printAsPattern(first) + printIndented { printer in + printer.indent() + printer.output(".symmetricDifference(") + 
printer.printAsPattern(second, terminateLine: false) + printer.output(")") + } + } + } + + mutating func printAsRegex( + _ ccc: DSLTree.CustomCharacterClass, + asFullRegex: Bool = true, + terminateLine: Bool = true + ) { + indent() + + if asFullRegex { + output("#/") + } + + output("[") + + if ccc.isInverted { + output("^") + } + + for member in ccc.members { + printAsRegex(member) + } + + output("]") + + if asFullRegex { + if terminateLine { + print("/#") } else { - print(" // TODO: Atom \(a) ") + output("/#") } + } + } + + mutating func printAsRegex(_ member: DSLTree.CustomCharacterClass.Member) { + switch member { + case let .custom(ccc): + printAsRegex(ccc, terminateLine: false) + + case let .range(lhs, rhs): + output(lhs._regexBase) + output("-") + output(rhs._regexBase) + + case let .atom(a): + switch a { + case let .char(c): + output(String(c)) + case let .unconverted(a): + output(a.ast._regexBase) + default: + print(" // TODO: Atom \(a)") + } + case .quotedLiteral(let s): - print("// TODO: quote \(s._quoted) in custom character classes (should we split it?)") - case .trivia(let t): - // TODO: We might want to output comments... - _ = t - case .symmetricDifference, .intersection, .subtraction: - print("// TODO: Set operation: \(member)") + output("\\Q\(s)\\E") + + case .trivia(_): + // We never print trivia + break + + case .intersection(let first, let second): + printAsRegex(first, asFullRegex: false, terminateLine: false) + output("&&") + printAsRegex(second, asFullRegex: false, terminateLine: false) + + case .subtraction(let first, let second): + printAsRegex(first, asFullRegex: false, terminateLine: false) + output("--") + printAsRegex(second, asFullRegex: false, terminateLine: false) + + case .symmetricDifference(let first, let second): + printAsRegex(first, asFullRegex: false, terminateLine: false) + output("~~") + printAsRegex(second, asFullRegex: false, terminateLine: false) } } } extension String { // TODO: Escaping? 
- fileprivate var _quoted: String { "\"\(self)\"" } + fileprivate var _quoted: String { + "\"\(self.replacing("\"", with: "\\\""))\"" + } } extension AST.Atom.AssertionKind { // TODO: Some way to integrate this with conversion... var _patternBase: String { switch self { - case .startOfSubject: return "Anchor(.startOfSubject)" - case .endOfSubject: return "Anchor(.endOfSubject)" - case .textSegment: return "Anchor(.textSegment)" - case .notTextSegment: return "Anchor(.notTextSegment)" - case .startOfLine: return "Anchor(.startOfLine)" - case .endOfLine: return "Anchor(.endOfLine)" - case .wordBoundary: return "Anchor(.wordBoundary)" - case .notWordBoundary: return "Anchor(.notWordBoundary)" - - case .resetStartOfMatch: - return "Anchor(.resetStartOfMatch)" + case .startOfLine: + return "Anchor.startOfLine" + case .endOfLine: + return "Anchor.endOfLine" + case .wordBoundary: + return "Anchor.wordBoundary" + case .notWordBoundary: + return "Anchor.wordBoundary.inverted" + case .startOfSubject: + return "Anchor.startOfSubject" + case .endOfSubject: + return "Anchor.endOfSubject" case .endOfSubjectBeforeNewline: - return "Anchor(.endOfSubjectBeforeNewline)" + return "Anchor.endOfSubjectBeforeNewline" + case .textSegment: + return "Anchor.textSegmentBoundary" + case .notTextSegment: + return "Anchor.textSegmentBoundary.inverted" case .firstMatchingPositionInSubject: - return "Anchor(.firstMatchingPositionInSubject)" + return "Anchor.firstMatchingPositionInSubject" + + case .resetStartOfMatch: + return "TODO: Assertion resetStartOfMatch" } } } extension AST.Atom.CharacterProperty { - // TODO: Some way to integrate this with conversion... - var _patternBase: String { - "Property(\(kind._patternBase)\(isInverted ? 
", inverted: true" : ""))" + var isUnprintableProperty: Bool { + switch kind { + case .ascii: + return true + case .binary(let b, value: _): + return isUnprintableBinary(b) + case .generalCategory(let gc): + return isUnprintableGeneralCategory(gc) + case .posix(let p): + return isUnprintablePOSIX(p) + case .script(_), .scriptExtension(_): + return true + default: + return false + } + } + + func isUnprintableBinary(_ binary: Unicode.BinaryProperty) -> Bool { + // List out the ones we can print because that list is smaller. + switch binary { + case .whitespace: + return false + default: + return true + } + } + + func isUnprintableGeneralCategory( + _ gc: Unicode.ExtendedGeneralCategory + ) -> Bool { + // List out the ones we can print because that list is smaller. + switch gc { + case .decimalNumber: + return false + default: + return true + } + } + + func isUnprintablePOSIX(_ posix: Unicode.POSIXProperty) -> Bool { + // List out the ones we can print because that list is smaller. + switch posix { + case .xdigit: + return false + case .word: + return false + default: + return true + } } } -extension AST.Atom.CharacterProperty.Kind { + +extension AST.Atom.CharacterProperty { // TODO: Some way to integrate this with conversion... var _patternBase: String { - "/* TODO: character properties */" + if isUnprintableProperty { + return _regexBase + } + + return _dslBase + } + + var _dslBase: String { + switch kind { + case .binary(let bp, _): + switch bp { + case .whitespace: + return ".whitespace" + default: + return "" + } + + case .generalCategory(let gc): + switch gc { + case .decimalNumber: + return ".digit" + default: + return "" + } + + case .posix(let p): + switch p { + case .xdigit: + return ".hexDigit" + case .word: + return ".word" + default: + return "" + } + + default: + return "" + } + } + + var _regexBase: String { + switch kind { + case .ascii: + return "[:\(isInverted ? 
"^" : "")ascii:]" + + case .binary(let b, value: _): + if isInverted { + return "[^\\p{\(b.rawValue)}]" + } else { + return "\\p{\(b.rawValue)}" + } + + case .generalCategory(let gc): + if isInverted { + return "[^\\p{\(gc.rawValue)}]" + } else { + return "\\p{\(gc.rawValue)}" + } + + case .posix(let p): + return "[:\(isInverted ? "^" : "")\(p.rawValue):]" + + case .script(let s): + return "[:\(isInverted ? "^" : "")script=\(s.rawValue):]" + + case .scriptExtension(let s): + return "[:\(isInverted ? "^" : "")scx=\(s.rawValue):]" + + default: + return " // TODO: Property \(self)" + } + } +} + +extension AST.Atom { + var isUnprintableAtom: Bool { + switch kind { + case .keyboardControl, .keyboardMeta, .keyboardMetaControl: + return true + case .namedCharacter(_): + return true + case .property(let p): + return p.isUnprintableProperty + default: + return false + } } } @@ -278,6 +663,14 @@ extension AST.Atom { return anchor._patternBase } + if isUnprintableAtom { + return _regexBase + } + + return _dslBase + } + + var _dslBase: String { switch kind { case let .char(c): return String(c) @@ -287,21 +680,62 @@ extension AST.Atom { return "\\u{\(hex)}" case let .property(p): - return p._patternBase - + return p._dslBase + case let .escaped(e): - // TODO: API names instead of case names - return ".\(e)" - - case .keyboardControl: - return " /* TODO: keyboard control */" - - case .keyboardMeta: - return " /* TODO: keyboard meta */" - - case .keyboardMetaControl: - return " /* TODO: keyboard meta-control */" - + switch e { + // Anchors + case .wordBoundary: + return "Anchor.wordBoundary" + case .notWordBoundary: + return "Anchor.wordBoundary.inverted" + case .startOfSubject: + return "Anchor.startOfSubject" + case .endOfSubject: + return "Anchor.endOfSubject" + case .endOfSubjectBeforeNewline: + return "Anchor.endOfSubjectBeforeNewline" + case .firstMatchingPositionInSubject: + return "Anchor.firstMatchingPositionInSubject" + case .textSegment: + return 
"Anchor.textSegmentBoundary" + case .notTextSegment: + return "Anchor.textSegmentBoundary.inverted" + + // Character Classes + case .decimalDigit: + return ".digit" + case .notDecimalDigit: + return ".digit.inverted" + case .horizontalWhitespace: + return ".horizontalWhitespace" + case .notHorizontalWhitespace: + return ".horizontalWhitespace.inverted" + case .whitespace: + return ".whitespace" + case .notWhitespace: + return ".whitespace.inverted" + case .wordCharacter: + return ".word" + case .notWordCharacter: + return ".word.inverted" + case .graphemeCluster: + return ".anyGraphemeCluster" + case .newlineSequence: + return ".newlineSequence" + case .notNewline: + return ".newlineSequence.inverted" + case .verticalTab: + return ".verticalWhitespace" + case .notVerticalTab: + return ".verticalWhitespace.inverted" + + // Literal single characters all get converted into DSLTree.Atom.scalar + + default: + return "TODO: escaped \(e)" + } + case .namedCharacter: return " /* TODO: named character */" @@ -325,51 +759,61 @@ extension AST.Atom { case .changeMatchingOptions: return "/* TODO: change matching options */" + + // Every other case we've already decided cannot be represented inside the + // DSL. 
+ default: + return "" } } -} - -extension AST.Group.Kind { - var _patternBase: String { - switch self { - case .capture: - // TODO: We probably want this to be a property after group - return ".capture" - - case .namedCapture(let n): - return "name: \"\(n)\"" - - case .balancedCapture: - return "/* TODO: balanced captures */" - - case .nonCapture: return "" - - case .nonCaptureReset: - return "/* TODO: non-capture reset */" - - case .atomicNonCapturing: - return "/* TODO: atomicNonCapturing */" - case .lookahead: - return "/* TODO: lookahead */" - case .negativeLookahead: - return "/* TODO: negativeLookahead */" - case .nonAtomicLookahead: - return "/* TODO: nonAtomicLookahead */" - case .lookbehind: - return "/* TODO: lookbehind */" - case .negativeLookbehind: - return "/* TODO: negativeLookbehind */" - case .nonAtomicLookbehind: - return "/* TODO: nonAtomicLookbehind */" - case .scriptRun: - return "/* TODO: scriptRun */" - case .atomicScriptRun: - return "/* TODO: atomicScriptRun */" + + var _regexBase: String { + switch kind { + case let .char(c): + return String(c) + + case let .scalar(s): + let hex = String(s.value, radix: 16, uppercase: true) + return "\\u{\(hex)}" + + case let .property(p): + return p._regexBase + + case let .escaped(e): + return "\\\(e.character)" + + case .keyboardControl(let k): + return "\\c\(k)" + + case .keyboardMeta(let k): + return "\\M-\(k)" + + case .keyboardMetaControl(let k): + return "\\M-\\C-\(k)" + + case .namedCharacter(let n): + return "\\N{\(n)}" + + case .any: + return "." 
+ + case .startOfLine, .endOfLine: + fatalError("unreachable") + + case .backreference: + return " /* TODO: back reference */" + + case .subpattern: + return " /* TODO: subpattern */" + + case .callout: + return " /* TODO: callout */" + + case .backtrackingDirective: + return " /* TODO: backtracking directive */" + case .changeMatchingOptions: - return "/* TODO: changeMatchingOptions */" - - @unknown default: - fatalError() + return "/* TODO: change matching options */" } } } @@ -379,11 +823,11 @@ extension AST.Quantification.Amount { switch self { case .zeroOrMore: return "ZeroOrMore" case .oneOrMore: return "OneOrMore" - case .zeroOrOne: return "ZeroOrOne" - case let .exactly(n): return "Quantitified(exactly: \(n))" - case let .nOrMore(n): return "Quantified(\(n)...)" - case let .upToN(n): return "Quantified(...\(n))" - case let .range(n, m): return "Quantified(\(n)...\(m))" + case .zeroOrOne: return "Optionally" + case let .exactly(n): return "Repeat(count: \(n.value))" + case let .nOrMore(n): return "Repeat(\(n.value)...)" + case let .upToN(n): return "Repeat(...\(n.value))" + case let .range(n, m): return "Repeat(\(n.value)...\(m.value))" } } } @@ -403,3 +847,142 @@ extension DSLTree.QuantificationKind { (ast ?? 
.eager)._patternBase } } + +extension DSLTree.CustomCharacterClass.Member { + /// Whether this member contains an atom or nested class that is flagged as + /// unprintable (see `AST.Atom.isUnprintableAtom`). + var isUnprintableMember: Bool { + switch self { + case .atom(.unconverted(let a)): + return a.ast.isUnprintableAtom + case .custom(let c): + return c.hasUnprintableProperty + case .range(.unconverted(let lhs), .unconverted(let rhs)): + // Both bounds get the same test; either one can make the range unprintable. + return lhs.ast.isUnprintableAtom || rhs.ast.isUnprintableAtom + case .intersection(let first, let second): + return first.hasUnprintableProperty || second.hasUnprintableProperty + case .subtraction(let first, let second): + return first.hasUnprintableProperty || second.hasUnprintableProperty + case .symmetricDifference(let first, let second): + return first.hasUnprintableProperty || second.hasUnprintableProperty + default: + return false + } + } +} + +extension DSLTree.CustomCharacterClass { + var hasUnprintableProperty: Bool { + members.contains { + $0.isUnprintableMember + } + } +} + +extension DSLTree.Atom { + func _patternBase(_ printer: inout PrettyPrinter) -> String? { + switch self { + case .any: + return ".any" + + case let .char(c): + return String(c)._quoted + + case let .scalar(s): + let hex = String(s.value, radix: 16, uppercase: true) + return "\\u{\(hex)}"._quoted + + case let .unconverted(a): + if a.ast.isUnprintableAtom { + return "#/\(a.ast._regexBase)/#" + } else { + return a.ast._dslBase + } + + case .assertion(let a): + return a.ast._patternBase + + case .backreference(_): + return "/* TOOD: backreferences */" + + case .symbolicReference: + return "/* TOOD: symbolic references */" + + case .changeMatchingOptions(let matchingOptions): + for add in matchingOptions.ast.adding { + switch add.kind { + case .reluctantByDefault: + printer.quantificationBehavior = .reluctant + default: + break + } + } + } + + return nil + } + + var _regexBase: String { + switch self { + case .any: + return "."
+ + case let .char(c): + return String(c) + + case let .scalar(s): + let hex = String(s.value, radix: 16, uppercase: true) + // Regex syntax takes the bare escape; quoting is only for Swift string literals. + return "\\u{\(hex)}" + + case let .unconverted(a): + return a.ast._regexBase + + case .assertion: + return "/* TODO: assertions */" + case .backreference: + return "/* TOOD: backreferences */" + case .symbolicReference: + return "/* TOOD: symbolic references */" + case .changeMatchingOptions(let matchingOptions): + var result = "" + + for add in matchingOptions.ast.adding { + switch add.kind { + case .reluctantByDefault: + result += "(?U)" + default: + break + } + } + + return result + } + } +} + +extension DSLTree.Node { + /// Collects the names of the named captures in this subtree, outermost first. + func getNamedCaptures() -> [String] { + var result: [String] = [] + + switch self { + case .capture(let name, _, let child): + if let name = name { + result.append(name) + } + // A capture can itself wrap further named captures. + result += child.getNamedCaptures() + + case .concatenation(let nodes), .orderedChoice(let nodes): + for node in nodes { + result += node.getNamedCaptures() + } + + case .nonCapturingGroup(_, let node): + result += node.getNamedCaptures() + + case .convertedRegexLiteral(let node, _): + result += node.getNamedCaptures() + + case .quantification(_, _, let node): + result += node.getNamedCaptures() + + default: + break + } + + return result + } +} diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 23222da00..00fc2e952 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -62,6 +62,7 @@ public struct AnyRegexOutput { /// The depth of `Optioals`s wrapping the underlying value. For example, /// `Substring` has optional depth `0`, and `Int??` has optional depth `2`. let optionalDepth: Int + /// The bounds of the output element. let bounds: Range? } @@ -90,7 +91,7 @@ extension AnyRegexOutput { /// - Parameter type: The expected output type. /// - Returns: The output, if the underlying value can be converted to the /// output type; otherwise `nil`. - public func `as`(_ type: Output.Type) -> Output? 
{ + public func `as`(_ type: Output.Type = Output.self) -> Output? { let elements = _elements.map { StructuredCapture( optionalCount: $0.optionalDepth, @@ -206,23 +207,30 @@ extension Regex.Match where Output == AnyRegexOutput { /// - Parameter type: The expected output type. /// - Returns: A match generic over the output type, if the underlying values /// can be converted to the output type; otherwise, `nil`. - public func `as`(_ type: Output.Type) -> Regex.Match? { + public func `as`( + _ type: Output.Type = Output.self + ) -> Regex.Match? { fatalError("FIXME: Not implemented") } } @available(SwiftStdlib 5.7, *) -extension Regex where Output == AnyRegexOutput { +extension Regex { /// Returns whether a named-capture with `name` exists public func contains(captureNamed name: String) -> Bool { - fatalError("FIXME: not implemented") + program.tree.root._captureList.captures.contains(where: { + $0.name == name + }) } +} +@available(SwiftStdlib 5.7, *) +extension Regex where Output == AnyRegexOutput { /// Creates a type-erased regex from an existing regex. /// /// Use this initializer to fit a regex with strongly typed captures into the /// use site of a dynamic regex, i.e. one that was created from a string. - public init(_ match: Regex) { + public init(_ regex: Regex) { fatalError("FIXME: Not implemented") } @@ -231,7 +239,9 @@ extension Regex where Output == AnyRegexOutput { /// - Parameter type: The expected output type. /// - Returns: A regex generic over the output type if the underlying types can be converted. /// Returns `nil` otherwise. - public func `as`(_ type: Output.Type) -> Regex? { + public func `as`( + _ type: Output.Type = Output.self + ) -> Regex? 
{ fatalError("FIXME: Not implemented") } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 52eaeffb0..b279c08e4 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -343,13 +343,6 @@ extension DSLTree.Node { } } -extension DSLTree { - var captureStructure: CaptureStructure { - // TODO: nesting - var constructor = CaptureStructure.Constructor(.flatten) - return _Tree(root)._captureStructure(&constructor) - } -} extension DSLTree.Node { /// For typed capture-producing nodes, the type produced. var valueCaptureType: AnyType? { @@ -458,6 +451,88 @@ public struct CaptureTransform: Hashable, CustomStringConvertible { // These wrapper types are required because even @_spi-marked public APIs can't // include symbols from implementation-only dependencies. +extension DSLTree.Node { + func _addCaptures( + to list: inout CaptureList, + optionalNesting nesting: Int + ) { + let addOptional = nesting+1 + switch self { + case let .orderedChoice(children): + for child in children { + child._addCaptures(to: &list, optionalNesting: addOptional) + } + + case let .concatenation(children): + for child in children { + child._addCaptures(to: &list, optionalNesting: nesting) + } + + case let .capture(name, _, child): + list.append(.init( + name: name, + type: child.valueCaptureType?.base, + optionalDepth: nesting)) + child._addCaptures(to: &list, optionalNesting: nesting) + + case let .nonCapturingGroup(kind, child): + assert(!kind.ast.isCapturing) + child._addCaptures(to: &list, optionalNesting: nesting) + + case let .conditional(cond, trueBranch, falseBranch): + switch cond.ast { + case .group(let g): + AST.Node.group(g)._addCaptures(to: &list, optionalNesting: nesting) + default: + break + } + + trueBranch._addCaptures(to: &list, optionalNesting: addOptional) + falseBranch._addCaptures(to: &list, optionalNesting: addOptional) + + + case let 
.quantification(amount, _, child): + var optNesting = nesting + if amount.ast.bounds.atLeast == 0 { + optNesting += 1 + } + child._addCaptures(to: &list, optionalNesting: optNesting) + + case let .regexLiteral(re): + return re.ast._addCaptures(to: &list, optionalNesting: nesting) + + case let .absentFunction(abs): + switch abs.ast.kind { + case .expression(_, _, let child): + child._addCaptures(to: &list, optionalNesting: nesting) + case .clearer, .repeater, .stopper: + break + @unknown default: + fatalError() + } + + case let .convertedRegexLiteral(n, _): + return n._addCaptures(to: &list, optionalNesting: nesting) + + case .matcher: + break + + case .transform(_, let child): + child._addCaptures(to: &list, optionalNesting: nesting) + + case .customCharacterClass, .atom, .trivia, .empty, + .quotedLiteral, .consumer, .characterPredicate: + break + } + } + + var _captureList: CaptureList { + var list = CaptureList() + self._addCaptures(to: &list, optionalNesting: 0) + return list + } +} + extension DSLTree { /// Presents a wrapped version of `DSLTree.Node` that can provide an internal /// `_TreeNode` conformance. 
@@ -494,60 +569,6 @@ extension DSLTree { return abs.ast.children.map(\.dslTreeNode).map(_Tree.init) } } - - func _captureStructure( - _ constructor: inout CaptureStructure.Constructor - ) -> CaptureStructure { - switch node { - case let .orderedChoice(children): - return constructor.alternating(children.map(_Tree.init)) - - case let .concatenation(children): - return constructor.concatenating(children.map(_Tree.init)) - - case let .capture(name, _, child): - if let type = child.valueCaptureType { - return constructor.capturing( - name: name, _Tree(child), withType: type) - } - return constructor.capturing(name: name, _Tree(child)) - - case let .nonCapturingGroup(kind, child): - assert(!kind.ast.isCapturing) - return constructor.grouping(_Tree(child), as: kind.ast) - - case let .conditional(cond, trueBranch, falseBranch): - return constructor.condition( - cond.ast, - trueBranch: _Tree(trueBranch), - falseBranch: _Tree(falseBranch)) - - case let .quantification(amount, _, child): - return constructor.quantifying( - Self(child), amount: amount.ast) - - case let .regexLiteral(re): - // TODO: Force a re-nesting? - return re.ast._captureStructure(&constructor) - - case let .absentFunction(abs): - return constructor.absent(abs.ast.kind) - - case let .convertedRegexLiteral(n, _): - // TODO: Switch nesting strategy? 
- return Self(n)._captureStructure(&constructor) - - case .matcher: - return .empty - - case .transform(_, let child): - return Self(child)._captureStructure(&constructor) - - case .customCharacterClass, .atom, .trivia, .empty, - .quotedLiteral, .consumer, .characterPredicate: - return .empty - } - } } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 3e8f8e9e8..8172e993b 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -189,11 +189,11 @@ extension BidirectionalCollection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) extension RegexComponent { - /*public*/ static func ~=(regex: Self, input: String) -> Bool { + public static func ~=(regex: Self, input: String) -> Bool { input.wholeMatch(of: regex) != nil } - /*public*/ static func ~=(regex: Self, input: Substring) -> Bool { + public static func ~=(regex: Self, input: Substring) -> Bool { input.wholeMatch(of: regex) != nil } } diff --git a/Tests/RegexBuilderTests/AlgorithmsTests.swift b/Tests/RegexBuilderTests/AlgorithmsTests.swift index 0a2e6bc21..173d41598 100644 --- a/Tests/RegexBuilderTests/AlgorithmsTests.swift +++ b/Tests/RegexBuilderTests/AlgorithmsTests.swift @@ -104,6 +104,66 @@ class RegexConsumerTests: XCTestCase { result: "9+16, 3, 10, 99+1") ) } + + func testSwitches() { + // Failure cases + do { + switch "abcde" { + case Regex { + "a" + ZeroOrMore(.any) + "f" + }: + XCTFail() + + case OneOrMore { CharacterClass.whitespace }: + XCTFail() + + case "abc": + XCTFail() + + case Regex { + "a" + "b" + "c" + }: + XCTFail() + + default: + break + } + } + // Success cases + do { + let input = "abcde" + + switch input { + case Regex { + "a" + ZeroOrMore(.any) + "e" + }: + break + + default: + XCTFail() + } + + guard case Regex({ + "a" + ZeroOrMore(.any) + "e" + }) = input else { + XCTFail() + return + } + + guard case OneOrMore(.word) = input else { + 
XCTFail() + return + } + } + } } class AlgorithmsResultBuilderTests: XCTestCase { diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index b646f16f7..5673aa348 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -115,7 +115,7 @@ class RegexDSLTests: XCTestCase { { let disallowedChars = CharacterClass.hexDigit .symmetricDifference("a"..."z") - Lookahead(disallowedChars, negative: true) // No: 0-9 + g-z + NegativeLookahead(disallowedChars) // No: 0-9 + g-z OneOrMore(("b"..."g").union("d"..."n")) // b-n @@ -487,7 +487,7 @@ class RegexDSLTests: XCTestCase { { OneOrMore("a") Lookahead(CharacterClass.digit) - Lookahead("2", negative: true) + NegativeLookahead { "2" } CharacterClass.word } } @@ -570,6 +570,7 @@ class RegexDSLTests: XCTestCase { } let _: (Substring, Substring, Substring).Type = type(of: regex1).RegexOutput.self + let regex2 = Regex { OneOrMore("a") Capture { @@ -581,6 +582,7 @@ class RegexDSLTests: XCTestCase { } let _: (Substring, Substring, Int?).Type = type(of: regex2).RegexOutput.self + let regex3 = Regex { OneOrMore("a") Capture { @@ -593,6 +595,7 @@ class RegexDSLTests: XCTestCase { } let _: (Substring, Substring, Int, Double?).Type = type(of: regex3).RegexOutput.self + let regex4 = Regex { OneOrMore("a") Capture { @@ -739,43 +742,6 @@ class RegexDSLTests: XCTestCase { } } - func testDynamicCaptures() throws { - do { - let regex = try Regex("aabcc.") - let line = "aabccd" - let match = try XCTUnwrap(line.wholeMatch(of: regex)) - XCTAssertEqual(match.0, line[...]) - let output = match.output - XCTAssertEqual(output[0].substring, line[...]) - } - do { - let regex = try Regex( - #""" - (?[0-9A-F]+)(?:\.\.(?[0-9A-F]+))?\s+;\s+(?\w+).* - """#) - let line = """ - A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ - COMBINING MARK TUKWENTIS - """ - let match = try XCTUnwrap(line.wholeMatch(of: regex)) - XCTAssertEqual(match.0, 
line[...]) - let output = match.output - XCTAssertEqual(output[0].substring, line[...]) - XCTAssertTrue(output[1].substring == "A6F0") - XCTAssertTrue(output["lower"]?.substring == "A6F0") - XCTAssertTrue(output[2].substring == "A6F1") - XCTAssertTrue(output["upper"]?.substring == "A6F1") - XCTAssertTrue(output[3].substring == "Extend") - XCTAssertTrue(output["desc"]?.substring == "Extend") - let typedOutput = try XCTUnwrap(output.as( - (Substring, lower: Substring, upper: Substring?, Substring).self)) - XCTAssertEqual(typedOutput.0, line[...]) - XCTAssertTrue(typedOutput.lower == "A6F0") - XCTAssertTrue(typedOutput.upper == "A6F1") - XCTAssertTrue(typedOutput.3 == "Extend") - } - } - func testBackreference() throws { try _testDSLCaptures( ("abc#41#42abcabcabc", ("abc#41#42abcabcabc", "abc", 42, "abc", nil)), diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index 0965559ad..d63333c01 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -334,4 +334,18 @@ class AlgorithmTests: XCTestCase { ["aa"]) } + func testSwitches() { + switch "abcde" { + case try! Regex("a.*f"): + XCTFail() + case try! Regex("abc"): + XCTFail() + + case try! 
Regex("a.*e"): + break // success + + default: + XCTFail() + } + } } diff --git a/Tests/RegexTests/AnyRegexOutputTests.swift b/Tests/RegexTests/AnyRegexOutputTests.swift new file mode 100644 index 000000000..8d91c0ec8 --- /dev/null +++ b/Tests/RegexTests/AnyRegexOutputTests.swift @@ -0,0 +1,157 @@ + +import _StringProcessing +import XCTest + +// Test that our existential capture and concrete captures are +// the same +private func checkSame( + _ aro: AnyRegexOutput, + _ concrete: (Substring, fieldA: Substring, fieldB: Substring) +) { + XCTAssertEqual(aro[0].substring, concrete.0) + + XCTAssertEqual(aro["fieldA"]!.substring, concrete.1) + XCTAssertEqual(aro["fieldA"]!.substring, concrete.fieldA) + + XCTAssertEqual(aro[1].substring, concrete.1) + + XCTAssertEqual(aro["fieldB"]!.substring, concrete.2) + XCTAssertEqual(aro["fieldB"]!.substring, concrete.fieldB) + + XCTAssertEqual(aro[2].substring, concrete.2) + +} +private func checkSame( + _ aro: Regex.Match, + _ concrete: Regex<(Substring, fieldA: Substring, fieldB: Substring)>.Match +) { + checkSame(aro.output, concrete.output) + + XCTAssertEqual(aro.0, concrete.0) + XCTAssertEqual(aro[0].substring, concrete.0) + + XCTAssertEqual(aro["fieldA"]!.substring, concrete.1) + XCTAssertEqual(aro["fieldA"]!.substring, concrete.fieldA) + XCTAssertEqual(aro[1].substring, concrete.1) + + XCTAssertEqual(aro["fieldB"]!.substring, concrete.2) + XCTAssertEqual(aro["fieldB"]!.substring, concrete.fieldB) + XCTAssertEqual(aro[2].substring, concrete.2) +} +private func checkSame( + _ aro: Regex, + _ concrete: Regex<(Substring, fieldA: Substring, fieldB: Substring)> +) { + XCTAssertEqual( + aro.contains(captureNamed: "fieldA"), + concrete.contains(captureNamed: "fieldA")) + XCTAssertEqual( + aro.contains(captureNamed: "fieldB"), + concrete.contains(captureNamed: "fieldB")) + XCTAssertEqual( + aro.contains(captureNamed: "notAField"), + concrete.contains(captureNamed: "notAField")) +} + +extension RegexTests { + func testAnyRegexOutput() 
{ + let regex = try! Regex(#""" + (?x) + (? [^,]*) + , + (? [^,]*) + """#) + + let match = "abc,def".wholeMatch(of: regex)! + XCTAssertEqual(match.0, "abc,def") + XCTAssertEqual(match[0].substring, "abc,def") + + XCTAssertEqual(match["fieldA"]!.substring, "abc") + XCTAssertEqual(match.output["fieldA"]!.substring, "abc") + XCTAssertEqual(match[1].substring, "abc") + + XCTAssertEqual(match["fieldB"]!.substring, "def") + XCTAssertEqual(match.output["fieldB"]!.substring, "def") + XCTAssertEqual(match[2].substring, "def") + + XCTAssertNil(match["notACapture"]) + XCTAssertNil(match.output["notACapture"]) + XCTAssertEqual(match.count, 3) + + XCTAssert(regex.contains(captureNamed: "fieldA")) + XCTAssert(regex.contains(captureNamed: "fieldB")) + XCTAssertFalse(regex.contains(captureNamed: "notAField")) + + // MARK: Check equivalence with concrete + + let regexConcrete: + Regex<(Substring, fieldA: Substring, fieldB: Substring)> + = try! Regex(#""" + (?x) + (? [^,]*) + , + (? [^,]*) + """#) + checkSame(regex, regexConcrete) + + let matchConcrete = "abc,def".wholeMatch(of: regexConcrete)! + checkSame(match, matchConcrete) + + let output = match.output + let concreteOutput = matchConcrete.output + checkSame(output, concreteOutput) + + // TODO: ARO init from concrete match tuple + + let concreteOutputCasted = output.as( + (Substring, fieldA: Substring, fieldB: Substring).self + )! + checkSame(output, concreteOutputCasted) + + var concreteOutputCopy = concreteOutput + concreteOutputCopy = output.as()! 
+ checkSame(output, concreteOutputCopy) + + // TODO: Regex.Match: init from tuple match and as to tuple match + + // TODO: Regex: init from tuple regex and as cast to tuple regex + + } + + func testDynamicCaptures() throws { + do { + let regex = try Regex("aabcc.") + let line = "aabccd" + let match = try XCTUnwrap(line.wholeMatch(of: regex)) + XCTAssertEqual(match.0, line[...]) + let output = match.output + XCTAssertEqual(output[0].substring, line[...]) + } + do { + let regex = try Regex( + #""" + (?[0-9A-F]+)(?:\.\.(?[0-9A-F]+))?\s+;\s+(?\w+).* + """#) + let line = """ + A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ + COMBINING MARK TUKWENTIS + """ + let match = try XCTUnwrap(line.wholeMatch(of: regex)) + XCTAssertEqual(match.0, line[...]) + let output = match.output + XCTAssertEqual(output[0].substring, line[...]) + XCTAssertTrue(output[1].substring == "A6F0") + XCTAssertTrue(output["lower"]?.substring == "A6F0") + XCTAssertTrue(output[2].substring == "A6F1") + XCTAssertTrue(output["upper"]?.substring == "A6F1") + XCTAssertTrue(output[3].substring == "Extend") + XCTAssertTrue(output["desc"]?.substring == "Extend") + let typedOutput = try XCTUnwrap(output.as( + (Substring, lower: Substring, upper: Substring?, Substring).self)) + XCTAssertEqual(typedOutput.0, line[...]) + XCTAssertTrue(typedOutput.lower == "A6F0") + XCTAssertTrue(typedOutput.upper == "A6F1") + XCTAssertTrue(typedOutput.3 == "Extend") + } + } +} diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 7d4266071..b48e1f0a5 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -11,7 +11,42 @@ import XCTest @testable @_spi(RegexBuilder) import _StringProcessing -import _RegexParser +@testable import _RegexParser + + +extension CaptureList.Capture { + static var cap: Self { + return Self(optionalDepth: 0) + } + + static var opt: Self { + return Self(optionalDepth: 1) + } + static var opt_opt: Self { + return 
Self(optionalDepth: 2) + } + static var opt_opt_opt: Self { + return Self(optionalDepth: 3) + } + static var opt_opt_opt_opt: Self { + return Self(optionalDepth: 4) + } + static var opt_opt_opt_opt_opt: Self { + return Self(optionalDepth: 5) + } + static var opt_opt_opt_opt_opt_opt: Self { + return Self(optionalDepth: 6) + } + + static func named(_ name: String) -> Self { + return Self(name: name, optionalDepth: 0) + } +} +extension CaptureList { + static func caps(count: Int) -> Self { + Self(Array(repeating: .cap, count: count)) + } +} extension StructuredCapture { func formatStringCapture(input: String) -> String { @@ -109,36 +144,35 @@ func compile(_ ast: AST) -> Executor { func captureTest( _ regex: String, - _ expected: CaptureStructure, + _ expected: CaptureList, _ tests: (input: String, output: [StringCapture])..., skipEngine: Bool = false, file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(regex, .traditional) - let capStructure = ast.captureStructure - guard capStructure == expected else { + let capList = ast.root._captureList + guard capList == expected else { XCTFail(""" - Expected: - \(expected) - Actual: - \(capStructure) - """, - file: file, - line: line) + Expected: + \(expected) + Actual: + \(capList) + """, + file: file, + line: line) return } // Ensure DSLTree preserves literal captures - let dslCapStructure = ast.dslTree.captureStructure - guard dslCapStructure == capStructure else { + let dslCapList = ast.dslTree.root._captureList + guard dslCapList == capList else { XCTFail(""" DSLTree did not preserve structure: AST: - \(capStructure) + \(capList) DSLTree: - \(dslCapStructure) + \(dslCapList) """, file: file, line: line) @@ -192,168 +226,150 @@ extension RegexTests { func testLiteralStructuredCaptures() throws { captureTest( "abc", - .empty, + [], ("abc", [])) captureTest( "a(b)c", - .atom(), + [.cap], ("abc", ["b"])) captureTest( "a(b*)c", - .atom(), + [.cap], ("abc", ["b"]), ("ac", [""]), ("abbc", ["bb"])) 
captureTest( "a(b)*c", - .optional(.atom()), + [.opt], ("abc", [.some("b")]), ("ac", [.none]), ("abbc", [.some("b")])) captureTest( "a(b)+c", - .atom(), + [.cap], ("abc", ["b"]), ("abbc", ["b"])) captureTest( "a(b)?c", - .optional(.atom()), + [.opt], ("ac", [.none]), ("abc", [.some("b")])) captureTest( "(a)(b)(c)", - .tuple([.atom(),.atom(),.atom()]), + [.cap, .cap, .cap], ("abc", ["a", "b", "c"])) captureTest( "a|(b)", - .optional(.atom()), + [.opt], ("a", [.none]), ("b", [.some("b")])) captureTest( "(a)|(b)", - .tuple(.optional(.atom()), .optional(.atom())), + [.opt, .opt], ("a", [.some("a"), .none]), ("b", [.none, .some("b")])) captureTest( "((a)|(b))", - .tuple(.atom(), .optional(.atom()), .optional(.atom())), + [.cap, .opt, .opt], ("a", ["a", .some("a"), .none]), ("b", ["b", .none, .some("b")])) captureTest( "((a)|(b))?", - .tuple( - .optional(.atom()), - .optional(.optional(.atom())), - .optional(.optional(.atom()))), + [.opt, .opt_opt, .opt_opt], ("a", [.some("a"), .some(.some("a")), .some(.none)]), ("b", [.some("b"), .some(.none), .some(.some("b"))])) + // FIXME captureTest( "((a)|(b))*", - .tuple( - .optional(.atom()), - .optional(.optional(.atom())), - .optional(.optional(.atom()))), + [.opt, .opt_opt, .opt_opt], ("a", [.some("a"), .some(.some("a")), .some(.none)]), skipEngine: true) + // FIXME captureTest( "((a)|(b))+", - .tuple( - .atom(), - .optional(.atom()), - .optional(.atom())), + [.cap, .opt, .opt], // TODO: test cases skipEngine: true) + // FIXME captureTest( "(((a)|(b))*)", - .tuple( - .atom(), - .optional(.atom()), - .optional(.optional(.atom())), - .optional(.optional(.atom()))), + [.cap, .opt, .opt_opt, .opt_opt], // TODO: test cases skipEngine: true) - + // FIXME captureTest( "(((a)|(b))?)", - .tuple( - .atom(), - .optional(.atom()), - .optional(.optional(.atom())), - .optional(.optional(.atom()))), + [.cap, .opt, .opt_opt, .opt_opt], // TODO: test cases skipEngine: true) captureTest( "(a)", - .atom(), + [.cap], ("a", ["a"])) captureTest( 
"((a))", - .tuple([.atom(), .atom()]), + [.cap, .cap], ("a", ["a", "a"])) captureTest( "(((a)))", - .tuple([.atom(), .atom(), .atom()]), + [.cap, .cap, .cap], ("a", ["a", "a", "a"])) - - // broke + // FIXME captureTest( "((((a)*)?)*)?", - .tuple([ - .optional(.atom()), - .optional(.optional(.atom())), - .optional(.optional(.optional(.atom()))), - .optional(.optional(.optional(.optional(.atom())))), - ]), + [.opt, .opt_opt, .opt_opt_opt, .opt_opt_opt_opt], // TODO: test cases skipEngine: true) - captureTest( "a|(b*)", - .optional(.atom()), + [.opt], ("a", [.none]), ("", [.some("")]), ("b", [.some("b")]), ("bbb", [.some("bbb")])) + // FIXME captureTest( "a|(b)*", - .optional(.optional(.atom())), + [.opt_opt], ("a", [.none]), ("", [.some("")]), ("b", [.some("b")]), ("bbb", [.some("b")]), skipEngine: true) + // FIXME captureTest( "a|(b)+", - .optional(.atom()), + [.opt], ("a", [.none]), ("b", [.some("b")]), ("bbb", [.some("b")]), skipEngine: true) + // FIXME captureTest( "a|(b)?", - .optional(.optional(.atom())), + [.opt_opt], ("a", [.none]), ("", [.none]), ("b", [.some(.some("b"))]), @@ -361,78 +377,78 @@ extension RegexTests { captureTest( "a|(b|c)", - .optional(.atom()), + [.opt], ("a", [.none]), ("b", [.some("b")]), ("c", [.some("c")])) captureTest( "a|(b*|c)", - .optional(.atom()), + [.opt], ("a", [.none]), ("b", [.some("b")]), ("c", [.some("c")])) + // FIXME captureTest( "a|(b|c)*", - .optional(.optional(.atom())), + [.opt_opt], ("a", [.none]), ("", [.some("")]), ("b", [.some("b")]), ("bbb", [.some("b")]), skipEngine: true) + // FIXME captureTest( "a|(b|c)?", - .optional(.optional(.atom())), + [.opt_opt], ("a", [.none]), ("", [.none]), ("b", [.some(.some("b"))]), ("c", [.some(.some("c"))]), skipEngine: true) - captureTest( "a(b(c))", - .tuple(.atom(), .atom()), + [.cap, .cap], ("abc", ["bc", "c"])) captureTest( "a(b(c*))", - .tuple(.atom(), .atom()), + [.cap, .cap], ("ab", ["b", ""]), ("abc", ["bc", "c"]), ("abcc", ["bcc", "cc"])) captureTest( "a(b(c)*)", - 
.tuple(.atom(), .optional(.atom())), + [.cap, .opt], ("ab", ["b", .none]), ("abc", ["bc", .some("c")]), ("abcc", ["bcc", .some("c")])) captureTest( "a(b(c)?)", - .tuple(.atom(), .optional(.atom())), + [.cap, .opt], ("ab", ["b", .none]), ("abc", ["bc", .some("c")])) - captureTest( "a(b(c))*", - .tuple(.optional(.atom()), .optional(.atom())), + [.opt, .opt], ("a", [.none, .none]), ("abc", [.some("bc"), .some("c")]), ("abcbc", [.some("bc"), .some("c")])) captureTest( "a(b(c))?", - .tuple(.optional(.atom()), .optional(.atom())), + [.opt, .opt], ("a", [.none, .none]), ("abc", [.some("bc"), .some("c")])) -// TODO: "((a|b)*|c)*" -// TODO: "((a|b)|c)*" + // TODO: "((a|b)*|c)*" + // TODO: "((a|b)|c)*" } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 345e80e22..2c6b858cc 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -693,6 +693,14 @@ extension RegexTests { firstMatchTest(#"\p{gc=L}"#, input: "123abcXYZ", match: "a") firstMatchTest(#"\p{Lu}"#, input: "123abcXYZ", match: "X") + // U+0374 GREEK NUMERAL SIGN (Lm) + // U+00AA FEMININE ORDINAL INDICATOR (Lo) + firstMatchTest(#"\p{L}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "\u{0374}") + firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") + firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") + firstMatchTest(#"\p{L&}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") + firstMatchTest(#"\p{L&}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") + firstMatchTest( #"\P{Cc}"#, input: "\n\n\nXYZ", match: "X") firstMatchTest( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 94c134853..831f904c6 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -@testable import _RegexParser +@testable @_spi(CompilerInterface) import _RegexParser import 
XCTest @testable import _StringProcessing @@ -39,7 +39,7 @@ class RegexTests: XCTestCase {} func parseTest( _ input: String, _ expectedAST: AST.Node, syntax: SyntaxOptions = .traditional, - captures expectedCaptures: CaptureStructure = .empty, + captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { @@ -52,7 +52,7 @@ func parseTest( func parseTest( _ input: String, _ expectedAST: AST, syntax: SyntaxOptions = .traditional, - captures expectedCaptures: CaptureStructure = .empty, + captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { @@ -68,7 +68,7 @@ func parseTest( file: file, line: line) return } - let captures = ast.captureStructure + let captures = ast.captureList guard captures == expectedCaptures else { XCTFail(""" @@ -78,13 +78,16 @@ func parseTest( file: file, line: line) return } + // Test capture structure round trip serialization. + let capStruct = captures._captureStructure(nestOptionals: true) let serializedCapturesSize = CaptureStructure.serializationBufferSize( forInputUTF8CodeUnitCount: input.utf8.count) let serializedCaptures = UnsafeMutableRawBufferPointer.allocate( byteCount: serializedCapturesSize, alignment: MemoryLayout.alignment) - captures.encode(to: serializedCaptures) + + capStruct.encode(to: serializedCaptures) guard let decodedCaptures = CaptureStructure( decoding: UnsafeRawBufferPointer(serializedCaptures) ) else { @@ -95,7 +98,7 @@ func parseTest( """) return } - guard decodedCaptures == captures else { + guard decodedCaptures == capStruct else { XCTFail(""" Expected captures: \(expectedCaptures) @@ -281,24 +284,20 @@ func delimiterLexingDiagnosticTest( } } -func libswiftDiagnosticMessageTest( - _ input: String, _ expectedErr: String, file: StaticString = #file, - line: UInt = #line +func compilerInterfaceDiagnosticMessageTest( + _ input: String, _ expectedErr: String, + file: StaticString = #file, line: UInt = #line ) { - var errPtr: UnsafePointer? 
- var version: CUnsignedInt = 0 - - libswiftParseRegexLiteral( - input, &errPtr, &version, /*captureStructure*/ nil, - /*captureStructureSize*/ 0 - ) - - guard let errPtr = errPtr else { - XCTFail("Unexpected test pass", file: file, line: line) - return + do { + let captureBuffer = UnsafeMutableRawBufferPointer(start: nil, count: 0) + _ = try swiftCompilerParseRegexLiteral( + input, captureBufferOut: captureBuffer) + XCTFail("Expected parse error", file: file, line: line) + } catch let error as CompilerParseError { + XCTAssertEqual(expectedErr, error.message, file: file, line: line) + } catch { + fatalError("Expected CompilerParseError") } - let err = String(cString: errPtr) - XCTAssertEqual(expectedErr, err, file: file, line: line) } extension RegexTests { @@ -310,7 +309,7 @@ extension RegexTests { concat("a", "b", "c", "+", zeroOrMore(of: "d"))) parseTest( "a(b)", concat("a", capture("b")), - captures: .atom()) + captures: [.cap]) parseTest( "abc(?:de)+fghi*k|j", alt( @@ -336,15 +335,13 @@ extension RegexTests { concat( zeroOrMore(of: capture(atom(.any))), capture(zeroOrMore(of: atom(.any)))), - captures: .tuple([.optional(.atom()), .atom()])) + captures: [.opt, .cap]) parseTest( "((.))*((.)?)", concat( zeroOrMore(of: capture(capture(atom(.any)))), capture(zeroOrOne(of: capture(atom(.any))))), - captures: .tuple([ - .optional(.atom()), .optional(.atom()), .atom(), .optional(.atom()) - ])) + captures: [.opt, .opt, .cap, .opt]) parseTest( #"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) @@ -357,33 +354,33 @@ extension RegexTests { parseTest( "(a|b)c", concat(capture(alt("a", "b")), "c"), - captures: .atom()) + captures: [.cap]) parseTest( "(a)|b", alt(capture("a"), "b"), - captures: .optional(.atom())) + captures: [.opt]) parseTest( "(a)|(b)|c", alt(capture("a"), capture("b"), "c"), - captures: .tuple(.optional(.atom()), .optional(.atom()))) + captures: [.opt, .opt]) parseTest( "((a|b))c", concat(capture(capture(alt("a", "b"))), "c"), - captures: 
.tuple([.atom(), .atom()])) + captures: [.cap, .cap]) parseTest( "(?:((a|b)))*?c", concat(quant( .zeroOrMore, .reluctant, nonCapture(capture(capture(alt("a", "b"))))), "c"), - captures: .tuple(.optional(.atom()), .optional(.atom()))) + captures: [.opt, .opt]) parseTest( "(a)|b|(c)d", alt(capture("a"), "b", concat(capture("c"), "d")), - captures: .tuple([.optional(.atom()), .optional(.atom())])) + captures: [.opt, .opt]) // Alternations with empty branches are permitted. parseTest("|", alt(empty(), empty())) - parseTest("(|)", capture(alt(empty(), empty())), captures: .atom()) + parseTest("(|)", capture(alt(empty(), empty())), captures: [.cap]) parseTest("a|", alt("a", empty())) parseTest("|b", alt(empty(), "b")) parseTest("|b|", alt(empty(), "b", empty())) @@ -768,32 +765,32 @@ extension RegexTests { parseTest( #"a(?