diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d30cab209..f19c85f4d 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -35,6 +35,9 @@ extension Compiler.ByteCodeGen { builder.buildUnresolvedReference(id: id) case let .changeMatchingOptions(optionSequence): + if !builder.hasReceivedInstructions { + builder.initialOptions.apply(optionSequence.ast) + } options.apply(optionSequence.ast) case let .unconverted(astAtom): @@ -379,6 +382,9 @@ extension Compiler.ByteCodeGen { throw Unreachable("These should produce a capture node") case .changeMatchingOptions(let optionSequence): + if !builder.hasReceivedInstructions { + builder.initialOptions.apply(optionSequence) + } options.apply(optionSequence) try emitNode(child) diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index cae8194bd..416583f7b 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -39,6 +39,7 @@ extension MEProgram where Input.Element: Hashable { var failAddressToken: AddressToken? = nil var captureList = CaptureList() + var initialOptions = MatchingOptions() // Symbolic reference resolution var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:] @@ -77,6 +78,11 @@ extension MEProgram.Builder { var lastInstructionAddress: InstructionAddress { .init(instructions.endIndex - 1) } + + /// `true` if the builder has received any instructions. + var hasReceivedInstructions: Bool { + !instructions.isEmpty + } mutating func buildNop(_ r: StringRegister? 
= nil) { instructions.append(.init(.nop, .init(optionalString: r))) @@ -353,7 +359,8 @@ extension MEProgram.Builder { registerInfo: regInfo, captureList: captureList, referencedCaptureOffsets: referencedCaptureOffsets, - namedCaptureOffsets: namedCaptureOffsets) + namedCaptureOffsets: namedCaptureOffsets, + initialOptions: initialOptions) } mutating func reset() { self = Self() } diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index 8f1c721b0..5c3010a75 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -37,6 +37,8 @@ struct MEProgram where Input.Element: Equatable { let captureList: CaptureList let referencedCaptureOffsets: [ReferenceID: Int] let namedCaptureOffsets: [String: Int] + + var initialOptions: MatchingOptions } extension MEProgram: CustomStringConvertible { diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index c60f08d17..a5daa3f73 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -54,6 +54,13 @@ extension MatchingOptions { stack[stack.count - 1].apply(sequence) _invariantCheck() } + + // @testable + /// Returns true if the options at the top of `stack` are equal to those + /// for `other`. + func _equal(to other: MatchingOptions) -> Bool { + stack.last == other.stack.last + } } // MARK: Matching behavior API @@ -127,6 +134,7 @@ extension MatchingOptions { } } +// MARK: - Implementation extension MatchingOptions { /// An option that changes the behavior of a regular expression. 
fileprivate enum Option: Int { diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index a76a19607..320d10897 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -13,15 +13,7 @@ extension AST { var dslTree: DSLTree { - return DSLTree( - root.dslTreeNode, options: globalOptions?.dslTreeOptions) - } -} - -extension AST.GlobalMatchingOptionSequence { - var dslTreeOptions: DSLTree.Options { - // TODO: map options - return .init() + return DSLTree(root.dslTreeNode) } } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 29d2267b2..5d2101afe 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -91,6 +91,18 @@ extension Regex { self.tree = tree } } + + /// The set of matching options that applies to the start of this regex. + /// + /// Note that the initial options may not apply to the entire regex. 
For + /// example, in this regex, only case insensitivity (`i`) and Unicode scalar + /// semantics (set by API) apply to the entire regex, while ASCII character + /// classes (`P`) is part of `initialOptions` but not global: + /// + /// let regex = /(?i)(?P:\d+\s*)abc/.matchingSemantics(.unicodeScalar) + var initialOptions: MatchingOptions { + program.loweredProgram.initialOptions + } } @available(SwiftStdlib 5.7, *) @@ -102,6 +114,6 @@ extension Regex { @_spi(RegexBuilder) public init(node: DSLTree.Node) { - self.program = Program(tree: .init(node, options: nil)) + self.program = Program(tree: .init(node)) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 91f102999..8ca6dce8d 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -14,11 +14,9 @@ @_spi(RegexBuilder) public struct DSLTree { var root: Node - var options: Options? - init(_ r: Node, options: Options?) { + init(_ r: Node) { self.root = r - self.options = options } } diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 8c3aead21..e2c6e1a87 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -158,8 +158,12 @@ extension Regex { if let m = try _match(input, in: low..= high { return nil } + if regex.initialOptions.semanticLevel == .graphemeCluster { + input.formIndex(after: &low) + } else { + input.unicodeScalars.formIndex(after: &low) + } } } } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index eab46dca0..9e94a886a 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -88,4 +88,51 @@ extension RegexTests { try testCompilationEquivalence(row) } } + + func testCompileInitialOptions() throws { + func expectInitialOptions( + _ regex: Regex, + _ optionSequence: AST.MatchingOptionSequence, + file: 
StaticString = #file, + line: UInt = #line + ) throws { + var options = MatchingOptions() + options.apply(optionSequence) + + XCTAssertTrue( + regex.program.loweredProgram.initialOptions._equal(to: options), + file: file, line: line) + } + + func expectInitialOptions( + _ pattern: String, + _ optionSequence: AST.MatchingOptionSequence, + file: StaticString = #file, + line: UInt = #line + ) throws { + let regex = try Regex(pattern) + try expectInitialOptions(regex, optionSequence, file: file, line: line) + } + + try expectInitialOptions(".", matchingOptions()) + try expectInitialOptions("(?i)(?-i).", matchingOptions()) + + try expectInitialOptions("(?i).", matchingOptions(adding: [.caseInsensitive])) + try expectInitialOptions("(?i).(?-i)", matchingOptions(adding: [.caseInsensitive])) + + try expectInitialOptions( + "(?im)(?s).", + matchingOptions(adding: [.caseInsensitive, .multiline, .singleLine])) + try expectInitialOptions(".", matchingOptions()) + try expectInitialOptions( + "(?im)(?s).(?u)", + matchingOptions(adding: [.caseInsensitive, .multiline, .singleLine])) + + try expectInitialOptions( + "(?i:.)", + matchingOptions(adding: [.caseInsensitive])) + try expectInitialOptions( + "(?i:.)(?m:.)", + matchingOptions(adding: [.caseInsensitive])) + } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index ffedd0406..496e02658 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1618,5 +1618,15 @@ extension RegexTests { // TODO: Add test for grapheme boundaries at start/end of match + func testCase() { + let regex = try! 
Regex(#".\N{SPARKLING HEART}."#) + let input = "🧟‍♀️💖🧠 or 🧠💖☕️" + let characterMatches = input.matches(of: regex) + XCTAssertEqual(characterMatches.map { $0.0 }, ["🧟‍♀️💖🧠", "🧠💖☕️"]) + + let scalarMatches = input.matches(of: regex.matchingSemantics(.unicodeScalar)) + let scalarExpected: [Substring] = ["\u{FE0F}💖🧠", "🧠💖☕"] + XCTAssertEqual(scalarMatches.map { $0.0 }, scalarExpected) + } }