From a8bea8d7d9a8611984db871df7f4130a4c0d9bf7 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 13 May 2022 07:43:40 -0500 Subject: [PATCH 1/3] Keep track of a regex's initial options The initial options are stored in the lowered program, and include all options that are set before the first attempted match. Note that not all initial options are global - a leading option-setting group is included in initial options, even though it applies only to a portion of the overall regex. --- Sources/_StringProcessing/ByteCodeGen.swift | 6 +++ .../_StringProcessing/Engine/MEBuilder.swift | 9 +++- .../_StringProcessing/Engine/MEProgram.swift | 2 + .../_StringProcessing/MatchingOptions.swift | 8 ++++ .../Regex/ASTConversion.swift | 10 +--- Sources/_StringProcessing/Regex/Core.swift | 2 +- Sources/_StringProcessing/Regex/DSLTree.swift | 4 +- Tests/RegexTests/CompileTests.swift | 47 +++++++++++++++++++ 8 files changed, 74 insertions(+), 14 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d30cab209..f19c85f4d 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -35,6 +35,9 @@ extension Compiler.ByteCodeGen { builder.buildUnresolvedReference(id: id) case let .changeMatchingOptions(optionSequence): + if !builder.hasReceivedInstructions { + builder.initialOptions.apply(optionSequence.ast) + } options.apply(optionSequence.ast) case let .unconverted(astAtom): @@ -379,6 +382,9 @@ extension Compiler.ByteCodeGen { throw Unreachable("These should produce a capture node") case .changeMatchingOptions(let optionSequence): + if !builder.hasReceivedInstructions { + builder.initialOptions.apply(optionSequence) + } options.apply(optionSequence) try emitNode(child) diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index cae8194bd..416583f7b 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ 
b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -39,6 +39,7 @@ extension MEProgram where Input.Element: Hashable { var failAddressToken: AddressToken? = nil var captureList = CaptureList() + var initialOptions = MatchingOptions() // Symbolic reference resolution var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:] @@ -77,6 +78,11 @@ extension MEProgram.Builder { var lastInstructionAddress: InstructionAddress { .init(instructions.endIndex - 1) } + + /// `true` if the builder has received any instructions. + var hasReceivedInstructions: Bool { + !instructions.isEmpty + } mutating func buildNop(_ r: StringRegister? = nil) { instructions.append(.init(.nop, .init(optionalString: r))) @@ -353,7 +359,8 @@ extension MEProgram.Builder { registerInfo: regInfo, captureList: captureList, referencedCaptureOffsets: referencedCaptureOffsets, - namedCaptureOffsets: namedCaptureOffsets) + namedCaptureOffsets: namedCaptureOffsets, + initialOptions: initialOptions) } mutating func reset() { self = Self() } diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index 8f1c721b0..5c3010a75 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -37,6 +37,8 @@ struct MEProgram where Input.Element: Equatable { let captureList: CaptureList let referencedCaptureOffsets: [ReferenceID: Int] let namedCaptureOffsets: [String: Int] + + var initialOptions: MatchingOptions } extension MEProgram: CustomStringConvertible { diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index c60f08d17..a5daa3f73 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -54,6 +54,13 @@ extension MatchingOptions { stack[stack.count - 1].apply(sequence) _invariantCheck() } + + // @testable + /// Returns true if the options at the top of `stack` are equal to 
those + /// for `other`. + func _equal(to other: MatchingOptions) -> Bool { + stack.last == other.stack.last + } } // MARK: Matching behavior API @@ -127,6 +134,7 @@ extension MatchingOptions { } } +// MARK: - Implementation extension MatchingOptions { /// An option that changes the behavior of a regular expression. fileprivate enum Option: Int { diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 79a515033..d025893b8 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -13,15 +13,7 @@ extension AST { var dslTree: DSLTree { - return DSLTree( - root.dslTreeNode, options: globalOptions?.dslTreeOptions) - } -} - -extension AST.GlobalMatchingOptionSequence { - var dslTreeOptions: DSLTree.Options { - // TODO: map options - return .init() + return DSLTree(root.dslTreeNode) } } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 29d2267b2..e6ac3b594 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -102,6 +102,6 @@ extension Regex { @_spi(RegexBuilder) public init(node: DSLTree.Node) { - self.program = Program(tree: .init(node, options: nil)) + self.program = Program(tree: .init(node)) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 91f102999..8ca6dce8d 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -14,11 +14,9 @@ @_spi(RegexBuilder) public struct DSLTree { var root: Node - var options: Options? - init(_ r: Node, options: Options?) 
{ + init(_ r: Node) { self.root = r - self.options = options } } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index eab46dca0..9e94a886a 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -88,4 +88,51 @@ extension RegexTests { try testCompilationEquivalence(row) } } + + func testCompileInitialOptions() throws { + func expectInitialOptions( + _ regex: Regex, + _ optionSequence: AST.MatchingOptionSequence, + file: StaticString = #file, + line: UInt = #line + ) throws { + var options = MatchingOptions() + options.apply(optionSequence) + + XCTAssertTrue( + regex.program.loweredProgram.initialOptions._equal(to: options), + file: file, line: line) + } + + func expectInitialOptions( + _ pattern: String, + _ optionSequence: AST.MatchingOptionSequence, + file: StaticString = #file, + line: UInt = #line + ) throws { + let regex = try Regex(pattern) + try expectInitialOptions(regex, optionSequence, file: file, line: line) + } + + try expectInitialOptions(".", matchingOptions()) + try expectInitialOptions("(?i)(?-i).", matchingOptions()) + + try expectInitialOptions("(?i).", matchingOptions(adding: [.caseInsensitive])) + try expectInitialOptions("(?i).(?-i)", matchingOptions(adding: [.caseInsensitive])) + + try expectInitialOptions( + "(?im)(?s).", + matchingOptions(adding: [.caseInsensitive, .multiline, .singleLine])) + try expectInitialOptions(".", matchingOptions()) + try expectInitialOptions( + "(?im)(?s).(?u)", + matchingOptions(adding: [.caseInsensitive, .multiline, .singleLine])) + + try expectInitialOptions( + "(?i:.)", + matchingOptions(adding: [.caseInsensitive])) + try expectInitialOptions( + "(?i:.)(?m:.)", + matchingOptions(adding: [.caseInsensitive])) + } } From 4706e092ba92843ab8c82102e2cfafdacf9686e7 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 13 May 2022 11:41:52 -0500 Subject: [PATCH 2/3] Search by unicode scalar when in that mode Previously, searching via firstMatch 
or matches(of:) would only _start_ searches at a character index, even when a regex has Unicode scalar semantics. --- Sources/_StringProcessing/Regex/Match.swift | 6 +++++- Tests/RegexTests/MatchTests.swift | 10 ++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 8172e993b..4215cfb80 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -158,7 +158,11 @@ extension Regex { if let m = try _match(input, in: low.. Date: Sun, 15 May 2022 21:52:40 -0500 Subject: [PATCH 3/3] Make accessing the initial options a little less gnarly --- Sources/_StringProcessing/Regex/Core.swift | 12 ++++++++++++ Sources/_StringProcessing/Regex/Match.swift | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index e6ac3b594..5d2101afe 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -91,6 +91,18 @@ extension Regex { self.tree = tree } } + + /// The set of matching options that applies to the start of this regex. + /// + /// Note that the initial options may not apply to the entire regex. 
For + /// example, in this regex, only case insensitivity (`i`) and Unicode scalar + /// semantics (set by API) apply to the entire regex, while ASCII character + /// classes (`P`) is part of `initialOptions` but not global: + /// + /// let regex = /(?i)(?P:\d+\s*)abc/.semanticLevel(.unicodeScalar) + var initialOptions: MatchingOptions { + program.loweredProgram.initialOptions + } } @available(SwiftStdlib 5.7, *) diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 7022fd55a..e2c6e1a87 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -159,7 +159,7 @@ extension Regex { return m } if low >= high { return nil } - if regex.program.loweredProgram.initialOptions.semanticLevel == .graphemeCluster { + if regex.initialOptions.semanticLevel == .graphemeCluster { input.formIndex(after: &low) } else { input.unicodeScalars.formIndex(after: &low)