From a786546f1d855dfe295bce39c4521dc5a0d544cc Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:03 +0100 Subject: [PATCH 1/6] Coalesce adjacent scalars and characters in the DSL Previously we would emit a series of scalars written in the DSL as a series of individual characters in grapheme semantic mode. Change the behavior such that we coalesce any adjacent scalars and characters, including those in regex literals and nested concatenations. We then perform grapheme breaking over the result, and can emit character matches for scalars that coalesced into a grapheme. This transform subsumes a similar transform we performed for regex literals when converting them to a DSLTree. This has the nice side effect of allowing us to better preserve scalar syntax in the DSL transform. rdar://96942688 --- Sources/_StringProcessing/ByteCodeGen.swift | 39 +++++++- .../_StringProcessing/PrintAsPattern.swift | 96 +++++++++++++------ .../Regex/ASTConversion.swift | 63 +----------- Sources/_StringProcessing/Regex/DSLTree.swift | 8 ++ Sources/_StringProcessing/Utility/Misc.swift | 47 +++++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 77 ++++++++++++--- Tests/RegexTests/MatchTests.swift | 37 +++++++ Tests/RegexTests/RenderDSLTests.swift | 53 +++++++++- 8 files changed, 318 insertions(+), 102 deletions(-) create mode 100644 Sources/_StringProcessing/Utility/Misc.swift diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 477760ef8..da21ea26a 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -791,6 +791,41 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { + // Before emitting a concatenation, we need to flatten out any nested + // concatenations, and coalesce any adjacent characters and scalars, forming + // quoted literals of their contents, over which we can perform 
grapheme + // breaking. + func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { + switch node { + case .concatenation(let ch): + return ch.flatMap(flatten) + case .convertedRegexLiteral(let n, _): + return flatten(n) + default: + return [node] + } + } + let children = children + .flatMap(flatten) + .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in + switch node { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + str.append(c) + return true + case .quotedLiteral(let q): + str += q + return true + default: + return false + } + } + for child in children { + try emitConcatenationComponent(child) + } + } + @discardableResult mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? { switch node { @@ -799,9 +834,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitAlternation(children) case let .concatenation(children): - for child in children { - try emitConcatenationComponent(child) - } + try emitConcatenation(children) case let .capture(name, refId, child, transform): options.beginScope() diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 54ce66d0d..1c15bd549 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -70,16 +70,9 @@ extension PrettyPrinter { for namedCapture in namedCaptures { print("let \(namedCapture) = Reference(Substring.self)") } - - switch node { - case .concatenation(_): - printAsPattern(convertedFromAST: node) - case .convertedRegexLiteral(.concatenation(_), _): - printAsPattern(convertedFromAST: node) - default: - printBlock("Regex") { printer in - printer.printAsPattern(convertedFromAST: node) - } + + printBlock("Regex") { printer in + printer.printAsPattern(convertedFromAST: node, isTopLevel: true) } } @@ -89,7 +82,7 @@ extension PrettyPrinter { // to have a non-backing-off pretty-printer that this // can defer to. 
private mutating func printAsPattern( - convertedFromAST node: DSLTree.Node + convertedFromAST node: DSLTree.Node, isTopLevel: Bool = false ) { if patternBackoff(DSLTree._Tree(node)) { printBackoff(node) @@ -106,11 +99,7 @@ extension PrettyPrinter { } case let .concatenation(c): - printBlock("Regex") { printer in - c.forEach { - printer.printAsPattern(convertedFromAST: $0) - } - } + printConcatenationAsPattern(c, isTopLevel: isTopLevel) case let .nonCapturingGroup(kind, child): switch kind.ast { @@ -273,7 +262,7 @@ extension PrettyPrinter { // check above, so it should work out. Need a // cleaner way to do this. This means the argument // label is a lie. - printAsPattern(convertedFromAST: n) + printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel) case let .customCharacterClass(ccc): printAsPattern(ccc) @@ -289,6 +278,60 @@ extension PrettyPrinter { print("/* TODO: absent function */") } } + + enum NodeToPrint { + case dslNode(DSLTree.Node) + case stringLiteral(String) + } + + mutating func printAsPattern(_ node: NodeToPrint) { + switch node { + case .dslNode(let n): + printAsPattern(convertedFromAST: n) + case .stringLiteral(let str): + print(str) + } + } + + mutating func printConcatenationAsPattern( + _ nodes: [DSLTree.Node], isTopLevel: Bool + ) { + // We need to coalesce any adjacent character and scalar elements into a + // string literal, preserving scalar syntax. 
+ let nodes = nodes + .map { NodeToPrint.dslNode($0.lookingThroughConvertedLiteral) } + .coalescing( + with: StringLiteralBuilder(), into: { .stringLiteral($0.result) } + ) { literal, node in + guard case .dslNode(let node) = node else { return false } + switch node { + case let .atom(.char(c)): + literal.append(c) + return true + case let .atom(.scalar(s)): + literal.append(unescaped: s._dslBase) + return true + case .quotedLiteral(let q): + literal.append(q) + return true + default: + return false + } + } + if isTopLevel || nodes.count == 1 { + // If we're at the top level, or we coalesced everything into a single + // element, we don't need to print a surrounding Regex { ... }. + for n in nodes { + printAsPattern(n) + } + return + } + printBlock("Regex") { printer in + for n in nodes { + printer.printAsPattern(n) + } + } + } mutating func printAsPattern( _ ccc: DSLTree.CustomCharacterClass, @@ -351,8 +394,7 @@ extension PrettyPrinter { charMembers.append(c) return false case let .scalar(s): - charMembers.append( - unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}") + charMembers.append(unescaped: s._dslBase) return false case .unconverted(_): return true @@ -459,9 +501,9 @@ extension PrettyPrinter { case let .scalar(s): if wrap { - output("One(.anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\"))") + output("One(.anyOf(\(s._dslBase._bareQuoted)))") } else { - output(".anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\")") + output(".anyOf(\(s._dslBase._bareQuoted))") } case let .unconverted(a): @@ -635,6 +677,10 @@ extension String { } } +extension UnicodeScalar { + var _dslBase: String { "\\u{\(String(value, radix: 16, uppercase: true))}" } +} + /// A helper for building string literals, which handles escaping the contents /// appended. 
fileprivate struct StringLiteralBuilder { @@ -861,19 +907,15 @@ extension AST.Atom { } var _dslBase: (String, canBeWrapped: Bool) { - func scalarLiteral(_ s: UnicodeScalar) -> String { - let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" - } switch kind { case let .char(c): return (String(c), false) case let .scalar(s): - return (scalarLiteral(s.value), false) + return (s.value._dslBase, false) case let .scalarSequence(seq): - return (seq.scalarValues.map(scalarLiteral).joined(), false) + return (seq.scalarValues.map(\._dslBase).joined(), false) case let .property(p): return (p._dslBase, true) diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index c4ac8e759..4eb7bc42c 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -43,61 +43,7 @@ extension AST.Node { return .orderedChoice(children) case let .concatenation(v): - // Coalesce adjacent children who can produce a - // string literal representation - let astChildren = v.children - func coalesce( - _ idx: Array.Index - ) -> (Array.Index, String)? { - var result = "" - var idx = idx - while idx < astChildren.endIndex { - guard let atom: AST.Atom = astChildren[idx].as() else { break } - - // TODO: For printing, nice to coalesce - // scalars literals too. We likely need a different - // approach even before we have a better IR. - if let char = atom.singleCharacter { - result.append(char) - } else if let scalar = atom.singleScalar { - result.append(Character(scalar)) - } else if case .scalarSequence(let seq) = atom.kind { - result += seq.scalarValues.map(Character.init) - } else { - break - } - - astChildren.formIndex(after: &idx) - } - return result.isEmpty ? 
nil : (idx, result) - } - - // No need to nest single children concatenations - if astChildren.count == 1 { - return astChildren.first!.dslTreeNode - } - - // Check for a single child post-coalescing - if let (idx, str) = coalesce(astChildren.startIndex), - idx == astChildren.endIndex - { - return .quotedLiteral(str) - } - - // Coalesce adjacent string children - var curIdx = astChildren.startIndex - var children = Array() - while curIdx < astChildren.endIndex { - if let (nextIdx, str) = coalesce(curIdx) { - // TODO: Track source info... - children.append(.quotedLiteral(str)) - curIdx = nextIdx - } else { - children.append(astChildren[curIdx].dslTreeNode) - astChildren.formIndex(after: &curIdx) - } - } - return .concatenation(children) + return .concatenation(v.children.map(\.dslTreeNode)) case let .group(v): let child = v.child.dslTreeNode @@ -135,10 +81,9 @@ extension AST.Node { case let .atom(v): switch v.kind { case .scalarSequence(let seq): - // Scalar sequences are splatted into concatenated scalars, which - // becomes a quoted literal. Sequences nested in concatenations have - // already been coalesced, this just handles the lone atom case. - return .quotedLiteral(String(seq.scalarValues.map(Character.init))) + // The DSL doesn't have an equivalent node for scalar sequences. Splat + // them into a concatenation of scalars. + return .concatenation(seq.scalarValues.map { .atom(.scalar($0)) }) default: return .atom(v.dslTreeAtom) } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 4ea905fd5..520f4991a 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -334,6 +334,14 @@ extension DSLTree.Node { default: return nil } } + + /// If this node is for a converted literal, look through it. 
+ var lookingThroughConvertedLiteral: Self { + switch self { + case let .convertedRegexLiteral(n, _): return n + default: return self + } + } } extension DSLTree.Atom { diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift new file mode 100644 index 000000000..139a1be34 --- /dev/null +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -0,0 +1,47 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +extension Array { + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into an element of the array by `finish`. The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. + func coalescing( + with initialAccumulator: T, into finish: (T) -> Element, + accumulate: (inout T, Element) -> Bool + ) -> Self { + var didAccumulate = false + var accumulator = initialAccumulator + + var result = Self() + for elt in self { + if accumulate(&accumulator, elt) { + // The element has been coalesced into accumulator, there is nothing + // else to do. + didAccumulate = true + continue + } + if didAccumulate { + // We have a leftover accumulator, which needs to be finished before we + // can append the next element. + result.append(finish(accumulator)) + accumulator = initialAccumulator + didAccumulate = false + } + result.append(elt) + } + // Handle a leftover accumulation. 
+ if didAccumulate { + result.append(finish(accumulator)) + } + return result + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 1d186e0bc..e25f2df05 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1435,7 +1435,8 @@ class RegexDSLTests: XCTestCase { "\u{200D}" as UnicodeScalar "👦" as UnicodeScalar } - XCTAssertNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.wholeMatch(in: "👨‍👨‍👧‍👦")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) @@ -1447,18 +1448,72 @@ class RegexDSLTests: XCTestCase { try r4.firstMatch(in: "é") ) - try XCTExpectFailure("Need stronger scalar coalescing logic") { - let r5 = Regex { - "e" - "\u{301}" as UnicodeScalar + let r5 = Regex { + "e" + "\u{301}" as UnicodeScalar + } + XCTAssertNotNil(try r5.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r5.firstMatch(in: "é")) + + let r6 = Regex { + "abcde" + "\u{301}" + } + XCTAssertNotNil(try r6.firstMatch(in: "abcde\u{301}")) + XCTAssertNotNil(try r6.firstMatch(in: "abcdé")) + + let r7 = Regex { + "e" as Character + "\u{301}" as Character + } + XCTAssertNotNil(try r7.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r7.firstMatch(in: "é")) + + // You can't match a partial grapheme in grapheme semantic mode. 
+ let r8 = Regex { + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👧" as UnicodeScalar + } + XCTAssertNil(try r8.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNil(try r8.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r8.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNil(try r8.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + // Scalar coalescing occurs across nested concatenations and literals. + let r9 = Regex { + Regex { + try! Regex(#"👨"#) + "\u{200D}" as UnicodeScalar + Regex { + "👨" as UnicodeScalar + } } - XCTAssertNotNil( - try r5.firstMatch(in: "e\u{301}") - ) - XCTAssertNotNil( - try r5.firstMatch(in: "é") - ) + Regex { + Regex { + "\u{200D}" as UnicodeScalar + "👧" + } + try! Regex(#"\u{200D}👦"#) + } + } + XCTAssertNotNil(try r9.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r9.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + let r10 = Regex { + "👨" as UnicodeScalar + try! 
Regex(#"\u{200D 1F468 200D 1F467}"#) + "\u{200D}" as UnicodeScalar + "👦" as UnicodeScalar } + XCTAssertNotNil(try r10.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r10.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) } struct SemanticVersion: Equatable { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f24ae89d9..25e36ad6c 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -312,6 +312,38 @@ extension RegexTests { match: "\u{006f}\u{031b}\u{0323}" ) + // e + combining accents + firstMatchTest( + #"e\u{301 302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{315 35C 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\u{302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{315}\u{301}\u{35C}", + match: "e\u{315}\u{301}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\de\u{302}"#, + input: "e\u{301}0e\u{302}", + match: "e\u{301}0e\u{302}" + ) + // Escape sequences that represent scalar values. 
firstMatchTest(#"\a[\b]\e\f\n\r\t"#, input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", @@ -1852,6 +1884,11 @@ extension RegexTests { #"e$"#, (eComposed, false), (eDecomposed, false)) + + matchTest( + #"\u{65 301}"#, + (eComposed, true), + (eDecomposed, true)) } func testCanonicalEquivalenceCharacterClass() throws { diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 52906d1ad..1fdd1e644 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -197,10 +197,59 @@ extension RenderDSLTests { } """#) - // TODO: We ought to try and preserve the scalar syntax here. try testConversion(#"a\u{301}"#, #""" Regex { - "á" + "a\u{301}" + } + """#) + + try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #""" + Regex { + "👨\u{200D}👨\u{200D}👧\u{200D}👦" + } + """#) + + try testConversion(#"(👨\u{200D}👨)\u{200D}👧\u{200D}👦"#, #""" + Regex { + Capture { + "👨\u{200D}👨" + } + "\u{200D}👧\u{200D}👦" + } + """#) + + // We preserve the structure of non-capturing groups. + try testConversion(#"abcd(?:e\u{301}\d)"#, #""" + Regex { + "abcd" + Regex { + "e\u{301}" + One(.digit) + } + } + """#) + + try testConversion(#"\u{A B C}"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + } + """#) + + // TODO: We might want to consider preserving scalar sequences in the DSL, + // and allowing them to merge with other concatenations. + try testConversion(#"\u{A B C}\u{d}efg"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + "\u{D}efg" + } + """#) + + // FIXME: We don't actually have a way of specifying in the DSL that we + // shouldn't join these together, should we print them as regex instead? 
+ try testConversion(#"a(?:\u{301})"#, #""" + Regex { + "a" + "\u{301}" } """#) } From 618325a7324954965dc6458a925dbcf9433691a5 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:04 +0100 Subject: [PATCH 2/6] Fix scalar mode for quoted sequences in character class Previously we would only match entire characters. Update to use the generic Character consumer logic that can handle scalar semantic mode. rdar://97209131 --- .../_StringProcessing/ConsumerInterface.swift | 24 ++++------ Tests/RegexTests/MatchTests.swift | 46 +++++++++++++++++++ 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index c96775500..cb9c79fa6 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -63,7 +63,7 @@ extension DSLTree._AST.Atom { extension Character { func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { + ) throws -> MEProgram.ConsumeFunction { let isCaseInsensitive = opts.isCaseInsensitive switch opts.semanticLevel { case .graphemeCluster: @@ -456,21 +456,17 @@ extension DSLTree.CustomCharacterClass.Member { } return rhs(input, bounds) } - case .quotedLiteral(let s): - if opts.isCaseInsensitive { - return { input, bounds in - guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else { - return nil - } - return input.index(after: bounds.lowerBound) - } - } else { - return { input, bounds in - guard s.contains(input[bounds.lowerBound]) else { - return nil + case .quotedLiteral(let str): + let consumers = try str.map { + try $0.generateConsumer(opts) + } + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx } - return input.index(after: bounds.lowerBound) } + return nil } case .trivia: // TODO: Should probably strip this earlier... 
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 25e36ad6c..c356eee85 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -191,6 +191,7 @@ func firstMatchTests( enableTracing: Bool = false, dumpAST: Bool = false, xfail: Bool = false, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { @@ -203,6 +204,7 @@ func firstMatchTests( enableTracing: enableTracing, dumpAST: dumpAST, xfail: xfail, + semanticLevel: semanticLevel, file: file, line: line) } @@ -719,6 +721,50 @@ extension RegexTests { ("a\u{301}", true), semanticLevel: .unicodeScalar) + // Scalar matching in quoted sequences. + firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", nil), + ("\u{C9}", nil) + ) + firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", nil), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "\u{301}"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", "E\u{301}"), + ("\u{C9}", "\u{C9}") + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", "E"), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "E"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters From 47bd7c5b2e5fbd0e02030e4d58e7bd9e655aaf9c Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:05 +0100 Subject: [PATCH 3/6] Form ASCII bitsets for quoted sequences in character classes --- .../_StringProcessing/ConsumerInterface.swift | 25 ++++++++++--------- Tests/RegexTests/CompileTests.swift 
| 9 +++++++ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index cb9c79fa6..0c89faae0 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -327,24 +327,25 @@ extension DSLTree.CustomCharacterClass.Member { _ opts: MatchingOptions, _ isInverted: Bool ) -> DSLTree.CustomCharacterClass.AsciiBitset? { + typealias Bitset = DSLTree.CustomCharacterClass.AsciiBitset switch self { case let .atom(a): if let val = a.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - val, - isInverted, - opts.isCaseInsensitive - ) + return Bitset(val, isInverted, opts.isCaseInsensitive) } case let .range(low, high): - if let lowVal = low.singleScalarASCIIValue, let highVal = high.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - low: lowVal, - high: highVal, - isInverted: isInverted, - isCaseInsensitive: opts.isCaseInsensitive - ) + if let lowVal = low.singleScalarASCIIValue, + let highVal = high.singleScalarASCIIValue { + return Bitset(low: lowVal, high: highVal, isInverted: isInverted, + isCaseInsensitive: opts.isCaseInsensitive) + } + case .quotedLiteral(let str): + var bitset = Bitset(isInverted: isInverted) + for c in str { + guard let ascii = c._singleScalarAsciiValue else { return nil } + bitset = bitset.union(Bitset(ascii, isInverted, opts.isCaseInsensitive)) } + return bitset default: return nil } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6c8f66e10..90694fc19 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -317,6 +317,15 @@ extension RegexTests { semanticLevel: .unicodeScalar, contains: [.matchBitsetScalar], doesNotContain: [.matchBitset, .consumeBy]) + expectProgram( + for: #"[\Qab\Ec]"#, + contains: [.matchBitset], + doesNotContain: [.consumeBy, 
.matchBitsetScalar]) + expectProgram( + for: #"[\Qab\Ec]"#, + semanticLevel: .unicodeScalar, + contains: [.matchBitsetScalar], + doesNotContain: [.matchBitset, .consumeBy]) } func testScalarOptimizeCompilation() { From c1a1299e97485c38b54bd7c3512baf6c460c8fee Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:05 +0100 Subject: [PATCH 4/6] Coalesce character class members In grapheme semantic mode, coalesce adjacent character and scalar members of a custom character class, over which we can perform grapheme breaking. This involves potentially re-writing ranges such that they contain a complete grapheme of adjacent scalars. --- Sources/_StringProcessing/ByteCodeGen.swift | 118 +++++++++ Sources/_StringProcessing/Utility/Misc.swift | 20 +- Tests/RegexTests/MatchTests.swift | 247 +++++++++++++++++++ Tests/RegexTests/ParseTests.swift | 2 + 4 files changed, 383 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index da21ea26a..446d62d30 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -775,9 +775,127 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(exit) } + /// Coalesce any adjacent scalar members in a custom character class together. + /// This is required in order to produce correct grapheme matching behavior. + func coalescingCustomCharacterClassMembers( + _ members: [DSLTree.CustomCharacterClass.Member] + ) -> [DSLTree.CustomCharacterClass.Member] { + struct Accumulator { + /// A series of range operands. For example, in `[ab-cde-fg]`, this will + /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting + /// ranges will be created. + private var rangeOperands: [String] = [""] + + /// The current range operand. 
+ private var current: String { + _read { yield rangeOperands[rangeOperands.count - 1] } + _modify { yield &rangeOperands[rangeOperands.count - 1] } + } + + /// Try to accumulate a character class member, returning `true` if + /// successful, `false` otherwise. + mutating func tryAccumulate( + _ member: DSLTree.CustomCharacterClass.Member + ) -> Bool { + switch member { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + current.append(c) + return true + case .quotedLiteral(let str): + current += str + return true + case let .range(lhs, rhs): + guard let lhs = lhs.literalCharacterValue, + let rhs = rhs.literalCharacterValue + else { return false } + current.append(lhs) + rangeOperands.append(String(rhs)) + return true + default: + return false + } + } + + func finish() -> [DSLTree.CustomCharacterClass.Member] { + if rangeOperands.count == 1 { + // If we didn't have any additional range operands, this isn't a + // range, we can just form a standard quoted literal. + return [.quotedLiteral(current)] + } + var members = [DSLTree.CustomCharacterClass.Member]() + + // We have other range operands, splice them together. For N operands + // we have N - 1 ranges. + for (i, lhs) in rangeOperands.dropLast().enumerated() { + let rhs = rangeOperands[i + 1] + + // If this is the first operand we only need to drop the last + // character for its quoted members, otherwise this is both an LHS + // and RHS of a range, and as such needs both sides trimmed. + let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast() + if !leading.isEmpty { + members.append(.quotedLiteral(String(leading))) + } + members.append(.range(.char(lhs.last!), .char(rhs.first!))) + } + // We've handled everything except the quoted portion of the last + // operand, add it now. 
+ let trailing = rangeOperands.last!.dropFirst() + if !trailing.isEmpty { + members.append(.quotedLiteral(String(trailing))) + } + return members + } + } + return members + .map { m -> DSLTree.CustomCharacterClass.Member in + // First we need to recursively coalesce any child character classes. + switch m { + case .custom(let ccc): + return .custom(coalescingCustomCharacterClass(ccc)) + case .intersection(let lhs, let rhs): + return .intersection( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .subtraction(let lhs, let rhs): + return .subtraction( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .symmetricDifference(let lhs, let rhs): + return .symmetricDifference( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .atom, .range, .quotedLiteral, .trivia: + return m + } + } + .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in + accum.tryAccumulate(member) + } + } + + func coalescingCustomCharacterClass( + _ ccc: DSLTree.CustomCharacterClass + ) -> DSLTree.CustomCharacterClass { + // This only needs to be done in grapheme semantic mode. In scalar semantic + // mode, we don't want to coalesce any scalars into a grapheme. This + // means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and + // U+302. + guard options.semanticLevel == .graphemeCluster else { return ccc } + + let members = coalescingCustomCharacterClassMembers(ccc.members) + return .init(members: members, isInverted: ccc.isInverted) + } + mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) throws { + // Before emitting a custom character class in grapheme semantic mode, we + // need to coalesce together any adjacent characters and scalars, over which + // we can perform grapheme breaking. This includes e.g range bounds for + // `[e\u{301}-\u{302}]`. 
+ let ccc = coalescingCustomCharacterClass(ccc) if let asciiBitset = ccc.asAsciiBitset(options), optimizationsEnabled { if options.semanticLevel == .unicodeScalar { diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift index 139a1be34..8a9cbe325 100644 --- a/Sources/_StringProcessing/Utility/Misc.swift +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -11,11 +11,11 @@ extension Array { /// Coalesce adjacent elements using a given accumulator. The accumulator is - /// transformed into an element of the array by `finish`. The `accumulate` + /// transformed into elements of the array by `finish`. The `accumulate` /// function should return `true` if the accumulator has coalesced the /// element, `false` otherwise. func coalescing( - with initialAccumulator: T, into finish: (T) -> Element, + with initialAccumulator: T, into finish: (T) -> Self, accumulate: (inout T, Element) -> Bool ) -> Self { var didAccumulate = false @@ -32,7 +32,7 @@ extension Array { if didAccumulate { // We have a leftover accumulator, which needs to be finished before we // can append the next element. - result.append(finish(accumulator)) + result += finish(accumulator) accumulator = initialAccumulator didAccumulate = false } @@ -40,8 +40,20 @@ extension Array { } // Handle a leftover accumulation. if didAccumulate { - result.append(finish(accumulator)) + result += finish(accumulator) } return result } + + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into an element of the array by `finish`. The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. 
+ func coalescing( + with initialAccumulator: T, into finish: (T) -> Element, + accumulate: (inout T, Element) -> Bool + ) -> Self { + coalescing( + with: initialAccumulator, into: { [finish($0) ]}, accumulate: accumulate) + } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c356eee85..3de201ca6 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -765,6 +765,253 @@ extension RegexTests { semanticLevel: .unicodeScalar ) + // Scalar coalescing. + firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#, + ("👨", nil), + ("👩", nil), + ("👧", nil), + ("👦", nil), + ("\u{200D}", nil), + ("👨‍👩‍👧‍👦", "👨‍👩‍👧‍👦") + ) + firstMatchTests( + #"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#, + ("👨", "👨"), + ("👩", "👩"), + ("👧", "👧"), + ("👦", "👦"), + ("\u{200D}", "\u{200D}"), + ("👨‍👩‍👧‍👦", "👨"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + + firstMatchTests( + #"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#, + ("a", "a"), + ("a\u{301}", "a\u{301}"), + ("\u{E1}", "\u{E1}"), + ("\u{E2}", 
nil), + ("z", "z"), + ("e", "e"), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("\u{302}", "\u{302}"), + ("1", "1"), + ("2", nil), + ("3", "3"), + ("4", "4"), + ("5", "5"), + ("6", nil), + ("7", nil), + ("8", nil), + ("9", "9") + ) + firstMatchTests( + #"[ab-df-hik-lm]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", "d"), + ("e", nil), + ("f", "f"), + ("g", "g"), + ("h", "h"), + ("i", "i"), + ("j", nil), + ("k", "k"), + ("l", "l"), + ("m", "m") + ) + firstMatchTests( + #"[a-ce-fh-j]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", nil), + ("e", "e"), + ("f", "f"), + ("g", nil), + ("h", "h"), + ("i", "i"), + ("j", "j") + ) + + + // These can't compile in grapheme semantic mode, but make sure they work in + // scalar semantic mode. + firstMatchTests( + #"[a\u{315}\u{301}-\u{302}]"#, + ("a", "a"), + ("\u{315}", "\u{315}"), + ("\u{301}", "\u{301}"), + ("\u{302}", "\u{302}"), + ("\u{303}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + ("\u{73}", "\u{73}"), + ("\u{323}", "\u{323}"), + ("\u{307}", "\u{307}"), + ("\u{400}", "\u{400}"), + ("\u{500}", "\u{500}"), + ("\u{1E00}", "\u{1E00}"), + ("\u{1E01}", nil), + ("\u{1E69}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[a\u{302}-โœ…]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", "\u{302}"), + ("\u{2705}", "\u{2705}"), + ("โœ…", "โœ…"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[a\u{302}-โœ…]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "A"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", 
"\u{302}"), + ("\u{2705}", "\u{2705}"), + ("โœ…", "โœ…"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", "\u{301}"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", "E"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + + // Set operation scalar coalescing. 
+ firstMatchTests( + #"[e\u{301}&&e\u{301}e\u{302}]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", "e\u{301}"), + ("e\u{302}", nil)) + firstMatchTests( + #"[e\u{301}~~[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", nil), + ("e\u{302}", "e\u{302}")) + firstMatchTests( + #"[e\u{301}[e\u{303}]--[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index f5e93c2bd..84ce361f3 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2929,6 +2929,8 @@ extension RegexTests { diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + diagnosticTest(#"[e\u{301}-e\u{302}]"#, .invalidCharacterRange(from: "\u{301}", to: "e")) + diagnosticTest("(?x)[(?#)]", .expected("]")) diagnosticTest("(?x)[(?#abc)]", .expected("]")) From 96adc3cda72bc2c71558de9992d63e0be4979ab5 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:06 +0100 Subject: [PATCH 5/6] Throw `RegexCompilationError` for invalid character class bounds Make sure we throw the right error for ranges that are invalid in grapheme mode, but are valid in scalar mode. 
--- Sources/_StringProcessing/Compiler.swift | 30 ++++++++++++-- .../_StringProcessing/ConsumerInterface.swift | 12 +++++- Tests/RegexTests/CompileTests.swift | 40 +++++++++++++++++++ 3 files changed, 77 insertions(+), 5 deletions(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 530126a32..b8daa8b21 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -42,19 +42,43 @@ class Compiler { } } +/// Hashable wrapper for `Any.Type`. +struct AnyHashableType: CustomStringConvertible, Hashable { + var ty: Any.Type + init(_ ty: Any.Type) { + self.ty = ty + } + var description: String { "\(ty)" } + + static func == (lhs: Self, rhs: Self) -> Bool { + lhs.ty == rhs.ty + } + func hash(into hasher: inout Hasher) { + hasher.combine(ObjectIdentifier(ty)) + } +} + // An error produced when compiling a regular expression. -enum RegexCompilationError: Error, CustomStringConvertible { +enum RegexCompilationError: Error, Hashable, CustomStringConvertible { // TODO: Source location? case uncapturedReference + case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType) + case invalidCharacterClassRangeOperand(Character) + + static func incorrectOutputType( + incorrect: Any.Type, correct: Any.Type + ) -> Self { + .incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct)) + } - case incorrectOutputType(incorrect: Any.Type, correct: Any.Type) - var description: String { switch self { case .uncapturedReference: return "Found a reference used before it captured any match." 
case .incorrectOutputType(let incorrect, let correct): return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'" + case .invalidCharacterClassRangeOperand(let c): + return "'\(c)' is an invalid bound for character class range" } } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 0c89faae0..083781120 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -362,12 +362,20 @@ extension DSLTree.CustomCharacterClass.Member { } return c case let .range(low, high): - guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else { + guard let lhsChar = low.literalCharacterValue else { throw Unsupported("\(low) in range") } - guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else { + guard let rhsChar = high.literalCharacterValue else { throw Unsupported("\(high) in range") } + + // We must have NFC single scalar bounds. 
+ guard let lhs = lhsChar.singleScalar, lhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(lhsChar) + } + guard let rhs = rhsChar.singleScalar, rhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(rhsChar) + } guard lhs <= rhs else { throw Unsupported("Invalid range \(low)-\(high)") } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 90694fc19..27f8d79cb 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -11,6 +11,7 @@ @testable import _RegexParser @testable import _StringProcessing +import TestSupport import XCTest @@ -168,6 +169,45 @@ extension RegexTests { } } + private func testCompileError( + _ regex: String, _ error: RegexCompilationError, + file: StaticString = #file, line: UInt = #line + ) { + do { + _ = try _compileRegex(regex) + XCTFail("Expected compile error", file: file, line: line) + } catch let err as RegexCompilationError { + XCTAssertEqual(err, error, file: file, line: line) + } catch { + XCTFail("Unknown compile error", file: file, line: line) + } + } + + func testInvalidScalarCoalescing() throws { + guard ensureNewStdlib() else { return } + + // Non-single-scalar bounds. 
+ testCompileError( + #"[a\u{302}-โœ…]"#, .invalidCharacterClassRangeOperand("a\u{302}")) + testCompileError( + #"[e\u{301}-\u{302}]"#, .invalidCharacterClassRangeOperand("e\u{301}")) + testCompileError( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + .invalidCharacterClassRangeOperand("\u{73}\u{323}\u{307}")) + testCompileError( + #"[a\u{315}\u{301}-\u{302}]"#, + .invalidCharacterClassRangeOperand("a\u{315}\u{301}") + ) + testCompileError( + #"[a-z1e\u{301}-\u{302}\u{E1}3-59]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + testCompileError( + #"[[e\u{301}-\u{302}]&&e\u{303}]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + } + func testCompileQuantification() throws { // NOTE: While we might change how we compile From dc4171f269bd22a607e21759ec748416ed7a5d82 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:06 +0100 Subject: [PATCH 6/6] Allow coalescing through trivia I also noticed that `lexQuantifier` could silently eat trivia if it failed to lex a quantification, so also fix that. --- .../Regex/Parse/LexicalAnalysis.swift | 46 +++++++-------- Sources/_StringProcessing/ByteCodeGen.swift | 8 +++ .../_StringProcessing/PrintAsPattern.swift | 4 ++ Tests/RegexTests/MatchTests.swift | 56 +++++++++++++++++++ Tests/RegexTests/RenderDSLTests.swift | 12 ++++ 5 files changed, 104 insertions(+), 22 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 4a4f5c05f..a830a18b7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -480,35 +480,37 @@ extension Parser { /// mutating func lexQuantifier( ) -> (Located, Located, [AST.Trivia])? { - var trivia: [AST.Trivia] = [] + tryEating { p in + var trivia: [AST.Trivia] = [] - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let amt: Located? 
= recordLoc { p in - if p.tryEat("*") { return .zeroOrMore } - if p.tryEat("+") { return .oneOrMore } - if p.tryEat("?") { return .zeroOrOne } + let amt: Located? = p.recordLoc { p in + if p.tryEat("*") { return .zeroOrMore } + if p.tryEat("+") { return .oneOrMore } + if p.tryEat("?") { return .zeroOrOne } - return p.tryEating { p in - guard p.tryEat("{"), - let range = p.lexRange(trivia: &trivia), - p.tryEat("}") - else { return nil } - return range.value + return p.tryEating { p in + guard p.tryEat("{"), + let range = p.lexRange(trivia: &trivia), + p.tryEat("}") + else { return nil } + return range.value + } } - } - guard let amt = amt else { return nil } + guard let amt = amt else { return nil } - // PCRE allows non-semantic whitespace here in extended syntax mode. - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + // PCRE allows non-semantic whitespace here in extended syntax mode. + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let kind: Located = recordLoc { p in - if p.tryEat("?") { return .reluctant } - if p.tryEat("+") { return .possessive } - return .eager - } + let kind: Located = p.recordLoc { p in + if p.tryEat("?") { return .reluctant } + if p.tryEat("+") { return .possessive } + return .eager + } - return (amt, kind, trivia) + return (amt, kind, trivia) + } } /// Try to consume a range, returning `nil` if unsuccessful. diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 446d62d30..e8c92f2b5 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -812,6 +812,10 @@ fileprivate extension Compiler.ByteCodeGen { current.append(lhs) rangeOperands.append(String(rhs)) return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. 
+ return !current.isEmpty default: return false } @@ -935,6 +939,10 @@ fileprivate extension Compiler.ByteCodeGen { case .quotedLiteral(let q): str += q return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !str.isEmpty default: return false } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 1c15bd549..9eb5aa0e1 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -314,6 +314,10 @@ extension PrettyPrinter { case .quotedLiteral(let q): literal.append(q) return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !literal.isEmpty default: return false } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 3de201ca6..5f4c8bb30 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -345,6 +345,23 @@ extension RegexTests { input: "e\u{301}0e\u{302}", match: "e\u{301}0e\u{302}" ) + firstMatchTest( + #"(?x) e \u{35C} \u{315}(?#hello)\u{301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"(?x) e \u{35C} \u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + + // We don't coalesce across groups. + firstMatchTests( + #"e\u{301}(?:\u{315}\u{35C})?"#, + ("e\u{301}", "e\u{301}"), + ("e\u{301}\u{315}\u{35C}", nil) + ) // Escape sequences that represent scalar values. 
firstMatchTest(#"\a[\b]\e\f\n\r\t"#, @@ -824,6 +841,30 @@ extension RegexTests { ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") ) + firstMatchTests( + #"(?x) [ e \u{315} \u{301} \u{35C} ]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + + // We don't coalesce across character classes. + firstMatchTests( + #"e[\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{315}\u{301}", nil), + ("e\u{301}\u{315}\u{35C}", nil) + ) + firstMatchTests( + #"[e[\u{301}]]"#, + ("e", "e"), + ("\u{301}", "\u{301}"), + ("e\u{301}", nil) + ) firstMatchTests( #"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#, @@ -1012,6 +1053,16 @@ extension RegexTests { ("e\u{302}", nil), ("e\u{303}", "e\u{303}")) + firstMatchTests( + #"(?x) [ e \u{301} [ e \u{303} ] -- [ [ e \u{301} ] e \u{302} ] ]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters @@ -2182,6 +2233,11 @@ extension RegexTests { #"\u{65 301}"#, (eComposed, true), (eDecomposed, true)) + + matchTest( + #"(?x) \u{65} \u{301}"#, + (eComposed, true), + (eDecomposed, true)) } func testCanonicalEquivalenceCharacterClass() throws { diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 1fdd1e644..31d59cb66 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -203,6 +203,18 @@ extension RenderDSLTests { } """#) + try testConversion(#"(?x) a \u{301}"#, #""" + Regex { + "a\u{301}" + } + """#) + + try testConversion(#"(?x) [ a b c \u{301} ] "#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + try 
testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #""" Regex { "👨\u{200D}👨\u{200D}👧\u{200D}👦"