diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 4a4f5c05f..a830a18b7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -480,35 +480,37 @@ extension Parser { /// mutating func lexQuantifier( ) -> (Located, Located, [AST.Trivia])? { - var trivia: [AST.Trivia] = [] + tryEating { p in + var trivia: [AST.Trivia] = [] - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let amt: Located? = recordLoc { p in - if p.tryEat("*") { return .zeroOrMore } - if p.tryEat("+") { return .oneOrMore } - if p.tryEat("?") { return .zeroOrOne } + let amt: Located? = p.recordLoc { p in + if p.tryEat("*") { return .zeroOrMore } + if p.tryEat("+") { return .oneOrMore } + if p.tryEat("?") { return .zeroOrOne } - return p.tryEating { p in - guard p.tryEat("{"), - let range = p.lexRange(trivia: &trivia), - p.tryEat("}") - else { return nil } - return range.value + return p.tryEating { p in + guard p.tryEat("{"), + let range = p.lexRange(trivia: &trivia), + p.tryEat("}") + else { return nil } + return range.value + } } - } - guard let amt = amt else { return nil } + guard let amt = amt else { return nil } - // PCRE allows non-semantic whitespace here in extended syntax mode. - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + // PCRE allows non-semantic whitespace here in extended syntax mode. + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let kind: Located = recordLoc { p in - if p.tryEat("?") { return .reluctant } - if p.tryEat("+") { return .possessive } - return .eager - } + let kind: Located = p.recordLoc { p in + if p.tryEat("?") { return .reluctant } + if p.tryEat("+") { return .possessive } + return .eager + } - return (amt, kind, trivia) + return (amt, kind, trivia) + } } /// Try to consume a range, returning `nil` if unsuccessful. diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 477760ef8..e8c92f2b5 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(exit) } + /// Coalesce any adjacent scalar members in a custom character class together. + /// This is required in order to produce correct grapheme matching behavior. + func coalescingCustomCharacterClassMembers( + _ members: [DSLTree.CustomCharacterClass.Member] + ) -> [DSLTree.CustomCharacterClass.Member] { + struct Accumulator { + /// A series of range operands. For example, in `[ab-cde-fg]`, this will + /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting + /// ranges will be created. + private var rangeOperands: [String] = [""] + + /// The current range operand. + private var current: String { + _read { yield rangeOperands[rangeOperands.count - 1] } + _modify { yield &rangeOperands[rangeOperands.count - 1] } + } + + /// Try to accumulate a character class member, returning `true` if + /// successful, `false` otherwise. 
+ mutating func tryAccumulate( + _ member: DSLTree.CustomCharacterClass.Member + ) -> Bool { + switch member { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + current.append(c) + return true + case .quotedLiteral(let str): + current += str + return true + case let .range(lhs, rhs): + guard let lhs = lhs.literalCharacterValue, + let rhs = rhs.literalCharacterValue + else { return false } + current.append(lhs) + rangeOperands.append(String(rhs)) + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !current.isEmpty + default: + return false + } + } + + func finish() -> [DSLTree.CustomCharacterClass.Member] { + if rangeOperands.count == 1 { + // If we didn't have any additional range operands, this isn't a + // range, we can just form a standard quoted literal. + return [.quotedLiteral(current)] + } + var members = [DSLTree.CustomCharacterClass.Member]() + + // We have other range operands, splice them together. For N operands + // we have N - 1 ranges. + for (i, lhs) in rangeOperands.dropLast().enumerated() { + let rhs = rangeOperands[i + 1] + + // If this is the first operand we only need to drop the last + // character for its quoted members, otherwise this is both an LHS + // and RHS of a range, and as such needs both sides trimmed. + let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast() + if !leading.isEmpty { + members.append(.quotedLiteral(String(leading))) + } + members.append(.range(.char(lhs.last!), .char(rhs.first!))) + } + // We've handled everything except the quoted portion of the last + // operand, add it now. + let trailing = rangeOperands.last!.dropFirst() + if !trailing.isEmpty { + members.append(.quotedLiteral(String(trailing))) + } + return members + } + } + return members + .map { m -> DSLTree.CustomCharacterClass.Member in + // First we need to recursively coalsce any child character classes. + switch m { + case .custom(let ccc): + return .custom(coalescingCustomCharacterClass(ccc)) + case .intersection(let lhs, let rhs): + return .intersection( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .subtraction(let lhs, let rhs): + return .subtraction( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .symmetricDifference(let lhs, let rhs): + return .symmetricDifference( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .atom, .range, .quotedLiteral, .trivia: + return m + } + } + .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in + accum.tryAccumulate(member) + } + } + + func coalescingCustomCharacterClass( + _ ccc: DSLTree.CustomCharacterClass + ) -> DSLTree.CustomCharacterClass { + // This only needs to be done in grapheme semantic mode. In scalar semantic + // mode, we don't want to coalesce any scalars into a grapheme. This + // means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and + // U+302. + guard options.semanticLevel == .graphemeCluster else { return ccc } + + let members = coalescingCustomCharacterClassMembers(ccc.members) + return .init(members: members, isInverted: ccc.isInverted) + } + mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) throws { + // Before emitting a custom character class in grapheme semantic mode, we + // need to coalesce together any adjacent characters and scalars, over which + // we can perform grapheme breaking. 
This includes e.g range bounds for + // `[e\u{301}-\u{302}]`. + let ccc = coalescingCustomCharacterClass(ccc) if let asciiBitset = ccc.asAsciiBitset(options), optimizationsEnabled { if options.semanticLevel == .unicodeScalar { @@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { + // Before emitting a concatenation, we need to flatten out any nested + // concatenations, and coalesce any adjacent characters and scalars, forming + // quoted literals of their contents, over which we can perform grapheme + // breaking. + func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { + switch node { + case .concatenation(let ch): + return ch.flatMap(flatten) + case .convertedRegexLiteral(let n, _): + return flatten(n) + default: + return [node] + } + } + let children = children + .flatMap(flatten) + .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in + switch node { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + str.append(c) + return true + case .quotedLiteral(let q): + str += q + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !str.isEmpty + default: + return false + } + } + for child in children { + try emitConcatenationComponent(child) + } + } + @discardableResult mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? { switch node { @@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitAlternation(children) case let .concatenation(children): - for child in children { - try emitConcatenationComponent(child) - } + try emitConcatenation(children) case let .capture(name, refId, child, transform): options.beginScope() diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 530126a32..b8daa8b21 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -42,19 +42,43 @@ class Compiler { } } +/// Hashable wrapper for `Any.Type`. +struct AnyHashableType: CustomStringConvertible, Hashable { + var ty: Any.Type + init(_ ty: Any.Type) { + self.ty = ty + } + var description: String { "\(ty)" } + + static func == (lhs: Self, rhs: Self) -> Bool { + lhs.ty == rhs.ty + } + func hash(into hasher: inout Hasher) { + hasher.combine(ObjectIdentifier(ty)) + } +} + // An error produced when compiling a regular expression. -enum RegexCompilationError: Error, CustomStringConvertible { +enum RegexCompilationError: Error, Hashable, CustomStringConvertible { // TODO: Source location? case uncapturedReference + case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType) + case invalidCharacterClassRangeOperand(Character) + + static func incorrectOutputType( + incorrect: Any.Type, correct: Any.Type + ) -> Self { + .incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct)) + } - case incorrectOutputType(incorrect: Any.Type, correct: Any.Type) - var description: String { switch self { case .uncapturedReference: return "Found a reference used before it captured any match." 
case .incorrectOutputType(let incorrect, let correct): return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'" + case .invalidCharacterClassRangeOperand(let c): + return "'\(c)' is an invalid bound for character class range" } } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index c96775500..083781120 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -63,7 +63,7 @@ extension DSLTree._AST.Atom { extension Character { func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { + ) throws -> MEProgram.ConsumeFunction { let isCaseInsensitive = opts.isCaseInsensitive switch opts.semanticLevel { case .graphemeCluster: @@ -327,24 +327,25 @@ extension DSLTree.CustomCharacterClass.Member { _ opts: MatchingOptions, _ isInverted: Bool ) -> DSLTree.CustomCharacterClass.AsciiBitset? { + typealias Bitset = DSLTree.CustomCharacterClass.AsciiBitset switch self { case let .atom(a): if let val = a.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - val, - isInverted, - opts.isCaseInsensitive - ) + return Bitset(val, isInverted, opts.isCaseInsensitive) } case let .range(low, high): - if let lowVal = low.singleScalarASCIIValue, let highVal = high.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - low: lowVal, - high: highVal, - isInverted: isInverted, - isCaseInsensitive: opts.isCaseInsensitive - ) + if let lowVal = low.singleScalarASCIIValue, + let highVal = high.singleScalarASCIIValue { + return Bitset(low: lowVal, high: highVal, isInverted: isInverted, + isCaseInsensitive: opts.isCaseInsensitive) + } + case .quotedLiteral(let str): + var bitset = Bitset(isInverted: isInverted) + for c in str { + guard let ascii = c._singleScalarAsciiValue else { return nil } + bitset = bitset.union(Bitset(ascii, isInverted, opts.isCaseInsensitive)) } + return bitset default: return nil } @@ -361,12 +362,20 @@ extension DSLTree.CustomCharacterClass.Member { } return c case let .range(low, high): - guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else { + guard let lhsChar = low.literalCharacterValue else { throw Unsupported("\(low) in range") } - guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else { + guard let rhsChar = high.literalCharacterValue else { throw Unsupported("\(high) in range") } + + // We must have NFC single scalar bounds. 
+ guard let lhs = lhsChar.singleScalar, lhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(lhsChar) + } + guard let rhs = rhsChar.singleScalar, rhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(rhsChar) + } guard lhs <= rhs else { throw Unsupported("Invalid range \(low)-\(high)") } @@ -456,21 +465,17 @@ extension DSLTree.CustomCharacterClass.Member { } return rhs(input, bounds) } - case .quotedLiteral(let s): - if opts.isCaseInsensitive { - return { input, bounds in - guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else { - return nil - } - return input.index(after: bounds.lowerBound) - } - } else { - return { input, bounds in - guard s.contains(input[bounds.lowerBound]) else { - return nil + case .quotedLiteral(let str): + let consumers = try str.map { + try $0.generateConsumer(opts) + } + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx } - return input.index(after: bounds.lowerBound) } + return nil } case .trivia: // TODO: Should probably strip this earlier... diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 54ce66d0d..9eb5aa0e1 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -70,16 +70,9 @@ extension PrettyPrinter { for namedCapture in namedCaptures { print("let \(namedCapture) = Reference(Substring.self)") } - - switch node { - case .concatenation(_): - printAsPattern(convertedFromAST: node) - case .convertedRegexLiteral(.concatenation(_), _): - printAsPattern(convertedFromAST: node) - default: - printBlock("Regex") { printer in - printer.printAsPattern(convertedFromAST: node) - } + + printBlock("Regex") { printer in + printer.printAsPattern(convertedFromAST: node, isTopLevel: true) } } @@ -89,7 +82,7 @@ extension PrettyPrinter { // to have a non-backing-off pretty-printer that this // can defer to. private mutating func printAsPattern( - convertedFromAST node: DSLTree.Node + convertedFromAST node: DSLTree.Node, isTopLevel: Bool = false ) { if patternBackoff(DSLTree._Tree(node)) { printBackoff(node) @@ -106,11 +99,7 @@ extension PrettyPrinter { } case let .concatenation(c): - printBlock("Regex") { printer in - c.forEach { - printer.printAsPattern(convertedFromAST: $0) - } - } + printConcatenationAsPattern(c, isTopLevel: isTopLevel) case let .nonCapturingGroup(kind, child): switch kind.ast { @@ -273,7 +262,7 @@ extension PrettyPrinter { // check above, so it should work out. Need a // cleaner way to do this. This means the argument // label is a lie. - printAsPattern(convertedFromAST: n) + printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel) case let .customCharacterClass(ccc): printAsPattern(ccc) @@ -289,6 +278,64 @@ extension PrettyPrinter { print("/* TODO: absent function */") } } + + enum NodeToPrint { + case dslNode(DSLTree.Node) + case stringLiteral(String) + } + + mutating func printAsPattern(_ node: NodeToPrint) { + switch node { + case .dslNode(let n): + printAsPattern(convertedFromAST: n) + case .stringLiteral(let str): + print(str) + } + } + + mutating func printConcatenationAsPattern( + _ nodes: [DSLTree.Node], isTopLevel: Bool + ) { + // We need to coalesce any adjacent character and scalar elements into a + // string literal, preserving scalar syntax. 
+ let nodes = nodes + .map { NodeToPrint.dslNode($0.lookingThroughConvertedLiteral) } + .coalescing( + with: StringLiteralBuilder(), into: { .stringLiteral($0.result) } + ) { literal, node in + guard case .dslNode(let node) = node else { return false } + switch node { + case let .atom(.char(c)): + literal.append(c) + return true + case let .atom(.scalar(s)): + literal.append(unescaped: s._dslBase) + return true + case .quotedLiteral(let q): + literal.append(q) + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !literal.isEmpty + default: + return false + } + } + if isTopLevel || nodes.count == 1 { + // If we're at the top level, or we coalesced everything into a single + // element, we don't need to print a surrounding Regex { ... }. + for n in nodes { + printAsPattern(n) + } + return + } + printBlock("Regex") { printer in + for n in nodes { + printer.printAsPattern(n) + } + } + } mutating func printAsPattern( _ ccc: DSLTree.CustomCharacterClass, @@ -351,8 +398,7 @@ extension PrettyPrinter { charMembers.append(c) return false case let .scalar(s): - charMembers.append( - unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}") + charMembers.append(unescaped: s._dslBase) return false case .unconverted(_): return true @@ -459,9 +505,9 @@ extension PrettyPrinter { case let .scalar(s): if wrap { - output("One(.anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\"))") + output("One(.anyOf(\(s._dslBase._bareQuoted)))") } else { - output(".anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\")") + output(".anyOf(\(s._dslBase._bareQuoted))") } case let .unconverted(a): @@ -635,6 +681,10 @@ extension String { } } +extension UnicodeScalar { + var _dslBase: String { "\\u{\(String(value, radix: 16, uppercase: true))}" } +} + /// A helper for building string literals, which handles escaping the contents /// appended. fileprivate struct StringLiteralBuilder { @@ -861,19 +911,15 @@ extension AST.Atom { } var _dslBase: (String, canBeWrapped: Bool) { - func scalarLiteral(_ s: UnicodeScalar) -> String { - let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" - } switch kind { case let .char(c): return (String(c), false) case let .scalar(s): - return (scalarLiteral(s.value), false) + return (s.value._dslBase, false) case let .scalarSequence(seq): - return (seq.scalarValues.map(scalarLiteral).joined(), false) + return (seq.scalarValues.map(\._dslBase).joined(), false) case let .property(p): return (p._dslBase, true) diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index c4ac8e759..4eb7bc42c 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -43,61 +43,7 @@ extension AST.Node { return .orderedChoice(children) case let .concatenation(v): - // Coalesce adjacent children who can produce a - // string literal representation - let astChildren = v.children - func coalesce( - _ idx: Array.Index - ) -> (Array.Index, String)? { - var result = "" - var idx = idx - while idx < astChildren.endIndex { - guard let atom: AST.Atom = astChildren[idx].as() else { break } - - // TODO: For printing, nice to coalesce - // scalars literals too. We likely need a different - // approach even before we have a better IR. 
- if let char = atom.singleCharacter { - result.append(char) - } else if let scalar = atom.singleScalar { - result.append(Character(scalar)) - } else if case .scalarSequence(let seq) = atom.kind { - result += seq.scalarValues.map(Character.init) - } else { - break - } - - astChildren.formIndex(after: &idx) - } - return result.isEmpty ? nil : (idx, result) - } - - // No need to nest single children concatenations - if astChildren.count == 1 { - return astChildren.first!.dslTreeNode - } - - // Check for a single child post-coalescing - if let (idx, str) = coalesce(astChildren.startIndex), - idx == astChildren.endIndex - { - return .quotedLiteral(str) - } - - // Coalesce adjacent string children - var curIdx = astChildren.startIndex - var children = Array() - while curIdx < astChildren.endIndex { - if let (nextIdx, str) = coalesce(curIdx) { - // TODO: Track source info... - children.append(.quotedLiteral(str)) - curIdx = nextIdx - } else { - children.append(astChildren[curIdx].dslTreeNode) - astChildren.formIndex(after: &curIdx) - } - } - return .concatenation(children) + return .concatenation(v.children.map(\.dslTreeNode)) case let .group(v): let child = v.child.dslTreeNode @@ -135,10 +81,9 @@ extension AST.Node { case let .atom(v): switch v.kind { case .scalarSequence(let seq): - // Scalar sequences are splatted into concatenated scalars, which - // becomes a quoted literal. Sequences nested in concatenations have - // already been coalesced, this just handles the lone atom case. - return .quotedLiteral(String(seq.scalarValues.map(Character.init))) + // The DSL doesn't have an equivalent node for scalar sequences. Splat + // them into a concatenation of scalars. + return .concatenation(seq.scalarValues.map { .atom(.scalar($0)) }) default: return .atom(v.dslTreeAtom) } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 4ea905fd5..520f4991a 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -334,6 +334,14 @@ extension DSLTree.Node { default: return nil } } + + /// If this node is for a converted literal, look through it. + var lookingThroughConvertedLiteral: Self { + switch self { + case let .convertedRegexLiteral(n, _): return n + default: return self + } + } } extension DSLTree.Atom { diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift new file mode 100644 index 000000000..8a9cbe325 --- /dev/null +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +extension Array { + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into elements of the array by `finish`. The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. 
+ func coalescing( + with initialAccumulator: T, into finish: (T) -> Self, + accumulate: (inout T, Element) -> Bool + ) -> Self { + var didAccumulate = false + var accumulator = initialAccumulator + + var result = Self() + for elt in self { + if accumulate(&accumulator, elt) { + // The element has been coalesced into accumulator, there is nothing + // else to do. + didAccumulate = true + continue + } + if didAccumulate { + // We have a leftover accumulator, which needs to be finished before we + // can append the next element. + result += finish(accumulator) + accumulator = initialAccumulator + didAccumulate = false + } + result.append(elt) + } + // Handle a leftover accumulation. + if didAccumulate { + result += finish(accumulator) + } + return result + } + + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into an element of the array by `finish`. The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. + func coalescing( + with initialAccumulator: T, into finish: (T) -> Element, + accumulate: (inout T, Element) -> Bool + ) -> Self { + coalescing( + with: initialAccumulator, into: { [finish($0) ]}, accumulate: accumulate) + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 1d186e0bc..e25f2df05 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1435,7 +1435,8 @@ class RegexDSLTests: XCTestCase { "\u{200D}" as UnicodeScalar "๐Ÿ‘ฆ" as UnicodeScalar } - XCTAssertNil(try r3.firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNotNil(try r3.firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNotNil(try r3.wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) @@ -1447,18 +1448,72 @@ class RegexDSLTests: XCTestCase { try r4.firstMatch(in: "รฉ") ) - try XCTExpectFailure("Need stronger scalar coalescing logic") { - let r5 = Regex { - "e" - "\u{301}" as UnicodeScalar + let r5 = Regex { + "e" + "\u{301}" as UnicodeScalar + } + XCTAssertNotNil(try r5.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r5.firstMatch(in: "รฉ")) + + let r6 = Regex { + "abcde" + "\u{301}" + } + XCTAssertNotNil(try r6.firstMatch(in: "abcde\u{301}")) + XCTAssertNotNil(try r6.firstMatch(in: "abcdรฉ")) + + let r7 = Regex { + "e" as Character + "\u{301}" as Character + } + XCTAssertNotNil(try r7.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r7.firstMatch(in: "รฉ")) + + // You can't match a partial grapheme in grapheme semantic mode. + let r8 = Regex { + "๐Ÿ‘จ" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "๐Ÿ‘จ" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "๐Ÿ‘ง" as UnicodeScalar + } + XCTAssertNil(try r8.firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNil(try r8.wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNotNil(try r8.matchingSemantics(.unicodeScalar).firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNil(try r8.matchingSemantics(.unicodeScalar).wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + + // Scalar coalescing occurs across nested concatenations and literals. + let r9 = Regex { + Regex { + try! 
Regex(#"๐Ÿ‘จ"#) + "\u{200D}" as UnicodeScalar + Regex { + "๐Ÿ‘จ" as UnicodeScalar + } } - XCTAssertNotNil( - try r5.firstMatch(in: "e\u{301}") - ) - XCTAssertNotNil( - try r5.firstMatch(in: "รฉ") - ) + Regex { + Regex { + "\u{200D}" as UnicodeScalar + "๐Ÿ‘ง" + } + try! Regex(#"\u{200D}๐Ÿ‘ฆ"#) + } + } + XCTAssertNotNil(try r9.firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNotNil(try r9.wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + + let r10 = Regex { + "๐Ÿ‘จ" as UnicodeScalar + try! Regex(#"\u{200D 1F468 200D 1F467}"#) + "\u{200D}" as UnicodeScalar + "๐Ÿ‘ฆ" as UnicodeScalar } + XCTAssertNotNil(try r10.firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNotNil(try r10.wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).firstMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).wholeMatch(in: "๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")) } struct SemanticVersion: Equatable { diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6c8f66e10..27f8d79cb 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -11,6 +11,7 @@ @testable import _RegexParser @testable import _StringProcessing +import TestSupport import XCTest @@ -168,6 +169,45 @@ extension RegexTests { } } + private func testCompileError( + _ regex: String, _ error: RegexCompilationError, + file: StaticString = #file, line: UInt = #line + ) { + do { + _ = try _compileRegex(regex) + XCTFail("Expected compile error", file: file, line: line) + } catch let err as RegexCompilationError { + XCTAssertEqual(err, error, file: file, line: line) + } catch { + XCTFail("Unknown compile error", file: file, line: line) + } + } + + func testInvalidScalarCoalescing() throws { + guard ensureNewStdlib() else { return } + + // Non-single-scalar bounds. 
+ testCompileError( + #"[a\u{302}-โœ…]"#, .invalidCharacterClassRangeOperand("a\u{302}")) + testCompileError( + #"[e\u{301}-\u{302}]"#, .invalidCharacterClassRangeOperand("e\u{301}")) + testCompileError( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + .invalidCharacterClassRangeOperand("\u{73}\u{323}\u{307}")) + testCompileError( + #"[a\u{315}\u{301}-\u{302}]"#, + .invalidCharacterClassRangeOperand("a\u{315}\u{301}") + ) + testCompileError( + #"[a-z1e\u{301}-\u{302}\u{E1}3-59]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + testCompileError( + #"[[e\u{301}-\u{302}]&&e\u{303}]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + } + func testCompileQuantification() throws { // NOTE: While we might change how we compile @@ -317,6 +357,15 @@ extension RegexTests { semanticLevel: .unicodeScalar, contains: [.matchBitsetScalar], doesNotContain: [.matchBitset, .consumeBy]) + expectProgram( + for: #"[\Qab\Ec]"#, + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: #"[\Qab\Ec]"#, + semanticLevel: .unicodeScalar, + contains: [.matchBitsetScalar], + doesNotContain: [.matchBitset, .consumeBy]) } func testScalarOptimizeCompilation() { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f24ae89d9..5f4c8bb30 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -191,6 +191,7 @@ func firstMatchTests( enableTracing: Bool = false, dumpAST: Bool = false, xfail: Bool = false, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { @@ -203,6 +204,7 @@ func firstMatchTests( enableTracing: enableTracing, dumpAST: dumpAST, xfail: xfail, + semanticLevel: semanticLevel, file: file, line: line) } @@ -312,6 +314,55 @@ extension RegexTests { match: "\u{006f}\u{031b}\u{0323}" ) + // e + combining accents + firstMatchTest( + #"e\u{301 302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{315 35C 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\u{302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{315}\u{301}\u{35C}", + match: "e\u{315}\u{301}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\de\u{302}"#, + input: "e\u{301}0e\u{302}", + match: "e\u{301}0e\u{302}" + ) + firstMatchTest( + #"(?x) e \u{35C} \u{315}(?#hello)\u{301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"(?x) e \u{35C} \u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + + // We don't coalesce across groups. + firstMatchTests( + #"e\u{301}(?:\u{315}\u{35C})?"#, + ("e\u{301}", "e\u{301}"), + ("e\u{301}\u{315}\u{35C}", nil) + ) + // Escape sequences that represent scalar values. firstMatchTest(#"\a[\b]\e\f\n\r\t"#, input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", @@ -687,6 +738,331 @@ extension RegexTests { ("a\u{301}", true), semanticLevel: .unicodeScalar) + // Scalar matching in quoted sequences. 
+ firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", nil), + ("\u{C9}", nil) + ) + firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", nil), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "\u{301}"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", "E\u{301}"), + ("\u{C9}", "\u{C9}") + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", "E"), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "E"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + + // Scalar coalescing. + firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[๐Ÿ‘จ\u{200D}๐Ÿ‘ฉ\u{200D}๐Ÿ‘ง\u{200D}๐Ÿ‘ฆ]"#, + ("๐Ÿ‘จ", nil), + ("๐Ÿ‘ฉ", nil), + ("๐Ÿ‘ง", nil), + ("๐Ÿ‘ฆ", nil), + ("\u{200D}", nil), + ("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") + ) + firstMatchTests( + #"[๐Ÿ‘จ\u{200D}๐Ÿ‘ฉ\u{200D}๐Ÿ‘ง\u{200D}๐Ÿ‘ฆ]"#, + ("๐Ÿ‘จ", "๐Ÿ‘จ"), + ("๐Ÿ‘ฉ", "๐Ÿ‘ฉ"), + ("๐Ÿ‘ง", "๐Ÿ‘ง"), + ("๐Ÿ‘ฆ", "๐Ÿ‘ฆ"), + ("\u{200D}", "\u{200D}"), + ("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ", "๐Ÿ‘จ"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + firstMatchTests( + #"(?x) [ e \u{315} \u{301} \u{35C} ]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + + // We don't coalesce across character classes. 
+ firstMatchTests( + #"e[\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{315}\u{301}", nil), + ("e\u{301}\u{315}\u{35C}", nil) + ) + firstMatchTests( + #"[e[\u{301}]]"#, + ("e", "e"), + ("\u{301}", "\u{301}"), + ("e\u{301}", nil) + ) + + firstMatchTests( + #"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#, + ("a", "a"), + ("a\u{301}", "a\u{301}"), + ("\u{E1}", "\u{E1}"), + ("\u{E2}", nil), + ("z", "z"), + ("e", "e"), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("\u{302}", "\u{302}"), + ("1", "1"), + ("2", nil), + ("3", "3"), + ("4", "4"), + ("5", "5"), + ("6", nil), + ("7", nil), + ("8", nil), + ("9", "9") + ) + firstMatchTests( + #"[ab-df-hik-lm]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", "d"), + ("e", nil), + ("f", "f"), + ("g", "g"), + ("h", "h"), + ("i", "i"), + ("j", nil), + ("k", "k"), + ("l", "l"), + ("m", "m") + ) + firstMatchTests( + #"[a-ce-fh-j]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", nil), + ("e", "e"), + ("f", "f"), + ("g", nil), + ("h", "h"), + ("i", "i"), + ("j", "j") + ) + + + // These can't compile in grapheme semantic mode, but make sure they work in + // scalar semantic mode. + firstMatchTests( + #"[a\u{315}\u{301}-\u{302}]"#, + ("a", "a"), + ("\u{315}", "\u{315}"), + ("\u{301}", "\u{301}"), + ("\u{302}", "\u{302}"), + ("\u{303}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + ("\u{73}", "\u{73}"), + ("\u{323}", "\u{323}"), + ("\u{307}", "\u{307}"), + ("\u{400}", "\u{400}"), + ("\u{500}", "\u{500}"), + ("\u{1E00}", "\u{1E00}"), + ("\u{1E01}", nil), + ("\u{1E69}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[a\u{302}-โœ…]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", "\u{302}"), + ("\u{2705}", "\u{2705}"), + ("โœ…", "โœ…"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[a\u{302}-โœ…]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "A"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", "\u{302}"), + ("\u{2705}", "\u{2705}"), + ("โœ…", "โœ…"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", "\u{301}"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", "E"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + + // Set operation scalar coalescing. 
+ firstMatchTests( + #"[e\u{301}&&e\u{301}e\u{302}]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", "e\u{301}"), + ("e\u{302}", nil)) + firstMatchTests( + #"[e\u{301}~~[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", nil), + ("e\u{302}", "e\u{302}")) + firstMatchTests( + #"[e\u{301}[e\u{303}]--[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) + + firstMatchTests( + #"(?x) [ e \u{301} [ e \u{303} ] -- [ [ e \u{301} ] e \u{302} ] ]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters @@ -1852,6 +2228,16 @@ extension RegexTests { #"e$"#, (eComposed, false), (eDecomposed, false)) + + matchTest( + #"\u{65 301}"#, + (eComposed, true), + (eDecomposed, true)) + + matchTest( + #"(?x) \u{65} \u{301}"#, + (eComposed, true), + (eDecomposed, true)) } func testCanonicalEquivalenceCharacterClass() throws { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index f5e93c2bd..84ce361f3 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2929,6 +2929,8 @@ extension RegexTests { diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + diagnosticTest(#"[e\u{301}-e\u{302}]"#, .invalidCharacterRange(from: "\u{301}", to: "e")) + diagnosticTest("(?x)[(?#)]", .expected("]")) diagnosticTest("(?x)[(?#abc)]", .expected("]")) diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 52906d1ad..31d59cb66 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -197,10 +197,71 @@ extension RenderDSLTests { } """#) - // TODO: We ought to try and preserve the scalar syntax here. try testConversion(#"a\u{301}"#, #""" Regex { - "aฬ" + "a\u{301}" + } + """#) + + try testConversion(#"(?x) a \u{301}"#, #""" + Regex { + "a\u{301}" + } + """#) + + try testConversion(#"(?x) [ a b c \u{301} ] "#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + + try testConversion(#"๐Ÿ‘จ\u{200D}๐Ÿ‘จ\u{200D}๐Ÿ‘ง\u{200D}๐Ÿ‘ฆ"#, #""" + Regex { + "๐Ÿ‘จ\u{200D}๐Ÿ‘จ\u{200D}๐Ÿ‘ง\u{200D}๐Ÿ‘ฆ" + } + """#) + + try testConversion(#"(๐Ÿ‘จ\u{200D}๐Ÿ‘จ)\u{200D}๐Ÿ‘ง\u{200D}๐Ÿ‘ฆ"#, #""" + Regex { + Capture { + "๐Ÿ‘จ\u{200D}๐Ÿ‘จ" + } + "\u{200D}๐Ÿ‘ง\u{200D}๐Ÿ‘ฆ" + } + """#) + + // We preserve the structure of non-capturing groups. + try testConversion(#"abcd(?:e\u{301}\d)"#, #""" + Regex { + "abcd" + Regex { + "e\u{301}" + One(.digit) + } + } + """#) + + try testConversion(#"\u{A B C}"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + } + """#) + + // TODO: We might want to consider preserving scalar sequences in the DSL, + // and allowing them to merge with other concatenations. + try testConversion(#"\u{A B C}\u{d}efg"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + "\u{D}efg" + } + """#) + + // FIXME: We don't actually have a way of specifying in the DSL that we + // shouldn't join these together, should we print them as regex instead? + try testConversion(#"a(?:\u{301})"#, #""" + Regex { + "a" + "\u{301}" } """#) }
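For reference, the new `Array.coalescing(with:into:accumulate:)` helper added in `Sources/_StringProcessing/Utility/Misc.swift` drives both the concatenation flattening in `ByteCodeGen.emitConcatenation` and the member coalescing in `coalescingCustomCharacterClassMembers`. The following standalone sketch shows how it behaves; the `Token` type is purely illustrative and the helper is re-declared locally only so the snippet runs on its own (the real declaration lives inside the `_StringProcessing` module):

```swift
enum Token: Equatable {
  case char(Character)
  case digit           // stands in for a member that cannot be coalesced, e.g. `\d`
  case literal(String) // the coalesced form
}

extension Array {
  /// Same shape as the helper in Utility/Misc.swift: feed elements into an
  /// accumulator while `accumulate` returns `true`, flushing it through
  /// `finish` whenever an element cannot be coalesced.
  func coalescing<T>(
    with initialAccumulator: T, into finish: (T) -> Element,
    accumulate: (inout T, Element) -> Bool
  ) -> Self {
    var result = Self()
    var accumulator = initialAccumulator
    var didAccumulate = false
    for element in self {
      if accumulate(&accumulator, element) {
        didAccumulate = true
        continue
      }
      if didAccumulate {
        result.append(finish(accumulator))
        accumulator = initialAccumulator
        didAccumulate = false
      }
      result.append(element)
    }
    if didAccumulate { result.append(finish(accumulator)) }
    return result
  }
}

// Adjacent characters collapse into a single literal; the non-coalescible
// `.digit` element flushes the accumulator and is passed through unchanged.
let tokens: [Token] = [.char("e"), .char("\u{301}"), .digit, .char("a"), .char("b")]
let coalesced = tokens.coalescing(with: "", into: { Token.literal($0) }) { str, token in
  guard case .char(let c) = token else { return false }
  str.append(c)
  return true
}
// coalesced == [.literal("e\u{301}"), .digit, .literal("ab")]
print(coalesced)
```

This mirrors what the patch does for `"e"` followed by `"\u{301}" as UnicodeScalar` in the builder DSL: the two adjacent elements are coalesced into one quoted literal before bytecode generation, so grapheme breaking sees the composed character rather than two separate pieces.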