diff --git a/Package.swift b/Package.swift index abc895813..c1e9bff37 100644 --- a/Package.swift +++ b/Package.swift @@ -75,15 +75,17 @@ let package = Package( name: "RegexBuilder", dependencies: ["_StringProcessing", "_RegexParser"], swiftSettings: publicStdlibSettings), + .target(name: "TestSupport", + swiftSettings: [availabilityDefinition]), .testTarget( name: "RegexTests", - dependencies: ["_StringProcessing"], + dependencies: ["_StringProcessing", "TestSupport"], swiftSettings: [ .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), ]), .testTarget( name: "RegexBuilderTests", - dependencies: ["_StringProcessing", "RegexBuilder"], + dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"], swiftSettings: [ .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) ]), diff --git a/Sources/TestSupport/TestSupport.swift b/Sources/TestSupport/TestSupport.swift new file mode 100644 index 000000000..b60adb63f --- /dev/null +++ b/Sources/TestSupport/TestSupport.swift @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import XCTest + +// We need to split this out of the test files, as it needs to be compiled +// *without* `-disable-availability-checking` to ensure the #available check is +// not compiled into a no-op. + +#if os(Linux) +public func XCTExpectFailure( + _ message: String? = nil, body: () throws -> Void +) rethrows {} +#endif + +/// Guards certain tests to make sure we have a new stdlib available. 
+public func ensureNewStdlib( + file: StaticString = #file, line: UInt = #line +) -> Bool { + guard #available(SwiftStdlib 5.7, *) else { + XCTExpectFailure { XCTFail("Unsupported stdlib", file: file, line: line) } + return false + } + return true +} diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index b03ce8c39..8706327f7 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -755,8 +755,10 @@ extension AST.Atom { /// Whether this atom is valid as the operand of a custom character class /// range. public var isValidCharacterClassRangeBound: Bool { - // If we have a literal character value for this, it can be used as a bound. - if literalCharacterValue != nil { return true } + if let c = literalCharacterValue { + // We only match character range bounds that are single scalar NFC. + return c.hasExactlyOneScalar && c.isNFC + } switch kind { // \cx, \C-x, \M-x, \M-\C-x, \N{...} case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 4a4f5c05f..a830a18b7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -480,35 +480,37 @@ extension Parser { /// mutating func lexQuantifier( ) -> (Located, Located, [AST.Trivia])? { - var trivia: [AST.Trivia] = [] + tryEating { p in + var trivia: [AST.Trivia] = [] - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let amt: Located? = recordLoc { p in - if p.tryEat("*") { return .zeroOrMore } - if p.tryEat("+") { return .oneOrMore } - if p.tryEat("?") { return .zeroOrOne } + let amt: Located? 
= p.recordLoc { p in + if p.tryEat("*") { return .zeroOrMore } + if p.tryEat("+") { return .oneOrMore } + if p.tryEat("?") { return .zeroOrOne } - return p.tryEating { p in - guard p.tryEat("{"), - let range = p.lexRange(trivia: &trivia), - p.tryEat("}") - else { return nil } - return range.value + return p.tryEating { p in + guard p.tryEat("{"), + let range = p.lexRange(trivia: &trivia), + p.tryEat("}") + else { return nil } + return range.value + } } - } - guard let amt = amt else { return nil } + guard let amt = amt else { return nil } - // PCRE allows non-semantic whitespace here in extended syntax mode. - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + // PCRE allows non-semantic whitespace here in extended syntax mode. + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let kind: Located = recordLoc { p in - if p.tryEat("?") { return .reluctant } - if p.tryEat("+") { return .possessive } - return .eager - } + let kind: Located = p.recordLoc { p in + if p.tryEat("?") { return .reluctant } + if p.tryEat("+") { return .possessive } + return .eager + } - return (amt, kind, trivia) + return (amt, kind, trivia) + } } /// Try to consume a range, returning `nil` if unsuccessful. diff --git a/Sources/_RegexParser/Utility/Misc.swift b/Sources/_RegexParser/Utility/Misc.swift index d37dfbd4a..70dc7a7d5 100644 --- a/Sources/_RegexParser/Utility/Misc.swift +++ b/Sources/_RegexParser/Utility/Misc.swift @@ -19,6 +19,21 @@ extension Substring { var string: String { String(self) } } +extension Character { + /// Whether this character is made up of exactly one Unicode scalar value. + public var hasExactlyOneScalar: Bool { + let scalars = unicodeScalars + return scalars.index(after: scalars.startIndex) == scalars.endIndex + } + + /// Whether the given character is in NFC form. 
+ internal var isNFC: Bool { + if isASCII { return true } + let str = String(self) + return str._nfcCodeUnits.elementsEqual(str.utf8) + } +} + extension CustomStringConvertible { @_alwaysEmitIntoClient public var halfWidthCornerQuoted: String { diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 477760ef8..e8c92f2b5 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(exit) } + /// Coalesce any adjacent scalar members in a custom character class together. + /// This is required in order to produce correct grapheme matching behavior. + func coalescingCustomCharacterClassMembers( + _ members: [DSLTree.CustomCharacterClass.Member] + ) -> [DSLTree.CustomCharacterClass.Member] { + struct Accumulator { + /// A series of range operands. For example, in `[ab-cde-fg]`, this will + /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting + /// ranges will be created. + private var rangeOperands: [String] = [""] + + /// The current range operand. + private var current: String { + _read { yield rangeOperands[rangeOperands.count - 1] } + _modify { yield &rangeOperands[rangeOperands.count - 1] } + } + + /// Try to accumulate a character class member, returning `true` if + /// successful, `false` otherwise. 
+ mutating func tryAccumulate( + _ member: DSLTree.CustomCharacterClass.Member + ) -> Bool { + switch member { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + current.append(c) + return true + case .quotedLiteral(let str): + current += str + return true + case let .range(lhs, rhs): + guard let lhs = lhs.literalCharacterValue, + let rhs = rhs.literalCharacterValue + else { return false } + current.append(lhs) + rangeOperands.append(String(rhs)) + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !current.isEmpty + default: + return false + } + } + + func finish() -> [DSLTree.CustomCharacterClass.Member] { + if rangeOperands.count == 1 { + // If we didn't have any additional range operands, this isn't a + // range, we can just form a standard quoted literal. + return [.quotedLiteral(current)] + } + var members = [DSLTree.CustomCharacterClass.Member]() + + // We have other range operands, splice them together. For N operands + // we have N - 1 ranges. + for (i, lhs) in rangeOperands.dropLast().enumerated() { + let rhs = rangeOperands[i + 1] + + // If this is the first operand we only need to drop the last + // character for its quoted members, otherwise this is both an LHS + // and RHS of a range, and as such needs both sides trimmed. + let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast() + if !leading.isEmpty { + members.append(.quotedLiteral(String(leading))) + } + members.append(.range(.char(lhs.last!), .char(rhs.first!))) + } + // We've handled everything except the quoted portion of the last + // operand, add it now. + let trailing = rangeOperands.last!.dropFirst() + if !trailing.isEmpty { + members.append(.quotedLiteral(String(trailing))) + } + return members + } + } + return members + .map { m -> DSLTree.CustomCharacterClass.Member in + // First we need to recursively coalsce any child character classes. 
+ switch m { + case .custom(let ccc): + return .custom(coalescingCustomCharacterClass(ccc)) + case .intersection(let lhs, let rhs): + return .intersection( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .subtraction(let lhs, let rhs): + return .subtraction( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .symmetricDifference(let lhs, let rhs): + return .symmetricDifference( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .atom, .range, .quotedLiteral, .trivia: + return m + } + } + .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in + accum.tryAccumulate(member) + } + } + + func coalescingCustomCharacterClass( + _ ccc: DSLTree.CustomCharacterClass + ) -> DSLTree.CustomCharacterClass { + // This only needs to be done in grapheme semantic mode. In scalar semantic + // mode, we don't want to coalesce any scalars into a grapheme. This + // means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and + // U+302. + guard options.semanticLevel == .graphemeCluster else { return ccc } + + let members = coalescingCustomCharacterClassMembers(ccc.members) + return .init(members: members, isInverted: ccc.isInverted) + } + mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) throws { + // Before emitting a custom character class in grapheme semantic mode, we + // need to coalesce together any adjacent characters and scalars, over which + // we can perform grapheme breaking. This includes e.g range bounds for + // `[e\u{301}-\u{302}]`. 
+ let ccc = coalescingCustomCharacterClass(ccc) if let asciiBitset = ccc.asAsciiBitset(options), optimizationsEnabled { if options.semanticLevel == .unicodeScalar { @@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { + // Before emitting a concatenation, we need to flatten out any nested + // concatenations, and coalesce any adjacent characters and scalars, forming + // quoted literals of their contents, over which we can perform grapheme + // breaking. + func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { + switch node { + case .concatenation(let ch): + return ch.flatMap(flatten) + case .convertedRegexLiteral(let n, _): + return flatten(n) + default: + return [node] + } + } + let children = children + .flatMap(flatten) + .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in + switch node { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + str.append(c) + return true + case .quotedLiteral(let q): + str += q + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !str.isEmpty + default: + return false + } + } + for child in children { + try emitConcatenationComponent(child) + } + } + @discardableResult mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? 
{ switch node { @@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitAlternation(children) case let .concatenation(children): - for child in children { - try emitConcatenationComponent(child) - } + try emitConcatenation(children) case let .capture(name, refId, child, transform): options.beginScope() diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 530126a32..b8daa8b21 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -42,19 +42,43 @@ class Compiler { } } +/// Hashable wrapper for `Any.Type`. +struct AnyHashableType: CustomStringConvertible, Hashable { + var ty: Any.Type + init(_ ty: Any.Type) { + self.ty = ty + } + var description: String { "\(ty)" } + + static func == (lhs: Self, rhs: Self) -> Bool { + lhs.ty == rhs.ty + } + func hash(into hasher: inout Hasher) { + hasher.combine(ObjectIdentifier(ty)) + } +} + // An error produced when compiling a regular expression. -enum RegexCompilationError: Error, CustomStringConvertible { +enum RegexCompilationError: Error, Hashable, CustomStringConvertible { // TODO: Source location? case uncapturedReference + case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType) + case invalidCharacterClassRangeOperand(Character) + + static func incorrectOutputType( + incorrect: Any.Type, correct: Any.Type + ) -> Self { + .incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct)) + } - case incorrectOutputType(incorrect: Any.Type, correct: Any.Type) - var description: String { switch self { case .uncapturedReference: return "Found a reference used before it captured any match." 
case .incorrectOutputType(let incorrect, let correct): return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'" + case .invalidCharacterClassRangeOperand(let c): + return "'\(c)' is an invalid bound for character class range" } } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 668d16eb6..083781120 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -63,7 +63,7 @@ extension DSLTree._AST.Atom { extension Character { func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { + ) throws -> MEProgram.ConsumeFunction { let isCaseInsensitive = opts.isCaseInsensitive switch opts.semanticLevel { case .graphemeCluster: @@ -327,24 +327,25 @@ extension DSLTree.CustomCharacterClass.Member { _ opts: MatchingOptions, _ isInverted: Bool ) -> DSLTree.CustomCharacterClass.AsciiBitset? { + typealias Bitset = DSLTree.CustomCharacterClass.AsciiBitset switch self { case let .atom(a): if let val = a.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - val, - isInverted, - opts.isCaseInsensitive - ) + return Bitset(val, isInverted, opts.isCaseInsensitive) } case let .range(low, high): - if let lowVal = low.singleScalarASCIIValue, let highVal = high.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - low: lowVal, - high: highVal, - isInverted: isInverted, - isCaseInsensitive: opts.isCaseInsensitive - ) + if let lowVal = low.singleScalarASCIIValue, + let highVal = high.singleScalarASCIIValue { + return Bitset(low: lowVal, high: highVal, isInverted: isInverted, + isCaseInsensitive: opts.isCaseInsensitive) + } + case .quotedLiteral(let str): + var bitset = Bitset(isInverted: isInverted) + for c in str { + guard let ascii = c._singleScalarAsciiValue else { return nil } + bitset = bitset.union(Bitset(ascii, isInverted, opts.isCaseInsensitive)) } 
+ return bitset default: return nil } @@ -361,38 +362,68 @@ extension DSLTree.CustomCharacterClass.Member { } return c case let .range(low, high): - // TODO: - guard let lhs = low.literalCharacterValue else { + guard let lhsChar = low.literalCharacterValue else { throw Unsupported("\(low) in range") } - guard let rhs = high.literalCharacterValue else { + guard let rhsChar = high.literalCharacterValue else { throw Unsupported("\(high) in range") } - if opts.isCaseInsensitive { - let lhsLower = lhs.lowercased() - let rhsLower = rhs.lowercased() - guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { - // TODO: semantic level - return input.index(after: curIdx) - } - return nil + // We must have NFC single scalar bounds. + guard let lhs = lhsChar.singleScalar, lhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(lhsChar) + } + guard let rhs = rhsChar.singleScalar, rhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(rhsChar) + } + guard lhs <= rhs else { + throw Unsupported("Invalid range \(low)-\(high)") + } + + let isCaseInsensitive = opts.isCaseInsensitive + let isCharacterSemantic = opts.semanticLevel == .graphemeCluster + + return { input, bounds in + let curIdx = bounds.lowerBound + let nextIndex = isCharacterSemantic + ? input.index(after: curIdx) + : input.unicodeScalars.index(after: curIdx) + + // Under grapheme semantics, we compare based on single NFC scalars. If + // such a character is not single scalar under NFC, the match fails. In + // scalar semantics, we compare the exact scalar value to the NFC + // bounds. + let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar + : input.unicodeScalars[curIdx] + guard let scalar = scalar else { return nil } + let scalarRange = lhs ... 
rhs + if scalarRange.contains(scalar) { + return nextIndex } - } else { - guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhs...rhs).contains(input[curIdx]) { - // TODO: semantic level - return input.index(after: curIdx) + + // Check for case insensitive matches. + func matchesCased( + _ cased: (UnicodeScalar.Properties) -> String + ) -> Bool { + let casedStr = cased(scalar.properties) + // In character semantic mode, we need to map to NFC. In scalar + // semantics, we should have an exact scalar. + let mapped = isCharacterSemantic ? casedStr.singleNFCScalar + : casedStr.singleScalar + guard let mapped = mapped else { return false } + return scalarRange.contains(mapped) + } + if isCaseInsensitive { + if scalar.properties.changesWhenLowercased, + matchesCased(\.lowercaseMapping) { + return nextIndex + } + if scalar.properties.changesWhenUppercased, + matchesCased(\.uppercaseMapping) { + return nextIndex } - return nil } + return nil } case let .custom(ccc): @@ -434,21 +465,17 @@ extension DSLTree.CustomCharacterClass.Member { } return rhs(input, bounds) } - case .quotedLiteral(let s): - if opts.isCaseInsensitive { - return { input, bounds in - guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else { - return nil - } - return input.index(after: bounds.lowerBound) - } - } else { - return { input, bounds in - guard s.contains(input[bounds.lowerBound]) else { - return nil + case .quotedLiteral(let str): + let consumers = try str.map { + try $0.generateConsumer(opts) + } + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx } - return input.index(after: bounds.lowerBound) } + return nil } case .trivia: // TODO: Should probably strip this earlier... 
diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 80f2e7697..c1753c49d 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -70,16 +70,9 @@ extension PrettyPrinter { for namedCapture in namedCaptures { print("let \(namedCapture) = Reference(Substring.self)") } - - switch node { - case .concatenation(_): - printAsPattern(convertedFromAST: node) - case .convertedRegexLiteral(.concatenation(_), _): - printAsPattern(convertedFromAST: node) - default: - printBlock("Regex") { printer in - printer.printAsPattern(convertedFromAST: node) - } + + printBlock("Regex") { printer in + printer.printAsPattern(convertedFromAST: node, isTopLevel: true) } } @@ -89,7 +82,7 @@ extension PrettyPrinter { // to have a non-backing-off pretty-printer that this // can defer to. private mutating func printAsPattern( - convertedFromAST node: DSLTree.Node + convertedFromAST node: DSLTree.Node, isTopLevel: Bool = false ) { if patternBackoff(DSLTree._Tree(node)) { printBackoff(node) @@ -106,11 +99,7 @@ extension PrettyPrinter { } case let .concatenation(c): - printBlock("Regex") { printer in - c.forEach { - printer.printAsPattern(convertedFromAST: $0) - } - } + printConcatenationAsPattern(c, isTopLevel: isTopLevel) case let .nonCapturingGroup(kind, child): switch kind.ast { @@ -263,7 +252,7 @@ extension PrettyPrinter { // check above, so it should work out. Need a // cleaner way to do this. This means the argument // label is a lie. 
- printAsPattern(convertedFromAST: n) + printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel) case let .customCharacterClass(ccc): printAsPattern(ccc) @@ -279,6 +268,64 @@ extension PrettyPrinter { print("/* TODO: absent function */") } } + + enum NodeToPrint { + case dslNode(DSLTree.Node) + case stringLiteral(String) + } + + mutating func printAsPattern(_ node: NodeToPrint) { + switch node { + case .dslNode(let n): + printAsPattern(convertedFromAST: n) + case .stringLiteral(let str): + print(str) + } + } + + mutating func printConcatenationAsPattern( + _ nodes: [DSLTree.Node], isTopLevel: Bool + ) { + // We need to coalesce any adjacent character and scalar elements into a + // string literal, preserving scalar syntax. + let nodes = nodes + .map { NodeToPrint.dslNode($0.lookingThroughConvertedLiteral) } + .coalescing( + with: StringLiteralBuilder(), into: { .stringLiteral($0.result) } + ) { literal, node in + guard case .dslNode(let node) = node else { return false } + switch node { + case let .atom(.char(c)): + literal.append(c) + return true + case let .atom(.scalar(s)): + literal.append(unescaped: s._dslBase) + return true + case .quotedLiteral(let q): + literal.append(q) + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !literal.isEmpty + default: + return false + } + } + if isTopLevel || nodes.count == 1 { + // If we're at the top level, or we coalesced everything into a single + // element, we don't need to print a surrounding Regex { ... }. 
+ for n in nodes { + printAsPattern(n) + } + return + } + printBlock("Regex") { printer in + for n in nodes { + printer.printAsPattern(n) + } + } + } mutating func printAsPattern( _ ccc: DSLTree.CustomCharacterClass, @@ -341,8 +388,7 @@ extension PrettyPrinter { charMembers.append(c) return false case let .scalar(s): - charMembers.append( - unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}") + charMembers.append(unescaped: s._dslBase) return false case .unconverted(_): return true @@ -449,9 +495,9 @@ extension PrettyPrinter { case let .scalar(s): if wrap { - output("One(.anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\"))") + output("One(.anyOf(\(s._dslBase._bareQuoted)))") } else { - output(".anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\")") + output(".anyOf(\(s._dslBase._bareQuoted))") } case let .unconverted(a): @@ -625,6 +671,10 @@ extension String { } } +extension UnicodeScalar { + var _dslBase: String { "\\u{\(String(value, radix: 16, uppercase: true))}" } +} + /// A helper for building string literals, which handles escaping the contents /// appended. 
fileprivate struct StringLiteralBuilder { @@ -851,19 +901,15 @@ extension AST.Atom { } var _dslBase: (String, canBeWrapped: Bool) { - func scalarLiteral(_ s: UnicodeScalar) -> String { - let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" - } switch kind { case let .char(c): return (String(c), false) case let .scalar(s): - return (scalarLiteral(s.value), false) + return (s.value._dslBase, false) case let .scalarSequence(seq): - return (seq.scalarValues.map(scalarLiteral).joined(), false) + return (seq.scalarValues.map(\._dslBase).joined(), false) case let .property(p): return (p._dslBase, true) diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index c4ac8e759..4eb7bc42c 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -43,61 +43,7 @@ extension AST.Node { return .orderedChoice(children) case let .concatenation(v): - // Coalesce adjacent children who can produce a - // string literal representation - let astChildren = v.children - func coalesce( - _ idx: Array.Index - ) -> (Array.Index, String)? { - var result = "" - var idx = idx - while idx < astChildren.endIndex { - guard let atom: AST.Atom = astChildren[idx].as() else { break } - - // TODO: For printing, nice to coalesce - // scalars literals too. We likely need a different - // approach even before we have a better IR. - if let char = atom.singleCharacter { - result.append(char) - } else if let scalar = atom.singleScalar { - result.append(Character(scalar)) - } else if case .scalarSequence(let seq) = atom.kind { - result += seq.scalarValues.map(Character.init) - } else { - break - } - - astChildren.formIndex(after: &idx) - } - return result.isEmpty ? 
nil : (idx, result) - } - - // No need to nest single children concatenations - if astChildren.count == 1 { - return astChildren.first!.dslTreeNode - } - - // Check for a single child post-coalescing - if let (idx, str) = coalesce(astChildren.startIndex), - idx == astChildren.endIndex - { - return .quotedLiteral(str) - } - - // Coalesce adjacent string children - var curIdx = astChildren.startIndex - var children = Array() - while curIdx < astChildren.endIndex { - if let (nextIdx, str) = coalesce(curIdx) { - // TODO: Track source info... - children.append(.quotedLiteral(str)) - curIdx = nextIdx - } else { - children.append(astChildren[curIdx].dslTreeNode) - astChildren.formIndex(after: &curIdx) - } - } - return .concatenation(children) + return .concatenation(v.children.map(\.dslTreeNode)) case let .group(v): let child = v.child.dslTreeNode @@ -135,10 +81,9 @@ extension AST.Node { case let .atom(v): switch v.kind { case .scalarSequence(let seq): - // Scalar sequences are splatted into concatenated scalars, which - // becomes a quoted literal. Sequences nested in concatenations have - // already been coalesced, this just handles the lone atom case. - return .quotedLiteral(String(seq.scalarValues.map(Character.init))) + // The DSL doesn't have an equivalent node for scalar sequences. Splat + // them into a concatenation of scalars. + return .concatenation(seq.scalarValues.map { .atom(.scalar($0)) }) default: return .atom(v.dslTreeAtom) } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 4ea905fd5..520f4991a 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -334,6 +334,14 @@ extension DSLTree.Node { default: return nil } } + + /// If this node is for a converted literal, look through it. 
+ var lookingThroughConvertedLiteral: Self { + switch self { + case let .convertedRegexLiteral(n, _): return n + default: return self + } + } } extension DSLTree.Atom { diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift index 80f6819a6..e0be4e386 100644 --- a/Sources/_StringProcessing/Unicode/CharacterProps.swift +++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift @@ -11,10 +11,3 @@ // TODO - -extension Character { - /// Whether this character is made up of exactly one Unicode scalar value. - var hasExactlyOneScalar: Bool { - unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex - } -} diff --git a/Sources/_StringProcessing/Unicode/NFC.swift b/Sources/_StringProcessing/Unicode/NFC.swift new file mode 100644 index 000000000..5c2c4aa48 --- /dev/null +++ b/Sources/_StringProcessing/Unicode/NFC.swift @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +@_spi(_Unicode) +import Swift + +extension UnicodeScalar { + /// Checks whether the scalar is in NFC form. + var isNFC: Bool { Character(self).singleNFCScalar == self } +} + +extension Character { + /// If the given character consists of a single NFC scalar, returns it. If + /// there are multiple NFC scalars, returns `nil`. + var singleNFCScalar: UnicodeScalar? { + // SwiftStdlib is always >= 5.7 for a shipped StringProcessing. 
+ guard #available(SwiftStdlib 5.7, *) else { return nil } + var nfcIter = String(self)._nfc.makeIterator() + guard let scalar = nfcIter.next(), nfcIter.next() == nil else { return nil } + return scalar + } + + /// If the given character contains a single scalar, returns it. If none or + /// multiple scalars are present, returns `nil`. + var singleScalar: UnicodeScalar? { + hasExactlyOneScalar ? unicodeScalars.first! : nil + } +} + +extension String { + /// If the given string consists of a single NFC scalar, returns it. If none + /// or multiple NFC scalars are present, returns `nil`. + var singleNFCScalar: UnicodeScalar? { + guard !isEmpty && index(after: startIndex) == endIndex else { return nil } + return first!.singleNFCScalar + } + + /// If the given string contains a single scalar, returns it. If none or + /// multiple scalars are present, returns `nil`. + var singleScalar: UnicodeScalar? { + let scalars = unicodeScalars + guard !scalars.isEmpty && + scalars.index(after: scalars.startIndex) == scalars.endIndex + else { return nil } + return scalars.first! + } +} diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift new file mode 100644 index 000000000..8a9cbe325 --- /dev/null +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +extension Array { + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into elements of the array by `finish`. 
The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. + func coalescing( + with initialAccumulator: T, into finish: (T) -> Self, + accumulate: (inout T, Element) -> Bool + ) -> Self { + var didAccumulate = false + var accumulator = initialAccumulator + + var result = Self() + for elt in self { + if accumulate(&accumulator, elt) { + // The element has been coalesced into accumulator, there is nothing + // else to do. + didAccumulate = true + continue + } + if didAccumulate { + // We have a leftover accumulator, which needs to be finished before we + // can append the next element. + result += finish(accumulator) + accumulator = initialAccumulator + didAccumulate = false + } + result.append(elt) + } + // Handle a leftover accumulation. + if didAccumulate { + result += finish(accumulator) + } + return result + } + + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into an element of the array by `finish`. The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. + func coalescing( + with initialAccumulator: T, into finish: (T) -> Element, + accumulate: (inout T, Element) -> Bool + ) -> Self { + coalescing( + with: initialAccumulator, into: { [finish($0) ]}, accumulate: accumulate) + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 05375a1f7..e25f2df05 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -12,10 +12,7 @@ import XCTest import _StringProcessing import RegexBuilder - -#if os(Linux) -func XCTExpectFailure(_ message: String? 
= nil, body: () throws -> Void) rethrows {} -#endif +import TestSupport class RegexDSLTests: XCTestCase { func _testDSLCaptures( @@ -77,6 +74,9 @@ class RegexDSLTests: XCTestCase { let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" func testCharacterClasses() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + try _testDSLCaptures( ("a c", ("a c", " ", "c")), matchType: (Substring, Substring, Substring).self, ==) @@ -251,6 +251,9 @@ class RegexDSLTests: XCTestCase { } func testCharacterClassOperations() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + try _testDSLCaptures( ("bcdefn1a", "bcdefn1a"), ("nbcdef1a", nil), // fails symmetric difference lookahead @@ -594,6 +597,9 @@ class RegexDSLTests: XCTestCase { } func testQuantificationBehavior() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + // Eager by default try _testDSLCaptures( ("abc1def2", ("abc1def2", "2")), @@ -1429,7 +1435,8 @@ class RegexDSLTests: XCTestCase { "\u{200D}" as UnicodeScalar "πŸ‘¦" as UnicodeScalar } - XCTAssertNil(try r3.firstMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNotNil(try r3.firstMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNotNil(try r3.wholeMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) @@ -1441,18 +1448,72 @@ class RegexDSLTests: XCTestCase { try r4.firstMatch(in: "Γ©") ) - try XCTExpectFailure("Need stronger scalar coalescing logic") { - let r5 = Regex { - "e" - "\u{301}" as UnicodeScalar + let r5 = Regex { + "e" + "\u{301}" as UnicodeScalar + } + XCTAssertNotNil(try r5.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r5.firstMatch(in: "Γ©")) + + let r6 = Regex { + "abcde" + "\u{301}" + } + 
XCTAssertNotNil(try r6.firstMatch(in: "abcde\u{301}")) + XCTAssertNotNil(try r6.firstMatch(in: "abcdΓ©")) + + let r7 = Regex { + "e" as Character + "\u{301}" as Character + } + XCTAssertNotNil(try r7.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r7.firstMatch(in: "Γ©")) + + // You can't match a partial grapheme in grapheme semantic mode. + let r8 = Regex { + "πŸ‘¨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "πŸ‘¨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "πŸ‘§" as UnicodeScalar + } + XCTAssertNil(try r8.firstMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNil(try r8.wholeMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNotNil(try r8.matchingSemantics(.unicodeScalar).firstMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNil(try r8.matchingSemantics(.unicodeScalar).wholeMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + + // Scalar coalescing occurs across nested concatenations and literals. + let r9 = Regex { + Regex { + try! Regex(#"πŸ‘¨"#) + "\u{200D}" as UnicodeScalar + Regex { + "πŸ‘¨" as UnicodeScalar + } } - XCTAssertNotNil( - try r5.firstMatch(in: "e\u{301}") - ) - XCTAssertNotNil( - try r5.firstMatch(in: "Γ©") - ) + Regex { + Regex { + "\u{200D}" as UnicodeScalar + "πŸ‘§" + } + try! Regex(#"\u{200D}πŸ‘¦"#) + } + } + XCTAssertNotNil(try r9.firstMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNotNil(try r9.wholeMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).firstMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).wholeMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + + let r10 = Regex { + "πŸ‘¨" as UnicodeScalar + try! 
Regex(#"\u{200D 1F468 200D 1F467}"#) + "\u{200D}" as UnicodeScalar + "πŸ‘¦" as UnicodeScalar } + XCTAssertNotNil(try r10.firstMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNotNil(try r10.wholeMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).firstMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).wholeMatch(in: "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦")) } struct SemanticVersion: Equatable { diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6c8f66e10..27f8d79cb 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -11,6 +11,7 @@ @testable import _RegexParser @testable import _StringProcessing +import TestSupport import XCTest @@ -168,6 +169,45 @@ extension RegexTests { } } + private func testCompileError( + _ regex: String, _ error: RegexCompilationError, + file: StaticString = #file, line: UInt = #line + ) { + do { + _ = try _compileRegex(regex) + XCTFail("Expected compile error", file: file, line: line) + } catch let err as RegexCompilationError { + XCTAssertEqual(err, error, file: file, line: line) + } catch { + XCTFail("Unknown compile error", file: file, line: line) + } + } + + func testInvalidScalarCoalescing() throws { + guard ensureNewStdlib() else { return } + + // Non-single-scalar bounds. 
+ testCompileError( + #"[a\u{302}-βœ…]"#, .invalidCharacterClassRangeOperand("a\u{302}")) + testCompileError( + #"[e\u{301}-\u{302}]"#, .invalidCharacterClassRangeOperand("e\u{301}")) + testCompileError( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + .invalidCharacterClassRangeOperand("\u{73}\u{323}\u{307}")) + testCompileError( + #"[a\u{315}\u{301}-\u{302}]"#, + .invalidCharacterClassRangeOperand("a\u{315}\u{301}") + ) + testCompileError( + #"[a-z1e\u{301}-\u{302}\u{E1}3-59]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + testCompileError( + #"[[e\u{301}-\u{302}]&&e\u{303}]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + } + func testCompileQuantification() throws { // NOTE: While we might change how we compile @@ -317,6 +357,15 @@ extension RegexTests { semanticLevel: .unicodeScalar, contains: [.matchBitsetScalar], doesNotContain: [.matchBitset, .consumeBy]) + expectProgram( + for: #"[\Qab\Ec]"#, + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: #"[\Qab\Ec]"#, + semanticLevel: .unicodeScalar, + contains: [.matchBitsetScalar], + doesNotContain: [.matchBitset, .consumeBy]) } func testScalarOptimizeCompilation() { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index a8f7977d6..8e01582a9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -12,6 +12,7 @@ import XCTest @testable import _RegexParser @testable import _StringProcessing +import TestSupport struct MatchError: Error { var message: String @@ -26,23 +27,33 @@ func _firstMatch( validateOptimizations: Bool, semanticLevel: RegexSemanticLevel = .graphemeCluster, syntax: SyntaxOptions = .traditional -) throws -> (String, [String?]) { +) throws -> (String, [String?])? 
{ var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) - guard let result = try regex.firstMatch(in: input) else { - throw MatchError("match not found for \(regexStr) in \(input)") - } - let caps = result.output.slices(from: input) - + let result = try regex.firstMatch(in: input) + if validateOptimizations { regex._setCompilerOptionsForTesting(.disableOptimizations) - guard let unoptResult = try regex.firstMatch(in: input) else { + let unoptResult = try regex.firstMatch(in: input) + if result != nil && unoptResult == nil { throw MatchError("match not found for unoptimized \(regexStr) in \(input)") } - XCTAssertEqual( - String(input[result.range]), - String(input[unoptResult.range]), - "Unoptimized regex returned a different result") + if result == nil && unoptResult != nil { + throw MatchError("match not found in optimized \(regexStr) in \(input)") + } + if let result = result, let unoptResult = unoptResult { + let optMatch = String(input[result.range]) + let unoptMatch = String(input[unoptResult.range]) + if optMatch != unoptMatch { + throw MatchError(""" + + Unoptimized regex returned: '\(unoptMatch)' + Optimized regex returned: '\(optMatch)' + """) + } + } } + guard let result = result else { return nil } + let caps = result.output.slices(from: input) return (String(input[result.range]), caps.map { $0.map(String.init) }) } @@ -153,12 +164,12 @@ func firstMatchTest( line: UInt = #line ) { do { - let (found, _) = try _firstMatch( + let found = try _firstMatch( regex, input: input, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax) + syntax: syntax)?.0 if xfail { XCTAssertNotEqual(found, match, file: file, line: line) @@ -166,9 +177,7 @@ func firstMatchTest( XCTAssertEqual(found, match, "Incorrect match", file: file, line: line) } } catch { - // FIXME: This allows non-matches to succeed even when xfail'd - // When xfail == true, this should report failure for match == nil - if !xfail && match != 
nil { + if !xfail { XCTFail("\(error)", file: file, line: line) } return @@ -182,6 +191,7 @@ func firstMatchTests( enableTracing: Bool = false, dumpAST: Bool = false, xfail: Bool = false, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { @@ -194,6 +204,7 @@ func firstMatchTests( enableTracing: enableTracing, dumpAST: dumpAST, xfail: xfail, + semanticLevel: semanticLevel, file: file, line: line) } @@ -303,6 +314,55 @@ extension RegexTests { match: "\u{006f}\u{031b}\u{0323}" ) + // e + combining accents + firstMatchTest( + #"e\u{301 302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{315 35C 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\u{302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{315}\u{301}\u{35C}", + match: "e\u{315}\u{301}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\de\u{302}"#, + input: "e\u{301}0e\u{302}", + match: "e\u{301}0e\u{302}" + ) + firstMatchTest( + #"(?x) e \u{35C} \u{315}(?#hello)\u{301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"(?x) e \u{35C} \u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + + // We don't coalesce across groups. + firstMatchTests( + #"e\u{301}(?:\u{315}\u{35C})?"#, + ("e\u{301}", "e\u{301}"), + ("e\u{301}\u{315}\u{35C}", nil) + ) + // Escape sequences that represent scalar values. 
firstMatchTest(#"\a[\b]\e\f\n\r\t"#, input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", @@ -311,8 +371,6 @@ extension RegexTests { input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t") - firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n") - // MARK: Quotes firstMatchTest( @@ -428,8 +486,7 @@ extension RegexTests { "a++a", ("babc", nil), ("baaabc", nil), - ("bb", nil), - xfail: true) + ("bb", nil)) firstMatchTests( "a+?a", ("babc", nil), @@ -505,23 +562,19 @@ extension RegexTests { ("baabc", nil), ("bb", nil)) - // XFAIL'd versions of the above firstMatchTests( "a{2,4}+a", - ("baaabc", nil), - xfail: true) + ("baaabc", nil)) firstMatchTests( "a{,4}+a", ("babc", nil), ("baabc", nil), - ("baaabc", nil), - xfail: true) + ("baaabc", nil)) firstMatchTests( "a{2,}+a", ("baaabc", nil), ("baaaaabc", nil), - ("baaaaaaaabc", nil), - xfail: true) + ("baaaaaaaabc", nil)) // XFAIL'd possessive tests firstMatchTests( @@ -568,6 +621,9 @@ extension RegexTests { } func testMatchCharacterClasses() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Character classes firstMatchTest(#"abc\d"#, input: "xyzabc123", match: "abc1") @@ -691,6 +747,331 @@ extension RegexTests { ("a\u{301}", true), semanticLevel: .unicodeScalar) + // Scalar matching in quoted sequences. 
+ firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", nil), + ("\u{C9}", nil) + ) + firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", nil), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "\u{301}"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", "E\u{301}"), + ("\u{C9}", "\u{C9}") + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", "E"), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "E"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + + // Scalar coalescing. + firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[πŸ‘¨\u{200D}πŸ‘©\u{200D}πŸ‘§\u{200D}πŸ‘¦]"#, + ("πŸ‘¨", nil), + ("πŸ‘©", nil), + ("πŸ‘§", nil), + ("πŸ‘¦", nil), + ("\u{200D}", nil), + ("πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦", "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦") + ) + firstMatchTests( + #"[πŸ‘¨\u{200D}πŸ‘©\u{200D}πŸ‘§\u{200D}πŸ‘¦]"#, + ("πŸ‘¨", "πŸ‘¨"), + ("πŸ‘©", "πŸ‘©"), + ("πŸ‘§", "πŸ‘§"), + ("πŸ‘¦", "πŸ‘¦"), + ("\u{200D}", "\u{200D}"), + ("πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦", "πŸ‘¨"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + 
("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + firstMatchTests( + #"(?x) [ e \u{315} \u{301} \u{35C} ]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + + // We don't coalesce across character classes. + firstMatchTests( + #"e[\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{315}\u{301}", nil), + ("e\u{301}\u{315}\u{35C}", nil) + ) + firstMatchTests( + #"[e[\u{301}]]"#, + ("e", "e"), + ("\u{301}", "\u{301}"), + ("e\u{301}", nil) + ) + + firstMatchTests( + #"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#, + ("a", "a"), + ("a\u{301}", "a\u{301}"), + ("\u{E1}", "\u{E1}"), + ("\u{E2}", nil), + ("z", "z"), + ("e", "e"), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("\u{302}", "\u{302}"), + ("1", "1"), + ("2", nil), + ("3", "3"), + ("4", "4"), + ("5", "5"), + ("6", nil), + ("7", nil), + ("8", nil), + ("9", "9") + ) + firstMatchTests( + #"[ab-df-hik-lm]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", "d"), + ("e", nil), + ("f", "f"), + ("g", "g"), + ("h", "h"), + ("i", "i"), + ("j", nil), + ("k", "k"), + ("l", "l"), + ("m", "m") + ) + firstMatchTests( + #"[a-ce-fh-j]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", nil), + ("e", "e"), + ("f", "f"), + ("g", nil), + ("h", "h"), + ("i", "i"), + ("j", "j") + ) + + + // These can't compile in grapheme semantic mode, but make sure they work in + // scalar semantic mode. 
+ firstMatchTests( + #"[a\u{315}\u{301}-\u{302}]"#, + ("a", "a"), + ("\u{315}", "\u{315}"), + ("\u{301}", "\u{301}"), + ("\u{302}", "\u{302}"), + ("\u{303}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + ("\u{73}", "\u{73}"), + ("\u{323}", "\u{323}"), + ("\u{307}", "\u{307}"), + ("\u{400}", "\u{400}"), + ("\u{500}", "\u{500}"), + ("\u{1E00}", "\u{1E00}"), + ("\u{1E01}", nil), + ("\u{1E69}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[a\u{302}-βœ…]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", "\u{302}"), + ("\u{2705}", "\u{2705}"), + ("βœ…", "βœ…"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[a\u{302}-βœ…]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "A"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", "\u{302}"), + ("\u{2705}", "\u{2705}"), + ("βœ…", "βœ…"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", "\u{301}"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + 
("E\u{301}", "E"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + + // Set operation scalar coalescing. + firstMatchTests( + #"[e\u{301}&&e\u{301}e\u{302}]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", "e\u{301}"), + ("e\u{302}", nil)) + firstMatchTests( + #"[e\u{301}~~[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", nil), + ("e\u{302}", "e\u{302}")) + firstMatchTests( + #"[e\u{301}[e\u{303}]--[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) + + firstMatchTests( + #"(?x) [ e \u{301} [ e \u{303} ] -- [ [ e \u{301} ] e \u{302} ] ]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters @@ -773,6 +1154,15 @@ extension RegexTests { } firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}") + firstMatchTest(#"[12]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣") + firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil) + firstMatchTest("[0-2&&1-3]", input: "1️⃣", match: nil) + firstMatchTest("[1-2e\u{301}]", input: "1️⃣", match: nil) + + firstMatchTest(#"[\u{3A9}-\u{3A9}]"#, input: "\u{3A9}", match: "\u{3A9}") + // Currently not supported in the matching engine. 
for c: UnicodeScalar in ["a", "b", "c"] { firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", @@ -826,6 +1216,35 @@ extension RegexTests { firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc", syntax: .experimental) firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#) + + for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + // Case sensitivity and ranges. + for ch in "abcD" { + firstMatchTest("[a-cD]", input: String(ch), match: String(ch)) + } + for ch in "ABCd" { + firstMatchTest("[a-cD]", input: String(ch), match: nil) + } + for ch in "abcABCdD" { + let input = String(ch) + firstMatchTest( + "(?i)[a-cd]", input: input, match: input, semanticLevel: semantics) + firstMatchTest( + "(?i)[A-CD]", input: input, match: input, semanticLevel: semantics) + } + for ch in "XYZ[\\]^_`abcd" { + let input = String(ch) + firstMatchTest( + "[X-cd]", input: input, match: input, semanticLevel: semantics) + } + for ch in "XYZ[\\]^_`abcxyzABCdD" { + let input = String(ch) + firstMatchTest( + "(?i)[X-cd]", input: input, match: input, semanticLevel: semantics) + firstMatchTest( + "(?i)[X-cD]", input: input, match: input, semanticLevel: semantics) + } + } } func testCharacterProperties() { @@ -1038,6 +1457,9 @@ extension RegexTests { } func testMatchAnchors() throws { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // MARK: Anchors firstMatchTests( #"^\d+"#, @@ -1086,8 +1508,6 @@ extension RegexTests { (" 123\n456\n", nil), ("123 456", "456")) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) firstMatchTests( #"\d+\b"#, ("123", "123"), @@ -1105,7 +1525,6 @@ extension RegexTests { ("123", "23"), (" 123", "23"), ("123 456", "23")) -#endif // TODO: \G and \K do { @@ -1118,8 +1537,8 @@ extension RegexTests { // TODO: Oniguruma \y and \Y firstMatchTests( #"\u{65}"#, // Scalar 'e' is present in both - ("Cafe\u{301}", nil), // but scalar mode requires boundary at end of match - xfail: true) + ("Cafe\u{301}", nil)) // but scalar mode requires boundary at end of match + firstMatchTests( #"\u{65}"#, // Scalar 'e' is present in both ("Sol Cafe", "e")) // standalone is okay @@ -1136,9 +1555,10 @@ extension RegexTests { ("Sol Cafe", nil), xfail: true) } - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) func testLevel2WordBoundaries() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Level 2 Word Boundaries firstMatchTest(#"\b😊\b"#, input: "πŸ”₯πŸ˜ŠπŸ‘", match: "😊") firstMatchTest(#"\bπŸ‘¨πŸ½\b"#, input: "πŸ‘©πŸ»πŸ‘ΆπŸΏπŸ‘¨πŸ½πŸ§‘πŸΎπŸ‘©πŸΌ", match: "πŸ‘¨πŸ½") @@ -1154,9 +1574,11 @@ extension RegexTests { firstMatchTest(#"can\B\'\Bt"#, input: "I can't do that.", match: "can't") firstMatchTest(#"\bΓ·\b"#, input: "3 Γ· 3 = 1", match: "Γ·") } -#endif - + func testMatchGroups() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Groups // Named captures @@ -1380,6 +1802,9 @@ extension RegexTests { } func testMatchExamples() { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // Backreferences matchTest( #"(sens|respons)e and \1ibility"#, @@ -1429,8 +1854,6 @@ extension RegexTests { xfail: true ) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) // HTML tags matchTest( #"<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>.*?"#, @@ -1448,7 +1871,6 @@ extension RegexTests { ("pass me the the kettle", ["the"]), ("this doesn't have any", nil) ) -#endif // Floats flatCaptureTest( @@ -1464,8 +1886,79 @@ extension RegexTests { firstMatchTest(#".+"#, input: "a\nb", match: "a") firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") } + + func testMatchNewlines() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + + for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + firstMatchTest( + #"\r\n"#, input: "\r\n", match: "\r\n", + semanticLevel: semantics + ) + firstMatchTest( + #"\r\n"#, input: "\n", match: nil, semanticLevel: semantics) + firstMatchTest( + #"\r\n"#, input: "\r", match: nil, semanticLevel: semantics) + + // \r\n is not treated as ASCII. + firstMatchTest( + #"^\p{ASCII}$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^\r$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\r]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^\n$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\n]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\u{0}-\u{7F}]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + + let scalarSemantics = semantics == .unicodeScalar + firstMatchTest( + #"\p{ASCII}"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"\r"#, input: "\r\n", match: scalarSemantics ? 
"\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\r]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"\n"#, input: "\r\n", match: scalarSemantics ? "\n" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\n]"#, input: "\r\n", match: scalarSemantics ? "\n" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\u{0}-\u{7F}]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + } + } func testCaseSensitivity() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + matchTest( #"c..e"#, ("cafe", true), @@ -1528,6 +2021,9 @@ extension RegexTests { } func testASCIIClasses() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // 'D' ASCII-only digits matchTest( #"\d+"#, @@ -1556,8 +2052,6 @@ extension RegexTests { ("aeiou", true), ("Γ₯e\u{301}ïôú", false)) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) matchTest( #"abcd\b.+"#, ("abcd ef", true), @@ -1573,7 +2067,6 @@ extension RegexTests { ("abcd ef", true), ("abcdef", false), ("abcdΓ©f", false)) -#endif // 'S' ASCII-only spaces matchTest( @@ -1699,6 +2192,9 @@ extension RegexTests { var eComposed: String { "Γ©" } var eDecomposed: String { "e\u{301}" } + var eComposedUpper: String { "Γ‰" } + var eDecomposedUpper: String { "E\u{301}" } + func testIndividualScalars() { // Expectation: A standalone Unicode scalar value in a regex literal // can match either that specific scalar value or participate in matching @@ -1711,19 +2207,15 @@ extension RegexTests { firstMatchTest(#"\u{65 301}$"#, input: eComposed, match: eComposed) // FIXME: Implicit \y at end of match - firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil, - xfail: true) + firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil) firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: 
nil) - // FIXME: \y is unsupported - firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, - xfail: true) + firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil) // FIXME: Unicode scalars are only matched at the start of a grapheme cluster firstMatchTest(#"\u{301}"#, input: eDecomposed, match: "\u{301}", xfail: true) - // FIXME: \y is unsupported - firstMatchTest(#"\y\u{301}"#, input: eDecomposed, match: nil, - xfail: true) + + firstMatchTest(#"\y\u{301}"#, input: eDecomposed, match: nil) } func testCanonicalEquivalence() throws { @@ -1745,6 +2237,16 @@ extension RegexTests { #"e$"#, (eComposed, false), (eDecomposed, false)) + + matchTest( + #"\u{65 301}"#, + (eComposed, true), + (eDecomposed, true)) + + matchTest( + #"(?x) \u{65} \u{301}"#, + (eComposed, true), + (eDecomposed, true)) } func testCanonicalEquivalenceCharacterClass() throws { @@ -1781,41 +2283,70 @@ extension RegexTests { // \s firstMatchTest(#"\s"#, input: " ", match: " ") // FIXME: \s shouldn't match a number composed with a non-number character - firstMatchTest(#"\s\u{305}"#, input: " ", match: nil, - xfail: true) + firstMatchTest(#"\s\u{305}"#, input: " ", match: nil) // \p{Whitespace} firstMatchTest(#"\s"#, input: " ", match: " ") - // FIXME: \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character - firstMatchTest(#"\s\u{305}"#, input: " ", match: nil, - xfail: true) + // \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character + firstMatchTest(#"\s\u{305}"#, input: " ", match: nil) } func testCanonicalEquivalenceCustomCharacterClass() throws { - // Expectation: Concatenations with custom character classes should be able - // to match within a grapheme cluster. That is, a regex should be able to - // match the scalar values that comprise a grapheme cluster in separate, - // or repeated, custom character classes. - + // Expectation: Custom character class matches do not cross grapheme + // character boundaries by default. 
When matching with Unicode scalar + // semantics, grapheme cluster boundaries are ignored, so matching + // sequences of custom character classes can succeed. + + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + matchTest( #"[Ñéíóú]$"#, (eComposed, true), (eDecomposed, true)) - // FIXME: Custom char classes don't use canonical equivalence with composed characters - firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) + for input in [eDecomposed, eComposed] { + // Unicode scalar semantics means that only the decomposed version can + // match here. + let match = input.unicodeScalars.count == 2 ? input : nil + firstMatchTest( + #"e[\u{301}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"e[\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[e][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[e-e][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[a-z][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + } + for input in [eComposed, eDecomposed] { + // Grapheme cluster semantics means that we can't match the 'e' separately + // from the accent. + firstMatchTest(#"e[\u{301}]$"#, input: input, match: nil) + firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[e-e][\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: input, match: nil) + + // A range that covers Γ© (U+E9). Inputs are mapped to NFC, so match. 
+ firstMatchTest(#"[\u{E8}-\u{EA}]"#, input: input, match: input) + } - // FIXME: Custom char classes don't match decomposed characters - firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) + // A range that covers Γ‰ (U+C9). Inputs are mapped to NFC, so match. + for input in [eComposedUpper, eDecomposedUpper] { + firstMatchTest(#"[\u{C8}-\u{CA}]"#, input: input, match: input) + firstMatchTest(#"[\u{C9}-\u{C9}]"#, input: input, match: input) + } + // Case insensitive matching of Γ‰ (U+C9). + for input in [eComposed, eDecomposed, eComposedUpper, eDecomposedUpper] { + firstMatchTest(#"(?i)[\u{C8}-\u{CA}]"#, input: input, match: input) + firstMatchTest(#"(?i)[\u{C9}-\u{C9}]"#, input: input, match: input) + } let flag = "πŸ‡°πŸ‡·" firstMatchTest(#"πŸ‡°πŸ‡·"#, input: flag, match: flag) @@ -1824,27 +2355,33 @@ extension RegexTests { firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag) // First Unicode scalar followed by CCC of regional indicators - firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, - xfail: true) - - // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character + firstMatchTest( + #"^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) + // A CCC of regional indicators followed by the second Unicode scalar + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) // A CCC of regional indicators x 2 - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag, - xfail: true) + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) + // A CCC of N regional indicators + firstMatchTest( + 
#"^[\u{1F1E6}-\u{1F1FF}]+$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) - // FIXME: A single CCC of regional indicators matches the whole flag character - // A CCC of regional indicators followed by the second Unicode scalar - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag, - xfail: true) // A single CCC of regional indicators - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil, - xfail: true) - - // A single CCC of actual flag emojis / combined regional indicators - firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag) - // This succeeds (correctly) because \u{1F1F0} is lexicographically - // within the CCC range - firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}") + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil) + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil, + semanticLevel: .unicodeScalar + ) } func testAnyChar() throws { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 52a272915..84ce361f3 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -374,10 +374,21 @@ extension RegexTests { // MARK: Allowed combining characters - parseTest("e\u{301}", "e\u{301}") parseTest("1\u{358}", "1\u{358}") parseTest(#"\ \#u{361}"#, " \u{361}") + parseTest("e\u{301}", "e\u{301}") + parseTest("[e\u{301}]", charClass("e\u{301}")) + parseTest("\u{E9}", "e\u{301}") + parseTest("[\u{E9}]", charClass("e\u{301}")) + + parseTest( + "\\e\u{301}", "e\u{301}", throwsError: .invalidEscape("e\u{301}")) + parseTest( + "[\\e\u{301}]", charClass("e\u{301}"), + throwsError: .invalidEscape("e\u{301}") + ) + // MARK: Alternations parseTest( @@ -2885,11 +2896,41 @@ extension RegexTests { diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence")) diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence")) +
diagnosticTest(#"|([🇦🇫-🇿🇼])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"|([👨‍👩‍👦-👩‍👩‍👧‍👧])?"#, .invalidCharacterClassRangeOperand) + + // Not single-scalar NFC. + diagnosticTest("[e\u{301}-e\u{302}]", .invalidCharacterClassRangeOperand) + + // These scalar values expand under NFC. + let nfcExpandingScalars: [UInt32] = [ + 0x344, 0x958, 0x959, 0x95A, 0x95B, 0x95C, 0x95D, 0x95E, 0x95F, 0x9DC, + 0x9DD, 0x9DF, 0xA33, 0xA36, 0xA59, 0xA5A, 0xA5B, 0xA5E, 0xB5C, 0xB5D, + 0xF43, 0xF4D, 0xF52, 0xF57, 0xF5C, 0xF69, 0xF73, 0xF75, 0xF76, 0xF78, + 0xF81, 0xF93, 0xF9D, 0xFA2, 0xFA7, 0xFAC, 0xFB9, 0x2ADC, 0xFB1D, 0xFB1F, + 0xFB2A, 0xFB2B, 0xFB2C, 0xFB2D, 0xFB2E, 0xFB2F, 0xFB30, 0xFB31, 0xFB32, + 0xFB33, 0xFB34, 0xFB35, 0xFB36, 0xFB38, 0xFB39, 0xFB3A, 0xFB3B, 0xFB3C, + 0xFB3E, 0xFB40, 0xFB41, 0xFB43, 0xFB44, 0xFB46, 0xFB47, 0xFB48, 0xFB49, + 0xFB4A, 0xFB4B, 0xFB4C, 0xFB4D, 0xFB4E, 0x1D15E, 0x1D15F, 0x1D160, + 0x1D161, 0x1D162, 0x1D163, 0x1D164, 0x1D1BB, 0x1D1BC, 0x1D1BD, 0x1D1BE, + 0x1D1BF, 0x1D1C0 + ] + for scalar in nfcExpandingScalars { + let hex = String(scalar, radix: 16) + diagnosticTest( + #"[\u{\#(hex)}-\u{\#(hex)}]"#, .invalidCharacterClassRangeOperand) + } + + // The NFC form of U+2126 is U+3A9.
+ diagnosticTest(#"[\u{2126}-\u{2126}]"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + diagnosticTest(#"[e\u{301}-e\u{302}]"#, .invalidCharacterRange(from: "\u{301}", to: "e")) + diagnosticTest("(?x)[(?#)]", .expected("]")) diagnosticTest("(?x)[(?#abc)]", .expected("]")) diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 3b0a8d5b3..e925d255c 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -171,10 +171,71 @@ extension RenderDSLTests { } """#) - // TODO: We ought to try and preserve the scalar syntax here. try testConversion(#"a\u{301}"#, #""" Regex { - "á" + "a\u{301}" } """#) + + try testConversion(#"(?x) a \u{301}"#, #""" + Regex { + "a\u{301}" + } + """#) + + try testConversion(#"(?x) [ a b c \u{301} ] "#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + + try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #""" + Regex { + "👨\u{200D}👨\u{200D}👧\u{200D}👦" + } + """#) + + try testConversion(#"(👨\u{200D}👨)\u{200D}👧\u{200D}👦"#, #""" + Regex { + Capture { + "👨\u{200D}👨" + } + "\u{200D}👧\u{200D}👦" + } + """#) + + // We preserve the structure of non-capturing groups. + try testConversion(#"abcd(?:e\u{301}\d)"#, #""" + Regex { + "abcd" + Regex { + "e\u{301}" + One(.digit) + } + } + """#) + + try testConversion(#"\u{A B C}"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + } + """#) + + // TODO: We might want to consider preserving scalar sequences in the DSL, + // and allowing them to merge with other concatenations.
+ try testConversion(#"\u{A B C}\u{d}efg"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + "\u{D}efg" + } + """#) + + // FIXME: We don't actually have a way of specifying in the DSL that we + // shouldn't join these together, should we print them as regex instead? + try testConversion(#"a(?:\u{301})"#, #""" + Regex { + "a" + "\u{301}" } """#) } diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index aa3639ea6..11479bfb6 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -21,6 +21,7 @@ import XCTest @testable // for internal `matches(of:)` import _StringProcessing +import TestSupport extension UnicodeScalar { var value4Digits: String { @@ -316,6 +317,9 @@ extension UTS18Tests { // surrogate followed by a trailing surrogate shall be handled as a single // code point in matching. func testSupplementaryCodePoints() { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + XCTAssertTrue("👍".contains(regex(#"\u{1F44D}"#))) XCTAssertTrue("👍".contains(regex(#"[\u{1F440}-\u{1F44F}]"#))) XCTAssertTrue("👍👎".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#))) @@ -388,6 +392,9 @@ extension UTS18Tests { } func testCharacterClassesWithStrings() { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) XCTAssertEqual("🧐", "🧐".wholeMatch(of: regex)?.0) XCTAssertEqual("🇧🇫", "🇧🇫".wholeMatch(of: regex)?.0)