From 8b0e5f0533a2227d69a8b71814d4aaae277f43a0 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 16 May 2022 13:02:27 -0500 Subject: [PATCH] More unicode properties (#385) Add validation testing for supported and unsupported Unicode properties, along with support for the following properties: - age - numeric type - numeric value - lower/upper/titlecase mapping - canonical combining class --- Sources/_RegexParser/Regex/AST/Atom.swift | 21 + .../CharacterPropertyClassification.swift | 87 +++- .../Regex/Parse/Diagnostics.swift | 20 +- Sources/_RegexParser/Regex/Parse/Sema.swift | 8 +- .../_StringProcessing/ConsumerInterface.swift | 31 +- Tests/RegexTests/ParseTests.swift | 14 + Tests/RegexTests/UTS18Tests.swift | 385 +++++++++++++++++- 7 files changed, 526 insertions(+), 40 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 075818bac..6721076fc 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -427,11 +427,32 @@ extension AST.Atom.CharacterProperty { /// Character name in the form `\p{name=...}` case named(String) + /// Numeric type. + case numericType(Unicode.NumericType) + + /// Numeric value. + case numericValue(Double) + + /// Case mapping. + case mapping(MapKind, String) + + /// Canonical Combining Class. + case ccc(Unicode.CanonicalCombiningClass) + + /// Character age, as per UnicodeScalar.Properties.age. + case age(major: Int, minor: Int) + case posix(Unicode.POSIXProperty) /// Some special properties implemented by PCRE and Oniguruma. case pcreSpecial(PCRESpecialCategory) case onigurumaSpecial(OnigurumaSpecialProperty) + + public enum MapKind: Hashable { + case lowercase + case uppercase + case titlecase + } } // TODO: erm, separate out or fold into something? splat it in? diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index c0ece78ff..21b5ddc68 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -13,17 +13,17 @@ extension Source { typealias PropertyKind = AST.Atom.CharacterProperty.Kind static private func withNormalizedForms( - _ str: String, match: (String) -> T? - ) -> T? { + _ str: String, match: (String) throws -> T? + ) rethrows -> T? { // This follows the rules provided by UAX44-LM3, including trying to drop an // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for // consistency with other engines and the Unicode.Scalar.Properties names. let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" } .lowercased() - if let m = match(str) { + if let m = try match(str) { return m } - if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) { + if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) { return m } return nil @@ -79,6 +79,19 @@ extension Source { } } + static private func classifyNumericType( + _ str: String + ) -> Unicode.NumericType? { + withNormalizedForms(str) { str in + switch str { + case "decimal": return .decimal + case "digit": return .digit + case "numeric": return .numeric + default: return nil + } + } + } + static private func classifyBoolProperty( _ str: String ) -> Unicode.BinaryProperty? { @@ -361,6 +374,27 @@ extension Source { } } } + + static func parseAge(_ value: String) -> Unicode.Version? { + // Age can be specified in the form '3.0' or 'V3_0'. + // Other formats are not supported. + var str = value[...] + + let separator: Character + if str.first == "V" { + str.removeFirst() + separator = "_" + } else { + separator = "." + } + + guard let sepIndex = str.firstIndex(of: separator), + let major = Int(str[.. PropertyKind? in - switch key { + let match = try withNormalizedForms(key) { normalizedKey -> PropertyKind? in + switch normalizedKey { case "script", "sc": - if let script = classifyScriptProperty(value) { - return .script(script) + guard let script = classifyScriptProperty(value) else { + throw ParseError.unrecognizedScript(value) } + return .script(script) case "scriptextensions", "scx": - if let script = classifyScriptProperty(value) { - return .scriptExtension(script) + guard let script = classifyScriptProperty(value) else { + throw ParseError.unrecognizedScript(value) } + return .scriptExtension(script) case "gc", "generalcategory": - if let cat = classifyGeneralCategory(value) { - return .generalCategory(cat) + guard let cat = classifyGeneralCategory(value) else { + throw ParseError.unrecognizedCategory(value) + } + return .generalCategory(cat) + case "age": + guard let (major, minor) = parseAge(value) else { + throw ParseError.invalidAge(value) } + return .age(major: major, minor: minor) case "name", "na": return .named(value) + case "numericvalue", "nv": + guard let numericValue = Double(value) else { + throw ParseError.invalidNumericValue(value) + } + return .numericValue(numericValue) + case "numerictype", "nt": + guard let type = classifyNumericType(value) else { + throw ParseError.unrecognizedNumericType(value) + } + return .numericType(type) + case "slc", "simplelowercasemapping": + return .mapping(.lowercase, value) + case "suc", "simpleuppercasemapping": + return .mapping(.uppercase, value) + case "stc", "simpletitlecasemapping": + return .mapping(.titlecase, value) + case "ccc", "canonicalcombiningclass": + guard let cccValue = UInt8(value), cccValue <= 254 else { + throw ParseError.invalidCCC(value) + } + return .ccc(.init(rawValue: cccValue)) default: break } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index d87fba918..05bf4ba1a 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -59,7 +59,13 @@ enum ParseError: Error, Hashable { case emptyProperty case unknownProperty(key: String?, value: String) - + case unrecognizedScript(String) + case unrecognizedCategory(String) + case invalidAge(String) + case invalidNumericValue(String) + case unrecognizedNumericType(String) + case invalidCCC(String) + case expectedGroupSpecifier case unbalancedEndOfGroup @@ -181,6 +187,18 @@ extension ParseError: CustomStringConvertible { return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" + case .unrecognizedScript(let value): + return "unrecognized script '\(value)'" + case .unrecognizedCategory(let value): + return "unrecognized category '\(value)'" + case .unrecognizedNumericType(let value): + return "unrecognized numeric type '\(value)'" + case .invalidAge(let value): + return "invalid age format for '\(value)' - use '3.0' or 'V3_0' formats" + case .invalidNumericValue(let value): + return "invalid numeric value '\(value)'" + case .invalidCCC(let value): + return "invalid canonical combining class '\(value)'" // MARK: Semantic Errors diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 9d5ae4576..be28754b8 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -127,8 +127,8 @@ extension RegexValidator { _ prop: Unicode.BinaryProperty, at loc: SourceLocation ) throws { switch prop { - case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable, - .changesWhenCasefolded, .changesWhenCasemapped, + case .asciiHexDigit, .alphabetic, .bidiControl, .bidiMirrored, .cased, + .caseIgnorable, .changesWhenCasefolded, .changesWhenCasemapped, .changesWhenNFKCCasefolded, .changesWhenLowercased, .changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated, .defaultIgnorableCodePoint, .diacratic, .extender, @@ -150,7 +150,7 @@ extension RegexValidator { case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) - case .bidiControl, .compositionExclusion, .emojiComponent, + case .compositionExclusion, .emojiComponent, .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic, .otherDefaultIgnorableCodePoint, .otherGraphemeExtended, .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath, @@ -169,7 +169,7 @@ extension RegexValidator { case .binary(let b, _): try validateBinaryProperty(b, at: loc) case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, - .scriptExtension: + .scriptExtension, .age, .numericType, .numericValue, .mapping, .ccc: break case .pcreSpecial: throw error(.unsupported("PCRE property"), at: loc) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 90e573824..b6bbfd83e 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -145,10 +145,7 @@ extension String { } func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram.ConsumeFunction { - let consume = opts.semanticLevel == .graphemeCluster - ? consumeCharacterWithSingleScalar - : consumeScalar - + let consume = consumeFunction(for: opts) return consume(propertyScalarPredicate { // FIXME: name aliases not covered by $0.nameAlias are missed // e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases @@ -491,6 +488,30 @@ extension AST.Atom.CharacterProperty { case .named(let n): return consumeName(n, opts: opts) + case .age(let major, let minor): + return consume { + guard let age = $0.properties.age else { return false } + return age <= (major, minor) + } + + case .numericValue(let value): + return consume { $0.properties.numericValue == value } + + case .numericType(let type): + return consume { $0.properties.numericType == type } + + case .ccc(let ccc): + return consume { $0.properties.canonicalCombiningClass == ccc } + + case .mapping(.lowercase, let value): + return consume { $0.properties.lowercaseMapping == value } + + case .mapping(.uppercase, let value): + return consume { $0.properties.uppercaseMapping == value } + + case .mapping(.titlecase, let value): + return consume { $0.properties.titlecaseMapping == value } + case .posix(let p): return p.generateConsumer(opts) @@ -525,7 +546,7 @@ extension Unicode.BinaryProperty { case .alphabetic: return consume(propertyScalarPredicate(\.isAlphabetic)) case .bidiControl: - break + return consume(propertyScalarPredicate(\.isBidiControl)) case .bidiMirrored: return consume(propertyScalarPredicate(\.isBidiMirrored)) case .cased: diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 2d22ad252..d5325268e 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2447,6 +2447,20 @@ extension RegexTests { diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b")) diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b")) + diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) + diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) + diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category")) + diagnosticTest(#"\p{age=3}"#, .invalidAge("3")) + diagnosticTest(#"\p{age=V3}"#, .invalidAge("V3")) + diagnosticTest(#"\p{age=3.0.1}"#, .invalidAge("3.0.1")) + diagnosticTest(#"\p{nv=A}"#, .invalidNumericValue("A")) + diagnosticTest(#"\p{Numeric_Value=1.2.3.4}"#, .invalidNumericValue("1.2.3.4")) + diagnosticTest(#"\p{nt=Not_A_NumericType}"#, .unrecognizedNumericType("Not_A_NumericType")) + diagnosticTest(#"\p{Numeric_Type=Nuemric}"#, .unrecognizedNumericType("Nuemric")) + diagnosticTest(#"\p{Simple_Lowercase_Mapping}"#, .unknownProperty(key: nil, value: "Simple_Lowercase_Mapping")) + diagnosticTest(#"\p{Simple_Lowercase_Mapping=}"#, .emptyProperty) + diagnosticTest(#"\p{ccc=255}"#, .invalidCCC("255")) + diagnosticTest(#"\p{ccc=Nada}"#, .invalidCCC("Nada")) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index d13b47b8d..7306632da 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -78,6 +78,9 @@ extension UTS18Tests { func testHexNotation() { expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab") expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞") + expectFirstMatch("\n", regex(#"\u{0A}"#), "\n") + expectFirstMatch("\r", regex(#"\u{0D}"#), "\r") + expectFirstMatch("\r\n", regex(#"\u{0D}\u{0A}"#), "\r\n") } // 1.1.1 Hex Notation and Normalization @@ -148,12 +151,8 @@ extension UTS18Tests { } func testProperties_XFail() { - XCTExpectFailure("Need to support 'age' and 'block' properties") { - // XCTAssertFalse("z".contains(#/\p{age=3.1}/#)) - XCTFail(#"\(#/\p{age=3.1}/#)"#) - // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#)) - XCTFail(#"\(#/\p{Block=Greek}/#)"#) - } + // Certain properties are unsupported, see below. + XCTAssertThrowsError(try Regex(#"\p{Block=Greek}"#)) } // RL1.2a Compatibility Properties @@ -171,11 +170,16 @@ extension UTS18Tests { expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6]) expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11]) expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13]) - // TODO: blank - // TODO: cntrl expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11]) expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...]) expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11]) + + let blankAndControl = """ + \t\u{01}\u{19} + """ + // \t - tab is in both [:blank:] and [:cntrl:] + expectFirstMatch(blankAndControl, regex(#"[[:blank:]]+"#), blankAndControl[pos: ..<2]) + expectFirstMatch(blankAndControl, regex(#"[[:cntrl:]]+"#), blankAndControl[pos: 1...]) } //RL1.3 Subtraction and Intersection @@ -196,7 +200,7 @@ extension UTS18Tests { // Non-ASCII lowercase + non-lowercase ASCII expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3]) - XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) + XCTAssertTrue("123%&^ABCDéîøü".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) } func testSubtractionAndIntersectionPrecedence() { @@ -380,12 +384,15 @@ extension UTS18Tests { XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#))) XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar))) XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar))) + XCTAssertFalse("abcdef🇬🇭".contains(regex(#"abcdef.$"#).matchingSemantics(.unicodeScalar))) } func testCharacterClassesWithStrings() { let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) XCTAssertTrue("🧐".contains(regex)) XCTAssertTrue("🇧🇫".contains(regex)) + XCTAssertTrue("🧐".contains(regex.matchingSemantics(.unicodeScalar))) + XCTAssertTrue("🇧🇫".contains(regex.matchingSemantics(.unicodeScalar))) } // RL2.3 Default Word Boundaries @@ -468,7 +475,7 @@ extension UTS18Tests { // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) } - XCTExpectFailure("Other named char failures -- investigate") { + XCTExpectFailure("Other named char failures -- name aliases") { XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#))) XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#))) XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#))) @@ -486,7 +493,8 @@ extension UTS18Tests { // To meet this requirement, an implementation shall support wildcards in // Unicode property values. func testWildcardsInPropertyValues() { - XCTExpectFailure { XCTFail("Implement tests") } + // Unsupported + XCTAssertThrowsError(try Regex(#"\p{name=/a/"#)) } // RL2.7 Full Properties @@ -498,121 +506,462 @@ extension UTS18Tests { func testFullProperties() { // MARK: General // Name (Name_Alias) + XCTAssertTrue("a".contains(regex(#"\p{name=latin small letter a}"#))) + // Block + // Unsupported + // Age + XCTAssertTrue("a".contains(regex(#"\p{age=1.1}"#))) + XCTAssertTrue("a".contains(regex(#"\p{age=V1_1}"#))) + XCTAssertTrue("a".contains(regex(#"\p{age=14.0}"#))) + XCTAssertTrue("a".contains(regex(#"\p{age=V99_99}"#))) + + XCTAssertTrue("🥱".contains(regex(#"\p{age=12.0}"#))) + XCTAssertFalse("🥱".contains(regex(#"\p{age=11.0}"#))) + + XCTAssertTrue("⌁".contains(regex(#"\p{age=3.0}"#))) + XCTAssertFalse("⌁".contains(regex(#"\p{age=2.0}"#))) + XCTAssertTrue("⌁".contains(regex(#"[\p{age=3.0}--\p{age=2.0}]"#))) + // General_Category + XCTAssertTrue("a".contains(regex(#"\p{Ll}"#))) + XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#))) + XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#))) + XCTAssertFalse("A".contains(regex(#"\p{gc=Ll}"#))) + XCTAssertTrue("A".contains(regex(#"\p{gc=L}"#))) + + XCTAssertTrue("a".contains(regex(#"\p{Any}"#))) + XCTAssertTrue("a".contains(regex(#"\p{Assigned}"#))) + XCTAssertTrue("a".contains(regex(#"\p{ASCII}"#))) + // Script (Script_Extensions) + XCTAssertTrue("a".contains(regex(#"\p{script=latin}"#))) + XCTAssertTrue("강".contains(regex(#"\p{script=hangul}"#))) + // White_Space + XCTAssertTrue(" ".contains(regex(#"\p{whitespace}"#))) + XCTAssertTrue("\n".contains(regex(#"\p{White_Space}"#))) + XCTAssertFalse("a".contains(regex(#"\p{whitespace}"#))) + // Alphabetic + XCTAssertTrue("aéîøüƒ".contains(regex(#"^\p{Alphabetic}+$"#))) + // Hangul_Syllable_Type + // Unsupported + // Noncharacter_Code_Point + XCTAssertTrue("\u{10FFFF}".contains(regex(#"\p{Noncharacter_Code_Point}"#))) + // Default_Ignorable_Code_Point + XCTAssertTrue("\u{00AD}".contains(regex(#"\p{Default_Ignorable_Code_Point}"#))) + // Deprecated + XCTAssertTrue("ʼn".contains(regex(#"\p{Deprecated}"#))) // Logical_Order_Exception + XCTAssertTrue("ແ".contains(regex(#"\p{Logical_Order_Exception}"#))) // Variation_Selector + XCTAssertTrue("\u{FE07}".contains(regex(#"\p{Variation_Selector}"#))) // MARK: Numeric // Numeric_Value + XCTAssertTrue("3".contains(regex(#"\p{Numeric_Value=3}"#))) + XCTAssertFalse("4".contains(regex(#"\p{Numeric_Value=3}"#))) + XCTAssertTrue("④".contains(regex(#"\p{Numeric_Value=4}"#))) + XCTAssertTrue("⅕".contains(regex(#"\p{Numeric_Value=0.2}"#))) + // Numeric_Type + XCTAssertTrue("3".contains(regex(#"\p{Numeric_Type=Decimal}"#))) + XCTAssertFalse("4".contains(regex(#"\p{Numeric_Type=Digit}"#))) + // Hex_Digit + XCTAssertTrue("0123456789abcdef0123456789ABCDEF" + .contains(regex(#"^\p{Hex_Digit}+$"#))) + XCTAssertFalse("0123456789abcdefg".contains(regex(#"^\p{Hex_Digit}+$"#))) // ASCII_Hex_Digit + XCTAssertTrue("0123456789abcdef".contains(regex(#"^\p{ASCII_Hex_Digit}+$"#))) + XCTAssertFalse("0123456789abcdef0123456789ABCDEF" + .contains(regex(#"^\p{ASCII_Hex_Digit}+$"#))) // MARK: Identifiers - // ID_Continue // ID_Start - // XID_Continue + XCTAssertTrue("ABcd".contains(regex(#"^\p{ID_Start}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{ID_Start}"#))) + + // ID_Continue + XCTAssertTrue("ABcd_1234".contains(regex(#"^\p{ID_Continue}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{ID_Continue}"#))) + // XID_Start + XCTAssertTrue("ABcd".contains(regex(#"^\p{XID_Start}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{XID_Start}"#))) + + // XID_Continue + XCTAssertTrue("ABcd_1234".contains(regex(#"^\p{XID_Continue}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{XID_Continue}"#))) + // Pattern_Syntax + XCTAssertTrue(".+-:".contains(regex(#"^\p{Pattern_Syntax}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Pattern_Syntax}"#))) + // Pattern_White_Space + XCTAssertTrue(" \t\n".contains(regex(#"^\p{Pattern_White_Space}+$"#))) + XCTAssertFalse("abc123".contains(regex(#"\p{Pattern_White_Space}"#))) + // Identifier_Status + // Unsupported + // Identifier_Type + // Unsupported // MARK: CJK // Ideographic + XCTAssertTrue("微笑".contains(regex(#"^\p{IsIdeographic}+$"#))) + XCTAssertFalse("abc123".contains(regex(#"\p{Ideographic}"#))) + // Unified_Ideograph + XCTAssertTrue("微笑".contains(regex(#"^\p{Unified_Ideograph}+$"#))) + XCTAssertFalse("abc123".contains(regex(#"\p{Unified_Ideograph}"#))) + // Radical + XCTAssertTrue("⺁⺂⺆".contains(regex(#"^\p{Radical}+$"#))) + // IDS_Binary_Operator + XCTAssertTrue("⿰⿸⿻".contains(regex(#"^\p{IDS_Binary_Operator}+$"#))) + // IDS_Trinary_Operator + XCTAssertTrue("⿲⿳".contains(regex(#"^\p{IDS_Trinary_Operator}+$"#))) + // Equivalent_Unified_Ideograph - XCTExpectFailure { - XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) - // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) - } + // Unsupported // MARK: Case // Uppercase + XCTAssertTrue("AÉÎØÜ".contains(regex(#"^\p{isUppercase}+$"#))) + XCTAssertFalse("123abc".contains(regex(#"^\p{isUppercase}+$"#))) + // Lowercase + XCTAssertTrue("aéîøü".contains(regex(#"^\p{Lowercase}+$"#))) + XCTAssertFalse("123ABC".contains(regex(#"\p{Lowercase}+$"#))) + // Simple_Lowercase_Mapping + XCTAssertTrue("aAa".contains(regex(#"^\p{Simple_Lowercase_Mapping=a}+$"#))) + XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Lowercase_Mapping=a}"#))) + // Simple_Titlecase_Mapping + XCTAssertTrue("aAa".contains(regex(#"^\p{Simple_Titlecase_Mapping=A}+$"#))) + XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Titlecase_Mapping=A}"#))) + // Simple_Uppercase_Mapping + XCTAssertTrue("aAa".contains(regex(#"^\p{Simple_Uppercase_Mapping=A}+$"#))) + XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Uppercase_Mapping=A}"#))) + // Simple_Case_Folding + // Unsupported + // Soft_Dotted + XCTAssertTrue("ijɨʝⅈⅉ".contains(regex(#"^\p{Soft_Dotted}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Soft_Dotted}"#))) + // Cased + XCTAssertTrue("A".contains(regex(#"\p{Cased}"#))) + XCTAssertTrue("A".contains(regex(#"\p{Is_Cased}"#))) + XCTAssertFalse("0".contains(regex(#"\p{Cased}"#))) + // Case_Ignorable + XCTAssertTrue(":".contains(regex(#"\p{Case_Ignorable}"#))) + XCTAssertFalse("a".contains(regex(#"\p{Case_Ignorable}"#))) + // Changes_When_Lowercased + XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased}"#))) + XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased=true}"#))) + XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Lowercased}"#))) + // Changes_When_Uppercased XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#))) XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#))) XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#))) + // Changes_When_Titlecased + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Titlecased=true}"#))) + XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Titlecased}"#))) + // Changes_When_Casefolded - // Changes_When_Casemapped + XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Casefolded=true}"#))) + XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Casefolded}"#))) + XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casefolded}"#))) + // Changes_When_Casemapped + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Casemapped}"#))) + XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casemapped}"#))) + // MARK: Normalization // Canonical_Combining_Class + XCTAssertTrue("\u{0321}\u{0322}\u{1DD0}".contains(regex(#"^\p{Canonical_Combining_Class=202}+$"#))) + XCTAssertFalse("123".contains(regex(#"\p{Canonical_Combining_Class=202}"#))) + // Decomposition_Type + // Unsupported + // NFC_Quick_Check + // Unsupported + // NFKC_Quick_Check + // Unsupported + // NFD_Quick_Check + // Unsupported + // NFKD_Quick_Check + // Unsupported + // NFKC_Casefold + // Unsupported + // Changes_When_NFKC_Casefolded + XCTAssertTrue("ABCÊÖ".contains(regex(#"^\p{Changes_When_NFKC_Casefolded}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Changes_When_NFKC_Casefolded}"#))) // MARK: Emoji // Emoji + XCTAssertTrue("🥰🥳🤩".contains(regex(#"^\p{Emoji}+$"#))) + XCTAssertFalse("abc ◎✩℥".contains(regex(#"\p{Emoji}"#))) + // Emoji_Presentation + XCTAssertTrue("⌚☕☔".contains(regex(#"^\p{Emoji_Presentation}+$"#))) + XCTAssertFalse("abc ǽǮ".contains(regex(#"\p{Emoji_Presentation}"#))) + // Emoji_Modifier + XCTAssertTrue("\u{1F3FB}\u{1F3FC}\u{1F3FD}".contains(regex(#"^\p{Emoji_Modifier}+$"#))) + XCTAssertFalse("🧒".contains(regex(#"\p{Emoji_Modifier}"#))) + // Emoji_Modifier_Base + XCTAssertTrue("🧒".contains(regex(#"^\p{Emoji_Modifier_Base}+$"#))) + XCTAssertFalse("123 🧠".contains(regex(#"\p{Emoji_Modifier_Base}"#))) + // Emoji_Component + // Unsupported + // Extended_Pictographic + // Unsupported + // Basic_Emoji* + // Unsupported + // Emoji_Keycap_Sequence* + // Unsupported + // RGI_Emoji_Modifier_Sequence* + // Unsupported + // RGI_Emoji_Flag_Sequence* + // Unsupported + // RGI_Emoji_Tag_Sequence* + // Unsupported + // RGI_Emoji_ZWJ_Sequence* + // Unsupported + // RGI_Emoji* + // Unsupported // MARK: Shaping and Rendering // Join_Control + XCTAssertTrue("\u{200C}\u{200D}".contains(regex(#"^\p{Join_Control}+$"#))) + XCTAssertFalse("123".contains(regex(#"\p{Join_Control}"#))) + // Joining_Group + // Unsupported + // Joining_Type + // Unsupported + // Vertical_Orientation + // Unsupported + // Line_Break + // Unsupported + // Grapheme_Cluster_Break + // Unsupported + // Sentence_Break + // Unsupported + // Word_Break + // Unsupported + // East_Asian_Width + // Unsupported + // Prepended_Concatenation_Mark + // Unsupported // MARK: Bidirectional // Bidi_Class + // Unsupported + // Bidi_Control + XCTAssertTrue("\u{200E}\u{200F}\u{2069}".contains(regex(#"^\p{Bidi_Control}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Bidi_Control}"#))) + // Bidi_Mirrored + XCTAssertTrue("()<>{}❮❯«»".contains(regex(#"^\p{Bidi_Mirrored}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Bidi_Mirrored}"#))) + // Bidi_Mirroring_Glyph + // Unsupported + // Bidi_Paired_Bracket + // Unsupported + // Bidi_Paired_Bracket_Type + // Unsupported // MARK: Miscellaneous // Math + XCTAssertTrue("𝒶𝖇𝕔𝖽𝗲𝘧𝙜𝚑𝛊𝜅𝝀𝝡𝞰𝟙𝟐𝟯𝟺".contains(regex(#"^\p{Math}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Math}"#))) + // Quotation_Mark + XCTAssertTrue(#"“«‘"’»”"#.contains(regex(#"^\p{Quotation_Mark}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Quotation_Mark}"#))) + // Dash + XCTAssertTrue("—-–".contains(regex(#"^\p{Dash}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Dash}"#))) + // Sentence_Terminal + XCTAssertTrue(".!?".contains(regex(#"^\p{Sentence_Terminal}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Sentence_Terminal}"#))) + // Terminal_Punctuation + XCTAssertTrue(":?!.".contains(regex(#"^\p{Terminal_Punctuation}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Terminal_Punctuation}"#))) + // Diacritic + XCTAssertTrue("¨`^¯ʸ".contains(regex(#"^\p{Diacritic}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Diacritic}"#))) + // Extender + XCTAssertTrue("ᪧː々".contains(regex(#"^\p{Extender}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Extender}"#))) + // Grapheme_Base + XCTAssertTrue("abc".contains(regex(#"^\p{Grapheme_Base}+$"#))) + XCTAssertFalse("\u{301}\u{FE0F}".contains(regex(#"\p{Grapheme_Base}"#))) + // Grapheme_Extend + XCTAssertTrue("\u{301}\u{302}\u{303}".contains(regex(#"^\p{Grapheme_Extend}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Grapheme_Extend}"#))) + // Regional_Indicator + XCTAssertTrue("🇰🇷🇬🇭🇵🇪".contains(regex(#"^\p{Regional_Indicator}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Regional_Indicator}"#))) + } + + func testFullProperties_Unsupported() { + // Block + XCTAssertThrowsError(try Regex(#"\p{block=Block_Elements}"#)) + + // Hangul_Syllable_Type + XCTAssertThrowsError(try Regex(#"\p{Hangul_Syllable_Type=L}/"#)) + + // Identifier_Status + XCTAssertThrowsError(try Regex(#"\p{Identifier_Status=Allowed}"#)) + + // Identifier_Type + XCTAssertThrowsError(try Regex(#"\p{Identifier_Type=Inclusion}/"#)) + + // Equivalent_Unified_Ideograph + XCTAssertThrowsError(try Regex(#"\p{Equivalent_Unified_Ideograph=⼚}"#)) + + // Simple_Case_Folding + XCTAssertThrowsError(try Regex(#"\p{Simple_Case_Folding=a}/"#)) + + // Decomposition_Type + XCTAssertThrowsError(try Regex(#"\p{Decomposition_Type}"#)) + + // NFC_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFC_Quick_Check}"#)) + + // NFKC_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFKC_Quick_Check}"#)) + + // NFD_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFD_Quick_Check}"#)) + + // NFKD_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFKD_Quick_Check}"#)) + + // NFKC_Casefold + XCTAssertThrowsError(try Regex(#"\p{NFKC_Casefold}"#)) + + // Emoji_Component + XCTAssertThrowsError(try Regex(#"\p{Emoji_Component}"#)) + + // Extended_Pictographic + XCTAssertThrowsError(try Regex(#"\p{Extended_Pictographic}"#)) + + // Basic_Emoji* + XCTAssertThrowsError(try Regex(#"\p{Basic_Emoji*}"#)) + + // Emoji_Keycap_Sequence* + XCTAssertThrowsError(try Regex(#"\p{Emoji_Keycap_Sequence*}"#)) + + // RGI_Emoji_Modifier_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_Modifier_Sequence*}"#)) + + // RGI_Emoji_Flag_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_Flag_Sequence*}"#)) + + // RGI_Emoji_Tag_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_Tag_Sequence*}"#)) + + // RGI_Emoji_ZWJ_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_ZWJ_Sequence*}"#)) + + // RGI_Emoji* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji*}"#)) + + // Joining_Group + XCTAssertThrowsError(try Regex(#"\p{Joining_Group}"#)) + + // Joining_Type + XCTAssertThrowsError(try Regex(#"\p{Joining_Type}"#)) + + // Vertical_Orientation + XCTAssertThrowsError(try Regex(#"\p{Vertical_Orientation}"#)) + + // Line_Break + XCTAssertThrowsError(try Regex(#"\p{Line_Break}"#)) + + // Grapheme_Cluster_Break + XCTAssertThrowsError(try Regex(#"\p{Grapheme_Cluster_Break}"#)) + + // Sentence_Break + XCTAssertThrowsError(try Regex(#"\p{Sentence_Break}"#)) + + // Word_Break + XCTAssertThrowsError(try Regex(#"\p{Word_Break}"#)) + + // East_Asian_Width + XCTAssertThrowsError(try Regex(#"\p{East_Asian_Width}"#)) + + // Prepended_Concatenation_Mark + XCTAssertThrowsError(try Regex(#"\p{Prepended_Concatenation_Mark}"#)) + + // Bidi_Class + XCTAssertThrowsError(try Regex(#"\p{Bidi_Class}"#)) + + // Bidi_Mirroring_Glyph + XCTAssertThrowsError(try Regex(#"\p{Bidi_Mirroring_Glyph}"#)) + + // Bidi_Paired_Bracket + XCTAssertThrowsError(try Regex(#"\p{Bidi_Paired_Bracket}"#)) + + // Bidi_Paired_Bracket_Type + XCTAssertThrowsError(try Regex(#"\p{Bidi_Paired_Bracket_Type}"#)) } }