diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index ae66310af..b905ba0e0 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -102,8 +102,9 @@ extension Anchor { /// An anchor that matches at the start of a line, including the start of /// the input string. /// - /// This anchor is equivalent to `^` in regex syntax when the `m` option - /// has been enabled or `anchorsMatchLineEndings(true)` has been called. + /// This anchor is unaffected by the `anchorsMatchLineEndings(_:)` method. It + /// is equivalent to `^` in regex syntax with the `m` option enabled, or + /// `(?m:^)`. public static var startOfLine: Anchor { Anchor(kind: .startOfLine) } @@ -111,8 +112,9 @@ extension Anchor { /// An anchor that matches at the end of a line, including at the end of /// the input string. /// - /// This anchor is equivalent to `$` in regex syntax when the `m` option - /// has been enabled or `anchorsMatchLineEndings(true)` has been called. + /// This anchor is unaffected by the `anchorsMatchLineEndings(_:)` method. It + /// is equivalent to `$` in regex syntax with the `m` option enabled, or + /// `(?m:$)`. public static var endOfLine: Anchor { Anchor(kind: .endOfLine) } @@ -147,6 +149,9 @@ extension Anchor { /// a particular position. Lookaheads do not advance the overall matching /// position in the input string — once a lookahead succeeds, matching continues /// in the regex from the same position. +/// +/// Using `Lookahead` in regex builder syntax is equivalent to using the regex +/// syntax `/(?=...)/`. @available(SwiftStdlib 5.7, *) public struct Lookahead: _BuiltinRegexComponent { public var regex: Regex @@ -177,6 +182,9 @@ public struct Lookahead: _BuiltinRegexComponent { /// does not match at a particular position. Lookaheads do not advance the /// overall matching position in the input string — once a lookahead succeeds, /// matching continues in the regex from the same position. 
+/// +/// Using `NegativeLookahead` in regex builder syntax is equivalent to using +/// the regex syntax `/(?!...)/`. @available(SwiftStdlib 5.7, *) public struct NegativeLookahead: _BuiltinRegexComponent { public var regex: Regex diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 3a96ba363..43c9626d3 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -12,6 +12,11 @@ @_implementationOnly import _RegexParser @_spi(RegexBuilder) import _StringProcessing +/// A class of characters that match in a regex. +/// +/// A character class can represent individual characters, a group of +/// characters, the set of characters that match some set of criteria, or +/// a set-algebraic combination of all of the above. @available(SwiftStdlib 5.7, *) public struct CharacterClass { internal var ccc: DSLTree.CustomCharacterClass @@ -37,6 +42,8 @@ extension CharacterClass: RegexComponent { @available(SwiftStdlib 5.7, *) extension CharacterClass { + /// A character class that matches any character that does not match this + /// character class. public var inverted: CharacterClass { CharacterClass(ccc.inverted) } @@ -44,26 +51,45 @@ extension CharacterClass { @available(SwiftStdlib 5.7, *) extension RegexComponent where Self == CharacterClass { + /// A character class that matches any element. + /// + /// This character class is unaffected by the `dotMatchesNewlines()` method. + /// To match any character that isn't a newline, see + /// ``CharacterClass.anyNonNewline``. + /// + /// This character class is equivalent to the regex syntax "dot" + /// metacharacter in single-line mode: `(?s:.)`. public static var any: CharacterClass { .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) } + /// A character class that matches any element that isn't a newline. + /// + /// This character class is unaffected by the `dotMatchesNewlines()` method. 
+ /// To match any character, including newlines, see ``CharacterClass.any``. + /// + /// This character class is equivalent to the regex syntax "dot" + /// metacharacter with single-line mode disabled: `(?-s:.)`. + public static var anyNonNewline: CharacterClass { + .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) + } + + /// A character class that matches any single `Character`, or extended + /// grapheme cluster, regardless of the current semantic level. + /// + /// This character class is equivalent to `\X` in regex syntax. public static var anyGrapheme: CharacterClass { .init(unconverted: .anyGrapheme) } - - public static var anyUnicodeScalar: CharacterClass { - .init(unconverted: .anyUnicodeScalar) - } - public static var whitespace: CharacterClass { - .init(unconverted: .whitespace) - } - + /// A character class that matches any digit. + /// + /// This character class is equivalent to `\d` in regex syntax. public static var digit: CharacterClass { .init(unconverted: .digit) } + /// A character class that matches any hexadecimal digit. public static var hexDigit: CharacterClass { .init(DSLTree.CustomCharacterClass(members: [ .range(.char("A"), .char("F")), @@ -72,20 +98,42 @@ extension RegexComponent where Self == CharacterClass { ])) } - public static var horizontalWhitespace: CharacterClass { - .init(unconverted: .horizontalWhitespace) + /// A character class that matches any element that is a "word character". + /// + /// This character class is equivalent to `\w` in regex syntax. + public static var word: CharacterClass { + .init(unconverted: .word) } - public static var newlineSequence: CharacterClass { - .init(unconverted: .newlineSequence) + /// A character class that matches any element that is classified as + /// whitespace. + /// + /// This character class is equivalent to `\s` in regex syntax. 
+ public static var whitespace: CharacterClass { + .init(unconverted: .whitespace) + } + + /// A character class that matches any element that is classified as + /// horizontal whitespace. + /// + /// This character class is equivalent to `\h` in regex syntax. + public static var horizontalWhitespace: CharacterClass { + .init(unconverted: .horizontalWhitespace) } + /// A character class that matches any element that is classified as + /// vertical whitespace. + /// + /// This character class is equivalent to `\v` in regex syntax. public static var verticalWhitespace: CharacterClass { .init(unconverted: .verticalWhitespace) } - - public static var word: CharacterClass { - .init(unconverted: .word) + + /// A character class that matches any newline sequence. + /// + /// This character class is equivalent to `\R` or `\n` in regex syntax. + public static var newlineSequence: CharacterClass { + .init(unconverted: .newlineSequence) } } @@ -93,6 +141,15 @@ extension RegexComponent where Self == CharacterClass { extension RegexComponent where Self == CharacterClass { /// Returns a character class that matches any character in the given string /// or sequence. + /// + /// Calling this method with a group of characters is equivalent to listing + /// those characters in a custom character class in regex syntax. For example, + /// the two regexes in this example are equivalent: + /// + /// let regex1 = /[abcd]+/ + /// let regex2 = OneOrMore(.anyOf("abcd")) + /// + /// - Parameter s: A sequence of characters to match against. public static func anyOf(_ s: S) -> CharacterClass where S.Element == Character { @@ -102,20 +159,136 @@ extension RegexComponent where Self == CharacterClass { /// Returns a character class that matches any Unicode scalar in the given /// sequence. + /// + /// Calling this method with a group of Unicode scalars is equivalent to + /// listing them in a custom character class in regex syntax. 
+ /// + /// - Parameter s: A sequence of Unicode scalar values to match against. public static func anyOf(_ s: S) -> CharacterClass where S.Element == UnicodeScalar { CharacterClass(DSLTree.CustomCharacterClass( members: s.map { .atom(.scalar($0)) })) } + + /// Returns a character class that matches none of the characters in the given + /// string or sequence. + /// + /// Calling this method with a group of characters is equivalent to listing + /// those characters in a negated custom character class in regex syntax. For + /// example, the two regexes in this example are equivalent: + /// + /// let regex1 = /[^abcd]+/ + /// let regex2 = OneOrMore(.noneOf("abcd")) + /// + /// - Parameter s: A sequence of characters to match against. + public static func noneOf(_ s: S) -> CharacterClass + where S.Element == Character + { + CharacterClass(DSLTree.CustomCharacterClass( + members: s.map { .atom(.char($0)) })).inverted + } + + /// Returns a character class that matches none of the Unicode scalars in the + /// given sequence. + /// + /// Calling this method with a group of Unicode scalars is equivalent to + /// listing them in a negated custom character class in regex syntax. + /// + /// - Parameter s: A sequence of Unicode scalar values to match against. + public static func noneOf(_ s: S) -> CharacterClass + where S.Element == UnicodeScalar + { + CharacterClass(DSLTree.CustomCharacterClass( + members: s.map { .atom(.scalar($0)) })).inverted + } } // Unicode properties @available(SwiftStdlib 5.7, *) -extension CharacterClass { +extension RegexComponent where Self == CharacterClass { + /// Returns a character class that matches any element with the given Unicode + /// general category. + /// + /// For example, when passed `.uppercaseLetter`, this method is equivalent to + /// `/\p{Uppercase_Letter}/` or `/\p{Lu}/`. 
public static func generalCategory(_ category: Unicode.GeneralCategory) -> CharacterClass { return CharacterClass(.generalCategory(category)) } + + /// Returns a character class that matches any element with the given Unicode + /// binary property. + /// + /// For example, when passed `\.isAlphabetic`, this method is equivalent to + /// `/\p{Alphabetic}/` or `/\p{Is_Alphabetic=true}/`. + public static func binaryProperty(_ property: KeyPath, value: Bool = true) -> CharacterClass { + return CharacterClass(.binaryProperty(property, value: value)) + } + + /// Returns a character class that matches any element with the given Unicode + /// name. + /// + /// This method is equivalent to `/\p{Name=name}/`. + public static func name(_ name: String) -> CharacterClass { + return CharacterClass(.named(name)) + } + + /// Returns a character class that matches any element that was included in + /// the specified Unicode version. + /// + /// This method is equivalent to `/\p{Age=version}/`. + public static func age(_ version: Unicode.Version) -> CharacterClass { + return CharacterClass(.age(version)) + } + + /// Returns a character class that matches any element with the given Unicode + /// numeric type. + /// + /// This method is equivalent to `/\p{Numeric_Type=type}/`. + public static func numericType(_ type: Unicode.NumericType) -> CharacterClass { + return CharacterClass(.numericType(type)) + } + + /// Returns a character class that matches any element with the given numeric + /// value. + /// + /// This method is equivalent to `/\p{Numeric_Value=value}/`. + public static func numericValue(_ value: Double) -> CharacterClass { + return CharacterClass(.numericValue(value)) + } + + /// Returns a character class that matches any element with the given Unicode + /// canonical combining class. + /// + /// This method is equivalent to + /// `/\p{Canonical_Combining_Class=combiningClass}/`. 
+ public static func canonicalCombiningClass(_ combiningClass: Unicode.CanonicalCombiningClass) -> CharacterClass { + return CharacterClass(.ccc(combiningClass)) + } + + /// Returns a character class that matches any element with the given + /// lowercase mapping. + /// + /// This method is equivalent to `/\p{Lowercase_Mapping=value}/`. + public static func lowercaseMapping(_ value: String) -> CharacterClass { + return CharacterClass(.lowercaseMapping(value)) + } + + /// Returns a character class that matches any element with the given + /// uppercase mapping. + /// + /// This method is equivalent to `/\p{Uppercase_Mapping=value}/`. + public static func uppercaseMapping(_ value: String) -> CharacterClass { + return CharacterClass(.uppercaseMapping(value)) + } + + /// Returns a character class that matches any element with the given + /// titlecase mapping. + /// + /// This method is equivalent to `/\p{Titlecase_Mapping=value}/`. + public static func titlecaseMapping(_ value: String) -> CharacterClass { + return CharacterClass(.titlecaseMapping(value)) + } } /// Returns a character class that includes the characters in the given range. @@ -139,37 +312,56 @@ public func ...(lhs: UnicodeScalar, rhs: UnicodeScalar) -> CharacterClass { @available(SwiftStdlib 5.7, *) extension RegexComponent where Self == CharacterClass { + /// Creates a character class that combines all the given character classes + /// via union. public init(_ first: CharacterClass, _ rest: CharacterClass...) { if rest.isEmpty { self.init(first.ccc) } else { - let members: [DSLTree.CustomCharacterClass.Member] = - (CollectionOfOne(first) + rest).map { .custom($0.ccc) } - self.init(.init(members: members)) + self.init([[first], rest].joined()) } } + + /// Creates a character class that combines the character classes in the + /// given sequence or collection via union. + /// + /// - Parameter characterClasses: A sequence or collection of character class + /// instances. 
+ public init(_ characterClasses: S) where S.Element == CharacterClass { + let members: [DSLTree.CustomCharacterClass.Member] = + characterClasses.map { .custom($0.ccc) } + self.init(.init(members: members)) + } } @available(SwiftStdlib 5.7, *) extension CharacterClass { + /// Returns a character class that matches the union of this class and the + /// given class. public func union(_ other: CharacterClass) -> CharacterClass { CharacterClass(.init(members: [ .custom(self.ccc), .custom(other.ccc)])) } + /// Returns a character class that matches the intersection of this class + /// and the given class. public func intersection(_ other: CharacterClass) -> CharacterClass { CharacterClass(.init(members: [ .intersection(self.ccc, other.ccc) ])) } - + + /// Returns a character class that matches the difference of this class + /// and the given class. public func subtracting(_ other: CharacterClass) -> CharacterClass { CharacterClass(.init(members: [ .subtraction(self.ccc, other.ccc) ])) } - + + /// Returns a character class that matches the symmetric difference of + /// this class and the given class. 
public func symmetricDifference(_ other: CharacterClass) -> CharacterClass { CharacterClass(.init(members: [ .symmetricDifference(self.ccc, other.ccc) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 72c5f1526..4e13ff989 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -138,12 +138,122 @@ extension DSLTree { self.isInverted = isInverted } - public static func generalCategory(_ category: Unicode.GeneralCategory) -> Self { - let property = AST.Atom.CharacterProperty(.generalCategory(category.extendedGeneralCategory!), isInverted: false, isPOSIX: false) + private static func astCharacterProperty( + _ property: AST.Atom.CharacterProperty.Kind + ) -> Self { + let property = AST.Atom.CharacterProperty(property, isInverted: false, isPOSIX: false) let astAtom = AST.Atom(.property(property), .fake) return .init(members: [.atom(.unconverted(.init(ast: astAtom)))]) } + public static func generalCategory(_ category: Unicode.GeneralCategory) -> Self { + astCharacterProperty(.generalCategory(category.extendedGeneralCategory!)) + } + + public static func binaryProperty( + _ property: KeyPath, + value: Bool + ) -> Self { + var binaryProperty: Unicode.BinaryProperty? 
= nil + switch property { + case \.isAlphabetic: binaryProperty = .alphabetic + case \.isASCIIHexDigit: binaryProperty = .asciiHexDigit + case \.isBidiControl: binaryProperty = .bidiControl + case \.isBidiMirrored: binaryProperty = .bidiMirrored + case \.isCased: binaryProperty = .cased + case \.isCaseIgnorable: binaryProperty = .caseIgnorable + case \.changesWhenCaseFolded: binaryProperty = .changesWhenCasefolded + case \.changesWhenCaseMapped: binaryProperty = .changesWhenCasemapped + case \.changesWhenNFKCCaseFolded: binaryProperty = .changesWhenNFKCCasefolded + case \.changesWhenLowercased: binaryProperty = .changesWhenLowercased + case \.changesWhenTitlecased: binaryProperty = .changesWhenTitlecased + case \.changesWhenUppercased: binaryProperty = .changesWhenUppercased + case \.isDash: binaryProperty = .dash + case \.isDefaultIgnorableCodePoint: binaryProperty = .defaultIgnorableCodePoint + case \.isDeprecated: binaryProperty = .deprecated + case \.isDiacritic: binaryProperty = .diacratic + case \.isExtender: binaryProperty = .extender + case \.isFullCompositionExclusion: binaryProperty = .fullCompositionExclusion + case \.isGraphemeBase: binaryProperty = .graphemeBase + case \.isGraphemeExtend: binaryProperty = .graphemeExtended + case \.isHexDigit: binaryProperty = .hexDigit + case \.isIDContinue: binaryProperty = .idContinue + case \.isIdeographic: binaryProperty = .ideographic + case \.isIDStart: binaryProperty = .idStart + case \.isIDSBinaryOperator: binaryProperty = .idsBinaryOperator + case \.isIDSTrinaryOperator: binaryProperty = .idsTrinaryOperator + case \.isJoinControl: binaryProperty = .joinControl + case \.isLogicalOrderException: binaryProperty = .logicalOrderException + case \.isLowercase: binaryProperty = .lowercase + case \.isMath: binaryProperty = .math + case \.isNoncharacterCodePoint: binaryProperty = .noncharacterCodePoint + case \.isPatternSyntax: binaryProperty = .patternSyntax + case \.isPatternWhitespace: binaryProperty = 
.patternWhitespace + case \.isQuotationMark: binaryProperty = .quotationMark + case \.isRadical: binaryProperty = .radical + case \.isSoftDotted: binaryProperty = .softDotted + case \.isSentenceTerminal: binaryProperty = .sentenceTerminal + case \.isTerminalPunctuation: binaryProperty = .terminalPunctuation + case \.isUnifiedIdeograph: binaryProperty = .unifiedIdiograph + case \.isUppercase: binaryProperty = .uppercase + case \.isVariationSelector: binaryProperty = .variationSelector + case \.isWhitespace: binaryProperty = .whitespace + case \.isXIDContinue: binaryProperty = .xidContinue + case \.isXIDStart: binaryProperty = .xidStart + default: + if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { + // FIXME: Other platforms + switch property { + case \.isEmojiModifierBase: binaryProperty = .emojiModifierBase + case \.isEmojiModifier: binaryProperty = .emojiModifier + case \.isEmoji: binaryProperty = .emoji + case \.isEmojiPresentation: binaryProperty = .emojiPresentation + default: + break + } + } + } + + if let binaryProperty = binaryProperty { + return astCharacterProperty(.binary(binaryProperty, value: value)) + } else { + // FIXME: Support via a _UnicodeScalarPredicate interface? 
+ fatalError("Unsupported Unicode binary property") + } + } + + public static func age(_ version: Unicode.Version) -> Self { + astCharacterProperty(.age(major: version.major, minor: version.minor)) + } + + public static func named(_ name: String) -> Self { + astCharacterProperty(.named(name)) + } + + public static func numericType(_ type: Unicode.NumericType) -> Self { + astCharacterProperty(.numericType(type)) + } + + public static func numericValue(_ value: Double) -> Self { + astCharacterProperty(.numericValue(value)) + } + + public static func ccc(_ combiningClass: Unicode.CanonicalCombiningClass) -> Self { + astCharacterProperty(.ccc(combiningClass)) + } + + public static func lowercaseMapping(_ value: String) -> Self { + astCharacterProperty(.mapping(.lowercase, value)) + } + + public static func uppercaseMapping(_ value: String) -> Self { + astCharacterProperty(.mapping(.uppercase, value)) + } + + public static func titlecaseMapping(_ value: String) -> Self { + astCharacterProperty(.mapping(.titlecase, value)) + } + public var inverted: CustomCharacterClass { var result = self result.isInverted.toggle() diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 24d5c422e..6eb98ff0c 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -27,45 +27,29 @@ extension RegexComponent { /// - Parameter useASCII: A Boolean value indicating whether to match only /// ASCII characters as word characters. /// - Returns: The modified regular expression. - public func asciiOnlyWordCharacters(_ useASCII: Bool = true) -> Regex { - wrapInOption(.asciiOnlyWord, addingIf: useASCII) - } - - /// Returns a regular expression that matches only ASCII characters as digits. - /// - /// - Parameter useasciiOnlyDigits: A Boolean value indicating whether to - /// match only ASCII characters as digits. - /// - Returns: The modified regular expression. 
- public func asciiOnlyDigits(_ useASCII: Bool = true) -> Regex { - wrapInOption(.asciiOnlyDigit, addingIf: useASCII) - } - - /// Returns a regular expression that matches only ASCII characters as space - /// characters. - /// - /// - Parameter asciiOnlyWhitespace: A Boolean value indicating whether to - /// match only ASCII characters as space characters. - /// - Returns: The modified regular expression. - public func asciiOnlyWhitespace(_ useASCII: Bool = true) -> Regex { - wrapInOption(.asciiOnlySpace, addingIf: useASCII) + public func asciiOnlyClasses(_ kinds: RegexCharacterClassKind = .all) -> Regex { + if kinds == [] { + return Regex(node: .nonCapturingGroup( + .init(ast: .changeMatchingOptions(AST.MatchingOptionSequence(removing: [ + .init(.asciiOnlyDigit, location: .fake), + .init(.asciiOnlySpace, location: .fake), + .init(.asciiOnlyWord, location: .fake), + .init(.asciiOnlyPOSIXProps, location: .fake), + ]))), regex.root)) + } + return self + .wrapInOption(.asciiOnlyDigit, addingIf: kinds.contains(.digit)) + .wrapInOption(.asciiOnlySpace, addingIf: kinds.contains(.whitespace)) + .wrapInOption(.asciiOnlyWord, addingIf: kinds.contains(.wordCharacter)) + .wrapInOption(.asciiOnlyPOSIXProps, addingIf: kinds.contains(.all)) } - /// Returns a regular expression that matches only ASCII characters when - /// matching character classes. - /// - /// - Parameter useASCII: A Boolean value indicating whether to match only - /// ASCII characters when matching character classes. - /// - Returns: The modified regular expression. - public func asciiOnlyCharacterClasses(_ useASCII: Bool = true) -> Regex { - wrapInOption(.asciiOnlyPOSIXProps, addingIf: useASCII) - } - /// Returns a regular expression that uses the specified word boundary algorithm. /// /// - Parameter wordBoundaryKind: The algorithm to use for determining word boundaries. /// - Returns: The modified regular expression. 
public func wordBoundaryKind(_ wordBoundaryKind: RegexWordBoundaryKind) -> Regex { - wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .unicodeLevel2) + wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .defaultBoundaries) } /// Returns a regular expression where the start and end of input @@ -96,14 +80,28 @@ extension RegexComponent { /// Returns a regular expression where quantifiers use the specified behavior /// by default. /// - /// This setting does not affect calls to quantifier methods, such as - /// `OneOrMore`, that include an explicit `behavior` parameter. + /// You can call this method to change the default repetition behavior for + /// quantifier operators in regex syntax and `RegexBuilder` quantifier + /// methods. In the following example, both regexes use + /// possessive quantification when matching a quotation surrounded by `"` + /// quote marks: + /// + /// let regex1 = /"[^"]*"/.defaultRepetitionBehavior(.possessive) + /// + /// let quoteMark = "\"" + /// let regex2 = Regex { + /// quoteMark + /// ZeroOrMore(.noneOf(quoteMark)) + /// quoteMark + /// }.defaultRepetitionBehavior(.possessive) /// - /// Passing `.eager` or `.reluctant` to this method corresponds to applying - /// the `(?-U)` or `(?U)` option in regex syntax, respectively. + /// This setting only changes the default behavior of quantifiers, and does + /// not affect regex syntax operators with an explicit behavior indicator, + /// such as `*?` or `++`. Likewise, calls to quantifier methods such as + /// `OneOrMore` always use the explicit `behavior`, when given. /// /// - Parameter behavior: The default behavior to use for quantifiers. 
- public func repetitionBehavior(_ behavior: RegexRepetitionBehavior) -> Regex { + public func defaultRepetitionBehavior(_ behavior: RegexRepetitionBehavior) -> Regex { if behavior == .possessive { return wrapInOption(.possessiveByDefault, addingIf: true) } else { @@ -161,8 +159,47 @@ extension RegexComponent { } } +/// A built-in regex character class kind. +/// +/// Pass one or more `RegexCharacterClassKind` classes to `asciiOnlyClasses(_:)` +/// to control whether character classes match any character or only members +/// of the ASCII character set. @available(SwiftStdlib 5.7, *) +public struct RegexCharacterClassKind: OptionSet, Hashable { + public var rawValue: Int + + public init(rawValue: Int) { + self.rawValue = rawValue + } + + /// Regex digit-matching character classes, like `\d`, `[:digit:]`, and + /// `\p{HexDigit}`. + public static var digit: RegexCharacterClassKind { + .init(rawValue: 1) + } + + /// Regex whitespace-matching character classes, like `\s`, `[:space:]`, + /// and `\p{Whitespace}`. + public static var whitespace: RegexCharacterClassKind { + .init(rawValue: 1 << 1) + } + + /// Regex word character-matching character classes, like `\w`. + public static var wordCharacter: RegexCharacterClassKind { + .init(rawValue: 1 << 2) + } + + /// All built-in regex character classes. + public static var all: RegexCharacterClassKind { + .init(rawValue: 1 << 3) + } + + /// No built-in regex character classes. + public static var none: RegexCharacterClassKind { [] } +} + /// A semantic level to use during regex matching. +@available(SwiftStdlib 5.7, *) public struct RegexSemanticLevel: Hashable { internal enum Representation { case graphemeCluster @@ -188,8 +225,8 @@ public struct RegexSemanticLevel: Hashable { } } -@available(SwiftStdlib 5.7, *) /// A word boundary algorithm to use during regex matching. 
+@available(SwiftStdlib 5.7, *) public struct RegexWordBoundaryKind: Hashable { internal enum Representation { case unicodeLevel1 @@ -205,7 +242,7 @@ public struct RegexWordBoundaryKind: Hashable { /// that match `/\w\W/` or `/\W\w/`, or between the start or end of the input /// and a `\w` character. Word boundaries therefore depend on the option- /// defined behavior of `\w`. - public static var unicodeLevel1: Self { + public static var simpleBoundaries: Self { .init(base: .unicodeLevel1) } @@ -215,7 +252,7 @@ public struct RegexWordBoundaryKind: Hashable { /// Default word boundaries use a Unicode algorithm that handles some cases /// better than simple word boundaries, such as words with internal /// punctuation, changes in script, and Emoji. - public static var unicodeLevel2: Self { + public static var defaultBoundaries: Self { .init(base: .unicodeLevel2) } } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index fc31e575f..8abfe5773 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -102,6 +102,33 @@ class RegexDSLTests: XCTestCase { CharacterClass.whitespace.inverted } } + + try _testDSLCaptures( + ("abc1def2", ("abc1def2", "abc1")), + matchType: (Substring, Substring).self, ==) + { + Capture { + OneOrMore(CharacterClass.noneOf("def")) + } + + OneOrMore { + CharacterClass.noneOf("abc") + } + } + + try _testDSLCaptures( + ("a-5-É-ü", "a-5-É-ü"), + matchType: Substring.self, ==) + { + CharacterClass.binaryProperty(\.isLowercase) + "-" + Lookahead(CharacterClass.numericType(.decimal)) + CharacterClass.numericValue(5) + "-" + CharacterClass.lowercaseMapping("é") + "-" + CharacterClass.uppercaseMapping("Ü") + } } func testCharacterClassOperations() throws { @@ -280,7 +307,7 @@ class RegexDSLTests: XCTestCase { OneOrMore(.word) Anchor.wordBoundary } - .wordBoundaryKind(.unicodeLevel1) + .wordBoundaryKind(.simpleBoundaries) OneOrMore(.any, .reluctant) 
"stop" } @@ -293,14 +320,14 @@ class RegexDSLTests: XCTestCase { Capture { // Reluctant behavior due to option OneOrMore(.anyOf("abcd")) - .repetitionBehavior(.reluctant) + .defaultRepetitionBehavior(.reluctant) } ZeroOrMore("a"..."z") Capture { // Eager behavior due to explicit parameter, despite option OneOrMore(.digit, .eager) - .repetitionBehavior(.reluctant) + .defaultRepetitionBehavior(.reluctant) } ZeroOrMore(.digit) } @@ -312,7 +339,7 @@ class RegexDSLTests: XCTestCase { Capture { OneOrMore(.word) } - .asciiOnlyWordCharacters() + .asciiOnlyClasses(.wordCharacter) ZeroOrMore(.any) } @@ -345,7 +372,7 @@ class RegexDSLTests: XCTestCase { { OneOrMore(.reluctant) { One(.word) - }.repetitionBehavior(.possessive) + }.defaultRepetitionBehavior(.possessive) Capture(.digit) ZeroOrMore(.any) } @@ -358,7 +385,7 @@ class RegexDSLTests: XCTestCase { OneOrMore(.word) Capture(.digit) ZeroOrMore(.any) - }.repetitionBehavior(.reluctant) + }.defaultRepetitionBehavior(.reluctant) } // Default set to reluctant applies to regex syntax try _testDSLCaptures( @@ -366,7 +393,7 @@ class RegexDSLTests: XCTestCase { matchType: (Substring, Substring).self, ==) { try! 
Regex(#"\w+(\d).*"#, as: (Substring, Substring).self) - .repetitionBehavior(.reluctant) + .defaultRepetitionBehavior(.reluctant) } // Explicitly possessive @@ -387,7 +414,7 @@ class RegexDSLTests: XCTestCase { Regex { OneOrMore("a") "a" - }.repetitionBehavior(.possessive) + }.defaultRepetitionBehavior(.possessive) } // More specific default set to eager try _testDSLCaptures( @@ -397,10 +424,10 @@ class RegexDSLTests: XCTestCase { Regex { Capture { OneOrMore("a") - .repetitionBehavior(.eager) + .defaultRepetitionBehavior(.eager) } OneOrMore("a") - }.repetitionBehavior(.possessive) + }.defaultRepetitionBehavior(.possessive) } // More specific default set to reluctant try _testDSLCaptures( @@ -410,10 +437,10 @@ class RegexDSLTests: XCTestCase { Regex { Capture { OneOrMore("a") - .repetitionBehavior(.reluctant) + .defaultRepetitionBehavior(.reluctant) } OneOrMore("a") - }.repetitionBehavior(.possessive) + }.defaultRepetitionBehavior(.possessive) } try _testDSLCaptures( diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index be01fecb3..e54247f4a 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -222,7 +222,7 @@ extension UTS18Tests { // - Nonspacing marks are never divided from their base characters, and // otherwise ignored in locating boundaries. func testSimpleWordBoundaries() { - let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.unicodeLevel1) + let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.simpleBoundaries) expectFirstMatch(input, simpleWordRegex, input[pos: ..<11]) expectFirstMatch("don't", simpleWordRegex, "don") expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café")