diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index d00862e9b..618ae2412 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -44,6 +44,9 @@ enum ParseError: Error, Hashable { case invalidEscape(Character) case confusableCharacter(Character) + case quoteMayNotSpanMultipleLines + case unsetExtendedSyntaxMayNotSpanMultipleLines + case cannotReferToWholePattern case quantifierRequiresOperand(String) @@ -79,6 +82,7 @@ enum ParseError: Error, Hashable { case cannotRemoveTextSegmentOptions case cannotRemoveSemanticsOptions case cannotRemoveExtendedSyntaxInMultilineMode + case cannotResetExtendedSyntaxInMultilineMode case expectedCalloutArgument @@ -139,6 +143,10 @@ extension ParseError: CustomStringConvertible { return "invalid escape sequence '\\\(c)'" case .confusableCharacter(let c): return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead" + case .quoteMayNotSpanMultipleLines: + return "quoted sequence may not span multiple lines in multi-line literal" + case .unsetExtendedSyntaxMayNotSpanMultipleLines: + return "group that unsets extended syntax may not span multiple lines in multi-line literal" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .quantifierRequiresOperand(let q): @@ -190,6 +198,8 @@ extension ParseError: CustomStringConvertible { return "semantic level cannot be unset, only changed" case .cannotRemoveExtendedSyntaxInMultilineMode: return "extended syntax may not be disabled in multi-line mode" + case .cannotResetExtendedSyntaxInMultilineMode: + return "extended syntax may not be disabled in multi-line mode; use '(?^x)' instead" case .expectedCalloutArgument: return "expected argument to callout" case .unrecognizedScript(let value): diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 
a6dfa0ce9..41b744234 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -342,8 +342,8 @@ extension Source { }.value } - /// Eat a scalar off the front, starting from after the - /// backslash and base character (e.g. `\u` or `\x`). + /// Try to eat a scalar off the front, starting from after the backslash and + /// base character (e.g. `\u` or `\x`). /// /// UniScalar -> 'u{' UniScalarSequence '}' /// | 'u' HexDigit{4} @@ -353,60 +353,60 @@ extension Source { /// | 'o{' OctalDigit{1...} '}' /// | '0' OctalDigit{0...3} /// - mutating func expectUnicodeScalar( - escapedCharacter base: Character - ) throws -> AST.Atom.Kind { + mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? { try recordLoc { src in + try src.tryEating { src in - func nullScalar() -> AST.Atom.Kind { - let pos = src.currentPosition - return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos))) - } - - // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. - switch base { - // Hex numbers. - case "u" where src.tryEat("{"): - return try src.expectUnicodeScalarSequence(eating: "}") - - case "x" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .hex)) - - case "x": - // \x expects *up to* 2 digits. - guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit) - else { - // In PCRE, \x without any valid hex digits is \u{0}. - // TODO: This doesn't appear to be followed by ICU or Oniguruma, so - // could be changed to throw an error if we had a parsing mode for - // them. 
- return nullScalar() + func nullScalar() -> AST.Atom.Kind { + let pos = src.currentPosition + return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos))) } - return .scalar(try Source.validateUnicodeScalar(digits, .hex)) - case "u": - return .scalar(try src.expectUnicodeScalar(numDigits: 4)) - case "U": - return .scalar(try src.expectUnicodeScalar(numDigits: 8)) + // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. + switch src.tryEat() { + // Hex numbers. + case "u" where src.tryEat("{"): + return try src.expectUnicodeScalarSequence(eating: "}") + + case "x" where src.tryEat("{"): + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .hex)) + + case "x": + // \x expects *up to* 2 digits. + guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit) + else { + // In PCRE, \x without any valid hex digits is \u{0}. + // TODO: This doesn't appear to be followed by ICU or Oniguruma, so + // could be changed to throw an error if we had a parsing mode for + // them. + return nullScalar() + } + return .scalar(try Source.validateUnicodeScalar(digits, .hex)) + + case "u": + return .scalar(try src.expectUnicodeScalar(numDigits: 4)) + case "U": + return .scalar(try src.expectUnicodeScalar(numDigits: 8)) + + // Octal numbers. + case "o" where src.tryEat("{"): + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .octal)) + + case "0": + // We can read *up to* 3 more octal digits. + // FIXME: PCRE can only read up to 2 octal digits, if we get a strict + // PCRE mode, we should limit it here. + guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit) + else { + return nullScalar() + } + return .scalar(try Source.validateUnicodeScalar(digits, .octal)) - // Octal numbers. 
- case "o" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .octal)) - - case "0": - // We can read *up to* 3 more octal digits. - // FIXME: PCRE can only read up to 2 octal digits, if we get a strict - // PCRE mode, we should limit it here. - guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit) - else { - return nullScalar() + default: + return nil } - return .scalar(try Source.validateUnicodeScalar(digits, .octal)) - - default: - fatalError("Unexpected scalar start") } }.value } @@ -579,7 +579,7 @@ extension Source { /// Try to consume quoted content /// - /// Quote -> '\Q' (!'\E' .)* '\E' + /// Quote -> '\Q' (!'\E' .)* '\E'? /// /// With `SyntaxOptions.experimentalQuotes`, also accepts /// @@ -592,9 +592,24 @@ extension Source { mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? { let str = try recordLoc { src -> String? in if src.tryEat(sequence: #"\Q"#) { - return try src.expectQuoted(endingWith: #"\E"#).value + let contents = src.lexUntil { src in + src.isEmpty || src.tryEat(sequence: #"\E"#) + }.value + + // In multi-line literals, the quote may not span multiple lines. + if context.syntax.contains(.multilineCompilerLiteral), + contents.spansMultipleLinesInRegexLiteral { + throw ParseError.quoteMayNotSpanMultipleLines + } + + // The sequence must not be empty in a custom character class. + if context.isInCustomCharacterClass && contents.isEmpty { + throw ParseError.expectedNonEmptyContents + } + return contents } if context.experimentalQuotes, src.tryEat("\"") { + // TODO: Can experimental quotes be empty? return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value } return nil @@ -787,6 +802,11 @@ extension Source { mutating func lexMatchingOptionSequence( context: ParsingContext ) throws -> AST.MatchingOptionSequence? { + // PCRE accepts '(?)' + // TODO: This is a no-op, should we warn? 
+ if peek() == ")" { + return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: []) + } let ateCaret = recordLoc { $0.tryEat("^") } // TODO: Warn on duplicate options, and options appearing in both adding @@ -820,11 +840,6 @@ extension Source { if opt.isSemanticMatchingLevel { throw ParseError.cannotRemoveSemanticsOptions } - // Extended syntax may not be removed if in multi-line mode. - if context.syntax.contains(.multilineExtendedSyntax) && - opt.isAnyExtended { - throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode - } removing.append(opt) } return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, @@ -1692,6 +1707,11 @@ extension Source { return ref } + // Hexadecimal and octal unicode scalars. + if let scalar = try src.lexUnicodeScalar() { + return scalar + } + guard let char = src.tryEat() else { throw ParseError.expectedEscape } @@ -1703,14 +1723,6 @@ extension Source { return .escaped(builtin) } - switch char { - // Hexadecimal and octal unicode scalars. - case "u", "x", "U", "o", "0": - return try src.expectUnicodeScalar(escapedCharacter: char) - default: - break - } - // We only allow unknown escape sequences for non-letter non-number ASCII, // and non-ASCII whitespace. // TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`. diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 84957220c..3e20ae8c0 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -289,8 +289,8 @@ extension Parser { /// Apply the syntax options of a given matching option sequence to the /// current set of options. 
private mutating func applySyntaxOptions( - of opts: AST.MatchingOptionSequence - ) { + of opts: AST.MatchingOptionSequence, isScoped: Bool + ) throws { func mapOption(_ option: SyntaxOptions, _ pred: (AST.MatchingOption) -> Bool) { if opts.resetsCurrentOptions { @@ -311,22 +311,41 @@ extension Parser { mapOption(.namedCapturesOnly, .namedCapturesOnly) // (?x), (?xx) - // We skip this for multi-line, as extended syntax is always enabled there. + // This cannot be unset in a multi-line literal, unless in a scoped group + // e.g. (?-x:...). We later enforce that such a group does not span multiple + // lines. // TODO: PCRE differentiates between (?x) and (?xx) where only the latter // handles non-semantic whitespace in a custom character class. Other // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, // treat (?x) and (?xx) as the same option here. If we ever get a strict // PCRE mode, we will need to change this to handle that. - if !context.syntax.contains(.multilineExtendedSyntax) { + if !isScoped && context.syntax.contains(.multilineCompilerLiteral) { + // An unscoped removal of extended syntax is not allowed in a multi-line + // literal. + if let opt = opts.removing.first(where: \.isAnyExtended) { + throw Source.LocatedError( + ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location) + } + if opts.resetsCurrentOptions { + throw Source.LocatedError( + ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!) + } + // The only remaining case is an unscoped addition of extended syntax, + // which is a no-op. + } else { + // We either have a scoped change of extended syntax, or this is a + // single-line literal. mapOption(.extendedSyntax, \.isAnyExtended) } } /// Apply the syntax options of a matching option changing group to the /// current set of options. 
- private mutating func applySyntaxOptions(of group: AST.Group.Kind) { + private mutating func applySyntaxOptions( + of group: AST.Group.Kind, isScoped: Bool + ) throws { if case .changeMatchingOptions(let seq) = group { - applySyntaxOptions(of: seq) + try applySyntaxOptions(of: seq, isScoped: isScoped) } } @@ -337,14 +356,25 @@ extension Parser { context.recordGroup(kind.value) let currentSyntax = context.syntax - applySyntaxOptions(of: kind.value) + try applySyntaxOptions(of: kind.value, isScoped: true) defer { context.syntax = currentSyntax } - + let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) && + !context.syntax.contains(.extendedSyntax) let child = try parseNode() try source.expect(")") - return .init(kind, child, loc(start)) + let groupLoc = loc(start) + + // In multi-line literals, the body of a group that unsets extended syntax + // may not span multiple lines. + if unsetsExtendedSyntax && + context.syntax.contains(.multilineCompilerLiteral) && + source[child.location.range].spansMultipleLinesInRegexLiteral { + throw Source.LocatedError( + ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc) + } + return .init(kind, child, groupLoc) } /// Consume the body of an absent function. @@ -438,7 +468,7 @@ extension Parser { // If we have a change matching options atom, apply the syntax options. We // already take care of scoping syntax options within a group. if case .changeMatchingOptions(let opts) = atom.kind { - applySyntaxOptions(of: opts) + try applySyntaxOptions(of: opts, isScoped: false) } // TODO: track source locations return .atom(atom) @@ -592,6 +622,13 @@ public func parse( return ast } +extension StringProtocol { + /// Whether the given string is considered multi-line for a regex literal. + var spansMultipleLinesInRegexLiteral: Bool { + unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) + } +} + /// Retrieve the default set of syntax options that a delimiter and literal /// contents indicates. 
fileprivate func defaultSyntaxOptions( @@ -601,9 +638,8 @@ fileprivate func defaultSyntaxOptions( case .forwardSlash: // For an extended syntax forward slash e.g #/.../#, extended syntax is // permitted if it spans multiple lines. - if delim.poundCount > 0 && - contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) { - return .multilineExtendedSyntax + if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral { + return [.multilineCompilerLiteral, .extendedSyntax] } return .traditional case .reSingleQuote: diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift index dbfe5f2d6..302032fd3 100644 --- a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift +++ b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift @@ -58,10 +58,10 @@ public struct SyntaxOptions: OptionSet { /// `(_: .*)` == `(?:.*)` public static var experimentalCaptures: Self { Self(1 << 5) } - /// The default syntax for a multi-line regex literal. - public static var multilineExtendedSyntax: Self { - return [Self(1 << 6), .extendedSyntax] - } + /// The syntax kind of a multi-line literal. This will always be set when + /// parsing a multi-line `#/.../#` literal. Note this does not imply extended + /// syntax, as that may be temporarily disabled while parsing. + public static var multilineCompilerLiteral: Self { Self(1 << 6) } /// `(?n)` public static var namedCapturesOnly: Self { Self(1 << 7) } @@ -76,8 +76,8 @@ public struct SyntaxOptions: OptionSet { public static var traditional: Self { Self(0) } public static var experimental: Self { - // Experimental syntax enables everything except end-of-line comments. - Self(~0).subtracting(.endOfLineComments) + [.nonSemanticWhitespace, .experimentalQuotes, .experimentalComments, + .experimentalRanges, .experimentalCaptures] } // TODO: Probably want to model strict-PCRE etc. options too. 
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index 958c53c26..e28c72514 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -61,41 +61,6 @@ extension RegexTests { _ = try src.lexNumber() } - func diagnoseUniScalarOverflow(_ input: String, base: Character) { - let scalars = input.first == "{" - ? String(input.dropFirst().dropLast()) - : input - diagnose( - input, - expecting: .numberOverflow(scalars) - ) { src in - _ = try src.expectUnicodeScalar(escapedCharacter: base) - } - } - func diagnoseUniScalar( - _ input: String, - base: Character, - expectedDigits numDigits: Int - ) { - let scalars = input.first == "{" - ? String(input.dropFirst().dropLast()) - : input - diagnose( - input, - expecting: .expectedNumDigits(scalars, numDigits) - ) { src in - _ = try src.expectUnicodeScalar(escapedCharacter: base) - } - _ = scalars - } - - diagnoseUniScalar( - "12", base: "u", expectedDigits: 4) - diagnoseUniScalar( - "12", base: "U", expectedDigits: 8) - diagnoseUniScalarOverflow("{123456789}", base: "u") - diagnoseUniScalarOverflow("{123456789}", base: "x") - // TODO: want to dummy print out source ranges, etc, test that. } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index a0dfd2447..5beb67448 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -754,6 +754,14 @@ extension RegexTests { // This follows the PCRE behavior. parseTest(#"\Q\\E"#, quote("\\")) + // ICU allows quotes to be empty outside of custom character classes. + parseTest(#"\Q\E"#, quote("")) + + // Quotes may be unterminated. 
+ parseTest(#"\Qab"#, quote("ab")) + parseTest(#"\Q"#, quote("")) + parseTest("\\Qab\\", quote("ab\\")) + parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")), @@ -762,6 +770,9 @@ extension RegexTests { syntax: .experimental) parseTest(#""\"""#, quote("\""), syntax: .experimental) + parseTest(#"(abc)"#, capture(concat("a", "b", "c")), + syntax: .experimental, captures: [.cap]) + // Quotes in character classes. parseTest(#"[\Q-\E]"#, charClass(quote_m("-"))) parseTest(#"[\Qa-b[[*+\\E]"#, charClass(quote_m("a-b[[*+\\"))) @@ -992,6 +1003,9 @@ extension RegexTests { concat("a", atomicScriptRun("b"), "c"), throwsError: .unsupported) // Matching option changing groups. + parseTest("(?)", changeMatchingOptions( + matchingOptions() + )) parseTest("(?-)", changeMatchingOptions( matchingOptions() )) @@ -1762,6 +1776,13 @@ extension RegexTests { " ", "b" ) ) + parseTest( + "(?x) a (?^: b)", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", + changeMatchingOptions(unsetMatchingOptions(), concat(" ", "b")) + ) + ) parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c")) parseTest("[#]", charClass("#")) @@ -2084,6 +2105,17 @@ extension RegexTests { throwsError: .unsupported, syntax: .extendedSyntax ) + parseWithDelimitersTest( + #""" + #/ + a\ + b\ + c + /# + """#, + concat("a", "\n", "b", "\n", "c") + ) + // MARK: Parse with delimiters parseWithDelimitersTest("/a b/", concat("a", " ", "b")) @@ -2159,22 +2191,40 @@ extension RegexTests { /# """, concat("a", "b")) - // Make sure (?^) is ignored. + // (?x) has no effect. parseWithDelimitersTest(""" #/ - (?^) + (?x) # comment /# - """, changeMatchingOptions(unsetMatchingOptions()) + """, changeMatchingOptions(matchingOptions(adding: .extended)) ) - // (?x) has no effect. + // Scoped removal of extended syntax is allowed as long as it does not span + // multiple lines. 
parseWithDelimitersTest(""" #/ - (?x) - # comment + (?-x:a b) /# - """, changeMatchingOptions(matchingOptions(adding: .extended)) + """, changeMatchingOptions( + matchingOptions(removing: .extended), + concat("a", " ", "b") + ) + ) + parseWithDelimitersTest(""" + #/ + (?-xx:a b) + /# + """, changeMatchingOptions( + matchingOptions(removing: .extraExtended), + concat("a", " ", "b") + ) + ) + parseWithDelimitersTest(""" + #/ + (?^: a b ) # comment + /# + """, changeMatchingOptions(unsetMatchingOptions(), concat(" ", "a", " ", "b", " ")) ) parseWithDelimitersTest(#""" @@ -2592,8 +2642,6 @@ extension RegexTests { diagnosticTest(#"(?P"#, .expected(")")) diagnosticTest(#"(?R"#, .expected(")")) - diagnosticTest(#"\Qab"#, .expected("\\E")) - diagnosticTest("\\Qab\\", .expected("\\E")) diagnosticTest(#""ab"#, .expected("\""), syntax: .experimental) diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental) diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental) @@ -2656,6 +2704,8 @@ extension RegexTests { diagnosticTest("\\", .expectedEscape) + diagnosticTest(#"\o"#, .invalidEscape("o")) + // TODO: Custom diagnostic for control sequence diagnosticTest(#"\c"#, .unexpectedEndOfInput) @@ -2672,6 +2722,9 @@ extension RegexTests { // TODO: Custom diagnostic for missing '\Q' diagnosticTest(#"\E"#, .invalidEscape("E")) + diagnosticTest(#"[\Q\E]"#, .expectedNonEmptyContents) + diagnosticTest(#"[\Q]"#, .expected("]")) + // PCRE treats these as octal, but we require a `0` prefix. 
diagnosticTest(#"[\1]"#, .invalidEscape("1")) diagnosticTest(#"[\123]"#, .invalidEscape("1")) @@ -2754,18 +2807,71 @@ extension RegexTests { /# """, .cannotRemoveExtendedSyntaxInMultilineMode ) + + // Scoped removal of extended syntax may not span multiple lines diagnosticWithDelimitersTest(""" #/ - (?-x:a b) + (?-x:a b + ) /# - """, .cannotRemoveExtendedSyntaxInMultilineMode + """, .unsetExtendedSyntaxMayNotSpanMultipleLines ) diagnosticWithDelimitersTest(""" #/ - (?-xx:a b) + (?-x:a + b) /# - """, .cannotRemoveExtendedSyntaxInMultilineMode + """, .unsetExtendedSyntaxMayNotSpanMultipleLines + ) + diagnosticWithDelimitersTest(""" + #/ + (?-xx: + a b) + /# + """, .unsetExtendedSyntaxMayNotSpanMultipleLines + ) + diagnosticWithDelimitersTest(""" + #/ + (?x-x: + a b) + /# + """, .unsetExtendedSyntaxMayNotSpanMultipleLines + ) + diagnosticWithDelimitersTest(""" + #/ + (?^) + # comment + /# + """, .cannotResetExtendedSyntaxInMultilineMode ) + diagnosticWithDelimitersTest(""" + #/ + (?^: + # comment + ) + /# + """, .unsetExtendedSyntaxMayNotSpanMultipleLines + ) + + diagnosticWithDelimitersTest(#""" + #/ + \Q + \E + /# + """#, .quoteMayNotSpanMultipleLines) + + diagnosticWithDelimitersTest(#""" + #/ + \Qabc + \E + /# + """#, .quoteMayNotSpanMultipleLines) + + diagnosticWithDelimitersTest(#""" + #/ + \Q + /# + """#, .quoteMayNotSpanMultipleLines) // MARK: Group specifiers @@ -2843,9 +2949,10 @@ extension RegexTests { diagnosticTest(#"[\d--\u{a b}]"#, .unsupported("scalar sequence in custom character class")) diagnosticTest(#"[\d--[\u{a b}]]"#, .unsupported("scalar sequence in custom character class")) - // MARK: Unicode scalars - - diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u12"#, .expectedNumDigits("12", 4)) + diagnosticTest(#"\U12"#, .expectedNumDigits("12", 8)) + diagnosticTest(#"\u{123456789}"#, .numberOverflow("123456789")) + diagnosticTest(#"\x{123456789}"#, .numberOverflow("123456789")) // MARK: Matching options