Skip to content

Commit d74c7c1

Browse files
committed
Add conversion from regex syntax to BNF syntax
1 parent 3d2bdaa commit d74c7c1

File tree

5 files changed

+826
-0
lines changed

5 files changed

+826
-0
lines changed

Package.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,13 @@ let package = Package(
135135
"_RegexParser",
136136
"_StringProcessing"
137137
]),
138+
.executableTarget(
139+
name: "Regex2BNF",
140+
dependencies: [
141+
.product(name: "ArgumentParser", package: "swift-argument-parser"),
142+
"_RegexParser"
143+
],
144+
swiftSettings: [availabilityDefinition]),
138145
.executableTarget(
139146
name: "RegexTester",
140147
dependencies: [

Sources/Regex2BNF/Regex2BNF.swift

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2025 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
import ArgumentParser
13+
import _RegexParser
14+
15+
@main
16+
@available(SwiftStdlib 5.8, *)
17+
struct Regex2BNF: ParsableCommand {
18+
@Argument(help: "The regex pattern to convert to BNF.")
19+
var pattern: String
20+
21+
@Flag(
22+
name: [.customShort("e"), .customLong("examples")],
23+
help: "Run several examples")
24+
var runExamples = false
25+
26+
func convert(_ pattern: String) throws {
27+
print("/\(pattern)/\n")
28+
let bnf = try convertRegexToBNF(
29+
regex: pattern, namespace: "RE2BNF", version: 0)
30+
print(bnf)
31+
}
32+
33+
mutating func run() throws {
34+
if runExamples {
35+
// TODO: Turn into test cases
36+
// print("[Examples")
37+
38+
// print("Single-scalar character literals:")
39+
try convert("a")
40+
try convert("Z")
41+
try convert("")
42+
try convert("")
43+
try convert("\u{301}")
44+
45+
46+
// print("Multi-scalar character literals")
47+
try convert("🧟‍♀️")
48+
try convert("e\u{301}")
49+
50+
// print("Simple alternations")
51+
try convert("a|b")
52+
try convert("a|b|c|d")
53+
try convert("a|🧟‍♀️\u{301}日|z")
54+
55+
// print("Simple quantifications")
56+
try convert("a*")
57+
try convert("a+")
58+
try convert("a?")
59+
try convert("a{2,10}")
60+
try convert("a{,10}")
61+
try convert("a{2,}")
62+
63+
// print("Grouping")
64+
try convert("a(b|c)d")
65+
try convert("a(?:b|c)d")
66+
try convert("a(bcd|def(g|h)+)z")
67+
68+
// print("Dot")
69+
try convert(".*")
70+
try convert("(a|b)*.{3}(a|b)")
71+
72+
// print("Bultin character classes")
73+
try convert(#"\(\d{3}\)\d{3}-\d{4}"#)
74+
try convert(#"\s+keyword\s+"#)
75+
76+
77+
// print("[Done]")
78+
79+
// Look at optimizer output, the quant child is very long
80+
try convert("a(123456789)+b")
81+
82+
try convert("Hi the time right now is (AM|PM)")
83+
84+
try convert("a(b|c)*d{2,4}e?")
85+
}
86+
try convert(pattern)
87+
88+
89+
90+
}
91+
}
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2025 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
/// A piece of a BNF grammar that can produce its own textual form.
protocol BNFNode: CustomStringConvertible {
  /// Returns the textual form of this node.
  func render() -> String
}

extension BNFNode {
  /// `CustomStringConvertible` requirement; forwards to `render()`.
  var description: String {
    return render()
  }
}
18+
19+
/// A grammar: a designated root rule plus the list of rules to print.
struct BNF: BNFNode {
  /// The root rule. It is not rendered on its own by `render()`; only the
  /// entries of `rules` are printed.
  var root: Rule

  /// The rules printed by `render()`, in order.
  var rules: [Rule]

  /// Renders every rule in `rules` on its own line, with a trailing newline.
  ///
  /// - Returns: The empty string when `rules` is empty; otherwise the rendered
  ///   rules joined by `"\n"` and followed by a final `"\n"`.
  func render() -> String {
    guard !rules.isEmpty else {
      return ""
    }
    return rules.map { $0.render() }.joined(separator: "\n") + "\n"
  }
}
35+
36+
/// A single production: `<symbol> ::= expression`.
struct Rule: BNFNode {
  /// The left-hand side of the production.
  var symbol: NonTerminalSymbol

  /// The right-hand side of the production.
  var expression: Expression

  /// Predicates attached to this rule; they do not take part in `render()`.
  var predicates: [CharacterPredicate] = []

  /// Renders the rule as the symbol, the literal ` ::= `, then the expression.
  func render() -> String {
    let lhs = symbol.render()
    let rhs = expression.render()
    return "\(lhs) ::= \(rhs)"
  }
}
48+
49+
/// Wraps a closure that answers yes/no for a single Unicode scalar.
struct CharacterPredicate {
  // TODO: convention c or trivial?
  /// The predicate itself, invoked with the scalar under test.
  let impl: (Unicode.Scalar) -> Bool
}
53+
54+
/// A named non-terminal, printed in angle brackets.
struct NonTerminalSymbol: Hashable, BNFNode {
  /// The symbol's name, without the surrounding angle brackets.
  var name: String

  /// Renders the name wrapped as `<name>`.
  func render() -> String {
    return "<" + name + ">"
  }
}
61+
62+
/// The right-hand side of a rule: a list of alternative choices.
struct Expression: BNFNode {
  /// The alternatives, in the order they are printed.
  var choices: [Choice]

  /// Renders each choice and joins them with ` | `.
  func render() -> String {
    let alternatives = choices.map { $0.render() }
    return alternatives.joined(separator: " | ")
  }
}
69+
70+
/// One alternative of an expression: symbols that appear one after another.
struct Choice: BNFNode {
  /// The symbols of this alternative, in order.
  var sequence: [Symbol]

  /// Creates a choice from an array of symbols.
  init(_ symbols: [Symbol]) {
    sequence = symbols
  }

  /// Creates a choice from a variadic list of symbols.
  init(_ symbols: Symbol...) {
    self.init(symbols)
  }

  /// Renders each symbol and joins them with a single space.
  func render() -> String {
    let parts = sequence.map { $0.render() }
    return parts.joined(separator: " ")
  }
}
84+
85+
/// A single element of a choice.
enum Symbol: BNFNode {
  /// One terminal.
  case terminal(TerminalSymbol)
  /// A run of terminals.
  case terminalSequence([TerminalSymbol])
  /// A reference to another rule.
  case nonTerminal(NonTerminalSymbol)
  /// One of the predefined classes in `Builtin`.
  case builtin(Builtin)

  /// Renders the payload of the active case.
  ///
  /// An empty terminal sequence renders as a pair of double quotes (`""`);
  /// a non-empty one renders its terminals separated by single spaces.
  func render() -> String {
    switch self {
    case let .terminal(terminal):
      return terminal.render()

    case let .terminalSequence(terminals):
      if terminals.isEmpty {
        return "\"\""
      }
      let rendered = terminals.map { $0.render() }
      return rendered.joined(separator: " ")

    case let .nonTerminal(nonTerminal):
      return nonTerminal.render()

    case let .builtin(builtin):
      return builtin.render()
    }
  }
}
110+
111+
/// Predefined character classes that render as fixed non-terminal names.
enum Builtin: BNFNode {
  case any // NOTE: we map dot to this, not sure if we want non-newline dots
  case whitespace
  case notWhitespace
  case decimalDigit
  case notDecimalDigit
  case wordCharacter
  case notWordCharacter

  /// Returns the fixed non-terminal name for this class.
  ///
  /// The negated classes (`notWhitespace`, `notDecimalDigit`,
  /// `notWordCharacter`) have no rendering and trap with a descriptive
  /// message.
  func render() -> String {
    // The trap messages spell the case names out as literals rather than
    // interpolating `self`: `description` forwards to `render()`, so
    // interpolating `self` here would re-enter this function.
    switch self {
    case .any:
      return "<ALL_CHARACTERS_EXCEPT_QUOTE_AND_BACKSLASH>"
    case .whitespace:
      return "<WHITESPACES_AND_NEWLINES>"
    case .notWhitespace:
      fatalError("Builtin.notWhitespace has no BNF rendering")
    case .decimalDigit:
      return "<DECIMAL_DIGITS>"
    case .notDecimalDigit:
      fatalError("Builtin.notDecimalDigit has no BNF rendering")
    case .wordCharacter:
      return "<ALPHANUMERICS>"
    case .notWordCharacter:
      fatalError("Builtin.notWordCharacter has no BNF rendering")
    }
  }
}
139+
140+
// Placeholder type with no cases, so no value of it can be constructed.
// NOTE(review): shares its name with Foundation's `CharacterSet`; presumably
// this would need qualification in a file that imports Foundation — confirm.
enum CharacterSet {}
141+
142+
/// A terminal of the grammar.
enum TerminalSymbol: BNFNode {
  /// A single Unicode scalar.
  case character(Unicode.Scalar)
  /// A set of characters; has no rendering.
  case characterSet(CharacterSet)
  /// A single UTF-8 code unit.
  case utf8CodeUnit(UInt8)

  /// A scalar predicate; has no rendering.
  case characterPredicate(CharacterPredicate)

  /// Returns the textual form of this terminal.
  ///
  /// - `character` renders as the scalar wrapped in double quotes.
  /// - `utf8CodeUnit` renders as the code unit's decimal value wrapped in
  ///   double quotes.
  /// - `characterSet` and `characterPredicate` trap with a descriptive
  ///   message.
  func render() -> String {
    switch self {
    case .character(let scalar):
      // NOTE(review): the scalar is emitted verbatim, so `"` and `\` are not
      // escaped — confirm whether the consumer of this grammar needs that.
      return "\"\(scalar)\""
    case .characterSet:
      fatalError("TerminalSymbol.characterSet has no BNF rendering")
    case .utf8CodeUnit(let codeUnit):
      return "\"\(codeUnit)\""
    case .characterPredicate:
      fatalError("TerminalSymbol.characterPredicate has no BNF rendering")
    }
  }
}
162+
163+
// Unlabeled convenience initializers for `Expression`.
extension Expression {
  /// Creates an expression from an array of choices.
  init(_ choices: [Choice]) {
    self.init(choices: choices)
  }

  /// Creates an expression from a variadic list of choices.
  init(_ choices: Choice...) {
    self.init(choices: choices)
  }
}
171+
172+
// Convenience initializers that build a `Choice` purely from non-terminals.
extension Choice {
  /// Creates a choice whose symbols are `elements`, each wrapped as
  /// `.nonTerminal`.
  init(_ elements: [NonTerminalSymbol]) {
    let symbols: [Symbol] = elements.map { Symbol.nonTerminal($0) }
    self.init(symbols)
  }

  /// Variadic form of the array-based initializer above.
  init(_ elements: NonTerminalSymbol...) {
    self.init(elements)
  }
}
180+
181+
/*
182+
183+
184+
node -> choice
185+
186+
*/

0 commit comments

Comments
 (0)