Skip to content

Adhere to the JavaScript spec #22

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@ On success, `compile` and `compile_safe` return a RegExp object. On failure, `co

Each RegExp object `re` has the following fields
```lua
re.last_index -- the position at wchich the next match will be searched in re:exec or re:test (see notes below)
re.source -- the regexp string
re.last_index -- the position at which the next match will be searched in re:exec or re:test (see notes below)
re.source -- the pattern string
re.flags -- a string representing the active flags
re.dot_all -- is the dod_all flag set?
re.dot_all -- is the dot_all flag set?
re.global -- is the global flag set?
re.has_indices -- is the indices flag set?
re.ignore_case -- is the ignore_case flag set?
Expand All @@ -63,8 +63,8 @@ The RegExp object `re` has the following methods corresponding to JavaScript reg
```lua
re:exec(str) -- returns the next match of re in str (see notes below)
re:test(str) -- returns true if the regex matches str (see notes below)
re:match(str) -- returns a list of all matches or nil if no match
re:match_all(str) -- returns a closure that repeatedly calls re:exec, to be used in for-loops
re:match(str) -- returns, for a global regexp, a list of all match strings or nil if no match, calls re:exec(str) otherwise
re:match_all(str) -- returns a closure that repeatedly calls re:exec on a global regexp, to be used in for-loops
re:match_all_list(str) -- returns a list of all matches
re:search(str) -- returns the 1-based index of the first match of re in str, or -1 if no match
re:split(str, limit?) -- splits str at re, at most limit times
Expand Down
22 changes: 14 additions & 8 deletions jsregexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ static int jsstring_new(lua_State *lstate) {
&indices, &rev_indices);

if (!input_utf16) {
luaL_error(lstate, "malformed unicode");
return luaL_error(lstate, "malformed unicode");
}

ud = lua_newuserdata(lstate, sizeof(*ud));
Expand Down Expand Up @@ -221,7 +221,7 @@ static int regexp_tostring(lua_State *lstate) {
static int match_tostring(lua_State *lstate) {
// luaL_getmetatable(lstate, JSREGEXP_MATCH);
// if (!lua_getmetatable(lstate, 1) || !lua_equal(lstate, -1, -2)) {
// luaL_argerror(lstate, 1, "match object expected");
// return luaL_argerror(lstate, 1, "match object expected");
// }
lua_rawgeti(lstate, 1, 0);
return 1;
Expand All @@ -241,8 +241,14 @@ static int regexp_exec(lua_State *lstate) {
uint32_t rlast_index = r->last_index;
// translate wide char to correct index
if (input->is_wide_char) {
// only translate if possible
if (rlast_index <= input->bstr_len) {
// only translate indices if possible
if (rlast_index > 0 && rlast_index <= input->bstr_len) {
// move to the next valid index, rlast_index might be somewhere within a
// multibyte character
while (rlast_index < input->bstr_len &&
!input->rev_indices[rlast_index]) {
rlast_index++;
}
rlast_index = input->rev_indices[rlast_index];
}
}
Expand All @@ -264,7 +270,7 @@ static int regexp_exec(lua_State *lstate) {
input->len, input->is_wide_char ? 1 : 0, NULL);

if (ret < 0) {
luaL_error(lstate, "out of memory in regexp execution");
return luaL_error(lstate, "out of memory in regexp execution");
}

if (ret == 0) {
Expand Down Expand Up @@ -454,7 +460,7 @@ static int regexp_newindex(lua_State *lstate) {
luaL_argcheck(lstate, ind >= 1, 3, "last_index must be positive");
r->last_index = ind - 1;
} else {
luaL_argerror(lstate, 2, "unrecognized key");
return luaL_argerror(lstate, 2, "unrecognized key");
}

return 0;
Expand All @@ -477,7 +483,7 @@ static int jsregexp_compile(lua_State *lstate) {
// lre_compile can segfault if the input contains 0xfd, which used to
// indicate the beginning of a six-byte sequence but is now illegal.
if (strchr(regexp, 0xfd)) {
luaL_argerror(lstate, 1, "malformed unicode");
return luaL_argerror(lstate, 1, "malformed unicode");
}

if (utf8_contains_non_bmp(regexp)) {
Expand Down Expand Up @@ -522,7 +528,7 @@ static int jsregexp_compile(lua_State *lstate) {
strlen(regexp), re_flags, NULL);

if (!bc) {
luaL_argerror(lstate, 1, error_msg);
return luaL_argerror(lstate, 1, error_msg);
}

struct regexp *ud = lua_newuserdata(lstate, sizeof *ud);
Expand Down
30 changes: 23 additions & 7 deletions jsregexp.lua
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,16 @@ function jsregexp.mt.match(re, str)
return re:exec(jstr)
end
local matches = {}
local val

re.last_index = 1

while true do
val = re:exec(jstr)
if val == nil then
local match = re:exec(jstr)
if match == nil then
break
end
table.insert(matches, val)
if #val[0] == 0 then
table.insert(matches, match[0])
if #match[0] == 0 then
re.last_index = re.last_index + 1
end
end
Expand All @@ -33,11 +32,24 @@ function jsregexp.mt.match(re, str)
end

function jsregexp.mt.match_all(re, str)
if not re.global then
error("match_all must be called with a global RegExp")
end
-- must duplicate (according to the String.prototype.matchAll spec)
-- TODO: since nobody can "subclass" this, we can probably just
-- restore last_index, as it is the only way the regexp object is mutated
local re2 = jsregexp.compile(re.source, re.flags)
local jstr = jsregexp.to_jsstring(str)
re2.last_index = re.last_index
return function()
return re2:exec(jstr)
local match = re2:exec(jstr)
if not match then
return nil
end
if #match[0] == 0 then
re2.last_index = re2.last_index + 1
end
return match
end
end

Expand Down Expand Up @@ -183,6 +195,10 @@ local function get_substitution(match, str, replacement)
end

function jsregexp.mt.replace_all(re, str, replacement)
if not re.global then
error("replace_all must be called with a global RegExp")
end

local jstr = jsregexp.to_jsstring(str)

re.last_index = 1
Expand Down Expand Up @@ -229,7 +245,7 @@ function jsregexp.mt.replace(re, str, replacement)
else
table.insert(output, get_substitution(match, str, replacement))
end
table.insert(output, string.sub(str, re.last_index + #match[0] + 1))
table.insert(output, string.sub(str, match.index + #match[0]))
else
table.insert(output, str)
end
Expand Down
55 changes: 48 additions & 7 deletions test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ local function test_exec(str, regex, flags, want)
if match and not match_wanted then
return fail(string.format("no match expected, got %s", match))
end
if not match and match_wanted then
if not match then
return fail(string.format("match expected, wanted %s", match_wanted))
end
if #match_wanted ~= #match then
Expand Down Expand Up @@ -167,18 +167,21 @@ local function test_match(str, regex, flags, want)
if #want ~= #matches then
return fail("number of matches mismatch, wanted %d, got %d", #want, #matches)
end
for i, match_want in ipairs(want) do
local match = matches[i][0]
if match ~= match_want then
return fail("match mismatch, wanted %s, got %s", match_want, match)
if r.global then
for i, match_want in ipairs(want) do
local match = matches[i]
if match ~= match_want then
return fail("match mismatch, wanted %s, got %s", match_want, match)
end
end
else
-- TODO: compare match object
end
end
successes = successes + 1
end

local function test_match_all_list(str, regex, flags, want)
print(str, "~", regex, "flags")
local function fail(fmt, ...)
print(str, regex, flags, want)
print(string.format(fmt, ...))
Expand Down Expand Up @@ -271,13 +274,32 @@ local function test_replace(str, regex, flags, replacement, want)
successes = successes + 1
end

--- Run a single `replace_all` test case and record the outcome.
-- Compiles `regex` with `flags`, calls `re:replace_all(str, replacement)` and
-- compares the result against `want`. Updates the `tests`, `successes` and
-- `fails` counters used by the other test helpers in this file.
-- @param str the subject string
-- @param regex the pattern source handed to jsregexp.compile_safe
-- @param flags the flag string handed to jsregexp.compile_safe
-- @param replacement replacement value passed through unchanged to replace_all
-- @param want the expected resulting string
local function test_replace_all(str, regex, flags, replacement, want)
    local function fail(fmt, ...)
        print(str, regex, flags, want)
        print(string.format(fmt, ...))
        fails = fails + 1
    end
    tests = tests + 1
    local r = jsregexp.compile_safe(regex, flags)
    if not r then
        return fail("compilation error")
    end
    -- Call replace_all itself: going through r:replace here would leave
    -- replace_all completely unexercised by this helper.
    local res = r:replace_all(str, replacement)
    if res ~= want then
        return fail("replacement mismatch, wanted %s, got %s", want, res)
    end
    successes = successes + 1
end

test_compile("dummy", "(.*", "", nil)
test_compile("dummy", "[", "", nil)

-- 0xfd (together with other wird chars) crashes lre_compile if not caught
-- 0xfd (together with other weird chars) crashes lre_compile if not caught
-- (luajit at least..)
test_compile("dummy", string.char(0xfd, 166, 178, 165, 138, 183), "", nil)

test_exec("wut", "wot", "", {})
test_exec("The quick brown", "\\w+", "g", { { [0] = "The" }, { [0] = "quick" }, { [0] = "brown" } })
test_exec(
"The quick brown fox",
Expand Down Expand Up @@ -320,6 +342,7 @@ test_match("The quick brown", "\\w+", "g", { "The", "quick", "brown" })

test_match_all_list("The quick brown", "\\d+", "g", {})
test_match_all_list("The quick brown", "\\w+", "g", { "The", "quick", "brown" })
test_match_all_list("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", { "𝄞𝄞", "", "𝄞𝄞", "" })

test_search("The quick brown", "nothing", "g", -1)
test_search("The quick brown", "quick", "g", 5)
Expand All @@ -333,12 +356,30 @@ test_split("-2-3", "-", "g", { "", "2", "3" })
test_split("--", "-", "g", { "", "", "" })
test_split("Hello 1 word. Sentence number 2.", "(\\d)", "g", { "Hello ", "1", " word. Sentence number ", "2", "." })

test_replace("a b", "\\w+", "", "_", "_ b")
test_replace("a b", "\\w+", "", function()
return "_"
end, "_ b")
test_replace("12 34", "\\d+", "", "_", "_ 34")
test_replace("123 456", "\\d+", "", "_", "_ 456")
test_replace("a1b2c", "X", "g", "_", "a1b2c")
test_replace("a1b2c", "\\d", "", "_", "a_b2c")
test_replace("a1b2c", "\\d", "g", "_", "a_b_c")
test_replace("a1b2c", "(\\d)(.)", "g", "$1", "a12")
test_replace("a1b2c", "(\\d)(.)", "g", "$2", "abc")

test_replace_all("a b", "\\w+", "g", "_", "_ _")
test_replace_all("a b", "\\w+", "g", function()
return "_"
end, "_ _")
test_replace_all("12 34", "\\d+", "g", "_", "_ _")
test_replace_all("123 456", "\\d+", "g", "_", "_ _")
test_replace_all("a1b2c", "X", "g", "_", "a1b2c")
test_replace_all("a1b2c", "\\d", "g", "_", "a_b_c")
test_replace_all("a1b2c", "\\d", "g", "_", "a_b_c")
test_replace_all("a1b2c", "(\\d)(.)", "g", "$1", "a12")
test_replace_all("a1b2c", "(\\d)(.)", "g", "$2", "abc")

local bold_green = "\27[1;32m"
local bold_red = "\27[1;31m"
local normal = "\27[0m"
Expand Down
Loading