From f500761b55c6ae7a6e6cc832e438e2ebaca63bdc Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Fri, 5 Jul 2024 20:13:18 +0200 Subject: [PATCH 1/9] adhere to match and matchAll spec --- jsregexp.c | 10 ++++++++-- jsregexp.lua | 24 ++++++++++++++++++------ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/jsregexp.c b/jsregexp.c index c43d64a..40a50d2 100644 --- a/jsregexp.c +++ b/jsregexp.c @@ -241,8 +241,14 @@ static int regexp_exec(lua_State *lstate) { uint32_t rlast_index = r->last_index; // translate wide char to correct index if (input->is_wide_char) { - // only translate if possible - if (rlast_index <= input->bstr_len) { + // only translate indices if possible + if (rlast_index > 0 && rlast_index <= input->bstr_len) { + // move to the next valid index, rlast_index might be somewhere within a + // multibyte character + while (rlast_index < input->bstr_len && + !input->rev_indices[rlast_index]) { + rlast_index++; + } rlast_index = input->rev_indices[rlast_index]; } } diff --git a/jsregexp.lua b/jsregexp.lua index 6af94b2..126c0df 100644 --- a/jsregexp.lua +++ b/jsregexp.lua @@ -12,17 +12,16 @@ function jsregexp.mt.match(re, str) return re:exec(jstr) end local matches = {} - local val re.last_index = 1 while true do - val = re:exec(jstr) - if val == nil then + local match = re:exec(jstr) + if match == nil then break end - table.insert(matches, val) - if #val[0] == 0 then + table.insert(matches, match[0]) + if #match[0] == 0 then re.last_index = re.last_index + 1 end end @@ -33,11 +32,24 @@ function jsregexp.mt.match(re, str) end function jsregexp.mt.match_all(re, str) + if not re.global then + error("match_all must be called with on global RegExp") + end -- must duplicate (according to string.proptype.matchAll spec) + -- TODO: since nobody can "subclass" this, we can probably just + -- restore last_index, as it is the only way the regexp object is mutated local re2 = jsregexp.compile(re.source, re.flags) local 
jstr = jsregexp.to_jsstring(str) + re2.last_index = re.last_index return function() - return re2:exec(jstr) + local match = re2:exec(jstr) + if not match then + return nil + end + if #match[0] == 0 then + re2.last_index = re2.last_index + 1 + end + return match end end From d60b266de15bfbcfcab5ece377f27f166bd11e51 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Fri, 5 Jul 2024 20:13:29 +0200 Subject: [PATCH 2/9] adjust match and matchAll test --- test.lua | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test.lua b/test.lua index 1c6805e..ab47909 100644 --- a/test.lua +++ b/test.lua @@ -167,11 +167,15 @@ local function test_match(str, regex, flags, want) if #want ~= #matches then return fail("number of matches mismatch, wanted %d, got %d", #want, #matches) end - for i, match_want in ipairs(want) do - local match = matches[i][0] - if match ~= match_want then - return fail("match mismatch, wanted %s, got %s", match_want, match) + if r.global then + for i, match_want in ipairs(want) do + local match = matches[i] + if match ~= match_want then + return fail("match mismatch, wanted %s, got %s", match_want, match) + end end + else + -- TODO: compare match object end end successes = successes + 1 @@ -271,6 +275,10 @@ local function test_replace(str, regex, flags, replacement, want) successes = successes + 1 end +-- test_call("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", { { "𝄞𝄞" }, { "" }, { "𝄞𝄞" }, { "" } }) +-- test_match_all_list("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", { "𝄞𝄞", "", "𝄞𝄞", "" }) +-- os.exit(1) + test_compile("dummy", "(.*", "", nil) test_compile("dummy", "[", "", nil) @@ -320,6 +328,7 @@ test_match("The quick brown", "\\w+", "g", { "The", "quick", "brown" }) test_match_all_list("The quick brown", "\\d+", "g", {}) test_match_all_list("The quick brown", "\\w+", "g", { "The", "quick", "brown" }) +test_match_all_list("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", { "𝄞𝄞", "", "𝄞𝄞", "" }) test_search("The quick brown", "nothing", "g", -1) test_search("The 
quick brown", "quick", "g", 5) From d62ca169409bbbffbaeaac1420bc8f08e68e09c8 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Fri, 5 Jul 2024 20:20:10 +0200 Subject: [PATCH 3/9] update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 387e0a8..28aaecf 100644 --- a/README.md +++ b/README.md @@ -46,10 +46,10 @@ On success, `compile` and `compile_safe` return a RegExp object. On failure, `co Each RegExp object `re` has the following fields ```lua -re.last_index -- the position at wchich the next match will be searched in re:exec or re:test (see notes below) -re.source -- the regexp string +re.last_index -- the position at which the next match will be searched in re:exec or re:test (see notes below) +re.source -- the pattern string re.flags -- a string representing the active flags -re.dot_all -- is the dod_all flag set? +re.dot_all -- is the dot_all flag set? re.global -- is the global flag set? re.has_indices -- is the indices flag set? re.ignore_case -- is the ignore_case flag set? @@ -63,8 +63,8 @@ The RegExp object `re` has the following methods corresponding to JavaScript reg ```lua re:exec(str) -- returns the next match of re in str (see notes below) re:test(str) -- returns true if the regex matches str (see notes below) -re:match(str) -- returns a list of all matches or nil if no match -re:match_all(str) -- returns a closure that repeatedly calls re:exec, to be used in for-loops +re:match(str) -- returns, for a global regexp, a list of all match strings or nil if no match, calls re:exec(str) otherwise +re:match_all(str) -- returns a closure that repeatedly calls re:exec on a global regexp, to be used in for-loops re:match_all_list(str) -- returns a list of all matches re:search(str) -- returns the 1-based index of the first match of re in str, or -1 if no match re:split(str, limit?) 
-- splits str at re, at most limit times From d1bda1791df24cba443736f5dc1faed12b1d7c74 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Sun, 4 May 2025 10:39:49 +0200 Subject: [PATCH 4/9] use `return lua_error` idiom --- jsregexp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/jsregexp.c b/jsregexp.c index 40a50d2..f5b20ff 100644 --- a/jsregexp.c +++ b/jsregexp.c @@ -135,7 +135,7 @@ static int jsstring_new(lua_State *lstate) { &indices, &rev_indices); if (!input_utf16) { - luaL_error(lstate, "malformed unicode"); + return luaL_error(lstate, "malformed unicode"); } ud = lua_newuserdata(lstate, sizeof(*ud)); @@ -221,7 +221,7 @@ static int regexp_tostring(lua_State *lstate) { static int match_tostring(lua_State *lstate) { // luaL_getmetatable(lstate, JSREGEXP_MATCH); // if (!lua_getmetatable(lstate, 1) || !lua_equal(lstate, -1, -2)) { - // luaL_argerror(lstate, 1, "match object expected"); + // return luaL_argerror(lstate, 1, "match object expected"); // } lua_rawgeti(lstate, 1, 0); return 1; @@ -270,7 +270,7 @@ static int regexp_exec(lua_State *lstate) { input->len, input->is_wide_char ? 1 : 0, NULL); if (ret < 0) { - luaL_error(lstate, "out of memory in regexp execution"); + return luaL_error(lstate, "out of memory in regexp execution"); } if (ret == 0) { @@ -460,7 +460,7 @@ static int regexp_newindex(lua_State *lstate) { luaL_argcheck(lstate, ind >= 1, 3, "last_index must be positive"); r->last_index = ind - 1; } else { - luaL_argerror(lstate, 2, "unrecognized key"); + return luaL_argerror(lstate, 2, "unrecognized key"); } return 0; @@ -483,7 +483,7 @@ static int jsregexp_compile(lua_State *lstate) { // lre_compile can segfault if the input contains 0x8f, which // indicated the beginning of a six byte sequence, but is now illegal. 
if (strchr(regexp, 0xfd)) { - luaL_argerror(lstate, 1, "malformed unicode"); + return luaL_argerror(lstate, 1, "malformed unicode"); } if (utf8_contains_non_bmp(regexp)) { @@ -528,7 +528,7 @@ static int jsregexp_compile(lua_State *lstate) { strlen(regexp), re_flags, NULL); if (!bc) { - luaL_argerror(lstate, 1, error_msg); + return luaL_argerror(lstate, 1, error_msg); } struct regexp *ud = lua_newuserdata(lstate, sizeof *ud); From 2f730fb8dc6ad1169e5f119a9f1424347c057391 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Sun, 4 May 2025 10:47:52 +0200 Subject: [PATCH 5/9] cleanup --- jsregexp.lua | 2 +- test.lua | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/jsregexp.lua b/jsregexp.lua index 126c0df..883e150 100644 --- a/jsregexp.lua +++ b/jsregexp.lua @@ -33,7 +33,7 @@ end function jsregexp.mt.match_all(re, str) if not re.global then - error("match_all must be called with on global RegExp") + error("match_all must be called with a global RegExp") end -- must duplicate (according to string.proptype.matchAll spec) -- TODO: since nobody can "subclass" this, we can probably just diff --git a/test.lua b/test.lua index ab47909..3cb1fe5 100644 --- a/test.lua +++ b/test.lua @@ -275,14 +275,10 @@ local function test_replace(str, regex, flags, replacement, want) successes = successes + 1 end --- test_call("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", { { "𝄞𝄞" }, { "" }, { "𝄞𝄞" }, { "" } }) --- test_match_all_list("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", { "𝄞𝄞", "", "𝄞𝄞", "" }) --- os.exit(1) - test_compile("dummy", "(.*", "", nil) test_compile("dummy", "[", "", nil) --- 0xfd (together with other wird chars) crashes lre_compile if not caught +-- 0xfd (together with other weird chars) crashes lre_compile if not caught -- (luajit at least..) 
test_compile("dummy", string.char(0xfd, 166, 178, 165, 138, 183), "", nil) From 1fea1faa45931133a66f8023fff6da00ad94cd07 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Sun, 4 May 2025 12:03:18 +0200 Subject: [PATCH 6/9] fix re:replace with non-global regexp --- jsregexp.lua | 2 +- test.lua | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/jsregexp.lua b/jsregexp.lua index 883e150..ee66c28 100644 --- a/jsregexp.lua +++ b/jsregexp.lua @@ -241,7 +241,7 @@ function jsregexp.mt.replace(re, str, replacement) else table.insert(output, get_substitution(match, str, replacement)) end - table.insert(output, string.sub(str, re.last_index + #match[0] + 1)) + table.insert(output, string.sub(str, match.index + #match[0])) else table.insert(output, str) end diff --git a/test.lua b/test.lua index 3cb1fe5..2a86df8 100644 --- a/test.lua +++ b/test.lua @@ -39,7 +39,7 @@ local function test_exec(str, regex, flags, want) if match and not match_wanted then return fail(string.format("no match expected, got %s", match)) end - if not match and match_wanted then + if not match then return fail(string.format("match expected, wanted %s", match_wanted)) end if #match_wanted ~= #match then @@ -282,6 +282,7 @@ test_compile("dummy", "[", "", nil) -- (luajit at least..) test_compile("dummy", string.char(0xfd, 166, 178, 165, 138, 183), "", nil) +test_exec("wut", "wot", "", {}) test_exec("The quick brown", "\\w+", "g", { { [0] = "The" }, { [0] = "quick" }, { [0] = "brown" } }) test_exec( "The quick brown fox", @@ -338,6 +339,12 @@ test_split("-2-3", "-", "g", { "", "2", "3" }) test_split("--", "-", "g", { "", "", "" }) test_split("Hello 1 word. Sentence number 2.", "(\\d)", "g", { "Hello ", "1", " word. Sentence number ", "2", "." 
}) +test_replace("a b", "\\w+", "", "_", "_ b") +test_replace("a b", "\\w+", "", function() + return "_" +end, "_ b") +test_replace("12 34", "\\d+", "", "_", "_ 34") +test_replace("123 456", "\\d+", "", "_", "_ 456") test_replace("a1b2c", "X", "g", "_", "a1b2c") test_replace("a1b2c", "\\d", "", "_", "a_b2c") test_replace("a1b2c", "\\d", "g", "_", "a_b_c") From 3dfb866465d9faa7193235792a09ea0325acb635 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Sun, 4 May 2025 12:11:57 +0200 Subject: [PATCH 7/9] allow only global regexes for replace_all --- jsregexp.lua | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/jsregexp.lua b/jsregexp.lua index ee66c28..62de154 100644 --- a/jsregexp.lua +++ b/jsregexp.lua @@ -195,6 +195,10 @@ local function get_substitution(match, str, replacement) end function jsregexp.mt.replace_all(re, str, replacement) + if not re.global then + error("replace_all must be called with a global RegExp") + end + local jstr = jsregexp.to_jsstring(str) re.last_index = 1 From d4485b9d5764a6baaee0f4fa4f2750c73a038617 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Sun, 4 May 2025 12:12:08 +0200 Subject: [PATCH 8/9] add tests for replace_all --- test.lua | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test.lua b/test.lua index 2a86df8..4130d39 100644 --- a/test.lua +++ b/test.lua @@ -275,6 +275,24 @@ local function test_replace(str, regex, flags, replacement, want) successes = successes + 1 end +local function test_replace_all(str, regex, flags, replacement, want) + local function fail(fmt, ...) 
+ print(str, regex, flags, want) + print(string.format(fmt, ...)) + fails = fails + 1 + end + tests = tests + 1 + local r = jsregexp.compile_safe(regex, flags) + if not r then + return fail("compilation error") + end + local res = r:replace_all(str, replacement) + if res ~= want then + return fail("replacement mismatch, wanted %s, got %s", want, res) + end + successes = successes + 1 +end + test_compile("dummy", "(.*", "", nil) test_compile("dummy", "[", "", nil) @@ -351,6 +369,18 @@ test_replace("a1b2c", "\\d", "g", "_", "a_b_c") test_replace("a1b2c", "(\\d)(.)", "g", "$1", "a12") test_replace("a1b2c", "(\\d)(.)", "g", "$2", "abc") +test_replace_all("a b", "\\w+", "g", "_", "_ _") +test_replace_all("a b", "\\w+", "g", function() + return "_" +end, "_ _") +test_replace_all("12 34", "\\d+", "g", "_", "_ _") +test_replace_all("123 456", "\\d+", "g", "_", "_ _") +test_replace_all("a1b2c", "X", "g", "_", "a1b2c") +test_replace_all("a1b2c", "\\d", "g", "_", "a_b_c") +test_replace_all("a1b2c", "\\d", "g", "_", "a_b_c") +test_replace_all("a1b2c", "(\\d)(.)", "g", "$1", "a12") +test_replace_all("a1b2c", "(\\d)(.)", "g", "$2", "abc") + local bold_green = "\27[1;32m" local bold_red = "\27[1;31m" local normal = "\27[0m" From cf650847d57dcef07f9f79b7510ec530addd0ba8 Mon Sep 17 00:00:00 2001 From: kmarius <5224719+kmarius@users.noreply.github.com> Date: Sun, 4 May 2025 12:24:25 +0200 Subject: [PATCH 9/9] fix tests for match.indices --- test.lua | 1 - 1 file changed, 1 deletion(-) diff --git a/test.lua b/test.lua index 4130d39..41fb8a8 100644 --- a/test.lua +++ b/test.lua @@ -182,7 +182,6 @@ local function test_match(str, regex, flags, want) end local function test_match_all_list(str, regex, flags, want) - print(str, "~", regex, "flags") local function fail(fmt, ...) print(str, regex, flags, want) print(string.format(fmt, ...))