Skip to content

Commit 9679fdd

Browse files
authored
Merge pull request #22 from kmarius/fix-matchAll
Adhere to the JavaScript spec
2 parents 008fe1b + cf65084 commit 9679fdd

File tree

4 files changed

+90
-27
lines changed

4 files changed

+90
-27
lines changed

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,10 @@ On success, `compile` and `compile_safe` return a RegExp object. On failure, `co
4646

4747
Each RegExp object `re` has the following fields
4848
```lua
49-
re.last_index -- the position at wchich the next match will be searched in re:exec or re:test (see notes below)
50-
re.source -- the regexp string
49+
re.last_index -- the position at which the next match will be searched in re:exec or re:test (see notes below)
50+
re.source -- the pattern string
5151
re.flags -- a string representing the active flags
52-
re.dot_all -- is the dod_all flag set?
52+
re.dot_all -- is the dot_all flag set?
5353
re.global -- is the global flag set?
5454
re.has_indices -- is the indices flag set?
5555
re.ignore_case -- is the ignore_case flag set?
@@ -63,8 +63,8 @@ The RegExp object `re` has the following methods corresponding to JavaScript reg
6363
```lua
6464
re:exec(str) -- returns the next match of re in str (see notes below)
6565
re:test(str) -- returns true if the regex matches str (see notes below)
66-
re:match(str) -- returns a list of all matches or nil if no match
67-
re:match_all(str) -- returns a closure that repeatedly calls re:exec, to be used in for-loops
66+
re:match(str) -- returns, for a global regexp, a list of all match strings or nil if no match, calls re:exec(str) otherwise
67+
re:match_all(str) -- returns a closure that repeatedly calls re:exec on a global regexp, to be used in for-loops
6868
re:match_all_list(str) -- returns a list of all matches
6969
re:search(str) -- returns the 1-based index of the first match of re in str, or -1 if no match
7070
re:split(str, limit?) -- splits str at re, at most limit times

jsregexp.c

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ static int jsstring_new(lua_State *lstate) {
135135
&indices, &rev_indices);
136136

137137
if (!input_utf16) {
138-
luaL_error(lstate, "malformed unicode");
138+
return luaL_error(lstate, "malformed unicode");
139139
}
140140

141141
ud = lua_newuserdata(lstate, sizeof(*ud));
@@ -221,7 +221,7 @@ static int regexp_tostring(lua_State *lstate) {
221221
static int match_tostring(lua_State *lstate) {
222222
// luaL_getmetatable(lstate, JSREGEXP_MATCH);
223223
// if (!lua_getmetatable(lstate, 1) || !lua_equal(lstate, -1, -2)) {
224-
// luaL_argerror(lstate, 1, "match object expected");
224+
// return luaL_argerror(lstate, 1, "match object expected");
225225
// }
226226
lua_rawgeti(lstate, 1, 0);
227227
return 1;
@@ -241,8 +241,14 @@ static int regexp_exec(lua_State *lstate) {
241241
uint32_t rlast_index = r->last_index;
242242
// translate wide char to correct index
243243
if (input->is_wide_char) {
244-
// only translate if possible
245-
if (rlast_index <= input->bstr_len) {
244+
// only translate indices if possible
245+
if (rlast_index > 0 && rlast_index <= input->bstr_len) {
246+
// move to the next valid index, rlast_index might be somewhere within a
247+
// multibyte character
248+
while (rlast_index < input->bstr_len &&
249+
!input->rev_indices[rlast_index]) {
250+
rlast_index++;
251+
}
246252
rlast_index = input->rev_indices[rlast_index];
247253
}
248254
}
@@ -264,7 +270,7 @@ static int regexp_exec(lua_State *lstate) {
264270
input->len, input->is_wide_char ? 1 : 0, NULL);
265271

266272
if (ret < 0) {
267-
luaL_error(lstate, "out of memory in regexp execution");
273+
return luaL_error(lstate, "out of memory in regexp execution");
268274
}
269275

270276
if (ret == 0) {
@@ -454,7 +460,7 @@ static int regexp_newindex(lua_State *lstate) {
454460
luaL_argcheck(lstate, ind >= 1, 3, "last_index must be positive");
455461
r->last_index = ind - 1;
456462
} else {
457-
luaL_argerror(lstate, 2, "unrecognized key");
463+
return luaL_argerror(lstate, 2, "unrecognized key");
458464
}
459465

460466
return 0;
@@ -477,7 +483,7 @@ static int jsregexp_compile(lua_State *lstate) {
477483
// lre_compile can segfault if the input contains 0xfd, which
478484
// indicated the beginning of a six byte sequence, but is now illegal.
479485
if (strchr(regexp, 0xfd)) {
480-
luaL_argerror(lstate, 1, "malformed unicode");
486+
return luaL_argerror(lstate, 1, "malformed unicode");
481487
}
482488

483489
if (utf8_contains_non_bmp(regexp)) {
@@ -522,7 +528,7 @@ static int jsregexp_compile(lua_State *lstate) {
522528
strlen(regexp), re_flags, NULL);
523529

524530
if (!bc) {
525-
luaL_argerror(lstate, 1, error_msg);
531+
return luaL_argerror(lstate, 1, error_msg);
526532
}
527533

528534
struct regexp *ud = lua_newuserdata(lstate, sizeof *ud);

jsregexp.lua

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,16 @@ function jsregexp.mt.match(re, str)
1212
return re:exec(jstr)
1313
end
1414
local matches = {}
15-
local val
1615

1716
re.last_index = 1
1817

1918
while true do
20-
val = re:exec(jstr)
21-
if val == nil then
19+
local match = re:exec(jstr)
20+
if match == nil then
2221
break
2322
end
24-
table.insert(matches, val)
25-
if #val[0] == 0 then
23+
table.insert(matches, match[0])
24+
if #match[0] == 0 then
2625
re.last_index = re.last_index + 1
2726
end
2827
end
@@ -33,11 +32,24 @@ function jsregexp.mt.match(re, str)
3332
end
3433

3534
function jsregexp.mt.match_all(re, str)
35+
if not re.global then
36+
error("match_all must be called with a global RegExp")
37+
end
3638
-- must duplicate (according to the String.prototype.matchAll spec)
39+
-- TODO: since nobody can "subclass" this, we can probably just
40+
-- restore last_index, as it is the only way the regexp object is mutated
3741
local re2 = jsregexp.compile(re.source, re.flags)
3842
local jstr = jsregexp.to_jsstring(str)
43+
re2.last_index = re.last_index
3944
return function()
40-
return re2:exec(jstr)
45+
local match = re2:exec(jstr)
46+
if not match then
47+
return nil
48+
end
49+
if #match[0] == 0 then
50+
re2.last_index = re2.last_index + 1
51+
end
52+
return match
4153
end
4254
end
4355

@@ -183,6 +195,10 @@ local function get_substitution(match, str, replacement)
183195
end
184196

185197
function jsregexp.mt.replace_all(re, str, replacement)
198+
if not re.global then
199+
error("replace_all must be called with a global RegExp")
200+
end
201+
186202
local jstr = jsregexp.to_jsstring(str)
187203

188204
re.last_index = 1
@@ -229,7 +245,7 @@ function jsregexp.mt.replace(re, str, replacement)
229245
else
230246
table.insert(output, get_substitution(match, str, replacement))
231247
end
232-
table.insert(output, string.sub(str, re.last_index + #match[0] + 1))
248+
table.insert(output, string.sub(str, match.index + #match[0]))
233249
else
234250
table.insert(output, str)
235251
end

test.lua

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ local function test_exec(str, regex, flags, want)
3939
if match and not match_wanted then
4040
return fail(string.format("no match expected, got %s", match))
4141
end
42-
if not match and match_wanted then
42+
if not match then
4343
return fail(string.format("match expected, wanted %s", match_wanted))
4444
end
4545
if #match_wanted ~= #match then
@@ -167,18 +167,21 @@ local function test_match(str, regex, flags, want)
167167
if #want ~= #matches then
168168
return fail("number of matches mismatch, wanted %d, got %d", #want, #matches)
169169
end
170-
for i, match_want in ipairs(want) do
171-
local match = matches[i][0]
172-
if match ~= match_want then
173-
return fail("match mismatch, wanted %s, got %s", match_want, match)
170+
if r.global then
171+
for i, match_want in ipairs(want) do
172+
local match = matches[i]
173+
if match ~= match_want then
174+
return fail("match mismatch, wanted %s, got %s", match_want, match)
175+
end
174176
end
177+
else
178+
-- TODO: compare match object
175179
end
176180
end
177181
successes = successes + 1
178182
end
179183

180184
local function test_match_all_list(str, regex, flags, want)
181-
print(str, "~", regex, "flags")
182185
local function fail(fmt, ...)
183186
print(str, regex, flags, want)
184187
print(string.format(fmt, ...))
@@ -271,13 +274,32 @@ local function test_replace(str, regex, flags, replacement, want)
271274
successes = successes + 1
272275
end
273276

277+
local function test_replace_all(str, regex, flags, replacement, want)
278+
local function fail(fmt, ...)
279+
print(str, regex, flags, want)
280+
print(string.format(fmt, ...))
281+
fails = fails + 1
282+
end
283+
tests = tests + 1
284+
local r = jsregexp.compile_safe(regex, flags)
285+
if not r then
286+
return fail("compilation error")
287+
end
288+
local res = r:replace(str, replacement)
289+
if res ~= want then
290+
return fail("replacement mismatch, wanted %s, got %s", want, res)
291+
end
292+
successes = successes + 1
293+
end
294+
274295
test_compile("dummy", "(.*", "", nil)
275296
test_compile("dummy", "[", "", nil)
276297

277-
-- 0xfd (together with other wird chars) crashes lre_compile if not caught
298+
-- 0xfd (together with other weird chars) crashes lre_compile if not caught
278299
-- (luajit at least..)
279300
test_compile("dummy", string.char(0xfd, 166, 178, 165, 138, 183), "", nil)
280301

302+
test_exec("wut", "wot", "", {})
281303
test_exec("The quick brown", "\\w+", "g", { { [0] = "The" }, { [0] = "quick" }, { [0] = "brown" } })
282304
test_exec(
283305
"The quick brown fox",
@@ -320,6 +342,7 @@ test_match("The quick brown", "\\w+", "g", { "The", "quick", "brown" })
320342

321343
test_match_all_list("The quick brown", "\\d+", "g", {})
322344
test_match_all_list("The quick brown", "\\w+", "g", { "The", "quick", "brown" })
345+
test_match_all_list("𝄞𝄞𐐷𝄞𝄞", "𝄞*", "g", { "𝄞𝄞", "", "𝄞𝄞", "" })
323346

324347
test_search("The quick brown", "nothing", "g", -1)
325348
test_search("The quick brown", "quick", "g", 5)
@@ -333,12 +356,30 @@ test_split("-2-3", "-", "g", { "", "2", "3" })
333356
test_split("--", "-", "g", { "", "", "" })
334357
test_split("Hello 1 word. Sentence number 2.", "(\\d)", "g", { "Hello ", "1", " word. Sentence number ", "2", "." })
335358

359+
test_replace("a b", "\\w+", "", "_", "_ b")
360+
test_replace("a b", "\\w+", "", function()
361+
return "_"
362+
end, "_ b")
363+
test_replace("12 34", "\\d+", "", "_", "_ 34")
364+
test_replace("123 456", "\\d+", "", "_", "_ 456")
336365
test_replace("a1b2c", "X", "g", "_", "a1b2c")
337366
test_replace("a1b2c", "\\d", "", "_", "a_b2c")
338367
test_replace("a1b2c", "\\d", "g", "_", "a_b_c")
339368
test_replace("a1b2c", "(\\d)(.)", "g", "$1", "a12")
340369
test_replace("a1b2c", "(\\d)(.)", "g", "$2", "abc")
341370

371+
test_replace_all("a b", "\\w+", "g", "_", "_ _")
372+
test_replace_all("a b", "\\w+", "g", function()
373+
return "_"
374+
end, "_ _")
375+
test_replace_all("12 34", "\\d+", "g", "_", "_ _")
376+
test_replace_all("123 456", "\\d+", "g", "_", "_ _")
377+
test_replace_all("a1b2c", "X", "g", "_", "a1b2c")
378+
test_replace_all("a1b2c", "\\d", "g", "_", "a_b_c")
379+
test_replace_all("a1b2c", "\\d", "g", "_", "a_b_c")
380+
test_replace_all("a1b2c", "(\\d)(.)", "g", "$1", "a12")
381+
test_replace_all("a1b2c", "(\\d)(.)", "g", "$2", "abc")
382+
342383
local bold_green = "\27[1;32m"
343384
local bold_red = "\27[1;31m"
344385
local normal = "\27[0m"

0 commit comments

Comments
 (0)