Skip to content

Commit d93ddbe

Browse files
committed
automata: add internal HalfMatch APIs for NFA engines
It turns out that we do need to know at least the end offset of a match, even when the NFA has no capture states. This is necessary for correctly handling the case where a regex can match the empty string but the caller has asked that matches not split a codepoint. If we don't know the end offset of a match, then we can't correctly determine whether a match exists, and we are forced to report no match even when one exists. Reporting no match is arguably tolerable for `find`-style APIs, where the caller has specifically requested match offsets while simultaneously configuring the NFA to not track offsets, but `is_match`-style APIs really should handle this case correctly. We should eventually just expose the `HalfMatch` APIs on `PikeVM` and `BoundedBacktracker`, but for now we keep them private.
1 parent e003cae commit d93ddbe

File tree

2 files changed

+49
-70
lines changed

2 files changed

+49
-70
lines changed

regex-automata/src/nfa/thompson/backtrack.rs

Lines changed: 23 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use crate::{
1919
empty, iter,
2020
prefilter::Prefilter,
2121
primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
22-
search::{Anchored, Input, Match, MatchError, Span},
22+
search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
2323
},
2424
};
2525

@@ -1295,27 +1295,29 @@ impl BoundedBacktracker {
12951295
) -> Result<Option<PatternID>, MatchError> {
12961296
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
12971297
if !utf8empty {
1298-
return self.try_search_slots_imp(cache, input, slots);
1298+
let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1299+
return Ok(maybe_hm.map(|hm| hm.pattern()));
12991300
}
13001301
// See PikeVM::try_search_slots for why we do this.
13011302
let min = self.get_nfa().group_info().implicit_slot_len();
13021303
if slots.len() >= min {
1303-
return self.try_search_slots_imp(cache, input, slots);
1304+
let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1305+
return Ok(maybe_hm.map(|hm| hm.pattern()));
13041306
}
13051307
if self.get_nfa().pattern_len() == 1 {
13061308
let mut enough = [None, None];
13071309
let got = self.try_search_slots_imp(cache, input, &mut enough)?;
13081310
// This is OK because we know `enough` is strictly bigger
13091311
// than `slots`, otherwise this special case isn't reached.
13101312
slots.copy_from_slice(&enough[..slots.len()]);
1311-
return Ok(got);
1313+
return Ok(got.map(|hm| hm.pattern()));
13121314
}
13131315
let mut enough = vec![None; min];
13141316
let got = self.try_search_slots_imp(cache, input, &mut enough)?;
13151317
// This is OK because we know `enough` is strictly bigger than
13161318
// `slots`, otherwise this special case isn't reached.
13171319
slots.copy_from_slice(&enough[..slots.len()]);
1318-
Ok(got)
1320+
Ok(got.map(|hm| hm.pattern()))
13191321
}
13201322

13211323
/// This is the actual implementation of `try_search_slots_imp` that
@@ -1328,30 +1330,17 @@ impl BoundedBacktracker {
13281330
cache: &mut Cache,
13291331
input: &Input<'_>,
13301332
slots: &mut [Option<NonMaxUsize>],
1331-
) -> Result<Option<PatternID>, MatchError> {
1333+
) -> Result<Option<HalfMatch>, MatchError> {
13321334
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1333-
let (pid, end) = match self.search_imp(cache, input, slots)? {
1335+
let hm = match self.search_imp(cache, input, slots)? {
13341336
None => return Ok(None),
1335-
Some(pid) if !utf8empty => return Ok(Some(pid)),
1336-
Some(pid) => {
1337-
let slot_start = pid.as_usize() * 2;
1338-
let slot_end = slot_start + 1;
1339-
// OK because we know we have a match and we know our caller
1340-
// provided slots are big enough (which we make true above if
1341-
// the caller didn't). Namely, we're only here when 'utf8empty'
1342-
// is true, and when that's true, we require slots for every
1343-
// pattern.
1344-
(pid, slots[slot_end].unwrap().get())
1345-
}
1337+
Some(hm) if !utf8empty => return Ok(Some(hm)),
1338+
Some(hm) => hm,
13461339
};
1347-
empty::skip_splits_fwd(input, pid, end, |input| {
1348-
let pid = match self.search_imp(cache, input, slots)? {
1349-
None => return Ok(None),
1350-
Some(pid) => pid,
1351-
};
1352-
let slot_start = pid.as_usize() * 2;
1353-
let slot_end = slot_start + 1;
1354-
Ok(Some((pid, slots[slot_end].unwrap().get())))
1340+
empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1341+
Ok(self
1342+
.search_imp(cache, input, slots)?
1343+
.map(|hm| (hm, hm.offset())))
13551344
})
13561345
}
13571346

@@ -1367,7 +1356,7 @@ impl BoundedBacktracker {
13671356
cache: &mut Cache,
13681357
input: &Input<'_>,
13691358
slots: &mut [Option<NonMaxUsize>],
1370-
) -> Result<Option<PatternID>, MatchError> {
1359+
) -> Result<Option<HalfMatch>, MatchError> {
13711360
// Unlike in the PikeVM, we write our capturing group spans directly
13721361
// into the caller's captures groups. So we have to make sure we're
13731362
// starting with a blank slate first. In the PikeVM, we avoid this
@@ -1414,10 +1403,9 @@ impl BoundedBacktracker {
14141403
Some(ref span) => at = span.start,
14151404
}
14161405
}
1417-
if let Some(pid) =
1418-
self.backtrack(cache, input, at, start_id, slots)
1406+
if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
14191407
{
1420-
return Ok(Some(pid));
1408+
return Ok(Some(hm));
14211409
}
14221410
at += 1;
14231411
}
@@ -1438,14 +1426,13 @@ impl BoundedBacktracker {
14381426
at: usize,
14391427
start_id: StateID,
14401428
slots: &mut [Option<NonMaxUsize>],
1441-
) -> Option<PatternID> {
1429+
) -> Option<HalfMatch> {
14421430
cache.stack.push(Frame::Step { sid: start_id, at });
14431431
while let Some(frame) = cache.stack.pop() {
14441432
match frame {
14451433
Frame::Step { sid, at } => {
1446-
if let Some(pid) = self.step(cache, input, sid, at, slots)
1447-
{
1448-
return Some(pid);
1434+
if let Some(hm) = self.step(cache, input, sid, at, slots) {
1435+
return Some(hm);
14491436
}
14501437
}
14511438
Frame::RestoreCapture { slot, offset } => {
@@ -1475,7 +1462,7 @@ impl BoundedBacktracker {
14751462
mut sid: StateID,
14761463
mut at: usize,
14771464
slots: &mut [Option<NonMaxUsize>],
1478-
) -> Option<PatternID> {
1465+
) -> Option<HalfMatch> {
14791466
loop {
14801467
if !cache.visited.insert(sid, at - input.start()) {
14811468
return None;
@@ -1558,7 +1545,7 @@ impl BoundedBacktracker {
15581545
}
15591546
State::Fail => return None,
15601547
State::Match { pattern_id } => {
1561-
return Some(pattern_id);
1548+
return Some(HalfMatch::new(pattern_id, at));
15621549
}
15631550
}
15641551
}

regex-automata/src/nfa/thompson/pikevm.rs

Lines changed: 26 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ use crate::{
1717
empty, iter,
1818
prefilter::Prefilter,
1919
primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
20-
search::{Anchored, Input, Match, MatchKind, PatternSet, Span},
20+
search::{
21+
Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
22+
},
2123
sparse_set::SparseSet,
2224
},
2325
};
@@ -1094,7 +1096,8 @@ impl PikeVM {
10941096
) -> Option<PatternID> {
10951097
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
10961098
if !utf8empty {
1097-
return self.search_slots_imp(cache, input, slots);
1099+
let hm = self.search_slots_imp(cache, input, slots)?;
1100+
return Some(hm.pattern());
10981101
}
10991102
// There is an unfortunate special case where if the regex can
11001103
// match the empty string and UTF-8 mode is enabled, the search
@@ -1109,22 +1112,23 @@ impl PikeVM {
11091112
// this case.
11101113
let min = self.get_nfa().group_info().implicit_slot_len();
11111114
if slots.len() >= min {
1112-
return self.search_slots_imp(cache, input, slots);
1115+
let hm = self.search_slots_imp(cache, input, slots)?;
1116+
return Some(hm.pattern());
11131117
}
11141118
if self.get_nfa().pattern_len() == 1 {
11151119
let mut enough = [None, None];
11161120
let got = self.search_slots_imp(cache, input, &mut enough);
11171121
// This is OK because we know `enough` is strictly bigger than
11181122
// `slots`, otherwise this special case isn't reached.
11191123
slots.copy_from_slice(&enough[..slots.len()]);
1120-
return got;
1124+
return got.map(|hm| hm.pattern());
11211125
}
11221126
let mut enough = vec![None; min];
11231127
let got = self.search_slots_imp(cache, input, &mut enough);
11241128
// This is OK because we know `enough` is strictly bigger than `slots`,
11251129
// otherwise this special case isn't reached.
11261130
slots.copy_from_slice(&enough[..slots.len()]);
1127-
got
1131+
got.map(|hm| hm.pattern())
11281132
}
11291133

11301134
/// This is the actual implementation of `search_slots_imp` that
@@ -1137,30 +1141,17 @@ impl PikeVM {
11371141
cache: &mut Cache,
11381142
input: &Input<'_>,
11391143
slots: &mut [Option<NonMaxUsize>],
1140-
) -> Option<PatternID> {
1144+
) -> Option<HalfMatch> {
11411145
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1142-
let (pid, end) = match self.search_imp(cache, input, slots) {
1146+
let hm = match self.search_imp(cache, input, slots) {
11431147
None => return None,
1144-
Some(pid) if !utf8empty => return Some(pid),
1145-
Some(pid) => {
1146-
let slot_start = pid.as_usize() * 2;
1147-
let slot_end = slot_start + 1;
1148-
// OK because we know we have a match and we know our caller
1149-
// provided slots are big enough (which we make true above if
1150-
// the caller didn't). Namely, we're only here when 'utf8empty'
1151-
// is true, and when that's true, we require slots for every
1152-
// pattern.
1153-
(pid, slots[slot_end].unwrap().get())
1154-
}
1148+
Some(hm) if !utf8empty => return Some(hm),
1149+
Some(hm) => hm,
11551150
};
1156-
empty::skip_splits_fwd(input, pid, end, |input| {
1157-
let pid = match self.search_imp(cache, input, slots) {
1158-
None => return Ok(None),
1159-
Some(pid) => pid,
1160-
};
1161-
let slot_start = pid.as_usize() * 2;
1162-
let slot_end = slot_start + 1;
1163-
Ok(Some((pid, slots[slot_end].unwrap().get())))
1151+
empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1152+
Ok(self
1153+
.search_imp(cache, input, slots)
1154+
.map(|hm| (hm, hm.offset())))
11641155
})
11651156
// OK because the PikeVM never errors.
11661157
.unwrap()
@@ -1235,7 +1226,7 @@ impl PikeVM {
12351226
cache: &mut Cache,
12361227
input: &Input<'_>,
12371228
slots: &mut [Option<NonMaxUsize>],
1238-
) -> Option<PatternID> {
1229+
) -> Option<HalfMatch> {
12391230
cache.setup_search(slots.len());
12401231
if input.is_done() {
12411232
return None;
@@ -1264,7 +1255,7 @@ impl PikeVM {
12641255
let pre =
12651256
if anchored { None } else { self.get_config().get_prefilter() };
12661257
let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
1267-
let mut pid = None;
1258+
let mut hm = None;
12681259
// Yes, our search doesn't end at input.end(), but includes it. This
12691260
// is necessary because matches are delayed by one byte, just like
12701261
// how the DFA engines work. The delay is used to handle look-behind
@@ -1283,7 +1274,7 @@ impl PikeVM {
12831274
if curr.set.is_empty() {
12841275
// We have a match and we haven't been instructed to continue
12851276
// on even after finding a match, so we can quit.
1286-
if pid.is_some() && !allmatches {
1277+
if hm.is_some() && !allmatches {
12871278
break;
12881279
}
12891280
// If we're running an anchored search and we've advanced
@@ -1353,7 +1344,7 @@ impl PikeVM {
13531344
// search. If we re-computed it at every position, we would be
13541345
// simulating an unanchored search when we were tasked to perform
13551346
// an anchored search.
1356-
if (!pid.is_some() || allmatches)
1347+
if (!hm.is_some() || allmatches)
13571348
&& (!anchored || at == input.start())
13581349
{
13591350
// Since we are adding to the 'curr' active states and since
@@ -1372,22 +1363,23 @@ impl PikeVM {
13721363
let slots = next.slot_table.all_absent();
13731364
self.epsilon_closure(stack, slots, curr, input, at, start_id);
13741365
}
1375-
if let Some(x) = self.nexts(stack, curr, next, input, at, slots) {
1376-
pid = Some(x);
1366+
if let Some(pid) = self.nexts(stack, curr, next, input, at, slots)
1367+
{
1368+
hm = Some(HalfMatch::new(pid, at));
13771369
}
13781370
// Unless the caller asked us to return early, we need to mush on
13791371
// to see if we can extend our match. (But note that 'nexts' will
13801372
// quit right after seeing a match when match_kind==LeftmostFirst,
13811373
// as is consistent with leftmost-first match priority.)
1382-
if input.get_earliest() && pid.is_some() {
1374+
if input.get_earliest() && hm.is_some() {
13831375
break;
13841376
}
13851377
core::mem::swap(curr, next);
13861378
next.set.clear();
13871379
at += 1;
13881380
}
13891381
instrument!(|c| c.eprint(&self.nfa));
1390-
pid
1382+
hm
13911383
}
13921384

13931385
/// The implementation for the 'which_overlapping_matches' API. Basically,

0 commit comments

Comments
 (0)