automata: add internal HalfMatch APIs for NFA engines

BurntSushi · BurntSushi · commit d93ddbefd77f · 2023-08-05T14:28:50.000-04:00
Welp, okay, turns out we do need to know at least the end offset of a
match even when the NFA has no capture states. This is necessary for
correctly handling the case where a regex can match the empty string but
the caller has asked that matches not split a codepoint. If we don't
know the end offset of a match, then we can't correctly determine
whether a match exists or not and are forced to return no match even
when a match exists. We can get away with this, I think, for
`find`-style APIs where the caller has specifically requested match
offsets while simultaneously configuring the NFA to not track offsets,
but with `is_match`-style APIs, we really should be able to handle it
correctly.

We should eventually just expose the `HalfMatch` APIs on `PikeVM` and
`BoundedBacktracker`, but for now we keep them private.
diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs
@@ -19,7 +19,7 @@ use crate::{
         empty, iter,
         prefilter::Prefilter,
         primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
-        search::{Anchored, Input, Match, MatchError, Span},
+        search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
     },
 };
 
@@ -1295,27 +1295,29 @@ impl BoundedBacktracker {
     ) -> Result<Option<PatternID>, MatchError> {
         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
         if !utf8empty {
-            return self.try_search_slots_imp(cache, input, slots);
+            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
+            return Ok(maybe_hm.map(|hm| hm.pattern()));
         }
         // See PikeVM::try_search_slots for why we do this.
         let min = self.get_nfa().group_info().implicit_slot_len();
         if slots.len() >= min {
-            return self.try_search_slots_imp(cache, input, slots);
+            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
+            return Ok(maybe_hm.map(|hm| hm.pattern()));
         }
         if self.get_nfa().pattern_len() == 1 {
             let mut enough = [None, None];
             let got = self.try_search_slots_imp(cache, input, &mut enough)?;
             // This is OK because we know `enough_slots` is strictly bigger
             // than `slots`, otherwise this special case isn't reached.
             slots.copy_from_slice(&enough[..slots.len()]);
-            return Ok(got);
+            return Ok(got.map(|hm| hm.pattern()));
         }
         let mut enough = vec![None; min];
         let got = self.try_search_slots_imp(cache, input, &mut enough)?;
         // This is OK because we know `enough_slots` is strictly bigger than
         // `slots`, otherwise this special case isn't reached.
         slots.copy_from_slice(&enough[..slots.len()]);
-        Ok(got)
+        Ok(got.map(|hm| hm.pattern()))
     }
 
     /// This is the actual implementation of `try_search_slots_imp` that
@@ -1328,30 +1330,17 @@ impl BoundedBacktracker {
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Result<Option<PatternID>, MatchError> {
+    ) -> Result<Option<HalfMatch>, MatchError> {
         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
-        let (pid, end) = match self.search_imp(cache, input, slots)? {
+        let hm = match self.search_imp(cache, input, slots)? {
             None => return Ok(None),
-            Some(pid) if !utf8empty => return Ok(Some(pid)),
-            Some(pid) => {
-                let slot_start = pid.as_usize() * 2;
-                let slot_end = slot_start + 1;
-                // OK because we know we have a match and we know our caller
-                // provided slots are big enough (which we make true above if
-                // the caller didn't). Namely, we're only here when 'utf8empty'
-                // is true, and when that's true, we require slots for every
-                // pattern.
-                (pid, slots[slot_end].unwrap().get())
-            }
+            Some(hm) if !utf8empty => return Ok(Some(hm)),
+            Some(hm) => hm,
         };
-        empty::skip_splits_fwd(input, pid, end, |input| {
-            let pid = match self.search_imp(cache, input, slots)? {
-                None => return Ok(None),
-                Some(pid) => pid,
-            };
-            let slot_start = pid.as_usize() * 2;
-            let slot_end = slot_start + 1;
-            Ok(Some((pid, slots[slot_end].unwrap().get())))
+        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+            Ok(self
+                .search_imp(cache, input, slots)?
+                .map(|hm| (hm, hm.offset())))
         })
     }
 
@@ -1367,7 +1356,7 @@ impl BoundedBacktracker {
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Result<Option<PatternID>, MatchError> {
+    ) -> Result<Option<HalfMatch>, MatchError> {
         // Unlike in the PikeVM, we write our capturing group spans directly
         // into the caller's captures groups. So we have to make sure we're
         // starting with a blank slate first. In the PikeVM, we avoid this
@@ -1414,10 +1403,9 @@ impl BoundedBacktracker {
                     Some(ref span) => at = span.start,
                 }
             }
-            if let Some(pid) =
-                self.backtrack(cache, input, at, start_id, slots)
+            if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
             {
-                return Ok(Some(pid));
+                return Ok(Some(hm));
             }
             at += 1;
         }
@@ -1438,14 +1426,13 @@ impl BoundedBacktracker {
         at: usize,
         start_id: StateID,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
         cache.stack.push(Frame::Step { sid: start_id, at });
         while let Some(frame) = cache.stack.pop() {
             match frame {
                 Frame::Step { sid, at } => {
-                    if let Some(pid) = self.step(cache, input, sid, at, slots)
-                    {
-                        return Some(pid);
+                    if let Some(hm) = self.step(cache, input, sid, at, slots) {
+                        return Some(hm);
                     }
                 }
                 Frame::RestoreCapture { slot, offset } => {
@@ -1475,7 +1462,7 @@ impl BoundedBacktracker {
         mut sid: StateID,
         mut at: usize,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
         loop {
             if !cache.visited.insert(sid, at - input.start()) {
                 return None;
@@ -1558,7 +1545,7 @@ impl BoundedBacktracker {
                 }
                 State::Fail => return None,
                 State::Match { pattern_id } => {
-                    return Some(pattern_id);
+                    return Some(HalfMatch::new(pattern_id, at));
                 }
             }
         }
diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs
@@ -17,7 +17,9 @@ use crate::{
         empty, iter,
         prefilter::Prefilter,
         primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
-        search::{Anchored, Input, Match, MatchKind, PatternSet, Span},
+        search::{
+            Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
+        },
         sparse_set::SparseSet,
     },
 };
@@ -1094,7 +1096,8 @@ impl PikeVM {
     ) -> Option<PatternID> {
         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
         if !utf8empty {
-            return self.search_slots_imp(cache, input, slots);
+            let hm = self.search_slots_imp(cache, input, slots)?;
+            return Some(hm.pattern());
         }
         // There is an unfortunate special case where if the regex can
         // match the empty string and UTF-8 mode is enabled, the search
@@ -1109,22 +1112,23 @@ impl PikeVM {
         // this case.
         let min = self.get_nfa().group_info().implicit_slot_len();
         if slots.len() >= min {
-            return self.search_slots_imp(cache, input, slots);
+            let hm = self.search_slots_imp(cache, input, slots)?;
+            return Some(hm.pattern());
         }
         if self.get_nfa().pattern_len() == 1 {
             let mut enough = [None, None];
             let got = self.search_slots_imp(cache, input, &mut enough);
             // This is OK because we know `enough` is strictly bigger than
             // `slots`, otherwise this special case isn't reached.
             slots.copy_from_slice(&enough[..slots.len()]);
-            return got;
+            return got.map(|hm| hm.pattern());
         }
         let mut enough = vec![None; min];
         let got = self.search_slots_imp(cache, input, &mut enough);
         // This is OK because we know `enough` is strictly bigger than `slots`,
         // otherwise this special case isn't reached.
         slots.copy_from_slice(&enough[..slots.len()]);
-        got
+        got.map(|hm| hm.pattern())
     }
 
     /// This is the actual implementation of `search_slots_imp` that
@@ -1137,30 +1141,17 @@ impl PikeVM {
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
-        let (pid, end) = match self.search_imp(cache, input, slots) {
+        let hm = match self.search_imp(cache, input, slots) {
             None => return None,
-            Some(pid) if !utf8empty => return Some(pid),
-            Some(pid) => {
-                let slot_start = pid.as_usize() * 2;
-                let slot_end = slot_start + 1;
-                // OK because we know we have a match and we know our caller
-                // provided slots are big enough (which we make true above if
-                // the caller didn't). Namely, we're only here when 'utf8empty'
-                // is true, and when that's true, we require slots for every
-                // pattern.
-                (pid, slots[slot_end].unwrap().get())
-            }
+            Some(hm) if !utf8empty => return Some(hm),
+            Some(hm) => hm,
         };
-        empty::skip_splits_fwd(input, pid, end, |input| {
-            let pid = match self.search_imp(cache, input, slots) {
-                None => return Ok(None),
-                Some(pid) => pid,
-            };
-            let slot_start = pid.as_usize() * 2;
-            let slot_end = slot_start + 1;
-            Ok(Some((pid, slots[slot_end].unwrap().get())))
+        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+            Ok(self
+                .search_imp(cache, input, slots)
+                .map(|hm| (hm, hm.offset())))
         })
         // OK because the PikeVM never errors.
         .unwrap()
@@ -1235,7 +1226,7 @@ impl PikeVM {
         cache: &mut Cache,
         input: &Input<'_>,
         slots: &mut [Option<NonMaxUsize>],
-    ) -> Option<PatternID> {
+    ) -> Option<HalfMatch> {
         cache.setup_search(slots.len());
         if input.is_done() {
             return None;
@@ -1264,7 +1255,7 @@ impl PikeVM {
         let pre =
             if anchored { None } else { self.get_config().get_prefilter() };
         let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
-        let mut pid = None;
+        let mut hm = None;
         // Yes, our search doesn't end at input.end(), but includes it. This
         // is necessary because matches are delayed by one byte, just like
         // how the DFA engines work. The delay is used to handle look-behind
@@ -1283,7 +1274,7 @@ impl PikeVM {
             if curr.set.is_empty() {
                 // We have a match and we haven't been instructed to continue
                 // on even after finding a match, so we can quit.
-                if pid.is_some() && !allmatches {
+                if hm.is_some() && !allmatches {
                     break;
                 }
                 // If we're running an anchored search and we've advanced
@@ -1353,7 +1344,7 @@ impl PikeVM {
             // search. If we re-computed it at every position, we would be
             // simulating an unanchored search when we were tasked to perform
             // an anchored search.
-            if (!pid.is_some() || allmatches)
+            if (!hm.is_some() || allmatches)
                 && (!anchored || at == input.start())
             {
                 // Since we are adding to the 'curr' active states and since
@@ -1372,22 +1363,23 @@ impl PikeVM {
                 let slots = next.slot_table.all_absent();
                 self.epsilon_closure(stack, slots, curr, input, at, start_id);
             }
-            if let Some(x) = self.nexts(stack, curr, next, input, at, slots) {
-                pid = Some(x);
+            if let Some(pid) = self.nexts(stack, curr, next, input, at, slots)
+            {
+                hm = Some(HalfMatch::new(pid, at));
             }
             // Unless the caller asked us to return early, we need to mush on
             // to see if we can extend our match. (But note that 'nexts' will
             // quit right after seeing a match when match_kind==LeftmostFirst,
             // as is consistent with leftmost-first match priority.)
-            if input.get_earliest() && pid.is_some() {
+            if input.get_earliest() && hm.is_some() {
                 break;
             }
             core::mem::swap(curr, next);
             next.set.clear();
             at += 1;
         }
         instrument!(|c| c.eprint(&self.nfa));
-        pid
+        hm
     }
 
     /// The implementation for the 'which_overlapping_matches' API. Basically,