Skip to content

Commit d05cf6d

Browse files
tlm365 and alamb authored
Implement native support StringViewArray for regexp_is_match and regexp_is_match_scalar function, deprecate regexp_is_match_utf8 and regexp_is_match_utf8_scalar (#6376)
* Implement native support StringViewArray for regexp_is_match function * Update test cases to cover StringViewArray length more than 12 bytes * Add StringView benchmark for regexp_is_match Signed-off-by: Tai Le Manh <[email protected]> * Implement native support StringViewArray for regexp_is_match function Signed-off-by: Tai Le Manh <[email protected]> * Remove duplicate implementation, fix clippy, add more docs --------- Signed-off-by: Tai Le Manh <[email protected]> Co-authored-by: Andrew Lamb <[email protected]>
1 parent c90713b commit d05cf6d

File tree

4 files changed

+261
-39
lines changed

4 files changed

+261
-39
lines changed

arrow-string/src/like.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray, Arr
155155
///
156156
/// This trait helps to abstract over the different types of string arrays
157157
/// so that we don't need to duplicate the implementation for each type.
158-
trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
158+
pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
159159
fn is_ascii(&self) -> bool;
160160
fn iter(&self) -> ArrayIter<Self>;
161161
}

arrow-string/src/regexp.rs

Lines changed: 203 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,16 @@
1818
//! Defines kernel to extract substrings based on a regular
1919
//! expression of a \[Large\]StringArray
2020
21+
use crate::like::StringArrayType;
22+
2123
use arrow_array::builder::{BooleanBufferBuilder, GenericStringBuilder, ListBuilder};
2224
use arrow_array::cast::AsArray;
2325
use arrow_array::*;
2426
use arrow_buffer::NullBuffer;
2527
use arrow_data::{ArrayData, ArrayDataBuilder};
2628
use arrow_schema::{ArrowError, DataType, Field};
2729
use regex::Regex;
30+
2831
use std::collections::HashMap;
2932
use std::sync::Arc;
3033

@@ -35,16 +38,64 @@ use std::sync::Arc;
3538
/// special search modes, such as case insensitive and multi-line mode.
3639
/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
3740
/// for more information.
41+
#[deprecated(since = "54.0.0", note = "please use `regexp_is_match` instead")]
3842
pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
3943
array: &GenericStringArray<OffsetSize>,
4044
regex_array: &GenericStringArray<OffsetSize>,
4145
flags_array: Option<&GenericStringArray<OffsetSize>>,
4246
) -> Result<BooleanArray, ArrowError> {
47+
regexp_is_match(array, regex_array, flags_array)
48+
}
49+
50+
/// Return BooleanArray indicating which strings in an array match an array of
51+
/// regular expressions.
52+
///
53+
/// This is equivalent to the SQL `array ~ regex_array`, supporting
54+
/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`].
55+
///
56+
/// If a `regex_array` element is an empty string, the corresponding result value is always true.
57+
///
58+
/// `flags_array` is an optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] of flags,
59+
/// which allow special search modes, such as case-insensitive and multi-line mode.
60+
/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
61+
/// for more information.
62+
///
63+
/// # See Also
64+
/// * [`regexp_is_match_scalar`] for matching a single regular expression against an array of strings
65+
/// * [`regexp_match`] for extracting groups from a string array based on a regular expression
66+
///
67+
/// # Example
68+
/// ```
69+
/// # use arrow_array::{StringArray, BooleanArray};
70+
/// # use arrow_string::regexp::regexp_is_match;
71+
/// // First array is the array of strings to match
72+
/// let array = StringArray::from(vec!["Foo", "Bar", "FooBar", "Baz"]);
73+
/// // Second array is the array of regular expressions to match against
74+
/// let regex_array = StringArray::from(vec!["^Foo", "^Foo", "Bar$", "Baz"]);
75+
/// // Third array is the array of flags to use for each regular expression, if desired
76+
/// // (the type must be provided to satisfy type inference for the third parameter)
77+
/// let flags_array: Option<&StringArray> = None;
78+
/// // The result is a BooleanArray indicating when each string in `array`
79+
/// // matches the corresponding regular expression in `regex_array`
80+
/// let result = regexp_is_match(&array, &regex_array, flags_array).unwrap();
81+
/// assert_eq!(result, BooleanArray::from(vec![true, false, true, true]));
82+
/// ```
83+
pub fn regexp_is_match<'a, S1, S2, S3>(
84+
array: &'a S1,
85+
regex_array: &'a S2,
86+
flags_array: Option<&'a S3>,
87+
) -> Result<BooleanArray, ArrowError>
88+
where
89+
&'a S1: StringArrayType<'a>,
90+
&'a S2: StringArrayType<'a>,
91+
&'a S3: StringArrayType<'a>,
92+
{
4393
if array.len() != regex_array.len() {
4494
return Err(ArrowError::ComputeError(
4595
"Cannot perform comparison operation on arrays of different length".to_string(),
4696
));
4797
}
98+
4899
let nulls = NullBuffer::union(array.nulls(), regex_array.nulls());
49100

50101
let mut patterns: HashMap<String, Regex> = HashMap::new();
@@ -107,25 +158,63 @@ pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
107158
.nulls(nulls)
108159
.build_unchecked()
109160
};
161+
110162
Ok(BooleanArray::from(data))
111163
}
112164

113165
/// Perform SQL `array ~ regex_array` operation on [`StringArray`] /
114166
/// [`LargeStringArray`] and a scalar.
115167
///
116168
/// See the documentation on [`regexp_is_match_utf8`] for more details.
169+
#[deprecated(since = "54.0.0", note = "please use `regexp_is_match_scalar` instead")]
117170
pub fn regexp_is_match_utf8_scalar<OffsetSize: OffsetSizeTrait>(
118171
array: &GenericStringArray<OffsetSize>,
119172
regex: &str,
120173
flag: Option<&str>,
121174
) -> Result<BooleanArray, ArrowError> {
175+
regexp_is_match_scalar(array, regex, flag)
176+
}
177+
178+
/// Return BooleanArray indicating which strings in an array match a single regular expression.
179+
///
180+
/// This is equivalent to the SQL `array ~ regex`, supporting
181+
/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] and a scalar.
182+
///
183+
/// See the documentation on [`regexp_is_match`] for more details on arguments.
184+
///
185+
/// # See Also
186+
/// * [`regexp_is_match`] for matching an array of regular expressions against an array of strings
187+
/// * [`regexp_match`] for extracting groups from a string array based on a regular expression
188+
///
189+
/// # Example
190+
/// ```
191+
/// # use arrow_array::{StringArray, BooleanArray};
192+
/// # use arrow_string::regexp::regexp_is_match_scalar;
193+
/// // array of strings to match
194+
/// let array = StringArray::from(vec!["Foo", "Bar", "FooBar", "Baz"]);
195+
/// let regexp = "^Foo"; // regular expression to match against
196+
/// let flags: Option<&str> = None; // flags can control the matching behavior
197+
/// // The result is a BooleanArray indicating when each string in `array`
198+
/// // matches the regular expression `regexp`
199+
/// let result = regexp_is_match_scalar(&array, regexp, flags).unwrap();
200+
/// assert_eq!(result, BooleanArray::from(vec![true, false, true, false]));
201+
/// ```
202+
pub fn regexp_is_match_scalar<'a, S>(
203+
array: &'a S,
204+
regex: &str,
205+
flag: Option<&str>,
206+
) -> Result<BooleanArray, ArrowError>
207+
where
208+
&'a S: StringArrayType<'a>,
209+
{
122210
let null_bit_buffer = array.nulls().map(|x| x.inner().sliced());
123211
let mut result = BooleanBufferBuilder::new(array.len());
124212

125213
let pattern = match flag {
126214
Some(flag) => format!("(?{flag}){regex}"),
127215
None => regex.to_string(),
128216
};
217+
129218
if pattern.is_empty() {
130219
result.append_n(array.len(), true);
131220
} else {
@@ -150,6 +239,7 @@ pub fn regexp_is_match_utf8_scalar<OffsetSize: OffsetSizeTrait>(
150239
vec![],
151240
)
152241
};
242+
153243
Ok(BooleanArray::from(data))
154244
}
155245

@@ -303,6 +393,9 @@ fn regexp_scalar_match<OffsetSize: OffsetSizeTrait>(
303393
/// The flags parameter is an optional text string containing zero or more single-letter flags
304394
/// that change the function's behavior.
305395
///
396+
/// # See Also
397+
/// * [`regexp_is_match`] for matching (rather than extracting) a regular expression against an array of strings
398+
///
306399
/// [regexp_match]: https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP
307400
pub fn regexp_match(
308401
array: &dyn Array,
@@ -517,8 +610,8 @@ mod tests {
517610
($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
518611
#[test]
519612
fn $test_name() {
520-
let left = StringArray::from($left);
521-
let right = StringArray::from($right);
613+
let left = $left;
614+
let right = $right;
522615
let res = $op(&left, &right, None).unwrap();
523616
let expected = $expected;
524617
assert_eq!(expected.len(), res.len());
@@ -531,9 +624,9 @@ mod tests {
531624
($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => {
532625
#[test]
533626
fn $test_name() {
534-
let left = StringArray::from($left);
535-
let right = StringArray::from($right);
536-
let flag = Some(StringArray::from($flag));
627+
let left = $left;
628+
let right = $right;
629+
let flag = Some($flag);
537630
let res = $op(&left, &right, flag.as_ref()).unwrap();
538631
let expected = $expected;
539632
assert_eq!(expected.len(), res.len());
@@ -549,7 +642,7 @@ mod tests {
549642
($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
550643
#[test]
551644
fn $test_name() {
552-
let left = StringArray::from($left);
645+
let left = $left;
553646
let res = $op(&left, $right, None).unwrap();
554647
let expected = $expected;
555648
assert_eq!(expected.len(), res.len());
@@ -569,7 +662,7 @@ mod tests {
569662
($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => {
570663
#[test]
571664
fn $test_name() {
572-
let left = StringArray::from($left);
665+
let left = $left;
573666
let flag = Some($flag);
574667
let res = $op(&left, $right, flag).unwrap();
575668
let expected = $expected;
@@ -590,41 +683,126 @@ mod tests {
590683
}
591684

592685
test_flag_utf8!(
593-
test_utf8_array_regexp_is_match,
594-
vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"],
595-
vec!["^ar", "^AR", "ow$", "OW$", "foo", ""],
596-
regexp_is_match_utf8,
686+
test_array_regexp_is_match_utf8,
687+
StringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
688+
StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
689+
regexp_is_match::<StringArray, StringArray, StringArray>,
597690
[true, false, true, false, false, true]
598691
);
599692
test_flag_utf8!(
600-
test_utf8_array_regexp_is_match_insensitive,
601-
vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"],
602-
vec!["^ar", "^AR", "ow$", "OW$", "foo", ""],
603-
vec!["i"; 6],
604-
regexp_is_match_utf8,
693+
test_array_regexp_is_match_utf8_insensitive,
694+
StringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
695+
StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
696+
StringArray::from(vec!["i"; 6]),
697+
regexp_is_match,
605698
[true, true, true, true, false, true]
606699
);
607700

608701
test_flag_utf8_scalar!(
609-
test_utf8_array_regexp_is_match_scalar,
610-
vec!["arrow", "ARROW", "parquet", "PARQUET"],
702+
test_array_regexp_is_match_utf8_scalar,
703+
StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]),
611704
"^ar",
612-
regexp_is_match_utf8_scalar,
705+
regexp_is_match_scalar,
613706
[true, false, false, false]
614707
);
615708
test_flag_utf8_scalar!(
616-
test_utf8_array_regexp_is_match_empty_scalar,
617-
vec!["arrow", "ARROW", "parquet", "PARQUET"],
709+
test_array_regexp_is_match_utf8_scalar_empty,
710+
StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]),
618711
"",
619-
regexp_is_match_utf8_scalar,
712+
regexp_is_match_scalar,
620713
[true, true, true, true]
621714
);
622715
test_flag_utf8_scalar!(
623-
test_utf8_array_regexp_is_match_insensitive_scalar,
624-
vec!["arrow", "ARROW", "parquet", "PARQUET"],
716+
test_array_regexp_is_match_utf8_scalar_insensitive,
717+
StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]),
625718
"^ar",
626719
"i",
627-
regexp_is_match_utf8_scalar,
720+
regexp_is_match_scalar,
721+
[true, true, false, false]
722+
);
723+
724+
test_flag_utf8!(
725+
test_array_regexp_is_match,
726+
StringViewArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
727+
StringViewArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
728+
regexp_is_match::<StringViewArray, StringViewArray, StringViewArray>,
729+
[true, false, true, false, false, true]
730+
);
731+
test_flag_utf8!(
732+
test_array_regexp_is_match_2,
733+
StringViewArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
734+
StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
735+
regexp_is_match::<StringViewArray, GenericStringArray<i32>, GenericStringArray<i32>>,
736+
[true, false, true, false, false, true]
737+
);
738+
test_flag_utf8!(
739+
test_array_regexp_is_match_insensitive,
740+
StringViewArray::from(vec![
741+
"Official Rust implementation of Apache Arrow",
742+
"apache/arrow-rs",
743+
"apache/arrow-rs",
744+
"parquet",
745+
"parquet",
746+
"row",
747+
"row",
748+
]),
749+
StringViewArray::from(vec![
750+
".*rust implement.*",
751+
"^ap",
752+
"^AP",
753+
"et$",
754+
"ET$",
755+
"foo",
756+
""
757+
]),
758+
StringViewArray::from(vec!["i"; 7]),
759+
regexp_is_match::<StringViewArray, StringViewArray, StringViewArray>,
760+
[true, true, true, true, true, false, true]
761+
);
762+
test_flag_utf8!(
763+
test_array_regexp_is_match_insensitive_2,
764+
LargeStringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
765+
StringViewArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
766+
StringArray::from(vec!["i"; 6]),
767+
regexp_is_match::<GenericStringArray<i64>, StringViewArray, GenericStringArray<i32>>,
768+
[true, true, true, true, false, true]
769+
);
770+
771+
test_flag_utf8_scalar!(
772+
test_array_regexp_is_match_scalar,
773+
StringViewArray::from(vec![
774+
"apache/arrow-rs",
775+
"APACHE/ARROW-RS",
776+
"parquet",
777+
"PARQUET",
778+
]),
779+
"^ap",
780+
regexp_is_match_scalar::<StringViewArray>,
781+
[true, false, false, false]
782+
);
783+
test_flag_utf8_scalar!(
784+
test_array_regexp_is_match_scalar_empty,
785+
StringViewArray::from(vec![
786+
"apache/arrow-rs",
787+
"APACHE/ARROW-RS",
788+
"parquet",
789+
"PARQUET",
790+
]),
791+
"",
792+
regexp_is_match_scalar::<StringViewArray>,
793+
[true, true, true, true]
794+
);
795+
test_flag_utf8_scalar!(
796+
test_array_regexp_is_match_scalar_insensitive,
797+
StringViewArray::from(vec![
798+
"apache/arrow-rs",
799+
"APACHE/ARROW-RS",
800+
"parquet",
801+
"PARQUET",
802+
]),
803+
"^ap",
804+
"i",
805+
regexp_is_match_scalar::<StringViewArray>,
628806
[true, true, false, false]
629807
);
630808
}

0 commit comments

Comments
 (0)