18
18
//! Defines kernel to extract substrings based on a regular
19
19
//! expression of a \[Large\]StringArray
20
20
21
+ use crate :: like:: StringArrayType ;
22
+
21
23
use arrow_array:: builder:: { BooleanBufferBuilder , GenericStringBuilder , ListBuilder } ;
22
24
use arrow_array:: cast:: AsArray ;
23
25
use arrow_array:: * ;
24
26
use arrow_buffer:: NullBuffer ;
25
27
use arrow_data:: { ArrayData , ArrayDataBuilder } ;
26
28
use arrow_schema:: { ArrowError , DataType , Field } ;
27
29
use regex:: Regex ;
30
+
28
31
use std:: collections:: HashMap ;
29
32
use std:: sync:: Arc ;
30
33
@@ -35,16 +38,64 @@ use std::sync::Arc;
35
38
/// special search modes, such as case insensitive and multi-line mode.
36
39
/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
37
40
/// for more information.
41
+ #[ deprecated( since = "54.0.0" , note = "please use `regex_is_match` instead" ) ]
38
42
pub fn regexp_is_match_utf8 < OffsetSize : OffsetSizeTrait > (
39
43
array : & GenericStringArray < OffsetSize > ,
40
44
regex_array : & GenericStringArray < OffsetSize > ,
41
45
flags_array : Option < & GenericStringArray < OffsetSize > > ,
42
46
) -> Result < BooleanArray , ArrowError > {
47
+ regexp_is_match ( array, regex_array, flags_array)
48
+ }
49
+
50
+ /// Return BooleanArray indicating which strings in an array match an array of
51
+ /// regular expressions.
52
+ ///
53
+ /// This is equivalent to the SQL `array ~ regex_array`, supporting
54
+ /// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`].
55
+ ///
56
+ /// If `regex_array` element has an empty value, the corresponding result value is always true.
57
+ ///
58
+ /// `flags_array` is an optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] of flags,
59
+ /// which allow special search modes, such as case-insensitive and multi-line mode.
60
+ /// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
61
+ /// for more information.
62
+ ///
63
+ /// # See Also
64
+ /// * [`regexp_is_match_scalar`] for matching a single regular expression against an array of strings
65
+ /// * [`regexp_match`] for extracting groups from a string array based on a regular expression
66
+ ///
67
+ /// # Example
68
+ /// ```
69
+ /// # use arrow_array::{StringArray, BooleanArray};
70
+ /// # use arrow_string::regexp::regexp_is_match;
71
+ /// // First array is the array of strings to match
72
+ /// let array = StringArray::from(vec!["Foo", "Bar", "FooBar", "Baz"]);
73
+ /// // Second array is the array of regular expressions to match against
74
+ /// let regex_array = StringArray::from(vec!["^Foo", "^Foo", "Bar$", "Baz"]);
75
+ /// // Third array is the array of flags to use for each regular expression, if desired
76
+ /// // (the type must be provided to satisfy type inference for the third parameter)
77
+ /// let flags_array: Option<&StringArray> = None;
78
+ /// // The result is a BooleanArray indicating when each string in `array`
79
+ /// // matches the corresponding regular expression in `regex_array`
80
+ /// let result = regexp_is_match(&array, &regex_array, flags_array).unwrap();
81
+ /// assert_eq!(result, BooleanArray::from(vec![true, false, true, true]));
82
+ /// ```
83
+ pub fn regexp_is_match < ' a , S1 , S2 , S3 > (
84
+ array : & ' a S1 ,
85
+ regex_array : & ' a S2 ,
86
+ flags_array : Option < & ' a S3 > ,
87
+ ) -> Result < BooleanArray , ArrowError >
88
+ where
89
+ & ' a S1 : StringArrayType < ' a > ,
90
+ & ' a S2 : StringArrayType < ' a > ,
91
+ & ' a S3 : StringArrayType < ' a > ,
92
+ {
43
93
if array. len ( ) != regex_array. len ( ) {
44
94
return Err ( ArrowError :: ComputeError (
45
95
"Cannot perform comparison operation on arrays of different length" . to_string ( ) ,
46
96
) ) ;
47
97
}
98
+
48
99
let nulls = NullBuffer :: union ( array. nulls ( ) , regex_array. nulls ( ) ) ;
49
100
50
101
let mut patterns: HashMap < String , Regex > = HashMap :: new ( ) ;
@@ -107,25 +158,63 @@ pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
107
158
. nulls ( nulls)
108
159
. build_unchecked ( )
109
160
} ;
161
+
110
162
Ok ( BooleanArray :: from ( data) )
111
163
}
112
164
113
165
/// Perform SQL `array ~ regex_array` operation on [`StringArray`] /
114
166
/// [`LargeStringArray`] and a scalar.
115
167
///
116
168
/// See the documentation on [`regexp_is_match_utf8`] for more details.
169
+ #[ deprecated( since = "54.0.0" , note = "please use `regex_is_match_scalar` instead" ) ]
117
170
pub fn regexp_is_match_utf8_scalar < OffsetSize : OffsetSizeTrait > (
118
171
array : & GenericStringArray < OffsetSize > ,
119
172
regex : & str ,
120
173
flag : Option < & str > ,
121
174
) -> Result < BooleanArray , ArrowError > {
175
+ regexp_is_match_scalar ( array, regex, flag)
176
+ }
177
+
178
+ /// Return BooleanArray indicating which strings in an array match a single regular expression.
179
+ ///
180
+ /// This is equivalent to the SQL `array ~ regex_array`, supporting
181
+ /// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] and a scalar.
182
+ ///
183
+ /// See the documentation on [`regexp_is_match`] for more details on arguments.
184
+ ///
185
+ /// # See Also
186
+ /// * [`regexp_is_match`] for matching an array of regular expressions against an array of strings
187
+ /// * [`regexp_match`] for extracting groups from a string array based on a regular expression
188
+ ///
189
+ /// # Example
190
+ /// ```
191
+ /// # use arrow_array::{StringArray, BooleanArray};
192
+ /// # use arrow_string::regexp::regexp_is_match_scalar;
193
+ /// // array of strings to match
194
+ /// let array = StringArray::from(vec!["Foo", "Bar", "FooBar", "Baz"]);
195
+ /// let regexp = "^Foo"; // regular expression to match against
196
+ /// let flags: Option<&str> = None; // flags can control the matching behavior
197
+ /// // The result is a BooleanArray indicating when each string in `array`
198
+ /// // matches the regular expression `regexp`
199
+ /// let result = regexp_is_match_scalar(&array, regexp, flags).unwrap();
200
+ /// assert_eq!(result, BooleanArray::from(vec![true, false, true, false]));
201
+ /// ```
202
+ pub fn regexp_is_match_scalar < ' a , S > (
203
+ array : & ' a S ,
204
+ regex : & str ,
205
+ flag : Option < & str > ,
206
+ ) -> Result < BooleanArray , ArrowError >
207
+ where
208
+ & ' a S : StringArrayType < ' a > ,
209
+ {
122
210
let null_bit_buffer = array. nulls ( ) . map ( |x| x. inner ( ) . sliced ( ) ) ;
123
211
let mut result = BooleanBufferBuilder :: new ( array. len ( ) ) ;
124
212
125
213
let pattern = match flag {
126
214
Some ( flag) => format ! ( "(?{flag}){regex}" ) ,
127
215
None => regex. to_string ( ) ,
128
216
} ;
217
+
129
218
if pattern. is_empty ( ) {
130
219
result. append_n ( array. len ( ) , true ) ;
131
220
} else {
@@ -150,6 +239,7 @@ pub fn regexp_is_match_utf8_scalar<OffsetSize: OffsetSizeTrait>(
150
239
vec ! [ ] ,
151
240
)
152
241
} ;
242
+
153
243
Ok ( BooleanArray :: from ( data) )
154
244
}
155
245
@@ -303,6 +393,9 @@ fn regexp_scalar_match<OffsetSize: OffsetSizeTrait>(
303
393
/// The flags parameter is an optional text string containing zero or more single-letter flags
304
394
/// that change the function's behavior.
305
395
///
396
+ /// # See Also
397
+ /// * [`regexp_is_match`] for matching (rather than extracting) a regular expression against an array of strings
398
+ ///
306
399
/// [regexp_match]: https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP
307
400
pub fn regexp_match (
308
401
array : & dyn Array ,
@@ -517,8 +610,8 @@ mod tests {
517
610
( $test_name: ident, $left: expr, $right: expr, $op: expr, $expected: expr) => {
518
611
#[ test]
519
612
fn $test_name( ) {
520
- let left = StringArray :: from ( $left) ;
521
- let right = StringArray :: from ( $right) ;
613
+ let left = $left;
614
+ let right = $right;
522
615
let res = $op( & left, & right, None ) . unwrap( ) ;
523
616
let expected = $expected;
524
617
assert_eq!( expected. len( ) , res. len( ) ) ;
@@ -531,9 +624,9 @@ mod tests {
531
624
( $test_name: ident, $left: expr, $right: expr, $flag: expr, $op: expr, $expected: expr) => {
532
625
#[ test]
533
626
fn $test_name( ) {
534
- let left = StringArray :: from ( $left) ;
535
- let right = StringArray :: from ( $right) ;
536
- let flag = Some ( StringArray :: from ( $flag) ) ;
627
+ let left = $left;
628
+ let right = $right;
629
+ let flag = Some ( $flag) ;
537
630
let res = $op( & left, & right, flag. as_ref( ) ) . unwrap( ) ;
538
631
let expected = $expected;
539
632
assert_eq!( expected. len( ) , res. len( ) ) ;
@@ -549,7 +642,7 @@ mod tests {
549
642
( $test_name: ident, $left: expr, $right: expr, $op: expr, $expected: expr) => {
550
643
#[ test]
551
644
fn $test_name( ) {
552
- let left = StringArray :: from ( $left) ;
645
+ let left = $left;
553
646
let res = $op( & left, $right, None ) . unwrap( ) ;
554
647
let expected = $expected;
555
648
assert_eq!( expected. len( ) , res. len( ) ) ;
@@ -569,7 +662,7 @@ mod tests {
569
662
( $test_name: ident, $left: expr, $right: expr, $flag: expr, $op: expr, $expected: expr) => {
570
663
#[ test]
571
664
fn $test_name( ) {
572
- let left = StringArray :: from ( $left) ;
665
+ let left = $left;
573
666
let flag = Some ( $flag) ;
574
667
let res = $op( & left, $right, flag) . unwrap( ) ;
575
668
let expected = $expected;
@@ -590,41 +683,126 @@ mod tests {
590
683
}
591
684
592
685
test_flag_utf8 ! (
593
- test_utf8_array_regexp_is_match ,
594
- vec![ "arrow" , "arrow" , "arrow" , "arrow" , "arrow" , "arrow" ] ,
595
- vec![ "^ar" , "^AR" , "ow$" , "OW$" , "foo" , "" ] ,
596
- regexp_is_match_utf8 ,
686
+ test_array_regexp_is_match_utf8 ,
687
+ StringArray :: from ( vec![ "arrow" , "arrow" , "arrow" , "arrow" , "arrow" , "arrow" ] ) ,
688
+ StringArray :: from ( vec![ "^ar" , "^AR" , "ow$" , "OW$" , "foo" , "" ] ) ,
689
+ regexp_is_match :: < StringArray , StringArray , StringArray > ,
597
690
[ true , false , true , false , false , true ]
598
691
) ;
599
692
test_flag_utf8 ! (
600
- test_utf8_array_regexp_is_match_insensitive ,
601
- vec![ "arrow" , "arrow" , "arrow" , "arrow" , "arrow" , "arrow" ] ,
602
- vec![ "^ar" , "^AR" , "ow$" , "OW$" , "foo" , "" ] ,
603
- vec![ "i" ; 6 ] ,
604
- regexp_is_match_utf8 ,
693
+ test_array_regexp_is_match_utf8_insensitive ,
694
+ StringArray :: from ( vec![ "arrow" , "arrow" , "arrow" , "arrow" , "arrow" , "arrow" ] ) ,
695
+ StringArray :: from ( vec![ "^ar" , "^AR" , "ow$" , "OW$" , "foo" , "" ] ) ,
696
+ StringArray :: from ( vec![ "i" ; 6 ] ) ,
697
+ regexp_is_match ,
605
698
[ true , true , true , true , false , true ]
606
699
) ;
607
700
608
701
test_flag_utf8_scalar ! (
609
- test_utf8_array_regexp_is_match_scalar ,
610
- vec![ "arrow" , "ARROW" , "parquet" , "PARQUET" ] ,
702
+ test_array_regexp_is_match_utf8_scalar ,
703
+ StringArray :: from ( vec![ "arrow" , "ARROW" , "parquet" , "PARQUET" ] ) ,
611
704
"^ar" ,
612
- regexp_is_match_utf8_scalar ,
705
+ regexp_is_match_scalar ,
613
706
[ true , false , false , false ]
614
707
) ;
615
708
test_flag_utf8_scalar ! (
616
- test_utf8_array_regexp_is_match_empty_scalar ,
617
- vec![ "arrow" , "ARROW" , "parquet" , "PARQUET" ] ,
709
+ test_array_regexp_is_match_utf8_scalar_empty ,
710
+ StringArray :: from ( vec![ "arrow" , "ARROW" , "parquet" , "PARQUET" ] ) ,
618
711
"" ,
619
- regexp_is_match_utf8_scalar ,
712
+ regexp_is_match_scalar ,
620
713
[ true , true , true , true ]
621
714
) ;
622
715
test_flag_utf8_scalar ! (
623
- test_utf8_array_regexp_is_match_insensitive_scalar ,
624
- vec![ "arrow" , "ARROW" , "parquet" , "PARQUET" ] ,
716
+ test_array_regexp_is_match_utf8_scalar_insensitive ,
717
+ StringArray :: from ( vec![ "arrow" , "ARROW" , "parquet" , "PARQUET" ] ) ,
625
718
"^ar" ,
626
719
"i" ,
627
- regexp_is_match_utf8_scalar,
720
+ regexp_is_match_scalar,
721
+ [ true , true , false , false ]
722
+ ) ;
723
+
724
+ test_flag_utf8 ! (
725
+ tes_array_regexp_is_match,
726
+ StringViewArray :: from( vec![ "arrow" , "arrow" , "arrow" , "arrow" , "arrow" , "arrow" ] ) ,
727
+ StringViewArray :: from( vec![ "^ar" , "^AR" , "ow$" , "OW$" , "foo" , "" ] ) ,
728
+ regexp_is_match:: <StringViewArray , StringViewArray , StringViewArray >,
729
+ [ true , false , true , false , false , true ]
730
+ ) ;
731
+ test_flag_utf8 ! (
732
+ test_array_regexp_is_match_2,
733
+ StringViewArray :: from( vec![ "arrow" , "arrow" , "arrow" , "arrow" , "arrow" , "arrow" ] ) ,
734
+ StringArray :: from( vec![ "^ar" , "^AR" , "ow$" , "OW$" , "foo" , "" ] ) ,
735
+ regexp_is_match:: <StringViewArray , GenericStringArray <i32 >, GenericStringArray <i32 >>,
736
+ [ true , false , true , false , false , true ]
737
+ ) ;
738
+ test_flag_utf8 ! (
739
+ test_array_regexp_is_match_insensitive,
740
+ StringViewArray :: from( vec![
741
+ "Official Rust implementation of Apache Arrow" ,
742
+ "apache/arrow-rs" ,
743
+ "apache/arrow-rs" ,
744
+ "parquet" ,
745
+ "parquet" ,
746
+ "row" ,
747
+ "row" ,
748
+ ] ) ,
749
+ StringViewArray :: from( vec![
750
+ ".*rust implement.*" ,
751
+ "^ap" ,
752
+ "^AP" ,
753
+ "et$" ,
754
+ "ET$" ,
755
+ "foo" ,
756
+ ""
757
+ ] ) ,
758
+ StringViewArray :: from( vec![ "i" ; 7 ] ) ,
759
+ regexp_is_match:: <StringViewArray , StringViewArray , StringViewArray >,
760
+ [ true , true , true , true , true , false , true ]
761
+ ) ;
762
+ test_flag_utf8 ! (
763
+ test_array_regexp_is_match_insensitive_2,
764
+ LargeStringArray :: from( vec![ "arrow" , "arrow" , "arrow" , "arrow" , "arrow" , "arrow" ] ) ,
765
+ StringViewArray :: from( vec![ "^ar" , "^AR" , "ow$" , "OW$" , "foo" , "" ] ) ,
766
+ StringArray :: from( vec![ "i" ; 6 ] ) ,
767
+ regexp_is_match:: <GenericStringArray <i64 >, StringViewArray , GenericStringArray <i32 >>,
768
+ [ true , true , true , true , false , true ]
769
+ ) ;
770
+
771
+ test_flag_utf8_scalar ! (
772
+ test_array_regexp_is_match_scalar,
773
+ StringViewArray :: from( vec![
774
+ "apache/arrow-rs" ,
775
+ "APACHE/ARROW-RS" ,
776
+ "parquet" ,
777
+ "PARQUET" ,
778
+ ] ) ,
779
+ "^ap" ,
780
+ regexp_is_match_scalar:: <StringViewArray >,
781
+ [ true , false , false , false ]
782
+ ) ;
783
+ test_flag_utf8_scalar ! (
784
+ test_array_regexp_is_match_scalar_empty,
785
+ StringViewArray :: from( vec![
786
+ "apache/arrow-rs" ,
787
+ "APACHE/ARROW-RS" ,
788
+ "parquet" ,
789
+ "PARQUET" ,
790
+ ] ) ,
791
+ "" ,
792
+ regexp_is_match_scalar:: <StringViewArray >,
793
+ [ true , true , true , true ]
794
+ ) ;
795
+ test_flag_utf8_scalar ! (
796
+ test_array_regexp_is_match_scalar_insensitive,
797
+ StringViewArray :: from( vec![
798
+ "apache/arrow-rs" ,
799
+ "APACHE/ARROW-RS" ,
800
+ "parquet" ,
801
+ "PARQUET" ,
802
+ ] ) ,
803
+ "^ap" ,
804
+ "i" ,
805
+ regexp_is_match_scalar:: <StringViewArray >,
628
806
[ true , true , false , false ]
629
807
) ;
630
808
}
0 commit comments