Skip to content

Commit 53fc9e1

Browse files
laurencer and facebook-github-bot
authored and committed
feat(function): Add varbinary variants for strpos and contains (#15809)
Summary: Adds `strpos`, `strrpos` and `contains` variants for varbinary columns. These allow efficient string searching in encoded text data when we don't want to pay the charset-decoding price: the column does not have to be converted to UTF-8, and plain byte matching is used as an approximation. Reviewed By: kaikalur. Differential Revision: D89479354
1 parent b325a22 commit 53fc9e1

File tree

7 files changed

+527
-0
lines changed

7 files changed

+527
-0
lines changed

velox/expression/fuzzer/ExpressionFuzzerTest.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,14 @@ std::unordered_set<std::string> skipFunctions = {
191191
// https://github.com/facebookincubator/velox/issues/13047
192192
"inverse_poisson_cdf",
193193
"map_subset", // https://github.com/facebookincubator/velox/issues/12654
194+
// Presto doesn't support varbinary variants of strpos, strrpos and
195+
// contains. Presto only has contains(array(T), T), not contains(varbinary,
196+
// varbinary).
197+
"strpos(varbinary,varbinary) -> bigint",
198+
"strpos(varbinary,varbinary,bigint) -> bigint",
199+
"strrpos(varbinary,varbinary) -> bigint",
200+
"strrpos(varbinary,varbinary,bigint) -> bigint",
201+
"contains(varbinary,varbinary) -> boolean",
194202
// JSON not supported, Real doesn't match exactly, etc.
195203
"array_join(array(json),varchar) -> varchar",
196204
"array_join(array(json),varchar,varchar) -> varchar",
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include "velox/functions/Macros.h"
19+
#include "velox/functions/lib/string/StringCore.h"
20+
21+
namespace facebook::velox::functions {
22+
23+
/// strpos and strrpos functions for varbinary
24+
/// strpos(varbinary, varbinary) → bigint
25+
/// Returns the starting byte position of the first instance of the pattern
26+
/// in the binary data. Positions start with 1. If not found, 0 is returned.
27+
/// strpos(varbinary, varbinary, instance) → bigint
28+
/// Returns the byte position of the N-th instance of the pattern.
29+
/// instance must be a positive number. Positions start with 1. If not
30+
/// found, 0 is returned.
31+
/// strrpos(varbinary, varbinary) → bigint
32+
/// Returns the starting byte position of the first instance of the pattern
33+
/// in the binary data counting from the end. Positions start with 1. If not
34+
/// found, 0 is returned.
35+
/// strrpos(varbinary, varbinary, instance) → bigint
36+
/// Returns the byte position of the N-th instance of the pattern
37+
/// counting from the end. Instance must be a positive number. Positions
38+
/// start with 1. If not found, 0 is returned.
39+
template <typename T, bool lpos>
40+
struct StrPosVarbinaryFunctionBase {
41+
VELOX_DEFINE_FUNCTION_TYPES(T);
42+
43+
FOLLY_ALWAYS_INLINE void call(
44+
out_type<int64_t>& result,
45+
const arg_type<Varbinary>& haystack,
46+
const arg_type<Varbinary>& needle,
47+
const arg_type<int64_t>& instance = 1) {
48+
VELOX_USER_CHECK_GT(instance, 0, "'instance' must be a positive number");
49+
if (needle.size() == 0) {
50+
result = 1;
51+
return;
52+
}
53+
54+
int64_t byteIndex = -1;
55+
if constexpr (lpos) {
56+
byteIndex = stringCore::findNthInstanceByteIndexFromStart(
57+
std::string_view(haystack.data(), haystack.size()),
58+
std::string_view(needle.data(), needle.size()),
59+
instance);
60+
} else {
61+
byteIndex = stringCore::findNthInstanceByteIndexFromEnd(
62+
std::string_view(haystack.data(), haystack.size()),
63+
std::string_view(needle.data(), needle.size()),
64+
instance);
65+
}
66+
67+
// Return 1-based byte position, or 0 if not found.
68+
result = byteIndex == -1 ? 0 : byteIndex + 1;
69+
}
70+
};
71+
72+
template <typename T>
73+
struct StrLPosVarbinaryFunction : public StrPosVarbinaryFunctionBase<T, true> {
74+
};
75+
76+
template <typename T>
77+
struct StrRPosVarbinaryFunction : public StrPosVarbinaryFunctionBase<T, false> {
78+
};
79+
80+
/// contains for varbinary - returns true if the pattern exists in the binary
81+
/// data contains(varbinary, varbinary) → boolean
82+
template <typename T>
83+
struct ContainsVarbinaryFunction {
84+
VELOX_DEFINE_FUNCTION_TYPES(T);
85+
86+
FOLLY_ALWAYS_INLINE void call(
87+
out_type<bool>& result,
88+
const arg_type<Varbinary>& haystack,
89+
const arg_type<Varbinary>& needle) {
90+
if (needle.size() == 0) {
91+
result = true;
92+
return;
93+
}
94+
auto pos = std::string_view(haystack.data(), haystack.size())
95+
.find(std::string_view(needle.data(), needle.size()));
96+
result = pos != std::string_view::npos;
97+
}
98+
};
99+
100+
} // namespace facebook::velox::functions

velox/functions/prestosql/registration/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ velox_add_library(
4242
TDigestFunctionsRegistration.cpp
4343
QDigestFunctionsRegistration.cpp
4444
URLFunctionsRegistration.cpp
45+
VarbinaryFunctionsRegistration.cpp
4546
)
4647

4748
if(VELOX_ENABLE_GEO)

velox/functions/prestosql/registration/RegistrationFunctions.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ extern void registerFloatingPointFunctions(const std::string& prefix);
4141
extern void registerJsonFunctions(const std::string& prefix);
4242
extern void registerMapFunctions(const std::string& prefix);
4343
extern void registerStringFunctions(const std::string& prefix);
44+
extern void registerVarbinaryFunctions(const std::string& prefix);
4445
extern void registerBinaryFunctions(const std::string& prefix);
4546
extern void registerURLFunctions(const std::string& prefix);
4647
extern void registerDataSizeFunctions(const std::string& prefix);
@@ -139,6 +140,10 @@ void registerStringFunctions(const std::string& prefix) {
139140
functions::registerStringFunctions(prefix);
140141
}
141142

143+
// Forwards to functions::registerVarbinaryFunctions, passing 'prefix'
// through unchanged.
void registerVarbinaryFunctions(const std::string& prefix) {
144+
functions::registerVarbinaryFunctions(prefix);
145+
}
146+
142147
// Forwards to functions::registerBinaryFunctions, passing 'prefix' through
// unchanged.
void registerBinaryFunctions(const std::string& prefix) {
143148
functions::registerBinaryFunctions(prefix);
144149
}
@@ -173,6 +178,7 @@ void registerAllScalarFunctions(const std::string& prefix) {
173178
registerDateTimeFunctions(prefix);
174179
registerURLFunctions(prefix);
175180
registerStringFunctions(prefix);
181+
registerVarbinaryFunctions(prefix);
176182
registerBinaryFunctions(prefix);
177183
registerBitwiseFunctions(prefix);
178184
registerUuidFunctions(prefix);

velox/functions/prestosql/registration/RegistrationFunctions.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ void registerURLFunctions(const std::string& prefix = "");
5353

5454
void registerStringFunctions(const std::string& prefix = "");
5555

56+
void registerVarbinaryFunctions(const std::string& prefix = "");
57+
5658
void registerBinaryFunctions(const std::string& prefix = "");
5759

5860
void registerBitwiseFunctions(const std::string& prefix = "");
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "velox/functions/Registerer.h"
17+
#include "velox/functions/prestosql/StringFunctions.h"
18+
#include "velox/functions/prestosql/VarbinaryFunctions.h"
19+
20+
namespace facebook::velox::functions {
21+
22+
/// Registers the varbinary overloads of length, substr, strpos, strrpos and
/// contains. Each function is registered under 'prefix' followed by its
/// plain name.
void registerVarbinaryFunctions(const std::string& prefix) {
  const std::string lengthName = prefix + "length";
  const std::string substrName = prefix + "substr";
  const std::string strposName = prefix + "strpos";
  const std::string strrposName = prefix + "strrpos";
  const std::string containsName = prefix + "contains";

  // NOTE(review): length and substr are presumably implemented in
  // StringFunctions.h; confirm they are not also registered by the string
  // function registration, which would make these calls redundant.

  // length(varbinary) -> bigint
  registerFunction<LengthVarbinaryFunction, int64_t, Varbinary>({lengthName});

  // substr(varbinary, bigint) and substr(varbinary, bigint, bigint)
  // -> varbinary
  registerFunction<SubstrVarbinaryFunction, Varbinary, Varbinary, int64_t>(
      {substrName});
  registerFunction<
      SubstrVarbinaryFunction,
      Varbinary,
      Varbinary,
      int64_t,
      int64_t>({substrName});

  // strpos(varbinary, varbinary[, bigint]) -> bigint
  registerFunction<StrLPosVarbinaryFunction, int64_t, Varbinary, Varbinary>(
      {strposName});
  registerFunction<
      StrLPosVarbinaryFunction,
      int64_t,
      Varbinary,
      Varbinary,
      int64_t>({strposName});

  // strrpos(varbinary, varbinary[, bigint]) -> bigint
  registerFunction<StrRPosVarbinaryFunction, int64_t, Varbinary, Varbinary>(
      {strrposName});
  registerFunction<
      StrRPosVarbinaryFunction,
      int64_t,
      Varbinary,
      Varbinary,
      int64_t>({strrposName});

  // contains(varbinary, varbinary) -> boolean
  registerFunction<ContainsVarbinaryFunction, bool, Varbinary, Varbinary>(
      {containsName});
}
59+
} // namespace facebook::velox::functions

0 commit comments

Comments
 (0)