Skip to content

Commit 3959617

Browse files
laurencer authored and facebook-github-bot committed
feat(function): add varbinary variants for strpos and contains (facebookincubator#15809)
Summary: Adds `strpos`, `strrpos`, and `contains` overloads for varbinary columns. These allow efficient substring searching in encoded text data without paying the charset-decoding cost: instead of converting the column to UTF-8 first, the functions match raw bytes as an approximation. Differential Revision: D89479354
1 parent 84b2f34 commit 3959617

File tree

5 files changed

+513
-0
lines changed

5 files changed

+513
-0
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include "velox/functions/Macros.h"
19+
#include "velox/functions/lib/string/StringCore.h"
20+
21+
namespace facebook::velox::functions {
22+
23+
/// strpos and strrpos functions for varbinary
24+
/// strpos(varbinary, varbinary) → bigint
25+
/// Returns the starting byte position of the first instance of the pattern
26+
/// in the binary data. Positions start with 1. If not found, 0 is returned.
27+
/// strpos(varbinary, varbinary, instance) → bigint
28+
/// Returns the byte position of the N-th instance of the pattern.
29+
/// instance must be a positive number. Positions start with 1. If not
30+
/// found, 0 is returned.
31+
/// strrpos(varbinary, varbinary) → bigint
32+
/// Returns the starting byte position of the first instance of the pattern
33+
/// in the binary data counting from the end. Positions start with 1. If not
34+
/// found, 0 is returned.
35+
/// strrpos(varbinary, varbinary, instance) → bigint
36+
/// Returns the byte position of the N-th instance of the pattern
37+
/// counting from the end. Instance must be a positive number. Positions
38+
/// start with 1. If not found, 0 is returned.
39+
template <typename T, bool lpos>
40+
struct StrPosVarbinaryFunctionBase {
41+
VELOX_DEFINE_FUNCTION_TYPES(T);
42+
43+
FOLLY_ALWAYS_INLINE void call(
44+
out_type<int64_t>& result,
45+
const arg_type<Varbinary>& haystack,
46+
const arg_type<Varbinary>& needle,
47+
const arg_type<int64_t>& instance = 1) {
48+
VELOX_USER_CHECK_GT(instance, 0, "'instance' must be a positive number");
49+
if (needle.size() == 0) {
50+
result = 1;
51+
return;
52+
}
53+
54+
int64_t byteIndex = -1;
55+
if constexpr (lpos) {
56+
byteIndex = stringCore::findNthInstanceByteIndexFromStart(
57+
std::string_view(haystack.data(), haystack.size()),
58+
std::string_view(needle.data(), needle.size()),
59+
instance);
60+
} else {
61+
byteIndex = stringCore::findNthInstanceByteIndexFromEnd(
62+
std::string_view(haystack.data(), haystack.size()),
63+
std::string_view(needle.data(), needle.size()),
64+
instance);
65+
}
66+
67+
// Return 1-based byte position, or 0 if not found.
68+
result = byteIndex == -1 ? 0 : byteIndex + 1;
69+
}
70+
};
71+
72+
template <typename T>
73+
struct StrLPosVarbinaryFunction : public StrPosVarbinaryFunctionBase<T, true> {
74+
};
75+
76+
template <typename T>
77+
struct StrRPosVarbinaryFunction : public StrPosVarbinaryFunctionBase<T, false> {
78+
};
79+
80+
/// contains for varbinary - returns true if the pattern exists in the binary
81+
/// data contains(varbinary, varbinary) → boolean
82+
template <typename T>
83+
struct ContainsVarbinaryFunction {
84+
VELOX_DEFINE_FUNCTION_TYPES(T);
85+
86+
FOLLY_ALWAYS_INLINE void call(
87+
out_type<bool>& result,
88+
const arg_type<Varbinary>& haystack,
89+
const arg_type<Varbinary>& needle) {
90+
if (needle.size() == 0) {
91+
result = true;
92+
return;
93+
}
94+
auto pos = std::string_view(haystack.data(), haystack.size())
95+
.find(std::string_view(needle.data(), needle.size()));
96+
result = pos != std::string_view::npos;
97+
}
98+
};
99+
100+
} // namespace facebook::velox::functions

velox/functions/prestosql/registration/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ velox_add_library(
4242
TDigestFunctionsRegistration.cpp
4343
QDigestFunctionsRegistration.cpp
4444
URLFunctionsRegistration.cpp
45+
VarbinaryFunctionsRegistration.cpp
4546
)
4647

4748
if(VELOX_ENABLE_GEO)

velox/functions/prestosql/registration/RegistrationFunctions.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ extern void registerFloatingPointFunctions(const std::string& prefix);
4141
extern void registerJsonFunctions(const std::string& prefix);
4242
extern void registerMapFunctions(const std::string& prefix);
4343
extern void registerStringFunctions(const std::string& prefix);
44+
extern void registerVarbinaryFunctions(const std::string& prefix);
4445
extern void registerBinaryFunctions(const std::string& prefix);
4546
extern void registerURLFunctions(const std::string& prefix);
4647
extern void registerDataSizeFunctions(const std::string& prefix);
@@ -173,6 +174,7 @@ void registerAllScalarFunctions(const std::string& prefix) {
173174
registerDateTimeFunctions(prefix);
174175
registerURLFunctions(prefix);
175176
registerStringFunctions(prefix);
177+
registerVarbinaryFunctions(prefix);
176178
registerBinaryFunctions(prefix);
177179
registerBitwiseFunctions(prefix);
178180
registerUuidFunctions(prefix);
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "velox/functions/Registerer.h"
17+
#include "velox/functions/prestosql/StringFunctions.h"
18+
#include "velox/functions/prestosql/VarbinaryFunctions.h"
19+
20+
namespace facebook::velox::functions {
21+
22+
/// Registers the VARBINARY overloads of length, substr, strpos, strrpos and
/// contains. Each function is registered under `prefix` followed by its base
/// name.
///
/// NOTE(review): length and substr for varbinary may already be registered by
/// another registration routine; confirm whether registering them here is
/// intentional.
void registerVarbinaryFunctions(const std::string& prefix) {
  const auto lengthName = prefix + "length";
  const auto substrName = prefix + "substr";
  const auto strposName = prefix + "strpos";
  const auto strrposName = prefix + "strrpos";
  const auto containsName = prefix + "contains";

  // length(varbinary) -> bigint
  registerFunction<LengthVarbinaryFunction, int64_t, Varbinary>({lengthName});

  // substr(varbinary, start) and substr(varbinary, start, length)
  registerFunction<SubstrVarbinaryFunction, Varbinary, Varbinary, int64_t>(
      {substrName});
  registerFunction<
      SubstrVarbinaryFunction,
      Varbinary,
      Varbinary,
      int64_t,
      int64_t>({substrName});

  // strpos(varbinary, varbinary[, instance]) -> bigint
  registerFunction<StrLPosVarbinaryFunction, int64_t, Varbinary, Varbinary>(
      {strposName});
  registerFunction<
      StrLPosVarbinaryFunction,
      int64_t,
      Varbinary,
      Varbinary,
      int64_t>({strposName});

  // strrpos(varbinary, varbinary[, instance]) -> bigint
  registerFunction<StrRPosVarbinaryFunction, int64_t, Varbinary, Varbinary>(
      {strrposName});
  registerFunction<
      StrRPosVarbinaryFunction,
      int64_t,
      Varbinary,
      Varbinary,
      int64_t>({strrposName});

  // contains(varbinary, varbinary) -> boolean
  registerFunction<ContainsVarbinaryFunction, bool, Varbinary, Varbinary>(
      {containsName});
}
59+
} // namespace facebook::velox::functions

0 commit comments

Comments
 (0)