Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,16 @@ struct ARROW_EXPORT StrptimeOptions : public FunctionOptions {
TimeUnit::type unit;
};

/// \brief Options for the string padding kernels (lpad / rpad / center).
struct ARROW_EXPORT PadOptions : public FunctionOptions {
  /// \param width target length of each output string
  /// \param padding what to pad with; defaults to a single ASCII space
  explicit PadOptions(int64_t width, std::string padding = " ")
      : width{width}, padding{std::move(padding)} {}

  /// Target length of the output. Strings already at least this long are
  /// emitted unchanged. Measured in bytes by the ascii_* kernels and in
  /// codepoints by the utf8_* kernels.
  int64_t width;

  /// The fill used for padding. The ascii_* kernels require exactly one byte,
  /// the utf8_* kernels exactly one codepoint; anything else is rejected.
  std::string padding;
};

struct ARROW_EXPORT TrimOptions : public FunctionOptions {
explicit TrimOptions(std::string characters) : characters(std::move(characters)) {}

Expand Down
194 changes: 182 additions & 12 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,7 @@ struct Utf8Length {
// Returns the number of codepoints in `val`.
//
// Counting is delegated to util::UTF8Length so that this kernel and the
// padding kernels share one definition of "string length"; the previous
// hand-rolled loop duplicated that logic and made this call unreachable.
static OutValue Call(KernelContext*, Arg0Value val, Status*) {
  auto str = reinterpret_cast<const uint8_t*>(val.data());
  auto strlen = val.size();
  return static_cast<OutValue>(util::UTF8Length(str, str + strlen));
}
};

Expand Down Expand Up @@ -2817,6 +2810,138 @@ Result<ValueDescr> StrptimeResolve(KernelContext* ctx, const std::vector<ValueDe
return Status::Invalid("strptime does not provide default StrptimeOptions");
}

// ----------------------------------------------------------------------
// string padding

/// String transform that pads each input with a single byte up to
/// `PadOptions::width` bytes.
///
/// \tparam PadLeft  prepend padding (right-aligns the input)
/// \tparam PadRight append padding (left-aligns the input)
///
/// With both flags set the input is centered; at least one flag must be set.
/// The options are held by reference, so they must outlive the transform.
template <bool PadLeft, bool PadRight>
struct AsciiPadTransform : public StringTransformBase {
  using State = OptionsWrapper<PadOptions>;

  const PadOptions& options_;

  explicit AsciiPadTransform(const PadOptions& options) : options_(options) {}

  /// Rejects paddings that are not exactly one byte long (including "").
  Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
    if (options_.padding.size() != 1) {
      return Status::Invalid("Padding must be one byte, got '", options_.padding, "'");
    }
    return Status::OK();
  }

  /// Upper bound on the total number of output bytes for `ninputs` strings
  /// holding `input_ncodeunits` bytes overall.
  int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
    // This is likely very overallocated but hard to do better without
    // actually looking at each string (because of strings that may be
    // longer than the given width).
    // A negative width never pads, but Transform() still copies every input
    // in full, so the width must not be allowed to shrink the bound below
    // the input size.
    const int64_t width = std::max<int64_t>(options_.width, 0);
    return input_ncodeunits + ninputs * width;
  }

  /// Writes the padded form of `input` to `output`.
  ///
  /// \return the number of bytes written: the input length when the input is
  /// already at least `width` bytes long, `width` otherwise.
  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                    uint8_t* output) {
    if (input_string_ncodeunits >= options_.width) {
      // Already wide enough: pass through untouched (never truncates).
      std::copy(input, input + input_string_ncodeunits, output);
      return input_string_ncodeunits;
    }
    const int64_t spaces = options_.width - input_string_ncodeunits;
    int64_t left = 0;
    int64_t right = 0;
    if (PadLeft && PadRight) {
      // If odd number of spaces, put the extra space on the left
      right = spaces / 2;
      left = spaces - right;
    } else if (PadLeft) {
      left = spaces;
    } else if (PadRight) {
      right = spaces;
    } else {
      DCHECK(false) << "unreachable";
      return 0;
    }
    std::fill(output, output + left, options_.padding[0]);
    output += left;
    output = std::copy(input, input + input_string_ncodeunits, output);
    std::fill(output, output + right, options_.padding[0]);
    return options_.width;
  }
};

/// String transform that pads each input with a single codepoint up to
/// `PadOptions::width` codepoints.
///
/// \tparam PadLeft  prepend padding (right-aligns the input)
/// \tparam PadRight append padding (left-aligns the input)
///
/// With both flags set the input is centered; at least one flag must be set.
/// The options are held by reference, so they must outlive the transform.
template <bool PadLeft, bool PadRight>
struct Utf8PadTransform : public StringTransformBase {
  using State = OptionsWrapper<PadOptions>;

  const PadOptions& options_;

  explicit Utf8PadTransform(const PadOptions& options) : options_(options) {}

  /// Rejects paddings that do not count as exactly one codepoint.
  ///
  /// The check only counts non-continuation bytes; it does not verify that
  /// the padding is well-formed UTF8.
  Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
    auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
    auto strlen = options_.padding.size();
    if (util::UTF8Length(str, str + strlen) != 1) {
      return Status::Invalid("Padding must be one codepoint, got '", options_.padding,
                             "'");
    }
    return Status::OK();
  }

  /// Upper bound on the total number of output bytes for `ninputs` strings
  /// holding `input_ncodeunits` bytes overall.
  int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
    // This is likely very overallocated but hard to do better without
    // actually looking at each string (because of strings that may be
    // longer than the given width).
    // Each output gains at most `width` copies of the padding. Use the real
    // byte length of the padding instead of assuming 4 bytes per codepoint:
    // PreExec() accepts a lead byte followed by any number of continuation
    // bytes, so a malformed padding can be longer than 4 bytes.
    // A negative width never pads, but Transform() still copies every input
    // in full, so clamp it to keep the bound from dropping below the input size.
    const int64_t width = std::max<int64_t>(options_.width, 0);
    const int64_t padding_nbytes = static_cast<int64_t>(options_.padding.size());
    return input_ncodeunits + ninputs * width * padding_nbytes;
  }

  /// Writes the padded form of `input` to `output`.
  ///
  /// \return the number of bytes written
  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                    uint8_t* output) {
    // Width is compared in codepoints, not bytes.
    const int64_t input_width = util::UTF8Length(input, input + input_string_ncodeunits);
    if (input_width >= options_.width) {
      // Already wide enough: pass through untouched (never truncates).
      std::copy(input, input + input_string_ncodeunits, output);
      return input_string_ncodeunits;
    }
    const int64_t spaces = options_.width - input_width;
    int64_t left = 0;
    int64_t right = 0;
    if (PadLeft && PadRight) {
      // If odd number of spaces, put the extra space on the left
      right = spaces / 2;
      left = spaces - right;
    } else if (PadLeft) {
      left = spaces;
    } else if (PadRight) {
      right = spaces;
    } else {
      DCHECK(false) << "unreachable";
      return 0;
    }
    uint8_t* start = output;
    while (left) {
      output = std::copy(options_.padding.begin(), options_.padding.end(), output);
      left--;
    }
    output = std::copy(input, input + input_string_ncodeunits, output);
    while (right) {
      output = std::copy(options_.padding.begin(), options_.padding.end(), output);
      right--;
    }
    return output - start;
  }
};

// Kernel exec aliases for the padding functions. The two boolean template
// arguments of the transform are <PadLeft, PadRight>.

// Byte-oriented (ASCII) variants.
template <typename T>
using AsciiLPad = StringTransformExecWithState<T, AsciiPadTransform<true, false>>;
template <typename T>
using AsciiRPad = StringTransformExecWithState<T, AsciiPadTransform<false, true>>;
template <typename T>
using AsciiCenter = StringTransformExecWithState<T, AsciiPadTransform<true, true>>;

// Codepoint-oriented (UTF8) variants.
template <typename T>
using Utf8LPad = StringTransformExecWithState<T, Utf8PadTransform<true, false>>;
template <typename T>
using Utf8RPad = StringTransformExecWithState<T, Utf8PadTransform<false, true>>;
template <typename T>
using Utf8Center = StringTransformExecWithState<T, Utf8PadTransform<true, true>>;

// ----------------------------------------------------------------------
// string trimming

#ifdef ARROW_WITH_UTF8PROC

template <bool TrimLeft, bool TrimRight>
Expand Down Expand Up @@ -3010,6 +3135,42 @@ using AsciiLTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, f
// Kernel exec alias instantiating AsciiTrimTransform with <false, true>;
// presumably <TrimLeft, TrimRight>, i.e. right-side trimming only
// (NOTE(review): confirm against the AsciiTrimTransform declaration).
template <typename Type>
using AsciiRTrim = StringTransformExecWithState<Type, AsciiTrimTransform<false, true>>;

// Documentation for the "utf8_center" kernel. The padding is validated as one
// codepoint (not one codeunit), so the description says "codepoint".
const FunctionDoc utf8_center_doc(
    "Center strings by padding with a given character",
    ("For each string in `strings`, emit a centered string by padding both sides \n"
     "with the given UTF8 codepoint.\nNull values emit null."),
    {"strings"}, "PadOptions");

// Documentation for the "utf8_lpad" kernel. The padding is validated as one
// codepoint (not one codeunit), so the description says "codepoint".
const FunctionDoc utf8_lpad_doc(
    "Right-align strings by padding with a given character",
    ("For each string in `strings`, emit a right-aligned string by prepending \n"
     "the given UTF8 codepoint.\nNull values emit null."),
    {"strings"}, "PadOptions");

// Documentation for the "utf8_rpad" kernel. The padding is validated as one
// codepoint (not one codeunit), so the description says "codepoint".
const FunctionDoc utf8_rpad_doc(
    "Left-align strings by padding with a given character",
    ("For each string in `strings`, emit a left-aligned string by appending \n"
     "the given UTF8 codepoint.\nNull values emit null."),
    {"strings"}, "PadOptions");

// Documentation for the "ascii_center" kernel.
// The one-line summary is spelled out here: reusing the UTF8 doc's long
// description as the first argument put UTF8-specific text in the summary slot.
const FunctionDoc ascii_center_doc(
    "Center strings by padding with a given character",
    ("For each string in `strings`, emit a centered string by padding both sides \n"
     "with the given ASCII character.\nNull values emit null."),
    {"strings"}, "PadOptions");

// Documentation for the "ascii_lpad" kernel.
// The one-line summary is spelled out here: reusing the UTF8 doc's long
// description as the first argument put UTF8-specific text in the summary slot.
const FunctionDoc ascii_lpad_doc(
    "Right-align strings by padding with a given character",
    ("For each string in `strings`, emit a right-aligned string by prepending \n"
     "the given ASCII character.\nNull values emit null."),
    {"strings"}, "PadOptions");

// Documentation for the "ascii_rpad" kernel.
// The one-line summary is spelled out here: reusing the UTF8 doc's long
// description as the first argument put UTF8-specific text in the summary slot.
const FunctionDoc ascii_rpad_doc(
    "Left-align strings by padding with a given character",
    ("For each string in `strings`, emit a left-aligned string by appending \n"
     "the given ASCII character.\nNull values emit null."),
    {"strings"}, "PadOptions");

const FunctionDoc utf8_trim_whitespace_doc(
"Trim leading and trailing whitespace characters",
("For each string in `strings`, emit a string with leading and trailing whitespace\n"
Expand Down Expand Up @@ -3897,12 +4058,21 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
&ascii_rtrim_whitespace_doc);
MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry,
&ascii_lower_doc);

MakeUnaryStringBatchKernelWithState<AsciiCenter>("ascii_center", registry,
&ascii_center_doc);
MakeUnaryStringBatchKernelWithState<AsciiLPad>("ascii_lpad", registry, &ascii_lpad_doc);
MakeUnaryStringBatchKernelWithState<AsciiRPad>("ascii_rpad", registry, &ascii_rpad_doc);
MakeUnaryStringBatchKernelWithState<Utf8Center>("utf8_center", registry,
&utf8_center_doc);
MakeUnaryStringBatchKernelWithState<Utf8LPad>("utf8_lpad", registry, &utf8_lpad_doc);
MakeUnaryStringBatchKernelWithState<Utf8RPad>("utf8_rpad", registry, &utf8_rpad_doc);

MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry, &ascii_trim_doc);
MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
&ascii_lower_doc);
&ascii_ltrim_doc);
MakeUnaryStringBatchKernelWithState<AsciiRTrim>("ascii_rtrim", registry,
&ascii_lower_doc);
&ascii_rtrim_doc);

AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry, &string_is_ascii_doc);

Expand Down
47 changes: 47 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1225,6 +1225,33 @@ TYPED_TEST(TestStringKernels, BinaryJoin) {
separators, expected);
}

// Exercises utf8_center / utf8_lpad / utf8_rpad with a multi-byte padding
// codepoint and checks that invalid paddings are rejected.
TYPED_TEST(TestStringKernels, PadUTF8) {
  // Padding is U+2008 (punctuation space, \xe2\x80\x88); the input contains
  // U+00E1 (á, \xc3\xa1) so that width is counted in codepoints, not bytes.
  const char input_json[] = R"([null, "a", "bb", "b\u00E1r", "foobar"])";
  PadOptions options{/*width=*/5, "\xe2\x80\x88"};

  this->CheckUnary(
      "utf8_center", input_json, this->type(),
      R"([null, "\u2008\u2008a\u2008\u2008", "\u2008\u2008bb\u2008", "\u2008b\u00E1r\u2008", "foobar"])",
      &options);
  this->CheckUnary(
      "utf8_lpad", input_json, this->type(),
      R"([null, "\u2008\u2008\u2008\u2008a", "\u2008\u2008\u2008bb", "\u2008\u2008b\u00E1r", "foobar"])",
      &options);
  this->CheckUnary(
      "utf8_rpad", input_json, this->type(),
      R"([null, "a\u2008\u2008\u2008\u2008", "bb\u2008\u2008\u2008", "b\u00E1r\u2008\u2008", "foobar"])",
      &options);

  auto input = ArrayFromJSON(this->type(), R"(["foo"])");

  // More than one codepoint is rejected...
  PadOptions too_many_codepoints{/*width=*/3, /*padding=*/"spam"};
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
                                  ::testing::HasSubstr("Padding must be one codepoint"),
                                  CallFunction("utf8_lpad", {input}, &too_many_codepoints));

  // ...and so is an empty padding.
  PadOptions empty_padding{/*width=*/3, /*padding=*/""};
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
                                  ::testing::HasSubstr("Padding must be one codepoint"),
                                  CallFunction("utf8_lpad", {input}, &empty_padding));
}

#ifdef ARROW_WITH_UTF8PROC

TYPED_TEST(TestStringKernels, TrimWhitespaceUTF8) {
Expand Down Expand Up @@ -1371,6 +1398,26 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsNegPos) {

#endif // ARROW_WITH_UTF8PROC

// Exercises ascii_center / ascii_lpad / ascii_rpad with a single-space padding
// and checks that invalid paddings are rejected.
TYPED_TEST(TestStringKernels, PadAscii) {
  // Every expected string shorter than the width must come out exactly 5
  // bytes long; when centering needs an odd number of spaces the extra one
  // goes on the left. "foobar" is already longer and passes through unchanged.
  PadOptions options{/*width=*/5, " "};
  this->CheckUnary("ascii_center", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
                   R"([null, "  a  ", "  bb ", " bar ", "foobar"])", &options);
  this->CheckUnary("ascii_lpad", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
                   R"([null, "    a", "   bb", "  bar", "foobar"])", &options);
  this->CheckUnary("ascii_rpad", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
                   R"([null, "a    ", "bb   ", "bar  ", "foobar"])", &options);

  // Paddings that are not exactly one byte long are rejected.
  PadOptions options_bad{/*width=*/3, /*padding=*/"spam"};
  auto input = ArrayFromJSON(this->type(), R"(["foo"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
                                  ::testing::HasSubstr("Padding must be one byte"),
                                  CallFunction("ascii_lpad", {input}, &options_bad));
  options_bad.padding = "";
  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
                                  ::testing::HasSubstr("Padding must be one byte"),
                                  CallFunction("ascii_lpad", {input}, &options_bad));
}

TYPED_TEST(TestStringKernels, TrimWhitespaceAscii) {
// \xe2\x80\x88 is punctuation space
this->CheckUnary("ascii_trim_whitespace",
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/arrow/util/utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -556,5 +556,15 @@ static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* re
return true;
}

/// Return how many codepoints the byte range [first, last) contains.
///
/// Every byte that is not a UTF8 continuation byte (bit pattern 10xxxxxx) is
/// counted as the start of a codepoint. The input is assumed to be valid UTF8
/// and is not validated.
static inline int64_t UTF8Length(const uint8_t* first, const uint8_t* last) {
  int64_t num_codepoints = 0;
  for (const uint8_t* cursor = first; cursor != last; ++cursor) {
    const bool is_continuation_byte = (*cursor & 0xc0) == 0x80;
    if (!is_continuation_byte) {
      ++num_codepoints;
    }
  }
  return num_codepoints;
}

} // namespace util
} // namespace arrow
20 changes: 20 additions & 0 deletions cpp/src/arrow/util/utf8_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -489,5 +489,25 @@ TEST(UTF8FindIf, Basics) {
CheckOkUTF8("", U'β', 0, 0);
}

// Checks UTF8Length on ASCII and on 2-, 3- and 4-byte encoded codepoints.
TEST(UTF8Length, Basics) {
  const auto count_codepoints = [](const std::string& text) {
    const uint8_t* begin = reinterpret_cast<const uint8_t*>(text.data());
    const uint8_t* end = begin + text.length();
    return UTF8Length(begin, end);
  };

  // plain ASCII: one codepoint per byte
  ASSERT_EQ(count_codepoints("abcde"), 5);
  // precomposed accented letter: a single 2-byte codepoint
  ASSERT_EQ(count_codepoints("\xc3\x81"
                             "bcde"),
            5);
  // base letter followed by a combining accent: two codepoints
  ASSERT_EQ(count_codepoints("a\xcc\x81"
                             "bcde"),
            6);
  // hiragana a: a single 3-byte codepoint
  ASSERT_EQ(count_codepoints("\xe3\x81\x81"), 1);
  // raised hands emoji: a single 4-byte codepoint
  ASSERT_EQ(count_codepoints("\xf0\x9f\x99\x8c"), 1);
}

} // namespace util
} // namespace arrow
21 changes: 21 additions & 0 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,27 @@ String transforms
If the input is not valid UTF8, then the output is undefined (but the size of output
buffers will be preserved).

String padding
~~~~~~~~~~~~~~

These functions append/prepend a given padding byte (ASCII) or codepoint (UTF8) in
order to center (center), right-align (lpad), or left-align (rpad) a string.

+--------------------------+------------+-------------------------+---------------------+----------------------------------------+
| Function name | Arity | Input types | Output type | Options class |
+==========================+============+=========================+=====================+========================================+
| ascii_lpad | Unary | String-like | String-like | :struct:`PadOptions` |
+--------------------------+------------+-------------------------+---------------------+----------------------------------------+
| ascii_rpad | Unary | String-like | String-like | :struct:`PadOptions` |
+--------------------------+------------+-------------------------+---------------------+----------------------------------------+
| ascii_center | Unary | String-like | String-like | :struct:`PadOptions` |
+--------------------------+------------+-------------------------+---------------------+----------------------------------------+
| utf8_lpad | Unary | String-like | String-like | :struct:`PadOptions` |
+--------------------------+------------+-------------------------+---------------------+----------------------------------------+
| utf8_rpad | Unary | String-like | String-like | :struct:`PadOptions` |
+--------------------------+------------+-------------------------+---------------------+----------------------------------------+
| utf8_center | Unary | String-like | String-like | :struct:`PadOptions` |
+--------------------------+------------+-------------------------+---------------------+----------------------------------------+

String trimming
~~~~~~~~~~~~~~~
Expand Down
Loading