Skip to content

Commit 5d58fc6

Browse files
GH-15053: [C++] Add option to string 'center' kernel to control left/right alignment on odd number of padding (#41449)
### Rationale for this change See the issue #15053 for some more context, but in summary: for the "center" padding, and the number of characters that are being added, one needs to decide whether to add one more character on the left or right. Our implementation (somewhat randomly, I think) decided to put the extra space on the right. The Python standard library however, puts the extra space on the left. And for the usage of pyarrow as a string compute engine in the pandas project, we would like to have the option to have consistent behaviour with Python. ### What changes are included in this PR? Add an option `align_left_on_odd_padding` to `PadOptions` that controls where the extra space is put. This keyword is quite ugly, but I am not sure what other solution there is if we want to give pyarrow users this option (also happy to hear other argument name options) ### Are these changes tested? Yes ### Are there any user-facing changes? No * GitHub Issue: #15053 Authored-by: Joris Van den Bossche <[email protected]> Signed-off-by: Joris Van den Bossche <[email protected]>
1 parent e2b0de2 commit 5d58fc6

File tree

8 files changed

+46
-15
lines changed

8 files changed

+46
-15
lines changed

cpp/src/arrow/compute/api_scalar.cc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,8 @@ static auto kMatchSubstringOptionsType = GetFunctionOptionsType<MatchSubstringOp
341341
static auto kNullOptionsType = GetFunctionOptionsType<NullOptions>(
342342
DataMember("nan_is_null", &NullOptions::nan_is_null));
343343
static auto kPadOptionsType = GetFunctionOptionsType<PadOptions>(
344-
DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding));
344+
DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding),
345+
DataMember("lean_left_on_odd_padding", &PadOptions::lean_left_on_odd_padding));
345346
static auto kReplaceSliceOptionsType = GetFunctionOptionsType<ReplaceSliceOptions>(
346347
DataMember("start", &ReplaceSliceOptions::start),
347348
DataMember("stop", &ReplaceSliceOptions::stop),
@@ -480,10 +481,11 @@ NullOptions::NullOptions(bool nan_is_null)
480481
: FunctionOptions(internal::kNullOptionsType), nan_is_null(nan_is_null) {}
481482
constexpr char NullOptions::kTypeName[];
482483

483-
PadOptions::PadOptions(int64_t width, std::string padding)
484+
PadOptions::PadOptions(int64_t width, std::string padding, bool lean_left_on_odd_padding)
484485
: FunctionOptions(internal::kPadOptionsType),
485486
width(width),
486-
padding(std::move(padding)) {}
487+
padding(std::move(padding)),
488+
lean_left_on_odd_padding(lean_left_on_odd_padding) {}
487489
PadOptions::PadOptions() : PadOptions(0, " ") {}
488490
constexpr char PadOptions::kTypeName[];
489491

cpp/src/arrow/compute/api_scalar.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,14 +358,19 @@ class ARROW_EXPORT StrftimeOptions : public FunctionOptions {
358358

359359
class ARROW_EXPORT PadOptions : public FunctionOptions {
360360
public:
361-
explicit PadOptions(int64_t width, std::string padding = " ");
361+
explicit PadOptions(int64_t width, std::string padding = " ",
362+
bool lean_left_on_odd_padding = true);
362363
PadOptions();
363364
static constexpr char const kTypeName[] = "PadOptions";
364365

365366
/// The desired string length.
366367
int64_t width;
367368
/// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII).
368369
std::string padding;
370+
/// What to do if there is an odd number of padding characters (in case of centered
371+
/// padding). Defaults to aligning on the left (i.e. adding the extra padding character
372+
/// on the right)
373+
bool lean_left_on_odd_padding = true;
369374
};
370375

371376
class ARROW_EXPORT TrimOptions : public FunctionOptions {

cpp/src/arrow/compute/function_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ TEST(FunctionOptions, Equality) {
102102
#endif
103103
options.emplace_back(new PadOptions(5, " "));
104104
options.emplace_back(new PadOptions(10, "A"));
105+
options.emplace_back(new PadOptions(10, "A", false));
105106
options.emplace_back(new TrimOptions(" "));
106107
options.emplace_back(new TrimOptions("abc"));
107108
options.emplace_back(new SliceOptions(/*start=*/1));

cpp/src/arrow/compute/kernels/scalar_string_ascii.cc

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,9 +1142,13 @@ struct AsciiPadTransform : public StringTransformBase {
11421142
int64_t left = 0;
11431143
int64_t right = 0;
11441144
if (PadLeft && PadRight) {
1145-
// If odd number of spaces, put the extra space on the right
1146-
left = spaces / 2;
1147-
right = spaces - left;
1145+
if (options_.lean_left_on_odd_padding) {
1146+
left = spaces / 2;
1147+
right = spaces - left;
1148+
} else {
1149+
right = spaces / 2;
1150+
left = spaces - right;
1151+
}
11481152
} else if (PadLeft) {
11491153
left = spaces;
11501154
} else if (PadRight) {

cpp/src/arrow/compute/kernels/scalar_string_test.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2117,6 +2117,12 @@ TYPED_TEST(TestStringKernels, PadUTF8) {
21172117
R"([null, "a\u2008\u2008\u2008\u2008", "bb\u2008\u2008\u2008", "b\u00E1r\u2008\u2008", "foobar"])",
21182118
&options);
21192119

2120+
PadOptions options2{/*width=*/5, "\xe2\x80\x88", /*lean_left_on_odd_padding=*/false};
2121+
this->CheckUnary(
2122+
"utf8_center", R"([null, "a", "bb", "b\u00E1r", "foobar"])", this->type(),
2123+
R"([null, "\u2008\u2008a\u2008\u2008", "\u2008\u2008bb\u2008", "\u2008b\u00E1r\u2008", "foobar"])",
2124+
&options2);
2125+
21202126
PadOptions options_bad{/*width=*/3, /*padding=*/"spam"};
21212127
auto input = ArrayFromJSON(this->type(), R"(["foo"])");
21222128
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
@@ -2459,6 +2465,10 @@ TYPED_TEST(TestStringKernels, PadAscii) {
24592465
this->CheckUnary("ascii_rpad", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
24602466
R"([null, "a ", "bb ", "bar ", "foobar"])", &options);
24612467

2468+
PadOptions options2{/*width=*/5, " ", /*lean_left_on_odd_padding=*/false};
2469+
this->CheckUnary("ascii_center", R"([null, "a", "bb", "bar", "foobar"])", this->type(),
2470+
R"([null, " a ", " bb ", " bar ", "foobar"])", &options2);
2471+
24622472
PadOptions options_bad{/*width=*/3, /*padding=*/"spam"};
24632473
auto input = ArrayFromJSON(this->type(), R"(["foo"])");
24642474
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,

cpp/src/arrow/compute/kernels/scalar_string_utf8.cc

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -930,9 +930,13 @@ struct Utf8PadTransform : public StringTransformBase {
930930
int64_t left = 0;
931931
int64_t right = 0;
932932
if (PadLeft && PadRight) {
933-
// If odd number of spaces, put the extra space on the right
934-
left = spaces / 2;
935-
right = spaces - left;
933+
if (options_.lean_left_on_odd_padding) {
934+
left = spaces / 2;
935+
right = spaces - left;
936+
} else {
937+
right = spaces / 2;
938+
left = spaces - right;
939+
}
936940
} else if (PadLeft) {
937941
left = spaces;
938942
} else if (PadRight) {

python/pyarrow/_compute.pyx

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,8 +1108,8 @@ class MatchSubstringOptions(_MatchSubstringOptions):
11081108

11091109

11101110
cdef class _PadOptions(FunctionOptions):
1111-
def _set_options(self, width, padding):
1112-
self.wrapped.reset(new CPadOptions(width, tobytes(padding)))
1111+
def _set_options(self, width, padding, lean_left_on_odd_padding):
1112+
self.wrapped.reset(new CPadOptions(width, tobytes(padding), lean_left_on_odd_padding))
11131113

11141114

11151115
class PadOptions(_PadOptions):
@@ -1122,10 +1122,14 @@ class PadOptions(_PadOptions):
11221122
Desired string length.
11231123
padding : str, default " "
11241124
What to pad the string with. Should be one byte or codepoint.
1125+
lean_left_on_odd_padding : bool, default True
1126+
What to do if there is an odd number of padding characters (in case
1127+
of centered padding). Defaults to aligning on the left (i.e. adding
1128+
the extra padding character on the right).
11251129
"""
11261130

1127-
def __init__(self, width, padding=' '):
1128-
self._set_options(width, padding)
1131+
def __init__(self, width, padding=' ', lean_left_on_odd_padding=True):
1132+
self._set_options(width, padding, lean_left_on_odd_padding)
11291133

11301134

11311135
cdef class _TrimOptions(FunctionOptions):

python/pyarrow/includes/libarrow.pxd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2359,9 +2359,10 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
23592359

23602360
cdef cppclass CPadOptions \
23612361
"arrow::compute::PadOptions"(CFunctionOptions):
2362-
CPadOptions(int64_t width, c_string padding)
2362+
CPadOptions(int64_t width, c_string padding, c_bool lean_left_on_odd_padding)
23632363
int64_t width
23642364
c_string padding
2365+
c_bool lean_left_on_odd_padding
23652366

23662367
cdef cppclass CSliceOptions \
23672368
"arrow::compute::SliceOptions"(CFunctionOptions):

0 commit comments

Comments
 (0)