Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 48 additions & 19 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1387,40 +1387,69 @@ gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len) {
return static_cast<gdv_int32>(static_cast<signed char>(data[0]));
}

// Returns the ASCII character having the binary equivalent to A.
// If A is larger than 256 the result is equivalent to chr(A % 256).
// Returns the UTF-8 encoding of the Unicode code point A.
// Raises an error if A is not a valid code point, i.e. it is negative, greater
// than 0x10FFFF, or falls within the UTF-16 surrogate range 0xD800-0xDFFF.
FORCE_INLINE
const char* chr_int32(gdv_int64 context, gdv_int32 in, gdv_int32* out_len) {
in = in % 256;
*out_len = 1;

char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
if (ret == nullptr) {
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
const char* chr_int64(gdv_int64 context, gdv_int64 in, gdv_int32* out_len) {
if (in < 0 || in > 0x10FFFF || (in >= 0xD800 && in <= 0xDFFF)) {
const char* fmt =
"Input %lld is not a valid Unicode code point in the range 0 to 1114111";
int size = static_cast<int>(strlen(fmt)) + 32;
char* error = reinterpret_cast<char*>(malloc(size));
snprintf(error, size, fmt, static_cast<long long>(in));
gdv_fn_context_set_error_msg(context, error);
free(error);
*out_len = 0;
return "";
}
ret[0] = char(in);
return ret;
}

// Returns the ASCII character having the binary equivalent to A.
// If A is larger than 256 the result is equivalent to chr(A % 256).
FORCE_INLINE
const char* chr_int64(gdv_int64 context, gdv_int64 in, gdv_int32* out_len) {
in = in % 256;
*out_len = 1;
if (in <= 0x7F) {
*out_len = 1;
} else if (in <= 0x7FF) {
*out_len = 2;
} else if (in <= 0xFFFF) {
*out_len = 3;
} else {
*out_len = 4;
}

char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
if (ret == nullptr) {
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
*out_len = 0;
return "";
}
ret[0] = char(in);

switch (*out_len) {
case 1:
ret[0] = static_cast<char>(in);
break;
case 2:
ret[0] = static_cast<char>(0xC0 | (in >> 6));
ret[1] = static_cast<char>(0x80 | (in & 0x3F));
break;
case 3:
ret[0] = static_cast<char>(0xE0 | (in >> 12));
ret[1] = static_cast<char>(0x80 | ((in >> 6) & 0x3F));
ret[2] = static_cast<char>(0x80 | (in & 0x3F));
break;
case 4:
ret[0] = static_cast<char>(0xF0 | (in >> 18));
ret[1] = static_cast<char>(0x80 | ((in >> 12) & 0x3F));
ret[2] = static_cast<char>(0x80 | ((in >> 6) & 0x3F));
ret[3] = static_cast<char>(0x80 | (in & 0x3F));
break;
}
return ret;
}

// Returns the UTF-8 encoding of the Unicode code point A. See chr_int64.
FORCE_INLINE
const char* chr_int32(gdv_int64 context, gdv_int32 in, gdv_int32* out_len) {
return chr_int64(context, in, out_len);
}

FORCE_INLINE
const char* convert_fromUTF8_binary(gdv_int64 context, const char* bin_in, gdv_int32 len,
gdv_int32* out_len) {
Expand Down
110 changes: 69 additions & 41 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,12 @@ TEST(TestStringOps, TestAscii) {
}

TEST(TestStringOps, TestChrBigInt) {
// CHR
// CHR returns the UTF-8 encoding of the given Unicode code point.
gandiva::ExecutionContext ctx;
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
int32_t out_len = 0;

// 1-byte ASCII code points.
auto out = chr_int32(ctx_ptr, 88, &out_len);
EXPECT_EQ(std::string(out, out_len), "X");

Expand All @@ -70,62 +71,89 @@ TEST(TestStringOps, TestChrBigInt) {
out = chr_int32(ctx_ptr, 49, &out_len);
EXPECT_EQ(std::string(out, out_len), "1");

out = chr_int64(ctx_ptr, 84, &out_len);
EXPECT_EQ(std::string(out, out_len), "T");
out = chr_int32(ctx_ptr, 33, &out_len);
EXPECT_EQ(std::string(out, out_len), "!");

out = chr_int32(ctx_ptr, 340, &out_len);
EXPECT_EQ(std::string(out, out_len), "T");
out = chr_int64(ctx_ptr, 0, &out_len);
EXPECT_EQ(std::string(out, out_len), std::string("\0", 1));

out = chr_int64(ctx_ptr, 256, &out_len);
EXPECT_EQ(std::strcmp(out, "\0"), 0);
// BACKSPACE
out = chr_int64(ctx_ptr, 8, &out_len);
EXPECT_EQ(std::string(out, out_len), "\b");

out = chr_int32(ctx_ptr, 33, &out_len);
EXPECT_EQ(std::string(out, out_len), "!");
// ESCAPE (ESC)
out = chr_int64(ctx_ptr, 27, &out_len);
EXPECT_EQ(std::string(out, out_len), "\x1B");

out = chr_int64(ctx_ptr, 46, &out_len);
EXPECT_EQ(std::string(out, out_len), ".");
// Highest 1-byte code point (U+007F, DELETE).
out = chr_int32(ctx_ptr, 0x7F, &out_len);
EXPECT_EQ(std::string(out, out_len), "\x7F");

out = chr_int32(ctx_ptr, 63, &out_len);
EXPECT_EQ(std::string(out, out_len), "?");
// 2-byte code points.
// Lowest 2-byte code point (U+0080).
out = chr_int32(ctx_ptr, 0x80, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xC2\x80");

out = chr_int64(ctx_ptr, 0, &out_len);
EXPECT_EQ(std::strcmp(out, "\0"), 0);
// í (U+00ED)
out = chr_int32(ctx_ptr, 237, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xC3\xAD");

out = chr_int32(ctx_ptr, -158, &out_len);
EXPECT_EQ(std::string(out, out_len), "b");
// Highest 2-byte code point (U+07FF).
out = chr_int64(ctx_ptr, 0x7FF, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xDF\xBF");

out = chr_int64(ctx_ptr, -5, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xFB");
// 3-byte code points.
// Lowest 3-byte code point (U+0800).
out = chr_int32(ctx_ptr, 0x800, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xE0\xA0\x80");

out = chr_int32(ctx_ptr, -340, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xAC");
// € (U+20AC)
out = chr_int32(ctx_ptr, 8364, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xE2\x82\xAC");

out = chr_int64(ctx_ptr, -66, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xBE");
// 日 (U+65E5)
out = chr_int64(ctx_ptr, 26085, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xE6\x97\xA5");

//
out = chr_int32(ctx_ptr, 128, &out_len);
EXPECT_EQ(std::string(out, out_len), "\x80");
// Highest 3-byte code point (U+FFFF).
out = chr_int32(ctx_ptr, 0xFFFF, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xEF\xBF\xBF");

// œ
out = chr_int64(ctx_ptr, 156, &out_len);
EXPECT_EQ(std::string(out, out_len), "\x9C");
// 4-byte code points.
// Lowest 4-byte code point (U+10000).
out = chr_int64(ctx_ptr, 0x10000, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xF0\x90\x80\x80");

// ÿ
out = chr_int32(ctx_ptr, 255, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xFF");
// 😀 (U+1F600)
out = chr_int64(ctx_ptr, 0x1F600, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xF0\x9F\x98\x80");

// BACKSPACE
out = chr_int64(ctx_ptr, 8, &out_len);
EXPECT_EQ(std::string(out, out_len), "\b");
// Highest valid code point (U+10FFFF).
out = chr_int64(ctx_ptr, 0x10FFFF, &out_len);
EXPECT_EQ(std::string(out, out_len), "\xF4\x8F\xBF\xBF");

// DEVICE CONTROL 3 (DC3)
out = chr_int32(ctx_ptr, 19, &out_len);
EXPECT_EQ(std::string(out, out_len), "\x13");
EXPECT_FALSE(ctx.has_error());

// ESCAPE (ESC)
out = chr_int64(ctx_ptr, 27, &out_len);
EXPECT_EQ(std::string(out, out_len), "\x1B");
// Invalid code points raise an error that includes the offending value.
chr_int64(ctx_ptr, -1, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_TRUE(ctx.get_error().find("not a valid Unicode code point") != std::string::npos)
<< ctx.get_error();
EXPECT_TRUE(ctx.get_error().find("-1") != std::string::npos) << ctx.get_error();
ctx.Reset();

chr_int64(ctx_ptr, 0x110000, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_TRUE(ctx.get_error().find("not a valid Unicode code point") != std::string::npos)
<< ctx.get_error();
ctx.Reset();

// UTF-16 surrogate range is not a valid code point.
chr_int32(ctx_ptr, 0xD800, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_TRUE(ctx.get_error().find("not a valid Unicode code point") != std::string::npos)
<< ctx.get_error();
ctx.Reset();
}

TEST(TestStringOps, TestBeginsEnds) {
Expand Down
44 changes: 39 additions & 5 deletions cpp/src/gandiva/tests/projector_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1317,13 +1317,16 @@ TEST_F(TestProjector, TestChr) {
auto status = Projector::Make(schema, {chr_expr}, TestConfiguration(), &projector);
EXPECT_TRUE(status.ok()) << status.message();

// Create a row-batch with some sample data
// Create a row-batch with code points spanning 1- to 4-byte UTF-8 encodings.
// 65 -> "A", 237 -> "í" (U+00ED), 8364 -> "€" (U+20AC), 26085 -> "日" (U+65E5),
// 128512 -> "😀" (U+1F600).
int num_records = 5;
auto array0 =
MakeArrowArrayInt64({65, 84, 255, 340, -5}, {true, true, true, true, true});
// expected output
auto exp_chr =
MakeArrowArrayUtf8({"A", "T", "\xFF", "T", "\xFB"}, {true, true, true, true, true});
MakeArrowArrayInt64({65, 237, 8364, 26085, 128512}, {true, true, true, true, true});
// expected UTF-8 output
auto exp_chr = MakeArrowArrayUtf8(
{"A", "\xC3\xAD", "\xE2\x82\xAC", "\xE6\x97\xA5", "\xF0\x9F\x98\x80"},
{true, true, true, true, true});

// prepare input record batch
auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});
Expand All @@ -1337,6 +1340,37 @@ TEST_F(TestProjector, TestChr) {
EXPECT_ARROW_ARRAY_EQUALS(exp_chr, outputs.at(0));
}

TEST_F(TestProjector, TestChrInvalidCodePoint) {
// schema for input fields
auto field0 = field("f0", int64());
auto schema = arrow::schema({field0});

// output fields
auto field_chr = field("chr", arrow::utf8());

// Build expression
auto chr_expr = TreeExprBuilder::MakeExpression("chr", {field0}, field_chr);

std::shared_ptr<Projector> projector;
auto status = Projector::Make(schema, {chr_expr}, TestConfiguration(), &projector);
EXPECT_TRUE(status.ok()) << status.message();

// A code point outside the valid Unicode range (here, a negative value) must
// fail evaluation rather than wrap around.
int num_records = 1;
auto array0 = MakeArrowArrayInt64({-5}, {true});
auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0});

// Evaluate expression
arrow::ArrayVector outputs;
status = projector->Evaluate(*in_batch, pool_, &outputs);
EXPECT_FALSE(status.ok());
EXPECT_NE(status.message().find("not a valid Unicode code point"), std::string::npos)
<< status.message();
// The message should include the offending value for debuggability.
EXPECT_NE(status.message().find("-5"), std::string::npos) << status.message();
}

TEST_F(TestProjector, TestBase64) {
// schema for input fields
auto field0 = field("f0", arrow::binary());
Expand Down
Loading