diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 22d6d1fc5ae9..2001c6318367 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1761,4 +1761,81 @@ TEST(TestDictionaryUnifier, TableZeroColumns) { AssertTablesEqual(*table, *unified); } +// GH-49689: Ordered dictionary tests + +TEST(TestDictionaryBuilderOrdered, TypePreservesOrderedFlag) { + auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true); + std::unique_ptr boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), ordered_type, &boxed_builder)); + + auto builder_type = boxed_builder->type(); + ASSERT_TRUE(checked_cast(*builder_type).ordered()); +} + +TEST(TestDictionaryBuilderOrdered, UnorderedTypeStaysUnordered) { + auto unordered_type = dictionary(int8(), utf8(), /*ordered=*/false); + std::unique_ptr boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), unordered_type, &boxed_builder)); + + auto builder_type = boxed_builder->type(); + ASSERT_FALSE(checked_cast(*builder_type).ordered()); +} + +TEST(TestDictionaryBuilderOrdered, FinishPreservesOrderedFlag) { + auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true); + std::unique_ptr boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), ordered_type, &boxed_builder)); + auto& builder = checked_cast&>(*boxed_builder); + + ASSERT_OK(builder.Append("a")); + ASSERT_OK(builder.Append("b")); + ASSERT_OK(builder.Append("a")); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + + const auto& result_type = checked_cast(*result->type()); + ASSERT_TRUE(result_type.ordered()); + + auto ex_dict = ArrayFromJSON(utf8(), R"(["a", "b"])"); + auto ex_indices = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(ordered_type, ex_indices, ex_dict); + AssertArraysEqual(expected, *result); +} + +TEST(TestDictionaryBuilderOrdered, ListOfOrderedDictionary) { + auto ordered_dict_type = dictionary(int8(), utf8(), /*ordered=*/true); + auto list_type = list(field("item", ordered_dict_type)); + + std::unique_ptr boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), list_type, &boxed_builder)); + auto& list_builder = checked_cast(*boxed_builder); + auto& dict_builder = + checked_cast&>(*list_builder.value_builder()); + + ASSERT_OK(list_builder.Append()); + ASSERT_OK(dict_builder.Append("a")); + ASSERT_OK(dict_builder.Append("b")); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(dict_builder.Append("a")); + + std::shared_ptr result; + ASSERT_OK(list_builder.Finish(&result)); + + const auto& result_list_type = checked_cast(*result->type()); + const auto& result_dict_type = + checked_cast(*result_list_type.value_type()); + ASSERT_TRUE(result_dict_type.ordered()); +} + +TEST(TestDictionaryBuilderOrdered, MakeDictionaryBuilderPreservesOrdered) { + auto ordered_type = dictionary(int8(), utf8(), /*ordered=*/true); + std::unique_ptr builder; + ASSERT_OK(MakeDictionaryBuilder(default_memory_pool(), ordered_type, + /*dictionary=*/nullptr, &builder)); + + auto builder_type = builder->type(); + ASSERT_TRUE(checked_cast(*builder_type).ordered()); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 116c82049eea..ef67ffdccb33 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -160,7 +160,8 @@ class DictionaryBuilderBase : public ArrayBuilder { delta_offset_(0), byte_width_(-1), indices_builder_(start_int_size, pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(false) {} template explicit DictionaryBuilderBase( @@ -173,7 +174,8 @@ class DictionaryBuilderBase : public ArrayBuilder { delta_offset_(0), byte_width_(-1), indices_builder_(pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(false) {} template explicit DictionaryBuilderBase( @@ -187,7 +189,8 @@ class DictionaryBuilderBase : public ArrayBuilder { delta_offset_(0), byte_width_(-1), indices_builder_(index_type, pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(false) {} template DictionaryBuilderBase(uint8_t start_int_size, @@ -202,7 +205,8 @@ class DictionaryBuilderBase : public ArrayBuilder { delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), indices_builder_(start_int_size, pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(false) {} template explicit DictionaryBuilderBase( @@ -214,7 +218,8 @@ class DictionaryBuilderBase : public ArrayBuilder { delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), indices_builder_(pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(false) {} template explicit DictionaryBuilderBase( @@ -227,7 +232,8 @@ class DictionaryBuilderBase : public ArrayBuilder { delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), indices_builder_(index_type, pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(false) {} template explicit DictionaryBuilderBase( @@ -243,7 +249,8 @@ class DictionaryBuilderBase : public ArrayBuilder { delta_offset_(0), byte_width_(-1), indices_builder_(pool, alignment), - value_type_(dictionary->type()) {} + value_type_(dictionary->type()), + ordered_(false) {} ~DictionaryBuilderBase() override = default; @@ -490,9 +497,11 @@ class DictionaryBuilderBase : public ArrayBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } std::shared_ptr type() const override { - return ::arrow::dictionary(indices_builder_.type(), value_type_); + return ::arrow::dictionary(indices_builder_.type(), value_type_, ordered_); } + void set_ordered(bool ordered) { ordered_ = ordered; } + protected: template Status AppendArraySliceImpl(const typename TypeTraits::ArrayType& dict, @@ -561,6 +570,7 @@ class DictionaryBuilderBase : public ArrayBuilder { BuilderType indices_builder_; std::shared_ptr value_type_; + bool ordered_; }; template @@ -647,7 +657,7 @@ class DictionaryBuilderBase : public ArrayBuilder { Status FinishInternal(std::shared_ptr* out) override { ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out)); - (*out)->type = dictionary((*out)->type, null()); + (*out)->type = dictionary((*out)->type, null(), ordered_); (*out)->dictionary = NullArray(0).data(); return Status::OK(); } @@ -659,11 +669,14 @@ class DictionaryBuilderBase : public ArrayBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } std::shared_ptr type() const override { - return ::arrow::dictionary(indices_builder_.type(), null()); + return ::arrow::dictionary(indices_builder_.type(), null(), ordered_); } + void set_ordered(bool ordered) { ordered_ = ordered; } + protected: BuilderType indices_builder_; + bool ordered_ = false; }; } // namespace internal diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 1a6a718c15e1..04a8860eb833 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -168,17 +168,24 @@ struct DictionaryBuilderCase { template Status CreateFor() { using AdaptiveBuilderType = DictionaryBuilder; + using ExactBuilderType = + internal::DictionaryBuilderBase; if (dictionary != nullptr) { - out->reset(new AdaptiveBuilderType(dictionary, pool)); + auto builder = new AdaptiveBuilderType(dictionary, pool); + builder->set_ordered(ordered); + out->reset(builder); } else if (exact_index_type) { if (!is_integer(index_type->id())) { return Status::TypeError("MakeBuilder: invalid index type ", *index_type); } - out->reset(new internal::DictionaryBuilderBase( - index_type, value_type, pool)); + auto builder = new ExactBuilderType(index_type, value_type, pool); + builder->set_ordered(ordered); + out->reset(builder); } else { auto start_int_size = index_type->byte_width(); - out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool)); + auto builder = new AdaptiveBuilderType(start_int_size, value_type, pool); + builder->set_ordered(ordered); + out->reset(builder); } return Status::OK(); } @@ -188,8 +195,9 @@ struct DictionaryBuilderCase { MemoryPool* pool; const std::shared_ptr& index_type; const std::shared_ptr& value_type; - const std::shared_ptr& dictionary; + std::shared_ptr dictionary; bool exact_index_type; + bool ordered; std::unique_ptr* out; }; @@ -206,6 +214,7 @@ struct MakeBuilderImpl { dict_type.value_type(), /*dictionary=*/nullptr, exact_index_type, + dict_type.ordered(), &out}; return visitor.Make(); } @@ -332,9 +341,13 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& const std::shared_ptr& dictionary, std::unique_ptr* out) { const auto& dict_type = static_cast(*type); - DictionaryBuilderCase visitor = { - pool, dict_type.index_type(), dict_type.value_type(), - dictionary, /*exact_index_type=*/false, out}; + DictionaryBuilderCase visitor = {pool, + dict_type.index_type(), + dict_type.value_type(), + dictionary, + /*exact_index_type=*/false, + dict_type.ordered(), + out}; return visitor.Make(); } diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 45d68043af5a..e521b0e8db32 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -996,14 +996,6 @@ class RDictionaryConverter> Result> ToChunkedArray() override { ARROW_ASSIGN_OR_RAISE(auto result, this->builder_->Finish()); - auto result_type = checked_cast(result->type().get()); - if (this->dict_type_->ordered() && !result_type->ordered()) { - // TODO: we should not have to do that, there is probably something wrong - // in the DictionaryBuilder code - result->data()->type = - arrow::dictionary(result_type->index_type(), result_type->value_type(), true); - } - return std::make_shared( std::make_shared(result->data())); } diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 8520160d1255..9fcc3e6b868d 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -643,6 +643,13 @@ test_that("arrow_array() handles vector -> list arrays (ARROW-7662)", { expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), list_of(dictionary(int8(), utf8()))) expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), list_of(dictionary(int8(), utf8()))) + # ordered factor (GH-49689) + expect_array_roundtrip( + list(ordered(c("b", "a"), levels = c("a", "b"))), + list_of(dictionary(int8(), utf8(), ordered = TRUE)) + ) + expect_array_roundtrip(list(ordered(NA, levels = c("a", "b"))), list_of(dictionary(int8(), utf8(), ordered = TRUE))) + # struct expect_array_roundtrip( list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))), @@ -742,6 +749,13 @@ test_that("arrow_array() handles vector -> large list arrays", { as = large_list_of(dictionary(int8(), utf8())) ) + # ordered factor (GH-49689) + expect_array_roundtrip( + list(ordered(c("b", "a"), levels = c("a", "b"))), + large_list_of(dictionary(int8(), utf8(), ordered = TRUE)), + as = large_list_of(dictionary(int8(), utf8(), ordered = TRUE)) + ) + # struct expect_array_roundtrip( list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))), @@ -809,6 +823,13 @@ test_that("arrow_array() handles vector -> fixed size list arrays", { as = fixed_size_list_of(dictionary(int8(), utf8()), 2L) ) + # ordered factor (GH-49689) + expect_array_roundtrip( + list(ordered(c("b", "a"), levels = c("a", "b"))), + fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L), + as = fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L) + ) + # struct expect_array_roundtrip( list(tibble::tibble(a = 1L, b = 1L, c = "", d = TRUE)),