diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 597fe92b4e3a15..9ad09b317f1190 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -1280,6 +1280,13 @@ Status CloudTablet::calc_delete_bitmap_for_compaction( calc_compaction_output_rowset_delete_bitmap( input_rowsets, rowid_conversion, 0, version.second + 1, missed_rows.get(), location_map.get(), tablet_meta()->delete_bitmap(), output_rowset_delete_bitmap.get()); + // In cluster-key MOW compaction, rows are sorted by cluster key, so duplicate unique keys + // may be non-adjacent in merge order. Scan the output primary key index to delete older + // duplicate rows inside the output rowset. + if (!tablet_schema()->cluster_key_uids().empty()) { + RETURN_IF_ERROR(calc_compaction_output_rowset_internal_delete_bitmap( + input_rowsets, output_rowset, rowid_conversion, output_rowset_delete_bitmap.get())); + } if (missed_rows) { missed_rows_size = missed_rows->size(); if (!allow_delete_in_cumu_compaction) { diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index c84f91b89ab9cf..9746877d86ef85 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -431,6 +431,8 @@ DEFINE_mInt32(pk_index_page_cache_stale_sweep_time_sec, "600"); DEFINE_mBool(enable_low_cardinality_optimize, "true"); DEFINE_Bool(enable_low_cardinality_cache_code, "true"); +DEFINE_mBool(enable_adaptive_batch_size, "true"); + // be policy // whether check compaction checksum DEFINE_mBool(enable_compaction_checksum, "false"); diff --git a/be/src/common/config.h b/be/src/common/config.h index be4d280df1ac66..c9e7acaefb4705 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -498,6 +498,11 @@ DECLARE_mInt32(pk_index_page_cache_stale_sweep_time_sec); DECLARE_mBool(enable_low_cardinality_optimize); DECLARE_Bool(enable_low_cardinality_cache_code); +// Adaptive batch size: dynamically adjust SegmentIterator chunk row count using EWMA +// so that each output block stays close to preferred_block_size_bytes. +// When false, the fixed batch_size row behaviour is preserved. +DECLARE_mBool(enable_adaptive_batch_size); + // be policy // whether check compaction checksum DECLARE_mBool(enable_compaction_checksum); diff --git a/be/src/core/block/block.h b/be/src/core/block/block.h index 43d55164750d3c..affc89392603d6 100644 --- a/be/src/core/block/block.h +++ b/be/src/core/block/block.h @@ -186,12 +186,17 @@ class Block { Status check_type_and_column() const; - /// Approximate number of bytes in memory - for profiling and limits. + /// Approximate number of bytes used by column data in memory. + /// This reflects the actual data footprint (e.g. string contents, numeric arrays) + /// and is the metric used by adaptive batch size byte budgets. size_t bytes() const; + /// Returns per-column byte sizes as a comma-separated string (for debugging). std::string columns_bytes() const; - /// Approximate number of allocated bytes in memory - for profiling and limits. + /// Approximate number of allocated (reserved) bytes in memory. + /// This may be larger than bytes() due to pre-allocated capacity in vectors/arenas. + /// Used for memory tracking and profiling. MOCK_FUNCTION size_t allocated_bytes() const; /** Get a list of column names separated by commas. */ @@ -355,6 +360,17 @@ class Block { void clear_column_mem_not_keep(const std::vector& column_keep_flags, bool need_keep_first); + // Helper: sum byte_size() of all mutable columns. 
+ // Unlike Block::bytes() which operates on immutable ColumnPtr, + // this works on MutableColumns during block construction (e.g. in BlockReader). + static inline size_t columns_byte_size(const MutableColumns& cols) { + size_t total = 0; + for (const auto& col : cols) { + total += col->byte_size(); + } + return total; + } + private: void erase_impl(size_t position); }; diff --git a/be/src/exec/operator/mock_scan_operator.h b/be/src/exec/operator/mock_scan_operator.h index 1022a5c44fb694..8800dc97860028 100644 --- a/be/src/exec/operator/mock_scan_operator.h +++ b/be/src/exec/operator/mock_scan_operator.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include + #include "exec/operator/scan_operator.h" #ifdef BE_TEST @@ -80,6 +82,29 @@ class MockScanOperatorX final : public ScanOperatorX { public: friend class OlapScanLocalState; MockScanOperatorX() = default; + + void set_output_block(Block block) { + _output_blocks.clear(); + _output_blocks.push_back(std::move(block)); + } + + Status get_block(RuntimeState* state, Block* block, bool* eos) override { + if (_output_blocks.empty()) { + *eos = true; + return Status::OK(); + } + + *eos = false; + block->swap(_output_blocks.front()); + _output_blocks.pop_front(); + if (_output_blocks.empty()) { + *eos = true; + } + return Status::OK(); + } + +private: + std::list _output_blocks; }; } // namespace doris -#endif \ No newline at end of file +#endif diff --git a/be/src/exec/operator/olap_scan_operator.cpp b/be/src/exec/operator/olap_scan_operator.cpp index 669694df2814f4..12aabf2d457916 100644 --- a/be/src/exec/operator/olap_scan_operator.cpp +++ b/be/src/exec/operator/olap_scan_operator.cpp @@ -394,6 +394,11 @@ Status OlapScanLocalState::_init_profile() { ADD_COUNTER(_segment_profile, "ConditionCacheSegmentHit", TUnit::UNIT); _condition_cache_filtered_rows_counter = ADD_COUNTER(_segment_profile, "ConditionCacheFilteredRows", TUnit::UNIT); + _adaptive_batch_predict_min_rows_counter = + ADD_COUNTER(_segment_profile, "AdaptiveBatchPredictMinRows", TUnit::UNIT); + _adaptive_batch_predict_max_rows_counter = + ADD_COUNTER(_segment_profile, "AdaptiveBatchPredictMaxRows", TUnit::UNIT); + return Status::OK(); } diff --git a/be/src/exec/operator/olap_scan_operator.h b/be/src/exec/operator/olap_scan_operator.h index 3a27db78885b33..5bf32f7b8708f6 100644 --- a/be/src/exec/operator/olap_scan_operator.h +++ b/be/src/exec/operator/olap_scan_operator.h @@ -315,6 +315,9 @@ class OlapScanLocalState final : public ScanLocalState { // Variant subtree: times selecting doc snapshot all iterator (merge doc snapshot into root) RuntimeProfile::Counter* _variant_doc_value_column_iter_count = nullptr; + RuntimeProfile::Counter* _adaptive_batch_predict_min_rows_counter = nullptr; + RuntimeProfile::Counter* _adaptive_batch_predict_max_rows_counter = nullptr; + std::vector _tablets; std::vector _read_sources; diff --git a/be/src/exec/operator/operator.cpp b/be/src/exec/operator/operator.cpp index 27d8acf859aa80..3b330550faf02f 100644 --- a/be/src/exec/operator/operator.cpp +++ b/be/src/exec/operator/operator.cpp @@ -385,10 +385,7 @@ Status OperatorXBase::get_block_after_projects(RuntimeState* state, Block* block auto* local_state = state->get_local_state(operator_id()); Defer defer([&]() { if (status.ok()) { - if (auto rows = block->rows()) { - COUNTER_UPDATE(local_state->_rows_returned_counter, rows); - COUNTER_UPDATE(local_state->_blocks_returned_counter, 1); - } + local_state->update_output_block_counters(*block); } }); 
if (_output_row_descriptor) { @@ -505,7 +502,11 @@ PipelineXSinkLocalStateBase::PipelineXSinkLocalStateBase(DataSinkOperatorXBase* : _parent(parent), _state(state) {} PipelineXLocalStateBase::PipelineXLocalStateBase(RuntimeState* state, OperatorXBase* parent) - : _num_rows_returned(0), _rows_returned_counter(nullptr), _parent(parent), _state(state) {} + : _num_rows_returned(0), + _rows_returned_counter(nullptr), + _parent(parent), + _state(state), + _budget(state->batch_size(), state->preferred_block_size_bytes()) {} template Status PipelineXLocalState::init(RuntimeState* state, LocalStateInfo& info) { @@ -559,6 +560,12 @@ Status PipelineXLocalState::init(RuntimeState* state, LocalState _open_timer = ADD_TIMER_WITH_LEVEL(_common_profile, "OpenTime", 2); _close_timer = ADD_TIMER_WITH_LEVEL(_common_profile, "CloseTime", 2); _exec_timer = ADD_TIMER_WITH_LEVEL(_common_profile, "ExecTime", 1); + _output_block_bytes_counter = + ADD_COUNTER_WITH_LEVEL(_common_profile, "OutputBlockBytes", TUnit::BYTES, 1); + _max_output_block_bytes_counter = + ADD_COUNTER_WITH_LEVEL(_common_profile, "MaxOutputBlockBytes", TUnit::BYTES, 1); + _min_output_block_bytes_counter = + ADD_COUNTER_WITH_LEVEL(_common_profile, "MinOutputBlockBytes", TUnit::BYTES, 1); _memory_used_counter = _common_profile->AddHighWaterMarkCounter("MemoryUsage", TUnit::BYTES, "", 1); _common_profile->add_info_string("IsColocate", diff --git a/be/src/exec/operator/operator.h b/be/src/exec/operator/operator.h index 2f403d275fd758..25ae1477f8abff 100644 --- a/be/src/exec/operator/operator.h +++ b/be/src/exec/operator/operator.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #include "runtime/runtime_profile.h" #include "runtime/runtime_state.h" #include "runtime/thread_context.h" +#include "util/block_budget.h" namespace doris { #include "common/compile_check_begin.h" @@ -245,11 +247,28 @@ class PipelineXLocalStateBase { RuntimeProfile::Counter* memory_used_counter() { return _memory_used_counter; } OperatorXBase* parent() { return _parent; } RuntimeState* state() { return _state; } + [[nodiscard]] const BlockBudget& block_budget() const { return _budget; } VExprContextSPtrs& conjuncts() { return _conjuncts; } VExprContextSPtrs& projections() { return _projections; } [[nodiscard]] int64_t num_rows_returned() const { return _num_rows_returned; } void add_num_rows_returned(int64_t delta) { _num_rows_returned += delta; } void set_num_rows_returned(int64_t value) { _num_rows_returned = value; } + void update_output_block_counters(const Block& block) { + if (auto rows = block.rows()) { + COUNTER_UPDATE(_rows_returned_counter, rows); + COUNTER_UPDATE(_blocks_returned_counter, 1); + auto block_bytes = static_cast(block.bytes()); + COUNTER_UPDATE(_output_block_bytes_counter, block_bytes); + if (block_bytes > _max_output_block_bytes) { + _max_output_block_bytes = block_bytes; + COUNTER_SET(_max_output_block_bytes_counter, block_bytes); + } + if (block_bytes < _min_output_block_bytes) { + _min_output_block_bytes = block_bytes; + COUNTER_SET(_min_output_block_bytes_counter, block_bytes); + } + } + } [[nodiscard]] virtual std::string debug_string(int indentation_level = 0) const = 0; [[nodiscard]] virtual bool is_blockable() const; @@ -305,6 +324,11 @@ class PipelineXLocalStateBase { RuntimeProfile::Counter* _rows_returned_counter = nullptr; RuntimeProfile::Counter* _blocks_returned_counter = nullptr; + RuntimeProfile::Counter* _output_block_bytes_counter = nullptr; + RuntimeProfile::Counter* 
_max_output_block_bytes_counter = nullptr; + RuntimeProfile::Counter* _min_output_block_bytes_counter = nullptr; + int64_t _max_output_block_bytes = 0; + int64_t _min_output_block_bytes = std::numeric_limits::max(); RuntimeProfile::Counter* _wait_for_dependency_timer = nullptr; // Account for current memory and peak memory used by this node RuntimeProfile::HighWaterMarkCounter* _memory_used_counter = nullptr; @@ -316,6 +340,8 @@ class PipelineXLocalStateBase { OperatorXBase* _parent = nullptr; RuntimeState* _state = nullptr; + // Execution-scoped row/byte budget derived from the session batch settings. + const BlockBudget _budget; VExprContextSPtrs _conjuncts; VExprContextSPtrs _projections; std::shared_ptr _score_runtime; diff --git a/be/src/exec/operator/scan_operator.h b/be/src/exec/operator/scan_operator.h index 635e3c8d593582..d6e2407a8d2fba 100644 --- a/be/src/exec/operator/scan_operator.h +++ b/be/src/exec/operator/scan_operator.h @@ -344,11 +344,7 @@ class ScanOperatorX : public OperatorX { Status get_block_after_projects(RuntimeState* state, Block* block, bool* eos) override { Status status = get_block(state, block, eos); if (status.ok()) { - if (auto rows = block->rows()) { - auto* local_state = state->get_local_state(operator_id()); - COUNTER_UPDATE(local_state->_rows_returned_counter, rows); - COUNTER_UPDATE(local_state->_blocks_returned_counter, 1); - } + state->get_local_state(operator_id())->update_output_block_counters(*block); } return status; } diff --git a/be/src/exec/operator/schema_scan_operator.cpp b/be/src/exec/operator/schema_scan_operator.cpp index be60af084df586..27385c3eca079e 100644 --- a/be/src/exec/operator/schema_scan_operator.cpp +++ b/be/src/exec/operator/schema_scan_operator.cpp @@ -32,6 +32,12 @@ class RuntimeState; namespace doris { +SchemaScanLocalState::SchemaScanLocalState(RuntimeState* state, OperatorXBase* parent) + : PipelineXLocalState<>(state, parent), + _data_dependency(std::make_shared(parent->operator_id(), parent->node_id(), + parent->get_name() + "_DEPENDENCY", true)) { +} + Status SchemaScanLocalState::init(RuntimeState* state, LocalStateInfo& info) { RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info)); @@ -243,7 +249,7 @@ Status SchemaScanOperatorX::get_block(RuntimeState* state, Block* block, bool* e break; } - if (src_block.rows() >= state->batch_size()) { + if (local_state.block_budget().exceeded(src_block.rows(), src_block.bytes())) { break; } } diff --git a/be/src/exec/operator/schema_scan_operator.h b/be/src/exec/operator/schema_scan_operator.h index c158ff0b087890..1d8cf22c4a0be0 100644 --- a/be/src/exec/operator/schema_scan_operator.h +++ b/be/src/exec/operator/schema_scan_operator.h @@ -37,11 +37,7 @@ class SchemaScanLocalState final : public PipelineXLocalState<> { public: ENABLE_FACTORY_CREATOR(SchemaScanLocalState); - SchemaScanLocalState(RuntimeState* state, OperatorXBase* parent) - : PipelineXLocalState<>(state, parent) { - _data_dependency = std::make_shared(parent->operator_id(), parent->node_id(), - parent->get_name() + "_DEPENDENCY", true); - } + SchemaScanLocalState(RuntimeState* state, OperatorXBase* parent); ~SchemaScanLocalState() override = default; Status init(RuntimeState* state, LocalStateInfo& info) override; @@ -93,4 +89,4 @@ class SchemaScanOperatorX final : public OperatorX { }; #include "common/compile_check_end.h" -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/exec/scan/file_scanner.cpp b/be/src/exec/scan/file_scanner.cpp index a48ebbed98e926..d97f5a9ae95441 
100644 --- a/be/src/exec/scan/file_scanner.cpp +++ b/be/src/exec/scan/file_scanner.cpp @@ -162,6 +162,21 @@ Status FileScanner::init(RuntimeState* state, const VExprContextSPtrs& conjuncts _runtime_filter_partition_pruned_range_counter = ADD_COUNTER_WITH_LEVEL(_local_state->scanner_profile(), "RuntimeFilterPartitionPrunedRangeNum", TUnit::UNIT, 1); + // Keep the current file's adaptive state while also preserving the peak value across all + // files handled by this scanner instance. + _adaptive_batch_predicted_rows_counter = + _local_state->scanner_profile()->AddHighWaterMarkCounter( + "AdaptiveBatchPredictedRows", TUnit::UNIT, RuntimeProfile::ROOT_COUNTER, 1); + _adaptive_batch_actual_bytes_before_truncate_counter = + _local_state->scanner_profile()->AddHighWaterMarkCounter( + "AdaptiveBatchActualBytesBeforeTruncate", TUnit::BYTES, + RuntimeProfile::ROOT_COUNTER, 1); + _adaptive_batch_actual_bytes_after_truncate_counter = + _local_state->scanner_profile()->AddHighWaterMarkCounter( + "AdaptiveBatchActualBytesAfterTruncate", TUnit::BYTES, + RuntimeProfile::ROOT_COUNTER, 1); + _adaptive_batch_probe_count_counter = ADD_COUNTER_WITH_LEVEL( + _local_state->scanner_profile(), "AdaptiveBatchProbeCount", TUnit::UNIT, 1); _file_cache_statistics.reset(new io::FileCacheStatistics()); _file_reader_stats.reset(new io::FileReaderStats()); @@ -202,6 +217,98 @@ Status FileScanner::init(RuntimeState* state, const VExprContextSPtrs& conjuncts return Status::OK(); } +bool FileScanner::_should_enable_adaptive_batch_size(TFileFormatType::type format_type) const { + // Only enable for readers that support set_batch_size(). + // Table-format wrappers are covered because they delegate to native readers. + if (!config::enable_adaptive_batch_size) { + return false; + } + switch (format_type) { + case TFileFormatType::FORMAT_PARQUET: + case TFileFormatType::FORMAT_ORC: + case TFileFormatType::FORMAT_CSV_PLAIN: + case TFileFormatType::FORMAT_CSV_GZ: + case TFileFormatType::FORMAT_CSV_BZ2: + case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + case TFileFormatType::FORMAT_CSV_LZOP: + case TFileFormatType::FORMAT_CSV_DEFLATE: + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: + case TFileFormatType::FORMAT_PROTO: + case TFileFormatType::FORMAT_TEXT: + case TFileFormatType::FORMAT_JSON: + case TFileFormatType::FORMAT_JNI: + return true; + default: + return false; + } +} + +bool FileScanner::_should_run_adaptive_batch_size() const { + // Skip adaptive batch sizing for pushed-down COUNT(*): the reader is wrapped by CountReader + // and only emits a single aggregated row count instead of materializing real columns, so + // there is no per-row byte cost to learn from and no benefit in tuning the batch size. + return _block_size_predictor != nullptr && _get_push_down_agg_type() != TPushAggOp::type::COUNT; +} + +void FileScanner::_reset_adaptive_batch_size_state() { + _block_size_predictor.reset(); + COUNTER_SET(_adaptive_batch_predicted_rows_counter, int64_t(0)); + COUNTER_SET(_adaptive_batch_actual_bytes_before_truncate_counter, int64_t(0)); + COUNTER_SET(_adaptive_batch_actual_bytes_after_truncate_counter, int64_t(0)); +} + +void FileScanner::_init_adaptive_batch_size_state(TFileFormatType::type format_type) { + _reset_adaptive_batch_size_state(); + if (!_should_enable_adaptive_batch_size(format_type)) { + return; + } + + // External file readers do not provide reliable memory-size metadata hints. Use a small probe + // batch so the predictor can learn from real FileScanner output quickly. 
+ _block_size_predictor = std::make_unique( + _state->preferred_block_size_bytes(), 0.0, ADAPTIVE_BATCH_INITIAL_PROBE_ROWS, + _state->batch_size()); +} + +size_t FileScanner::_predict_reader_batch_rows() { + DCHECK(_block_size_predictor != nullptr); + size_t predicted_rows = _block_size_predictor->predict_next_rows(); + COUNTER_SET(_adaptive_batch_predicted_rows_counter, static_cast(predicted_rows)); + return predicted_rows; +} + +void FileScanner::_update_adaptive_batch_size_before_truncate(const Block& block) { + if (!_should_run_adaptive_batch_size()) { + return; + } + + // Learn from the logical bytes before CHAR/VARCHAR truncation. The truncated block can be + // much smaller than the data the reader and FileScanner have already materialized. + COUNTER_SET(_adaptive_batch_actual_bytes_before_truncate_counter, + static_cast(block.bytes())); + if (block.rows() == 0) { + return; + } + + // Count a probe only when we actually obtain the first non-empty sample that seeds history. + if (!_block_size_predictor->has_history()) { + COUNTER_UPDATE(_adaptive_batch_probe_count_counter, 1); + } + _block_size_predictor->update(block); +} + +void FileScanner::_update_adaptive_batch_size_after_truncate(const Block& block) { + if (!_should_run_adaptive_batch_size()) { + return; + } + + // Keep the post-truncate size only for observability. It should not affect the next batch + // because truncation happens after the upstream memory cost has already been paid. + COUNTER_SET(_adaptive_batch_actual_bytes_after_truncate_counter, + static_cast(block.bytes())); +} + // check if the expr is a partition pruning expr bool FileScanner::_check_partition_prune_expr(const VExprSPtr& expr) { if (expr->is_slot_ref()) { @@ -460,12 +567,17 @@ Status FileScanner::_get_block_wrapped(RuntimeState* state, Block* block, bool* // For query job, simply set _src_block_ptr to block. 
size_t read_rows = 0; RETURN_IF_ERROR(_init_src_block(block)); + if (_need_iceberg_rowid_column && _current_range.__isset.table_format_params && _current_range.table_format_params.table_format_type == "iceberg") { if (auto* iceberg_reader = dynamic_cast(_cur_reader.get())) { iceberg_reader->set_row_id_column_position(_iceberg_rowid_column_pos); } } + + if (_should_run_adaptive_batch_size()) { + _cur_reader->set_batch_size(_predict_reader_batch_rows()); + } { SCOPED_TIMER(_get_block_timer); @@ -937,6 +1049,7 @@ Status FileScanner::_get_next_reader() { _state->update_num_finished_scan_range(1); } _cur_reader.reset(nullptr); + _reset_adaptive_batch_size_state(); _src_block_init = false; bool has_next = _first_scan_range; if (!_first_scan_range) { @@ -1113,24 +1226,25 @@ Status FileScanner::_get_next_reader() { case TFileFormatType::FORMAT_CSV_DEFLATE: case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: case TFileFormatType::FORMAT_PROTO: { - auto reader = CsvReader::create_unique(_state, _profile, &_counter, *_params, range, - _file_slot_descs, _io_ctx.get()); - + auto reader = + CsvReader::create_unique(_state, _profile, &_counter, *_params, range, + _file_slot_descs, _state->batch_size(), _io_ctx.get()); init_status = reader->init_reader(_is_load); _cur_reader = std::move(reader); break; } case TFileFormatType::FORMAT_TEXT: { auto reader = TextReader::create_unique(_state, _profile, &_counter, *_params, range, - _file_slot_descs, _io_ctx.get()); + _file_slot_descs, _state->batch_size(), + _io_ctx.get()); init_status = reader->init_reader(_is_load); _cur_reader = std::move(reader); break; } case TFileFormatType::FORMAT_JSON: { - _cur_reader = - NewJsonReader::create_unique(_state, _profile, &_counter, *_params, range, - _file_slot_descs, &_scanner_eof, _io_ctx.get()); + _cur_reader = NewJsonReader::create_unique(_state, _profile, &_counter, *_params, range, + _file_slot_descs, &_scanner_eof, + _state->batch_size(), _io_ctx.get()); init_status = ((NewJsonReader*)(_cur_reader.get())) ->init_reader(_col_default_value_ctx, _is_load); break; @@ -1226,6 +1340,7 @@ Status FileScanner::_get_next_reader() { } } _cur_reader_eof = false; + _init_adaptive_batch_size_state(format_type); break; } return Status::OK(); diff --git a/be/src/exec/scan/file_scanner.h b/be/src/exec/scan/file_scanner.h index 08b808ef2af1fc..8a32d154695910 100644 --- a/be/src/exec/scan/file_scanner.h +++ b/be/src/exec/scan/file_scanner.h @@ -40,6 +40,7 @@ #include "runtime/descriptors.h" #include "runtime/runtime_profile.h" #include "storage/olap_scan_common.h" +#include "storage/segment/adaptive_block_size_predictor.h" namespace doris { class RuntimeState; @@ -59,6 +60,7 @@ class FileScanner : public Scanner { public: static constexpr const char* NAME = "FileScanner"; + static constexpr size_t ADAPTIVE_BATCH_INITIAL_PROBE_ROWS = 32; // sub profile name (for parquet/orc) static const std::string FileReadBytesProfile; @@ -212,6 +214,10 @@ class FileScanner : public Scanner { RuntimeProfile::Counter* _file_read_calls_counter = nullptr; RuntimeProfile::Counter* _file_read_time_counter = nullptr; RuntimeProfile::Counter* _runtime_filter_partition_pruned_range_counter = nullptr; + RuntimeProfile::Counter* _adaptive_batch_predicted_rows_counter = nullptr; + RuntimeProfile::Counter* _adaptive_batch_actual_bytes_before_truncate_counter = nullptr; + RuntimeProfile::Counter* _adaptive_batch_actual_bytes_after_truncate_counter = nullptr; + RuntimeProfile::Counter* _adaptive_batch_probe_count_counter = nullptr; const std::unordered_map* 
_col_name_to_slot_id = nullptr; // single slot filter conjuncts @@ -237,7 +243,8 @@ class FileScanner : public Scanner { int64_t _last_bytes_read_from_local = 0; int64_t _last_bytes_read_from_remote = 0; -private: + std::unique_ptr _block_size_predictor; + Status _init_expr_ctxes(); Status _init_src_block(Block* block); Status _check_output_block_types(); @@ -282,11 +289,19 @@ class FileScanner : public Scanner { _counter.num_rows_filtered = 0; } - TPushAggOp::type _get_push_down_agg_type() { + TPushAggOp::type _get_push_down_agg_type() const { return _local_state == nullptr ? TPushAggOp::type::NONE : _local_state->get_push_down_agg_type(); } + void _reset_adaptive_batch_size_state(); + void _init_adaptive_batch_size_state(TFileFormatType::type format_type); + bool _should_enable_adaptive_batch_size(TFileFormatType::type format_type) const; + bool _should_run_adaptive_batch_size() const; + size_t _predict_reader_batch_rows(); + void _update_adaptive_batch_size_before_truncate(const Block& block); + void _update_adaptive_batch_size_after_truncate(const Block& block); + // enable the file meta cache only when // 1. max_external_file_meta_cache_num is > 0 // 2. the file number is less than 1/3 of cache's capacibility diff --git a/be/src/exec/scan/olap_scanner.cpp b/be/src/exec/scan/olap_scanner.cpp index 0bcf74c8e47e93..2fab478562f59e 100644 --- a/be/src/exec/scan/olap_scanner.cpp +++ b/be/src/exec/scan/olap_scanner.cpp @@ -163,6 +163,9 @@ Status OlapScanner::prepare() { // value (e.g. select a from t where a .. and b ... limit 1), // it will be very slow when reading data in segment iterator _tablet_reader->set_batch_size(_state->batch_size()); + // Adaptive batch size: pass byte-budget settings to the storage reader. + // The reader still uses batch_size() as the row ceiling. 
+ _tablet_reader->set_preferred_block_size_bytes(_state->preferred_block_size_bytes()); { TOlapScanNode& olap_scan_node = local_state->olap_scan_node(); @@ -775,6 +778,13 @@ void OlapScanner::_collect_profile_before_close() { COUNTER_UPDATE(local_state->_variant_doc_value_column_iter_count, stats.variant_doc_value_column_iter_count); + if (stats.adaptive_batch_size_predict_max_rows > 0) { + local_state->_adaptive_batch_predict_min_rows_counter->set( + stats.adaptive_batch_size_predict_min_rows); + local_state->_adaptive_batch_predict_max_rows_counter->set( + stats.adaptive_batch_size_predict_max_rows); + } + InvertedIndexProfileReporter inverted_index_profile; inverted_index_profile.update(local_state->_index_filter_profile.get(), &stats.inverted_index_stats); diff --git a/be/src/exec/scan/scanner.cpp b/be/src/exec/scan/scanner.cpp index 0b5df2cc054264..4fc0d44561673e 100644 --- a/be/src/exec/scan/scanner.cpp +++ b/be/src/exec/scan/scanner.cpp @@ -87,13 +87,16 @@ Status Scanner::get_block_after_projects(RuntimeState* state, Block* block, bool } else { _origin_block.clear_column_data(row_descriptor.num_materialized_slots()); const auto min_batch_size = std::max(state->batch_size() / 2, 1); - while (_padding_block.rows() < min_batch_size && !*eos) { + const auto block_max_bytes = state->preferred_block_size_bytes(); + while (_padding_block.rows() < min_batch_size && + _padding_block.bytes() < block_max_bytes && !*eos) { RETURN_IF_ERROR(get_block(state, &_origin_block, eos)); if (_origin_block.rows() >= min_batch_size) { break; } - if (_origin_block.rows() + _padding_block.rows() <= state->batch_size()) { + if (_origin_block.rows() + _padding_block.rows() <= state->batch_size() && + _origin_block.bytes() + _padding_block.bytes() <= block_max_bytes) { RETURN_IF_ERROR(_merge_padding_block()); _origin_block.clear_column_data(row_descriptor.num_materialized_slots()); } else { diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index 569d932c26e9ec..90d8dc27787741 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -171,8 +172,8 @@ void PlainCsvTextFieldSplitter::do_split(const Slice& line, std::vector* CsvReader::CsvReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter, const TFileScanRangeParams& params, const TFileRangeDesc& range, - const std::vector& file_slot_descs, io::IOContext* io_ctx, - std::shared_ptr io_ctx_holder) + const std::vector& file_slot_descs, size_t batch_size, + io::IOContext* io_ctx, std::shared_ptr io_ctx_holder) : _profile(profile), _params(params), _file_reader(nullptr), @@ -185,7 +186,8 @@ CsvReader::CsvReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounte _line_reader_eof(false), _skip_lines(0), _io_ctx(io_ctx), - _io_ctx_holder(std::move(io_ctx_holder)) { + _io_ctx_holder(std::move(io_ctx_holder)), + _batch_size(std::max(batch_size, 1UL)) { if (_io_ctx == nullptr && _io_ctx_holder) { _io_ctx = _io_ctx_holder.get(); } @@ -307,13 +309,22 @@ Status CsvReader::init_reader(bool is_load) { return Status::OK(); } +void CsvReader::set_batch_size(size_t batch_size) { + // 0 means "not set" / "use default" for the row-based readers; we must + // never let _batch_size be 0 because _do_get_next_block uses it as the + // upper bound of a `while (rows < _batch_size)` loop and a 0 would make + // the reader return empty blocks and incorrectly signal EOF. 
+ _batch_size = std::max(batch_size, 1UL); +} + Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { if (_line_reader_eof) { *eof = true; return Status::OK(); } - const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE); + const size_t batch_size = _batch_size; + const auto max_block_bytes = _state->preferred_block_size_bytes(); size_t rows = 0; bool success = false; @@ -355,7 +366,8 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { block->set_columns(std::move(mutate_columns)); } else { auto columns = block->mutate_columns(); - while (rows < batch_size && !_line_reader_eof) { + + while (rows < batch_size && !_line_reader_eof && (block->bytes() < max_block_bytes)) { const uint8_t* ptr = nullptr; size_t size = 0; RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h index 4e24be28d15b95..5120cb83ff4e21 100644 --- a/be/src/format/csv/csv_reader.h +++ b/be/src/format/csv/csv_reader.h @@ -172,15 +172,18 @@ class CsvReader : public GenericReader { public: CsvReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter, const TFileScanRangeParams& params, const TFileRangeDesc& range, - const std::vector& file_slot_descs, io::IOContext* io_ctx, - std::shared_ptr io_ctx_holder = nullptr); + const std::vector& file_slot_descs, size_t batch_size, + io::IOContext* io_ctx, std::shared_ptr io_ctx_holder = nullptr); ~CsvReader() override = default; Status init_reader(bool is_load); + Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; Status get_columns(std::unordered_map* name_to_type, std::unordered_set* missing_cols) override; + void set_batch_size(size_t batch_size) override; + Status init_schema_reader() override; // get schema of csv file from first one line or first two lines. // if file format is FORMAT_CSV_DEFLATE and if @@ -279,6 +282,8 @@ class CsvReader : public GenericReader { io::IOContext* _io_ctx = nullptr; std::shared_ptr _io_ctx_holder; + // Adaptive batch size set by FileScanner. 0 means not set (use _state->batch_size()). + size_t _batch_size; // Stored to adjust column_sep_positions when BOM is removed in enclose mode std::shared_ptr _enclose_reader_ctx; // save source text which have been splitted. diff --git a/be/src/format/generic_reader.h b/be/src/format/generic_reader.h index d68c9aa6bb9f33..e81358ed36dd40 100644 --- a/be/src/format/generic_reader.h +++ b/be/src/format/generic_reader.h @@ -46,6 +46,10 @@ class GenericReader : public ProfileCollector { virtual Status get_next_block(Block* block, size_t* read_rows, bool* eof) = 0; + // Override this in readers that can adjust batch size between consecutive reads. + virtual void set_batch_size(size_t batch_size) {} + virtual size_t get_batch_size() const { return 0; } + // Type is always nullable to process illegal values. virtual Status get_columns(std::unordered_map* name_to_type, std::unordered_set* missing_cols) { @@ -100,6 +104,7 @@ class GenericReader : public ProfileCollector { /// Whether the underlying FileReader has filled the partition&missing columns bool _fill_all_columns = false; + TPushAggOp::type _push_down_agg_type {}; // For TopN queries, rows will be read according to row ids produced by TopN result. 
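Note on the predictor used by the FileScanner hunks above: the real AdaptiveBlockSizePredictor is declared in be/src/storage/segment/adaptive_block_size_predictor.h and its implementation is not part of this diff. The following is only a minimal sketch of the EWMA scheme described in the config.h comment, consistent with the call sites shown in file_scanner.cpp (predict_next_rows(), update(), has_history()). The member names, the smoothing-factor handling, and the meaning of the constructor's second argument are assumptions, and update() here takes raw (bytes, rows) instead of a Block to stay self-contained.

    // Illustrative sketch only -- not the actual class shipped in
    // be/src/storage/segment/adaptive_block_size_predictor.h.
    #include <algorithm>
    #include <cstddef>

    class AdaptiveBlockSizePredictorSketch {
    public:
        AdaptiveBlockSizePredictorSketch(size_t preferred_bytes, double alpha, size_t probe_rows,
                                         size_t max_rows)
                : _preferred_bytes(preferred_bytes),
                  // 0.0 is passed at the FileScanner call site; assume that means
                  // "use the default smoothing factor".
                  _alpha(alpha > 0.0 ? alpha : 0.2),
                  _probe_rows(std::max<size_t>(probe_rows, 1)),
                  _max_rows(std::max<size_t>(max_rows, 1)) {}

        bool has_history() const { return _avg_row_bytes > 0.0; }

        // Before the first non-empty sample, return a small probe batch
        // (ADAPTIVE_BATCH_INITIAL_PROBE_ROWS = 32 in the patch); afterwards derive
        // the row count from the byte budget and the smoothed per-row size,
        // capped by the session batch_size.
        size_t predict_next_rows() const {
            if (!has_history()) {
                return std::min(_probe_rows, _max_rows);
            }
            auto rows = static_cast<size_t>(_preferred_bytes / _avg_row_bytes);
            return std::clamp<size_t>(rows, 1, _max_rows);
        }

        // Feed back the observed block: an EWMA of bytes-per-row keeps one unusually
        // wide or narrow batch from swinging the next prediction too hard.
        void update(size_t block_bytes, size_t block_rows) {
            if (block_rows == 0) {
                return;
            }
            double sample = static_cast<double>(block_bytes) / static_cast<double>(block_rows);
            _avg_row_bytes = has_history() ? _alpha * sample + (1.0 - _alpha) * _avg_row_bytes
                                           : sample;
        }

    private:
        size_t _preferred_bytes; // target bytes per output block
        double _alpha;           // EWMA smoothing factor (assumed default 0.2)
        size_t _probe_rows;      // initial probe batch before any history exists
        size_t _max_rows;        // hard row ceiling (session batch_size)
        double _avg_row_bytes = 0.0;
    };

Under this reading of the patch, _predict_reader_batch_rows() seeds _cur_reader->set_batch_size(...) before each read, and _update_adaptive_batch_size_before_truncate() feeds the pre-truncation block bytes back in, so successive reader batches converge toward preferred_block_size_bytes while never exceeding batch_size rows.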
diff --git a/be/src/format/json/new_json_reader.cpp b/be/src/format/json/new_json_reader.cpp index cecfcf3f0dcf54..4060744c9a85c1 100644 --- a/be/src/format/json/new_json_reader.cpp +++ b/be/src/format/json/new_json_reader.cpp @@ -79,7 +79,8 @@ using namespace ErrorCode; NewJsonReader::NewJsonReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter, const TFileScanRangeParams& params, const TFileRangeDesc& range, const std::vector& file_slot_descs, bool* scanner_eof, - io::IOContext* io_ctx, std::shared_ptr io_ctx_holder) + size_t batch_size, io::IOContext* io_ctx, + std::shared_ptr io_ctx_holder) : _vhandle_json_callback(nullptr), _state(state), _profile(profile), @@ -100,7 +101,8 @@ NewJsonReader::NewJsonReader(RuntimeState* state, RuntimeProfile* profile, Scann _scanner_eof(scanner_eof), _current_offset(0), _io_ctx(io_ctx), - _io_ctx_holder(std::move(io_ctx_holder)) { + _io_ctx_holder(std::move(io_ctx_holder)), + _batch_size(std::max(batch_size, 1UL)) { if (_io_ctx == nullptr && _io_ctx_holder) { _io_ctx = _io_ctx_holder.get(); } @@ -117,7 +119,7 @@ NewJsonReader::NewJsonReader(RuntimeState* state, RuntimeProfile* profile, Scann NewJsonReader::NewJsonReader(RuntimeProfile* profile, const TFileScanRangeParams& params, const TFileRangeDesc& range, - const std::vector& file_slot_descs, + const std::vector& file_slot_descs, size_t batch_size, io::IOContext* io_ctx, std::shared_ptr io_ctx_holder) : _vhandle_json_callback(nullptr), _state(nullptr), @@ -135,7 +137,8 @@ NewJsonReader::NewJsonReader(RuntimeProfile* profile, const TFileScanRangeParams _parse_allocator(_parse_buffer, sizeof(_parse_buffer)), _origin_json_doc(&_value_allocator, sizeof(_parse_buffer), &_parse_allocator), _io_ctx(io_ctx), - _io_ctx_holder(std::move(io_ctx_holder)) { + _io_ctx_holder(std::move(io_ctx_holder)), + _batch_size(std::max(batch_size, 1UL)) { if (_io_ctx == nullptr && _io_ctx_holder) { _io_ctx = _io_ctx_holder.get(); } @@ -203,9 +206,10 @@ Status NewJsonReader::get_next_block(Block* block, size_t* read_rows, bool* eof) return Status::OK(); } - const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE); + const auto batch_size = _batch_size; + const auto max_block_bytes = _state->preferred_block_size_bytes(); - while (block->rows() < batch_size && !_reader_eof) { + while (block->rows() < batch_size && !_reader_eof && (block->bytes() < max_block_bytes)) { if (UNLIKELY(_read_json_by_line && _skip_first_line)) { size_t size = 0; const uint8_t* line_ptr = nullptr; @@ -251,6 +255,15 @@ Status NewJsonReader::init_schema_reader() { return Status::OK(); } +void NewJsonReader::set_batch_size(size_t batch_size) { + // 0 means "not set" / "use default" for the row-based readers; we must + // never let _batch_size be 0 because _do_get_next_block uses it as the + // upper bound of a `while (block->rows() < batch_size)` loop and a 0 + // would make the reader return without setting eof, causing the scanner + // to spin on empty blocks. 
+ _batch_size = std::max(batch_size, 1UL); +} + Status NewJsonReader::get_parsed_schema(std::vector* col_names, std::vector* col_types) { bool eof = false; diff --git a/be/src/format/json/new_json_reader.h b/be/src/format/json/new_json_reader.h index 4d803fc1050b19..58876384f0e4e2 100644 --- a/be/src/format/json/new_json_reader.h +++ b/be/src/format/json/new_json_reader.h @@ -70,19 +70,29 @@ class NewJsonReader : public GenericReader { NewJsonReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter, const TFileScanRangeParams& params, const TFileRangeDesc& range, const std::vector& file_slot_descs, bool* scanner_eof, - io::IOContext* io_ctx, std::shared_ptr io_ctx_holder = nullptr); + size_t batch_size, io::IOContext* io_ctx, + std::shared_ptr io_ctx_holder = nullptr); NewJsonReader(RuntimeProfile* profile, const TFileScanRangeParams& params, const TFileRangeDesc& range, const std::vector& file_slot_descs, - io::IOContext* io_ctx, std::shared_ptr io_ctx_holder = nullptr); + size_t batch_size, io::IOContext* io_ctx, + std::shared_ptr io_ctx_holder = nullptr); ~NewJsonReader() override = default; Status init_reader( const std::unordered_map& col_default_value_ctx, bool is_load); + Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; Status get_columns(std::unordered_map* name_to_type, std::unordered_set* missing_cols) override; + + // Row-based readers control throughput via row count, not byte budget. + // The FileScanner's AdaptiveBlockSizePredictor converts the byte budget + // into a predicted row count and calls set_batch_size() with it. + void set_batch_size(size_t batch_size) override; + size_t get_batch_size() const override { return _batch_size; } + Status init_schema_reader() override; Status get_parsed_schema(std::vector* col_names, std::vector* col_types) override; @@ -296,6 +306,8 @@ class NewJsonReader : public GenericReader { DataTypeSerDeSPtrs _serdes; DataTypeSerDe::FormatOptions _serde_options; + // Adaptive batch size set by FileScanner. + size_t _batch_size; }; #include "common/compile_check_end.h" diff --git a/be/src/format/orc/vorc_reader.cpp b/be/src/format/orc/vorc_reader.cpp index bf5a67fb0d28a1..67fc7c91613cc4 100644 --- a/be/src/format/orc/vorc_reader.cpp +++ b/be/src/format/orc/vorc_reader.cpp @@ -243,7 +243,7 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state, _state(state), _scan_params(params), _scan_range(range), - _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)), + _batch_size(std::max(batch_size, 1UL)), _range_start_offset(range.start_offset), _range_size(range.size), _ctz(ctz), @@ -268,7 +268,7 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state, _state(state), _scan_params(params), _scan_range(range), - _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)), + _batch_size(std::max(batch_size, 1UL)), _range_start_offset(range.start_offset), _range_size(range.size), _ctz(ctz), @@ -285,12 +285,27 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState* state, _init_file_description(); } +void OrcReader::set_batch_size(size_t batch_size) { + DCHECK_GT(batch_size, 0); + if (_batch_size == batch_size) { + return; + } + + _batch_size = batch_size; + if (_row_reader != nullptr) { + // ORC stores the batch capacity inside the row batch object returned by createRowBatch(). + // Rebuild it when the requested batch size changes so the next call uses the new limit. 
+ _batch = _row_reader->createRowBatch(_batch_size); + } +} + OrcReader::OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, - const std::string& ctz, io::IOContext* io_ctx, FileMetaCache* meta_cache, - bool enable_lazy_mat) + size_t batch_size, const std::string& ctz, io::IOContext* io_ctx, + FileMetaCache* meta_cache, bool enable_lazy_mat) : _profile(nullptr), _scan_params(params), _scan_range(range), + _batch_size(std::max(batch_size, 1UL)), _ctz(ctz), _file_system(nullptr), _io_ctx(io_ctx), @@ -303,11 +318,13 @@ OrcReader::OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& r } OrcReader::OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, - const std::string& ctz, std::shared_ptr io_ctx_holder, - FileMetaCache* meta_cache, bool enable_lazy_mat) + size_t batch_size, const std::string& ctz, + std::shared_ptr io_ctx_holder, FileMetaCache* meta_cache, + bool enable_lazy_mat) : _profile(nullptr), _scan_params(params), _scan_range(range), + _batch_size(std::max(batch_size, 1UL)), _ctz(ctz), _file_system(nullptr), _io_ctx(io_ctx_holder ? io_ctx_holder.get() : nullptr), diff --git a/be/src/format/orc/vorc_reader.h b/be/src/format/orc/vorc_reader.h index 2697a108200f58..cfbd7abb8cb039 100644 --- a/be/src/format/orc/vorc_reader.h +++ b/be/src/format/orc/vorc_reader.h @@ -152,11 +152,11 @@ class OrcReader : public GenericReader { std::shared_ptr io_ctx_holder, FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true); - OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, + OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, io::IOContext* io_ctx, FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true); - OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, + OrcReader(const TFileScanRangeParams& params, const TFileRangeDesc& range, size_t batch_size, const std::string& ctz, std::shared_ptr io_ctx_holder, FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true); @@ -181,6 +181,8 @@ class OrcReader : public GenericReader { Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + void set_batch_size(size_t batch_size) override; + int64_t size() const; Status get_columns(std::unordered_map* name_to_type, @@ -236,6 +238,8 @@ class OrcReader : public GenericReader { bool count_read_rows() override { return true; } + size_t get_batch_size() const override { return _batch_size; } + protected: void _collect_profile_before_close() override; @@ -676,6 +680,7 @@ class OrcReader : public GenericReader { io::FileDescription _file_description; size_t _batch_size; int64_t _range_start_offset; + int64_t _range_size; std::string _ctz; diff --git a/be/src/format/parquet/vparquet_reader.cpp b/be/src/format/parquet/vparquet_reader.cpp index e1170ff08619d4..22e5ebd62b8a1b 100644 --- a/be/src/format/parquet/vparquet_reader.cpp +++ b/be/src/format/parquet/vparquet_reader.cpp @@ -87,7 +87,7 @@ ParquetReader::ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams : _profile(profile), _scan_params(params), _scan_range(range), - _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)), + _batch_size(std::max(batch_size, 1UL)), _range_start_offset(range.start_offset), _range_size(range.size), _ctz(ctz), @@ -106,6 +106,13 @@ ParquetReader::ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams _init_file_description(); } +void ParquetReader::set_batch_size(size_t batch_size) { + if 
(_batch_size == batch_size) { + return; + } + _batch_size = batch_size; +} + ParquetReader::ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params, const TFileRangeDesc& range, size_t batch_size, cctz::time_zone* ctz, std::shared_ptr io_ctx_holder, RuntimeState* state, @@ -113,7 +120,7 @@ ParquetReader::ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams : _profile(profile), _scan_params(params), _scan_range(range), - _batch_size(std::max(batch_size, _MIN_BATCH_SIZE)), + _batch_size(std::max(batch_size, 1UL)), _range_start_offset(range.start_offset), _range_size(range.size), _ctz(ctz), diff --git a/be/src/format/parquet/vparquet_reader.h b/be/src/format/parquet/vparquet_reader.h index 402fdd11138f77..e2dbd0f963f661 100644 --- a/be/src/format/parquet/vparquet_reader.h +++ b/be/src/format/parquet/vparquet_reader.h @@ -140,6 +140,8 @@ class ParquetReader : public GenericReader { Status get_next_block(Block* block, size_t* read_rows, bool* eof) override; + void set_batch_size(size_t batch_size) override; + Status close() override; // set the delete rows in current parquet file @@ -356,6 +358,7 @@ class ParquetReader : public GenericReader { const VExprContextSPtrs* _not_single_slot_filter_conjuncts = nullptr; const std::unordered_map* _slot_id_to_filter_conjuncts = nullptr; std::unordered_map _ignored_stats; + size_t get_batch_size() const override { return _batch_size; } std::pair, int> _row_id_column_iterator_pair = {nullptr, -1}; @@ -363,6 +366,7 @@ class ParquetReader : public GenericReader { protected: bool _filter_groups = true; + RowGroupReader::IcebergRowIdParams _iceberg_rowid_params; std::set _column_ids; diff --git a/be/src/format/text/text_reader.cpp b/be/src/format/text/text_reader.cpp index 2f98ad517cd9d6..e52da7f3249036 100644 --- a/be/src/format/text/text_reader.cpp +++ b/be/src/format/text/text_reader.cpp @@ -113,8 +113,9 @@ void HiveTextFieldSplitter::_split_field_multi_char(const Slice& line, TextReader::TextReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter, const TFileScanRangeParams& params, const TFileRangeDesc& range, - const std::vector& file_slot_descs, io::IOContext* io_ctx) - : CsvReader(state, profile, counter, params, range, file_slot_descs, io_ctx) {} + const std::vector& file_slot_descs, size_t batch_size, + io::IOContext* io_ctx) + : CsvReader(state, profile, counter, params, range, file_slot_descs, batch_size, io_ctx) {} Status TextReader::_init_options() { // get column_separator and line_delimiter diff --git a/be/src/format/text/text_reader.h b/be/src/format/text/text_reader.h index b7251d5f5f8575..22073c130a8486 100644 --- a/be/src/format/text/text_reader.h +++ b/be/src/format/text/text_reader.h @@ -56,7 +56,8 @@ class TextReader : public CsvReader { public: TextReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter, const TFileScanRangeParams& params, const TFileRangeDesc& range, - const std::vector& file_slot_descs, io::IOContext* io_ctx); + const std::vector& file_slot_descs, size_t batch_size, + io::IOContext* io_ctx); ~TextReader() override = default; diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index a6b98367cf92ce..b8a85d097f99cb 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -139,7 +139,35 @@ class RuntimeState { const DescriptorTbl& desc_tbl() const { return *_desc_tbl; } void set_desc_tbl(const DescriptorTbl* desc_tbl) { _desc_tbl = desc_tbl; } - MOCK_FUNCTION int batch_size() const { return 
_query_options.batch_size; } + + // Row-count limit for output blocks. Clamp to [1, 65535]. + // Adaptive byte budgeting still uses this as the hard row ceiling. + MOCK_FUNCTION int batch_size() const { + static constexpr int kMax = 65535; + auto v = _query_options.batch_size; + return std::min(std::max(1, v), kMax); + } + + // Target byte budget per output block (default 8MB when adaptive is enabled). + // The public FE/session contract is [1MB, 512MB]; this accessor still clamps any direct + // thrift or mixed-version out-of-range value into that range. Returns `kMax` when adaptive + // is disabled by BE config so the value is always a legal byte budget; callers that need + // to know whether adaptive batch size is active should test + // `config::enable_adaptive_batch_size` explicitly. + MOCK_FUNCTION size_t preferred_block_size_bytes() const { + static constexpr int64_t kDefault = 8388608L; // 8MB + static constexpr int64_t kMax = 536870912L; // 512MB + static constexpr int64_t kMin = 1048576L; // 1MB + if (!config::enable_adaptive_batch_size) [[unlikely]] { + return kMax; + } + if (_query_options.__isset.preferred_block_size_bytes) [[likely]] { + return std::max( + kMin, std::min(_query_options.preferred_block_size_bytes, kMax)); + } + return kDefault; + } + int query_parallel_instance_num() const { return _query_options.parallel_instance; } int max_errors() const { return _query_options.max_errors; } int execution_timeout() const { diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 148a2002f1fb0b..d5937fec14304e 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -828,6 +828,7 @@ void PInternalService::fetch_table_schema(google::protobuf::RpcController* contr auto file_reader_stats = std::make_shared(); io_ctx->file_cache_stats = file_cache_statis.get(); io_ctx->file_reader_stats = file_reader_stats.get(); + constexpr size_t fetch_schema_batch_size = 4064; // file_slots is no use, but the lifetime should be longer than reader std::vector file_slots; switch (params.format_type) { @@ -840,12 +841,13 @@ void PInternalService::fetch_table_schema(google::protobuf::RpcController* contr case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_CSV_DEFLATE: { reader = CsvReader::create_unique(nullptr, profile.get(), nullptr, params, range, - file_slots, io_ctx.get(), io_ctx); + file_slots, fetch_schema_batch_size, io_ctx.get(), + io_ctx); break; } case TFileFormatType::FORMAT_TEXT: { reader = TextReader::create_unique(nullptr, profile.get(), nullptr, params, range, - file_slots, io_ctx.get()); + file_slots, fetch_schema_batch_size, io_ctx.get()); break; } case TFileFormatType::FORMAT_PARQUET: { @@ -853,7 +855,7 @@ void PInternalService::fetch_table_schema(google::protobuf::RpcController* contr break; } case TFileFormatType::FORMAT_ORC: { - reader = OrcReader::create_unique(params, range, "", io_ctx); + reader = OrcReader::create_unique(params, range, fetch_schema_batch_size, "", io_ctx); break; } case TFileFormatType::FORMAT_NATIVE: { @@ -863,7 +865,7 @@ void PInternalService::fetch_table_schema(google::protobuf::RpcController* contr } case TFileFormatType::FORMAT_JSON: { reader = NewJsonReader::create_unique(profile.get(), params, range, file_slots, - io_ctx.get(), io_ctx); + fetch_schema_batch_size, io_ctx.get(), io_ctx); break; } case TFileFormatType::FORMAT_AVRO: { diff --git a/be/src/storage/compaction/compaction.cpp b/be/src/storage/compaction/compaction.cpp index 
22434bc083a250..6e553bfb901dfe 100644 --- a/be/src/storage/compaction/compaction.cpp +++ b/be/src/storage/compaction/compaction.cpp @@ -1294,6 +1294,7 @@ Status CompactionMixin::modify_rowsets() { _tablet->enable_unique_key_merge_on_write()) { Version version = tablet()->max_version(); DeleteBitmap output_rowset_delete_bitmap(_tablet->tablet_id()); + DeleteBitmap output_rowset_internal_delete_bitmap(_tablet->tablet_id()); std::unique_ptr missed_rows; if ((config::enable_missing_rows_correctness_check || config::enable_mow_compaction_correctness_check_core || @@ -1313,12 +1314,20 @@ Status CompactionMixin::modify_rowsets() { // New loads are not blocked, so some keys of input rowsets might // be deleted during the time. We need to deal with delete bitmap // of incremental data later. - // TODO(LiaoXin): check if there are duplicate keys std::size_t missed_rows_size = 0; tablet()->calc_compaction_output_rowset_delete_bitmap( _input_rowsets, *_rowid_conversion, 0, version.second + 1, missed_rows.get(), location_map.get(), _tablet->tablet_meta()->delete_bitmap(), &output_rowset_delete_bitmap); + // In cluster-key MOW compaction, rows are sorted by cluster key, so duplicate unique keys + // may be non-adjacent in merge order. Scan the output primary key index to delete older + // duplicate rows inside the output rowset. + if (!tablet()->tablet_schema()->cluster_key_uids().empty()) { + RETURN_IF_ERROR(tablet()->calc_compaction_output_rowset_internal_delete_bitmap( + _input_rowsets, _output_rowset, *_rowid_conversion, + &output_rowset_internal_delete_bitmap)); + output_rowset_delete_bitmap.merge(output_rowset_internal_delete_bitmap); + } if (missed_rows) { missed_rows_size = missed_rows->size(); std::size_t merged_missed_rows_size = _stats.merged_rows; @@ -1418,6 +1427,7 @@ Status CompactionMixin::modify_rowsets() { tablet()->calc_compaction_output_rowset_delete_bitmap( _input_rowsets, *_rowid_conversion, 0, UINT64_MAX, missed_rows.get(), location_map.get(), *it.delete_bitmap.get(), &txn_output_delete_bitmap); + txn_output_delete_bitmap.merge(output_rowset_internal_delete_bitmap); if (config::enable_merge_on_write_correctness_check) { RowsetIdUnorderedSet rowsetids; rowsetids.insert(_output_rowset->rowset_id()); diff --git a/be/src/storage/iterator/block_reader.cpp b/be/src/storage/iterator/block_reader.cpp index f007a1f1e9435a..161cc8cc4b551a 100644 --- a/be/src/storage/iterator/block_reader.cpp +++ b/be/src/storage/iterator/block_reader.cpp @@ -30,6 +30,7 @@ // IWYU pragma: no_include #include "cloud/config.h" #include "common/compiler_util.h" // IWYU pragma: keep +#include "common/config.h" #include "common/status.h" #include "core/block/column_with_type_and_name.h" #include "core/column/column_nullable.h" @@ -55,6 +56,8 @@ namespace doris { #include "common/compile_check_begin.h" using namespace ErrorCode; +static constexpr int32_t BLOCK_SIZE_CHECK_INTERVAL_ROWS = 64; + BlockReader::~BlockReader() { for (int i = 0; i < _agg_functions.size(); ++i) { _agg_functions[i]->destroy(_agg_places[i]); @@ -166,7 +169,7 @@ Status BlockReader::_init_agg_state(const ReaderParams& read_params) { } _stored_data_columns = - _next_row.block->create_same_struct_block(_reader_context.batch_size)->mutate_columns(); + _next_row.block->create_same_struct_block(batch_max_rows())->mutate_columns(); _stored_has_null_tag.resize(_stored_data_columns.size()); _stored_has_variable_length_tag.resize(_stored_data_columns.size()); @@ -310,9 +313,16 @@ Status BlockReader::_agg_key_next_block(Block* block, bool* eof) { } if 
(!_next_row.is_same) { - if (target_block_row == _reader_context.batch_size) { + if (target_block_row == batch_max_rows()) { + break; + } + // Byte-budget check at group boundary: _next_row is the first row of the new group + // and is still pending (not yet inserted), so stopping here is safe. + if (target_block_row % BLOCK_SIZE_CHECK_INTERVAL_ROWS == 0 && + _reached_byte_budget(target_columns)) { break; } + _agg_data_counters.push_back(_last_agg_data_counter); _last_agg_data_counter = 0; @@ -344,7 +354,7 @@ Status BlockReader::_unique_key_next_block(Block* block, bool* eof) { auto target_block_row = 0; auto target_columns = block->mutate_columns(); if (UNLIKELY(_reader_context.record_rowids)) { - _block_row_locations.resize(_reader_context.batch_size); + _block_row_locations.resize(batch_max_rows()); } do { @@ -372,7 +382,15 @@ Status BlockReader::_unique_key_next_block(Block* block, bool* eof) { LOG(WARNING) << "next failed: " << res; return res; } - } while (target_block_row < _reader_context.batch_size); + // Byte-budget check: _next_row is already saved so stopping here is safe. + if (target_block_row % BLOCK_SIZE_CHECK_INTERVAL_ROWS == 0 && + _reached_byte_budget(target_columns)) { + if (UNLIKELY(_reader_context.record_rowids)) { + _block_row_locations.resize(target_block_row); + } + break; + } + } while (target_block_row < batch_max_rows()); if (_delete_sign_available) { int delete_sign_idx = _reader_context.tablet_schema->field_index(DELETE_SIGN); @@ -420,6 +438,11 @@ Status BlockReader::_unique_key_next_block(Block* block, bool* eof) { return Status::OK(); } +bool BlockReader::_reached_byte_budget(const MutableColumns& columns) const { + return config::enable_adaptive_batch_size && _reader_context.preferred_block_size_bytes > 0 && + Block::columns_byte_size(columns) >= _reader_context.preferred_block_size_bytes; +} + Status BlockReader::_insert_data_normal(MutableColumns& columns) { auto block = _next_row.block.get(); @@ -436,9 +459,11 @@ void BlockReader::_append_agg_data(MutableColumns& columns) { _stored_row_ref.push_back(_next_row); _last_agg_data_counter++; - // execute aggregate when have `batch_size` column or some ref invalid soon + // execute aggregate when accumulated `batch_max_rows()` rows or some ref invalid soon + // `_stored_data_columns` is sized to `batch_max_rows()`, + // this flush keeps the number of rows in `_stored_row_ref` within `batch_max_rows()`. bool is_last = (_next_row.block->rows() == _next_row.row_pos + 1); - if (is_last || _stored_row_ref.size() == _reader_context.batch_size) { + if (is_last || _stored_row_ref.size() == batch_max_rows()) { _update_agg_data(columns); } } diff --git a/be/src/storage/iterator/block_reader.h b/be/src/storage/iterator/block_reader.h index 88b59971713733..270adf536650d1 100644 --- a/be/src/storage/iterator/block_reader.h +++ b/be/src/storage/iterator/block_reader.h @@ -23,6 +23,7 @@ #include #include +#include "common/config.h" #include "common/status.h" #include "core/block/block.h" #include "core/column/column.h" @@ -53,6 +54,11 @@ class BlockReader final : public TabletReader { return _vcollect_iter.update_profile(profile); } + // Returns the configured preferred output block byte budget; 0 when adaptive is disabled. + size_t preferred_block_size_bytes() const override { + return config::enable_adaptive_batch_size ? _reader_context.preferred_block_size_bytes : 0; + } + private: // Directly read row from rowset and pass to upper caller. No need to do aggregation. 
// This is usually used for DUPLICATE KEY tables @@ -74,6 +80,10 @@ class BlockReader final : public TabletReader { Status _insert_data_normal(MutableColumns& columns); + // Check if the accumulated output columns have reached the preferred byte budget, + // used to limit the output block size for adaptive batch sizing. + bool _reached_byte_budget(const MutableColumns& columns) const; + void _append_agg_data(MutableColumns& columns); void _update_agg_data(MutableColumns& columns); diff --git a/be/src/storage/iterator/vcollect_iterator.cpp b/be/src/storage/iterator/vcollect_iterator.cpp index 04db8011b0661b..cbf37605f4119f 100644 --- a/be/src/storage/iterator/vcollect_iterator.cpp +++ b/be/src/storage/iterator/vcollect_iterator.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include "common/cast_set.h" #include "common/compiler_util.h" // IWYU pragma: keep +#include "common/config.h" #include "common/status.h" #include "core/block/column_with_type_and_name.h" #include "core/column/column.h" @@ -446,7 +448,6 @@ Status VCollectIterator::_topn_next(Block* block) { << " sorted_row_pos.size()=" << sorted_row_pos.size() << " mutable_block.rows()=" << mutable_block.rows(); *block = mutable_block.to_block(); - _topn_eof = true; return block->rows() > 0 ? Status::OK() : Status::Error(""); } @@ -793,6 +794,46 @@ Status VCollectIterator::Level1Iterator::_normal_next(IteratorRowRef* ref) { } } +// Estimate whether the output block has collected enough data to meet the byte budget. +bool estimate_collected_enough(size_t present_bytes, size_t present_rows, int rows_to_merge, + size_t preferred_block_size_bytes) { + DCHECK_GE(rows_to_merge, 0); + + if (preferred_block_size_bytes == 0 || present_rows == 0) { + return false; + } + + if (present_bytes >= preferred_block_size_bytes) { + return true; + } + + // Predict total bytes after flushing the pending rows_to_merge. + const size_t total_rows = static_cast(rows_to_merge) + present_rows; + // Guard against overflow: if multiplication would wrap, the budget is surely exceeded. + if (present_bytes > std::numeric_limits::max() / total_rows) { + return true; + } + return present_bytes * total_rows / present_rows >= preferred_block_size_bytes; +} + +bool VCollectIterator::Level1Iterator::collected_enough_rows(const MutableColumns& columns, + int rows_to_merge) const { + if (!config::enable_adaptive_batch_size) { + return false; + } + + const auto preferred_block_size_bytes = _reader->preferred_block_size_bytes(); + if (preferred_block_size_bytes == 0) { + return false; + } + + const auto present_bytes = Block::columns_byte_size(columns); + const auto present_rows = columns.empty() ? 
0 : columns[0]->size(); + + return estimate_collected_enough(present_bytes, present_rows, rows_to_merge, + preferred_block_size_bytes); +} + Status VCollectIterator::Level1Iterator::_merge_next(Block* block) { SCOPED_RAW_TIMER(&_reader->_stats.collect_iterator_merge_next_timer); int target_block_row = 0; @@ -806,7 +847,7 @@ Status VCollectIterator::Level1Iterator::_merge_next(Block* block) { block->insert(cur_row.block->get_by_position(i).clone_empty()); } - auto batch_size = _reader->batch_size(); + auto batch_size = _reader->batch_max_rows(); if (UNLIKELY(_reader->_reader_context.record_rowids)) { _block_row_locations.resize(batch_size); } @@ -870,6 +911,24 @@ Status VCollectIterator::Level1Iterator::_merge_next(Block* block) { continuous_row_in_block = 0; pre_row_ref = cur_row; } + + // Byte-budget check: _merge_next() has already advanced _ref to the next unread row, + // so it is safe to stop here without duplicating any data. + if (collected_enough_rows(target_columns, continuous_row_in_block)) { + if (continuous_row_in_block > 0) { + const auto& src_block = pre_row_ref.block; + for (size_t i = 0; i < column_count; ++i) { + target_columns[i]->insert_range_from(*(src_block->get_by_position(i).column), + pre_row_ref.row_pos, + continuous_row_in_block); + } + } + if (UNLIKELY(_reader->_reader_context.record_rowids)) { + _block_row_locations.resize(target_block_row); + } + block->set_columns(std::move(target_columns)); + return Status::OK(); + } } while (true); return Status::OK(); diff --git a/be/src/storage/iterator/vcollect_iterator.h b/be/src/storage/iterator/vcollect_iterator.h index 4201546c04882b..710d6f7903e5f1 100644 --- a/be/src/storage/iterator/vcollect_iterator.h +++ b/be/src/storage/iterator/vcollect_iterator.h @@ -47,6 +47,12 @@ namespace doris { class TabletSchema; class RuntimeProfile; +// Pure-computation helper: estimate whether collected data meets the byte budget +// after flushing rows_to_merge additional rows. Extracted from Level1Iterator so +// it can be unit-tested independently. +bool estimate_collected_enough(size_t present_bytes, size_t present_rows, int rows_to_merge, + size_t preferred_block_size_bytes); + class VCollectIterator { public: // Hold reader point to get reader params @@ -303,6 +309,8 @@ class VCollectIterator { void init_level0_iterators_for_union(); + bool collected_enough_rows(const MutableColumns& columns, int rows_to_merge) const; + private: Status _merge_next(IteratorRowRef* ref); @@ -348,6 +356,9 @@ class VCollectIterator { // for topn next size_t _topn_limit = 0; bool _topn_eof = false; + // For chunked topN output when result exceeds byte budget. + Block _topn_result_block; + size_t _topn_result_offset = 0; std::vector _rs_splits; // Hold reader point to access read params, such as fetch conditions. diff --git a/be/src/storage/iterators.h b/be/src/storage/iterators.h index a55f87e0cea561..1c9b551874360c 100644 --- a/be/src/storage/iterators.h +++ b/be/src/storage/iterators.h @@ -111,6 +111,8 @@ class StorageReadOptions { OlapReaderStatistics* stats = nullptr; bool use_page_cache = false; uint32_t block_row_max = 4096 - 32; // see https://github.com/apache/doris/pull/11816 + // Effective adaptive batch size byte budget. 
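+    // 8388608UL is 8 MiB. A value of 0 disables the byte budget so only the fixed block_row_max
+    // row cap applies; with the 8 MiB default and rows of roughly 2 KiB, batches settle near
+    // 4096 rows.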
+ size_t preferred_block_size_bytes = 8388608UL; TabletSchemaSPtr tablet_schema = nullptr; bool enable_unique_key_merge_on_write = false; diff --git a/be/src/storage/olap_common.h b/be/src/storage/olap_common.h index e09146d0cde0b3..9185ec262699bd 100644 --- a/be/src/storage/olap_common.h +++ b/be/src/storage/olap_common.h @@ -444,6 +444,9 @@ struct OlapReaderStatistics { int64_t segment_create_column_readers_timer_ns = 0; int64_t segment_load_index_timer_ns = 0; + int64_t adaptive_batch_size_predict_min_rows = INT64_MAX; + int64_t adaptive_batch_size_predict_max_rows = 0; + int64_t variant_scan_sparse_column_timer_ns = 0; int64_t variant_scan_sparse_column_bytes = 0; int64_t variant_fill_path_from_sparse_column_timer_ns = 0; diff --git a/be/src/storage/rowset/beta_rowset_reader.cpp b/be/src/storage/rowset/beta_rowset_reader.cpp index e4a2b45b21fdaf..94b76272a1019e 100644 --- a/be/src/storage/rowset/beta_rowset_reader.cpp +++ b/be/src/storage/rowset/beta_rowset_reader.cpp @@ -97,6 +97,7 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context // convert RowsetReaderContext to StorageReadOptions _read_options.block_row_max = read_context->batch_size; + _read_options.preferred_block_size_bytes = read_context->preferred_block_size_bytes; _read_options.stats = _stats; _read_options.push_down_agg_type_opt = _read_context->push_down_agg_type_opt; _read_options.remaining_conjunct_roots = _read_context->remaining_conjunct_roots; diff --git a/be/src/storage/rowset/beta_rowset_writer.cpp b/be/src/storage/rowset/beta_rowset_writer.cpp index f41211feb7a4d7..f728e56e0ba811 100644 --- a/be/src/storage/rowset/beta_rowset_writer.cpp +++ b/be/src/storage/rowset/beta_rowset_writer.cpp @@ -360,72 +360,74 @@ Status BaseBetaRowsetWriter::_generate_delete_bitmap(int32_t segment_id) { // Submit the entire delete bitmap calculation process to thread pool for async execution // This avoids blocking memtable flush thread while waiting for file upload to complete // The process includes: file_writer->close(), _build_tmp, load_segments, and calc_delete_bitmap - return _calc_delete_bitmap_token->submit_func( - [this, segment_id, specified_rowsets = std::move(specified_rowsets)]() -> Status { - Status st = Status::OK(); - // Step 1: Close file_writer (must be done before load_segments) - auto* file_writer = _seg_files.get(segment_id); - if (file_writer && file_writer->state() != io::FileWriter::State::CLOSED) { - MonotonicStopWatch close_timer; - close_timer.start(); - st = file_writer->close(); - close_timer.stop(); - - auto close_time_ms = close_timer.elapsed_time_milliseconds(); - if (close_time_ms > 1000) { - LOG(INFO) << "file_writer->close() took " << close_time_ms - << "ms for segment_id=" << segment_id - << ", tablet_id=" << _context.tablet_id - << ", rowset_id=" << _context.rowset_id; - } - if (!st.ok()) { - return st; - } - } + return _calc_delete_bitmap_token->submit_func([this, segment_id, + specified_rowsets = std::move( + specified_rowsets)]() -> Status { + Status st = Status::OK(); + // Step 1: Close file_writer (must be done before load_segments) + auto* file_writer = _seg_files.get(segment_id); + if (file_writer && file_writer->state() != io::FileWriter::State::CLOSED) { + MonotonicStopWatch close_timer; + close_timer.start(); + st = file_writer->close(); + close_timer.stop(); + + auto close_time_ms = close_timer.elapsed_time_milliseconds(); + if (close_time_ms > 1000) { + LOG(INFO) << "file_writer->close() took " << close_time_ms + << "ms for segment_id=" << segment_id + << 
", tablet_id=" << _context.tablet_id + << ", rowset_id=" << _context.rowset_id; + } + if (!st.ok()) { + return st; + } + } - OlapStopWatch watch; - // Step 2: Build tmp rowset (needs file_writer to be closed) - RowsetSharedPtr rowset_ptr; - st = _build_tmp(rowset_ptr); - if (!st.ok()) { - return st; - } + OlapStopWatch watch; + // Step 2: Build tmp rowset (needs file_writer to be closed) + RowsetSharedPtr rowset_ptr; + st = _build_tmp(rowset_ptr); + if (!st.ok()) { + return st; + } - // Step 3: Load segments (needs file_writer to be closed and rowset to be built) - auto* beta_rowset = reinterpret_cast(rowset_ptr.get()); - std::vector segments; - st = beta_rowset->load_segments(segment_id, segment_id + 1, &segments); - if (!st.ok()) { - return st; - } + DBUG_EXECUTE_IF("BaseBetaRowsetWriter::_generate_delete_bitmap.block_before_load_segments", + DBUG_RUN_CALLBACK(segment_id)); - // Step 4: Calculate delete bitmap - st = BaseTablet::calc_delete_bitmap( - _context.tablet, rowset_ptr, segments, specified_rowsets, - _context.mow_context->delete_bitmap, _context.mow_context->max_version, - nullptr, nullptr, nullptr); - if (!st.ok()) { - return st; - } + // Step 3: Load segments (needs file_writer to be closed and rowset to be built) + auto* beta_rowset = reinterpret_cast(rowset_ptr.get()); + std::vector segments; + st = beta_rowset->load_segments(segment_id, segment_id + 1, &segments); + if (!st.ok()) { + return st; + } - size_t total_rows = - std::accumulate(segments.begin(), segments.end(), 0, - [](size_t sum, const segment_v2::SegmentSharedPtr& s) { - return sum += s->num_rows(); - }); - LOG(INFO) << "[Memtable Flush] construct delete bitmap tablet: " - << _context.tablet->tablet_id() - << ", rowset_ids: " << _context.mow_context->rowset_ids->size() - << ", cur max_version: " << _context.mow_context->max_version - << ", transaction_id: " << _context.mow_context->txn_id - << ", delete_bitmap_count: " - << _context.mow_context->delete_bitmap->get_delete_bitmap_count() - << ", delete_bitmap_cardinality: " - << _context.mow_context->delete_bitmap->cardinality() - << ", cost: " << watch.get_elapse_time_us() - << "(us), total rows: " << total_rows; - return Status::OK(); - }); + // Step 4: Calculate delete bitmap + st = BaseTablet::calc_delete_bitmap(_context.tablet, rowset_ptr, segments, + specified_rowsets, _context.mow_context->delete_bitmap, + _context.mow_context->max_version, nullptr, nullptr, + nullptr); + if (!st.ok()) { + return st; + } + + size_t total_rows = std::accumulate(segments.begin(), segments.end(), 0, + [](size_t sum, const segment_v2::SegmentSharedPtr& s) { + return sum += s->num_rows(); + }); + LOG(INFO) << "[Memtable Flush] construct delete bitmap tablet: " + << _context.tablet->tablet_id() + << ", rowset_ids: " << _context.mow_context->rowset_ids->size() + << ", cur max_version: " << _context.mow_context->max_version + << ", transaction_id: " << _context.mow_context->txn_id + << ", delete_bitmap_count: " + << _context.mow_context->delete_bitmap->get_delete_bitmap_count() + << ", delete_bitmap_cardinality: " + << _context.mow_context->delete_bitmap->cardinality() + << ", cost: " << watch.get_elapse_time_us() << "(us), total rows: " << total_rows; + return Status::OK(); + }); } Status BetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_context) { @@ -717,7 +719,12 @@ Status BetaRowsetWriter::_segcompaction_if_necessary() { } else { status = _check_segment_number_limit(_num_segcompacted); } + if (status.ok() && (_num_segment - _segcompacted_point) >= 
config::segcompaction_batch_size) { + if (_calc_delete_bitmap_token != nullptr) { + status = _calc_delete_bitmap_token->wait(); + } + SegCompactionCandidatesSharedPtr segments; status = _find_longest_consecutive_small_segment(segments); if (LIKELY(status.ok()) && (!segments->empty())) { diff --git a/be/src/storage/rowset/rowset_reader_context.h b/be/src/storage/rowset/rowset_reader_context.h index e44733367c8441..c54a39f0a5557d 100644 --- a/be/src/storage/rowset/rowset_reader_context.h +++ b/be/src/storage/rowset/rowset_reader_context.h @@ -73,6 +73,9 @@ struct RowsetReaderContext { bool use_page_cache = false; int sequence_id_idx = -1; int batch_size = 1024; + // Effective adaptive batch size byte budget. 0 means disabled internally. + size_t preferred_block_size_bytes = 8388608UL; + bool is_unique = false; //record row num merged in generic iterator uint64_t* merged_rows = nullptr; diff --git a/be/src/storage/segment/adaptive_block_size_predictor.cpp b/be/src/storage/segment/adaptive_block_size_predictor.cpp new file mode 100644 index 00000000000000..d8cc700f579853 --- /dev/null +++ b/be/src/storage/segment/adaptive_block_size_predictor.cpp @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/segment/adaptive_block_size_predictor.h" + +#include +#include + +#include "core/block/block.h" + +namespace doris { +AdaptiveBlockSizePredictor::AdaptiveBlockSizePredictor(size_t preferred_block_size_bytes, + double metadata_hint_bytes_per_row, + size_t probe_rows, size_t block_size_rows) + : _block_size_bytes(preferred_block_size_bytes), + _block_size_rows(block_size_rows), + _initial_probe_rows(probe_rows), + _metadata_hint_bytes_per_row(metadata_hint_bytes_per_row) {} + +void AdaptiveBlockSizePredictor::update(const Block& block) { + size_t rows = block.rows(); + if (rows == 0) { + return; + } + double cur = static_cast(block.bytes()) / static_cast(rows); + + if (!_has_history) { + _bytes_per_row = cur; + _has_history = true; + } else { + _bytes_per_row = kAlpha * _bytes_per_row + kBeta * cur; + } +} + +size_t AdaptiveBlockSizePredictor::predict_next_rows() { + if (_block_size_bytes == 0) { + return _block_size_rows; + } + + auto clamp_predicted_rows = [&](size_t predicted_rows) { + size_t clamped_rows = std::min(predicted_rows, _block_size_rows); + if (!_has_history) { + clamped_rows = std::min(clamped_rows, _initial_probe_rows); + } + return std::max(size_t(1), clamped_rows); + }; + + double estimated_bytes_per_row = 0.0; + + if (!_has_history) { + if (_metadata_hint_bytes_per_row > 0.0) { + estimated_bytes_per_row = _metadata_hint_bytes_per_row; + } else { + return clamp_predicted_rows(_block_size_rows); + } + } else { + estimated_bytes_per_row = _bytes_per_row; + } + + if (estimated_bytes_per_row <= 0.0) { + return clamp_predicted_rows(_block_size_rows); + } + + auto predicted = + static_cast(static_cast(_block_size_bytes) / estimated_bytes_per_row); + + return clamp_predicted_rows(predicted); +} + +} // namespace doris diff --git a/be/src/storage/segment/adaptive_block_size_predictor.h b/be/src/storage/segment/adaptive_block_size_predictor.h new file mode 100644 index 00000000000000..e03f18c2a536d2 --- /dev/null +++ b/be/src/storage/segment/adaptive_block_size_predictor.h @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include + +#include "storage/olap_common.h" + +namespace doris { + +class Block; + +// Predicts the number of rows to read in the next batch so that the resulting Block stays close +// to |preferred_block_size_bytes|. +// +// The predictor maintains an EWMA estimate of bytes-per-row for the whole block. After each +// successful batch the caller must invoke update(); before each batch the caller invokes +// predict_next_rows() to obtain the recommended row count. +// +// Not thread-safe; must be used by a single thread per instance. 
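//
// A minimal usage sketch of the update()/predict_next_rows() protocol described above
// (illustrative only; `read_rows` and `has_more_data` are hypothetical stand-ins for the
// caller's own read loop):
//
//   AdaptiveBlockSizePredictor predictor(8 * 1024 * 1024 /* preferred bytes per block */,
//                                        /*metadata_hint_bytes_per_row=*/0.0);
//   Block block;
//   while (has_more_data()) {
//       size_t rows = predictor.predict_next_rows(); // first call is capped by probe_rows
//       RETURN_IF_ERROR(read_rows(rows, &block));    // hypothetical read helper
//       if (block.rows() > 0) {
//           predictor.update(block);                 // folds bytes()/rows() into the EWMA
//       }
//   }
//
// With kAlpha = 0.9 and kBeta = 0.1, a batch measured at 100 bytes/row moves a prior estimate
// of 200 bytes/row to 0.9 * 200 + 0.1 * 100 = 190 bytes/row, so the estimate adapts gradually
// instead of swinging on a single outlier batch.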
+class AdaptiveBlockSizePredictor { +public: + static constexpr size_t kDefaultProbeRows = 4096; + static constexpr size_t kDefaultBlockSizeRows = 65535; + + // Per-column metadata for computing segment-level hints. + struct ColumnMetadata { + ColumnId column_id; + uint64_t raw_bytes; // total raw data bytes for this column in the segment + }; + + // |preferred_block_size_bytes|: target total bytes of each output block chunk. + // |metadata_hint_bytes_per_row|: pre-computed conservative estimate from metadata (e.g. + // segment footer or file statistics). 0.0 means no hint available. + // |probe_rows|: first-batch row cap before any real history is available. + // |block_size_rows|: hard maximum rows of each output block chunk. + AdaptiveBlockSizePredictor(size_t preferred_block_size_bytes, + double metadata_hint_bytes_per_row, + size_t probe_rows = kDefaultProbeRows, + size_t block_size_rows = kDefaultBlockSizeRows); + + // Update EWMA estimates from a completed batch. Must be called only when block.rows() > 0 + // and the batch returned Status::OK(). + void update(const Block& block); + + // Predict how many rows the next batch should read. + // Never exceeds |block_size_rows|; never returns less than 1. + // Uses pre-computed metadata hint for first-call estimate when no history exists. + // Does NOT modify internal state (_has_history is only flipped by update()). + size_t predict_next_rows(); + + bool has_history() const { return _has_history; } + +private: + // EWMA weight for historical estimate (0.9) and current sample (0.1). + static constexpr double kAlpha = 0.9; + static constexpr double kBeta = 0.1; + + const size_t _block_size_bytes; + const size_t _block_size_rows; + const size_t _initial_probe_rows; + + // EWMA estimate of total bytes per row across all output columns. + double _bytes_per_row = 0.0; + + // Whether at least one update() has been called (i.e. we have real measured history). + bool _has_history = false; + + // Cached conservative metadata estimate computed on the first predict_next_rows() call. + // Reused on subsequent first-round predictions (before _has_history is set) to avoid + // re-traversing the segment footer on every call. + double _metadata_hint_bytes_per_row = 0.0; + +#ifdef BE_TEST +public: + double bytes_per_row_for_test() const { return _bytes_per_row; } + bool has_history_for_test() const { return _has_history; } + size_t probe_rows_for_test() const { return _initial_probe_rows; } + size_t block_size_rows_for_test() const { return _block_size_rows; } + static constexpr size_t default_probe_rows_for_test() { return kDefaultProbeRows; } + static constexpr size_t default_block_size_rows_for_test() { return kDefaultBlockSizeRows; } + void set_metadata_hint_for_test(double v) { _metadata_hint_bytes_per_row = v; } + void set_has_history_for_test(bool h, double bpr) { + _has_history = h; + _bytes_per_row = bpr; + } +#endif +}; + +} // namespace doris diff --git a/be/src/storage/segment/segment.cpp b/be/src/storage/segment/segment.cpp index ec0b706bb54205..7563299a856826 100644 --- a/be/src/storage/segment/segment.cpp +++ b/be/src/storage/segment/segment.cpp @@ -29,6 +29,7 @@ #include #include "cloud/config.h" +#include "common/config.h" #include "common/exception.h" #include "common/logging.h" #include "common/status.h" @@ -617,6 +618,22 @@ Status Segment::_create_column_meta(const SegmentFooterPB& footer) { // Initialize column meta accessor which internally maintains uid -> column_ordinal mapping. 
_column_meta_accessor = std::make_unique(); RETURN_IF_ERROR(_column_meta_accessor->init(footer, _file_reader)); + + if (config::enable_adaptive_batch_size) { + // Cache raw_data_bytes per column uid for adaptive batch size prediction. + // This runs under call_once, so no thread-safety concerns. + auto st = _column_meta_accessor->traverse_metas(footer, [this](const ColumnMetaPB& meta) { + if (meta.has_unique_id() && meta.unique_id() != -1 && meta.has_raw_data_bytes()) { + _column_uid_to_raw_bytes[meta.unique_id()] = meta.raw_data_bytes(); + } + }); + + if (!st.ok()) { + LOG(WARNING) << "Failed to traverse column metas to cache raw_data_bytes, error: " + << st.to_string(); + } + } + _column_reader_cache = std::make_unique( _column_meta_accessor.get(), _tablet_schema, _file_reader, _num_rows, [this](std::shared_ptr& footer_pb, OlapReaderStatistics* stats) { diff --git a/be/src/storage/segment/segment.h b/be/src/storage/segment/segment.h index eb23c74943713f..3eb3a018e42a2e 100644 --- a/be/src/storage/segment/segment.h +++ b/be/src/storage/segment/segment.h @@ -30,6 +30,7 @@ #include #include "agent/be_exec_version_manager.h" +#include "common/be_mock_util.h" #include "common/status.h" // Status #include "core/column/column.h" #include "core/data_type/data_type.h" @@ -116,7 +117,7 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd RowsetId rowset_id() const { return _rowset_id; } - uint32_t num_rows() const { return _num_rows; } + MOCK_FUNCTION uint32_t num_rows() const { return _num_rows; } // if variant_sparse_column_cache is nullptr, means the sparse column cache is not used Status new_column_iterator(const TabletColumn& tablet_column, @@ -201,7 +202,7 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd } } - const TabletSchemaSPtr& tablet_schema() { return _tablet_schema; } + const TabletSchemaSPtr& tablet_schema() const { return _tablet_schema; } // get the column reader by tablet column, return NOT_FOUND if not found reader in this segment Status get_column_reader(const TabletColumn& col, std::shared_ptr* column_reader, @@ -213,6 +214,13 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd Status traverse_column_meta_pbs(const std::function& visitor); + // Returns the cached raw_data_bytes for the given column unique id, or 0 if not found. + // Data is populated during _create_column_meta (under call_once), so thread-safe after init. + uint64_t column_raw_data_bytes(int32_t column_uid) const { + auto it = _column_uid_to_raw_bytes.find(column_uid); + return it != _column_uid_to_raw_bytes.end() ? it->second : 0; + } + static StoragePageCache::CacheKey get_segment_footer_cache_key( const io::FileReaderSPtr& file_reader); @@ -287,6 +295,9 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd std::weak_ptr _footer_pb; + // Cached raw_data_bytes per column unique id, populated once in _create_column_meta(). 
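+    // Consumed by SegmentIterator::_make_block_size_predictor() to derive the per-row metadata
+    // hint (raw bytes of the projected columns divided by the segment row count).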
+ std::unordered_map _column_uid_to_raw_bytes; + // used to hold short key index page in memory PageHandle _sk_index_handle; // short key index decoder diff --git a/be/src/storage/segment/segment_iterator.cpp b/be/src/storage/segment/segment_iterator.cpp index f7d0107db88322..e98535fcdb5eb7 100644 --- a/be/src/storage/segment/segment_iterator.cpp +++ b/be/src/storage/segment/segment_iterator.cpp @@ -347,6 +347,40 @@ Status SegmentIterator::init(const StorageReadOptions& opts) { return status; } +std::unique_ptr SegmentIterator::_make_block_size_predictor() const { + if (!config::enable_adaptive_batch_size || _opts.preferred_block_size_bytes == 0) { + return nullptr; + } + + // Collect per-column raw byte metadata from the segment footer for the columns + // this iterator will actually output (defined by _schema, which is built from + // _opts.return_columns). + std::vector col_metadata; + uint32_t seg_rows = _segment->num_rows(); + uint64_t total_raw_bytes = 0; + double metadata_hint_bytes_per_row = 0.0; + if (seg_rows > 0) { + const auto& ts = _segment->tablet_schema(); + if (ts) { + for (ColumnId cid : _schema->column_ids()) { + if (static_cast(cid) < ts->num_columns()) { + int32_t uid = ts->column(cid).unique_id(); + uint64_t raw_bytes = _segment->column_raw_data_bytes(uid); + if (uid >= 0 && raw_bytes > 0) { + total_raw_bytes += raw_bytes; + } + } + } + metadata_hint_bytes_per_row = + static_cast(total_raw_bytes) / static_cast(seg_rows); + } + } + + return std::make_unique( + _opts.preferred_block_size_bytes, metadata_hint_bytes_per_row, + AdaptiveBlockSizePredictor::kDefaultProbeRows, _opts.block_row_max); +} + Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { // get file handle from file descriptor of segment if (_inited) { @@ -369,6 +403,10 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { // Read options will not change, so that just resize here _block_rowids.resize(_opts.block_row_max); + // Adaptive batch size: snapshot the initial row limit and create predictor if enabled. + _initial_block_row_max = _opts.block_row_max; + _block_size_predictor = _make_block_size_predictor(); + _remaining_conjunct_roots = opts.remaining_conjunct_roots; if (_schema->rowid_col_idx() > 0) { @@ -492,10 +530,14 @@ Status SegmentIterator::_lazy_init(Block* block) { _range_iter.reset(new BitmapRangeIterator(_row_bitmap)); } - // If the row bitmap size is smaller than block_row_max, there's no need to reserve that many column rows. - auto nrows_reserve_limit = std::min(_row_bitmap.cardinality(), uint64_t(_opts.block_row_max)); + // Reserve columns for _initial_block_row_max (the original max before any adaptive + // prediction) because the predictor may increase block_row_max on subsequent batches + // up to this ceiling. Using the current (possibly reduced) _opts.block_row_max would + // cause heap-buffer-overflow if a later prediction is larger. + auto nrows_reserve_limit = + std::min(_row_bitmap.cardinality(), uint64_t(_initial_block_row_max)); if (_lazy_materialization_read || _opts.record_rowids || _is_need_expr_eval) { - _block_rowids.resize(_opts.block_row_max); + _block_rowids.resize(_initial_block_row_max); } _current_return_columns.resize(_schema->columns().size()); @@ -2510,6 +2552,28 @@ Status SegmentIterator::next_batch(Block* block) { _init_virtual_columns(block); auto status = [&]() { RETURN_IF_CATCH_EXCEPTION({ + // Adaptive batch size: predict how many rows this batch should read. 
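+            // The predicted row count is clamped to _initial_block_row_max so it never exceeds
+            // the buffers reserved for that ceiling in _lazy_init().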
+ if (_block_size_predictor) { + auto predicted = static_cast(_block_size_predictor->predict_next_rows()); + _opts.block_row_max = std::min(predicted, _initial_block_row_max); + _opts.stats->adaptive_batch_size_predict_min_rows = + std::min(_opts.stats->adaptive_batch_size_predict_min_rows, + static_cast(predicted)); + _opts.stats->adaptive_batch_size_predict_max_rows = + std::max(_opts.stats->adaptive_batch_size_predict_max_rows, + static_cast(predicted)); + } else { + // No predictor — record the fixed batch size using min/max so we don't + // clobber values already accumulated by other segment iterators that + // share the same OlapReaderStatistics. + _opts.stats->adaptive_batch_size_predict_min_rows = + std::min(_opts.stats->adaptive_batch_size_predict_min_rows, + static_cast(_opts.block_row_max)); + _opts.stats->adaptive_batch_size_predict_max_rows = + std::max(_opts.stats->adaptive_batch_size_predict_max_rows, + static_cast(_opts.block_row_max)); + } + auto res = _next_batch_internal(block); if (res.is()) { @@ -2555,6 +2619,13 @@ Status SegmentIterator::next_batch(Block* block) { RETURN_IF_ERROR(block->check_type_and_column()); + // Adaptive batch size: update EWMA estimate from the completed batch. + // block->bytes() is accurate here: predicates have been applied and non-predicate + // columns have been filled for surviving rows by _next_batch_internal. + if (_block_size_predictor && block->rows() > 0) { + _block_size_predictor->update(*block); + } + return Status::OK(); }); }(); diff --git a/be/src/storage/segment/segment_iterator.h b/be/src/storage/segment/segment_iterator.h index 142d252af138ee..1d804eb106b5c3 100644 --- a/be/src/storage/segment/segment_iterator.h +++ b/be/src/storage/segment/segment_iterator.h @@ -53,6 +53,7 @@ #include "storage/predicate/column_predicate.h" #include "storage/row_cursor.h" #include "storage/schema.h" +#include "storage/segment/adaptive_block_size_predictor.h" #include "storage/segment/common.h" #include "storage/segment/segment.h" #include "util/slice.h" @@ -405,6 +406,15 @@ class SegmentIterator : public RowwiseIterator { bool _inited; StorageReadOptions _opts; + // Adaptive batch size predictor; null when the feature is disabled. + std::unique_ptr _block_size_predictor; + // Build the AdaptiveBlockSizePredictor for this segment based on segment footer + // metadata for the projected output columns. Returns nullptr if the feature is + // disabled or the byte budget is non-positive. + std::unique_ptr _make_block_size_predictor() const; + // Snapshot of _opts.block_row_max at init time; used as the hard upper bound so that + // dynamic adjustments never exceed the capacity of pre-allocated buffers. 
+ uint32_t _initial_block_row_max = 0; // make a copy of `_opts.column_predicates` in order to make local changes std::vector> _col_predicates; VExprContextSPtrs _common_expr_ctxs_push_down; diff --git a/be/src/storage/tablet/base_tablet.cpp b/be/src/storage/tablet/base_tablet.cpp index 87079069f553c3..a87e3a75656f87 100644 --- a/be/src/storage/tablet/base_tablet.cpp +++ b/be/src/storage/tablet/base_tablet.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -73,6 +74,103 @@ bvar::LatencyRecorder g_tablet_update_delete_bitmap_latency("doris_pk", "update_ static bvar::Adder g_total_tablet_num("doris_total_tablet_num"); +struct CompactionOutputRowSource { + Version version; + RowLocation src; + bool valid = false; +}; + +struct CompactionOutputPkEntry { + std::string unique_key; + std::string encoded_seq_value; + RowLocation dst; + CompactionOutputRowSource source; +}; + +struct CompactionOutputPkScanner { + uint32_t segment_id = 0; + int64_t remaining = 0; + uint32_t next_ordinal = 0; + std::unique_ptr iter; + DataTypePtr index_type; + CompactionOutputPkEntry current; +}; + +bool is_newer_compaction_output_row(const CompactionOutputPkEntry& lhs, + const CompactionOutputPkEntry& rhs) { + if (lhs.encoded_seq_value != rhs.encoded_seq_value) { + return lhs.encoded_seq_value > rhs.encoded_seq_value; + } + if (lhs.source.version.second != rhs.source.version.second) { + return lhs.source.version.second > rhs.source.version.second; + } + if (lhs.source.version.first != rhs.source.version.first) { + return lhs.source.version.first > rhs.source.version.first; + } + if (lhs.source.src.segment_id != rhs.source.src.segment_id) { + return lhs.source.src.segment_id > rhs.source.src.segment_id; + } + return lhs.source.src.row_id > rhs.source.src.row_id; +} + +Status parse_compaction_output_pk_entry( + const Slice& encoded_key, const RowsetId& output_rowset_id, uint32_t output_segment_id, + size_t seq_col_length, + const std::vector>& output_row_sources, + CompactionOutputPkEntry* entry) { + size_t rowid_length = PrimaryKeyIndexReader::ROW_ID_LENGTH; + if (UNLIKELY(encoded_key.get_size() < seq_col_length + rowid_length)) { + return Status::InternalError("invalid cluster-key MOW primary key size: {}", + encoded_key.get_size()); + } + auto unique_key_length = encoded_key.get_size() - seq_col_length - rowid_length; + entry->unique_key.assign(encoded_key.get_data(), unique_key_length); + entry->encoded_seq_value.assign(encoded_key.get_data() + unique_key_length, seq_col_length); + + Slice rowid_slice(encoded_key.get_data() + unique_key_length + seq_col_length + 1, + rowid_length - 1); + const auto* type_info = get_scalar_type_info(); + const auto* rowid_coder = get_key_coder(type_info->type()); + uint32_t row_id = 0; + RETURN_IF_ERROR(rowid_coder->decode_ascending(&rowid_slice, rowid_length, + reinterpret_cast(&row_id))); + + entry->dst = RowLocation(output_rowset_id, output_segment_id, row_id); + if (UNLIKELY(output_segment_id >= output_row_sources.size() || + row_id >= output_row_sources[output_segment_id].size())) { + return Status::InternalError( + "invalid rowid in cluster-key MOW primary key, segment_id={}, row_id={}", + output_segment_id, row_id); + } + entry->source = output_row_sources[output_segment_id][row_id]; + if (UNLIKELY(!entry->source.valid)) { + return Status::InternalError( + "missing rowid conversion source for output rowset={}, segment_id={}, row_id={}", + output_rowset_id.to_string(), output_segment_id, row_id); + } + return Status::OK(); +} + +Status 
load_next_compaction_output_pk_entry( + const RowsetId& output_rowset_id, size_t seq_col_length, + const std::vector>& output_row_sources, + CompactionOutputPkScanner* scanner) { + if (scanner->remaining <= 0) { + return Status::OK(); + } + + auto index_column = scanner->index_type->create_column(); + size_t num_read = 1; + RETURN_IF_ERROR(scanner->iter->seek_to_ordinal(scanner->next_ordinal++)); + RETURN_IF_ERROR(scanner->iter->next_batch(&num_read, index_column)); + DCHECK_EQ(1, num_read); + --scanner->remaining; + + Slice encoded_key(index_column->get_data_at(0).data, index_column->get_data_at(0).size); + return parse_compaction_output_pk_entry(encoded_key, output_rowset_id, scanner->segment_id, + seq_col_length, output_row_sources, &scanner->current); +} + Status _get_segment_column_iterator(const BetaRowsetSharedPtr& rowset, uint32_t segid, const TabletColumn& target_column, SegmentCacheHandle* segment_cache_handle, @@ -1645,6 +1743,124 @@ void BaseTablet::calc_compaction_output_rowset_delete_bitmap( } } +Status BaseTablet::calc_compaction_output_rowset_internal_delete_bitmap( + const std::vector& input_rowsets, RowsetSharedPtr output_rowset, + const RowIdConversion& rowid_conversion, DeleteBitmap* output_rowset_delete_bitmap) { + DCHECK(!tablet_schema()->cluster_key_uids().empty()); + DCHECK(output_rowset != nullptr); + + std::vector output_segments; + RETURN_IF_ERROR( + std::dynamic_pointer_cast(output_rowset)->load_segments(&output_segments)); + + std::vector> output_row_sources(output_segments.size()); + for (size_t segment_id = 0; segment_id < output_segments.size(); ++segment_id) { + output_row_sources[segment_id].resize(output_segments[segment_id]->num_rows()); + } + + std::map input_rowset_versions; + for (const auto& rowset : input_rowsets) { + input_rowset_versions.emplace(rowset->rowset_id(), rowset->version()); + } + + const auto& rowid_conversion_map = rowid_conversion.get_rowid_conversion_map(); + for (uint32_t source_segment_index = 0; source_segment_index < rowid_conversion_map.size(); + ++source_segment_index) { + auto source_segment = rowid_conversion.get_segment_by_id(source_segment_index); + auto version_iter = input_rowset_versions.find(source_segment.first); + if (UNLIKELY(version_iter == input_rowset_versions.end())) { + return Status::InternalError("missing input rowset version for rowset_id={}", + source_segment.first.to_string()); + } + const auto& source_rowid_map = rowid_conversion_map[source_segment_index]; + for (uint32_t source_rowid = 0; source_rowid < source_rowid_map.size(); ++source_rowid) { + const auto& [dst_segment_id, dst_rowid] = source_rowid_map[source_rowid]; + if (dst_segment_id == UINT32_MAX && dst_rowid == UINT32_MAX) { + continue; + } + if (UNLIKELY(dst_segment_id >= output_row_sources.size() || + dst_rowid >= output_row_sources[dst_segment_id].size())) { + return Status::InternalError( + "invalid rowid conversion destination, rowset_id={}, segment_id={}, " + "row_id={}", + output_rowset->rowset_id().to_string(), dst_segment_id, dst_rowid); + } + output_row_sources[dst_segment_id][dst_rowid] = { + .version = version_iter->second, + .src = RowLocation(source_segment.first, source_segment.second, source_rowid), + .valid = true}; + } + } + + size_t seq_col_length = 0; + if (tablet_schema()->has_sequence_col()) { + seq_col_length = tablet_schema()->column(tablet_schema()->sequence_col_idx()).length() + 1; + } + + struct ScannerComparator { + bool operator()(const CompactionOutputPkScanner* lhs, + const CompactionOutputPkScanner* rhs) const { + 
return lhs->current.unique_key > rhs->current.unique_key; + } + }; + std::priority_queue, + ScannerComparator> + scanners_heap; + std::vector> scanners; + scanners.reserve(output_segments.size()); + + for (uint32_t segment_id = 0; segment_id < output_segments.size(); ++segment_id) { + auto& segment = output_segments[segment_id]; + RETURN_IF_ERROR(segment->load_pk_index_and_bf(nullptr)); + const auto* pk_index = segment->get_primary_key_index(); + DCHECK(pk_index != nullptr); + if (pk_index->num_rows() == 0) { + continue; + } + + auto scanner = std::make_unique(); + scanner->segment_id = segment_id; + scanner->remaining = pk_index->num_rows(); + scanner->index_type = + DataTypeFactory::instance().create_data_type(pk_index->type_info()->type(), 1, 0); + RETURN_IF_ERROR(pk_index->new_iterator(&scanner->iter, nullptr)); + RETURN_IF_ERROR(load_next_compaction_output_pk_entry( + output_rowset->rowset_id(), seq_col_length, output_row_sources, scanner.get())); + scanners_heap.push(scanner.get()); + scanners.push_back(std::move(scanner)); + } + + bool has_current_key = false; + CompactionOutputPkEntry current_visible_entry; + const auto delete_version = output_rowset->version().second; + while (!scanners_heap.empty()) { + auto* scanner = scanners_heap.top(); + scanners_heap.pop(); + auto entry = scanner->current; + + if (!has_current_key || current_visible_entry.unique_key != entry.unique_key) { + current_visible_entry = std::move(entry); + has_current_key = true; + } else if (is_newer_compaction_output_row(entry, current_visible_entry)) { + output_rowset_delete_bitmap->add({current_visible_entry.dst.rowset_id, + current_visible_entry.dst.segment_id, delete_version}, + current_visible_entry.dst.row_id); + current_visible_entry = std::move(entry); + } else { + output_rowset_delete_bitmap->add( + {entry.dst.rowset_id, entry.dst.segment_id, delete_version}, entry.dst.row_id); + } + + if (scanner->remaining > 0) { + RETURN_IF_ERROR(load_next_compaction_output_pk_entry( + output_rowset->rowset_id(), seq_col_length, output_row_sources, scanner)); + scanners_heap.push(scanner); + } + } + + return Status::OK(); +} + Status BaseTablet::check_rowid_conversion( RowsetSharedPtr dst_rowset, const std::map>>& diff --git a/be/src/storage/tablet/base_tablet.h b/be/src/storage/tablet/base_tablet.h index b98a89eb734f2c..7e75e320aca930 100644 --- a/be/src/storage/tablet/base_tablet.h +++ b/be/src/storage/tablet/base_tablet.h @@ -261,6 +261,10 @@ class BaseTablet : public std::enable_shared_from_this { std::map>>* location_map, const DeleteBitmap& input_delete_bitmap, DeleteBitmap* output_rowset_delete_bitmap); + Status calc_compaction_output_rowset_internal_delete_bitmap( + const std::vector& input_rowsets, RowsetSharedPtr output_rowset, + const RowIdConversion& rowid_conversion, DeleteBitmap* output_rowset_delete_bitmap); + Status check_rowid_conversion( RowsetSharedPtr dst_rowset, const std::map>>& diff --git a/be/src/storage/tablet/tablet_reader.h b/be/src/storage/tablet/tablet_reader.h index 6f6683bfaa217a..43da5879874585 100644 --- a/be/src/storage/tablet/tablet_reader.h +++ b/be/src/storage/tablet/tablet_reader.h @@ -133,7 +133,7 @@ class TabletReader { bool direct_mode = false; bool aggregation = false; // for compaction, schema_change, check_sum: we don't use page cache - // for query and config::disable_storage_page_cache is false, we use page cache + // for query, when the BE config disable_storage_page_cache is false, we use page cache bool use_page_cache = false; Version version = Version(-1, 0); @@ 
-246,6 +246,17 @@ class TabletReader { int batch_size() const { return _reader_context.batch_size; } + size_t batch_max_rows() const { return _reader_context.batch_size; } + + void set_preferred_block_size_bytes(size_t bytes) { + _reader_context.preferred_block_size_bytes = bytes; + } + + // Returns the preferred output block byte budget. Subclasses that support adaptive batch size + // should override this; the base returns 0 (disabled) so VCollectIterator degrades safely + // when called through a TabletReader* that has not been configured. + virtual size_t preferred_block_size_bytes() const { return 0; } + const OlapReaderStatistics& stats() const { return _stats; } OlapReaderStatistics* mutable_stats() { return &_stats; } diff --git a/be/src/util/block_budget.h b/be/src/util/block_budget.h new file mode 100644 index 00000000000000..391a213107d5ca --- /dev/null +++ b/be/src/util/block_budget.h @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace doris { + +// Lightweight value type that captures the dual row+byte budget for block +// output sizing. Every operator that needs to respect the adaptive batch +// size feature can construct a BlockBudget from RuntimeState's batch_size() +// and preferred_block_size_bytes() and use the helper methods instead of +// reimplementing the same row/byte logic inline. +// +// Typical usage: +// BlockBudget budget(state->batch_size(), state->preferred_block_size_bytes()); +// size_t eff = budget.effective_max_rows(estimated_row_bytes); +// while (budget.within_budget(block.rows(), block.bytes())) { ... } +// +struct BlockBudget { + size_t max_rows; + size_t max_bytes; // byte budget from preferred_block_size_bytes(), 0 means disabled + + BlockBudget(size_t max_rows_, size_t max_bytes_) : max_rows(max_rows_), max_bytes(max_bytes_) {} + + // Pre-compute effective row limit from an estimated average row byte size. + // When max_bytes == 0 or estimated_row_bytes == 0, returns max_rows. + // Always returns at least 1. + size_t effective_max_rows(size_t estimated_row_bytes) const { + if (max_bytes > 0 && estimated_row_bytes > 0) { + size_t bytes_limit = max_bytes / estimated_row_bytes; + return std::max(size_t(1), std::min(max_rows, bytes_limit)); + } + return max_rows; + } + + // Check if a block with the given rows/bytes is still within budget. + // Use this in loop *continuation* conditions (while/for). + bool within_budget(size_t rows, size_t bytes) const { + return rows < max_rows && (max_bytes == 0 || bytes < max_bytes); + } + + // Check if a block with the given rows/bytes has exceeded the budget. + // Use this in loop *break* conditions. 
+ bool exceeded(size_t rows, size_t bytes) const { + return rows >= max_rows || (max_bytes > 0 && bytes >= max_bytes); + } + + // Compute how many more rows can be added to a block that currently + // has current_rows rows and current_bytes bytes, respecting both the + // row cap and the byte budget. + // The 3-arg overload accepts an explicit estimated_row_bytes (useful when + // the estimate comes from a different source, e.g. a child block). + // The 2-arg overload derives the estimate from current_bytes / current_rows. + // Returns 0 when the block is already at or over budget. + size_t remaining_rows(size_t current_rows, size_t current_bytes, + size_t estimated_row_bytes) const { + size_t row_capacity = (current_rows < max_rows) ? (max_rows - current_rows) : 0; + if (max_bytes > 0 && estimated_row_bytes > 0) { + if (current_bytes >= max_bytes) { + return 0; + } + size_t byte_capacity = (max_bytes - current_bytes) / estimated_row_bytes; + row_capacity = std::min(row_capacity, byte_capacity); + } + return row_capacity; + } + + size_t remaining_rows(size_t current_rows, size_t current_bytes) const { + size_t estimated = + (current_rows > 0 && current_bytes > 0) ? (current_bytes / current_rows) : 0; + return remaining_rows(current_rows, current_bytes, estimated); + } +}; + +} // namespace doris diff --git a/be/test/common/block_budget_test.cpp b/be/test/common/block_budget_test.cpp new file mode 100644 index 00000000000000..6eb6a7be7a57e0 --- /dev/null +++ b/be/test/common/block_budget_test.cpp @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/block_budget.h" + +#include + +namespace doris { + +class BlockBudgetTest : public ::testing::Test {}; + +// ── effective_max_rows ────────────────────────────────────────────────────── + +TEST_F(BlockBudgetTest, EffectiveMaxRowsNoByteBudget) { + BlockBudget b(4096, 0); + EXPECT_EQ(b.effective_max_rows(100), 4096); + EXPECT_EQ(b.effective_max_rows(0), 4096); +} + +TEST_F(BlockBudgetTest, EffectiveMaxRowsZeroEstimate) { + BlockBudget b(4096, 8 * 1024 * 1024); + // When estimate is 0, fall back to max_rows. 
+ EXPECT_EQ(b.effective_max_rows(0), 4096); +} + +TEST_F(BlockBudgetTest, EffectiveMaxRowsByteLimited) { + // 8 MB budget, 10 KB per row → 819 rows (< 4096 max_rows) + BlockBudget b(4096, 8 * 1024 * 1024); + EXPECT_EQ(b.effective_max_rows(10 * 1024), 819); +} + +TEST_F(BlockBudgetTest, EffectiveMaxRowsRowLimited) { + // 8 MB budget, 10 bytes per row → 838860 rows, but max_rows = 4096 + BlockBudget b(4096, 8 * 1024 * 1024); + EXPECT_EQ(b.effective_max_rows(10), 4096); +} + +TEST_F(BlockBudgetTest, EffectiveMaxRowsReturnsAtLeastOne) { + // Huge rows: 100 MB per row, 8 MB budget → 0, but clamped to 1 + BlockBudget b(4096, 8 * 1024 * 1024); + EXPECT_EQ(b.effective_max_rows(100 * 1024 * 1024), 1); +} + +// ── within_budget / exceeded ──────────────────────────────────────────────── + +TEST_F(BlockBudgetTest, WithinBudgetNoByteBudget) { + BlockBudget b(100, 0); + EXPECT_TRUE(b.within_budget(0, 0)); + EXPECT_TRUE(b.within_budget(99, 999999999)); + EXPECT_FALSE(b.within_budget(100, 0)); + EXPECT_FALSE(b.within_budget(200, 0)); +} + +TEST_F(BlockBudgetTest, WithinBudgetWithByteBudget) { + BlockBudget b(100, 1000); + EXPECT_TRUE(b.within_budget(50, 500)); // both under + EXPECT_FALSE(b.within_budget(100, 500)); // rows hit + EXPECT_FALSE(b.within_budget(50, 1000)); // bytes hit + EXPECT_FALSE(b.within_budget(100, 1000)); // both hit +} + +TEST_F(BlockBudgetTest, ExceededIsInverseOfWithinBudget) { + BlockBudget b(100, 1000); + // Note: exceeded uses >=, within_budget uses <, so they should be + // perfect logical inverses. + for (size_t r : {0, 50, 99, 100, 200}) { + for (size_t bytes : {0, 500, 999, 1000, 2000}) { + EXPECT_EQ(b.exceeded(r, bytes), !b.within_budget(r, bytes)) + << "r=" << r << " bytes=" << bytes; + } + } +} + +// ── remaining_rows ────────────────────────────────────────────────────────── + +TEST_F(BlockBudgetTest, RemainingRowsNoByteBudget) { + BlockBudget b(100, 0); + EXPECT_EQ(b.remaining_rows(0, 0), 100); + EXPECT_EQ(b.remaining_rows(60, 9999), 40); + EXPECT_EQ(b.remaining_rows(100, 0), 0); + EXPECT_EQ(b.remaining_rows(200, 0), 0); +} + +TEST_F(BlockBudgetTest, RemainingRowsByteLimited) { + // max_rows=100, max_bytes=1000, current: 50 rows, 600 bytes + // avg = 12 bytes/row, byte_capacity = (1000-600)/12 = 33 + // row_capacity = 100 - 50 = 50 + // result = min(50, 33) = 33 + BlockBudget b(100, 1000); + EXPECT_EQ(b.remaining_rows(50, 600), 33); +} + +TEST_F(BlockBudgetTest, RemainingRowsAlreadyOverByteBudget) { + BlockBudget b(100, 1000); + EXPECT_EQ(b.remaining_rows(50, 1000), 0); + EXPECT_EQ(b.remaining_rows(50, 2000), 0); +} + +TEST_F(BlockBudgetTest, RemainingRowsZeroCurrentRows) { + // No rows yet → can't estimate avg_row_bytes, fall back to row capacity + BlockBudget b(100, 1000); + EXPECT_EQ(b.remaining_rows(0, 0), 100); +} + +TEST_F(BlockBudgetTest, RemainingRowsZeroCurrentBytes) { + // Has rows but zero bytes → can't estimate avg, fall back to row capacity + BlockBudget b(100, 1000); + EXPECT_EQ(b.remaining_rows(50, 0), 50); +} + +} // namespace doris diff --git a/be/test/exec/pipeline/local_exchanger_test.cpp b/be/test/exec/pipeline/local_exchanger_test.cpp index 2a1bb3ddfc2785..68e42efe35c0d4 100644 --- a/be/test/exec/pipeline/local_exchanger_test.cpp +++ b/be/test/exec/pipeline/local_exchanger_test.cpp @@ -134,7 +134,8 @@ TEST_F(LocalExchangerTest, ShuffleExchanger) { auto* get_block_failed_counter = ADD_TIMER(profile, "_get_block_failed_counter" + std::to_string(i)); auto* copy_data_timer = ADD_TIMER(profile, "_copy_data_timer" + std::to_string(i)); - _local_states[i] 
= std::make_unique(nullptr, nullptr); + _local_states[i] = + std::make_unique(_runtime_state.get(), nullptr); _local_states[i]->_exchanger = shared_state->exchanger.get(); _local_states[i]->_get_block_failed_counter = get_block_failed_counter; _local_states[i]->_copy_data_timer = copy_data_timer; @@ -362,7 +363,8 @@ TEST_F(LocalExchangerTest, PassthroughExchanger) { auto* get_block_failed_counter = ADD_TIMER(profile, "_get_block_failed_counter" + std::to_string(i)); auto* copy_data_timer = ADD_TIMER(profile, "_copy_data_timer" + std::to_string(i)); - _local_states[i] = std::make_unique(nullptr, nullptr); + _local_states[i] = + std::make_unique(_runtime_state.get(), nullptr); _local_states[i]->_exchanger = shared_state->exchanger.get(); _local_states[i]->_get_block_failed_counter = get_block_failed_counter; _local_states[i]->_copy_data_timer = copy_data_timer; @@ -562,7 +564,8 @@ TEST_F(LocalExchangerTest, PassToOneExchanger) { auto* get_block_failed_counter = ADD_TIMER(profile, "_get_block_failed_counter" + std::to_string(i)); auto* copy_data_timer = ADD_TIMER(profile, "_copy_data_timer" + std::to_string(i)); - _local_states[i] = std::make_unique(nullptr, nullptr); + _local_states[i] = + std::make_unique(_runtime_state.get(), nullptr); _local_states[i]->_exchanger = shared_state->exchanger.get(); _local_states[i]->_get_block_failed_counter = get_block_failed_counter; _local_states[i]->_copy_data_timer = copy_data_timer; @@ -770,7 +773,8 @@ TEST_F(LocalExchangerTest, BroadcastExchanger) { auto* get_block_failed_counter = ADD_TIMER(profile, "_get_block_failed_counter" + std::to_string(i)); auto* copy_data_timer = ADD_TIMER(profile, "_copy_data_timer" + std::to_string(i)); - _local_states[i] = std::make_unique(nullptr, nullptr); + _local_states[i] = + std::make_unique(_runtime_state.get(), nullptr); _local_states[i]->_exchanger = shared_state->exchanger.get(); _local_states[i]->_get_block_failed_counter = get_block_failed_counter; _local_states[i]->_copy_data_timer = copy_data_timer; @@ -973,7 +977,8 @@ TEST_F(LocalExchangerTest, AdaptivePassthroughExchanger) { auto* get_block_failed_counter = ADD_TIMER(profile, "_get_block_failed_counter" + std::to_string(i)); auto* copy_data_timer = ADD_TIMER(profile, "_copy_data_timer" + std::to_string(i)); - _local_states[i] = std::make_unique(nullptr, nullptr); + _local_states[i] = + std::make_unique(_runtime_state.get(), nullptr); _local_states[i]->_exchanger = shared_state->exchanger.get(); _local_states[i]->_get_block_failed_counter = get_block_failed_counter; _local_states[i]->_copy_data_timer = copy_data_timer; @@ -1208,7 +1213,8 @@ TEST_F(LocalExchangerTest, TestShuffleExchangerWrongMap) { auto* get_block_failed_counter = ADD_TIMER(profile, "_get_block_failed_counter" + std::to_string(i)); auto* copy_data_timer = ADD_TIMER(profile, "_copy_data_timer" + std::to_string(i)); - _local_states[i] = std::make_unique(nullptr, nullptr); + _local_states[i] = + std::make_unique(_runtime_state.get(), nullptr); _local_states[i]->_exchanger = shared_state->exchanger.get(); _local_states[i]->_get_block_failed_counter = get_block_failed_counter; _local_states[i]->_copy_data_timer = copy_data_timer; diff --git a/be/test/format/csv/csv_reader_test.cpp b/be/test/format/csv/csv_reader_test.cpp new file mode 100644 index 00000000000000..498ce615302eef --- /dev/null +++ b/be/test/format/csv/csv_reader_test.cpp @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/csv/csv_reader.h" + +#include + +#include + +#include "testutil/mock/mock_runtime_state.h" + +namespace doris { + +// Test that set_batch_size stores the value correctly. +TEST(CsvReaderSetBatchSizeTest, SetBatchSizeStoresValue) { + TFileScanRangeParams params; + params.format_type = TFileFormatType::FORMAT_CSV_PLAIN; + params.__isset.file_attributes = true; + params.file_attributes.__isset.text_params = true; + params.file_attributes.text_params.column_separator = ","; + params.file_attributes.text_params.line_delimiter = "\n"; + + TFileRangeDesc range; + range.path = "/nonexistent/test.csv"; + range.start_offset = 0; + range.size = 0; + + auto runtime_state = std::make_unique(); + + std::vector file_slot_descs; + auto reader = CsvReader::create_unique(runtime_state.get(), nullptr, nullptr, params, range, + file_slot_descs, runtime_state->batch_size(), nullptr); + + // Default: _batch_size should be 0 (not set) + // After set_batch_size, it should store the value + reader->set_batch_size(128); + // We can only verify indirectly that it was stored; the value is used + // inside get_next_block(). Since we can't call get_next_block without + // a fully initialized reader, we verify the interface doesn't crash. + + reader->set_batch_size(256); + // Calling set_batch_size multiple times should be safe. + + reader->set_batch_size(0); + // Setting to 0 should revert to default behavior. +} + +// Test that set_batch_size is callable via the GenericReader interface. +TEST(CsvReaderSetBatchSizeTest, SetBatchSizeViaGenericInterface) { + TFileScanRangeParams params; + params.format_type = TFileFormatType::FORMAT_CSV_PLAIN; + params.__isset.file_attributes = true; + params.file_attributes.__isset.text_params = true; + params.file_attributes.text_params.column_separator = ","; + params.file_attributes.text_params.line_delimiter = "\n"; + + TFileRangeDesc range; + range.path = "/nonexistent/test.csv"; + range.start_offset = 0; + range.size = 0; + + auto runtime_state = std::make_unique(); + + std::vector file_slot_descs; + auto reader = CsvReader::create_unique(runtime_state.get(), nullptr, nullptr, params, range, + file_slot_descs, runtime_state->batch_size(), nullptr); + + // Access through base class pointer — this is how FileScanner calls it. + GenericReader* base_reader = reader.get(); + base_reader->set_batch_size(128); + base_reader->set_batch_size(4096); +} + +} // namespace doris diff --git a/be/test/format/json/json_reader_test.cpp b/be/test/format/json/json_reader_test.cpp new file mode 100644 index 00000000000000..920d3ea0f9f041 --- /dev/null +++ b/be/test/format/json/json_reader_test.cpp @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "format/json/new_json_reader.h" + +namespace doris { + +static constexpr size_t kDefaultBatchSize = 4064; + +// Test that set_batch_size stores the value correctly. +TEST(NewJsonReaderSetBatchSizeTest, SetBatchSizeStoresValue) { + TFileScanRangeParams params; + params.format_type = TFileFormatType::FORMAT_JSON; + params.__isset.file_attributes = true; + params.file_attributes.__isset.text_params = true; + params.file_attributes.text_params.line_delimiter = "\n"; + + TFileRangeDesc range; + range.path = "/nonexistent/test.json"; + range.start_offset = 0; + range.size = 0; + + std::vector file_slot_descs; + // Use the second constructor (profile, params, range, file_slot_descs, io_ctx) + // to avoid the first constructor's ADD_TIMER(_profile, ...) which crashes on nullptr. + auto reader = NewJsonReader::create_unique(nullptr, params, range, file_slot_descs, + kDefaultBatchSize, nullptr); + + // Default: _batch_size is initialized to _MIN_BATCH_SIZE. + EXPECT_EQ(reader->get_batch_size(), 4064U); + + // After set_batch_size, it should store the value (clamped to >=_MIN_BATCH_SIZE). + reader->set_batch_size(8192); + EXPECT_EQ(reader->get_batch_size(), 8192U); + + // Calling set_batch_size multiple times should update the value. + reader->set_batch_size(16384); + EXPECT_EQ(reader->get_batch_size(), 16384U); + + // Setting below _MIN_BATCH_SIZE (or 0) clamps to 1 so the + // reader never spins on empty blocks. + reader->set_batch_size(0); + EXPECT_EQ(reader->get_batch_size(), 1UL); +} + +// Test that set_batch_size is callable via the GenericReader interface. +TEST(NewJsonReaderSetBatchSizeTest, SetBatchSizeViaGenericInterface) { + TFileScanRangeParams params; + params.format_type = TFileFormatType::FORMAT_JSON; + params.__isset.file_attributes = true; + params.file_attributes.__isset.text_params = true; + params.file_attributes.text_params.line_delimiter = "\n"; + + TFileRangeDesc range; + range.path = "/nonexistent/test.json"; + range.start_offset = 0; + range.size = 0; + + std::vector file_slot_descs; + // Use the second constructor to avoid nullptr profile crash in ADD_TIMER. + auto reader = NewJsonReader::create_unique(nullptr, params, range, file_slot_descs, + kDefaultBatchSize, nullptr); + + // Access through base class pointer — this is how FileScanner calls it. 
+ GenericReader* base_reader = reader.get(); + base_reader->set_batch_size(8192); + EXPECT_EQ(base_reader->get_batch_size(), 8192U); + base_reader->set_batch_size(4096); + EXPECT_EQ(base_reader->get_batch_size(), 4096U); +} + +} // namespace doris diff --git a/be/test/format/orc/orc_convert_dict_test.cpp b/be/test/format/orc/orc_convert_dict_test.cpp index 0e64590e16014c..3a4333f5f27c49 100644 --- a/be/test/format/orc/orc_convert_dict_test.cpp +++ b/be/test/format/orc/orc_convert_dict_test.cpp @@ -81,7 +81,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnBasic) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); // Execute conversion auto result_column = reader->_convert_dict_column_to_string_column( @@ -118,7 +118,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnWithNulls) { TFileScanRangeParams params; TFileRangeDesc range; - auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto _reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); // Execute conversion auto result_column = _reader->_convert_dict_column_to_string_column( @@ -150,7 +150,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnChar) { auto orc_type_ptr = createPrimitiveType(orc::TypeKind::CHAR); TFileScanRangeParams params; TFileRangeDesc range; - auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto _reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); // Execute conversion auto result_column = _reader->_convert_dict_column_to_string_column( @@ -181,7 +181,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnEmpty) { auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING); TFileScanRangeParams params; TFileRangeDesc range; - auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto _reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); // Execute conversion auto result_column = _reader->_convert_dict_column_to_string_column( dict_column.get(), nullptr, string_batch.get(), orc_type_ptr.get()); @@ -213,7 +213,7 @@ TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnMixed) { auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING); TFileScanRangeParams params; TFileRangeDesc range; - auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto _reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); // Execute conversion auto result_column = _reader->_convert_dict_column_to_string_column( dict_column.get(), &null_map, string_batch.get(), orc_type_ptr.get()); diff --git a/be/test/format/orc/orc_reader_fill_data_test.cpp b/be/test/format/orc/orc_reader_fill_data_test.cpp index 574f7cf26a7fdb..12c1dd209c585b 100644 --- a/be/test/format/orc/orc_reader_fill_data_test.cpp +++ b/be/test/format/orc/orc_reader_fill_data_test.cpp @@ -80,7 +80,7 @@ TEST_F(OrcReaderFillDataTest, TestFillLongColumn) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); MutableColumnPtr xx = column->assume_mutable(); @@ -106,7 +106,7 @@ TEST_F(OrcReaderFillDataTest, 
TestFillLongColumnWithNull) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); MutableColumnPtr xx = column->assume_mutable(); @@ -160,7 +160,7 @@ TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); auto doris_struct_type = std::make_shared( std::vector { @@ -246,7 +246,7 @@ TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); auto doris_struct_type = std::make_shared( std::vector {std::make_shared(), @@ -332,7 +332,7 @@ TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); auto doris_struct_type = std::make_shared( std::vector {std::make_shared(18, 5)}, @@ -446,7 +446,7 @@ TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); auto doris_struct_type = std::make_shared(std::make_shared(), std::make_shared()); diff --git a/be/test/format/orc/orc_reader_init_column_test.cpp b/be/test/format/orc/orc_reader_init_column_test.cpp index 4005edcf8fb7e6..00d165dedfc810 100644 --- a/be/test/format/orc/orc_reader_init_column_test.cpp +++ b/be/test/format/orc/orc_reader_init_column_test.cpp @@ -53,7 +53,7 @@ TEST_F(OrcReaderInitColumnTest, InitReadColumn) { TFileScanRangeParams params; TFileRangeDesc range; - auto reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); reader->_reader = std::move(orc_reader); std::vector tmp; tmp.emplace_back("col1"); @@ -72,7 +72,7 @@ TEST_F(OrcReaderInitColumnTest, CheckAcidSchemaTest) { using namespace orc; TFileScanRangeParams params; TFileRangeDesc range; - auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto _reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); // 1. Test standard ACID schema { // Create standard ACID structure @@ -139,7 +139,7 @@ TEST_F(OrcReaderInitColumnTest, RemoveAcidTest) { using namespace orc; TFileScanRangeParams params; TFileRangeDesc range; - auto _reader = OrcReader::create_unique(params, range, "", nullptr, nullptr, true); + auto _reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, true); // 1. 
Test removing ACID info from ACID schema { // Create ACID schema diff --git a/be/test/format/orc/orc_reader_test.cpp b/be/test/format/orc/orc_reader_test.cpp index 932b345c461525..dd8ecbe720ec97 100644 --- a/be/test/format/orc/orc_reader_test.cpp +++ b/be/test/format/orc/orc_reader_test.cpp @@ -83,7 +83,7 @@ class OrcReaderTest : public testing::Test { range.path = "./be/test/exec/test_data/orc_scanner/orders.orc"; range.start_offset = 0; range.size = 1293; - auto reader = OrcReader::create_unique(params, range, "", nullptr, &cache, true); + auto reader = OrcReader::create_unique(params, range, 4096, "", nullptr, &cache, true); auto status = reader->init_reader(&column_names, &col_name_to_block_idx, {}, false, tuple_desc, &row_desc, nullptr, nullptr); EXPECT_TRUE(status.ok()); diff --git a/be/test/runtime/runtime_state_block_budget_test.cpp b/be/test/runtime/runtime_state_block_budget_test.cpp new file mode 100644 index 00000000000000..22ebc5ebf8a0ee --- /dev/null +++ b/be/test/runtime/runtime_state_block_budget_test.cpp @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
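+
+// Expected behaviour exercised by the cases below (inferred from these tests,
+// not from the RuntimeState implementation itself):
+//   - batch_size(): defaults to 4062 and is clamped to the range [1, 65535].
+//   - preferred_block_size_bytes(): defaults to 8MB and is clamped to
+//     [1MB, 512MB] while config::enable_adaptive_batch_size is on; when the
+//     config is off it reports the 512MB cap, which effectively disables the
+//     byte budget.
+//   - MockRuntimeState: bypasses the byte clamping (while adaptive is on) so
+//     unit tests can inject arbitrary budgets, and returns its _batch_size
+//     member directly.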
+ +#include + +#include "common/config.h" +#include "runtime/runtime_state.h" +#include "testutil/mock/mock_runtime_state.h" +#include "util/block_budget.h" + +namespace doris { + +// --------------------------------------------------------------------------- +// RuntimeState::batch_size() +// --------------------------------------------------------------------------- + +class RuntimeStateBatchSizeTest : public ::testing::Test { +protected: + RuntimeState state; +}; + +TEST_F(RuntimeStateBatchSizeTest, DefaultWhenUnset) { + EXPECT_EQ(state.batch_size(), 4062); +} + +TEST_F(RuntimeStateBatchSizeTest, NormalValue) { + state._query_options.__set_batch_size(4096); + EXPECT_EQ(state.batch_size(), 4096); +} + +TEST_F(RuntimeStateBatchSizeTest, ClampToMin) { + state._query_options.__set_batch_size(0); + EXPECT_EQ(state.batch_size(), 1); + + state._query_options.__set_batch_size(-100); + EXPECT_EQ(state.batch_size(), 1); +} + +TEST_F(RuntimeStateBatchSizeTest, ClampToMax) { + state._query_options.__set_batch_size(100000); + EXPECT_EQ(state.batch_size(), 65535); +} + +TEST_F(RuntimeStateBatchSizeTest, ExactBoundaries) { + state._query_options.__set_batch_size(1); + EXPECT_EQ(state.batch_size(), 1); + + state._query_options.__set_batch_size(65535); + EXPECT_EQ(state.batch_size(), 65535); +} + +TEST_F(RuntimeStateBatchSizeTest, ConstructedBlockBudgetUsesBatchSizeRows) { + state._query_options.__set_batch_size(4096); + EXPECT_EQ(BlockBudget(state.batch_size(), state.preferred_block_size_bytes()).max_rows, 4096UL); +} + +// --------------------------------------------------------------------------- +// RuntimeState::preferred_block_size_bytes() +// --------------------------------------------------------------------------- + +class RuntimeStateAdaptiveBatchSizeTest : public ::testing::Test { +protected: + void SetUp() override { + _saved_enable_adaptive = config::enable_adaptive_batch_size; + config::enable_adaptive_batch_size = true; + } + + void TearDown() override { config::enable_adaptive_batch_size = _saved_enable_adaptive; } + + bool _saved_enable_adaptive = false; +}; + +class RuntimeStateBlockSizeBytesTest : public RuntimeStateAdaptiveBatchSizeTest { +protected: + RuntimeState state; +}; + +TEST_F(RuntimeStateBlockSizeBytesTest, DefaultWhenUnset) { + // Field not set → default 8MB. + EXPECT_EQ(state.preferred_block_size_bytes(), 8388608UL); +} + +TEST_F(RuntimeStateBlockSizeBytesTest, NormalValue) { + state._query_options.__set_preferred_block_size_bytes(4194304L); // 4MB + EXPECT_EQ(state.preferred_block_size_bytes(), 4194304UL); +} + +TEST_F(RuntimeStateBlockSizeBytesTest, ZeroClampsToMin) { + // FE rejects 0, but BE still clamps direct thrift / mixed-version inputs defensively. + state._query_options.__set_preferred_block_size_bytes(0); + EXPECT_EQ(state.preferred_block_size_bytes(), 1048576UL); +} + +TEST_F(RuntimeStateBlockSizeBytesTest, ClampToMin) { + // Non-zero values below 1MB should be clamped to 1MB. + state._query_options.__set_preferred_block_size_bytes(50); + EXPECT_EQ(state.preferred_block_size_bytes(), 1048576UL); // 1MB +} + +TEST_F(RuntimeStateBlockSizeBytesTest, ClampToMax) { + // Values above 512MB should be clamped to 512MB. 
+ state._query_options.__set_preferred_block_size_bytes(1073741824L); // 1GB + EXPECT_EQ(state.preferred_block_size_bytes(), 536870912UL); // 512MB +} + +TEST_F(RuntimeStateBlockSizeBytesTest, ExactBoundaries) { + state._query_options.__set_preferred_block_size_bytes(1048576L); // 1MB + EXPECT_EQ(state.preferred_block_size_bytes(), 1048576UL); + + state._query_options.__set_preferred_block_size_bytes(536870912L); // 512MB + EXPECT_EQ(state.preferred_block_size_bytes(), 536870912UL); +} + +TEST_F(RuntimeStateBlockSizeBytesTest, DisabledWhenConfigOff) { + config::enable_adaptive_batch_size = false; + state._query_options.__set_preferred_block_size_bytes(8388608L); + EXPECT_EQ(state.preferred_block_size_bytes(), 536870912UL); + EXPECT_EQ(BlockBudget(state.batch_size(), state.preferred_block_size_bytes()).max_bytes, + 536870912UL); +} + +// --------------------------------------------------------------------------- +// MockRuntimeState: verify the test override bypasses clamping +// --------------------------------------------------------------------------- + +class MockRuntimeStateBlockBudgetTest : public RuntimeStateAdaptiveBatchSizeTest { +protected: + MockRuntimeState state; +}; + +TEST_F(MockRuntimeStateBlockBudgetTest, PreferredBlockSizeBypassesClamping) { + state._query_options.__set_preferred_block_size_bytes(50); + EXPECT_EQ(state.preferred_block_size_bytes(), 50UL); +} + +TEST_F(MockRuntimeStateBlockBudgetTest, PreferredBlockSizeDefaultFallback) { + // When not set, falls back to base class default (8MB). + EXPECT_EQ(state.preferred_block_size_bytes(), 8388608UL); +} + +TEST_F(MockRuntimeStateBlockBudgetTest, BatchSizeOverride) { + // MockRuntimeState returns _batch_size member directly. + state._batch_size = 256; + EXPECT_EQ(state.batch_size(), 256); +} + +TEST_F(MockRuntimeStateBlockBudgetTest, ConfigOffStillDisablesAdaptiveBytes) { + config::enable_adaptive_batch_size = false; + state._query_options.__set_preferred_block_size_bytes(50); + EXPECT_EQ(state.preferred_block_size_bytes(), 536870912UL); +} + +} // namespace doris diff --git a/be/test/storage/compaction/segcompaction_mow_test.cpp b/be/test/storage/compaction/segcompaction_mow_test.cpp index 760a5d953aa693..6b57c081c53860 100644 --- a/be/test/storage/compaction/segcompaction_mow_test.cpp +++ b/be/test/storage/compaction/segcompaction_mow_test.cpp @@ -19,9 +19,14 @@ #include #include +#include +#include +#include +#include #include #include #include +#include #include #include "common/config.h" @@ -40,6 +45,7 @@ #include "storage/tablet/tablet_meta.h" #include "storage/tablet/tablet_schema.h" #include "storage/utils.h" +#include "util/debug_points.h" #include "util/slice.h" namespace doris { @@ -94,6 +100,8 @@ class SegCompactionMoWTest : public ::testing::TestWithParam { } void TearDown() { + DebugPoints::instance()->clear(); + config::enable_debug_points = false; config::enable_segcompaction = false; ExecEnv* exec_env = doris::ExecEnv::GetInstance(); s_engine = nullptr; @@ -130,6 +138,35 @@ class SegCompactionMoWTest : public ::testing::TestWithParam { return true; } + bool wait_until(const std::function& pred, int timeout_ms = 10000) { + auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms); + while (std::chrono::steady_clock::now() < deadline) { + if (pred()) { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + return pred(); + } + + Block create_int_block(TabletSchemaSPtr tablet_schema, uint32_t segment_id, + uint32_t rows_per_segment) { + Block 
block = tablet_schema->create_block(); + auto columns = block.mutate_columns(); + for (uint32_t rid = 0; rid < rows_per_segment; ++rid) { + uint32_t k1 = rid * 100 + segment_id; + uint32_t k2 = segment_id; + uint32_t k3 = rid; + uint32_t seq = 0; + columns[0]->insert_data(reinterpret_cast(&k1), sizeof(k1)); + columns[1]->insert_data(reinterpret_cast(&k2), sizeof(k2)); + columns[2]->insert_data(reinterpret_cast(&k3), sizeof(k3)); + columns[3]->insert_data(reinterpret_cast(&seq), sizeof(seq)); + } + block.set_columns(std::move(columns)); + return block; + } + // (k1 int, k2 varchar(20), k3 int) keys (k1, k2) void create_tablet_schema(TabletSchemaSPtr tablet_schema) { TabletSchemaPB tablet_schema_pb; @@ -823,6 +860,85 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { total_written_rows, rows_mark_deleted)); } +TEST_F(SegCompactionMoWTest, AsyncDeleteBitmapMustNotReadSegmentsDeletedBySegcompaction) { + config::enable_segcompaction = true; + config::enable_debug_points = true; + config::segcompaction_batch_size = 5; + config::segcompaction_candidate_max_rows = 1000; + config::segcompaction_candidate_max_bytes = 1 << 20; + + TabletSchemaSPtr tablet_schema = std::make_shared(); + create_tablet_schema(tablet_schema); + + RowsetWriterContext writer_context; + const int raw_rsid = 20052; + create_rowset_writer_context(raw_rsid, tablet_schema, &writer_context); + + DeleteBitmapPtr delete_bitmap = std::make_shared(TABLET_ID); + std::shared_ptr rsids {std::make_shared()}; + std::vector rowset_ptrs; + writer_context.mow_context = + std::make_shared(1, 1, rsids, rowset_ptrs, delete_bitmap); + + auto res = RowsetFactory::create_rowset_writer(*s_engine, writer_context, false); + ASSERT_TRUE(res.has_value()) << res.error(); + auto rowset_writer = std::move(res).value(); + + constexpr int32_t target_segment_id = 3; + std::atomic target_delete_bitmap_task_blocked {false}; + std::atomic release_target_delete_bitmap_task {false}; + DebugPoints::instance()->add_with_callback( + "BaseBetaRowsetWriter::_generate_delete_bitmap.block_before_load_segments", + std::function([&](int32_t segment_id) { + if (segment_id != target_segment_id) { + return; + } + target_delete_bitmap_task_blocked.store(true); + while (!release_target_delete_bitmap_task.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + })); + + auto target_segment_path = fmt::format("{}/{}_{}.dat", lTestDir, raw_rsid, target_segment_id); + bool blocked = false; + bool source_segment_deleted_before_release = false; + std::thread release_thread([&] { + blocked = wait_until([&] { return target_delete_bitmap_task_blocked.load(); }); + if (blocked) { + source_segment_deleted_before_release = + wait_until([&] { return !std::filesystem::exists(target_segment_path); }, 1000); + } + release_target_delete_bitmap_task.store(true); + DebugPoints::instance()->remove( + "BaseBetaRowsetWriter::_generate_delete_bitmap.block_before_load_segments"); + }); + + Status write_status = Status::OK(); + const uint32_t rows_per_segment = 128; + for (int32_t segment_id = 0; segment_id < 5; ++segment_id) { + auto block = create_int_block(tablet_schema, segment_id, rows_per_segment); + write_status = rowset_writer->add_block(&block); + if (!write_status.ok()) { + break; + } + write_status = rowset_writer->flush(); + if (!write_status.ok()) { + break; + } + } + release_thread.join(); + + RowsetSharedPtr rowset; + auto build_status = write_status.ok() ? 
rowset_writer->build(rowset) : write_status; + + ASSERT_TRUE(blocked) << "delete bitmap task did not reach the injected wait point"; + EXPECT_FALSE(source_segment_deleted_before_release) + << "segcompaction deleted source segment before delete bitmap finished: " + << target_segment_path; + EXPECT_TRUE(write_status.ok()) << write_status; + EXPECT_TRUE(build_status.ok()) << build_status; +} + TEST_F(SegCompactionMoWTest, SegCompactionNotTrigger) { config::enable_segcompaction = true; Status s; diff --git a/be/test/storage/compaction/vertical_compaction_test.cpp b/be/test/storage/compaction/vertical_compaction_test.cpp index 3b736857242caf..a39932e2a01fcf 100644 --- a/be/test/storage/compaction/vertical_compaction_test.cpp +++ b/be/test/storage/compaction/vertical_compaction_test.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -47,6 +48,7 @@ #include "io/io_common.h" #include "json2pb/json_to_pb.h" #include "runtime/exec_env.h" +#include "storage/compaction/compaction.h" #include "storage/delete/delete_handler.h" #include "storage/field.h" #include "storage/iterator/vertical_merge_iterator.h" @@ -68,6 +70,7 @@ #include "storage/tablet/tablet.h" #include "storage/tablet/tablet_meta.h" #include "storage/tablet/tablet_schema.h" +#include "storage/txn/txn_manager.h" #include "storage/utils.h" #include "util/uid_util.h" @@ -155,6 +158,67 @@ class VerticalCompactionTest : public ::testing::Test { return tablet_schema; } + TabletSchemaSPtr create_mow_cluster_key_schema(bool has_sequence_col = false) { + TabletSchemaSPtr tablet_schema = std::make_shared(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(UNIQUE_KEYS); + tablet_schema_pb.set_num_short_key_columns(1); + tablet_schema_pb.set_num_rows_per_row_block(1024); + tablet_schema_pb.set_compress_kind(COMPRESS_NONE); + tablet_schema_pb.set_next_column_unique_id(has_sequence_col ? 5 : 4); + tablet_schema_pb.add_cluster_key_uids(2); + if (has_sequence_col) { + tablet_schema_pb.set_sequence_col_idx(2); + tablet_schema_pb.add_cluster_key_uids(3); + } + + ColumnPB* column_1 = tablet_schema_pb.add_column(); + column_1->set_unique_id(1); + column_1->set_name("c1"); + column_1->set_type("INT"); + column_1->set_is_key(true); + column_1->set_length(4); + column_1->set_index_length(4); + column_1->set_is_nullable(false); + column_1->set_is_bf_column(false); + + ColumnPB* column_2 = tablet_schema_pb.add_column(); + column_2->set_unique_id(2); + column_2->set_name("c2"); + column_2->set_type("INT"); + column_2->set_length(4); + column_2->set_index_length(4); + column_2->set_is_key(false); + column_2->set_is_nullable(false); + column_2->set_is_bf_column(false); + + if (has_sequence_col) { + ColumnPB* column_3 = tablet_schema_pb.add_column(); + column_3->set_unique_id(3); + column_3->set_name("c3"); + column_3->set_type("INT"); + column_3->set_length(4); + column_3->set_index_length(4); + column_3->set_is_key(false); + column_3->set_is_nullable(false); + column_3->set_is_bf_column(false); + column_3->set_aggregation("NONE"); + } + + ColumnPB* delete_sign_column = tablet_schema_pb.add_column(); + delete_sign_column->set_unique_id(has_sequence_col ? 
4 : 3); + delete_sign_column->set_name(DELETE_SIGN); + delete_sign_column->set_type("TINYINT"); + delete_sign_column->set_length(1); + delete_sign_column->set_index_length(1); + delete_sign_column->set_is_key(false); + delete_sign_column->set_is_nullable(false); + delete_sign_column->set_is_bf_column(false); + + tablet_schema->init_from_pb(tablet_schema_pb); + return tablet_schema; + } + TabletSchemaSPtr create_agg_schema() { TabletSchemaSPtr tablet_schema = std::make_shared(); TabletSchemaPB tablet_schema_pb; @@ -206,6 +270,9 @@ class VerticalCompactionTest : public ::testing::Test { rowset_writer_context.version = version; rowset_writer_context.segments_overlap = overlap; rowset_writer_context.max_rows_per_segment = max_rows_per_segment; + rowset_writer_context.enable_unique_key_merge_on_write = + tablet_schema->keys_type() == UNIQUE_KEYS && + !tablet_schema->cluster_key_uids().empty(); inc_id++; return rowset_writer_context; } @@ -268,6 +335,54 @@ class VerticalCompactionTest : public ::testing::Test { return rowset; } + RowsetSharedPtr create_rowset_with_sequence( + TabletSchemaSPtr tablet_schema, const SegmentsOverlapPB& overlap, + std::vector>> rowset_data, + int64_t version) { + if (overlap == NONOVERLAPPING) { + for (auto i = 1; i < rowset_data.size(); i++) { + auto& last_seg_data = rowset_data[i - 1]; + auto& cur_seg_data = rowset_data[i]; + int64_t last_seg_max = std::get<0>(last_seg_data[last_seg_data.size() - 1]); + int64_t cur_seg_min = std::get<0>(cur_seg_data[0]); + EXPECT_LT(last_seg_max, cur_seg_min); + } + } + auto writer_context = create_rowset_writer_context(tablet_schema, overlap, UINT32_MAX, + {version, version}); + + auto res = RowsetFactory::create_rowset_writer(*engine_ref, writer_context, true); + EXPECT_TRUE(res.has_value()) << res.error(); + auto rowset_writer = std::move(res).value(); + + uint32_t num_rows = 0; + for (int i = 0; i < rowset_data.size(); ++i) { + Block block = tablet_schema->create_block(); + auto columns = block.mutate_columns(); + for (int rid = 0; rid < rowset_data[i].size(); ++rid) { + int32_t c1 = std::get<0>(rowset_data[i][rid]); + int32_t c2 = std::get<1>(rowset_data[i][rid]); + int32_t c3 = std::get<2>(rowset_data[i][rid]); + columns[0]->insert_data((const char*)&c1, sizeof(c1)); + columns[1]->insert_data((const char*)&c2, sizeof(c2)); + columns[2]->insert_data((const char*)&c3, sizeof(c3)); + uint8_t num = 0; + columns[3]->insert_data((const char*)&num, sizeof(num)); + num_rows++; + } + auto s = rowset_writer->add_block(&block); + EXPECT_TRUE(s.ok()); + s = rowset_writer->flush(); + EXPECT_TRUE(s.ok()); + } + + RowsetSharedPtr rowset; + EXPECT_EQ(Status::OK(), rowset_writer->build(rowset)); + EXPECT_EQ(rowset_data.size(), rowset->rowset_meta()->num_segments()); + EXPECT_EQ(num_rows, rowset->rowset_meta()->num_rows()); + return rowset; + } + void init_rs_meta(RowsetMetaSharedPtr& rs_meta, int64_t start, int64_t end) { std::string json_rowset_meta = R"({ "rowset_id": 540081, @@ -322,6 +437,12 @@ class VerticalCompactionTest : public ::testing::Test { } else if (tablet_schema.keys_type() == AGG_KEYS) { t_tablet_schema.__set_keys_type(TKeysType::AGG_KEYS); } + for (auto uid : tablet_schema.cluster_key_uids()) { + t_tablet_schema.cluster_key_uids.push_back(uid); + } + if (tablet_schema.has_sequence_col()) { + t_tablet_schema.__set_sequence_col_idx(tablet_schema.sequence_col_idx()); + } t_tablet_schema.__set_storage_type(TStorageType::COLUMN); t_tablet_schema.__set_columns(cols); TabletMetaSharedPtr tablet_meta( @@ -372,12 +493,56 @@ class 
VerticalCompactionTest : public ::testing::Test { } } + void commit_txn_with_delete_bitmap(TabletSharedPtr tablet, const RowsetSharedPtr& rowset, + int64_t txn_id, DeleteBitmapPtr delete_bitmap, + const RowsetIdUnorderedSet& rowset_ids) { + PUniqueId load_id; + load_id.set_hi(txn_id); + load_id.set_lo(txn_id); + auto status = engine_ref->txn_manager()->prepare_txn(tablet->partition_id(), *tablet, + txn_id, load_id); + ASSERT_TRUE(status.ok()) << status; + status = engine_ref->txn_manager()->commit_txn(tablet->partition_id(), *tablet, txn_id, + load_id, rowset, {}, false); + ASSERT_TRUE(status.ok()) << status; + engine_ref->txn_manager()->set_txn_related_delete_bitmap( + tablet->partition_id(), txn_id, tablet->tablet_id(), tablet->tablet_uid(), true, + delete_bitmap, rowset_ids, nullptr); + } + private: const std::string kTestDir = "/ut_dir/vertical_compaction_test"; std::string absolute_dir; DataDir* _data_dir = nullptr; }; +class TestCompactionMixin : public CompactionMixin { +public: + TestCompactionMixin(StorageEngine& engine, TabletSharedPtr tablet) + : CompactionMixin(engine, std::move(tablet), "TestCompactionMixin") {} + + Status prepare_compact() override { return Status::OK(); } + + Status modify_rowsets_for_test(std::vector input_rowsets, + RowsetSharedPtr output_rowset, + std::unique_ptr rowid_conversion) { + _input_rowsets = std::move(input_rowsets); + _output_rowset = std::move(output_rowset); + _rowid_conversion = std::move(rowid_conversion); + _stats.rowid_conversion = _rowid_conversion.get(); + auto st = modify_rowsets(); + if (st.ok()) { + _state = CompactionState::SUCCESS; + } + return st; + } + +private: + std::string_view compaction_name() const override { return "test compaction"; } + + ReaderType compaction_type() const override { return ReaderType::READER_CUMULATIVE_COMPACTION; } +}; + TEST_F(VerticalCompactionTest, TestRowSourcesBuffer) { RowSourcesBuffer buffer(100, absolute_dir, ReaderType::READER_CUMULATIVE_COMPACTION); RowSource s1(0, 0); @@ -745,6 +910,165 @@ TEST_F(VerticalCompactionTest, TestUniqueKeyVerticalMerge) { } } +TEST_F(VerticalCompactionTest, ClusterKeyMowCompactionNeedsOutputRowsetInternalDedup) { + TabletSchemaSPtr tablet_schema = create_mow_cluster_key_schema(); + TabletSharedPtr tablet = create_tablet(*tablet_schema, true); + + std::vector input_rowsets; + input_rowsets.push_back(create_rowset(tablet_schema, NONOVERLAPPING, {{{1, 30}}}, 2)); + input_rowsets.push_back(create_rowset(tablet_schema, NONOVERLAPPING, {{{2, 10}, {1, 20}}}, 3)); + + std::vector input_rs_readers; + for (auto& rowset : input_rowsets) { + RowsetReaderSharedPtr rs_reader; + ASSERT_TRUE(rowset->create_reader(&rs_reader).ok()); + input_rs_readers.push_back(std::move(rs_reader)); + } + + auto writer_context = create_rowset_writer_context(tablet_schema, NONOVERLAPPING, 1024, {2, 3}); + auto res = RowsetFactory::create_rowset_writer(*engine_ref, writer_context, true); + ASSERT_TRUE(res.has_value()) << res.error(); + auto output_rs_writer = std::move(res).value(); + + Merger::Statistics stats; + RowIdConversion rowid_conversion; + stats.rowid_conversion = &rowid_conversion; + auto st = Merger::vertical_merge_rowsets(tablet, ReaderType::READER_CUMULATIVE_COMPACTION, + *tablet_schema, input_rs_readers, + output_rs_writer.get(), 1024, 1, &stats); + ASSERT_TRUE(st.ok()) << st; + + RowsetSharedPtr output_rowset; + ASSERT_EQ(Status::OK(), output_rs_writer->build(output_rowset)); + ASSERT_NE(output_rowset, nullptr); + ASSERT_EQ(1, output_rowset->num_segments()); + ASSERT_EQ(3, 
output_rowset->num_rows()); + ASSERT_EQ(0, stats.merged_rows); + + RowsetReaderContext reader_context; + reader_context.tablet_schema = tablet_schema; + reader_context.need_ordered_result = false; + std::vector return_columns = {0, 1}; + reader_context.return_columns = &return_columns; + RowsetReaderSharedPtr output_rs_reader; + create_and_init_rowset_reader(output_rowset.get(), reader_context, &output_rs_reader); + + std::vector> output_data; + do { + Block output_block = tablet_schema->create_block(); + st = output_rs_reader->next_batch(&output_block); + auto columns = output_block.get_columns_with_type_and_name(); + ASSERT_GE(columns.size(), 2); + for (auto i = 0; i < output_block.rows(); i++) { + output_data.emplace_back(columns[0].column->get_int(i), columns[1].column->get_int(i)); + } + } while (st.ok()); + ASSERT_TRUE(st.is()) << st; + + ASSERT_EQ(3, output_data.size()); + EXPECT_EQ(output_data[0], std::make_tuple(int64_t {2}, int64_t {10})); + EXPECT_EQ(output_data[1], std::make_tuple(int64_t {1}, int64_t {20})); + EXPECT_EQ(output_data[2], std::make_tuple(int64_t {1}, int64_t {30})); + + DeleteBitmap input_delete_bitmap(tablet->tablet_id()); + DeleteBitmap output_delete_bitmap(tablet->tablet_id()); + tablet->calc_compaction_output_rowset_delete_bitmap(input_rowsets, rowid_conversion, 0, + UINT64_MAX, nullptr, nullptr, + input_delete_bitmap, &output_delete_bitmap); + st = tablet->calc_compaction_output_rowset_internal_delete_bitmap( + input_rowsets, output_rowset, rowid_conversion, &output_delete_bitmap); + ASSERT_TRUE(st.ok()) << st; + + std::set visible_keys; + auto deleted_rows = output_delete_bitmap.get_agg({output_rowset->rowset_id(), 0, UINT64_MAX}); + for (uint32_t row_id = 0; row_id < output_data.size(); ++row_id) { + if (deleted_rows->contains(row_id)) { + continue; + } + ASSERT_TRUE(visible_keys.insert(std::get<0>(output_data[row_id])).second) + << "unique key should not be duplicated after cluster-key MOW compaction"; + } +} + +TEST_F(VerticalCompactionTest, + ClusterKeyMowCompactionWithSequenceKeepsTxnInternalDedupDeleteBitmap) { + TabletSchemaSPtr tablet_schema = create_mow_cluster_key_schema(true); + TabletSharedPtr tablet = create_tablet(*tablet_schema, true); + + std::vector input_rowsets; + input_rowsets.push_back( + create_rowset_with_sequence(tablet_schema, NONOVERLAPPING, {{{1, 30, 30}}}, 2)); + input_rowsets.push_back(create_rowset_with_sequence(tablet_schema, NONOVERLAPPING, + {{{2, 10, 10}, {1, 20, 20}}}, 3)); + for (auto& rowset : input_rowsets) { + ASSERT_TRUE(tablet->add_rowset(rowset).ok()); + } + + auto writer_context = create_rowset_writer_context(tablet_schema, NONOVERLAPPING, 1024, {2, 3}); + auto res = RowsetFactory::create_rowset_writer(*engine_ref, writer_context, true); + ASSERT_TRUE(res.has_value()) << res.error(); + auto output_rs_writer = std::move(res).value(); + + Block block = tablet_schema->create_block(); + auto columns = block.mutate_columns(); + std::vector> output_rows = { + {2, 10, 10}, {1, 20, 20}, {1, 30, 30}}; + for (auto& [c1, c2, c3] : output_rows) { + columns[0]->insert_data((const char*)&c1, sizeof(c1)); + columns[1]->insert_data((const char*)&c2, sizeof(c2)); + columns[2]->insert_data((const char*)&c3, sizeof(c3)); + uint8_t delete_sign = 0; + columns[3]->insert_data((const char*)&delete_sign, sizeof(delete_sign)); + } + auto st = output_rs_writer->add_block(&block); + ASSERT_TRUE(st.ok()) << st; + st = output_rs_writer->flush(); + ASSERT_TRUE(st.ok()) << st; + + RowsetSharedPtr output_rowset; + ASSERT_EQ(Status::OK(), 
output_rs_writer->build(output_rowset)); + ASSERT_NE(output_rowset, nullptr); + ASSERT_EQ(3, output_rowset->num_rows()); + + auto rowid_conversion = std::make_unique(); + ASSERT_TRUE(rowid_conversion->init_segment_map(input_rowsets[0]->rowset_id(), {1}).ok()); + ASSERT_TRUE(rowid_conversion->init_segment_map(input_rowsets[1]->rowset_id(), {2}).ok()); + rowid_conversion->set_dst_rowset_id(output_rowset->rowset_id()); + rowid_conversion->add({RowLocation(input_rowsets[1]->rowset_id(), 0, 0), + RowLocation(input_rowsets[1]->rowset_id(), 0, 1), + RowLocation(input_rowsets[0]->rowset_id(), 0, 0)}, + {3}); + + auto committed_rowset = + create_rowset_with_sequence(tablet_schema, NONOVERLAPPING, {{{3, 40, 40}}}, 4); + RowsetIdUnorderedSet txn_rowset_ids; + for (auto& rowset : input_rowsets) { + txn_rowset_ids.insert(rowset->rowset_id()); + } + txn_rowset_ids.insert(committed_rowset->rowset_id()); + auto txn_delete_bitmap = std::make_shared(tablet->tablet_id()); + constexpr int64_t txn_id = 10001; + commit_txn_with_delete_bitmap(tablet, committed_rowset, txn_id, txn_delete_bitmap, + txn_rowset_ids); + + TestCompactionMixin compaction(*engine_ref, tablet); + st = compaction.modify_rowsets_for_test(input_rowsets, output_rowset, + std::move(rowid_conversion)); + ASSERT_TRUE(st.ok()) << st; + + CommitTabletTxnInfoVec commit_tablet_txn_info_vec {}; + engine_ref->txn_manager()->get_all_commit_tablet_txn_info_by_tablet( + *tablet, &commit_tablet_txn_info_vec); + ASSERT_EQ(1, commit_tablet_txn_info_vec.size()); + + auto deleted_rows = commit_tablet_txn_info_vec[0].delete_bitmap->get_agg( + {output_rowset->rowset_id(), 0, UINT64_MAX}); + ASSERT_TRUE(deleted_rows->contains(1)) + << "committed txn delete bitmap must keep the output rowset internal dedup row"; + ASSERT_FALSE(deleted_rows->contains(2)) + << "the higher sequence row should stay visible after compaction"; +} + TEST_F(VerticalCompactionTest, TestDupKeyVerticalMergeWithDelete) { auto num_input_rowset = 2; auto num_segments = 2; diff --git a/be/test/storage/iterator/block_reader_agg_flush_test.cpp b/be/test/storage/iterator/block_reader_agg_flush_test.cpp new file mode 100644 index 00000000000000..a9c0a4a4818d82 --- /dev/null +++ b/be/test/storage/iterator/block_reader_agg_flush_test.cpp @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Verifies that BlockReader's aggregation buffer flush triggered by +// `_stored_row_ref.size() == batch_max_rows()` (block_reader.cpp:639) does not +// corrupt the final aggregated value when a single agg group spans multiple +// flush windows. Drives `_append_agg_data` / `_update_agg_data` directly. 
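+//
+// Rough trace of the interesting case below (batch_max_rows = 4, one group of
+// 10 rows with values 1..10), reconstructed from the assertions rather than
+// from block_reader.cpp:
+//   i = 0..2 : rows accumulate in _stored_row_ref (size 1..3)
+//   i = 3    : size hits 4 -> periodic flush drains 1+2+3+4 into the agg place
+//   i = 4..6 : buffer refills (size 1..3)
+//   i = 7    : size hits 4 -> periodic flush drains 5+6+7+8
+//   i = 8..9 : buffer refills; the is_last flush at i = 9 drains 9+10
+//   group end: _update_agg_data finalizes the place, which must yield 55.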
+ +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wkeyword-macro" +#endif +#define private public +#define protected public +#include "storage/iterator/block_reader.h" +#undef private +#undef protected +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#include + +#include +#include + +#include "agent/be_exec_version_manager.h" +#include "common/config.h" +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" +#include "exprs/aggregate/aggregate_function_simple_factory.h" + +namespace doris { + +namespace { + +// Builds a 2-column source block: key (Int64, all set to `key_value`) and value +// (Int64, set to 1..n_rows). The value column is what we aggregate over. +std::unique_ptr make_source_block(size_t n_rows, int64_t key_value) { + auto block = Block::create_unique(); + auto key_type = std::make_shared(); + auto val_type = std::make_shared(); + + auto key_col = ColumnInt64::create(); + auto val_col = ColumnInt64::create(); + for (size_t i = 0; i < n_rows; ++i) { + key_col->insert_value(key_value); + val_col->insert_value(static_cast(i + 1)); + } + block->insert({std::move(key_col), key_type, "k"}); + block->insert({std::move(val_col), val_type, "v"}); + return block; +} + +// Mirror BlockReader::_init_agg_state's _stored_data_columns sizing: cloned +// struct of the source block, pre-filled with `n_rows` default rows so that +// non-variable-length agg columns can be written via replace_column_data. +MutableColumns make_stored_columns(const Block& src_block, size_t n_rows) { + return src_block.create_same_struct_block(n_rows)->mutate_columns(); +} + +MutableColumns make_target_columns() { + MutableColumns cols; + cols.push_back(ColumnInt64::create()); // key (untouched in this test) + cols.push_back(ColumnInt64::create()); // agg result column + return cols; +} + +// Configure `reader` as if it had completed init() for an AGG_KEYS table with +// schema {key: Int64, value: Int64} and SUM aggregation over `value`. +void configure_reader_for_int64_sum(BlockReader& reader, const Block& src_block, + size_t batch_max_rows) { + reader._reader_context.batch_size = batch_max_rows; + // Adaptive disabled so batch_max_rows() == _reader_context.batch_size. + config::enable_adaptive_batch_size = false; + + // Column layout: [0]=key, [1]=agg value. Output layout matches input. + reader._normal_columns_idx = {0}; + reader._agg_columns_idx = {1}; + reader._return_columns_loc = {0, 1}; + + reader._stored_data_columns = make_stored_columns(src_block, batch_max_rows); + reader._stored_has_null_tag.assign(reader._stored_data_columns.size(), false); + reader._stored_has_variable_length_tag.assign(reader._stored_data_columns.size(), false); + + auto fn = AggregateFunctionSimpleFactory::instance().get( + "sum", {std::make_shared()}, std::make_shared(), + /*result_nullable=*/false, BeExecVersionManager::get_newest_version(), + {.column_names = {}}); + ASSERT_TRUE(fn != nullptr); + + auto* place = new char[fn->size_of_data()]; + fn->create(place); + reader._agg_functions.push_back(fn); + reader._agg_places.push_back(place); + // Destructor (BlockReader::~BlockReader) cleans up _agg_places. 
+} + +int64_t read_int64(const IColumn& col, size_t row) { + return assert_cast(col).get_data()[row]; +} + +} // namespace + +class BlockReaderAggFlushTest : public testing::Test { +protected: + void SetUp() override { _saved_enable_adaptive = config::enable_adaptive_batch_size; } + + void TearDown() override { config::enable_adaptive_batch_size = _saved_enable_adaptive; } + + bool _saved_enable_adaptive = false; +}; + +// Sanity baseline: a single group whose size is below batch_max_rows triggers +// no mid-group flush. Verifies the test fixture itself is wired correctly. +TEST_F(BlockReaderAggFlushTest, NoMidGroupFlushAggregatesCorrectly) { + constexpr size_t kBatchMaxRows = 16; + constexpr size_t kRows = 5; // < batch_max_rows, only is_last flush fires + + BlockReader reader; + auto src_block = make_source_block(kRows, /*key_value=*/42); + configure_reader_for_int64_sum(reader, *src_block, kBatchMaxRows); + + auto target_columns = make_target_columns(); + + for (size_t i = 0; i < kRows; ++i) { + reader._next_row.block = std::shared_ptr(src_block.get(), [](Block*) {}); + reader._next_row.row_pos = static_cast(i); + reader._next_row.is_same = (i > 0); + reader._append_agg_data(target_columns); + } + + // is_last flush at i=4 already drained _stored_row_ref into the aggregator + // without finalizing (because _last_agg_data_counter > 0). + EXPECT_EQ(reader._stored_row_ref.size(), 0); + EXPECT_EQ(reader._last_agg_data_counter, 0); + + // Mimic `_agg_key_next_block` end-of-group close. + reader._agg_data_counters.push_back(reader._last_agg_data_counter); + reader._last_agg_data_counter = 0; + reader._update_agg_data(target_columns); + + ASSERT_EQ(target_columns[1]->size(), 1); + EXPECT_EQ(read_int64(*target_columns[1], 0), 1 + 2 + 3 + 4 + 5); +} + +// The interesting case: a single group of 10 rows with batch_max_rows=4 forces +// `_stored_row_ref.size() == batch_max_rows()` to fire mid-group at i=3 and +// i=7, plus an `is_last` flush at i=9. Final close must still emit the full +// sum 1..10 = 55. +TEST_F(BlockReaderAggFlushTest, PeriodicFlushPreservesAggregateAcrossWindows) { + constexpr size_t kBatchMaxRows = 4; + constexpr size_t kRows = 10; + + BlockReader reader; + auto src_block = make_source_block(kRows, /*key_value=*/7); + configure_reader_for_int64_sum(reader, *src_block, kBatchMaxRows); + + auto target_columns = make_target_columns(); + + int flush_count = 0; + int prev_size = 0; + for (size_t i = 0; i < kRows; ++i) { + reader._next_row.block = std::shared_ptr(src_block.get(), [](Block*) {}); + reader._next_row.row_pos = static_cast(i); + reader._next_row.is_same = (i > 0); + reader._append_agg_data(target_columns); + + // A flush happens whenever _stored_row_ref shrinks (it's pushed to + // first, then potentially cleared by _update_agg_data). + int cur_size = static_cast(reader._stored_row_ref.size()); + if (cur_size < prev_size + 1) { + ++flush_count; + } + prev_size = cur_size; + } + + // Expected flushes: at i=3 (size==4), i=7 (size==4), i=9 (is_last). The + // final aggregated state must remain consistent across all three. + EXPECT_GE(flush_count, 3) << "expected at least 3 mid/last flushes"; + EXPECT_EQ(reader._stored_row_ref.size(), 0); + EXPECT_EQ(reader._last_agg_data_counter, 0); + + // Mimic `_agg_key_next_block` end-of-group close. 
+ reader._agg_data_counters.push_back(reader._last_agg_data_counter); + reader._last_agg_data_counter = 0; + reader._update_agg_data(target_columns); + + ASSERT_EQ(target_columns[1]->size(), 1); + int64_t expected = 0; + for (int64_t v = 1; v <= static_cast(kRows); ++v) { + expected += v; + } + EXPECT_EQ(read_int64(*target_columns[1], 0), expected); // 55 +} + +// Stress: a single group long enough to trigger many full periodic flushes, +// followed by a group end. Catches off-by-one bugs in chunked aggregation. +TEST_F(BlockReaderAggFlushTest, PeriodicFlushManyWindowsSingleGroup) { + constexpr size_t kBatchMaxRows = 4; + constexpr size_t kRows = 100; // 25 full windows + + BlockReader reader; + auto src_block = make_source_block(kRows, /*key_value=*/3); + configure_reader_for_int64_sum(reader, *src_block, kBatchMaxRows); + + auto target_columns = make_target_columns(); + for (size_t i = 0; i < kRows; ++i) { + reader._next_row.block = std::shared_ptr(src_block.get(), [](Block*) {}); + reader._next_row.row_pos = static_cast(i); + reader._next_row.is_same = (i > 0); + reader._append_agg_data(target_columns); + } + reader._agg_data_counters.push_back(reader._last_agg_data_counter); + reader._last_agg_data_counter = 0; + reader._update_agg_data(target_columns); + + ASSERT_EQ(target_columns[1]->size(), 1); + int64_t expected = static_cast(kRows) * (kRows + 1) / 2; // 5050 + EXPECT_EQ(read_int64(*target_columns[1], 0), expected); +} + +} // namespace doris diff --git a/be/test/storage/iterator/block_reader_batch_max_rows_test.cpp b/be/test/storage/iterator/block_reader_batch_max_rows_test.cpp new file mode 100644 index 00000000000000..4569cd53cbf33d --- /dev/null +++ b/be/test/storage/iterator/block_reader_batch_max_rows_test.cpp @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
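+
+// Behaviour pinned down by the cases below (stated as observed through the
+// private members, not as a public API guarantee):
+//   - batch_max_rows() always returns _reader_context.batch_size; the byte
+//     budget never raises or lowers the row cap.
+//   - preferred_block_size_bytes() returns the configured byte budget when
+//     config::enable_adaptive_batch_size is on and 0 when it is off.
+//   - _reached_byte_budget(columns) reports true only while adaptive batching
+//     is on and the accumulated column bytes meet or exceed the budget.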
+ +// Use #define private public to access private/protected members for testing +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wkeyword-macro" +#endif +#define private public +#define protected public +#include "storage/iterator/block_reader.h" +#undef private +#undef protected +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#include + +#include "common/config.h" +#include "core/column/column_vector.h" + +namespace doris { + +namespace { + +constexpr size_t kMinPublicBlockBudgetBytes = 1048576; // 1MB + +MutableColumns make_int64_columns(size_t rows) { + MutableColumns columns; + auto col = ColumnInt64::create(); + for (size_t i = 0; i < rows; ++i) { + col->insert_value(cast_set(i)); + } + columns.push_back(std::move(col)); + return columns; +} + +} // namespace + +class BlockReaderBatchMaxRowsTest : public testing::Test { +protected: + void SetUp() override { _saved_enable_adaptive = config::enable_adaptive_batch_size; } + + void TearDown() override { config::enable_adaptive_batch_size = _saved_enable_adaptive; } + + bool _saved_enable_adaptive = false; +}; + +TEST_F(BlockReaderBatchMaxRowsTest, FallbackToBatchSizeWhenAdaptiveDisabled) { + config::enable_adaptive_batch_size = false; + + BlockReader reader; + reader._reader_context.batch_size = 4096; + + EXPECT_EQ(reader.batch_max_rows(), 4096); +} + +TEST_F(BlockReaderBatchMaxRowsTest, UseBatchSizeWhenAdaptiveEnabled) { + config::enable_adaptive_batch_size = true; + + BlockReader reader; + reader._reader_context.batch_size = 4096; + reader._reader_context.preferred_block_size_bytes = 8388608; // byte budget must be active + + EXPECT_EQ(reader.batch_max_rows(), 4096); +} + +TEST_F(BlockReaderBatchMaxRowsTest, BatchMaxRowsIgnoresByteBudget) { + config::enable_adaptive_batch_size = true; + + BlockReader reader; + reader._reader_context.batch_size = 1024; + reader._reader_context.preferred_block_size_bytes = 8388608; + + EXPECT_EQ(reader.batch_max_rows(), 1024); +} + +TEST_F(BlockReaderBatchMaxRowsTest, PreferredBlockSizeBytesWhenEnabled) { + config::enable_adaptive_batch_size = true; + + BlockReader reader; + reader._reader_context.preferred_block_size_bytes = 8388608; // 8MB + + EXPECT_EQ(reader.preferred_block_size_bytes(), 8388608); +} + +TEST_F(BlockReaderBatchMaxRowsTest, PreferredBlockSizeBytesWhenDisabled) { + config::enable_adaptive_batch_size = false; + + BlockReader reader; + reader._reader_context.preferred_block_size_bytes = 8388608; + + EXPECT_EQ(reader.preferred_block_size_bytes(), 0); +} + +TEST_F(BlockReaderBatchMaxRowsTest, ReachedByteBudgetReturnsFalseWhenDisabled) { + config::enable_adaptive_batch_size = false; + + BlockReader reader; + reader._reader_context.preferred_block_size_bytes = kMinPublicBlockBudgetBytes; + + auto columns = make_int64_columns(200000); // ~1.6MB > 1MB min public budget + + EXPECT_FALSE(reader._reached_byte_budget(columns)); +} + +TEST_F(BlockReaderBatchMaxRowsTest, ReachedByteBudgetReturnsTrueWhenExceeded) { + config::enable_adaptive_batch_size = true; + + BlockReader reader; + reader._reader_context.preferred_block_size_bytes = kMinPublicBlockBudgetBytes; + + auto columns = make_int64_columns(200000); // ~1.6MB >= 1MB min public budget + + EXPECT_TRUE(reader._reached_byte_budget(columns)); +} + +TEST_F(BlockReaderBatchMaxRowsTest, ReachedByteBudgetReturnsFalseWhenUnderBudget) { + config::enable_adaptive_batch_size = true; + + BlockReader reader; + reader._reader_context.preferred_block_size_bytes = kMinPublicBlockBudgetBytes; + + auto 
columns = make_int64_columns(10); // 80 bytes < 1MB min public budget + + EXPECT_FALSE(reader._reached_byte_budget(columns)); +} + +} // namespace doris diff --git a/be/test/storage/iterator/vcollect_iterator_collected_enough_test.cpp b/be/test/storage/iterator/vcollect_iterator_collected_enough_test.cpp new file mode 100644 index 00000000000000..afcf979ccca8d5 --- /dev/null +++ b/be/test/storage/iterator/vcollect_iterator_collected_enough_test.cpp @@ -0,0 +1,383 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include +#include + +#include "common/config.h" +#include "core/block/block.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "storage/iterator/block_reader.h" +#include "storage/iterator/vcollect_iterator.h" + +namespace doris { + +// ============================================================================ +// Part 1: Pure-computation tests for estimate_collected_enough() +// ============================================================================ + +class EstimateCollectedEnoughTest : public testing::Test {}; + +// Budget is 0 → always false (feature disabled). +TEST_F(EstimateCollectedEnoughTest, BudgetZeroReturnsFalse) { + EXPECT_FALSE(estimate_collected_enough(/*present_bytes=*/1000, /*present_rows=*/10, + /*rows_to_merge=*/5, + /*preferred_block_size_bytes=*/0)); +} + +// No rows collected yet → always false (cannot estimate). +TEST_F(EstimateCollectedEnoughTest, ZeroRowsReturnsFalse) { + EXPECT_FALSE(estimate_collected_enough(/*present_bytes=*/0, /*present_rows=*/0, + /*rows_to_merge=*/5, + /*preferred_block_size_bytes=*/1024)); +} + +// Present bytes already exceed budget → true. +TEST_F(EstimateCollectedEnoughTest, PresentBytesExceedBudget) { + EXPECT_TRUE(estimate_collected_enough(/*present_bytes=*/2048, /*present_rows=*/10, + /*rows_to_merge=*/0, + /*preferred_block_size_bytes=*/1024)); +} + +// Present bytes exactly equal budget → true. +TEST_F(EstimateCollectedEnoughTest, PresentBytesEqualBudget) { + EXPECT_TRUE(estimate_collected_enough(/*present_bytes=*/1024, /*present_rows=*/10, + /*rows_to_merge=*/0, + /*preferred_block_size_bytes=*/1024)); +} + +// Prediction: 500 bytes / 10 rows = 50 bytes/row. +// With 10 pending rows → predicted 500 + 500 = 1000 < 1024 → false. +TEST_F(EstimateCollectedEnoughTest, PredictionBelowBudget) { + EXPECT_FALSE(estimate_collected_enough(/*present_bytes=*/500, /*present_rows=*/10, + /*rows_to_merge=*/10, + /*preferred_block_size_bytes=*/1024)); +} + +// Prediction: 500 bytes / 10 rows = 50 bytes/row. +// With 11 pending rows → predicted 500 * 21 / 10 = 1050 >= 1024 → true. 
+TEST_F(EstimateCollectedEnoughTest, PredictionMeetsBudget) { + EXPECT_TRUE(estimate_collected_enough(/*present_bytes=*/500, /*present_rows=*/10, + /*rows_to_merge=*/11, + /*preferred_block_size_bytes=*/1024)); +} + +// Zero pending rows: prediction = present_bytes (no pending rows to flush). +// 500 < 1024 → false. +TEST_F(EstimateCollectedEnoughTest, ZeroPendingRows) { + EXPECT_FALSE(estimate_collected_enough(/*present_bytes=*/500, /*present_rows=*/10, + /*rows_to_merge=*/0, + /*preferred_block_size_bytes=*/1024)); +} + +// Exact boundary: 512 bytes / 8 rows = 64 bytes/row. With 8 pending rows → +// predicted 512 * 16 / 8 = 1024 = budget → true. +TEST_F(EstimateCollectedEnoughTest, ExactBudgetBoundary) { + EXPECT_TRUE(estimate_collected_enough(/*present_bytes=*/512, /*present_rows=*/8, + /*rows_to_merge=*/8, + /*preferred_block_size_bytes=*/1024)); +} + +// One below boundary: 512 bytes / 8 rows = 64 bytes/row. With 7 pending rows → +// predicted 512 * 15 / 8 = 960 < 1024 → false. +TEST_F(EstimateCollectedEnoughTest, OneBelowBudgetBoundary) { + EXPECT_FALSE(estimate_collected_enough(/*present_bytes=*/512, /*present_rows=*/8, + /*rows_to_merge=*/7, + /*preferred_block_size_bytes=*/1024)); +} + +// Overflow guard: present_bytes close to SIZE_MAX; multiplication would wrap → true. +TEST_F(EstimateCollectedEnoughTest, OverflowGuardReturnsTrueForHugeBytes) { + const size_t huge = std::numeric_limits::max() / 2 + 1; + EXPECT_TRUE(estimate_collected_enough(/*present_bytes=*/huge, /*present_rows=*/1, + /*rows_to_merge=*/1, + /*preferred_block_size_bytes=*/1024)); +} + +// Large but no overflow: present_bytes * total_rows fits in size_t. +TEST_F(EstimateCollectedEnoughTest, LargeButNoOverflow) { + // 1GB present, 100 rows, 100 pending → 2GB total predicted. + const size_t one_gb = 1ULL << 30; + EXPECT_TRUE(estimate_collected_enough(/*present_bytes=*/one_gb, /*present_rows=*/100, + /*rows_to_merge=*/100, + /*preferred_block_size_bytes=*/one_gb + 1)); +} + +// Single present row with many pending rows should scale correctly. +// 100 bytes / 1 row → 100 bytes/row. With 99 pending → predicted 100 * 100 / 1 = 10000 >= 5000. +TEST_F(EstimateCollectedEnoughTest, SingleRowScalesCorrectly) { + EXPECT_TRUE(estimate_collected_enough(/*present_bytes=*/100, /*present_rows=*/1, + /*rows_to_merge=*/99, + /*preferred_block_size_bytes=*/5000)); +} + +// ============================================================================ +// Part 2: Integration tests — real MutableColumns + estimate_collected_enough +// +// These tests exercise the same code path as collected_enough_rows(): +// present_bytes = Block::columns_byte_size(columns) +// present_rows = columns[0]->size() +// → estimate_collected_enough(present_bytes, present_rows, rows_to_merge, budget) +// +// Level1Iterator::collected_enough_rows() is a private inner class method, so +// we replicate its logic here with real columns to verify end-to-end correctness. +// ============================================================================ + +class CollectedEnoughWithColumnsTest : public testing::Test { +protected: + void SetUp() override { _saved_adaptive = config::enable_adaptive_batch_size; } + void TearDown() override { config::enable_adaptive_batch_size = _saved_adaptive; } + + // Replicate the logic of Level1Iterator::collected_enough_rows() with a + // configurable budget, so we can test the column-byte integration path + // without instantiating a Level1Iterator (private inner class). 
+ static bool collected_enough_rows_sim(const MutableColumns& columns, int rows_to_merge, + size_t preferred_block_size_bytes) { + if (!config::enable_adaptive_batch_size) { + return false; + } + if (preferred_block_size_bytes == 0) { + return false; + } + const auto present_bytes = Block::columns_byte_size(columns); + const auto present_rows = columns.empty() ? 0 : columns[0]->size(); + return estimate_collected_enough(present_bytes, present_rows, rows_to_merge, + preferred_block_size_bytes); + } + + // Build a MutableColumns with N_cols ColumnInt32 columns, each having `nrows` rows. + // Each Int32 is 4 bytes → total = 4 * nrows * ncols bytes. + static MutableColumns make_int32_columns(size_t ncols, size_t nrows) { + MutableColumns cols; + for (size_t c = 0; c < ncols; ++c) { + auto col = ColumnInt32::create(); + for (size_t r = 0; r < nrows; ++r) { + col->insert_value(static_cast(r)); + } + cols.push_back(std::move(col)); + } + return cols; + } + + bool _saved_adaptive = false; +}; + +// Config disabled → always false regardless of data. +TEST_F(CollectedEnoughWithColumnsTest, DisabledConfigReturnsFalse) { + config::enable_adaptive_batch_size = false; + auto cols = make_int32_columns(2, 100); + // 2 cols × 100 rows × 4 bytes = 800 bytes; budget = 100 bytes (well below) + EXPECT_FALSE(collected_enough_rows_sim(cols, 0, /*budget=*/100)); +} + +// Config enabled, budget = 0 → always false. +TEST_F(CollectedEnoughWithColumnsTest, ZeroBudgetReturnsFalse) { + config::enable_adaptive_batch_size = true; + auto cols = make_int32_columns(2, 100); + EXPECT_FALSE(collected_enough_rows_sim(cols, 0, /*budget=*/0)); +} + +// Empty columns → present_rows = 0 → cannot estimate → false. +TEST_F(CollectedEnoughWithColumnsTest, EmptyColumnsReturnsFalse) { + config::enable_adaptive_batch_size = true; + MutableColumns empty; + EXPECT_FALSE(collected_enough_rows_sim(empty, 10, /*budget=*/1024)); +} + +// Columns with zero rows → present_rows = 0 → false. +TEST_F(CollectedEnoughWithColumnsTest, ZeroRowColumnsReturnsFalse) { + config::enable_adaptive_batch_size = true; + auto cols = make_int32_columns(3, 0); + EXPECT_FALSE(collected_enough_rows_sim(cols, 5, /*budget=*/100)); +} + +// Single Int32 column, 256 rows → 1024 bytes. +// Budget = 1024 → already met → true (no pending rows needed). +TEST_F(CollectedEnoughWithColumnsTest, SingleColumnExactBudgetMet) { + config::enable_adaptive_batch_size = true; + auto cols = make_int32_columns(1, 256); + // 256 rows × 4 bytes = 1024 bytes + EXPECT_EQ(Block::columns_byte_size(cols), 1024); + EXPECT_TRUE(collected_enough_rows_sim(cols, 0, /*budget=*/1024)); +} + +// Single Int32 column, 255 rows → 1020 bytes < 1024 budget. +// No pending rows → not enough → false. +TEST_F(CollectedEnoughWithColumnsTest, SingleColumnBelowBudgetNoPending) { + config::enable_adaptive_batch_size = true; + auto cols = make_int32_columns(1, 255); + EXPECT_EQ(Block::columns_byte_size(cols), 1020); + EXPECT_FALSE(collected_enough_rows_sim(cols, 0, /*budget=*/1024)); +} + +// Single Int32 column, 255 rows → 1020 bytes. With 1 pending row: +// bytes_per_row = 1020/255 = 4, predicted = 1020 * 256 / 255 = 1024 → meets budget. +TEST_F(CollectedEnoughWithColumnsTest, SingleColumnBelowBudgetWithPending) { + config::enable_adaptive_batch_size = true; + auto cols = make_int32_columns(1, 255); + EXPECT_TRUE(collected_enough_rows_sim(cols, 1, /*budget=*/1024)); +} + +// Multi-column: 4 Int32 columns × 100 rows = 1600 bytes. +// Budget = 2000 → not met. 
+// predicted = 1600 * 125 / 100 = 2000 → meets budget.
+TEST_F(CollectedEnoughWithColumnsTest, MultiColumnPredictionMeetsBudget) {
+ config::enable_adaptive_batch_size = true;
+ auto cols = make_int32_columns(4, 100);
+ EXPECT_EQ(Block::columns_byte_size(cols), 1600);
+ EXPECT_FALSE(collected_enough_rows_sim(cols, 0, /*budget=*/2000));
+ EXPECT_TRUE(collected_enough_rows_sim(cols, 25, /*budget=*/2000));
+}
+
+// Multi-column: just one row below the prediction threshold.
+// 4 cols × 100 rows = 1600 bytes, 24 pending rows:
+// predicted = 1600 * 124 / 100 = 1984 < 2000 → false.
+TEST_F(CollectedEnoughWithColumnsTest, MultiColumnOneBelowPrediction) {
+ config::enable_adaptive_batch_size = true;
+ auto cols = make_int32_columns(4, 100);
+ EXPECT_FALSE(collected_enough_rows_sim(cols, 24, /*budget=*/2000));
+}
+
+// Variable-width column: ColumnString with known data sizes.
+// ColumnString::byte_size() = chars_size + offsets_size.
+// Each offset is sizeof(IColumn::Offset) = 8 bytes (64-bit).
+TEST_F(CollectedEnoughWithColumnsTest, StringColumnByteSizeIntegration) {
+ config::enable_adaptive_batch_size = true;
+
+ auto str_col = ColumnString::create();
+ // Insert 10 strings of 10 chars each → chars = 100 bytes, offsets = 10 * 8 = 80.
+ // Total byte_size = 180 bytes.
+ for (int i = 0; i < 10; ++i) {
+ std::string s(10, 'A' + (i % 26));
+ str_col->insert_data(s.data(), s.size());
+ }
+
+ const size_t expected_bytes = 10 * 10 + 10 * sizeof(IColumn::Offset);
+ EXPECT_EQ(str_col->byte_size(), expected_bytes);
+
+ MutableColumns cols;
+ cols.push_back(std::move(str_col));
+
+ // Budget = expected_bytes → met → true.
+ EXPECT_TRUE(collected_enough_rows_sim(cols, 0, /*budget=*/expected_bytes));
+ // Budget = expected_bytes + 1 → not met with 0 pending → false.
+ EXPECT_FALSE(collected_enough_rows_sim(cols, 0, /*budget=*/expected_bytes + 1));
+}
+
+// Mixed columns: Int32 + String together.
+TEST_F(CollectedEnoughWithColumnsTest, MixedColumnTypes) {
+ config::enable_adaptive_batch_size = true;
+
+ auto int_col = ColumnInt32::create();
+ auto str_col = ColumnString::create();
+ for (int i = 0; i < 50; ++i) {
+ int_col->insert_value(static_cast<int32_t>(i));
+ std::string s(20, 'x');
+ str_col->insert_data(s.data(), s.size());
+ }
+ // Int32: 50 × 4 = 200 bytes
+ // String: 50 × 20 chars + 50 × 8 offsets = 1000 + 400 = 1400 bytes
+ // Total: 1600 bytes
+ const size_t int_bytes = 50 * sizeof(Int32);
+ const size_t str_bytes = 50 * 20 + 50 * sizeof(IColumn::Offset);
+ EXPECT_EQ(int_col->byte_size(), int_bytes);
+ EXPECT_EQ(str_col->byte_size(), str_bytes);
+
+ MutableColumns cols;
+ cols.push_back(std::move(int_col));
+ cols.push_back(std::move(str_col));
+
+ const size_t total = int_bytes + str_bytes;
+ EXPECT_EQ(Block::columns_byte_size(cols), total);
+
+ // Budget met exactly → true.
+ EXPECT_TRUE(collected_enough_rows_sim(cols, 0, /*budget=*/total));
+ // Budget slightly above → not met with 0 pending → false.
+ EXPECT_FALSE(collected_enough_rows_sim(cols, 0, /*budget=*/total + 1));
+ // With 1 pending row: predicted = total * 51 / 50 = total + total/50.
+ // So meets budget = total + total/50.
+ EXPECT_TRUE(collected_enough_rows_sim(cols, 1, /*budget=*/total + total / 50));
+}
+
+// Large number of rows to verify no integer issues with real column byte sizes.
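+// (100000 rows × 4 bytes = 400000 present bytes; 400000 * 100001 ≈ 4.0e10, far below
+// the 64-bit size_t limit, so the multiply-then-divide path runs without tripping the
+// overflow guard covered in Part 1.)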
+TEST_F(CollectedEnoughWithColumnsTest, LargeRowCountInt32) { + config::enable_adaptive_batch_size = true; + auto cols = make_int32_columns(1, 100000); + // 100000 × 4 = 400000 bytes + EXPECT_EQ(Block::columns_byte_size(cols), 400000); + EXPECT_TRUE(collected_enough_rows_sim(cols, 0, /*budget=*/400000)); + EXPECT_FALSE(collected_enough_rows_sim(cols, 0, /*budget=*/400001)); + // With 1 pending: predicted = 400000 * 100001 / 100000 = 400004 < 400005 → false + EXPECT_FALSE(collected_enough_rows_sim(cols, 1, /*budget=*/400005)); + // predicted = 400004 >= 400004 → true + EXPECT_TRUE(collected_enough_rows_sim(cols, 1, /*budget=*/400004)); +} + +// ============================================================================ +// Part 3: BlockReader.preferred_block_size_bytes() override tests +// +// BlockReader overrides TabletReader::preferred_block_size_bytes() to gate on +// config::enable_adaptive_batch_size. These tests verify that behavior and +// ensure collected_enough_rows() would receive the correct budget value. +// ============================================================================ + +class BlockReaderByteBudgetTest : public testing::Test { +protected: + void SetUp() override { _saved_adaptive = config::enable_adaptive_batch_size; } + void TearDown() override { config::enable_adaptive_batch_size = _saved_adaptive; } + bool _saved_adaptive = false; +}; + +// When adaptive is enabled, preferred_block_size_bytes() returns the configured value. +TEST_F(BlockReaderByteBudgetTest, ReturnsConfiguredBytesWhenEnabled) { + config::enable_adaptive_batch_size = true; + BlockReader reader; + reader._reader_context.preferred_block_size_bytes = 65536; + EXPECT_EQ(reader.preferred_block_size_bytes(), 65536); +} + +// When adaptive is disabled, preferred_block_size_bytes() returns 0 regardless. +TEST_F(BlockReaderByteBudgetTest, ReturnsZeroWhenDisabled) { + config::enable_adaptive_batch_size = false; + BlockReader reader; + reader._reader_context.preferred_block_size_bytes = 65536; + EXPECT_EQ(reader.preferred_block_size_bytes(), 0); +} + +// Default value of preferred_block_size_bytes in reader context is 8MB. +TEST_F(BlockReaderByteBudgetTest, DefaultIs8MB) { + config::enable_adaptive_batch_size = true; + BlockReader reader; + EXPECT_EQ(reader.preferred_block_size_bytes(), 8388608UL); +} + +// Virtual dispatch: BlockReader override is reachable through a TabletReader pointer. +TEST_F(BlockReaderByteBudgetTest, VirtualDispatchThroughTabletReaderPtr) { + config::enable_adaptive_batch_size = true; + BlockReader concrete; + concrete._reader_context.preferred_block_size_bytes = 99999; + TabletReader* base = &concrete; + // Through the virtual dispatch, BlockReader's override should be called. + EXPECT_EQ(base->preferred_block_size_bytes(), 99999); +} + +} // namespace doris diff --git a/be/test/storage/segment/adaptive_block_size_predictor_test.cpp b/be/test/storage/segment/adaptive_block_size_predictor_test.cpp new file mode 100644 index 00000000000000..60b6f37b8ceeba --- /dev/null +++ b/be/test/storage/segment/adaptive_block_size_predictor_test.cpp @@ -0,0 +1,357 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "storage/segment/adaptive_block_size_predictor.h"
+
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+
+#include <memory>
+#include <string>
+
+#include "common/config.h"
+#include "core/block/block.h"
+#include "core/column/column_string.h"
+#include "core/column/column_vector.h"
+#include "core/data_type/data_type_number.h"
+#include "core/data_type/data_type_string.h"
+#include "storage/olap_common.h"
+
+namespace doris {
+
+// ── helper functions ──────────────────────────────────────────────────────────
+
+// Build a Block with N rows, each containing a single Int32 column of the given value.
+static Block make_int32_block(size_t rows, int32_t value = 42) {
+ auto col = ColumnVector<Int32>::create();
+ col->reserve(rows);
+ for (size_t i = 0; i < rows; ++i) {
+ col->insert_value(value);
+ }
+ Block block;
+ block.insert({std::move(col), std::make_shared<DataTypeInt32>(), "c0"});
+ return block;
+}
+
+// Build a Block with N rows where each row holds a string of |str_len| bytes.
+static Block make_string_block(size_t rows, size_t str_len) {
+ auto col = ColumnString::create();
+ col->reserve(rows);
+ std::string s(str_len, 'x');
+ for (size_t i = 0; i < rows; ++i) {
+ col->insert_data(s.data(), s.size());
+ }
+ Block block;
+ block.insert({std::move(col), std::make_shared<DataTypeString>(), "c0"});
+ return block;
+}
+
+// ── AdaptiveBlockSizePredictorTest ───────────────────────────────────────────
+
+class AdaptiveBlockSizePredictorTest : public testing::Test {
+protected:
+ // 8 MB target
+ static constexpr size_t kBlockBytes = 8 * 1024 * 1024;
+ static constexpr size_t kMaxRows = 4096;
+};
+
+// ── Test 1: no history ────────────────────────────────────────────────────────
+// Before any update, has_history == false and bytes_per_row == 0.
+// After the first update, has_history == true and bytes_per_row == block.bytes()/rows.
+TEST_F(AdaptiveBlockSizePredictorTest, NoHistoryReturnsMaxRows) {
+ AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0);
+
+ // Initially no history.
+ EXPECT_FALSE(pred.has_history_for_test());
+ EXPECT_DOUBLE_EQ(pred.bytes_per_row_for_test(), 0.0);
+
+ // After one update the first sample is stored directly (no EWMA blending).
+ Block blk = make_int32_block(100);
+ std::vector<uint32_t> cols = {0};
+ pred.update(blk);
+
+ EXPECT_TRUE(pred.has_history_for_test());
+ double expected_bpr = static_cast<double>(blk.bytes()) / 100.0;
+ EXPECT_DOUBLE_EQ(pred.bytes_per_row_for_test(), expected_bpr);
+}
+
+// ── Test 2: EWMA convergence ──────────────────────────────────────────────────
+// When every update delivers the same sample, the EWMA stays exactly at that
+// value (0.9*v + 0.1*v == v for any v).
+TEST_F(AdaptiveBlockSizePredictorTest, EwmaConvergence) {
+ AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0);
+
+ std::vector<uint32_t> cols = {0};
+
+ // Compute expected bytes-per-row from an actual block so the test does not
+ // hard-code internal column memory layout assumptions.
+ Block probe = make_string_block(100, 100);
+ double expected_bpr = static_cast<double>(probe.bytes()) / 100.0;
+
+ // First update seeds the EWMA directly.
+ pred.update(probe);
+ EXPECT_DOUBLE_EQ(pred.bytes_per_row_for_test(), expected_bpr);
+
+ // All subsequent updates carry the same sample → EWMA stays constant.
+ for (int i = 1; i < 50; ++i) {
+ Block blk = make_string_block(100, 100);
+ pred.update(blk);
+ }
+ EXPECT_NEAR(pred.bytes_per_row_for_test(), expected_bpr, 0.01);
+}
+
+// ── Test 4: zero rows block is ignored ───────────────────────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, ZeroRowsBlockIgnored) {
+ AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0);
+
+ // update() with an empty block must be a no-op.
+ Block blk = make_int32_block(0);
+ std::vector<uint32_t> cols = {0};
+ pred.update(blk);
+
+ EXPECT_FALSE(pred.has_history_for_test());
+ EXPECT_DOUBLE_EQ(pred.bytes_per_row_for_test(), 0.0);
+
+ // A subsequent real update must still work normally.
+ Block blk2 = make_int32_block(50);
+ pred.update(blk2);
+ EXPECT_TRUE(pred.has_history_for_test());
+ double expected_bpr = static_cast<double>(blk2.bytes()) / 50.0;
+ EXPECT_DOUBLE_EQ(pred.bytes_per_row_for_test(), expected_bpr);
+}
+
+// ── Test 5: disabled when preferred_block_size_bytes == 0 ────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, DisabledWhenBlockSizeIsZero) {
+ AdaptiveBlockSizePredictor pred(0, 0.0);
+
+ Block blk = make_int32_block(1000);
+ std::vector<uint32_t> cols = {0};
+ pred.update(blk);
+
+ // update() still records history even when budget == 0.
+ EXPECT_TRUE(pred.has_history_for_test());
+ EXPECT_GT(pred.bytes_per_row_for_test(), 0.0);
+}
+
+// ── Test 6: config flag default is enabled ────────────────────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, ConfigDefaultEnabled) {
+ EXPECT_TRUE(config::enable_adaptive_batch_size);
+}
+
+// ── predict_next_rows tests ──────────────────────────────────────────────────
+
+// ── Test: _block_size_bytes == 0 returns block_size_rows ─────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, PredictReturnsBlockSizeRowsWhenDisabled) {
+ AdaptiveBlockSizePredictor pred(0, 0.0);
+
+ EXPECT_EQ(pred.predict_next_rows(), pred.block_size_rows_for_test());
+
+ // Even after update, still returns block_size_rows because block_size_bytes == 0.
+ Block blk = make_int32_block(100);
+ std::vector<uint32_t> cols = {0};
+ pred.update(blk);
+ EXPECT_EQ(pred.predict_next_rows(), pred.block_size_rows_for_test());
+}
+
+// ── Test: no history, no metadata hint → probe_rows ──────────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, PredictNoHistoryNoHint) {
+ AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0);
+
+ EXPECT_EQ(pred.predict_next_rows(), pred.probe_rows_for_test());
+}
+
+// ── Test: no history fallback is also bounded by the first-batch safety threshold
+TEST_F(AdaptiveBlockSizePredictorTest, PredictNoHistoryNoHintUsesSafetyThreshold) {
+ AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0);
+
+ EXPECT_EQ(pred.predict_next_rows(), pred.probe_rows_for_test());
+}
+
+// ── Test: no history, metadata hint computed successfully ────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, PredictNoHistoryMetadataHint) {
+ // Simulate: 1 column, 400000 raw bytes in a 1000-row segment.
+ // bytes_per_row_hint = (400000 / 1000) * 1.2 = 480.0
+ // predicted = 8MB / 480.0 = 17476
+ double hint_bpr = (400000.0 / 1000.0) * 1.2; // 480.0
+ AdaptiveBlockSizePredictor pred(kBlockBytes, hint_bpr);
+
+ size_t result = pred.predict_next_rows();
+
+ size_t expected = static_cast<size_t>(static_cast<double>(kBlockBytes) / hint_bpr);
+ // No history: probe_rows clamps the result.
+ expected = std::min(expected, pred.probe_rows_for_test());
+ EXPECT_EQ(result, expected);
+}
+
+// ── Test: no history metadata hint is bounded by the first-batch safety threshold
+TEST_F(AdaptiveBlockSizePredictorTest, PredictNoHistoryMetadataHintUsesSafetyThreshold) {
+ double hint_bpr = (400000.0 / 1000.0) * 1.2; // 480.0
+ AdaptiveBlockSizePredictor pred(kBlockBytes, hint_bpr);
+
+ EXPECT_EQ(pred.predict_next_rows(), pred.probe_rows_for_test());
+}
+
+// ── Test: no history, second call reuses same hint ───────────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, PredictNoHistoryCachedHint) {
+ double hint_bpr = (400000.0 / 1000.0) * 1.2;
+ AdaptiveBlockSizePredictor pred(kBlockBytes, hint_bpr);
+
+ size_t first = pred.predict_next_rows();
+ size_t second = pred.predict_next_rows();
+
+ EXPECT_EQ(first, second);
+}
+
+// ── Test: has history, uses EWMA bytes_per_row ──────────────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, PredictWithHistory) {
+ AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0);
+
+ // Inject history: 100 bytes per row.
+ pred.set_has_history_for_test(true, 100.0);
+
+ size_t result = pred.predict_next_rows();
+ // predicted = 8MB / 100.0 = 83886, clamped to block_size_rows = 65535.
+ EXPECT_EQ(result, pred.block_size_rows_for_test());
+}
+
+// ── Test: has history, predicted < block_size_rows (no clamping at upper bound) ────
+TEST_F(AdaptiveBlockSizePredictorTest, PredictWithHistoryNoClamping) {
+ // 8 KB target, not 8 MB, so predicted is small.
+ AdaptiveBlockSizePredictor pred(8 * 1024, 0.0);
+
+ // 100 bytes per row → predicted = 8192 / 100 = 81.
+ pred.set_has_history_for_test(true, 100.0);
+
+ size_t result = pred.predict_next_rows();
+ EXPECT_EQ(result, 81u);
+}
+
+// ── Test: predicted > block_size_rows → clamped to block_size_rows ─────────────
+TEST_F(AdaptiveBlockSizePredictorTest, PredictClampedToBlockSizeRows) {
+ AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0);
+
+ // 1 byte/row → predicted = 8MB / 1 = 8388608 >> block_size_rows.
+ pred.set_has_history_for_test(true, 1.0);
+
+ EXPECT_EQ(pred.predict_next_rows(), pred.block_size_rows_for_test());
+}
+
+// ── Test: huge bytes_per_row → predicted < 1 → clamped to 1 ────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, PredictClampedToOne) {
+ AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0);
+
+ // bytes_per_row so large that predicted rounds to 0.
+ pred.set_has_history_for_test(true, static_cast<double>(kBlockBytes) * 10.0);
+
+ EXPECT_EQ(pred.predict_next_rows(), 1u);
+}
+
+// ── Test: metadata hint with multiple columns ───────────────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, PredictNoHistoryMultiColumnMetadata) {
+ // Simulate: 2 columns, uid=10 with 200000B, uid=20 with 600000B, 1000 rows.
+ // total_bytes = 800000, hint_bpr = (800000/1000) * 1.2 = 960.0
+ double hint_bpr = (800000.0 / 1000.0) * 1.2; // 960.0
+ AdaptiveBlockSizePredictor pred(kBlockBytes, hint_bpr);
+
+ size_t result = pred.predict_next_rows();
+ size_t expected = static_cast<size_t>(static_cast<double>(kBlockBytes) / hint_bpr);
+ // No history: probe_rows clamps the result.
+ expected = std::min(expected, pred.probe_rows_for_test());
+ EXPECT_EQ(result, expected);
+}
+
+// ── Test: no hint (simulates empty segment with 0 rows) ─────────────────────
+TEST_F(AdaptiveBlockSizePredictorTest, ConstructorHandlesNoHint) {
+ AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0);
+
+ // No hint available → falls back to the default first-batch probe limit.
+ EXPECT_EQ(pred.predict_next_rows(), pred.probe_rows_for_test()); +} + +TEST_F(AdaptiveBlockSizePredictorTest, PredictUsesCustomProbeRowsWithoutHint) { + constexpr size_t custom_probe_rows = 128; + AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0, custom_probe_rows); + + EXPECT_EQ(pred.probe_rows_for_test(), custom_probe_rows); + EXPECT_EQ(pred.predict_next_rows(), custom_probe_rows); +} + +TEST_F(AdaptiveBlockSizePredictorTest, PredictUsesCustomProbeRowsWithHint) { + constexpr size_t custom_probe_rows = 128; + double hint_bpr = 1.0; + AdaptiveBlockSizePredictor pred(kBlockBytes, hint_bpr, custom_probe_rows); + + EXPECT_EQ(pred.predict_next_rows(), custom_probe_rows); +} + +TEST_F(AdaptiveBlockSizePredictorTest, PredictProbeRowsZeroFallsBackToOne) { + AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0, 0); + + EXPECT_EQ(pred.probe_rows_for_test(), 0u); + EXPECT_EQ(pred.predict_next_rows(), 1u); +} + +TEST_F(AdaptiveBlockSizePredictorTest, PredictProbeRowsOneWorks) { + AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0, 1); + + EXPECT_EQ(pred.predict_next_rows(), 1u); +} + +// ── batch_size tests ──────────────────────────────────────────────────────── + +TEST_F(AdaptiveBlockSizePredictorTest, DefaultBlockSizeRows) { + AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0); + + EXPECT_EQ(pred.block_size_rows_for_test(), + AdaptiveBlockSizePredictor::default_block_size_rows_for_test()); + EXPECT_EQ(pred.block_size_rows_for_test(), 65535u); +} + +TEST_F(AdaptiveBlockSizePredictorTest, CustomBlockSizeRows) { + constexpr size_t custom_rows = 1024; + AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0, AdaptiveBlockSizePredictor::kDefaultProbeRows, + custom_rows); + + EXPECT_EQ(pred.block_size_rows_for_test(), custom_rows); + + // With history: 1 byte/row → predicted = 8MB, clamped to custom_rows = 1024. + pred.set_has_history_for_test(true, 1.0); + EXPECT_EQ(pred.predict_next_rows(), custom_rows); +} + +TEST_F(AdaptiveBlockSizePredictorTest, BlockSizeRowsClampsWithHistory) { + constexpr size_t custom_rows = 500; + AdaptiveBlockSizePredictor pred(kBlockBytes, 0.0, AdaptiveBlockSizePredictor::kDefaultProbeRows, + custom_rows); + + // 100 bytes/row → predicted = 8MB/100 = 83886, clamped to custom_rows = 500. + pred.set_has_history_for_test(true, 100.0); + EXPECT_EQ(pred.predict_next_rows(), custom_rows); +} + +TEST_F(AdaptiveBlockSizePredictorTest, BlockSizeRowsDoesNotAffectSmallPrediction) { + constexpr size_t custom_rows = 10000; + // 8 KB target, custom block_size_rows = 10000. + AdaptiveBlockSizePredictor pred(8 * 1024, 0.0, AdaptiveBlockSizePredictor::kDefaultProbeRows, + custom_rows); + + // 100 bytes/row → predicted = 8192/100 = 81 < custom_rows. 
+ pred.set_has_history_for_test(true, 100.0); + EXPECT_EQ(pred.predict_next_rows(), 81u); +} + +} // namespace doris diff --git a/be/test/storage/segment/mock/mock_segment.h b/be/test/storage/segment/mock/mock_segment.h index 211b48c29da5bd..6c715a53dee612 100644 --- a/be/test/storage/segment/mock/mock_segment.h +++ b/be/test/storage/segment/mock/mock_segment.h @@ -56,6 +56,10 @@ class MockSegment : public Segment { void set_footer(std::shared_ptr footer) { _footer = footer; } + void set_column_raw_data_bytes(int32_t uid, uint64_t bytes) { + _column_uid_to_raw_bytes[uid] = bytes; + } + std::shared_ptr get_footer() const { return _footer; } std::shared_ptr _footer; diff --git a/be/test/storage/segment/segment_column_raw_data_bytes_test.cpp b/be/test/storage/segment/segment_column_raw_data_bytes_test.cpp new file mode 100644 index 00000000000000..85203076443ee5 --- /dev/null +++ b/be/test/storage/segment/segment_column_raw_data_bytes_test.cpp @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
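+
+// column_raw_data_bytes() is the per-column raw size that the adaptive batch size
+// predictor's metadata hint is presumably derived from (total raw bytes / segment rows,
+// scaled by a safety factor in the predictor tests above); MockSegment lets tests set
+// it directly without building a real segment.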
+
+#include <gtest/gtest.h>
+
+#include "storage/segment/mock/mock_segment.h"
+
+namespace doris::segment_v2 {
+
+class SegmentColumnRawDataBytesTest : public testing::Test {};
+
+TEST_F(SegmentColumnRawDataBytesTest, ReturnsZeroForUnknownColumn) {
+ MockSegment seg;
+ EXPECT_EQ(seg.column_raw_data_bytes(999), 0);
+}
+
+TEST_F(SegmentColumnRawDataBytesTest, ReturnsSetValue) {
+ MockSegment seg;
+ seg.set_column_raw_data_bytes(1, 1024);
+ seg.set_column_raw_data_bytes(2, 2048);
+
+ EXPECT_EQ(seg.column_raw_data_bytes(1), 1024);
+ EXPECT_EQ(seg.column_raw_data_bytes(2), 2048);
+}
+
+TEST_F(SegmentColumnRawDataBytesTest, OverwritesPreviousValue) {
+ MockSegment seg;
+ seg.set_column_raw_data_bytes(1, 100);
+ EXPECT_EQ(seg.column_raw_data_bytes(1), 100);
+
+ seg.set_column_raw_data_bytes(1, 200);
+ EXPECT_EQ(seg.column_raw_data_bytes(1), 200);
+}
+
+TEST_F(SegmentColumnRawDataBytesTest, HandlesMultipleColumns) {
+ MockSegment seg;
+ for (int32_t uid = 0; uid < 50; uid++) {
+ seg.set_column_raw_data_bytes(uid, uid * 1000);
+ }
+ for (int32_t uid = 0; uid < 50; uid++) {
+ EXPECT_EQ(seg.column_raw_data_bytes(uid), uid * 1000);
+ }
+}
+
+TEST_F(SegmentColumnRawDataBytesTest, HandlesLargeByteValues) {
+ MockSegment seg;
+ uint64_t large_value = 1ULL << 40; // 1 TiB
+ seg.set_column_raw_data_bytes(1, large_value);
+ EXPECT_EQ(seg.column_raw_data_bytes(1), large_value);
+}
+
+} // namespace doris::segment_v2
diff --git a/be/test/storage/test_data/tablet_meta_test.hdr b/be/test/storage/test_data/tablet_meta_test.hdr
new file mode 100644
index 00000000000000..017dbd8e748e0d
Binary files /dev/null and b/be/test/storage/test_data/tablet_meta_test.hdr differ
diff --git a/be/test/testutil/mock/mock_runtime_state.h b/be/test/testutil/mock/mock_runtime_state.h
index e67e7c45ff2c0b..5e05ce8cf8a1f0 100644
--- a/be/test/testutil/mock/mock_runtime_state.h
+++ b/be/test/testutil/mock/mock_runtime_state.h
@@ -73,6 +73,19 @@ class MockRuntimeState : public RuntimeState {
 bool enable_use_hybrid_sort() const override { return false; }

+ // Bypass the [1MB, 512MB] clamping in RuntimeState so tests can use tiny
+ // byte budgets (e.g. 1 or 50) to exercise block-splitting logic.
+ // When adaptive is disabled, fall back to RuntimeState's behavior (kMax)
+ // so the value is always a legal byte budget; tests should gate on
+ // config::enable_adaptive_batch_size directly to detect the disabled state.
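+ // Example (hypothetical test usage): mark the option as set and assign a tiny value,
+ // e.g. via the thrift-generated setter __set_preferred_block_size_bytes(50) on the
+ // query options, and the override below returns 50 instead of the clamped value.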
+ size_t preferred_block_size_bytes() const override { + if (config::enable_adaptive_batch_size && + _query_options.__isset.preferred_block_size_bytes) { + return _query_options.preferred_block_size_bytes; + } + return RuntimeState::preferred_block_size_bytes(); + } + // default batch size int _batch_size = 4096; bool _enable_shared_exchange_sink_buffer = true; diff --git a/be/test/util/profile_spec_test.cpp b/be/test/util/profile_spec_test.cpp index a97816a83ecd15..9d2561416120b7 100644 --- a/be/test/util/profile_spec_test.cpp +++ b/be/test/util/profile_spec_test.cpp @@ -21,8 +21,11 @@ #include #include "common/object_pool.h" +#include "core/column/column_string.h" +#include "core/data_type/data_type_string.h" #include "exec/operator/exchange_sink_operator.h" #include "exec/operator/mock_operator.h" +#include "exec/operator/mock_scan_operator.h" #include "exec/operator/operator.h" #include "runtime/descriptors.h" #include "runtime/runtime_state.h" @@ -62,6 +65,26 @@ class ProfileSpecTest : public testing::Test { sink.__set_dest_node_id(1); } +protected: + template + void init_source_local_state(MockRuntimeState* runtime_state, Operator* op, + RuntimeProfile* parent_profile) { + const auto max_operator_id = op->operator_id() - 1; + runtime_state->resize_op_id_to_local_state(max_operator_id); + runtime_state->set_max_operator_id(max_operator_id); + LocalStateInfo info {parent_profile, {}, nullptr, {}, 0}; + ASSERT_TRUE(op->setup_local_state(runtime_state, info).ok()); + } + + Block make_string_block(std::string value) { + auto col = ColumnString::create(); + col->insert_data(value.data(), value.size()); + Block block; + block.insert( + ColumnWithTypeAndName(std::move(col), std::make_shared(), "c0")); + return block; + } + private: class MockOperatorX : public OperatorX { public: @@ -77,13 +100,27 @@ class ProfileSpecTest : public testing::Test { return Status::OK(); } }; - class MockRuntimeState : public RuntimeState { + class ProducingMockOperatorX : public OperatorX { public: - MockRuntimeState() = default; + ProducingMockOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, + const DescriptorTbl& descs) + : OperatorX(pool, tnode, operator_id, descs) { + _op_name = "MOCK_OPERATOR"; + } - MOCK_CONST_METHOD0(enable_local_merge_sort, bool()); - }; + void set_output_block(Block block) { _block = std::move(block); } + + Status prepare(RuntimeState* state) override { return Status::OK(); } + Status close(RuntimeState* state) override { return Status::OK(); } + Status get_block(RuntimeState* state, Block* block, bool* eos) override { + *eos = true; + block->swap(_block); + return Status::OK(); + } + private: + Block _block; + }; std::unique_ptr obj_pool = std::make_unique(); TTableDescriptor tbl_desc; TScalarType scalar_type; @@ -110,8 +147,8 @@ TEST_F(ProfileSpecTest, SourceOperatorNameSuffixTest1) { MockOperatorX op(obj_pool.get(), tnode, 1, *descs); - RuntimeState* runtime_state = nullptr; - auto local_state = std::make_unique(runtime_state, &op); + auto runtime_state = std::make_unique(); + auto local_state = std::make_unique(runtime_state.get(), &op); ASSERT_EQ(local_state->name_suffix(), "(id=1)"); } @@ -127,8 +164,8 @@ TEST_F(ProfileSpecTest, SourceOperatorNameSuffixTest2) { MockOperatorX op(obj_pool.get(), tnode, 1, *descs); op._nereids_id = 100; - RuntimeState* runtime_state = nullptr; - auto local_state = std::make_unique(runtime_state, &op); + auto runtime_state = std::make_unique(); + auto local_state = std::make_unique(runtime_state.get(), &op); 
 ASSERT_EQ(local_state->name_suffix(), "(nereids_id=100)(id=1)");
 }

@@ -177,4 +214,30 @@ TEST_F(ProfileSpecTest, CommonCountersCustomCounters) {
 ASSERT_TRUE(local_state->operator_profile()->get_child("CommonCounters") != nullptr);
 }

+TEST_F(ProfileSpecTest, ScanSourceOperatorUpdatesOutputBlockByteCounters) {
+ MockScanOperatorX op;
+ std::unique_ptr<MockRuntimeState> runtime_state = std::make_unique<MockRuntimeState>();
+ RuntimeProfile parent_profile("parent");
+ init_source_local_state(runtime_state.get(), &op, &parent_profile);
+
+ Block expected = make_string_block("scan-output");
+ const auto expected_bytes = static_cast<int64_t>(expected.bytes());
+ op.set_output_block(std::move(expected));
+
+ Block output;
+ bool eos = false;
+ ASSERT_TRUE(op.get_block_after_projects(runtime_state.get(), &output, &eos).ok());
+ ASSERT_TRUE(eos);
+
+ auto* local_state = runtime_state->get_local_state(op.operator_id());
+ EXPECT_EQ(local_state->common_profile()->get_counter("RowsProduced")->value(), 1);
+ EXPECT_EQ(local_state->common_profile()->get_counter("BlocksProduced")->value(), 1);
+ EXPECT_EQ(local_state->common_profile()->get_counter("OutputBlockBytes")->value(),
+ expected_bytes);
+ EXPECT_EQ(local_state->common_profile()->get_counter("MaxOutputBlockBytes")->value(),
+ expected_bytes);
+ EXPECT_EQ(local_state->common_profile()->get_counter("MinOutputBlockBytes")->value(),
+ expected_bytes);
+}
+
 } // namespace doris
diff --git a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java
index e4627c56f57b2e..85655b56016c60 100644
--- a/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java
+++ b/fe/be-java-extensions/hadoop-hudi-scanner/src/main/java/org/apache/doris/hudi/HadoopHudiJniScanner.java
@@ -168,7 +168,7 @@ public int getNext() throws IOException {
 ArrayWritable value = reader.createValue();
 long startTime = System.nanoTime();
 int numRows = 0;
- for (; numRows < fetchSize; numRows++) {
+ for (; numRows < batchSize; numRows++) {
 if (!reader.next(key, value)) {
 break;
 }
diff --git a/fe/be-java-extensions/java-common/src/main/java/org/apache/doris/common/jni/JniScanner.java b/fe/be-java-extensions/java-common/src/main/java/org/apache/doris/common/jni/JniScanner.java
index 8bb8a664ccf516..4e2e23b5ae0851 100644
--- a/fe/be-java-extensions/java-common/src/main/java/org/apache/doris/common/jni/JniScanner.java
+++ b/fe/be-java-extensions/java-common/src/main/java/org/apache/doris/common/jni/JniScanner.java
@@ -78,6 +78,10 @@ protected int getBatchSize() {
 return batchSize;
 }

+ public void setBatchSize(int batchSize) {
+ this.batchSize = batchSize;
+ }
+
 public VectorTable getTable() {
 return vectorTable;
 }
diff --git a/fe/be-java-extensions/java-common/src/test/java/org/apache/doris/common/jni/JniScannerTest.java b/fe/be-java-extensions/java-common/src/test/java/org/apache/doris/common/jni/JniScannerTest.java
index 97a5ad5ef3895b..74683955411801 100644
--- a/fe/be-java-extensions/java-common/src/test/java/org/apache/doris/common/jni/JniScannerTest.java
+++ b/fe/be-java-extensions/java-common/src/test/java/org/apache/doris/common/jni/JniScannerTest.java
@@ -61,4 +61,46 @@ public void testMockJniScanner() throws IOException {
 scanner.releaseTable();
 scanner.close();
 }
+
+ @Test
+ public void testSetBatchSize() throws IOException {
+ OffHeap.setTesting();
+ MockJniScanner scanner = new MockJniScanner(16, new HashMap<String, String>() {
+ {
+ 
put("mock_rows", "64"); + put("required_fields", "int"); + put("columns_types", "int"); + } + }); + scanner.open(); + + // First batch: batchSize = 16 + long metaAddress = scanner.getNextBatchMeta(); + Assert.assertNotEquals(0, metaAddress); + Assert.assertEquals(16, OffHeap.getLong(null, metaAddress)); + scanner.resetTable(); + + // Change batch size to 32 + scanner.setBatchSize(32); + Assert.assertEquals(32, scanner.getBatchSize()); + + // Second batch: should read 32 rows with updated batchSize + metaAddress = scanner.getNextBatchMeta(); + Assert.assertNotEquals(0, metaAddress); + Assert.assertEquals(32, OffHeap.getLong(null, metaAddress)); + scanner.resetTable(); + + // Third batch: only 16 rows remaining + metaAddress = scanner.getNextBatchMeta(); + Assert.assertNotEquals(0, metaAddress); + Assert.assertEquals(16, OffHeap.getLong(null, metaAddress)); + scanner.resetTable(); + + // EOF + metaAddress = scanner.getNextBatchMeta(); + Assert.assertEquals(0, metaAddress); + + scanner.releaseTable(); + scanner.close(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 9576f729584a5f..d8eddbd616bbfd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -162,6 +162,7 @@ public class SessionVariable implements Serializable, Writable { public static final int MIN_EXEC_MEM_LIMIT = 2097152; public static final String BATCH_SIZE = "batch_size"; public static final String BROKER_LOAD_BATCH_SIZE = "broker_load_batch_size"; + public static final String PREFERRED_BLOCK_SIZE_BYTES = "preferred_block_size_bytes"; public static final String DISABLE_STREAMING_PREAGGREGATIONS = "disable_streaming_preaggregations"; public static final String ENABLE_DISTINCT_STREAMING_AGGREGATION = "enable_distinct_streaming_aggregation"; public static final String ENABLE_STREAMING_AGG_HASH_JOIN_FORCE_PASSTHROUGH = @@ -1274,7 +1275,8 @@ public void checkQuerySlotCount(String slotCnt) { @VariableMgr.VarAttr(name = HAVE_QUERY_CACHE, flag = VariableMgr.READ_ONLY) public boolean haveQueryCache = false; - // 4096 minus 16 + 16 bytes padding that in padding pod array + // 8192 minus 16 + 16 bytes padding that in padding pod array. + // This remains the row cap for output blocks even when adaptive byte budgeting is enabled. @VariableMgr.VarAttr(name = BATCH_SIZE, fuzzy = true, checker = "checkBatchSize", needForward = true) public int batchSize = 8160; @@ -1282,7 +1284,18 @@ public void checkQuerySlotCount(String slotCnt) { @VariableMgr.VarAttr(name = BROKER_LOAD_BATCH_SIZE, fuzzy = true, checker = "checkBatchSize") public int brokerLoadBatchSize = 16352; + // Target output block size in bytes for adaptive batch size. + // Valid range: [1MB, 512MB]. Default 8MB. + @VariableMgr.VarAttr(name = PREFERRED_BLOCK_SIZE_BYTES, needForward = true, + checker = "checkPreferredBlockSizeBytes", + description = {"目标输出 Block 字节数上限,自适应 batch size 功能使用。" + + "范围 [1MB, 512MB],默认 8MB", + "Target output block size in bytes for adaptive batch size. " + + "Range [1MB, 512MB]. 
Default 8MB."}) + public long preferredBlockSizeBytes = 8388608L; // 8MB + @VariableMgr.VarAttr(name = DISABLE_STREAMING_PREAGGREGATIONS, fuzzy = true) + public boolean disableStreamPreaggregations = false; @VariableMgr.VarAttr(name = ENABLE_DISTINCT_STREAMING_AGGREGATION, fuzzy = true) @@ -5269,6 +5282,7 @@ public TQueryOptions toThrift() { tResult.setEnableShareHashTableForBroadcastJoin(enableShareHashTableForBroadcastJoin); tResult.setBatchSize(batchSize); + tResult.setPreferredBlockSizeBytes(preferredBlockSizeBytes); tResult.setDisableStreamPreaggregations(disableStreamPreaggregations); tResult.setEnableDistinctStreamingAggregation(enableDistinctStreamingAggregation); tResult.setEnableStreamingAggHashJoinForcePassthrough(enableStreamingAggHashJoinForcePassthrough); @@ -5944,6 +5958,20 @@ public void checkBatchSize(String batchSize) { } } + + private static final long PREFERRED_BLOCK_SIZE_BYTES_MIN = 1048576L; // 1MB + private static final long PREFERRED_BLOCK_SIZE_BYTES_MAX = 536870912L; // 512MB + + public void checkPreferredBlockSizeBytes(String value) { + long v = Long.parseLong(value); + if (v < PREFERRED_BLOCK_SIZE_BYTES_MIN || v > PREFERRED_BLOCK_SIZE_BYTES_MAX) { + throw new InvalidParameterException( + "preferred_block_size_bytes should be between 1MB (" + + PREFERRED_BLOCK_SIZE_BYTES_MIN + ") and 512MB (" + + PREFERRED_BLOCK_SIZE_BYTES_MAX + "), got " + v); + } + } + public void checkSkewRewriteAggBucketNum(String bucketNumStr) { try { long bucketNum = Long.parseLong(bucketNumStr); diff --git a/fe/fe-core/src/test/java/org/apache/doris/qe/VariableMgrTest.java b/fe/fe-core/src/test/java/org/apache/doris/qe/VariableMgrTest.java index 7004e85b5f2037..2cc9d43b31167b 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/qe/VariableMgrTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/qe/VariableMgrTest.java @@ -35,6 +35,7 @@ import org.apache.doris.nereids.trees.plans.commands.SetOptionsCommand; import org.apache.doris.nereids.trees.plans.logical.LogicalPlan; import org.apache.doris.nereids.types.BigIntType; +import org.apache.doris.thrift.TQueryOptions; import org.apache.doris.utframe.UtFrameUtils; import org.apache.commons.io.FileUtils; @@ -191,4 +192,49 @@ public void testCheckSqlConvertorFeatures() throws DdlException { VariableMgr.setVar(var, setVar); Assert.assertEquals(new String[] {""}, var.getSqlConvertorFeatures()); } + + @Test + public void testAdaptiveBatchSizeDefaultsToThrift() { + SessionVariable var = new SessionVariable(); + TQueryOptions options = var.toThrift(); + + Assert.assertEquals(8160, var.batchSize); + Assert.assertEquals(8160, options.getBatchSize()); + Assert.assertEquals(8388608L, options.getPreferredBlockSizeBytes()); + } + + @Test + public void testAdaptiveBatchSizeSessionVariables() throws Exception { + SessionVariable var = new SessionVariable(); + + VariableMgr.setVar(var, new SetVar(SetType.SESSION, SessionVariable.BATCH_SIZE, + new StringLiteral("12345"))); + VariableMgr.setVar(var, new SetVar(SetType.SESSION, SessionVariable.PREFERRED_BLOCK_SIZE_BYTES, + new StringLiteral("1048576"))); + + TQueryOptions options = var.toThrift(); + Assert.assertEquals(12345, var.batchSize); + Assert.assertEquals(1048576L, var.preferredBlockSizeBytes); + Assert.assertEquals(12345, options.getBatchSize()); + Assert.assertEquals(1048576L, options.getPreferredBlockSizeBytes()); + } + + @Test + public void testAdaptiveBatchSizeRejectsTinyNonZeroBytes() { + SessionVariable var = new SessionVariable(); + DdlException exception = 
Assert.assertThrows(DdlException.class, () -> VariableMgr.setVar(var, + new SetVar(SetType.SESSION, SessionVariable.PREFERRED_BLOCK_SIZE_BYTES, + new StringLiteral("1")))); + Assert.assertTrue(exception.getMessage().contains("preferred_block_size_bytes")); + } + + @Test + public void testAdaptiveBatchSizeRejectsZeroByteValues() { + SessionVariable var = new SessionVariable(); + + DdlException blockSizeException = Assert.assertThrows(DdlException.class, () -> VariableMgr.setVar(var, + new SetVar(SetType.SESSION, SessionVariable.PREFERRED_BLOCK_SIZE_BYTES, + new StringLiteral("0")))); + Assert.assertTrue(blockSizeException.getMessage().contains("preferred_block_size_bytes")); + } } diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 3094be0bff3832..ff473f89e53dbe 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -472,10 +472,23 @@ struct TQueryOptions { 184: optional i32 cte_max_recursion_depth; + // Enable hybrid sorting: dynamically selects between PdqSort and TimSort based on // runtime profiling to choose the most efficient algorithm for the data pattern 210: optional bool enable_use_hybrid_sort = false; + 211: optional bool enable_adaptive_scan = false; + 212: optional bool enable_local_exchange_before_agg = true; + 213: optional double max_scan_mem_ratio = 0.3; + + // Use Rust-based Lance reader for FORMAT_LANCE scan ranges + 216: optional bool enable_rust_lance_reader = false; + 217: optional bool new_version_percentile = false + + // Adaptive batch size: target output block size in bytes. Valid range [1MB, 512MB]. + // Default 8MB. Sent by FE session variable preferred_block_size_bytes. + 218: optional i64 preferred_block_size_bytes = 8388608 + // For cloud, to control if the content would be written into file cache // In write path, to control if the content would be written into file cache. // In read path, read from file cache or remote storage when execute query. diff --git a/regression-test/data/query_p0/adaptive_batch_size/adaptive_batch_size.out b/regression-test/data/query_p0/adaptive_batch_size/adaptive_batch_size.out new file mode 100644 index 00000000000000..d42f122b65ff6a --- /dev/null +++ b/regression-test/data/query_p0/adaptive_batch_size/adaptive_batch_size.out @@ -0,0 +1,73 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !wide -- +1 1000 1000 1000 +10 1000 1000 1000 +11 1000 1000 1000 +12 1000 1000 1000 +13 1000 1000 1000 +14 1000 1000 1000 +15 1000 1000 1000 +16 1000 1000 1000 +17 1000 1000 1000 +18 1000 1000 1000 +19 1000 1000 1000 +2 1000 1000 1000 +20 1000 1000 1000 +21 1000 1000 1000 +22 1000 1000 1000 +23 1000 1000 1000 +24 1000 1000 1000 +25 1000 1000 1000 +26 1000 1000 1000 +27 1000 1000 1000 +28 1000 1000 1000 +29 1000 1000 1000 +3 1000 1000 1000 +30 1000 1000 1000 +31 1000 1000 1000 +32 1000 1000 1000 +33 1000 1000 1000 +34 1000 1000 1000 +35 1000 1000 1000 +36 1000 1000 1000 +37 1000 1000 1000 +38 1000 1000 1000 +39 1000 1000 1000 +4 1000 1000 1000 +40 1000 1000 1000 +41 1000 1000 1000 +42 1000 1000 1000 +43 1000 1000 1000 +44 1000 1000 1000 +45 1000 1000 1000 +46 1000 1000 1000 +47 1000 1000 1000 +48 1000 1000 1000 +49 1000 1000 1000 +5 1000 1000 1000 +50 1000 1000 1000 +6 1000 1000 1000 +7 1000 1000 1000 +8 1000 1000 1000 +9 1000 1000 1000 + +-- !narrow -- +24995000 37492500 49990000 + +-- !agg -- +1 3 +10 30 +2 6 +3 9 +4 12 +5 15 +6 18 +7 21 +8 24 +9 27 + +-- !unique -- +3000 4498500 + +-- !flag -- +4950 diff --git a/regression-test/suites/fault_injection_p0/test_skip_calc_between_segments.groovy b/regression-test/suites/fault_injection_p0/test_skip_calc_between_segments.groovy index 5a127335d25298..cdfcf5b4df1020 100644 --- a/regression-test/suites/fault_injection_p0/test_skip_calc_between_segments.groovy +++ b/regression-test/suites/fault_injection_p0/test_skip_calc_between_segments.groovy @@ -95,7 +95,8 @@ suite("test_skip_calc_between_segments", "nonConcurrent") { // to cause multi segments def customBeConfig = [ - doris_scanner_row_bytes : 1 + doris_scanner_row_bytes : 1, + enable_adaptive_batch_size: false ] setBeConfigTemporary(customBeConfig) { diff --git a/regression-test/suites/query_p0/adaptive_batch_size/adaptive_batch_size.groovy b/regression-test/suites/query_p0/adaptive_batch_size/adaptive_batch_size.groovy new file mode 100644 index 00000000000000..644f588a0c8a0e --- /dev/null +++ b/regression-test/suites/query_p0/adaptive_batch_size/adaptive_batch_size.groovy @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Regression tests for the Adaptive Batch Size feature. +// +// Design notes: +// - Each case runs the same query with the feature enabled and disabled, and +// asserts that results are identical (correctness check). +// - We do NOT directly assert internal block byte sizes, because the storage +// layer does not expose them via SQL result columns. Correctness is the +// primary requirement; performance / memory reduction is verified manually +// or via profile counters in a separate benchmark. 
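+//
+// Mechanics of each case (see set_adaptive below): flip the BE config
+// enable_adaptive_batch_size on every backend, pin the session variables
+// preferred_block_size_bytes and batch_size, run the query once per setting,
+// and assert the two result sets are identical. The finally block resets the
+// BE config and session variables to their defaults so later suites are unaffected.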
+ +suite("adaptive_batch_size") { + + // ── helpers ──────────────────────────────────────────────────────────────── + + def set_adaptive = { enabled -> + if (enabled) { + set_be_param("enable_adaptive_batch_size", "true") + sql "set preferred_block_size_bytes = 8388608" // 8 MB (default) + sql "set batch_size = 4096" + } else { + set_be_param("enable_adaptive_batch_size", "false") + sql "set preferred_block_size_bytes = 8388608" + sql "set batch_size = 4096" + } + } + + try { + // ── Test 1: wide table (VARCHAR columns) ────────────────────────────────── + // Each row is ~10 KB; with 4096 rows that is ~40 MB/batch which OOM-risks. + // With adaptive=on the batch is trimmed to ~8 MB worth of rows. + + sql "drop table if exists abs_wide_table" + sql """ + create table abs_wide_table ( + id int not null, + c1 varchar(4096), + c2 varchar(4096), + c3 varchar(4096) + ) + ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_allocation" = "tag.location.default: 1") + """ + + // Insert 1000 rows with ~3 KB data each. + def wide_rows = (1..1000).collect { i -> + "(${i}, '${('a' * 1000)}', '${('b' * 1000)}', '${('c' * 1000)}')" + } + sql "insert into abs_wide_table values ${wide_rows.join(',')}" + + // Run query with adaptive enabled and collect result. + set_adaptive(true) + def res_enabled = sql "select id, length(c1) as l1, length(c2) as l2, length(c3) as l3 from abs_wide_table order by 1, 2, 3, 4" + + order_qt_wide "select id, length(c1) as l1, length(c2) as l2, length(c3) as l3 from abs_wide_table order by 1, 2, 3, 4 limit 50" + + // Run query with adaptive disabled and collect result. + set_adaptive(false) + def res_disabled = sql "select id, length(c1) as l1, length(c2) as l2, length(c3) as l3 from abs_wide_table order by 1, 2, 3, 4" + + // Results must be identical. + assertEquals(res_enabled.size(), res_disabled.size()) + for (int i = 0; i < res_enabled.size(); i++) { + assertEquals(res_enabled[i].toString(), res_disabled[i].toString()) + } + + + // ── Test 2: narrow table (INT columns) ─────────────────────────────────── + // Rows are ~12 bytes each; with adaptive=on the predictor should converge + // toward returning close to batch_size (batch is still row-limited). + + sql "drop table if exists abs_narrow_table" + sql """ + create table abs_narrow_table ( + id int not null, + c1 int, + c2 int, + c3 int + ) + ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_allocation" = "tag.location.default: 1") + """ + + sql "insert into abs_narrow_table select number, number*2, number*3, number*4 from numbers('number'='5000')" + + set_adaptive(true) + def narrow_on = sql "select sum(c1), sum(c2), sum(c3) from abs_narrow_table" + + order_qt_narrow "select sum(c1), sum(c2), sum(c3) from abs_narrow_table" + + set_adaptive(false) + def narrow_off = sql "select sum(c1), sum(c2), sum(c3) from abs_narrow_table" + + assertEquals(narrow_on.toString(), narrow_off.toString()) + + + // ── Test 3: AGG_KEYS table ──────────────────────────────────────────────── + // Verifies that adaptive batch size does not break aggregation correctness + // (the byte check in _agg_key_next_block must only trigger at group boundaries). 
+ + sql "drop table if exists abs_agg_table" + sql """ + create table abs_agg_table ( + id int not null, + val bigint replace + ) + ENGINE=OLAP + AGGREGATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_allocation" = "tag.location.default: 1") + """ + + // 2000 distinct keys, 3 rows per key → 6000 rows total. + def agg_rows = [] + for (int k = 1; k <= 2000; k++) { + agg_rows << "(${k}, ${k})" + agg_rows << "(${k}, ${k * 2})" + agg_rows << "(${k}, ${k * 3})" + } + sql "insert into abs_agg_table values ${agg_rows.join(',')}" + + set_adaptive(true) + def agg_on = sql "select id, val from abs_agg_table order by 1, 2 limit 10" + + order_qt_agg "select id, val from abs_agg_table order by 1, 2 limit 10" + + set_adaptive(false) + def agg_off = sql "select id, val from abs_agg_table order by 1, 2 limit 10" + + assertEquals(agg_on.toString(), agg_off.toString()) + + + // ── Test 4: UNIQUE_KEYS table ───────────────────────────────────────────── + // Verifies that adaptive byte-stop in _unique_key_next_block does not + // cause duplicate or missing rows. + + sql "drop table if exists abs_unique_table" + sql """ + create table abs_unique_table ( + id int not null, + name varchar(200) + ) + ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_allocation" = "tag.location.default: 1") + """ + + sql "insert into abs_unique_table select number, repeat('x', 100) from numbers('number'='3000')" + + set_adaptive(true) + def uniq_on = sql "select count(*), sum(id) from abs_unique_table" + + order_qt_unique "select count(*), sum(id) from abs_unique_table" + + set_adaptive(false) + def uniq_off = sql "select count(*), sum(id) from abs_unique_table" + + assertEquals(uniq_on.toString(), uniq_off.toString()) + + + // ── Test 5: verify setting enable_adaptive_batch_size = false disables adaptive sizing ── + + sql "drop table if exists abs_flag_table" + sql """ + create table abs_flag_table (id int not null, v int) + ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_allocation" = "tag.location.default: 1") + """ + sql "insert into abs_flag_table select number, number from numbers('number'='100')" + + set_adaptive(false) + def flag_off = sql "select sum(v) from abs_flag_table" + + order_qt_flag "select sum(v) from abs_flag_table" + + set_adaptive(true) + def flag_on = sql "select sum(v) from abs_flag_table" + + assertEquals(flag_off.toString(), flag_on.toString()) + } finally { + set_adaptive(true) + sql "set preferred_block_size_bytes = 8388608" + sql "set batch_size = 8160" + } +} diff --git a/regression-test/suites/unique_with_mow_c_p0/test_compact_multi_segments.groovy b/regression-test/suites/unique_with_mow_c_p0/test_compact_multi_segments.groovy index b9d3a28bfb6352..97ab4d5e8bed6a 100644 --- a/regression-test/suites/unique_with_mow_c_p0/test_compact_multi_segments.groovy +++ b/regression-test/suites/unique_with_mow_c_p0/test_compact_multi_segments.groovy @@ -45,13 +45,17 @@ suite("test_compact_multi_segments", "nonConcurrent") { // batch_size is 4164 in csv_reader.cpp // _batch_size is 8192 in vtablet_writer.cpp + def backendId_to_params = get_be_param("doris_scanner_row_bytes") + def backendId_to_adaptive_batch_size = get_be_param("enable_adaptive_batch_size") onFinish { GetDebugPoint().disableDebugPointForAllBEs("MemTable.need_flush") set_original_be_param("doris_scanner_row_bytes", backendId_to_params) + set_original_be_param("enable_adaptive_batch_size", backendId_to_adaptive_batch_size) } 
GetDebugPoint().enableDebugPointForAllBEs("MemTable.need_flush") set_be_param.call("doris_scanner_row_bytes", "1") + set_be_param.call("enable_adaptive_batch_size", "false") for (int j = 0; j < 2; j++) { tableName = "test_compact_multi_segments_" + j diff --git a/regression-test/suites/unique_with_mow_c_p0/test_schema_change_add_key_column.groovy b/regression-test/suites/unique_with_mow_c_p0/test_schema_change_add_key_column.groovy index f3d9429c74760f..f64bbfc5dc14a5 100644 --- a/regression-test/suites/unique_with_mow_c_p0/test_schema_change_add_key_column.groovy +++ b/regression-test/suites/unique_with_mow_c_p0/test_schema_change_add_key_column.groovy @@ -57,13 +57,16 @@ suite("test_schema_change_add_key_column", "nonConcurrent") { // batch_size is 4164 in csv_reader.cpp // _batch_size is 8192 in vtablet_writer.cpp def backendId_to_params = get_be_param("doris_scanner_row_bytes") + def backendId_to_adaptive_batch_size = get_be_param("enable_adaptive_batch_size") onFinish { GetDebugPoint().clearDebugPointsForAllBEs() set_original_be_param("doris_scanner_row_bytes", backendId_to_params) + set_original_be_param("enable_adaptive_batch_size", backendId_to_adaptive_batch_size) } GetDebugPoint().enableDebugPointForAllBEs("MemTable.need_flush") GetDebugPoint().enableDebugPointForAllBEs("VBaseSchemaChangeWithSorting._inner_process.create_rowset") set_be_param.call("doris_scanner_row_bytes", "1") + set_be_param.call("enable_adaptive_batch_size", "false") // 0: table without sequence_col; add a key col // 1: table without sequence_col; reorder cols