Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions vortex-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,10 @@ harness = false
name = "filter_bool"
harness = false

[[bench]]
name = "list_length"
harness = false

[[bench]]
name = "listview_rebuild"
harness = false
Expand Down
146 changes: 146 additions & 0 deletions vortex-array/benches/list_length.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Benchmarks for the `list_length` scalar function over `List` and `ListView` inputs.
//!
//! `list_length` reads only the offsets/sizes (never the elements), so its cost scales with the
//! number of lists.

#![expect(clippy::unwrap_used)]
#![expect(clippy::cast_possible_truncation)]

use std::sync::LazyLock;

use divan::Bencher;
use rand::RngExt;
use rand::SeedableRng;
use rand::distr::Uniform;
use rand::rngs::StdRng;
use vortex_array::ArrayRef;
use vortex_array::Canonical;
use vortex_array::IntoArray;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::ListArray;
use vortex_array::arrays::ListViewArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::expr::list_length;
use vortex_array::expr::root;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;
use vortex_session::VortexSession;

fn main() {
divan::main();
Comment thread
gatesn marked this conversation as resolved.
}

static SESSION: LazyLock<VortexSession> = LazyLock::new(vortex_array::array_session);

const BASE_LIST_SIZE: usize = 8;

const SMALL: usize = 100;
const MEDIUM: usize = 10_000;
const LARGE: usize = 1_000_000;

/// A uniformly-random partition of `num_lists * BASE_LIST_SIZE` elements into `num_lists` lists,
/// plus a validity mask with ~1/8 of lists null at random positions.
///
/// Returns the per-list sizes (which sum to `num_lists * BASE_LIST_SIZE`) together with a
/// boolean-array-backed [`Validity`] of the same length. The RNG is seeded from `num_lists`, so
/// the output is deterministic for a given input.
///
/// For `num_lists == 0` both the sizes and the validity are empty.
fn random_lists(num_lists: usize) -> (Vec<i32>, Validity) {
    let mut rng = StdRng::seed_from_u64(num_lists as u64);
    let total = (num_lists * BASE_LIST_SIZE) as i32;

    // Splitting `[0, total]` into `num_lists` pieces needs `num_lists - 1` interior cut points.
    // `saturating_sub` keeps the zero-list case from underflowing.
    let cut_dist = Uniform::new_inclusive(0i32, total).unwrap();
    let mut cuts: Vec<i32> = (0..num_lists.saturating_sub(1))
        .map(|_| rng.sample(cut_dist))
        .collect();
    cuts.sort_unstable();

    // Each list's size is the distance between consecutive sorted cut points.
    let mut sizes = Vec::with_capacity(num_lists);
    let mut prev = 0i32;
    for cut in cuts {
        sizes.push(cut - prev);
        prev = cut;
    }
    // The last list runs from the final cut to `total`; there is no last list when
    // `num_lists == 0`.
    if num_lists > 0 {
        sizes.push(total - prev);
    }

    // A draw of 0 out of {0, .., 7} marks the list as null, i.e. ~1/8 of the lists.
    let null_dist = Uniform::new(0u32, 8).unwrap();
    let valid = (0..num_lists).map(|_| rng.sample(null_dist) != 0);
    (
        sizes,
        Validity::Array(BoolArray::from_iter(valid).into_array()),
    )
}

/// Builds a `ListArray` of `num_lists` variable-length `i32` lists, ~1/8 of them null.
///
/// The elements are the consecutive integers `0..total`, and the offsets buffer holds
/// `num_lists + 1` entries: a leading `0` followed by the running sum of the list sizes.
fn make_list(num_lists: usize) -> ArrayRef {
    let (sizes, validity) = random_lists(num_lists);

    let total: i32 = sizes.iter().sum();
    let elements = PrimitiveArray::from_iter(0..total).into_array();

    // Offsets are the prefix sums of the sizes, starting at zero.
    let mut running = 0i32;
    let mut ends = Vec::with_capacity(sizes.len() + 1);
    ends.push(running);
    for &len in &sizes {
        running += len;
        ends.push(running);
    }
    let offsets: Buffer<i32> = ends.into_iter().collect();

    let list = ListArray::try_new(elements, offsets.into_array(), validity).unwrap();
    list.into_array()
}

/// Builds a gapless `ListViewArray` of `num_lists` variable-length `i32` lists, ~1/8 of them
/// null.
///
/// The elements are the consecutive integers `0..total`. Each list starts exactly where the
/// previous one ended, so the offsets are the exclusive prefix sums of the sizes.
fn make_listview(num_lists: usize) -> ArrayRef {
    let (sizes, validity) = random_lists(num_lists);

    let total: i32 = sizes.iter().sum();
    let elements = PrimitiveArray::from_iter(0..total).into_array();

    // Record each list's start position before advancing past its elements.
    let mut next_start = 0i32;
    let mut starts = Vec::with_capacity(sizes.len());
    for &len in &sizes {
        starts.push(next_start);
        next_start += len;
    }
    let offsets: Buffer<i32> = starts.into_iter().collect();
    let sizes: Buffer<i32> = sizes.into_iter().collect();

    let view = ListViewArray::new(elements, offsets.into_array(), sizes.into_array(), validity);
    view.into_array()
}

/// Benchmarks applying `list_length(root())` to `array` and executing the result into a
/// [`Canonical`] array.
///
/// The expression is built once up front. Each sample receives a reference to `array` and an
/// execution context created from the shared `SESSION`; the timed closure clones the array
/// handle, applies the expression, and executes it.
fn run(bencher: Bencher, array: ArrayRef) {
    let length_expr = list_length(root());
    bencher
        .with_inputs(|| (&array, SESSION.create_execution_ctx()))
        .bench_refs(|(input, exec_ctx)| {
            let applied = input.clone().apply(&length_expr).unwrap();
            applied.execute::<Canonical>(exec_ctx).unwrap()
        });
}

/// `list_length` over a `ListArray` input of `SMALL` lists.
#[divan::bench]
fn list_length_small(bencher: Bencher) {
    let lists = make_list(SMALL);
    run(bencher, lists);
}

/// `list_length` over a `ListArray` input of `MEDIUM` lists.
#[divan::bench]
fn list_length_medium(bencher: Bencher) {
    let lists = make_list(MEDIUM);
    run(bencher, lists);
}

/// `list_length` over a `ListArray` input of `LARGE` lists.
#[divan::bench]
fn list_length_large(bencher: Bencher) {
    let lists = make_list(LARGE);
    run(bencher, lists);
}

/// `list_length` over a `ListViewArray` input of `SMALL` lists.
#[divan::bench]
fn listview_length_small(bencher: Bencher) {
    let views = make_listview(SMALL);
    run(bencher, views);
}

/// `list_length` over a `ListViewArray` input of `MEDIUM` lists.
#[divan::bench]
fn listview_length_medium(bencher: Bencher) {
    let views = make_listview(MEDIUM);
    run(bencher, views);
}

/// `list_length` over a `ListViewArray` input of `LARGE` lists.
#[divan::bench]
fn listview_length_large(bencher: Bencher) {
    let views = make_listview(LARGE);
    run(bencher, views);
}
15 changes: 15 additions & 0 deletions vortex-array/src/expr/exprs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use crate::scalar_fn::fns::is_null::IsNull;
use crate::scalar_fn::fns::like::Like;
use crate::scalar_fn::fns::like::LikeOptions;
use crate::scalar_fn::fns::list_contains::ListContains;
use crate::scalar_fn::fns::list_length::ListLength;
use crate::scalar_fn::fns::literal::Literal;
use crate::scalar_fn::fns::mask::Mask;
use crate::scalar_fn::fns::merge::DuplicateHandling;
Expand Down Expand Up @@ -750,3 +751,17 @@ pub fn byte_length(input: Expression) -> Expression {
pub fn ext_storage(input: Expression) -> Expression {
ExtStorage.new_expr(EmptyOptions, [input])
}

// ---- ListLength ----

/// Builds an expression yielding, for each list in `input`, the number of elements it holds.
///
/// Applies to `List` and `FixedSizeList` inputs. The closest equivalents elsewhere are ANSI SQL
/// `CARDINALITY()` and DuckDB's `len()`/`array_length()`.
///
/// ```rust
/// # use vortex_array::expr::{list_length, root};
/// let expr = list_length(root());
/// ```
pub fn list_length(input: Expression) -> Expression {
    // `ListLength` takes no options and exactly one child: the list-typed input expression.
    let children = [input];
    ListLength.new_expr(EmptyOptions, children)
}
Loading
Loading