diff --git a/Cargo.lock b/Cargo.lock index 137d6234e8f..12031559311 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10332,6 +10332,7 @@ dependencies = [ "serde_json", "serde_test", "simdutf8", + "smallvec", "static_assertions", "tabled", "termtree", @@ -11126,6 +11127,7 @@ dependencies = [ "num-traits", "prost 0.14.3", "rstest", + "smallvec", "vortex-array", "vortex-buffer", "vortex-error", diff --git a/Cargo.toml b/Cargo.toml index 129ef446e4c..f8726a6f265 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -231,6 +231,7 @@ sha2 = "0.11.0" simdutf8 = "0.1.5" similar = "3.0.0" sketches-ddsketch = "0.4.0" +smallvec = "1.15.1" smol = "2.0.2" static_assertions = "1.1" strum = "0.28" diff --git a/encodings/sequence/Cargo.toml b/encodings/sequence/Cargo.toml index 06f695cb989..5dd9c5c187d 100644 --- a/encodings/sequence/Cargo.toml +++ b/encodings/sequence/Cargo.toml @@ -16,6 +16,7 @@ version = { workspace = true } [dependencies] num-traits = { workspace = true } prost = { workspace = true } +smallvec = { workspace = true } vortex-array = { workspace = true } vortex-buffer = { workspace = true } vortex-error = { workspace = true } diff --git a/encodings/sequence/src/array.rs b/encodings/sequence/src/array.rs index 74bebcc5510..2ee0b22d89c 100644 --- a/encodings/sequence/src/array.rs +++ b/encodings/sequence/src/array.rs @@ -8,6 +8,7 @@ use std::hash::Hasher; use num_traits::cast::FromPrimitive; use prost::Message; +use smallvec::smallvec; use vortex_array::Array; use vortex_array::ArrayEq; use vortex_array::ArrayHash; @@ -383,7 +384,7 @@ impl Sequence { // SAFETY: we don't have duplicate stats. unsafe { - StatsSet::new_unchecked(vec![ + StatsSet::new_unchecked(smallvec![ (Stat::IsSorted, StatPrecision::Exact(is_sorted.into())), ( Stat::IsStrictSorted, diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 9701759a787..0e0b3a36592 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -61,6 +61,7 @@ rstest_reuse = { workspace = true, optional = true } rustc-hash = { workspace = true } serde = { workspace = true, optional = true, features = ["derive", "rc"] } simdutf8 = { workspace = true } +smallvec = { workspace = true } static_assertions = { workspace = true } tabled = { workspace = true, optional = true, default-features = false, features = [ "std", diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index 48a6af40db8..23b79b2d061 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -19344,7 +19344,7 @@ pub fn vortex_array::stats::StatsSet::as_mut_typed_ref<'a, 'b>(&'a mut self, &'b pub fn vortex_array::stats::StatsSet::as_typed_ref<'a, 'b>(&'a self, &'b vortex_array::dtype::DType) -> vortex_array::stats::TypedStatsSetRef<'a, 'b> -pub unsafe fn vortex_array::stats::StatsSet::new_unchecked(alloc::vec::Vec<(vortex_array::expr::stats::Stat, vortex_array::expr::stats::Precision)>) -> Self +pub unsafe fn vortex_array::stats::StatsSet::new_unchecked(smallvec::SmallVec) -> Self pub fn vortex_array::stats::StatsSet::of(vortex_array::expr::stats::Stat, vortex_array::expr::stats::Precision) -> Self @@ -19508,6 +19508,8 @@ pub fn vortex_array::stats::as_stat_bitset_bytes(&[vortex_array::expr::stats::St pub fn vortex_array::stats::stats_from_bitset_bytes(&[u8]) -> alloc::vec::Vec +pub type vortex_array::stats::StatsArray = [(vortex_array::expr::stats::Stat, vortex_array::expr::stats::Precision); 4] + pub mod vortex_array::stream pub struct vortex_array::stream::ArrayStreamAdapter diff --git a/vortex-array/src/arrays/dict/take.rs b/vortex-array/src/arrays/dict/take.rs index b821af79fac..78a80ff8277 100644 --- a/vortex-array/src/arrays/dict/take.rs +++ b/vortex-array/src/arrays/dict/take.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use smallvec::SmallVec; use vortex_error::VortexResult; use super::Dict; @@ -164,7 +165,7 @@ pub(crate) fn propagate_take_stats( .and_then(|v| v.map(|s| s.into_value()).into_inexact().transpose()) .map(|sv| (stat, sv)) }) - .collect::>(); + .collect::>(); st.combine_sets( &(unsafe { StatsSet::new_unchecked(inexact_min_max) }).as_typed_ref(source.dtype()), ) diff --git a/vortex-array/src/stats/stats_set.rs b/vortex-array/src/stats/stats_set.rs index a0ca186fed6..0b9430ef155 100644 --- a/vortex-array/src/stats/stats_set.rs +++ b/vortex-array/src/stats/stats_set.rs @@ -3,9 +3,10 @@ use std::fmt::Debug; -use enum_iterator::Sequence; use enum_iterator::all; use num_traits::CheckedAdd; +use smallvec::SmallVec; +use smallvec::smallvec; use vortex_error::VortexError; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -31,9 +32,12 @@ use crate::expr::stats::UncompressedSizeInBytes; use crate::scalar::Scalar; use crate::scalar::ScalarValue; +/// Type of the SmallVec stored inside StatsSet +pub type StatsArray = [(Stat, Precision); 4]; + #[derive(Default, Debug, Clone)] pub struct StatsSet { - values: Vec<(Stat, Precision)>, + values: SmallVec, } impl StatsSet { @@ -42,20 +46,14 @@ impl StatsSet { /// # Safety /// /// This method will not panic or trigger UB, but may lead to duplicate stats being stored. - pub unsafe fn new_unchecked(values: Vec<(Stat, Precision)>) -> Self { + pub unsafe fn new_unchecked(values: SmallVec) -> Self { Self { values } } /// Create StatsSet from single stat and value pub fn of(stat: Stat, value: Precision) -> Self { - // SAFETY: No duplicate stats will be set here. - unsafe { Self::new_unchecked(vec![(stat, value)]) } - } - - fn reserve_full_capacity(&mut self) { - if self.values.capacity() < Stat::CARDINALITY { - self.values - .reserve_exact(Stat::CARDINALITY - self.values.capacity()); + Self { + values: smallvec![(stat, value)], } } @@ -80,8 +78,6 @@ impl StatsSet { impl StatsSet { /// Set the stat `stat` to `value`. pub fn set(&mut self, stat: Stat, value: Precision) { - self.reserve_full_capacity(); - if let Some(existing) = self.values.iter_mut().find(|(s, _)| *s == stat) { *existing = (stat, value); } else { @@ -154,7 +150,7 @@ impl StatsSet { /// Owned iterator over the stats. /// /// See [IntoIterator]. -pub struct StatsSetIntoIter(std::vec::IntoIter<(Stat, Precision)>); +pub struct StatsSetIntoIter(smallvec::IntoIter); impl Iterator for StatsSetIntoIter { type Item = (Stat, Precision); @@ -176,10 +172,10 @@ impl IntoIterator for StatsSet { impl FromIterator<(Stat, Precision)> for StatsSet { fn from_iter)>>(iter: T) -> Self { let iter = iter.into_iter(); - let mut values = Vec::default(); - values.reserve_exact(Stat::CARDINALITY); - let mut this = Self { values }; + let mut this = Self { + values: SmallVec::new(), + }; this.extend(iter); this } @@ -188,10 +184,8 @@ impl FromIterator<(Stat, Precision)> for StatsSet { impl Extend<(Stat, Precision)> for StatsSet { #[inline] fn extend)>>(&mut self, iter: T) { - let iter = iter.into_iter(); - self.reserve_full_capacity(); - - iter.for_each(|(stat, value)| self.set(stat, value)); + iter.into_iter() + .for_each(|(stat, value)| self.set(stat, value)); } } @@ -574,6 +568,7 @@ impl MutTypedStatsSetRef<'_, '_> { mod test { use enum_iterator::all; use itertools::Itertools; + use smallvec::smallvec; use crate::LEGACY_SESSION; use crate::VortexSessionExecute; @@ -593,7 +588,7 @@ mod test { fn test_iter() { // SAFETY: No duplicate stats. let set = unsafe { - StatsSet::new_unchecked(vec![ + StatsSet::new_unchecked(smallvec![ (Stat::Max, Precision::exact(100)), (Stat::Min, Precision::exact(42)), ]) @@ -621,7 +616,7 @@ mod test { fn into_iter() { // SAFETY: No duplicate stats. let mut set = unsafe { - StatsSet::new_unchecked(vec![ + StatsSet::new_unchecked(smallvec![ (Stat::Max, Precision::exact(100)), (Stat::Min, Precision::exact(42)), ])