Specialized GroupValues for primitive and large_primitive #16136
Conversation
Force-pushed afceb44 to bd002ad
Force-pushed bd002ad to 1b5cde9
///
-map: HashTable<(usize, u64)>,
+map: HashTable<(usize, T::Native)>,
Is storing values: Vec<T::Native> necessary in this case? It could be rebuilt in emit_internal by traversing the map items, I think (create a Vec and update it by group index). This avoids doing that work while inserting.
Something like:
diff --git i/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive/mod.rs w/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive/mod.rs
index 693cc997fa..c166c16f8e 100644
--- i/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive/mod.rs
+++ w/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive/mod.rs
@@ -25,7 +25,6 @@ use arrow::array::{
use arrow::datatypes::{i256, DataType};
use arrow::record_batch::RecordBatch;
use datafusion_common::Result;
-use datafusion_execution::memory_pool::proxy::VecAllocExt;
use datafusion_expr::EmitTo;
use half::f16;
use hashbrown::hash_table::HashTable;
@@ -33,7 +32,6 @@ use std::mem::size_of;
use std::sync::Arc;
mod large_primitive;
-pub use large_primitive::GroupValuesLargePrimitive;
/// A trait to allow hashing of floating point numbers
pub(crate) trait HashValue {
@@ -94,8 +92,6 @@ pub struct GroupValuesPrimitive<T: ArrowPrimitiveType> {
map: HashTable<(usize, T::Native)>,
/// The group index of the null value if any
null_group: Option<usize>,
- /// The values for each group index
- values: Vec<T::Native>,
/// The random state used to generate hashes
random_state: RandomState,
}
@@ -106,7 +102,6 @@ impl<T: ArrowPrimitiveType> GroupValuesPrimitive<T> {
Self {
data_type,
map: HashTable::with_capacity(128),
- values: Vec::with_capacity(128),
null_group: None,
random_state: Default::default(),
}
@@ -124,13 +119,14 @@ where
for v in cols[0].as_primitive::<T>() {
let group_id = match v {
None => *self.null_group.get_or_insert_with(|| {
- let group_id = self.values.len();
- self.values.push(Default::default());
+ let group_id = self.map.len();
group_id
}),
Some(key) => {
let state = &self.random_state;
let hash = key.hash(state);
+ let group_id = self.map.len();
+
let insert = self.map.entry(
hash,
|&(_, v)| v.is_eq(key),
@@ -140,10 +136,8 @@ where
match insert {
hashbrown::hash_table::Entry::Occupied(o) => o.get().0,
hashbrown::hash_table::Entry::Vacant(v) => {
- let g = self.values.len();
- v.insert((g, key));
- self.values.push(key);
- g
+ v.insert((group_id, key));
+ group_id
}
}
}
@@ -155,21 +149,19 @@ where
fn size(&self) -> usize {
self.map.capacity() * size_of::<(usize, T::Native)>()
- + self.values.allocated_size()
}
fn is_empty(&self) -> bool {
- self.values.is_empty()
+ self.map.is_empty()
}
fn len(&self) -> usize {
- self.values.len()
+ self.map.len()
}
fn emit(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
- emit_internal::<T, T::Native>(
+ emit_internal::<T>(
emit_to,
- &mut self.values,
&mut self.null_group,
&mut self.map,
self.data_type.clone(),
@@ -178,65 +170,64 @@ where
fn clear_shrink(&mut self, batch: &RecordBatch) {
let count = batch.num_rows();
- self.values.clear();
- self.values.shrink_to(count);
self.map.clear();
self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared
}
}
-pub(crate) fn emit_internal<T: ArrowPrimitiveType, K>(
+pub(crate) fn emit_internal<T: ArrowPrimitiveType>(
emit_to: EmitTo,
- values: &mut Vec<T::Native>,
null_group: &mut Option<usize>,
- map: &mut HashTable<(usize, K)>,
+ map: &mut HashTable<(usize, T::Native)>,
data_type: DataType,
) -> Result<Vec<ArrayRef>> {
fn build_primitive<T: ArrowPrimitiveType>(
- values: Vec<T::Native>,
+ map: HashTable<(usize, T::Native)>,
null_idx: Option<usize>,
) -> PrimitiveArray<T> {
let nulls = null_idx.map(|null_idx| {
- let mut buffer = NullBufferBuilder::new(values.len());
+ let mut buffer = NullBufferBuilder::new(map.len());
buffer.append_n_non_nulls(null_idx);
buffer.append_null();
- buffer.append_n_non_nulls(values.len() - null_idx - 1);
+ buffer.append_n_non_nulls(map.len() - null_idx - 1);
// NOTE: The inner builder must be constructed as there is at least one null
buffer.finish().unwrap()
});
+ let mut values: Vec<T::Native> = vec![T::default_value(); map.len()];
+ map.iter().for_each(|i| values[i.0] = i.1);
PrimitiveArray::<T>::new(values.into(), nulls)
}
let array: PrimitiveArray<T> = match emit_to {
EmitTo::All => {
- map.clear();
- build_primitive(std::mem::take(values), null_group.take())
+ build_primitive(std::mem::take(map), null_group.take())
}
EmitTo::First(n) => {
- map.retain(|entry| {
- // Decrement group index by n
- let group_idx = entry.0;
- match group_idx.checked_sub(n) {
- // Group index was >= n, shift value down
- Some(sub) => {
- entry.0 = sub;
- true
- }
- // Group index was < n, so remove from table
- None => false,
- }
- });
- let null_group = match null_group {
- Some(v) if *v >= n => {
- *v -= n;
- None
- }
- Some(_) => null_group.take(),
- None => None,
- };
- let mut split = values.split_off(n);
- std::mem::swap(values, &mut split);
- build_primitive(split, null_group)
+ todo!("");
+ // map.retain(|entry| {
+ // // Decrement group index by n
+ // let group_idx = entry.0;
+ // match group_idx.checked_sub(n) {
+ // // Group index was >= n, shift value down
+ // Some(sub) => {
+ // entry.0 = sub;
+ // true
+ // }
+ // // Group index was < n, so remove from table
+ // None => false,
+ // }
+ // });
+ // let null_group = match null_group {
+ // Some(v) if *v >= n => {
+ // *v -= n;
+ // None
+ // }
+ // Some(_) => null_group.take(),
+ // None => None,
+ // };
+ // let mut split = values.split_off(n);
+ // std::mem::swap(values, &mut split);
+ // build_primitive(split, null_group)
}
};
I want to try it, too. But I think it will lead to a regression, because that approach introduces many extra random writes during emit.
@@ -130,15 +133,15 @@ where
let hash = key.hash(state);
let insert = self.map.entry(
hash,
|&(g, _)| unsafe { self.values.get_unchecked(g).is_eq(key) },
I wonder if the existing performance couldn't already be improved by doing h == hash && unsafe { self.values.get_unchecked(g).is_eq(key) } (i.e. avoiding fetching the value when hash equality doesn't hold), like we do in other places. That check already filters out most candidates, since two different keys almost never share a hash.
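A minimal, self-contained sketch of that pattern (the name lookup_or_insert and the concrete u64 types are hypothetical; entries are assumed to be (group_idx, hash) pairs, matching the original map: HashTable<(usize, u64)>):

```rust
use hashbrown::hash_table::{Entry, HashTable};

// Compare the cached hash before touching the out-of-line values Vec, so the
// random read into `values` only happens on a genuine 64-bit hash collision.
fn lookup_or_insert(
    map: &mut HashTable<(usize, u64)>,
    values: &mut Vec<u64>,
    key: u64,
    hash: u64,
) -> usize {
    match map.entry(
        hash,
        // Cheap hash comparison first, value fetch only if hashes match
        |&(g, h)| h == hash && values[g] == key,
        |&(_, h)| h,
    ) {
        Entry::Occupied(o) => o.get().0,
        Entry::Vacant(v) => {
            let g = values.len();
            values.push(key);
            v.insert((g, hash));
            g
        }
    }
}
```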
I think for the large primitives (> 64 bits), we can do that to improve things. But for normal primitives (<= 64 bits), the new way will be better; I am testing it.
///
/// This specialization is significantly faster than using the more general
/// purpose `Row`s format
pub struct GroupValuesLargePrimitive<T: ArrowPrimitiveType> {
Why use large primitive? Is 128 bits slower otherwise?
Why use large primitive? Is 128 bits slower otherwise?

🤔 I currently decide which types use GroupValuesLargePrimitive according to whether their bit width is > 64. Because the hash is 64 bits, when the bit width is > 64 the map entries will get larger, and I am not sure it is still better to go the new way there.
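A hypothetical sketch of that decision (the helper name and the exact set of wide types are assumptions, not the PR's actual dispatch code):

```rust
use arrow::datatypes::DataType;

// Values whose native width fits in the 64-bit hash keep the inline-value
// table; wider native values (e.g. 128/256-bit decimals) would go through
// GroupValuesLargePrimitive instead.
fn use_large_primitive(data_type: &DataType) -> bool {
    match data_type {
        // 128- and 256-bit native values: wider than the 64-bit hash
        DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => true,
        // 8/16/32/64-bit integers, floats, dates, times, ...
        _ => false,
    }
}

fn main() {
    assert!(use_large_primitive(&DataType::Decimal128(38, 10)));
    assert!(!use_large_primitive(&DataType::Int64));
}
```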
Hm, I think the storage of a value inside HashTable will not be very different from Vec, besides being inline?
Hm, I think the storage of a value inside HashTable will not be very different from Vec, besides being inline?

I think for values <= 64 bits, storing them inside the HashTable:

- Can decrease the size of the HashTable compared to storing their hashes?
- Keeps probing and rehashing of the HashTable entirely in place, eliminating the random reads into the outside Vec?

But I am indeed not sure, for the large primitives, whether doing this is a benefit, because storing them inline in the HashTable will make the table larger than storing their hashes... Also, comparing them directly is more expensive than comparing their hashes...
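To make the size part of that trade-off concrete, a small sketch (sizes are printed rather than asserted, since tuple layout depends on target and alignment):

```rust
use arrow::datatypes::i256;
use std::mem::size_of;

// An inline 64-bit value costs the same per entry as a stored 64-bit hash,
// while an inline 256-bit value widens every entry, so probing and rehashing
// touch more memory.
fn main() {
    println!("(group_idx, u64 value or hash): {} bytes", size_of::<(usize, u64)>());
    println!("(group_idx, i256 value):        {} bytes", size_of::<(usize, i256)>());
}
```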
@@ -222,3 +184,61 @@ where
self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared
}
}

pub(crate) fn emit_internal<T: ArrowPrimitiveType, K>(
K is unneeded here; this can be emit_internal<T: ArrowPrimitiveType>.
🤖: Benchmark completed

Thanks, q4 and q15 are the targets, and they indeed got faster!
Force-pushed 313ccfc to cf053cb
/// This specialization is significantly faster than using the more general
/// purpose `Row`s format
I think it would help me a lot if you could document the rationale behind this change and why it is different than GroupValuesPrimitive
-/// This specialization is significantly faster than using the more general
-/// purpose `Row`s format
+/// This specialization is faster than [`GroupValuesPrimitive`] because it does not store values directly
+/// in the hash table, but instead stores an index into self.values
Or something like that
}
}
};
groups.push(group_id)
It can extend into groups here by writing this as an iterator, e.g. the sketch below.
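A minimal, self-contained sketch of that shape (group_id_for is a hypothetical stand-in for the match over Option values shown above, and u64 stands in for T::Native):

```rust
// Build the per-row group ids with a single extend over an iterator instead
// of pushing one id per loop iteration.
fn assign_groups(
    keys: &[Option<u64>],
    mut group_id_for: impl FnMut(Option<u64>) -> usize,
) -> Vec<usize> {
    let mut groups = Vec::with_capacity(keys.len());
    groups.extend(keys.iter().map(|&v| group_id_for(v)));
    groups
}
```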
let g = num_total_groups;
v.insert((g, key));
self.append_row_indices.push(row_index as u32);
num_total_groups += 1;
g
}
}
}
};
groups.push(group_id)
This can also extend into groups, as in the earlier comment.
self.values.extend_from_slice(col.values());
} else {
let col_values = col.values();
for &row_index in self.append_row_indices.iter() {
This can be written as self.values.extend(self.append_row_indices.iter().map(...)); see the sketch below.
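A self-contained sketch of that form (u64 stands in for T::Native; the parameter names follow the context above):

```rust
// Gather the values for the newly appended rows in one extend call instead
// of a manual push loop.
fn gather_values(values: &mut Vec<u64>, col_values: &[u64], append_row_indices: &[u32]) {
    values.extend(
        append_row_indices
            .iter()
            .map(|&row_index| col_values[row_index as usize]),
    );
}
```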
Actually... I found no obvious improvement when switching to extend... The bottleneck is still the hash table; I think it is better to just keep the original push logic, because it is simpler and actually efficient enough.
I think this PR makes things better so approving. Nice work @Rachelint.
Thanks @alamb, I think there are still two blocking things before merging it:
🤖: Benchmark completed
Which issue does this PR close?
Closes GroupValues for primitive and large_primitive #16135

Rationale for this change
What changes are included in this PR?
Are these changes tested?
Are there any user-facing changes?