From 54605526ea3bb1d2afa7f5dface9438ad552f927 Mon Sep 17 00:00:00 2001
From: Vignesh S
Date: Wed, 21 Jan 2026 14:41:57 +0000
Subject: [PATCH 1/3] perf: extend field-major processing to nested struct fields

---
 native/core/src/execution/shuffle/row.rs | 50 ++++++++++++------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/native/core/src/execution/shuffle/row.rs b/native/core/src/execution/shuffle/row.rs
index 821607ddb9..63d36e1a0c 100644
--- a/native/core/src/execution/shuffle/row.rs
+++ b/native/core/src/execution/shuffle/row.rs
@@ -631,33 +631,31 @@ pub(crate) fn append_columns(
             }
         }
         DataType::Struct(fields) => {
-            let struct_builder = builder
-                .as_any_mut()
-                .downcast_mut::<StructBuilder>()
-                .expect("StructBuilder");
+            // 1. Separate validity handling: build the null mask for the struct column.
+            //    The mask is not passed to append_columns yet, but computing it here keeps
+            //    the single pass over the rows required by Issue #3225.
             let mut row = SparkUnsafeRow::new(schema);
-
-            for i in row_start..row_end {
-                let row_addr = unsafe { *row_addresses_ptr.add(i) };
-                let row_size = unsafe { *row_sizes_ptr.add(i) };
-                row.point_to(row_addr, row_size);
-
-                let is_null = row.is_null_at(column_idx);
-
-                let nested_row = if is_null {
-                    // The struct is null.
-                    // Append a null value to the struct builder and field builders.
-                    struct_builder.append_null();
-                    SparkUnsafeRow::default()
-                } else {
-                    struct_builder.append(true);
-                    row.get_struct(column_idx, fields.len())
-                };
-
-                for (idx, field) in fields.into_iter().enumerate() {
-                    append_field(field.data_type(), struct_builder, &nested_row, idx)?;
-                }
-            }
+            let _nested_is_null: Vec<bool> = (row_start..row_end)
+                .map(|i| {
+                    let row_addr = unsafe { *row_addresses_ptr.add(i) };
+                    let row_size = unsafe { *row_sizes_ptr.add(i) };
+                    row.point_to(row_addr, row_size);
+                    row.is_null_at(column_idx)
+                })
+                .collect();
+
+            // 2. RECURSE: Call append_columns with the correct 8 arguments.
+            //    We use the original 'builder' (the Box) instead of the downcasted one.
+            append_columns(
+                row_addresses_ptr,       // 1. *const i64
+                row_sizes_ptr,           // 2. *const i32
+                fields.len(),            // 3. usize (count)
+                row_start,               // 4. usize
+                schema,                  // 5. &Schema
+                row_end,                 // 6. usize
+                builder,                 // 7. &mut Box<dyn ArrayBuilder>
+                prefer_dictionary_ratio, // 8. f64
+            )?;
         }
         _ => {
             unreachable!("Unsupported data type of column: {:?}", dt)

From bfa841516349e1f3625fe834c8f19ed2dce8df6f Mon Sep 17 00:00:00 2001
From: Vignesh S
Date: Wed, 21 Jan 2026 16:41:04 +0000
Subject: [PATCH 2/3] fix: handle parent struct validity and build nested null mask

---
 native/core/src/execution/shuffle/row.rs | 40 +++++++++++++++---------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/native/core/src/execution/shuffle/row.rs b/native/core/src/execution/shuffle/row.rs
index 63d36e1a0c..702e7d41e6 100644
--- a/native/core/src/execution/shuffle/row.rs
+++ b/native/core/src/execution/shuffle/row.rs
@@ -631,30 +631,40 @@ pub(crate) fn append_columns(
             }
         }
         DataType::Struct(fields) => {
-            // 1. Separate validity handling: build the null mask for the struct column.
-            //    The mask is not passed to append_columns yet, but computing it here keeps
-            //    the single pass over the rows required by Issue #3225.
+            let struct_builder = builder
+                .as_any_mut()
+                .downcast_mut::<StructBuilder>()
+                .expect("Should be a StructBuilder");
+
             let mut row = SparkUnsafeRow::new(schema);
-            let _nested_is_null: Vec<bool> = (row_start..row_end)
+            let nested_is_null: Vec<bool> = (row_start..row_end)
                 .map(|i| {
                     let row_addr = unsafe { *row_addresses_ptr.add(i) };
                     let row_size = unsafe { *row_sizes_ptr.add(i) };
                     row.point_to(row_addr, row_size);
-                    row.is_null_at(column_idx)
+
+                    let is_null = row.is_null_at(column_idx);
+
+                    // FIX: Track the validity of the struct itself
+                    if is_null {
+                        struct_builder.append_null();
+                    } else {
+                        struct_builder.append(true);
+                    }
+                    is_null
                 })
                 .collect();
 
-            // 2. RECURSE: Call append_columns with the correct 8 arguments.
-            //    We use the original 'builder' (the Box) instead of the downcasted one.
+            // RECURSE: process the struct's child fields
             append_columns(
-                row_addresses_ptr,       // 1. *const i64
-                row_sizes_ptr,           // 2. *const i32
-                fields.len(),            // 3. usize (count)
-                row_start,               // 4. usize
-                schema,                  // 5. &Schema
-                row_end,                 // 6. usize
-                builder,                 // 7. &mut Box<dyn ArrayBuilder>
-                prefer_dictionary_ratio, // 8. f64
+                row_addresses_ptr,
+                row_sizes_ptr,
+                fields.len(),
+                row_start,
+                schema,
+                row_end,
+                builder,
+                prefer_dictionary_ratio,
             )?;
         }
         _ => {
             unreachable!("Unsupported data type of column: {:?}", dt)

From acfc24f8e6ee8efb95773d4bba65fea196b9f514 Mon Sep 17 00:00:00 2001
From: Vignesh S
Date: Thu, 22 Jan 2026 12:02:17 +0000
Subject: [PATCH 3/3] fix: track parent struct validity and recurse into each struct field

---
 native/core/src/execution/shuffle/row.rs | 31 ++++++++++++++----------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/native/core/src/execution/shuffle/row.rs b/native/core/src/execution/shuffle/row.rs
index 702e7d41e6..93eadd8d93 100644
--- a/native/core/src/execution/shuffle/row.rs
+++ b/native/core/src/execution/shuffle/row.rs
@@ -637,7 +637,10 @@ pub(crate) fn append_columns(
                 .expect("Should be a StructBuilder");
 
             let mut row = SparkUnsafeRow::new(schema);
-            let nested_is_null: Vec<bool> = (row_start..row_end)
+
+            // 1. Calculate validity and record it in the parent struct builder.
+            //    The mask itself is unused afterwards, hence the leading underscore.
+            let _nested_is_null: Vec<bool> = (row_start..row_end)
                 .map(|i| {
                     let row_addr = unsafe { *row_addresses_ptr.add(i) };
                     let row_size = unsafe { *row_sizes_ptr.add(i) };
@@ -645,7 +648,7 @@ pub(crate) fn append_columns(
 
                     let is_null = row.is_null_at(column_idx);
 
-                    // FIX: Track the validity of the struct itself
+                    // Record the parent's null status
                     if is_null {
                         struct_builder.append_null();
                     } else {
@@ -655,17 +658,19 @@ pub(crate) fn append_columns(
                 })
                 .collect();
 
-            // RECURSE: process the struct's child fields
-            append_columns(
-                row_addresses_ptr,
-                row_sizes_ptr,
-                fields.len(),
-                row_start,
-                schema,
-                row_end,
-                builder,
-                prefer_dictionary_ratio,
-            )?;
+            // 2. RECURSE: Iterate through fields to process them in field-major order
+            for (idx, _field) in fields.into_iter().enumerate() {
+                append_columns(
+                    row_addresses_ptr,
+                    row_sizes_ptr,
+                    1,
+                    row_start,
+                    schema,
+                    row_end,
+                    struct_builder.field_builder(idx).unwrap(),
+                    prefer_dictionary_ratio,
+                )?;
+            }
         }
         _ => {
             unreachable!("Unsupported data type of column: {:?}", dt)
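
Note on the approach (not part of the patches above): the series ends with the
parent struct's validity recorded once per row and each child field appended
through its own field builder. Below is a minimal, self-contained sketch of that
field-major pattern written directly against the arrow crate's StructBuilder
rather than Comet's SparkUnsafeRow path. The id/name fields, the
Option<(i32, &str)> input rows, and the arrow dependency version are
illustrative assumptions, not part of the actual shuffle code.

// Assumed dependency (Cargo.toml): arrow = "54"
use arrow::array::{Array, Int32Builder, StringBuilder, StructBuilder};
use arrow::datatypes::{DataType, Field};

fn main() {
    // Hypothetical input: each row is an optional (id, name) struct.
    let rows: Vec<Option<(i32, &str)>> = vec![Some((1, "a")), None, Some((3, "c"))];

    let fields = vec![
        Field::new("id", DataType::Int32, true),
        Field::new("name", DataType::Utf8, true),
    ];
    let mut struct_builder = StructBuilder::from_fields(fields, rows.len());

    // One pass over the rows to capture the parent struct's validity.
    let is_null: Vec<bool> = rows.iter().map(|r| r.is_none()).collect();

    // Field-major fill: each child builder is completed in its own tight loop,
    // appending a null placeholder wherever the parent struct is null.
    {
        let id_builder = struct_builder.field_builder::<Int32Builder>(0).unwrap();
        for row in &rows {
            id_builder.append_option(row.as_ref().map(|(id, _)| *id));
        }
    }
    {
        let name_builder = struct_builder.field_builder::<StringBuilder>(1).unwrap();
        for row in &rows {
            name_builder.append_option(row.as_ref().map(|(_, name)| *name));
        }
    }

    // Apply the recorded validity to the parent builder, one entry per row.
    for null in &is_null {
        if *null {
            struct_builder.append_null();
        } else {
            struct_builder.append(true);
        }
    }

    let array = struct_builder.finish();
    assert_eq!(array.len(), 3);
    assert!(array.is_null(1));
}

Filling one child column at a time keeps each inner loop working on a single
concrete builder type, which appears to be the locality benefit the field-major
layout in the patches is aiming for.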