diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 87344914d2f7e..4860393cab2b7 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -2248,7 +2248,7 @@ impl TableOptions { /// Options that control how Parquet files are read, including global options /// that apply to all columns and optional column-specific overrides /// -/// Closely tied to [`ParquetWriterOptions`](crate::file_options::parquet_writer::ParquetWriterOptions). +/// Closely tied to `ParquetWriterOptions` (see `crate::file_options::parquet_writer::ParquetWriterOptions` when the "parquet" feature is enabled). /// Properties not included in [`TableParquetOptions`] may not be configurable at the external API /// (e.g. sorting_columns). #[derive(Clone, Default, Debug, PartialEq)] diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index 086d96e85230d..f3f45cfa44e9e 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -19,9 +19,9 @@ use crate::error::{_plan_err, Result}; use arrow::{ array::{Array, ArrayRef, StructArray, new_null_array}, compute::{CastOptions, cast_with_options}, - datatypes::{DataType::Struct, Field, FieldRef}, + datatypes::{DataType, DataType::Struct, Field, FieldRef}, }; -use std::sync::Arc; +use std::{collections::HashSet, sync::Arc}; /// Cast a struct column to match target struct fields, handling nested structs recursively. 
/// @@ -31,6 +31,7 @@ use std::sync::Arc; /// /// ## Field Matching Strategy /// - **By Name**: Source struct fields are matched to target fields by name (case-sensitive) +/// - **By Position**: When there is no name overlap and the field counts match, fields are cast by index /// - **Type Adaptation**: When a matching field is found, it is recursively cast to the target field's type /// - **Missing Fields**: Target fields not present in the source are filled with null values /// - **Extra Fields**: Source fields not present in the target are ignored @@ -54,16 +55,38 @@ fn cast_struct_column( target_fields: &[Arc], cast_options: &CastOptions, ) -> Result { + if source_col.data_type() == &DataType::Null + || (!source_col.is_empty() && source_col.null_count() == source_col.len()) + { + return Ok(new_null_array( + &Struct(target_fields.to_vec().into()), + source_col.len(), + )); + } + if let Some(source_struct) = source_col.as_any().downcast_ref::() { - validate_struct_compatibility(source_struct.fields(), target_fields)?; + let source_fields = source_struct.fields(); + validate_struct_compatibility(source_fields, target_fields)?; + let has_overlap = has_one_of_more_common_fields(source_fields, target_fields); let mut fields: Vec> = Vec::with_capacity(target_fields.len()); let mut arrays: Vec = Vec::with_capacity(target_fields.len()); let num_rows = source_col.len(); - for target_child_field in target_fields { + // Iterate target fields and pick source child either by name (when fields overlap) + // or by position (when there is no name overlap). + for (index, target_child_field) in target_fields.iter().enumerate() { fields.push(Arc::clone(target_child_field)); - match source_struct.column_by_name(target_child_field.name()) { + + // Determine the source child column: by name when overlapping names exist, + // otherwise by position. 
+ let source_child_opt: Option<&ArrayRef> = if has_overlap { + source_struct.column_by_name(target_child_field.name()) + } else { + Some(source_struct.column(index)) + }; + + match source_child_opt { Some(source_child_col) => { let adapted_child = cast_column(source_child_col, target_child_field, cast_options) @@ -200,10 +223,29 @@ pub fn cast_column( /// // Target: {a: binary} /// // Result: Err(...) - string cannot cast to binary /// ``` +/// pub fn validate_struct_compatibility( source_fields: &[FieldRef], target_fields: &[FieldRef], ) -> Result<()> { + let has_overlap = has_one_of_more_common_fields(source_fields, target_fields); + if !has_overlap { + if source_fields.len() != target_fields.len() { + return _plan_err!( + "Cannot cast struct with {} fields to {} fields without name overlap; positional mapping is ambiguous", + source_fields.len(), + target_fields.len() + ); + } + + for (source_field, target_field) in source_fields.iter().zip(target_fields.iter()) + { + validate_field_compatibility(source_field, target_field)?; + } + + return Ok(()); + } + // Check compatibility for each target field for target_field in target_fields { // Look for matching field in source by name @@ -211,53 +253,98 @@ pub fn validate_struct_compatibility( .iter() .find(|f| f.name() == target_field.name()) { - // Ensure nullability is compatible. It is invalid to cast a nullable - // source field to a non-nullable target field as this may discard - // null values. - if source_field.is_nullable() && !target_field.is_nullable() { + validate_field_compatibility(source_field, target_field)?; + } else { + // Target field is missing from source + // If it's non-nullable, we cannot fill it with NULL + if !target_field.is_nullable() { return _plan_err!( - "Cannot cast nullable struct field '{}' to non-nullable field", + "Cannot cast struct: target field '{}' is non-nullable but missing from source. 
\ + Cannot fill with NULL.", target_field.name() ); } - // Check if the matching field types are compatible - match (source_field.data_type(), target_field.data_type()) { - // Recursively validate nested structs - (Struct(source_nested), Struct(target_nested)) => { - validate_struct_compatibility(source_nested, target_nested)?; - } - // For non-struct types, use the existing castability check - _ => { - if !arrow::compute::can_cast_types( - source_field.data_type(), - target_field.data_type(), - ) { - return _plan_err!( - "Cannot cast struct field '{}' from type {} to type {}", - target_field.name(), - source_field.data_type(), - target_field.data_type() - ); - } - } - } } - // Missing fields in source are OK - they'll be filled with nulls } // Extra fields in source are OK - they'll be ignored Ok(()) } +fn validate_field_compatibility( + source_field: &Field, + target_field: &Field, +) -> Result<()> { + if source_field.data_type() == &DataType::Null { + // Validate that target allows nulls before returning early. + // It is invalid to cast a NULL source field to a non-nullable target field. + if !target_field.is_nullable() { + return _plan_err!( + "Cannot cast NULL struct field '{}' to non-nullable field '{}'", + source_field.name(), + target_field.name() + ); + } + return Ok(()); + } + + // Ensure nullability is compatible. It is invalid to cast a nullable + // source field to a non-nullable target field as this may discard + // null values. 
+ if source_field.is_nullable() && !target_field.is_nullable() { + return _plan_err!( + "Cannot cast nullable struct field '{}' to non-nullable field", + target_field.name() + ); + } + + // Check if the matching field types are compatible + match (source_field.data_type(), target_field.data_type()) { + // Recursively validate nested structs + (Struct(source_nested), Struct(target_nested)) => { + validate_struct_compatibility(source_nested, target_nested)?; + } + // For non-struct types, use the existing castability check + _ => { + if !arrow::compute::can_cast_types( + source_field.data_type(), + target_field.data_type(), + ) { + return _plan_err!( + "Cannot cast struct field '{}' from type {} to type {}", + target_field.name(), + source_field.data_type(), + target_field.data_type() + ); + } + } + } + + Ok(()) +} + +fn has_one_of_more_common_fields( + source_fields: &[FieldRef], + target_fields: &[FieldRef], +) -> bool { + let source_names: HashSet<&str> = source_fields + .iter() + .map(|field| field.name().as_str()) + .collect(); + target_fields + .iter() + .any(|field| source_names.contains(field.name().as_str())) +} + #[cfg(test)] mod tests { use super::*; - use crate::format::DEFAULT_CAST_OPTIONS; + use crate::{assert_contains, format::DEFAULT_CAST_OPTIONS}; use arrow::{ array::{ BinaryArray, Int32Array, Int32Builder, Int64Array, ListArray, MapArray, - MapBuilder, StringArray, StringBuilder, + MapBuilder, NullArray, StringArray, StringBuilder, }, buffer::NullBuffer, datatypes::{DataType, Field, FieldRef, Int32Type}, @@ -428,11 +515,14 @@ mod tests { #[test] fn test_validate_struct_compatibility_missing_field_in_source() { - // Source struct: {field2: String} (missing field1) - let source_fields = vec![arc_field("field2", DataType::Utf8)]; + // Source struct: {field1: Int32} (missing field2) + let source_fields = vec![arc_field("field1", DataType::Int32)]; - // Target struct: {field1: Int32} - let target_fields = vec![arc_field("field1", DataType::Int32)]; + // 
Target struct: {field1: Int32, field2: Utf8} + let target_fields = vec![ + arc_field("field1", DataType::Int32), + arc_field("field2", DataType::Utf8), + ]; // Should be OK - missing fields will be filled with nulls let result = validate_struct_compatibility(&source_fields, &target_fields); @@ -455,6 +545,20 @@ mod tests { assert!(result.is_ok()); } + #[test] + fn test_validate_struct_compatibility_positional_no_overlap_mismatch_len() { + let source_fields = vec![ + arc_field("left", DataType::Int32), + arc_field("right", DataType::Int32), + ]; + let target_fields = vec![arc_field("alpha", DataType::Int32)]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("positional mapping is ambiguous")); + } + #[test] fn test_cast_struct_parent_nulls_retained() { let a_array = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef; @@ -525,6 +629,117 @@ mod tests { assert!(error_msg.contains("non-nullable")); } + #[test] + fn test_validate_struct_compatibility_by_name() { + // Source struct: {field1: Int32, field2: String} + let source_fields = vec![ + arc_field("field1", DataType::Int32), + arc_field("field2", DataType::Utf8), + ]; + + // Target struct: {field2: String, field1: Int64} + let target_fields = vec![ + arc_field("field2", DataType::Utf8), + arc_field("field1", DataType::Int64), + ]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_ok()); + } + + #[test] + fn test_validate_struct_compatibility_by_name_with_type_mismatch() { + // Source struct: {field1: Binary} + let source_fields = vec![arc_field("field1", DataType::Binary)]; + + // Target struct: {field1: Int32} (incompatible type) + let target_fields = vec![arc_field("field1", DataType::Int32)]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_err()); + let error_msg = 
result.unwrap_err().to_string(); + assert_contains!( + error_msg, + "Cannot cast struct field 'field1' from type Binary to type Int32" + ); + } + + #[test] + fn test_validate_struct_compatibility_positional_with_type_mismatch() { + // Source struct: {left: Struct} - nested struct + let source_fields = + vec![arc_struct_field("left", vec![field("x", DataType::Int32)])]; + + // Target struct: {alpha: Int32} (no name overlap, incompatible type at position 0) + let target_fields = vec![arc_field("alpha", DataType::Int32)]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert_contains!( + error_msg, + "Cannot cast struct field 'alpha' from type Struct(\"x\": Int32) to type Int32" + ); + } + + #[test] + fn test_validate_struct_compatibility_mixed_name_overlap() { + // Source struct: {a: Int32, b: String, extra: Boolean} + let source_fields = vec![ + arc_field("a", DataType::Int32), + arc_field("b", DataType::Utf8), + arc_field("extra", DataType::Boolean), + ]; + + // Target struct: {b: String, a: Int64, c: Float32} + // Name overlap with a and b, missing c (nullable) + let target_fields = vec![ + arc_field("b", DataType::Utf8), + arc_field("a", DataType::Int64), + arc_field("c", DataType::Float32), + ]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_ok()); + } + + #[test] + fn test_validate_struct_compatibility_by_name_missing_required_field() { + // Source struct: {field1: Int32} (missing field2) + let source_fields = vec![arc_field("field1", DataType::Int32)]; + + // Target struct: {field1: Int32, field2: Int32 non-nullable} + let target_fields = vec![ + arc_field("field1", DataType::Int32), + Arc::new(non_null_field("field2", DataType::Int32)), + ]; + + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_err()); + let error_msg = 
result.unwrap_err().to_string(); + assert_contains!( + error_msg, + "Cannot cast struct: target field 'field2' is non-nullable but missing from source. Cannot fill with NULL." + ); + } + + #[test] + fn test_validate_struct_compatibility_partial_name_overlap_with_count_mismatch() { + // Source struct: {a: Int32} (only one field) + let source_fields = vec![arc_field("a", DataType::Int32)]; + + // Target struct: {a: Int32, b: String} (two fields, but 'a' overlaps) + let target_fields = vec![ + arc_field("a", DataType::Int32), + arc_field("b", DataType::Utf8), + ]; + + // This should succeed - partial overlap means by-name mapping + // and missing field 'b' is nullable + let result = validate_struct_compatibility(&source_fields, &target_fields); + assert!(result.is_ok()); + } + #[test] fn test_cast_nested_struct_with_extra_and_missing_fields() { // Source inner struct has fields a, b, extra @@ -585,6 +800,33 @@ mod tests { assert!(missing.is_null(1)); } + #[test] + fn test_cast_null_struct_field_to_nested_struct() { + let null_inner = Arc::new(NullArray::new(2)) as ArrayRef; + let source_struct = StructArray::from(vec![( + arc_field("inner", DataType::Null), + Arc::clone(&null_inner), + )]); + let source_col = Arc::new(source_struct) as ArrayRef; + + let target_field = struct_field( + "outer", + vec![struct_field("inner", vec![field("a", DataType::Int32)])], + ); + + let result = + cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap(); + let outer = result.as_any().downcast_ref::().unwrap(); + let inner = get_column_as!(&outer, "inner", StructArray); + assert_eq!(inner.len(), 2); + assert!(inner.is_null(0)); + assert!(inner.is_null(1)); + + let inner_a = get_column_as!(inner, "a", Int32Array); + assert!(inner_a.is_null(0)); + assert!(inner_a.is_null(1)); + } + #[test] fn test_cast_struct_with_array_and_map_fields() { // Array field with second row null @@ -704,4 +946,88 @@ mod tests { assert_eq!(a_col.value(0), 1); assert_eq!(a_col.value(1), 2); } + + 
#[test] + fn test_cast_struct_positional_when_no_overlap() { + let first = Arc::new(Int32Array::from(vec![Some(10), Some(20)])) as ArrayRef; + let second = + Arc::new(StringArray::from(vec![Some("alpha"), Some("beta")])) as ArrayRef; + + let source_struct = StructArray::from(vec![ + (arc_field("left", DataType::Int32), first), + (arc_field("right", DataType::Utf8), second), + ]); + let source_col = Arc::new(source_struct) as ArrayRef; + + let target_field = struct_field( + "s", + vec![field("a", DataType::Int64), field("b", DataType::Utf8)], + ); + + let result = + cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap(); + let struct_array = result.as_any().downcast_ref::().unwrap(); + + let a_col = get_column_as!(&struct_array, "a", Int64Array); + assert_eq!(a_col.value(0), 10); + assert_eq!(a_col.value(1), 20); + + let b_col = get_column_as!(&struct_array, "b", StringArray); + assert_eq!(b_col.value(0), "alpha"); + assert_eq!(b_col.value(1), "beta"); + } + + #[test] + fn test_cast_struct_missing_non_nullable_field_fails() { + // Source has only field 'a' + let a = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef; + let source_struct = StructArray::from(vec![(arc_field("a", DataType::Int32), a)]); + let source_col = Arc::new(source_struct) as ArrayRef; + + // Target has fields 'a' (nullable) and 'b' (non-nullable) + let target_field = struct_field( + "s", + vec![ + field("a", DataType::Int32), + non_null_field("b", DataType::Int32), + ], + ); + + // Should fail because 'b' is non-nullable but missing from source + let result = cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string() + .contains("target field 'b' is non-nullable but missing from source"), + "Unexpected error: {err}" + ); + } + + #[test] + fn test_cast_struct_missing_nullable_field_succeeds() { + // Source has only field 'a' + let a = Arc::new(Int32Array::from(vec![Some(1), 
Some(2)])) as ArrayRef; + let source_struct = StructArray::from(vec![(arc_field("a", DataType::Int32), a)]); + let source_col = Arc::new(source_struct) as ArrayRef; + + // Target has fields 'a' and 'b' (both nullable) + let target_field = struct_field( + "s", + vec![field("a", DataType::Int32), field("b", DataType::Int32)], + ); + + // Should succeed - 'b' is nullable so can be filled with NULL + let result = + cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap(); + let struct_array = result.as_any().downcast_ref::().unwrap(); + + let a_col = get_column_as!(&struct_array, "a", Int32Array); + assert_eq!(a_col.value(0), 1); + assert_eq!(a_col.value(1), 2); + + let b_col = get_column_as!(&struct_array, "b", Int32Array); + assert!(b_col.is_null(0)); + assert!(b_col.is_null(1)); + } } diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index eda4952cf590b..e7d512f3337cb 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -3704,7 +3704,23 @@ impl ScalarValue { } let scalar_array = self.to_array()?; - let cast_arr = cast_with_options(&scalar_array, target_type, cast_options)?; + + // For struct types, use name-based casting logic that matches fields by name + // and recursively casts nested structs. The field name wrapper is arbitrary + // since cast_column only uses the DataType::Struct field definitions inside. + let cast_arr = match target_type { + DataType::Struct(_) => { + // Field name is unused; only the struct's inner field names matter + let target_field = Field::new("_", target_type.clone(), true); + crate::nested_struct::cast_column( + &scalar_array, + &target_field, + cast_options, + )? 
+ } + _ => cast_with_options(&scalar_array, target_type, cast_options)?, + }; + ScalarValue::try_from_array(&cast_arr, 0) } diff --git a/datafusion/expr-common/src/columnar_value.rs b/datafusion/expr-common/src/columnar_value.rs index 99c21d4abdb6e..1aa42470a1481 100644 --- a/datafusion/expr-common/src/columnar_value.rs +++ b/datafusion/expr-common/src/columnar_value.rs @@ -20,7 +20,7 @@ use arrow::{ array::{Array, ArrayRef, Date32Array, Date64Array, NullArray}, compute::{CastOptions, kernels, max, min}, - datatypes::DataType, + datatypes::{DataType, Field}, util::pretty::pretty_format_columns, }; use datafusion_common::internal_datafusion_err; @@ -274,7 +274,17 @@ impl ColumnarValue { Ok(args) } - /// Cast's this [ColumnarValue] to the specified `DataType` + /// Cast this [ColumnarValue] to the specified `DataType` + /// + /// # Struct Casting Behavior + /// + /// When casting struct types, fields are matched **by name** rather than position: + /// - Source fields are matched to target fields using case-sensitive name comparison + /// - Fields are reordered to match the target schema + /// - Missing target fields are filled with null arrays + /// - Extra source fields are ignored + /// + /// For structs whose field names do not overlap at all but whose field counts match, fields are matched by position instead; for non-struct types, Arrow's standard positional casting is used. 
pub fn cast_to( &self, cast_type: &DataType, @@ -283,12 +293,8 @@ impl ColumnarValue { let cast_options = cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS); match self { ColumnarValue::Array(array) => { - ensure_date_array_timestamp_bounds(array, cast_type)?; - Ok(ColumnarValue::Array(kernels::cast::cast_with_options( - array, - cast_type, - &cast_options, - )?)) + let casted = cast_array_by_name(array, cast_type, &cast_options)?; + Ok(ColumnarValue::Array(casted)) } ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar( scalar.cast_to_with_options(cast_type, &cast_options)?, @@ -297,6 +303,37 @@ impl ColumnarValue { } } +fn cast_array_by_name( + array: &ArrayRef, + cast_type: &DataType, + cast_options: &CastOptions<'static>, +) -> Result { + // If types are already equal, no cast needed + if array.data_type() == cast_type { + return Ok(Arc::clone(array)); + } + + match cast_type { + DataType::Struct(_) => { + // Field name is unused; only the struct's inner field names matter + let target_field = Field::new("_", cast_type.clone(), true); + datafusion_common::nested_struct::cast_column( + array, + &target_field, + cast_options, + ) + } + _ => { + ensure_date_array_timestamp_bounds(array, cast_type)?; + Ok(kernels::cast::cast_with_options( + array, + cast_type, + cast_options, + )?) 
+ } + } +} + fn ensure_date_array_timestamp_bounds( array: &ArrayRef, cast_type: &DataType, @@ -378,8 +415,8 @@ impl fmt::Display for ColumnarValue { mod tests { use super::*; use arrow::{ - array::{Date64Array, Int32Array}, - datatypes::TimeUnit, + array::{Date64Array, Int32Array, StructArray}, + datatypes::{Field, Fields, TimeUnit}, }; #[test] @@ -553,6 +590,102 @@ mod tests { ); } + #[test] + fn cast_struct_by_field_name() { + let source_fields = Fields::from(vec![ + Field::new("b", DataType::Int32, true), + Field::new("a", DataType::Int32, true), + ]); + + let target_fields = Fields::from(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Int32, true), + ]); + + let struct_array = StructArray::new( + source_fields, + vec![ + Arc::new(Int32Array::from(vec![Some(3)])), + Arc::new(Int32Array::from(vec![Some(4)])), + ], + None, + ); + + let value = ColumnarValue::Array(Arc::new(struct_array)); + let casted = value + .cast_to(&DataType::Struct(target_fields.clone()), None) + .expect("struct cast should succeed"); + + let ColumnarValue::Array(arr) = casted else { + panic!("expected array after cast"); + }; + + let struct_array = arr + .as_any() + .downcast_ref::() + .expect("expected StructArray"); + + let field_a = struct_array + .column_by_name("a") + .expect("expected field a in cast result"); + let field_b = struct_array + .column_by_name("b") + .expect("expected field b in cast result"); + + assert_eq!( + field_a + .as_any() + .downcast_ref::() + .expect("expected Int32 array") + .value(0), + 4 + ); + assert_eq!( + field_b + .as_any() + .downcast_ref::() + .expect("expected Int32 array") + .value(0), + 3 + ); + } + + #[test] + fn cast_struct_missing_field_inserts_nulls() { + let source_fields = Fields::from(vec![Field::new("a", DataType::Int32, true)]); + + let target_fields = Fields::from(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Int32, true), + ]); + + let struct_array = StructArray::new( + 
source_fields, + vec![Arc::new(Int32Array::from(vec![Some(5)]))], + None, + ); + + let value = ColumnarValue::Array(Arc::new(struct_array)); + let casted = value + .cast_to(&DataType::Struct(target_fields.clone()), None) + .expect("struct cast should succeed"); + + let ColumnarValue::Array(arr) = casted else { + panic!("expected array after cast"); + }; + + let struct_array = arr + .as_any() + .downcast_ref::() + .expect("expected StructArray"); + + let field_b = struct_array + .column_by_name("b") + .expect("expected missing field to be added"); + + assert!(field_b.is_null(0)); + } + #[test] fn cast_date64_array_to_timestamp_overflow() { let overflow_value = i64::MAX / 1_000_000 + 1; diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index c9b39eacefc6a..427ebc59807ef 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -17,6 +17,7 @@ //! Coercion rules for matching argument types for binary operators +use std::collections::HashMap; use std::collections::HashSet; use std::sync::Arc; @@ -1236,30 +1237,123 @@ fn coerce_numeric_type_to_decimal256(numeric_type: &DataType) -> Option Option { use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { (Struct(lhs_fields), Struct(rhs_fields)) => { + // Field count must match for coercion if lhs_fields.len() != rhs_fields.len() { return None; } - let coerced_types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()) - .map(|(lhs, rhs)| comparison_coercion(lhs.data_type(), rhs.data_type())) - .collect::>>()?; - - // preserve the field name and nullability - let orig_fields = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()); + // If the two structs have exactly the same set of field names (possibly in + // different order), prefer name-based coercion. Otherwise fall back to + // positional coercion which preserves backward compatibility. 
+ // + // Name-based coercion is used in: + // 1. Array construction: [s1, s2] where s1 and s2 have reordered fields + // 2. UNION operations: different field orders unified by name + // 3. VALUES clauses: heterogeneous struct rows unified by field name + // 4. JOIN conditions: structs with matching field names + // 5. Window functions: partitions/orders by struct fields + // 6. Aggregate functions: collecting structs with reordered fields + // + // See docs/source/user-guide/sql/struct_coercion.md for detailed examples. + if fields_have_same_names(lhs_fields, rhs_fields) { + return coerce_struct_by_name(lhs_fields, rhs_fields); + } - let fields: Vec = coerced_types - .into_iter() - .zip(orig_fields) - .map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs, rhs)) - .collect(); - Some(Struct(fields.into())) + coerce_struct_by_position(lhs_fields, rhs_fields) } _ => None, } } +/// Return true if every left-field name exists in the right fields. The caller must ensure the two field lists have equal length; together with unique field names, that makes the two name sets identical. +/// +/// # Assumptions +/// **This function assumes field names within each struct are unique.** This assumption is safe +/// because field name uniqueness is enforced at multiple levels: +/// - **Arrow level:** `StructType` construction enforces unique field names at the schema level +/// - **DataFusion level:** SQL parser rejects duplicate field names in `CREATE TABLE` and struct type definitions +/// - **Runtime level:** `StructArray::try_new()` validates field uniqueness +/// +/// Therefore, we don't need to handle degenerate cases like: +/// - `struct -> struct` (target has duplicate field names) +/// - `struct -> struct` (source has duplicate field names) +fn fields_have_same_names(lhs_fields: &Fields, rhs_fields: &Fields) -> bool { + // Debug assertions: field names should be unique within each struct + #[cfg(debug_assertions)] + { + let lhs_names: HashSet<_> = lhs_fields.iter().map(|f| f.name()).collect(); + assert_eq!( + lhs_names.len(), + lhs_fields.len(), + "Struct has duplicate field 
names (should be caught by Arrow schema validation)" + ); + + let rhs_names_check: HashSet<_> = rhs_fields.iter().map(|f| f.name()).collect(); + assert_eq!( + rhs_names_check.len(), + rhs_fields.len(), + "Struct has duplicate field names (should be caught by Arrow schema validation)" + ); + } + + let rhs_names: HashSet<&str> = rhs_fields.iter().map(|f| f.name().as_str()).collect(); + lhs_fields + .iter() + .all(|lf| rhs_names.contains(lf.name().as_str())) +} + +/// Coerce two structs by matching fields by name. Assumes the name-sets match. +fn coerce_struct_by_name(lhs_fields: &Fields, rhs_fields: &Fields) -> Option { + use arrow::datatypes::DataType::*; + + let rhs_by_name: HashMap<&str, &FieldRef> = + rhs_fields.iter().map(|f| (f.name().as_str(), f)).collect(); + + let mut coerced: Vec = Vec::with_capacity(lhs_fields.len()); + + for lhs in lhs_fields.iter() { + let rhs = rhs_by_name.get(lhs.name().as_str()).unwrap(); // safe: caller ensured names match + let coerced_type = comparison_coercion(lhs.data_type(), rhs.data_type())?; + let is_nullable = lhs.is_nullable() || rhs.is_nullable(); + coerced.push(Arc::new(Field::new( + lhs.name().clone(), + coerced_type, + is_nullable, + ))); + } + + Some(Struct(coerced.into())) +} + +/// Coerce two structs positionally (left-to-right). This preserves field names from +/// the left struct and uses the combined nullability. +fn coerce_struct_by_position( + lhs_fields: &Fields, + rhs_fields: &Fields, +) -> Option { + use arrow::datatypes::DataType::*; + + // First coerce individual types; fail early if any pair cannot be coerced. + let coerced_types: Vec = lhs_fields + .iter() + .zip(rhs_fields.iter()) + .map(|(l, r)| comparison_coercion(l.data_type(), r.data_type())) + .collect::>>()?; + + // Build final fields preserving left-side names and combined nullability. 
+ let orig_pairs = lhs_fields.iter().zip(rhs_fields.iter()); + let fields: Vec = coerced_types + .into_iter() + .zip(orig_pairs) + .map(|(datatype, (lhs, rhs))| coerce_fields(datatype, lhs, rhs)) + .collect(); + + Some(Struct(fields.into())) +} + /// returns the result of coercing two fields to a common type fn coerce_fields(common_type: DataType, lhs: &FieldRef, rhs: &FieldRef) -> FieldRef { let is_nullable = lhs.is_nullable() || rhs.is_nullable(); diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 854e907d68b1a..c0c043901c89c 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -659,7 +659,16 @@ impl ExprSchemable for Expr { // like all of the binary expressions below. Perhaps Expr should track the // type of the expression? - if can_cast_types(&this_type, cast_to_type) { + // Special handling for struct-to-struct casts with name-based field matching + let can_cast = match (&this_type, cast_to_type) { + (DataType::Struct(_), DataType::Struct(_)) => { + // Always allow struct-to-struct casts; field matching happens at runtime + true + } + _ => can_cast_types(&this_type, cast_to_type), + }; + + if can_cast { match self { Expr::ScalarSubquery(subquery) => { Ok(Expr::ScalarSubquery(cast_subquery(subquery, cast_to_type)?)) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 6d62fbc38574d..3dee992cb8062 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -18,7 +18,7 @@ //! 
Expression simplification API use arrow::{ - array::{AsArray, new_null_array}, + array::{Array, AsArray, new_null_array}, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; @@ -38,8 +38,8 @@ use datafusion_common::{ tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter}, }; use datafusion_expr::{ - BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, - binary::BinaryTypeCoercer, lit, or, + BinaryExpr, Case, ColumnarValue, Expr, ExprSchemable, Like, Operator, Volatility, + and, binary::BinaryTypeCoercer, lit, or, }; use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult}; use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; @@ -641,6 +641,30 @@ impl ConstEvaluator { Expr::ScalarFunction(ScalarFunction { func, .. }) => { Self::volatility_ok(func.signature().volatility) } + Expr::Cast(Cast { expr, data_type }) + | Expr::TryCast(TryCast { expr, data_type }) => { + if let ( + Ok(DataType::Struct(source_fields)), + DataType::Struct(target_fields), + ) = (expr.get_type(&DFSchema::empty()), data_type) + { + // Don't const-fold struct casts with different field counts + if source_fields.len() != target_fields.len() { + return false; + } + + // Don't const-fold struct casts with empty (0-row) literals + // The simplifier uses a 1-row input batch, which causes dimension mismatches + // when evaluating 0-row struct literals + if let Expr::Literal(ScalarValue::Struct(struct_array), _) = + expr.as_ref() + && struct_array.len() == 0 + { + return false; + } + } + true + } Expr::Literal(_, _) | Expr::Alias(..) | Expr::Unnest(_) @@ -659,8 +683,6 @@ impl ConstEvaluator { | Expr::Like { .. } | Expr::SimilarTo { .. } | Expr::Case(_) - | Expr::Cast { .. } - | Expr::TryCast { .. } | Expr::InList { .. 
} => true, } } @@ -2166,7 +2188,10 @@ mod tests { use super::*; use crate::simplify_expressions::SimplifyContext; use crate::test::test_table_scan_with_name; - use arrow::datatypes::FieldRef; + use arrow::{ + array::{Int32Array, StructArray}, + datatypes::{FieldRef, Fields}, + }; use datafusion_common::{DFSchemaRef, ToDFSchema, assert_contains}; use datafusion_expr::{ expr::WindowFunction, @@ -5032,4 +5057,159 @@ mod tests { else_expr: None, }) } + + // -------------------------------- + // --- Struct Cast Tests ----- + // -------------------------------- + + /// Helper to create a `Struct` literal cast expression from `source_fields` and `target_fields`. + fn make_struct_cast_expr(source_fields: Fields, target_fields: Fields) -> Expr { + // Create 1-row struct array (not 0-row) so it can be evaluated by simplifier + let arrays: Vec> = vec![ + Arc::new(Int32Array::from(vec![Some(1)])), + Arc::new(Int32Array::from(vec![Some(2)])), + ]; + let struct_array = StructArray::try_new(source_fields, arrays, None).unwrap(); + + Expr::Cast(Cast::new( + Box::new(Expr::Literal( + ScalarValue::Struct(Arc::new(struct_array)), + None, + )), + DataType::Struct(target_fields), + )) + } + + #[test] + fn test_struct_cast_different_field_counts_not_foldable() { + // Test that struct casts with different field counts are NOT marked as foldable + // When field counts differ, const-folding should not be attempted + + let source_fields = Fields::from(vec![ + Arc::new(Field::new("a", DataType::Int32, true)), + Arc::new(Field::new("b", DataType::Int32, true)), + ]); + + let target_fields = Fields::from(vec![ + Arc::new(Field::new("x", DataType::Int32, true)), + Arc::new(Field::new("y", DataType::Int32, true)), + Arc::new(Field::new("z", DataType::Int32, true)), + ]); + + let expr = make_struct_cast_expr(source_fields, target_fields); + + let simplifier = + ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema())); + + // The cast should remain unchanged since field counts 
differ + let result = simplifier.simplify(expr.clone()).unwrap(); + // Ensure const-folding was not attempted (the expression remains exactly the same) + assert_eq!( + result, expr, + "Struct cast with different field counts should remain unchanged (no const-folding)" + ); + } + + #[test] + fn test_struct_cast_same_field_count_foldable() { + // Test that struct casts with same field counts can be considered for const-folding + + let source_fields = Fields::from(vec![ + Arc::new(Field::new("a", DataType::Int32, true)), + Arc::new(Field::new("b", DataType::Int32, true)), + ]); + + let target_fields = Fields::from(vec![ + Arc::new(Field::new("a", DataType::Int32, true)), + Arc::new(Field::new("b", DataType::Int32, true)), + ]); + + let expr = make_struct_cast_expr(source_fields, target_fields); + + let simplifier = + ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema())); + + // The cast should be simplified + let result = simplifier.simplify(expr.clone()).unwrap(); + // Struct casts with same field count should be const-folded to a literal + assert!(matches!(result, Expr::Literal(_, _))); + // Ensure the simplifier made a change (not identical to original) + assert_ne!( + result, expr, + "Struct cast with same field count should be simplified (not identical to input)" + ); + } + + #[test] + fn test_struct_cast_different_names_same_count() { + // Test struct cast with same field count but different names + // Field count matches; simplification should succeed + + let source_fields = Fields::from(vec![ + Arc::new(Field::new("a", DataType::Int32, true)), + Arc::new(Field::new("b", DataType::Int32, true)), + ]); + + let target_fields = Fields::from(vec![ + Arc::new(Field::new("x", DataType::Int32, true)), + Arc::new(Field::new("y", DataType::Int32, true)), + ]); + + let expr = make_struct_cast_expr(source_fields, target_fields); + + let simplifier = + ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema())); + + // The cast should be 
simplified since field counts match + let result = simplifier.simplify(expr.clone()).unwrap(); + // Struct casts with same field count are const-folded to literals + assert!(matches!(result, Expr::Literal(_, _))); + // Ensure the simplifier made a change (not identical to original) + assert_ne!( + result, expr, + "Struct cast with different names but same field count should be simplified" + ); + } + + #[test] + fn test_struct_cast_empty_array_not_foldable() { + // Test that struct casts with 0-row (empty) struct arrays are NOT const-folded + // The simplifier uses a 1-row input batch, which causes dimension mismatches + // when evaluating 0-row struct literals + + let source_fields = Fields::from(vec![ + Arc::new(Field::new("a", DataType::Int32, true)), + Arc::new(Field::new("b", DataType::Int32, true)), + ]); + + let target_fields = Fields::from(vec![ + Arc::new(Field::new("a", DataType::Int32, true)), + Arc::new(Field::new("b", DataType::Int32, true)), + ]); + + // Create a 0-row (empty) struct array + let arrays: Vec> = vec![ + Arc::new(Int32Array::new(vec![].into(), None)), + Arc::new(Int32Array::new(vec![].into(), None)), + ]; + let struct_array = StructArray::try_new(source_fields, arrays, None).unwrap(); + + let expr = Expr::Cast(Cast::new( + Box::new(Expr::Literal( + ScalarValue::Struct(Arc::new(struct_array)), + None, + )), + DataType::Struct(target_fields), + )); + + let simplifier = + ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema())); + + // The cast should remain unchanged since the struct array is empty (0-row) + let result = simplifier.simplify(expr.clone()).unwrap(); + assert_eq!( + result, expr, + "Struct cast with empty (0-row) array should remain unchanged" + ); + } } diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index bd5c63a69979f..f679a9587ca90 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ 
b/datafusion/physical-expr/src/expressions/cast.rs @@ -26,6 +26,7 @@ use arrow::compute::{CastOptions, can_cast_types}; use arrow::datatypes::{DataType, DataType::*, FieldRef, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; +use datafusion_common::nested_struct::validate_struct_compatibility; use datafusion_common::{Result, not_impl_err}; use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_expr_common::interval_arithmetic::Interval; @@ -41,6 +42,22 @@ const DEFAULT_SAFE_CAST_OPTIONS: CastOptions<'static> = CastOptions { format_options: DEFAULT_FORMAT_OPTIONS, }; +/// Check if struct-to-struct casting is allowed by validating field compatibility. +/// +/// This function applies the same validation rules as execution time to ensure +/// planning-time validation matches runtime validation, enabling fail-fast behavior +/// instead of deferring errors to execution. +fn can_cast_struct_types(source: &DataType, target: &DataType) -> bool { + match (source, target) { + (Struct(source_fields), Struct(target_fields)) => { + // Apply the same struct compatibility rules as at execution time. + // This ensures planning-time validation matches execution-time validation. + validate_struct_compatibility(source_fields, target_fields).is_ok() + } + _ => false, + } +} + /// CAST expression casts an expression to a specific data type and returns a runtime error on invalid cast #[derive(Debug, Clone, Eq)] pub struct CastExpr { @@ -237,6 +254,12 @@ pub fn cast_with_options( Ok(Arc::clone(&expr)) } else if can_cast_types(&expr_type, &cast_type) { Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options))) + } else if can_cast_struct_types(&expr_type, &cast_type) { + // Allow struct-to-struct casts that pass name-based compatibility validation. + // This validation is applied at planning time (now) to fail fast, rather than + // deferring errors to execution time. 
The name-based casting logic will be + // executed at runtime via ColumnarValue::cast_to. + Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options))) } else { not_impl_err!("Unsupported CAST from {expr_type} to {cast_type}") } diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index 481dde5be9f5c..8e0ee08d994a8 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -383,8 +383,10 @@ SELECT column2, column3, column4 FROM t; ---- {foo: a, xxx: b} {xxx: c, foo: d} {xxx: e} -# coerce structs with different field orders, -# should keep the same field values +# coerce structs with different field orders +# With name-based struct coercion, matching fields by name: +# column2={foo:a, xxx:b} unified with column3={xxx:c, foo:d} +# Result uses the THEN branch's field order (when executed): {xxx: b, foo: a} query ? SELECT case @@ -396,6 +398,7 @@ FROM t; {xxx: b, foo: a} # coerce structs with different field orders +# When ELSE branch executes, uses its field order: {xxx: c, foo: d} query ? 
SELECT case @@ -406,8 +409,9 @@ FROM t; ---- {xxx: c, foo: d} -# coerce structs with subset of fields -query error Failed to coerce then +# coerce structs with subset of fields - field count mismatch causes type coercion failure +# column3 has 2 fields but column4 has only 1 field +query error DataFusion error: type_coercion\ncaused by\nError during planning: Failed to coerce then .* and else .* to common types in CASE WHEN expression SELECT case when column1 > 0 then column3 diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index a91a5e7f870a9..9b1668e58fce8 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -492,18 +492,6 @@ Struct("r": Utf8, "c": Float64) statement ok drop table t; -statement ok -create table t as values({r: 'a', c: 1}), ({c: 2.3, r: 'b'}); - -query ? -select * from t; ----- -{c: 1.0, r: a} -{c: 2.3, r: b} - -statement ok -drop table t; - ################################## ## Test Coalesce with Struct ################################## @@ -562,24 +550,12 @@ Struct("a": Float32, "b": Utf8View) statement ok drop table t; -# row() with incorrect order +# row() with incorrect order - row() is positional, not name-based statement error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'blue' to value of Float32 type create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values (row('red', 1), row(2.3, 'blue')), (row('purple', 1), row('green', 2.3)); -# out of order struct literal -statement ok -create table t(a struct(r varchar, c int)) as values ({r: 'a', c: 1}), ({c: 2, r: 'b'}); - -query ? 
-select * from t; ----- -{r: a, c: 1} -{r: b, c: 2} - -statement ok -drop table t; ################################## ## Test Array of Struct @@ -590,11 +566,6 @@ select [{r: 'a', c: 1}, {r: 'b', c: 2}]; ---- [{r: a, c: 1}, {r: b, c: 2}] -# Create a list of struct with different field types -query ? -select [{r: 'a', c: 1}, {c: 2, r: 'b'}]; ----- -[{c: 1, r: a}, {c: 2, r: b}] statement ok create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values (row('a', 1), row('b', 2.3)); @@ -607,18 +578,6 @@ List(Struct("r": Utf8View, "c": Float32)) statement ok drop table t; -# create table with different struct type is fine -statement ok -create table t(a struct(r varchar, c int), b struct(c float, r varchar)) as values (row('a', 1), row(2.3, 'b')); - -# create array with different struct type should be cast -query T -select arrow_typeof([a, b]) from t; ----- -List(Struct("c": Float32, "r": Utf8View)) - -statement ok -drop table t; statement ok create table t(a struct(r varchar, c int, g float), b struct(r varchar, c float, g int)) as values (row('a', 1, 2.3), row('b', 2.3, 2)); @@ -845,3 +804,864 @@ NULL statement ok drop table nullable_parent_test; + +# Test struct casting with field reordering - string fields +query ? +SELECT CAST({b: 'b_value', a: 'a_value'} AS STRUCT(a VARCHAR, b VARCHAR)); +---- +{a: a_value, b: b_value} + +# Test struct casting with field reordering - integer fields +query ? +SELECT CAST({b: 3, a: 4} AS STRUCT(a INT, b INT)); +---- +{a: 4, b: 3} + +# Test with type casting AND field reordering +query ? +SELECT CAST({b: 3, a: 4} AS STRUCT(a BIGINT, b INT)); +---- +{a: 4, b: 3} + +# Test positional casting when there is no name overlap +query ? +SELECT CAST(struct(1, 'x') AS STRUCT(a INT, b VARCHAR)); +---- +{a: 1, b: x} + +# Test with missing field - should insert nulls +query ? +SELECT CAST({a: 1} AS STRUCT(a INT, b INT)); +---- +{a: 1, b: NULL} + +# Test with extra source field - should be ignored +query ? 
+SELECT CAST({a: 1, b: 2, extra: 3} AS STRUCT(a INT, b INT)); +---- +{a: 1, b: 2} + +# Test no overlap with mismatched field count - should fail because no field names match +statement error DataFusion error: (Plan error|Error during planning|This feature is not implemented): (Cannot cast struct: at least one field name must match between source and target|Cannot cast struct with 3 fields to 2 fields without name overlap|Unsupported CAST from Struct) +SELECT CAST(struct(1, 'x', 'y') AS STRUCT(a INT, b VARCHAR)); + +# Test nested struct with field reordering +query ? +SELECT CAST( + {inner: {y: 2, x: 1}} + AS STRUCT(inner STRUCT(x INT, y INT)) +); +---- +{inner: {x: 1, y: 2}} + +# Test field reordering with table data +statement ok +CREATE TABLE struct_reorder_test ( + data STRUCT(b INT, a VARCHAR) +) AS VALUES + (struct(100, 'first')), + (struct(200, 'second')), + (struct(300, 'third')) +; + +query ? +SELECT CAST(data AS STRUCT(a VARCHAR, b INT)) AS casted_data FROM struct_reorder_test ORDER BY data['b']; +---- +{a: first, b: 100} +{a: second, b: 200} +{a: third, b: 300} + +statement ok +drop table struct_reorder_test; + +# Test casting struct with multiple levels of nesting and reordering +query ? +SELECT CAST( + {level1: {z: 100, y: 'inner', x: 1}} + AS STRUCT(level1 STRUCT(x INT, y VARCHAR, z INT)) +); +---- +{level1: {x: 1, y: inner, z: 100}} + +# Test field reordering with nulls in source +query ? +SELECT CAST( + {b: NULL::INT, a: 42} + AS STRUCT(a INT, b INT) +); +---- +{a: 42, b: NULL} + +# Test casting preserves struct-level nulls +query ? 
+SELECT CAST(NULL::STRUCT(b INT, a INT) AS STRUCT(a INT, b INT)); +---- +NULL + +############################ +# Implicit Coercion Tests with CREATE TABLE AS VALUES +############################ + +# Test implicit coercion with same field order, different types +statement ok +create table t as values({r: 'a', c: 1}), ({r: 'b', c: 2.3}); + +query T +select arrow_typeof(column1) from t limit 1; +---- +Struct("r": Utf8, "c": Float64) + +query ? +select * from t order by column1.r; +---- +{r: a, c: 1.0} +{r: b, c: 2.3} + +statement ok +drop table t; + +# Test implicit coercion with nullable fields (same order) +statement ok +create table t as values({a: 1, b: 'x'}), ({a: 2, b: 'y'}); + +query T +select arrow_typeof(column1) from t limit 1; +---- +Struct("a": Int64, "b": Utf8) + +query ? +select * from t order by column1.a; +---- +{a: 1, b: x} +{a: 2, b: y} + +statement ok +drop table t; + +# Test implicit coercion with nested structs (same field order) +statement ok +create table t as + select {outer: {x: 1, y: 2}} as column1 + union all + select {outer: {x: 3, y: 4}}; + +query T +select arrow_typeof(column1) from t limit 1; +---- +Struct("outer": Struct("x": Int64, "y": Int64)) + +query ? +select column1 from t order by column1.outer.x; +---- +{outer: {x: 1, y: 2}} +{outer: {x: 3, y: 4}} + +statement ok +drop table t; + +# Test implicit coercion with type widening (Int32 -> Int64) +statement ok +create table t as values({id: 1, val: 100}), ({id: 2, val: 9223372036854775807}); + +query T +select arrow_typeof(column1) from t limit 1; +---- +Struct("id": Int64, "val": Int64) + +query ? 
+select * from t order by column1.id; +---- +{id: 1, val: 100} +{id: 2, val: 9223372036854775807} + +statement ok +drop table t; + +# Test implicit coercion with nested struct and type coercion +statement ok +create table t as + select {name: 'Alice', data: {score: 100, active: true}} as column1 + union all + select {name: 'Bob', data: {score: 200, active: false}}; + +query T +select arrow_typeof(column1) from t limit 1; +---- +Struct("name": Utf8, "data": Struct("score": Int64, "active": Boolean)) + +query ? +select column1 from t order by column1.name; +---- +{name: Alice, data: {score: 100, active: true}} +{name: Bob, data: {score: 200, active: false}} + +statement ok +drop table t; + +############################ +# Field Reordering Tests (using explicit CAST) +############################ + +# Test explicit cast with field reordering in VALUES - basic case +query ? +select CAST({c: 2.3, r: 'b'} AS STRUCT(r VARCHAR, c FLOAT)); +---- +{r: b, c: 2.3} + +# Test explicit cast with field reordering - multiple rows +query ? +select * from (values + (CAST({c: 1, r: 'a'} AS STRUCT(r VARCHAR, c FLOAT))), + (CAST({c: 2.3, r: 'b'} AS STRUCT(r VARCHAR, c FLOAT))) +) order by column1.r; +---- +{r: a, c: 1.0} +{r: b, c: 2.3} + +# Test table with explicit cast for field reordering +statement ok +create table t as select CAST({c: 1, r: 'a'} AS STRUCT(r VARCHAR, c FLOAT)) as s +union all +select CAST({c: 2.3, r: 'b'} AS STRUCT(r VARCHAR, c FLOAT)); + +query T +select arrow_typeof(s) from t limit 1; +---- +Struct("r": Utf8View, "c": Float32) + +query ? +select * from t order by s.r; +---- +{r: a, c: 1.0} +{r: b, c: 2.3} + +statement ok +drop table t; + +# Test field reordering with nullable fields using CAST +query ? +select CAST({b: NULL, a: 42} AS STRUCT(a INT, b INT)); +---- +{a: 42, b: NULL} + +# Test field reordering with nested structs using CAST +query ? 
+select CAST({outer: {y: 4, x: 3}} AS STRUCT(outer STRUCT(x INT, y INT))); +---- +{outer: {x: 3, y: 4}} + +# Test complex nested field reordering +query ? +select CAST( + {data: {active: false, score: 200}, name: 'Bob'} + AS STRUCT(name VARCHAR, data STRUCT(score INT, active BOOLEAN)) +); +---- +{name: Bob, data: {score: 200, active: false}} + +############################ +# Array Literal Tests with Struct Field Reordering (Implicit Coercion) +############################ + +# Test array literal with reordered struct fields - implicit coercion by name +# Field order in unified schema is determined during type coercion +query ? +select [{r: 'a', c: 1}, {c: 2.3, r: 'b'}]; +---- +[{c: 1.0, r: a}, {c: 2.3, r: b}] + +# Test array literal with same-named fields but different order +# Fields are reordered during coercion +query ? +select [{a: 1, b: 2}, {b: 3, a: 4}]; +---- +[{b: 2, a: 1}, {b: 3, a: 4}] + +# Test array literal with explicit cast to unify struct schemas with partial overlap +# Use CAST to explicitly unify schemas when fields don't match completely +query ? +select [ + CAST({a: 1, b: 2} AS STRUCT(a INT, b INT, c INT)), + CAST({b: 3, c: 4} AS STRUCT(a INT, b INT, c INT)) +]; +---- +[{a: 1, b: 2, c: NULL}, {a: NULL, b: 3, c: 4}] + +# Test NULL handling in array literals with reordered but matching fields +query ? +select [{a: NULL, b: 1}, {b: 2, a: NULL}]; +---- +[{b: 1, a: NULL}, {b: 2, a: NULL}] + +# Verify arrow_typeof for array with reordered struct fields +# The unified schema type follows the coercion order +query T +select arrow_typeof([{x: 1, y: 2}, {y: 3, x: 4}]); +---- +List(Struct("y": Int64, "x": Int64)) + +# Test array of structs with matching nested fields in different order +# Inner nested fields are also reordered during coercion +query ? 
+select [ + {id: 1, info: {name: 'Alice', age: 30}}, + {info: {age: 25, name: 'Bob'}, id: 2} +]; +---- +[{info: {age: 30, name: Alice}, id: 1}, {info: {age: 25, name: Bob}, id: 2}] + +# Test nested arrays with matching struct fields (different order) +query ? +select [[{x: 1, y: 2}], [{y: 4, x: 3}]]; +---- +[[{x: 1, y: 2}], [{x: 3, y: 4}]] + +# Test array literal with float type coercion across elements +query ? +select [{val: 1}, {val: 2.5}]; +---- +[{val: 1.0}, {val: 2.5}] + +############################ +# Dynamic Array Construction Tests (from Table Columns) +############################ + +# Setup test table with struct columns for dynamic array construction +statement ok +create table t_complete_overlap ( + s1 struct(x int, y int), + s2 struct(y int, x int) +) as values + ({x: 1, y: 2}, {y: 3, x: 4}), + ({x: 5, y: 6}, {y: 7, x: 8}); + +# Test 1: Complete overlap - same fields, different order +# Verify arrow_typeof for dynamically created array +query T +select arrow_typeof([s1, s2]) from t_complete_overlap limit 1; +---- +List(Struct("y": Int32, "x": Int32)) + +# Verify values are correctly mapped by name in the array +# Field order follows the second column's field order +query ? 
+select [s1, s2] from t_complete_overlap order by s1.x;
+----
+[{y: 2, x: 1}, {y: 3, x: 4}]
+[{y: 6, x: 5}, {y: 7, x: 8}]
+
+statement ok
+drop table t_complete_overlap;
+
+# Test 2: Field reordering with identical field sets (full name overlap)
+# Note: Columns must have the exact same field set for array construction to work,
+# so true partial overlap is not supported here; both columns carry all fields.
+statement ok
+create table t_partial_overlap (
+    col_a struct(name VARCHAR, age int, active boolean),
+    col_b struct(age int, name VARCHAR, active boolean)
+) as values
+    ({name: 'Alice', age: 30, active: true}, {age: 25, name: 'Bob', active: false}),
+    ({name: 'Charlie', age: 35, active: true}, {age: 40, name: 'Diana', active: false});
+
+# Verify unified type includes all fields from both structs
+query T
+select arrow_typeof([col_a, col_b]) from t_partial_overlap limit 1;
+----
+List(Struct("age": Int32, "name": Utf8View, "active": Boolean))
+
+# Verify values are correctly mapped by name in the array
+# Field order follows the second column's field order
+query ?
+select [col_a, col_b] from t_partial_overlap order by col_a.name;
+----
+[{age: 30, name: Alice, active: true}, {age: 25, name: Bob, active: false}]
+[{age: 35, name: Charlie, active: true}, {age: 40, name: Diana, active: false}]
+
+statement ok
+drop table t_partial_overlap;
+
+# Test 3: Complete field set matching (no CAST needed)
+# Schemas already align; confirm unified type and values
+statement ok
+create table t_with_cast (
+    col_x struct(id int, description VARCHAR),
+    col_y struct(id int, description VARCHAR)
+) as values
+    ({id: 1, description: 'First'}, {id: 10, description: 'First Value'}),
+    ({id: 2, description: 'Second'}, {id: 20, description: 'Second Value'});
+
+# Verify type unification with all fields
+query T
+select arrow_typeof([col_x, col_y]) from t_with_cast limit 1;
+----
+List(Struct("id": Int32, "description": Utf8View))
+
+# Verify values remain aligned by name
+query ?
+select [col_x, col_y] from t_with_cast order by col_x.id; +---- +[{id: 1, description: First}, {id: 10, description: First Value}] +[{id: 2, description: Second}, {id: 20, description: Second Value}] + +statement ok +drop table t_with_cast; + +# Test 4: Explicit CAST for partial field overlap scenarios +# When columns have different field sets, use explicit CAST to unify schemas +query ? +select [ + CAST({id: 1} AS STRUCT(id INT, description VARCHAR)), + CAST({id: 10, description: 'Value'} AS STRUCT(id INT, description VARCHAR)) +]; +---- +[{id: 1, description: NULL}, {id: 10, description: Value}] + +# Test 5: Complex nested structs with field reordering +# Nested fields must have the exact same field set for array construction +statement ok +create table t_nested ( + col_1 struct(id int, outer struct(x int, y int)), + col_2 struct(id int, outer struct(x int, y int)) +) as values + ({id: 100, outer: {x: 1, y: 2}}, {id: 101, outer: {x: 4, y: 3}}), + ({id: 200, outer: {x: 5, y: 6}}, {id: 201, outer: {x: 8, y: 7}}); + +# Verify nested struct in unified schema +query T +select arrow_typeof([col_1, col_2]) from t_nested limit 1; +---- +List(Struct("id": Int32, "outer": Struct("x": Int32, "y": Int32))) + +# Verify nested field values are correctly mapped +query ? +select [col_1, col_2] from t_nested order by col_1.id; +---- +[{id: 100, outer: {x: 1, y: 2}}, {id: 101, outer: {x: 4, y: 3}}] +[{id: 200, outer: {x: 5, y: 6}}, {id: 201, outer: {x: 8, y: 7}}] + +statement ok +drop table t_nested; + +# Test 6: NULL handling with matching field sets +statement ok +create table t_nulls ( + col_a struct(val int, flag boolean), + col_b struct(val int, flag boolean) +) as values + ({val: 1, flag: true}, {val: 10, flag: false}), + ({val: NULL, flag: false}, {val: NULL, flag: true}); + +# Verify NULL values are preserved +query ? 
+select [col_a, col_b] from t_nulls order by col_a.val; +---- +[{val: 1, flag: true}, {val: 10, flag: false}] +[{val: NULL, flag: false}, {val: NULL, flag: true}] + +statement ok +drop table t_nulls; + +# Test 7: Multiple columns with complete field matching +statement ok +create table t_multi ( + col1 struct(a int, b int, c int), + col2 struct(a int, b int, c int) +) as values + ({a: 1, b: 2, c: 3}, {a: 10, b: 20, c: 30}), + ({a: 4, b: 5, c: 6}, {a: 40, b: 50, c: 60}); + +# Verify array with complete field matching +query T +select arrow_typeof([col1, col2]) from t_multi limit 1; +---- +List(Struct("a": Int32, "b": Int32, "c": Int32)) + +# Verify values are correctly unified +query ? +select [col1, col2] from t_multi order by col1.a; +---- +[{a: 1, b: 2, c: 3}, {a: 10, b: 20, c: 30}] +[{a: 4, b: 5, c: 6}, {a: 40, b: 50, c: 60}] + +statement ok +drop table t_multi; + +############################ +# Comprehensive Implicit Struct Coercion Suite +############################ + +# Test 1: VALUES clause with field reordering coerced by name into declared schema +statement ok +create table implicit_values_reorder ( + s struct(a int, b int) +) as values + ({a: 1, b: 2}), + ({b: 3, a: 4}); + +query T +select arrow_typeof(s) from implicit_values_reorder limit 1; +---- +Struct("a": Int32, "b": Int32) + +query ? 
+select s from implicit_values_reorder order by s.a; +---- +{a: 1, b: 2} +{a: 4, b: 3} + +statement ok +drop table implicit_values_reorder; + +# Test 2: Array literal coercion with reordered struct fields +query IIII +select + [{a: 1, b: 2}, {b: 3, a: 4}][1]['a'], + [{a: 1, b: 2}, {b: 3, a: 4}][1]['b'], + [{a: 1, b: 2}, {b: 3, a: 4}][2]['a'], + [{a: 1, b: 2}, {b: 3, a: 4}][2]['b']; +---- +1 2 4 3 + +# Test 3: Array construction from columns with reordered struct fields +statement ok +create table struct_columns_order ( + s1 struct(a int, b int), + s2 struct(b int, a int) +) as values + ({a: 1, b: 2}, {b: 3, a: 4}), + ({a: 5, b: 6}, {b: 7, a: 8}); + +query IIII +select + [s1, s2][1]['a'], + [s1, s2][1]['b'], + [s1, s2][2]['a'], + [s1, s2][2]['b'] +from struct_columns_order +order by s1['a']; +---- +1 2 4 3 +5 6 8 7 + +statement ok +drop table struct_columns_order; + +# Test 4: UNION with struct field reordering +query II +select s['a'], s['b'] +from ( + select {a: 1, b: 2} as s + union all + select {b: 3, a: 4} as s +) t +order by s['a']; +---- +1 2 +4 3 + +# Test 5: CTE with struct coercion across branches +query II +with + t1 as (select {a: 1, b: 2} as s), + t2 as (select {b: 3, a: 4} as s) +select t1.s['a'], t1.s['b'] from t1 +union all +select t2.s['a'], t2.s['b'] from t2 +order by 1; +---- +1 2 +4 3 + +# Test 6: Struct aggregation retains name-based mapping +statement ok +create table agg_structs_reorder ( + k int, + s struct(x int, y int) +) as values + (1, {x: 1, y: 2}), + (1, {y: 3, x: 4}), + (2, {x: 5, y: 6}); + +query I? 
+select k, array_agg(s) from agg_structs_reorder group by k order by k;
+----
+1 [{x: 1, y: 2}, {x: 4, y: 3}]
+2 [{x: 5, y: 6}]
+
+statement ok
+drop table agg_structs_reorder;
+
+# Test 7: Nested struct coercion with reordered inner fields
+query IIII
+with nested as (
+    select [{outer: {inner: 1, value: 2}}, {outer: {value: 3, inner: 4}}] as arr
+)
+select
+    arr[1]['outer']['inner'],
+    arr[1]['outer']['value'],
+    arr[2]['outer']['inner'],
+    arr[2]['outer']['value']
+from nested;
+----
+1 2 4 3
+
+# Test 8: Partial name overlap - currently errors (field count mismatch detected)
+# This is a documented limitation: structs must have exactly the same field set for coercion
+query error DataFusion error: Error during planning: Inconsistent data type across values list
+select column1 from (values ({a: 1, b: 2}), ({b: 3, c: 4})) order by column1['a'];
+
+# Negative test: mismatched struct field counts are rejected (documented limitation)
+query error DataFusion error: .*
+select [{a: 1}, {a: 2, b: 3}];
+
+# Test 9: INSERT with name-based struct coercion into target schema
+statement ok
+create table target_struct_insert (s struct(a int, b int));
+
+statement ok
+insert into target_struct_insert values ({b: 1, a: 2});
+
+query ?
+select s from target_struct_insert; +---- +{a: 2, b: 1} + +statement ok +drop table target_struct_insert; + +# Test 10: CASE expression with different struct field orders +query II +select + (case when true then {a: 1, b: 2} else {b: 3, a: 4} end)['a'] as a_val, + (case when true then {a: 1, b: 2} else {b: 3, a: 4} end)['b'] as b_val; +---- +1 2 + +############################ +# JOIN Coercion Tests +############################ + +# Test: Struct coercion in JOIN ON condition +statement ok +create table t_left ( + id int, + s struct(x int, y int) +) as values + (1, {x: 1, y: 2}), + (2, {x: 3, y: 4}); + +statement ok +create table t_right ( + id int, + s struct(y int, x int) +) as values + (1, {y: 2, x: 1}), + (2, {y: 4, x: 3}); + +# JOIN on reordered struct fields - matched by name +query IIII +select t_left.id, t_left.s['x'], t_left.s['y'], t_right.id +from t_left +join t_right on t_left.s = t_right.s +order by t_left.id; +---- +1 1 2 1 +2 3 4 2 + +statement ok +drop table t_left; + +statement ok +drop table t_right; + +# Test: Struct coercion with filtered JOIN +statement ok +create table orders ( + order_id int, + customer struct(name varchar, id int) +) as values + (1, {name: 'Alice', id: 100}), + (2, {name: 'Bob', id: 101}), + (3, {name: 'Charlie', id: 102}); + +statement ok +create table customers ( + customer_id int, + info struct(id int, name varchar) +) as values + (100, {id: 100, name: 'Alice'}), + (101, {id: 101, name: 'Bob'}), + (103, {id: 103, name: 'Diana'}); + +# Join with struct field reordering - names matched, not positions +query I +select count(*) from orders +join customers on orders.customer = customers.info +where orders.order_id <= 2; +---- +2 + +statement ok +drop table orders; + +statement ok +drop table customers; + +############################ +# WHERE Predicate Coercion Tests +############################ + +# Test: Struct equality in WHERE clause with field reordering +statement ok +create table t_where ( + id int, + s struct(x int, 
y int) +) as values + (1, {x: 1, y: 2}), + (2, {x: 3, y: 4}), + (3, {x: 1, y: 2}); + +# WHERE clause with struct comparison - coerced by name +query I +select id from t_where +where s = {y: 2, x: 1} +order by id; +---- +1 +3 + +statement ok +drop table t_where; + +# Test: Struct IN clause with reordering +statement ok +create table t_in ( + id int, + s struct(a int, b varchar) +) as values + (1, {a: 1, b: 'x'}), + (2, {a: 2, b: 'y'}), + (3, {a: 1, b: 'x'}); + +# IN clause with reordered struct literals +query I +select id from t_in +where s in ({b: 'x', a: 1}, {b: 'y', a: 2}) +order by id; +---- +1 +2 +3 + +statement ok +drop table t_in; + +# Test: Struct BETWEEN (not supported, but documents limitation) +# Structs don't support BETWEEN, but can use comparison operators + +statement ok +create table t_between ( + id int, + s struct(val int) +) as values + (1, {val: 10}), + (2, {val: 20}), + (3, {val: 30}); + +# Comparison via field extraction works +query I +select id from t_between +where s['val'] >= 20 +order by id; +---- +2 +3 + +statement ok +drop table t_between; + +############################ +# Window Function Coercion Tests +############################ + +# Test: Struct in window function PARTITION BY +statement ok +create table t_window ( + id int, + s struct(category int, value int) +) as values + (1, {category: 1, value: 10}), + (2, {category: 1, value: 20}), + (3, {category: 2, value: 30}), + (4, {category: 2, value: 40}); + +# Window partition on struct field via extraction +query III +select + id, + s['value'], + row_number() over (partition by s['category'] order by s['value']) +from t_window +order by id; +---- +1 10 1 +2 20 2 +3 30 1 +4 40 2 + +statement ok +drop table t_window; + +# Test: Struct in window function ORDER BY with coercion +statement ok +create table t_rank ( + id int, + s struct(rank_val int, group_id int) +) as values + (1, {rank_val: 10, group_id: 1}), + (2, {rank_val: 20, group_id: 1}), + (3, {rank_val: 15, group_id: 2}); + +# 
Window ranking with struct field extraction +query III +select + id, + s['rank_val'], + rank() over (partition by s['group_id'] order by s['rank_val']) +from t_rank +order by id; +---- +1 10 1 +2 20 2 +3 15 1 + +statement ok +drop table t_rank; + +# Test: Aggregate function with struct coercion across window partitions +statement ok +create table t_agg_window ( + id int, + partition_id int, + s struct(amount int) +) as values + (1, 1, {amount: 100}), + (2, 1, {amount: 200}), + (3, 2, {amount: 150}); + +# Running sum via extracted struct field +query III +select + id, + partition_id, + sum(s['amount']) over (partition by partition_id order by id) +from t_agg_window +order by id; +---- +1 1 100 +2 1 300 +3 2 150 + +statement ok +drop table t_agg_window; \ No newline at end of file diff --git a/docs/source/user-guide/sql/index.rst b/docs/source/user-guide/sql/index.rst index a13d40334b639..f1fef45f705a8 100644 --- a/docs/source/user-guide/sql/index.rst +++ b/docs/source/user-guide/sql/index.rst @@ -22,6 +22,7 @@ SQL Reference :maxdepth: 2 data_types + struct_coercion select subqueries ddl diff --git a/docs/source/user-guide/sql/struct_coercion.md b/docs/source/user-guide/sql/struct_coercion.md new file mode 100644 index 0000000000000..d2a32fcee2650 --- /dev/null +++ b/docs/source/user-guide/sql/struct_coercion.md @@ -0,0 +1,354 @@ + + +# Struct Type Coercion and Field Mapping + +DataFusion uses **name-based field mapping** when coercing struct types across different operations. This document explains how struct coercion works, when it applies, and how to handle NULL fields. + +## Overview: Name-Based vs Positional Mapping + +When combining structs from different sources (e.g., in UNION, array construction, or JOINs), DataFusion matches struct fields by **name** rather than by **position**. This provides more robust and predictable behavior compared to positional matching. 
+ +### Example: Field Reordering is Handled Transparently + +```sql +-- These two structs have the same fields in different order +SELECT [{a: 1, b: 2}, {b: 3, a: 4}]; + +-- Result: Field names matched, values unified +-- [{"a": 1, "b": 2}, {"a": 4, "b": 3}] +``` + +## Coercion Paths Using Name-Based Matching + +The following query operations use name-based field mapping for struct coercion: + +### 1. Array Literal Construction + +When creating array literals with struct elements that have different field orders: + +```sql +-- Structs with reordered fields in array literal +SELECT [{x: 1, y: 2}, {y: 3, x: 4}]; + +-- Unified type: List(Struct("x": Int32, "y": Int32)) +-- Values: [{"x": 1, "y": 2}, {"x": 4, "y": 3}] +``` + +**When it applies:** + +- Array literals with struct elements: `[{...}, {...}]` +- Nested arrays with structs: `[[{x: 1}, {x: 2}]]` + +### 2. Array Construction from Columns + +When constructing arrays from table columns with different struct schemas: + +```sql +CREATE TABLE t_left (s struct(x int, y int)) AS VALUES ({x: 1, y: 2}); +CREATE TABLE t_right (s struct(y int, x int)) AS VALUES ({y: 3, x: 4}); + +-- Dynamically constructs unified array schema +SELECT [t_left.s, t_right.s] FROM t_left JOIN t_right; + +-- Result: [{"x": 1, "y": 2}, {"x": 4, "y": 3}] +``` + +**When it applies:** + +- Array construction with column references: `[col1, col2]` +- Array construction in joins with matching field names + +### 3. UNION Operations + +When combining query results with different struct field orders: + +```sql +SELECT {a: 1, b: 2} as s +UNION ALL +SELECT {b: 3, a: 4} as s; + +-- Result: {"a": 1, "b": 2} and {"a": 4, "b": 3} +``` + +**When it applies:** + +- UNION ALL with structs: field names matched across branches +- UNION (deduplicated) with structs + +### 4. 
Common Table Expressions (CTEs) + +When multiple CTEs produce structs with different field orders that are combined: + +```sql +WITH + t1 AS (SELECT {a: 1, b: 2} as s), + t2 AS (SELECT {b: 3, a: 4} as s) +SELECT s FROM t1 +UNION ALL +SELECT s FROM t2; + +-- Result: Field names matched across CTEs +``` + +### 5. VALUES Clauses + +When creating tables or temporary results with struct values in different field orders: + +```sql +CREATE TABLE t AS VALUES ({a: 1, b: 2}), ({b: 3, a: 4}); + +-- Table schema unified: struct(a: int, b: int) +-- Values: {a: 1, b: 2} and {a: 4, b: 3} +``` + +### 6. JOIN Operations + +When joining tables where the JOIN condition involves structs with different field orders: + +```sql +CREATE TABLE orders (customer struct(name varchar, id int)); +CREATE TABLE customers (info struct(id int, name varchar)); + +-- Join matches struct fields by name +SELECT * FROM orders +JOIN customers ON orders.customer = customers.info; +``` + +### 7. Aggregate Functions + +When collecting structs with different field orders using aggregate functions like `array_agg`: + +```sql +SELECT array_agg(s) FROM ( + SELECT {x: 1, y: 2} as s + UNION ALL + SELECT {y: 3, x: 4} as s +) t; + +-- Result: Array of structs with unified field order +``` + +### 8. Window Functions + +When using window functions with struct expressions having different field orders: + +```sql +SELECT + id, + row_number() over (partition by s order by id) as rn +FROM ( + SELECT {category: 1, value: 10} as s, 1 as id + UNION ALL + SELECT {value: 20, category: 1} as s, 2 as id +); + +-- Fields matched by name in PARTITION BY clause +``` + +## NULL Handling for Missing Fields + +When structs have different field sets, missing fields are filled with **NULL** values during coercion. 
+ +### Example: Partial Field Overlap + +```sql +-- Struct in first position has fields: a, b +-- Struct in second position has fields: b, c +-- Unified schema includes all fields: a, b, c + +SELECT [ + CAST({a: 1, b: 2} AS STRUCT(a INT, b INT, c INT)), + CAST({b: 3, c: 4} AS STRUCT(a INT, b INT, c INT)) +]; + +-- Result: +-- [ +-- {"a": 1, "b": 2, "c": NULL}, +-- {"a": NULL, "b": 3, "c": 4} +-- ] +``` + +### Limitations + +**Field count must match exactly.** If structs have different numbers of fields and their field names don't completely overlap, the query will fail: + +```sql +-- This fails because field sets don't match: +-- t_left has {x, y} but t_right has {x, y, z} +SELECT [t_left.s, t_right.s] FROM t_left JOIN t_right; +-- Error: Cannot coerce struct with different field counts +``` + +**Workaround: Use explicit CAST** + +To handle partial field overlap, explicitly cast structs to a unified schema: + +```sql +SELECT [ + CAST(t_left.s AS STRUCT(x INT, y INT, z INT)), + CAST(t_right.s AS STRUCT(x INT, y INT, z INT)) +] FROM t_left JOIN t_right; +``` + +## Migration Guide: From Positional to Name-Based Matching + +If you have existing code that relied on **positional** struct field matching, you may need to update it. + +### Example: Query That Changes Behavior + +**Old behavior (positional):** + +```sql +-- These would have been positionally mapped (left-to-right) +SELECT [{x: 1, y: 2}, {y: 3, x: 4}]; +-- Old result (positional): [{"x": 1, "y": 2}, {"y": 3, "x": 4}] +``` + +**New behavior (name-based):** + +```sql +-- Now uses name-based matching +SELECT [{x: 1, y: 2}, {y: 3, x: 4}]; +-- New result (by name): [{"x": 1, "y": 2}, {"x": 4, "y": 3}] +``` + +### Migration Steps + +1. **Review struct operations** - Look for queries that combine structs from different sources +2. **Check field names** - Verify that field names match as expected (not positions) +3. **Test with new coercion** - Run queries and verify the results match your expectations +4. 
**Handle field reordering** - If you need specific field orders, use explicit CAST operations + +### Using Explicit CAST for Compatibility + +If you need precise control over struct field order and types, use explicit `CAST`: + +```sql +-- Guarantee specific field order and types +SELECT CAST({b: 3, a: 4} AS STRUCT(a INT, b INT)); +-- Result: {"a": 4, "b": 3} +``` + +## Best Practices + +### 1. Be Explicit with Schema Definitions + +When joining or combining structs, define target schemas explicitly: + +```sql +-- Good: explicit schema definition +SELECT CAST(data AS STRUCT(id INT, name VARCHAR, active BOOLEAN)) +FROM external_source; +``` + +### 2. Use Named Struct Constructors + +Prefer named struct constructors for clarity: + +```sql +-- Good: field names are explicit +SELECT named_struct('id', 1, 'name', 'Alice', 'active', true); + +-- Or using struct literal syntax +SELECT {id: 1, name: 'Alice', active: true}; +``` + +### 3. Test Field Mappings + +Always verify that field mappings work as expected: + +```sql +-- Use arrow_typeof to verify unified schema +SELECT arrow_typeof([{x: 1, y: 2}, {y: 3, x: 4}]); +-- Result: List(Struct("x": Int32, "y": Int32)) +``` + +### 4. Handle Partial Field Overlap Explicitly + +When combining structs with partial field overlap, use explicit CAST: + +```sql +-- Instead of relying on implicit coercion +SELECT [ + CAST(left_struct AS STRUCT(x INT, y INT, z INT)), + CAST(right_struct AS STRUCT(x INT, y INT, z INT)) +]; +``` + +### 5. Document Struct Schemas + +In complex queries, document the expected struct schemas: + +```sql +-- Expected schema: {customer_id: INT, name: VARCHAR, age: INT} +SELECT { + customer_id: c.id, + name: c.name, + age: c.age +} as customer_info +FROM customers c; +``` + +## Error Messages and Troubleshooting + +### "Cannot coerce struct with different field counts" + +**Cause:** Trying to combine structs with different numbers of fields. 
+ +**Solution:** + +```sql +-- Use explicit CAST to handle missing fields +SELECT [ + CAST(struct1 AS STRUCT(a INT, b INT, c INT)), + CAST(struct2 AS STRUCT(a INT, b INT, c INT)) +]; +``` + +### "Field X not found in struct" + +**Cause:** Referencing a field name that doesn't exist in the struct. + +**Solution:** + +```sql +-- Verify field names match exactly (case-sensitive) +SELECT s['field_name'] FROM my_table; -- Use bracket notation for access +-- Or use get_field function +SELECT get_field(s, 'field_name') FROM my_table; +``` + +### Unexpected NULL values after coercion + +**Cause:** Struct coercion added NULL for missing fields. + +**Solution:** Check that all structs have the required fields, or explicitly handle NULLs: + +```sql +SELECT COALESCE(s['field'], default_value) FROM my_table; +``` + +## Related Functions + +- `arrow_typeof()` - Returns the Arrow type of an expression +- `struct()` / `named_struct()` - Creates struct values +- `get_field()` - Extracts field values from structs +- `CAST()` - Explicitly casts structs to specific schemas