-
Notifications
You must be signed in to change notification settings - Fork 198
[AURON #1780] Fix uppercase fields being read as null from ORC tables written by Hive #1781
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
5903e13
a1fbc9f
b724889
7e6a629
271fc41
440c897
a32f4f1
d6e7a08
0675b0b
715c12d
53cf322
9d981c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -160,6 +160,7 @@ impl ExecutionPlan for OrcExec { | |
|
|
||
| let force_positional_evolution = conf::ORC_FORCE_POSITIONAL_EVOLUTION.value()?; | ||
| let use_microsecond_precision = conf::ORC_TIMESTAMP_USE_MICROSECOND.value()?; | ||
| let is_case_sensitive = conf::ORC_SCHEMA_CASE_SENSITIVE.value()?; | ||
|
|
||
| let opener: Arc<dyn FileOpener> = Arc::new(OrcOpener { | ||
| projection, | ||
|
|
@@ -170,6 +171,7 @@ impl ExecutionPlan for OrcExec { | |
| metrics: self.metrics.clone(), | ||
| force_positional_evolution, | ||
| use_microsecond_precision, | ||
| is_case_sensitive, | ||
| }); | ||
|
|
||
| let file_stream = Box::pin(FileStream::new( | ||
|
|
@@ -217,6 +219,7 @@ struct OrcOpener { | |
| metrics: ExecutionPlanMetricsSet, | ||
| force_positional_evolution: bool, | ||
| use_microsecond_precision: bool, | ||
| is_case_sensitive: bool, | ||
| } | ||
|
|
||
| impl FileOpener for OrcOpener { | ||
|
|
@@ -245,6 +248,7 @@ impl FileOpener for OrcOpener { | |
| self.force_positional_evolution, | ||
| ); | ||
| let use_microsecond = self.use_microsecond_precision; | ||
| let is_case = self.is_case_sensitive; | ||
|
|
||
| Ok(Box::pin(async move { | ||
| let mut builder = ArrowReaderBuilder::try_new_async(reader) | ||
|
|
@@ -259,7 +263,7 @@ impl FileOpener for OrcOpener { | |
| } | ||
|
|
||
| let (schema_mapping, projection) = | ||
| schema_adapter.map_schema(builder.file_metadata())?; | ||
| schema_adapter.map_schema(builder.file_metadata(), is_case)?; | ||
|
|
||
| let projection_mask = | ||
| ProjectionMask::roots(builder.file_metadata().root_data_type(), projection); | ||
|
|
@@ -325,6 +329,7 @@ impl SchemaAdapter { | |
| fn map_schema( | ||
| &self, | ||
| orc_file_meta: &FileMetadata, | ||
| is_case_sensitive: bool, | ||
| ) -> Result<(Arc<dyn SchemaMapper>, Vec<usize>)> { | ||
| let mut projection = Vec::with_capacity(self.projected_schema.fields().len()); | ||
| let mut field_mappings = vec![None; self.projected_schema.fields().len()]; | ||
|
|
@@ -363,7 +368,7 @@ impl SchemaAdapter { | |
| } | ||
| } | ||
| } | ||
| } else { | ||
| } else if is_case_sensitive { | ||
| for named_column in file_named_columns { | ||
| if let Some((proj_idx, _)) = | ||
| self.projected_schema.fields().find(named_column.name()) | ||
|
|
@@ -372,6 +377,21 @@ impl SchemaAdapter { | |
| projection.push(named_column.data_type().column_index()); | ||
| } | ||
| } | ||
| } else { | ||
| for named_column in file_named_columns { | ||
| // Case-insensitive field name matching | ||
| let named_column_name_lower = named_column.name().to_lowercase(); | ||
| if let Some((proj_idx, _)) = self | ||
| .projected_schema | ||
| .fields() | ||
| .iter() | ||
| .enumerate() | ||
| .find(|(_, f)| f.name().to_lowercase() == named_column_name_lower) | ||
| { | ||
| field_mappings[proj_idx] = Some(projection.len()); | ||
| projection.push(named_column.data_type().column_index()); | ||
| } | ||
| } | ||
|
Comment on lines
+381
to
+394
|
||
| } | ||
|
|
||
| Ok(( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The configuration name "ORC_SCHEMA_ISCASE_SENSITIVE" with a default value of false is semantically confusing. Because the flag is phrased in terms of case sensitivity, its default of false means that case-sensitive matching is disabled, i.e. case-insensitive matching is enabled, so readers have to mentally invert the flag to understand the default behavior. Consider renaming it to "ORC_SCHEMA_CASE_INSENSITIVE" (with a default of true for Hive compatibility), or improving the documentation to state clearly that false means "case-insensitive matching enabled" and true means "case-sensitive matching enabled".