From ad4d4a1df6780942f9ce02cf548e8d904ab4ce62 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 21 Jan 2026 17:16:42 -0700 Subject: [PATCH 01/10] auto scan mode no longer falls back to native_comet --- spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index 8ba2c5845e..7ba1a50fd3 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -166,7 +166,6 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] with Com case SCAN_AUTO => // TODO add support for native_datafusion in the future nativeIcebergCompatScan(session, scanExec, r, hadoopConf) - .orElse(nativeCometScan(session, scanExec, r, hadoopConf)) .getOrElse(scanExec) case SCAN_NATIVE_DATAFUSION => nativeDataFusionScan(session, scanExec, r, hadoopConf).getOrElse(scanExec) From 74c724467a1121a871e566d0dbecc9600faed3d7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 21 Jan 2026 18:00:13 -0700 Subject: [PATCH 02/10] update test --- .../apache/comet/CometExpressionSuite.scala | 52 ++++++++----------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala index 0352da7850..5d8bbc9b1f 100644 --- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala @@ -191,7 +191,7 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { withTempDir { dir => val path = new Path(dir.toURI.toString, "test.parquet") makeParquetFileAllPrimitiveTypes(path, dictionaryEnabled = dictionaryEnabled, 10000) - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "false") { + withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { withParquetTable(path.toString, "tbl") { checkSparkAnswerAndOperator("select * FROM tbl WHERE _2 > 100") } @@ -202,36 +202,26 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { test("uint data type support") { Seq(true, false).foreach { dictionaryEnabled => - // TODO: Once the question of what to get back from uint_8, uint_16 types is resolved, - // we can also update this test to check for COMET_SCAN_ALLOW_INCOMPATIBLE=true - Seq(false).foreach { allowIncompatible => - { - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> allowIncompatible.toString) { - withTempDir { dir => - val path = new Path(dir.toURI.toString, "testuint.parquet") - makeParquetFileAllPrimitiveTypes( - path, - dictionaryEnabled = dictionaryEnabled, - Byte.MinValue, - Byte.MaxValue) - withParquetTable(path.toString, "tbl") { - val qry = "select _9 from tbl order by _11" - if (usingDataSourceExec(conf)) { - if (!allowIncompatible) { - checkSparkAnswerAndOperator(qry) - } else { - // need to convert the values to unsigned values - val expected = (Byte.MinValue to Byte.MaxValue) - .map(v => { - if (v < 0) Byte.MaxValue.toShort - v else v - }) - .toDF("a") - checkAnswer(sql(qry), expected) - } - } else { - checkSparkAnswerAndOperator(qry) - } - } + withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { + withTempDir { dir => + val path = new Path(dir.toURI.toString, "testuint.parquet") + makeParquetFileAllPrimitiveTypes( + path, + dictionaryEnabled = dictionaryEnabled, + Byte.MinValue, + Byte.MaxValue) + withParquetTable(path.toString, "tbl") { + val qry = "select _9 from tbl order by _11" + if (usingDataSourceExec(conf)) { + // need to convert the values to unsigned values + val expected = (Byte.MinValue to Byte.MaxValue) + .map(v => { + if (v < 0) Byte.MaxValue.toShort - v else v + }) + .toDF("a") + checkAnswer(sql(qry), expected) + } else { + checkSparkAnswerAndOperator(qry) } } } From ee290fc061cf6be1b8f21d54339b93f7a6ae093d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 22 Jan 2026 07:47:30 -0700 Subject: [PATCH 03/10] update tests --- .../apache/comet/CometExpressionSuite.scala | 62 ++++++++++--------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala index 5d8bbc9b1f..bf0d3e6a19 100644 --- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala @@ -187,13 +187,16 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { } test("basic data type support") { - Seq(true, false).foreach { dictionaryEnabled => - withTempDir { dir => - val path = new Path(dir.toURI.toString, "test.parquet") - makeParquetFileAllPrimitiveTypes(path, dictionaryEnabled = dictionaryEnabled, 10000) - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { - withParquetTable(path.toString, "tbl") { - checkSparkAnswerAndOperator("select * FROM tbl WHERE _2 > 100") + // this test requires native_comet scan due to unsigned u8/u16 issue + withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_COMET) { + Seq(true, false).foreach { dictionaryEnabled => + withTempDir { dir => + val path = new Path(dir.toURI.toString, "test.parquet") + makeParquetFileAllPrimitiveTypes(path, dictionaryEnabled = dictionaryEnabled, 10000) + withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { + withParquetTable(path.toString, "tbl") { + checkSparkAnswerAndOperator("select * FROM tbl WHERE _2 > 100") + } } } } @@ -201,27 +204,30 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { } test("uint data type support") { - Seq(true, false).foreach { dictionaryEnabled => - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { - withTempDir { dir => - val path = new Path(dir.toURI.toString, "testuint.parquet") - makeParquetFileAllPrimitiveTypes( - path, - dictionaryEnabled = dictionaryEnabled, - Byte.MinValue, - Byte.MaxValue) - withParquetTable(path.toString, "tbl") { - val qry = "select _9 from tbl order by _11" - if (usingDataSourceExec(conf)) { - // need to convert the values to unsigned values - val expected = (Byte.MinValue to Byte.MaxValue) - .map(v => { - if (v < 0) Byte.MaxValue.toShort - v else v - }) - .toDF("a") - checkAnswer(sql(qry), expected) - } else { - checkSparkAnswerAndOperator(qry) + // this test requires native_comet scan due to unsigned u8/u16 issue + withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_COMET) { + Seq(true, false).foreach { dictionaryEnabled => + withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { + withTempDir { dir => + val path = new Path(dir.toURI.toString, "testuint.parquet") + makeParquetFileAllPrimitiveTypes( + path, + dictionaryEnabled = dictionaryEnabled, + Byte.MinValue, + Byte.MaxValue) + withParquetTable(path.toString, "tbl") { + val qry = "select _9 from tbl order by _11" + if (usingDataSourceExec(conf)) { + // need to convert the values to unsigned values + val expected = (Byte.MinValue to Byte.MaxValue) + .map(v => { + if (v < 0) Byte.MaxValue.toShort - v else v + }) + .toDF("a") + checkAnswer(sql(qry), expected) + } else { + checkSparkAnswerAndOperator(qry) + } } } } From 537d62e8274a95248b7b946e1a80a087b07ba8f8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 22 Jan 2026 08:54:31 -0700 Subject: [PATCH 04/10] Fix clippy warnings for Rust 1.93 - Use local `root_op` variable instead of unwrapping `exec_context.root_op` - Replace `is_some()` + `unwrap()` pattern with `if let Some(...)` Co-Authored-By: Claude Opus 4.5 --- native/core/src/execution/jni_api.rs | 10 ++-------- native/core/src/parquet/schema_adapter.rs | 5 ++--- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 56569bc69c..680cf80c75 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -503,12 +503,7 @@ pub unsafe extern "system" fn Java_org_apache_comet_Native_executePlan( let task_ctx = exec_context.session_ctx.task_ctx(); // Each Comet native execution corresponds to a single Spark partition, // so we should always execute partition 0. - let stream = exec_context - .root_op - .as_ref() - .unwrap() - .native_plan - .execute(0, task_ctx)?; + let stream = root_op.native_plan.execute(0, task_ctx)?; exec_context.stream = Some(stream); } else { // Pull input batches @@ -619,8 +614,7 @@ pub extern "system" fn Java_org_apache_comet_Native_releasePlan( /// Updates the metrics of the query plan. fn update_metrics(env: &mut JNIEnv, exec_context: &mut ExecutionContext) -> CometResult<()> { - if exec_context.root_op.is_some() { - let native_query = exec_context.root_op.as_ref().unwrap(); + if let Some(native_query) = &exec_context.root_op { let metrics = exec_context.metrics.as_obj(); update_comet_metric(env, metrics, native_query) } else { diff --git a/native/core/src/parquet/schema_adapter.rs b/native/core/src/parquet/schema_adapter.rs index b321d902a9..1e0d30c835 100644 --- a/native/core/src/parquet/schema_adapter.rs +++ b/native/core/src/parquet/schema_adapter.rs @@ -209,10 +209,9 @@ impl SchemaMapper for SchemaMapping { // If this field only exists in the table, and not in the file, then we need to // populate a default value for it. || { - if self.default_values.is_some() { + if let Some(default_values) = &self.default_values { // We have a map of default values, see if this field is in there. - if let Some(value) = - self.default_values.as_ref().unwrap().get(&field_idx) + if let Some(value) = default_values.get(&field_idx) // Default value exists, construct a column from it. { let cv = if field.data_type() == &value.data_type() { From e8b6e7a7aa5ea9bc378e213873046d5838315f7e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 22 Jan 2026 10:48:03 -0700 Subject: [PATCH 05/10] update tests --- .../apache/comet/CometExpressionSuite.scala | 56 +++++++++---------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala index bf0d3e6a19..97a2b4ce80 100644 --- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala @@ -187,16 +187,14 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { } test("basic data type support") { - // this test requires native_comet scan due to unsigned u8/u16 issue - withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_COMET) { + // this test requires COMET_SCAN_ALLOW_INCOMPATIBLE due to unsigned u8/u16 issue + withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { Seq(true, false).foreach { dictionaryEnabled => withTempDir { dir => val path = new Path(dir.toURI.toString, "test.parquet") makeParquetFileAllPrimitiveTypes(path, dictionaryEnabled = dictionaryEnabled, 10000) - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { - withParquetTable(path.toString, "tbl") { - checkSparkAnswerAndOperator("select * FROM tbl WHERE _2 > 100") - } + withParquetTable(path.toString, "tbl") { + checkSparkAnswerAndOperator("select * FROM tbl WHERE _2 > 100") } } } @@ -204,30 +202,28 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { } test("uint data type support") { - // this test requires native_comet scan due to unsigned u8/u16 issue - withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_COMET) { - Seq(true, false).foreach { dictionaryEnabled => - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { - withTempDir { dir => - val path = new Path(dir.toURI.toString, "testuint.parquet") - makeParquetFileAllPrimitiveTypes( - path, - dictionaryEnabled = dictionaryEnabled, - Byte.MinValue, - Byte.MaxValue) - withParquetTable(path.toString, "tbl") { - val qry = "select _9 from tbl order by _11" - if (usingDataSourceExec(conf)) { - // need to convert the values to unsigned values - val expected = (Byte.MinValue to Byte.MaxValue) - .map(v => { - if (v < 0) Byte.MaxValue.toShort - v else v - }) - .toDF("a") - checkAnswer(sql(qry), expected) - } else { - checkSparkAnswerAndOperator(qry) - } + Seq(true, false).foreach { dictionaryEnabled => + // this test requires COMET_SCAN_ALLOW_INCOMPATIBLE due to unsigned u8/u16 issue + withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { + withTempDir { dir => + val path = new Path(dir.toURI.toString, "testuint.parquet") + makeParquetFileAllPrimitiveTypes( + path, + dictionaryEnabled = dictionaryEnabled, + Byte.MinValue, + Byte.MaxValue) + withParquetTable(path.toString, "tbl") { + val qry = "select _9 from tbl order by _11" + if (usingDataSourceExec(conf)) { + // need to convert the values to unsigned values + val expected = (Byte.MinValue to Byte.MaxValue) + .map(v => { + if (v < 0) Byte.MaxValue.toShort - v else v + }) + .toDF("a") + checkAnswer(sql(qry), expected) + } else { + checkSparkAnswerAndOperator(qry) } } } From ebc608b7be47e215e9465355bd4d0e464f344ca8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 22 Jan 2026 10:49:02 -0700 Subject: [PATCH 06/10] Revert "update tests" This reverts commit e8b6e7a7aa5ea9bc378e213873046d5838315f7e. --- .../apache/comet/CometExpressionSuite.scala | 56 ++++++++++--------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala index 97a2b4ce80..bf0d3e6a19 100644 --- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala @@ -187,14 +187,16 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { } test("basic data type support") { - // this test requires COMET_SCAN_ALLOW_INCOMPATIBLE due to unsigned u8/u16 issue - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { + // this test requires native_comet scan due to unsigned u8/u16 issue + withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_COMET) { Seq(true, false).foreach { dictionaryEnabled => withTempDir { dir => val path = new Path(dir.toURI.toString, "test.parquet") makeParquetFileAllPrimitiveTypes(path, dictionaryEnabled = dictionaryEnabled, 10000) - withParquetTable(path.toString, "tbl") { - checkSparkAnswerAndOperator("select * FROM tbl WHERE _2 > 100") + withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { + withParquetTable(path.toString, "tbl") { + checkSparkAnswerAndOperator("select * FROM tbl WHERE _2 > 100") + } } } } @@ -202,28 +204,30 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { } test("uint data type support") { - Seq(true, false).foreach { dictionaryEnabled => - // this test requires COMET_SCAN_ALLOW_INCOMPATIBLE due to unsigned u8/u16 issue - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { - withTempDir { dir => - val path = new Path(dir.toURI.toString, "testuint.parquet") - makeParquetFileAllPrimitiveTypes( - path, - dictionaryEnabled = dictionaryEnabled, - Byte.MinValue, - Byte.MaxValue) - withParquetTable(path.toString, "tbl") { - val qry = "select _9 from tbl order by _11" - if (usingDataSourceExec(conf)) { - // need to convert the values to unsigned values - val expected = (Byte.MinValue to Byte.MaxValue) - .map(v => { - if (v < 0) Byte.MaxValue.toShort - v else v - }) - .toDF("a") - checkAnswer(sql(qry), expected) - } else { - checkSparkAnswerAndOperator(qry) + // this test requires native_comet scan due to unsigned u8/u16 issue + withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_COMET) { + Seq(true, false).foreach { dictionaryEnabled => + withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { + withTempDir { dir => + val path = new Path(dir.toURI.toString, "testuint.parquet") + makeParquetFileAllPrimitiveTypes( + path, + dictionaryEnabled = dictionaryEnabled, + Byte.MinValue, + Byte.MaxValue) + withParquetTable(path.toString, "tbl") { + val qry = "select _9 from tbl order by _11" + if (usingDataSourceExec(conf)) { + // need to convert the values to unsigned values + val expected = (Byte.MinValue to Byte.MaxValue) + .map(v => { + if (v < 0) Byte.MaxValue.toShort - v else v + }) + .toDF("a") + checkAnswer(sql(qry), expected) + } else { + checkSparkAnswerAndOperator(qry) + } } } } From 8feb1d22d3d31926265411aed42a769ec60fd987 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 22 Jan 2026 11:03:52 -0700 Subject: [PATCH 07/10] improve tests --- .../source/user-guide/latest/compatibility.md | 9 ++--- .../apache/comet/CometExpressionSuite.scala | 38 ++++++------------- 2 files changed, 15 insertions(+), 32 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index 0ca6f8ea97..48c3601390 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -105,7 +105,6 @@ Cast operations in Comet fall into three levels of support: **Notes:** - - **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not - **double -> decimal**: There can be rounding differences - **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 @@ -113,7 +112,7 @@ Cast operations in Comet fall into three levels of support: - **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 - **string -> date**: Only supports years between 262143 BC and 262142 AD - **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) - or strings containing null bytes (e.g \\u0000) +or strings containing null bytes (e.g \\u0000) - **string -> timestamp**: Not all valid formats are supported @@ -140,7 +139,6 @@ Cast operations in Comet fall into three levels of support: **Notes:** - - **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not - **double -> decimal**: There can be rounding differences - **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 @@ -148,7 +146,7 @@ Cast operations in Comet fall into three levels of support: - **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 - **string -> date**: Only supports years between 262143 BC and 262142 AD - **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) - or strings containing null bytes (e.g \\u0000) +or strings containing null bytes (e.g \\u0000) - **string -> timestamp**: Not all valid formats are supported @@ -175,7 +173,6 @@ Cast operations in Comet fall into three levels of support: **Notes:** - - **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not - **double -> decimal**: There can be rounding differences - **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 @@ -183,7 +180,7 @@ Cast operations in Comet fall into three levels of support: - **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 - **string -> date**: Only supports years between 262143 BC and 262142 AD - **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) - or strings containing null bytes (e.g \\u0000) +or strings containing null bytes (e.g \\u0000) - **string -> timestamp**: ANSI mode not supported diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala index bf0d3e6a19..2b1dab4de1 100644 --- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala @@ -193,10 +193,8 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { withTempDir { dir => val path = new Path(dir.toURI.toString, "test.parquet") makeParquetFileAllPrimitiveTypes(path, dictionaryEnabled = dictionaryEnabled, 10000) - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { - withParquetTable(path.toString, "tbl") { - checkSparkAnswerAndOperator("select * FROM tbl WHERE _2 > 100") - } + withParquetTable(path.toString, "tbl") { + checkSparkAnswerAndOperator("select * FROM tbl WHERE _2 > 100") } } } @@ -207,28 +205,16 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { // this test requires native_comet scan due to unsigned u8/u16 issue withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_COMET) { Seq(true, false).foreach { dictionaryEnabled => - withSQLConf(CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key -> "true") { - withTempDir { dir => - val path = new Path(dir.toURI.toString, "testuint.parquet") - makeParquetFileAllPrimitiveTypes( - path, - dictionaryEnabled = dictionaryEnabled, - Byte.MinValue, - Byte.MaxValue) - withParquetTable(path.toString, "tbl") { - val qry = "select _9 from tbl order by _11" - if (usingDataSourceExec(conf)) { - // need to convert the values to unsigned values - val expected = (Byte.MinValue to Byte.MaxValue) - .map(v => { - if (v < 0) Byte.MaxValue.toShort - v else v - }) - .toDF("a") - checkAnswer(sql(qry), expected) - } else { - checkSparkAnswerAndOperator(qry) - } - } + withTempDir { dir => + val path = new Path(dir.toURI.toString, "testuint.parquet") + makeParquetFileAllPrimitiveTypes( + path, + dictionaryEnabled = dictionaryEnabled, + Byte.MinValue, + Byte.MaxValue) + withParquetTable(path.toString, "tbl") { + val qry = "select _9 from tbl order by _11" + checkSparkAnswerAndOperator(qry) } } } From 0ee79d9a29be182f6d5c2b379168f2da5b5c0f5b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 22 Jan 2026 12:01:41 -0700 Subject: [PATCH 08/10] fix --- docs/source/user-guide/latest/compatibility.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index 48c3601390..0ca6f8ea97 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -105,6 +105,7 @@ Cast operations in Comet fall into three levels of support: **Notes:** + - **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not - **double -> decimal**: There can be rounding differences - **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 @@ -112,7 +113,7 @@ Cast operations in Comet fall into three levels of support: - **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 - **string -> date**: Only supports years between 262143 BC and 262142 AD - **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) -or strings containing null bytes (e.g \\u0000) + or strings containing null bytes (e.g \\u0000) - **string -> timestamp**: Not all valid formats are supported @@ -139,6 +140,7 @@ or strings containing null bytes (e.g \\u0000) **Notes:** + - **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not - **double -> decimal**: There can be rounding differences - **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 @@ -146,7 +148,7 @@ or strings containing null bytes (e.g \\u0000) - **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 - **string -> date**: Only supports years between 262143 BC and 262142 AD - **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) -or strings containing null bytes (e.g \\u0000) + or strings containing null bytes (e.g \\u0000) - **string -> timestamp**: Not all valid formats are supported @@ -173,6 +175,7 @@ or strings containing null bytes (e.g \\u0000) **Notes:** + - **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not - **double -> decimal**: There can be rounding differences - **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 @@ -180,7 +183,7 @@ or strings containing null bytes (e.g \\u0000) - **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 - **string -> date**: Only supports years between 262143 BC and 262142 AD - **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) -or strings containing null bytes (e.g \\u0000) + or strings containing null bytes (e.g \\u0000) - **string -> timestamp**: ANSI mode not supported From 8b229018c7f3744529f5078926a6264b0c4c21ef Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 23 Jan 2026 07:26:24 -0700 Subject: [PATCH 09/10] fix: add missing datafusion-datasource dependency The csv_scan.rs file uses types from datafusion_datasource but the dependency was not declared in native/core/Cargo.toml. Co-Authored-By: Claude Opus 4.5 --- native/Cargo.lock | 78 ++++++++++++++++++++++++++++++++++++++++++ native/core/Cargo.toml | 1 + 2 files changed, 79 insertions(+) diff --git a/native/Cargo.lock b/native/Cargo.lock index ce0eb0f2b3..2e53b3c274 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -418,6 +418,23 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "async-compression" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +dependencies = [ + "bzip2 0.5.2", + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + [[package]] name = "async-executor" version = "1.13.3" @@ -1189,6 +1206,34 @@ dependencies = [ "either", ] +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cast" version = "0.3.0" @@ -1784,6 +1829,7 @@ dependencies = [ "datafusion-comet-objectstore-hdfs", "datafusion-comet-proto", "datafusion-comet-spark-expr", + "datafusion-datasource", "datafusion-functions-nested", "datafusion-spark", "futures", @@ -1925,8 +1971,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fde13794244bc7581cd82f6fff217068ed79cdc344cafe4ab2c3a1c3510b38d6" dependencies = [ "arrow", + "async-compression", "async-trait", "bytes", + "bzip2 0.6.1", "chrono", "datafusion-common", "datafusion-common-runtime", @@ -1937,6 +1985,7 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", + "flate2", "futures", "glob", "itertools 0.14.0", @@ -1944,7 +1993,10 @@ dependencies = [ "object_store", "rand 0.9.2", "tokio", + "tokio-util", "url", + "xz2", + "zstd", ] [[package]] @@ -3625,6 +3677,12 @@ dependencies = [ "lexical-util", ] +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" version = "0.2.180" @@ -3754,6 +3812,17 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "md-5" version = "0.10.6" @@ -6573,6 +6642,15 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yoke" version = "0.8.1" diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index 5e30883e35..b13d6d54fd 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -60,6 +60,7 @@ tempfile = "3.24.0" itertools = "0.14.0" paste = "1.0.14" datafusion = { workspace = true, features = ["parquet_encryption", "sql"] } +datafusion-datasource = { workspace = true } datafusion-spark = { workspace = true } once_cell = "1.18.0" regex = { workspace = true } From 194c836e6e521d6e798caef696a9ba07fe0be878 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 23 Jan 2026 09:30:44 -0700 Subject: [PATCH 10/10] test: add variants of data type tests that exclude u8/u16 columns Add duplicate tests for "basic data type support" and "uint data type support" that skip reading _9 (UINT_8) and _10 (UINT_16) columns. This restores test coverage for the default scan implementation which can be overridden in CI. Co-Authored-By: Claude Opus 4.5 --- .../apache/comet/CometExpressionSuite.scala | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala index 78bd2ab0b1..e0a5c43aef 100644 --- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala @@ -201,6 +201,22 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } + test("basic data type support - excluding u8/u16") { + // variant that skips _9 (UINT_8) and _10 (UINT_16) for default scan impl + Seq(true, false).foreach { dictionaryEnabled => + withTempDir { dir => + val path = new Path(dir.toURI.toString, "test.parquet") + makeParquetFileAllPrimitiveTypes(path, dictionaryEnabled = dictionaryEnabled, 10000) + withParquetTable(path.toString, "tbl") { + // select all columns except _9 (UINT_8) and _10 (UINT_16) + checkSparkAnswerAndOperator( + """select _1, _2, _3, _4, _5, _6, _7, _8, _11, _12, _13, _14, _15, _16, _17, + |_18, _19, _20, _21, _id FROM tbl WHERE _2 > 100""".stripMargin) + } + } + } + } + test("uint data type support") { // this test requires native_comet scan due to unsigned u8/u16 issue withSQLConf(CometConf.COMET_NATIVE_SCAN_IMPL.key -> CometConf.SCAN_NATIVE_COMET) { @@ -221,6 +237,24 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } + test("uint data type support - excluding u8/u16") { + // variant that tests UINT_32 and UINT_64, skipping _9 (UINT_8) and _10 (UINT_16) + Seq(true, false).foreach { dictionaryEnabled => + withTempDir { dir => + val path = new Path(dir.toURI.toString, "testuint.parquet") + makeParquetFileAllPrimitiveTypes( + path, + dictionaryEnabled = dictionaryEnabled, + Byte.MinValue, + Byte.MaxValue) + withParquetTable(path.toString, "tbl") { + // test UINT_32 (_11) and UINT_64 (_12) only + checkSparkAnswerAndOperator("select _11, _12 from tbl order by _11") + } + } + } + } + test("null literals") { val batchSize = 1000 Seq(true, false).foreach { dictionaryEnabled =>