Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion cpp/velox/substrait/SubstraitToVeloxPlan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1495,6 +1495,31 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait::
// The columns present in the table; if not available, default to the baseSchema.
auto tableSchema = splitInfo->tableSchema ? splitInfo->tableSchema : baseSchema;

// Build dataColumns from tableSchema, excluding partition columns.
// HiveTableHandle::dataColumns() is used as fileSchema for the reader.
// Partition columns should not be validated against the file's physical types
// (their values come from the partition path, not from the file).
std::unordered_set<std::string> partitionColNames;
for (int idx = 0; idx < colNameList.size(); idx++) {
if (columnTypes[idx] == ColumnType::kPartitionKey) {
partitionColNames.insert(colNameList[idx]);
}
}
RowTypePtr dataColumns;
if (partitionColNames.empty()) {
dataColumns = tableSchema;
} else {
std::vector<std::string> dataColNames;
std::vector<TypePtr> dataColTypes;
for (int idx = 0; idx < tableSchema->size(); idx++) {
if (partitionColNames.find(tableSchema->nameOf(idx)) == partitionColNames.end()) {
dataColNames.push_back(tableSchema->nameOf(idx));
dataColTypes.push_back(tableSchema->childAt(idx));
}
}
dataColumns = ROW(std::move(dataColNames), std::move(dataColTypes));
}

connector::ConnectorTableHandlePtr tableHandle;
auto remainingFilter = readRel.has_filter() ? exprConverter_->toVeloxExpr(readRel.filter(), baseSchema) : nullptr;
auto connectorId = kHiveConnectorId;
Expand All @@ -1506,7 +1531,7 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait::
}
common::SubfieldFilters subfieldFilters;
tableHandle = std::make_shared<connector::hive::HiveTableHandle>(
connectorId, "hive_table", std::move(subfieldFilters), remainingFilter, tableSchema);
connectorId, "hive_table", std::move(subfieldFilters), remainingFilter, dataColumns);

// Get assignments and out names.
std::vector<std::string> outNames;
Expand Down
4 changes: 2 additions & 2 deletions ep/build-velox/src/get-velox.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
set -exu

CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd)
VELOX_REPO=https://github.com/IBM/velox.git
VELOX_BRANCH=dft-2026_03_08-iceberg
VELOX_REPO=https://github.com/baibaichen/velox.git
VELOX_BRANCH=pr2/fix-parquet-thrift-spark18108
VELOX_ENHANCED_BRANCH=ibm-2026_03_08
VELOX_HOME=""
RUN_SETUP_SCRIPT=ON
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Velox parquet reader does not allow offset zero.
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
enableSuite[GlutenParquetProtobufCompatibilitySuite]
Expand All @@ -463,6 +464,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV2QuerySuite]
// Unsupported: spark.sql.files.ignoreCorruptFiles.
.exclude("Enabling/disabling ignoreCorruptFiles")
Expand All @@ -471,6 +473,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV1SchemaPruningSuite]
enableSuite[GlutenParquetV2SchemaPruningSuite]
enableSuite[GlutenParquetRebaseDatetimeV1Suite]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Velox parquet reader does not allow offset zero.
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
enableSuite[GlutenParquetProtobufCompatibilitySuite]
Expand All @@ -468,6 +469,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV2QuerySuite]
.exclude("row group skipping doesn't overflow when reading into larger type")
// Unsupported: spark.sql.files.ignoreCorruptFiles.
Expand All @@ -477,6 +479,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV1SchemaPruningSuite]
enableSuite[GlutenParquetV2SchemaPruningSuite]
enableSuite[GlutenParquetRebaseDatetimeV1Suite]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-35640: int as long should throw schema incompatible error")
// Velox parquet reader does not allow offset zero.
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
enableSuite[GlutenParquetProtobufCompatibilitySuite]
Expand All @@ -411,6 +412,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV2QuerySuite]
.exclude("row group skipping doesn't overflow when reading into larger type")
// Unsupported: spark.sql.files.ignoreCorruptFiles.
Expand All @@ -420,6 +422,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV1SchemaPruningSuite]
enableSuite[GlutenParquetV2SchemaPruningSuite]
enableSuite[GlutenParquetRebaseDatetimeV1Suite]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -363,11 +363,9 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("unsupported parquet conversion IntegerType -> DecimalType(10,1)")
.exclude("unsupported parquet conversion IntegerType -> DecimalType(5,0)")
.exclude("unsupported parquet conversion IntegerType -> DecimalType(9,0)")
.exclude("unsupported parquet conversion LongType -> DateType")
.exclude("unsupported parquet conversion LongType -> DecimalType(10,0)")
.exclude("unsupported parquet conversion LongType -> DecimalType(19,0)")
.exclude("unsupported parquet conversion LongType -> DecimalType(20,1)")
.exclude("unsupported parquet conversion LongType -> IntegerType")
.exclude("unsupported parquet conversion ShortType -> DecimalType(3,0)")
.exclude("unsupported parquet conversion ShortType -> DecimalType(4,0)")
.exclude("unsupported parquet conversion ShortType -> DecimalType(5,0)")
Expand All @@ -379,13 +377,15 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("parquet widening conversion IntegerType -> DecimalType(20,0)")
.exclude("parquet widening conversion IntegerType -> DecimalType(38,0)")
.exclude("parquet widening conversion IntegerType -> DoubleType")
.exclude("parquet widening conversion IntegerType -> ShortType")
.exclude("parquet widening conversion LongType -> DecimalType(20,0)")
.exclude("parquet widening conversion LongType -> DecimalType(21,1)")
.exclude("parquet widening conversion LongType -> DecimalType(38,0)")
.exclude("parquet widening conversion ShortType -> DecimalType(11,1)")
.exclude("parquet widening conversion ShortType -> DecimalType(20,0)")
.exclude("parquet widening conversion ShortType -> DecimalType(38,0)")
.exclude("parquet widening conversion ShortType -> DoubleType")
.exclude("parquet decimal type change IntegerType -> ShortType overflows")
enableSuite[GlutenParquetVariantShreddingSuite]
// Generated suites for org.apache.spark.sql.execution.datasources.text
// TODO: 4.x enableSuite[GlutenWholeTextFileV1Suite] // 1 failure
Expand Down Expand Up @@ -578,6 +578,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-40128 read DELTA_LENGTH_BYTE_ARRAY encoded strings")
// TODO: fix in Spark-4.0
.exclude("explode nested lists crossing a rowgroup boundary")
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
enableSuite[GlutenParquetProtobufCompatibilitySuite]
Expand All @@ -593,6 +594,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV2QuerySuite]
.exclude("row group skipping doesn't overflow when reading into larger type")
// Unsupported: spark.sql.files.ignoreCorruptFiles.
Expand All @@ -602,6 +604,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV1SchemaPruningSuite]
enableSuite[GlutenParquetV2SchemaPruningSuite]
enableSuite[GlutenParquetRebaseDatetimeV1Suite]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -374,11 +374,9 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("unsupported parquet conversion IntegerType -> DecimalType(10,1)")
.exclude("unsupported parquet conversion IntegerType -> DecimalType(5,0)")
.exclude("unsupported parquet conversion IntegerType -> DecimalType(9,0)")
.exclude("unsupported parquet conversion LongType -> DateType")
.exclude("unsupported parquet conversion LongType -> DecimalType(10,0)")
.exclude("unsupported parquet conversion LongType -> DecimalType(19,0)")
.exclude("unsupported parquet conversion LongType -> DecimalType(20,1)")
.exclude("unsupported parquet conversion LongType -> IntegerType")
.exclude("unsupported parquet conversion ShortType -> DecimalType(3,0)")
.exclude("unsupported parquet conversion ShortType -> DecimalType(4,0)")
.exclude("unsupported parquet conversion ShortType -> DecimalType(5,0)")
Expand All @@ -390,13 +388,15 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("parquet widening conversion IntegerType -> DecimalType(20,0)")
.exclude("parquet widening conversion IntegerType -> DecimalType(38,0)")
.exclude("parquet widening conversion IntegerType -> DoubleType")
.exclude("parquet widening conversion IntegerType -> ShortType")
.exclude("parquet widening conversion LongType -> DecimalType(20,0)")
.exclude("parquet widening conversion LongType -> DecimalType(21,1)")
.exclude("parquet widening conversion LongType -> DecimalType(38,0)")
.exclude("parquet widening conversion ShortType -> DecimalType(11,1)")
.exclude("parquet widening conversion ShortType -> DecimalType(20,0)")
.exclude("parquet widening conversion ShortType -> DecimalType(38,0)")
.exclude("parquet widening conversion ShortType -> DoubleType")
.exclude("parquet decimal type change IntegerType -> ShortType overflows")
// TODO: 4.x enableSuite[GlutenParquetVariantShreddingSuite] // 1 failure
// Generated suites for org.apache.spark.sql.execution.datasources.text
// TODO: 4.x enableSuite[GlutenWholeTextFileV1Suite] // 1 failure
Expand Down Expand Up @@ -539,6 +539,7 @@ class VeloxTestSettings extends BackendTestSettings {
.excludeByPrefix("SPARK-53535") // see https://issues.apache.org/jira/browse/SPARK-53535
.excludeByPrefix("vectorized reader: missing all struct fields")
.excludeByPrefix("SPARK-54220") // https://issues.apache.org/jira/browse/SPARK-54220
.exclude("SPARK-34817: Read UINT_64 as Decimal from parquet")
enableSuite[GlutenParquetV1PartitionDiscoverySuite]
enableSuite[GlutenParquetV2PartitionDiscoverySuite]
enableSuite[GlutenParquetProtobufCompatibilitySuite]
Expand All @@ -554,6 +555,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV2QuerySuite]
.exclude("row group skipping doesn't overflow when reading into larger type")
// Unsupported: spark.sql.files.ignoreCorruptFiles.
Expand All @@ -563,6 +565,7 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite because the filter after datasource is not needed.
.exclude(
"SPARK-26677: negated null-safe equality comparison should not filter matched row groups")
.exclude("SPARK-16632: read Parquet int32 as ByteType and ShortType")
enableSuite[GlutenParquetV1SchemaPruningSuite]
enableSuite[GlutenParquetV2SchemaPruningSuite]
enableSuite[GlutenParquetRebaseDatetimeV1Suite]
Expand Down
Loading