Describe the bug, including details regarding any error messages, version, and platform.
Fails with:
Cannot decrypt ColumnMetadata. FileDecryption is not setup correctly
This is using a plaintext footer.
Reproducer:
import os
import pyarrow.parquet.encryption as pe
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow as pa
import base64
import polars as pl


class KmsClient(pe.KmsClient):
    # Toy KMS client: "wrapping" a key is just base64-encoding it.
    def unwrap_key(self, wrapped_key, master_key_identifier):
        return base64.b64decode(wrapped_key)

    def wrap_key(self, key_bytes, master_key_identifier):
        return base64.b64encode(key_bytes)


def write(location):
    # Write a hive-partitioned dataset with "col2" encrypted, collecting the
    # per-file metadata so a _metadata file can be written afterwards.
    cf = pe.CryptoFactory(lambda *a, **k: KmsClient())
    df = pl.DataFrame({
        "col1": [1, 2, 3],
        "col2": [1, 2, 3],
        "year": [2020, 2020, 2021]
    })
    ecfg = pe.EncryptionConfiguration(
        footer_key="TEST",
        column_keys={
            "TEST": ["col2"]
        },
        double_wrapping=False,
        plaintext_footer=False,
    )
    table = df.to_arrow()
    parquet_encryption_cfg = ds.ParquetEncryptionConfig(
        cf, pe.KmsConnectionConfig(), ecfg
    )
    metadata_collector = []
    pq.write_to_dataset(
        table,
        location,
        partitioning=ds.partitioning(
            schema=pa.schema([
                pa.field("year", pa.int16())
            ]),
            flavor="hive"
        ),
        encryption_config=parquet_encryption_cfg,
        metadata_collector=metadata_collector
    )
    pq.write_metadata(
        pa.schema(
            field
            for field in table.schema
            if field.name != "year"
        ),
        os.path.join(location, "_metadata"),
        metadata_collector
    )


def read(location):
    # Reading the dataset back through the _metadata file fails with the error above.
    decryption_config = pe.DecryptionConfiguration(cache_lifetime=300)
    kms_connection_config = pe.KmsConnectionConfig()
    cf = pe.CryptoFactory(lambda *a, **k: KmsClient())
    parquet_decryption_cfg = ds.ParquetDecryptionConfig(
        cf, kms_connection_config, decryption_config
    )
    decryption_properties = cf.file_decryption_properties(
        kms_connection_config, decryption_config)
    pq_scan_opts = ds.ParquetFragmentScanOptions(
        decryption_config=parquet_decryption_cfg,
        # If using a build from master:
        # decryption_properties=decryption_properties
    )
    pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts)
    dataset = ds.parquet_dataset(
        os.path.join(location, "_metadata"),
        format=pformat,
        partitioning=ds.partitioning(
            schema=pa.schema([
                pa.field("year", pa.int16())
            ]),
            flavor="hive"
        )
    )
    print(dataset.to_table())


if __name__ == '__main__':
    location = r"/tmp/dataset-test"
    os.makedirs(location, exist_ok=True)
    write(location)
    read(location)

Presumably the metadata read from the _metadata file is not decrypted, or the footer incorrectly indicates whether it is encrypted.
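One way to narrow this down (a sketch that is not part of the reproducer above; the read_one_file_directly helper and the year=*/*.parquet glob are assumptions about write_to_dataset's default file layout): read a single partition file directly through pq.ParquetFile with FileDecryptionProperties, bypassing the _metadata path entirely.

import base64
import glob
import os

import pyarrow.parquet as pq
import pyarrow.parquet.encryption as pe


class KmsClient(pe.KmsClient):
    # Same toy client as in the reproducer: keys are only base64-encoded.
    def unwrap_key(self, wrapped_key, master_key_identifier):
        return base64.b64decode(wrapped_key)

    def wrap_key(self, key_bytes, master_key_identifier):
        return base64.b64encode(key_bytes)


def read_one_file_directly(location):
    # Build FileDecryptionProperties the same way the reproducer does.
    cf = pe.CryptoFactory(lambda *a, **k: KmsClient())
    decryption_properties = cf.file_decryption_properties(
        pe.KmsConnectionConfig(), pe.DecryptionConfiguration(cache_lifetime=300)
    )
    # Pick the first data file under the hive partitions
    # (assumes write_to_dataset's default "year=.../*.parquet" naming).
    data_file = glob.glob(os.path.join(location, "year=*", "*.parquet"))[0]
    pf = pq.ParquetFile(data_file, decryption_properties=decryption_properties)
    print(pf.metadata)  # per-file footer and column metadata
    print(pf.read())    # decrypts col2 via the toy KMS client

If this direct read succeeds while ds.parquet_dataset(.../_metadata) raises the error above, the failure is specific to how the footer stored in _metadata is handled; if it also fails, the decryption setup itself is at fault.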
Tried with the latest master, which contains:
bd44410
Component(s)
C++, Python