Skip to content

Commit 9fc2fbb

Browse files
alamb, rok, and adamreeve
authored
[57_maintenance] [Parquet] Provide only encrypted column stats in plaintext footer (#8305) (#9310)
- Part of #9240
- Related to #8304

This is a backport of the following PR to the 57 line:
- #8305 from @rok

Co-authored-by: Rok Mihevc <rok@mihevc.org>
Co-authored-by: Adam Reeve <adreeve@gmail.com>
1 parent 3df3157 commit 9fc2fbb

File tree

5 files changed

+331
-73
lines changed

5 files changed

+331
-73
lines changed

parquet/src/file/metadata/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -836,6 +836,11 @@ pub struct ColumnChunkMetaData {
836836
column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
837837
#[cfg(feature = "encryption")]
838838
encrypted_column_metadata: Option<Vec<u8>>,
839+
/// When true, indicates the footer is plaintext (not encrypted).
840+
/// This affects how column metadata is serialized when `encrypted_column_metadata` is present.
841+
/// This field is only used at write time and is not needed when reading metadata.
842+
#[cfg(feature = "encryption")]
843+
plaintext_footer_mode: bool,
839844
}
840845

841846
/// Histograms for repetition and definition levels.
@@ -1238,6 +1243,8 @@ impl ColumnChunkMetaDataBuilder {
12381243
column_crypto_metadata: None,
12391244
#[cfg(feature = "encryption")]
12401245
encrypted_column_metadata: None,
1246+
#[cfg(feature = "encryption")]
1247+
plaintext_footer_mode: false,
12411248
})
12421249
}
12431250

parquet/src/file/metadata/thrift/encryption.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,10 +145,18 @@ fn row_group_from_encrypted_thrift(
145145
}
146146
Some(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(crypto_metadata)) => {
147147
let column_name = crypto_metadata.path_in_schema.join(".");
148-
decryptor.get_column_metadata_decryptor(
148+
// Try to get the decryptor - if it fails, we don't have the key
149+
match decryptor.get_column_metadata_decryptor(
149150
column_name.as_str(),
150151
crypto_metadata.key_metadata.as_deref(),
151-
)?
152+
) {
153+
Ok(dec) => dec,
154+
Err(_) => {
155+
// We don't have the key for this column, so we can't decrypt its metadata.
156+
columns.push(c);
157+
continue;
158+
}
159+
}
152160
}
153161
Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) => {
154162
decryptor.get_footer_decryptor()?

parquet/src/file/metadata/thrift/mod.rs

Lines changed: 66 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1281,6 +1281,19 @@ impl PageHeader {
12811281
/////////////////////////////////////////////////
12821282
// helper functions for writing file meta data
12831283

1284+
#[cfg(feature = "encryption")]
1285+
fn should_write_column_stats(column_chunk: &ColumnChunkMetaData) -> bool {
1286+
// If there is encrypted column metadata present,
1287+
// the column is encrypted with a different key to the footer or a plaintext footer is used,
1288+
// so the statistics are sensitive and shouldn't be written.
1289+
column_chunk.encrypted_column_metadata.is_none()
1290+
}
1291+
1292+
#[cfg(not(feature = "encryption"))]
1293+
fn should_write_column_stats(_column_chunk: &ColumnChunkMetaData) -> bool {
1294+
true
1295+
}
1296+
12841297
// serialize the bits of the column chunk needed for a thrift ColumnMetaData
12851298
// struct ColumnMetaData {
12861299
// 1: required Type type
@@ -1331,48 +1344,51 @@ pub(super) fn serialize_column_meta_data<W: Write>(
13311344
if let Some(dictionary_page_offset) = column_chunk.dictionary_page_offset {
13321345
last_field_id = dictionary_page_offset.write_thrift_field(w, 11, last_field_id)?;
13331346
}
1334-
// PageStatistics is the same as thrift Statistics, but writable
1335-
let stats = page_stats_to_thrift(column_chunk.statistics());
1336-
if let Some(stats) = stats {
1337-
last_field_id = stats.write_thrift_field(w, 12, last_field_id)?;
1338-
}
1339-
if let Some(page_encoding_stats) = column_chunk.page_encoding_stats() {
1340-
last_field_id = page_encoding_stats.write_thrift_field(w, 13, last_field_id)?;
1341-
}
1342-
if let Some(bloom_filter_offset) = column_chunk.bloom_filter_offset {
1343-
last_field_id = bloom_filter_offset.write_thrift_field(w, 14, last_field_id)?;
1344-
}
1345-
if let Some(bloom_filter_length) = column_chunk.bloom_filter_length {
1346-
last_field_id = bloom_filter_length.write_thrift_field(w, 15, last_field_id)?;
1347-
}
13481347

1349-
// SizeStatistics
1350-
let size_stats = if column_chunk.unencoded_byte_array_data_bytes.is_some()
1351-
|| column_chunk.repetition_level_histogram.is_some()
1352-
|| column_chunk.definition_level_histogram.is_some()
1353-
{
1354-
let repetition_level_histogram = column_chunk
1355-
.repetition_level_histogram()
1356-
.map(|hist| hist.clone().into_inner());
1357-
1358-
let definition_level_histogram = column_chunk
1359-
.definition_level_histogram()
1360-
.map(|hist| hist.clone().into_inner());
1361-
1362-
Some(SizeStatistics {
1363-
unencoded_byte_array_data_bytes: column_chunk.unencoded_byte_array_data_bytes,
1364-
repetition_level_histogram,
1365-
definition_level_histogram,
1366-
})
1367-
} else {
1368-
None
1369-
};
1370-
if let Some(size_stats) = size_stats {
1371-
last_field_id = size_stats.write_thrift_field(w, 16, last_field_id)?;
1372-
}
1348+
if should_write_column_stats(column_chunk) {
1349+
// PageStatistics is the same as thrift Statistics, but writable
1350+
let stats = page_stats_to_thrift(column_chunk.statistics());
1351+
if let Some(stats) = stats {
1352+
last_field_id = stats.write_thrift_field(w, 12, last_field_id)?;
1353+
}
1354+
if let Some(page_encoding_stats) = column_chunk.page_encoding_stats() {
1355+
last_field_id = page_encoding_stats.write_thrift_field(w, 13, last_field_id)?;
1356+
}
1357+
if let Some(bloom_filter_offset) = column_chunk.bloom_filter_offset {
1358+
last_field_id = bloom_filter_offset.write_thrift_field(w, 14, last_field_id)?;
1359+
}
1360+
if let Some(bloom_filter_length) = column_chunk.bloom_filter_length {
1361+
last_field_id = bloom_filter_length.write_thrift_field(w, 15, last_field_id)?;
1362+
}
13731363

1374-
if let Some(geo_stats) = column_chunk.geo_statistics() {
1375-
geo_stats.write_thrift_field(w, 17, last_field_id)?;
1364+
// SizeStatistics
1365+
let size_stats = if column_chunk.unencoded_byte_array_data_bytes.is_some()
1366+
|| column_chunk.repetition_level_histogram.is_some()
1367+
|| column_chunk.definition_level_histogram.is_some()
1368+
{
1369+
let repetition_level_histogram = column_chunk
1370+
.repetition_level_histogram()
1371+
.map(|hist| hist.clone().into_inner());
1372+
1373+
let definition_level_histogram = column_chunk
1374+
.definition_level_histogram()
1375+
.map(|hist| hist.clone().into_inner());
1376+
1377+
Some(SizeStatistics {
1378+
unencoded_byte_array_data_bytes: column_chunk.unencoded_byte_array_data_bytes,
1379+
repetition_level_histogram,
1380+
definition_level_histogram,
1381+
})
1382+
} else {
1383+
None
1384+
};
1385+
if let Some(size_stats) = size_stats {
1386+
last_field_id = size_stats.write_thrift_field(w, 16, last_field_id)?;
1387+
}
1388+
1389+
if let Some(geo_stats) = column_chunk.geo_statistics() {
1390+
geo_stats.write_thrift_field(w, 17, last_field_id)?;
1391+
}
13761392
}
13771393

13781394
w.write_struct_end()
@@ -1592,17 +1608,17 @@ impl WriteThrift for ColumnChunkMetaData {
15921608
.write_thrift_field(writer, 2, last_field_id)?;
15931609

15941610
#[cfg(feature = "encryption")]
1595-
{
1596-
// only write the ColumnMetaData if we haven't already encrypted it
1597-
if self.encrypted_column_metadata.is_none() {
1598-
writer.write_field_begin(FieldType::Struct, 3, last_field_id)?;
1599-
serialize_column_meta_data(self, writer)?;
1600-
last_field_id = 3;
1601-
}
1602-
}
1611+
let write_meta_data =
1612+
self.encrypted_column_metadata.is_none() || self.plaintext_footer_mode;
16031613
#[cfg(not(feature = "encryption"))]
1604-
{
1605-
// always write the ColumnMetaData
1614+
let write_meta_data = true;
1615+
1616+
// When the footer is encrypted and encrypted_column_metadata is present,
1617+
// skip writing the plaintext meta_data field to reduce footer size.
1618+
// When the footer is plaintext (plaintext_footer_mode=true), we still write
1619+
// meta_data for backward compatibility with readers that expect it, but with
1620+
// sensitive fields (statistics, bloom filter info, etc.) stripped out.
1621+
if write_meta_data {
16061622
writer.write_field_begin(FieldType::Struct, 3, last_field_id)?;
16071623
serialize_column_meta_data(self, writer)?;
16081624
last_field_id = 3;

parquet/src/file/metadata/writer.rs

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -818,34 +818,48 @@ impl MetadataObjectWriter {
818818
) -> Result<ColumnChunkMetaData> {
819819
// Column crypto metadata should have already been set when the column was created.
820820
// Here we apply the encryption by encrypting the column metadata if required.
821-
match column_chunk.column_crypto_metadata.as_deref() {
822-
None => {}
821+
let encryptor = match column_chunk.column_crypto_metadata.as_deref() {
822+
None => None,
823823
Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) => {
824+
let is_footer_encrypted = file_encryptor.properties().encrypt_footer();
825+
824826
// When uniform encryption is used the footer is already encrypted,
825827
// so the column chunk does not need additional encryption.
828+
// Except if we're in plaintext footer mode, then we need to encrypt
829+
// the column metadata here.
830+
if !is_footer_encrypted {
831+
Some(file_encryptor.get_footer_encryptor()?)
832+
} else {
833+
None
834+
}
826835
}
827836
Some(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(col_key)) => {
828-
use crate::file::metadata::thrift::serialize_column_meta_data;
829-
830837
let column_path = col_key.path_in_schema.join(".");
831-
let mut column_encryptor = file_encryptor.get_column_encryptor(&column_path)?;
832-
let aad = create_module_aad(
833-
file_encryptor.file_aad(),
834-
ModuleType::ColumnMetaData,
835-
row_group_index,
836-
column_index,
837-
None,
838-
)?;
839-
// create temp ColumnMetaData that we can encrypt
840-
let mut buffer: Vec<u8> = vec![];
841-
{
842-
let mut prot = ThriftCompactOutputProtocol::new(&mut buffer);
843-
serialize_column_meta_data(&column_chunk, &mut prot)?;
844-
}
845-
let ciphertext = column_encryptor.encrypt(&buffer, &aad)?;
838+
Some(file_encryptor.get_column_encryptor(&column_path)?)
839+
}
840+
};
841+
842+
if let Some(mut encryptor) = encryptor {
843+
use crate::file::metadata::thrift::serialize_column_meta_data;
846844

847-
column_chunk.encrypted_column_metadata = Some(ciphertext);
845+
let aad = create_module_aad(
846+
file_encryptor.file_aad(),
847+
ModuleType::ColumnMetaData,
848+
row_group_index,
849+
column_index,
850+
None,
851+
)?;
852+
// create temp ColumnMetaData that we can encrypt
853+
let mut buffer: Vec<u8> = vec![];
854+
{
855+
let mut prot = ThriftCompactOutputProtocol::new(&mut buffer);
856+
serialize_column_meta_data(&column_chunk, &mut prot)?;
848857
}
858+
let ciphertext = encryptor.encrypt(&buffer, &aad)?;
859+
column_chunk.encrypted_column_metadata = Some(ciphertext);
860+
// Track whether the footer is plaintext, which affects how we serialize
861+
// the column metadata (we need to write stripped metadata for backward compatibility)
862+
column_chunk.plaintext_footer_mode = !file_encryptor.properties().encrypt_footer();
849863
}
850864

851865
Ok(column_chunk)

0 commit comments

Comments
 (0)