Skip to content

Commit 880be2f

Browse files
authored
feat: deterministic metadata encoding (#7437)
* Apply ordering to metadata keys before encoding
* Add unit test to ensure consistent metadata encoding
* Clippy warning
1 parent 2b3a821 commit 880be2f

File tree

2 files changed

+51
-3
lines changed

2 files changed

+51
-3
lines changed

arrow-ipc/src/convert.rs

+6 -3
Original file line number | Diff line number | Diff line change
@@ -138,9 +138,12 @@ pub fn metadata_to_fb<'a>(
138138
fbb: &mut FlatBufferBuilder<'a>,
139139
metadata: &HashMap<String, String>,
140140
) -> WIPOffset<Vector<'a, ForwardsUOffset<KeyValue<'a>>>> {
141-
let custom_metadata = metadata
142-
.iter()
143-
.map(|(k, v)| {
141+
let mut ordered_keys = metadata.keys().collect::<Vec<_>>();
142+
ordered_keys.sort();
143+
let custom_metadata = ordered_keys
144+
.into_iter()
145+
.map(|k| {
146+
let v = metadata.get(k).unwrap();
144147
let fb_key_name = fbb.create_string(k);
145148
let fb_val_name = fbb.create_string(v);
146149

arrow-ipc/src/writer.rs

+45
Original file line number | Diff line number | Diff line change
@@ -1854,6 +1854,7 @@ fn pad_to_alignment(alignment: u8, len: usize) -> usize {
18541854

18551855
#[cfg(test)]
18561856
mod tests {
1857+
use std::hash::Hasher;
18571858
use std::io::Cursor;
18581859
use std::io::Seek;
18591860

@@ -3306,4 +3307,48 @@ mod tests {
33063307

33073308
Ok(())
33083309
}
3310+
3311+
#[test]
3312+
fn test_metadata_encoding_ordering() {
3313+
fn create_hash() -> u64 {
3314+
let metadata: HashMap<String, String> = [
3315+
("a", "1"), //
3316+
("b", "2"), //
3317+
("c", "3"), //
3318+
("d", "4"), //
3319+
("e", "5"), //
3320+
]
3321+
.into_iter()
3322+
.map(|(k, v)| (k.to_owned(), v.to_owned()))
3323+
.collect();
3324+
3325+
// Set metadata on both the schema and a field within it.
3326+
let schema = Arc::new(
3327+
Schema::new(vec![
3328+
Field::new("a", DataType::Int64, true).with_metadata(metadata.clone())
3329+
])
3330+
.with_metadata(metadata)
3331+
.clone(),
3332+
);
3333+
let batch = RecordBatch::new_empty(schema.clone());
3334+
3335+
let mut bytes = Vec::new();
3336+
let mut w = StreamWriter::try_new(&mut bytes, batch.schema_ref()).unwrap();
3337+
w.write(&batch).unwrap();
3338+
w.finish().unwrap();
3339+
3340+
let mut h = std::hash::DefaultHasher::new();
3341+
h.write(&bytes);
3342+
h.finish()
3343+
}
3344+
3345+
let expected = create_hash();
3346+
3347+
// Since there is randomness in the HashMap and we cannot specify our
3348+
// own Hasher for the implementation used for metadata, run the above
3349+
// code 20x and verify it does not change. This is not perfect but it
3350+
// should be good enough.
3351+
let all_passed = (0..20).all(|_| create_hash() == expected);
3352+
assert!(all_passed);
3353+
}
33093354
}

0 commit comments

Comments
 (0)