Commit 9ec9993

Store hash inside parquet file
1 parent 01b3492 commit 9ec9993

File tree

2 files changed: +60 -10 lines changed


audformat/core/table.py

Lines changed: 32 additions & 0 deletions
@@ -1099,6 +1099,38 @@ def _save_csv(self, path: str):
 
     def _save_parquet(self, path: str):
         table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False)
+
+        # Add hash of dataframe
+        # to the metadata,
+        # which pyarrow stores inside the schema.
+        # See https://stackoverflow.com/a/58978449
+        try:
+            metadata = {"hash": utils.hash(self.df)}
+        except TypeError:
+            # Levels/columns with dtype "object" might not be hashable,
+            # e.g. when storing numpy arrays.
+            # We convert them to strings in this case.
+            #
+            # Index
+            df = self.df.copy()
+            update_index_dtypes = {
+                level: "string"
+                for level, dtype in self._levels_and_dtypes.items()
+                if dtype == define.DataType.OBJECT
+            }
+            df.index = utils.set_index_dtypes(df.index, update_index_dtypes)
+            # Columns
+            for column_id, column in self.columns.items():
+                if column.scheme_id is not None:
+                    scheme = self.db.schemes[column.scheme_id]
+                    if scheme.dtype == define.DataType.OBJECT:
+                        df[column_id] = df[column_id].astype("string")
+                else:
+                    # No scheme defaults to `object` dtype
+                    df[column_id] = df[column_id].astype("string")
+            metadata = {"hash": utils.hash(df)}
+
+        table = table.replace_schema_metadata({**metadata, **table.schema.metadata})
         parquet.write_table(table, path, compression="snappy")
 
     def _save_pickled(self, path: str):
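The added code places the hash in the Arrow schema metadata, which pyarrow serializes into the Parquet file footer. Two details are worth noting: `replace_schema_metadata()` discards any existing metadata, which is why the new `"hash"` entry is merged with `table.schema.metadata` (the latter also carries the metadata pandas attaches to the schema); and the hash can later be read back without loading any table data, as the updated test below does via `parquet.read_schema()`. A minimal sketch of reading it back (the file name is hypothetical):

import pyarrow.parquet as parquet

# Only the schema is read here; pyarrow keeps schema metadata
# in the Parquet file footer, so no table data is loaded.
schema = parquet.read_schema("db.files.parquet")  # hypothetical file name

# pyarrow stores metadata keys and values as bytes
table_hash = schema.metadata[b"hash"].decode()
print(table_hash)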

tests/test_table.py

Lines changed: 28 additions & 10 deletions
@@ -1,9 +1,11 @@
 import os
+import random
 import re
 import typing
 
 import numpy as np
 import pandas as pd
+import pyarrow.parquet as parquet
 import pytest
 
 import audeer
@@ -1209,32 +1211,48 @@ def test_map(table, map):
 
 
 @pytest.mark.parametrize(
-    "table_id, expected_md5sum",
+    "table_id, expected_hash",
     [
-        ("files", "a856aef8ec9d5e4b1752a13ad68cc0c2"),
+        ("files", "-4778271914368537359"),
+        ("segments", "6154135801036965154"),
+        ("misc", "8941499293930597709"),
     ],
 )
-def test_parquet_reproducibility(tmpdir, table_id, expected_md5sum):
+def test_parquet_reproducibility(tmpdir, table_id, expected_hash):
     r"""Test reproducibility of binary PARQUET files.
 
     When storing the same dataframe
     to different PARQUET files,
-    those files should have an identical
-    MD5sum,
-    which should also be reproducible
-    across different pandas and pyarrow versions.
+    the files will vary slightly
+    and have different MD5 sums.
+
+    To provide a reproducible hash,
+    in order to judge if a table has changed,
+    we calculate the hash of the table
+    and store it in the metadata
+    of the schema
+    of the table.
 
     """
+    random.seed(1)  # ensure the same random table values are created
     db = audformat.testing.create_db()
+
+    # Check that the output of audformat.utils.hash() does not change
+    assert audformat.utils.hash(db[table_id].df) == expected_hash
+
+    # Write to PARQUET file and check if correct hash is stored
     path_wo_ext = audeer.path(tmpdir, table_id)
     path = f"{path_wo_ext}.parquet"
     db[table_id].save(path_wo_ext, storage_format="parquet")
-    assert audeer.md5(path) == expected_md5sum
-    # Repeat writing after loading table
+    metadata = parquet.read_schema(path).metadata
+    assert metadata[b"hash"].decode() == expected_hash
+
+    # Load table from PARQUET file and overwrite it
     db[table_id].load(path_wo_ext)
     os.remove(path)
     db[table_id].save(path_wo_ext, storage_format="parquet")
-    assert audeer.md5(path) == expected_md5sum
+    metadata = parquet.read_schema(path).metadata
+    assert metadata[b"hash"].decode() == expected_hash
 
 
 @pytest.mark.parametrize(
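The `TypeError` fallback in `_save_parquet()` above exists because pandas cannot hash object columns whose cells hold unhashable values such as numpy arrays; casting them to `"string"` first makes hashing possible. A minimal sketch of that failure mode, assuming `audformat.utils.hash()` builds on `pandas.util.hash_pandas_object()` (an assumption; the commit only shows that hashing such data raises `TypeError`):

import numpy as np
import pandas as pd

# An object column holding numpy arrays is not hashable
df = pd.DataFrame({"embedding": [np.zeros(2), np.ones(2)]})

try:
    pd.util.hash_pandas_object(df)
except TypeError:
    # Fall back to strings, mirroring Table._save_parquet():
    # "[0. 0.]" is hashable, the array itself is not
    df["embedding"] = df["embedding"].astype("string")

# Per-row uint64 hashes; audformat.utils.hash() presumably reduces
# them to the single integer stored in the file (an assumption,
# the reduction itself is not part of this commit)
print(pd.util.hash_pandas_object(df))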
