@@ -1,9 +1,11 @@
 import os
+import random
 import re
 import typing
 
 import numpy as np
 import pandas as pd
+import pyarrow.parquet as parquet
 import pytest
 
 import audeer
@@ -1209,32 +1211,48 @@ def test_map(table, map):
 
 
 @pytest.mark.parametrize(
-    "table_id, expected_md5sum",
+    "table_id, expected_hash",
     [
-        ("files", "a856aef8ec9d5e4b1752a13ad68cc0c2"),
+        ("files", "-4778271914368537359"),
+        ("segments", "6154135801036965154"),
+        ("misc", "8941499293930597709"),
     ],
 )
-def test_parquet_reproducibility(tmpdir, table_id, expected_md5sum):
+def test_parquet_reproducibility(tmpdir, table_id, expected_hash):
     r"""Test reproducibility of binary PARQUET files.
 
     When storing the same dataframe
     to different PARQUET files,
-    those files should have an identical
-    MD5sum,
-    which should also be reproducible
-    across different pandas and pyarrow versions.
+    the files will vary slightly
+    and have different MD5 sums.
+
+    To provide a reproducible hash,
+    in order to judge if a table has changed,
+    we calculate the hash of the table
+    and store it in the metadata
+    of the schema
+    of the table.
 
     """
+    random.seed(1)  # ensure the same random table values are created
     db = audformat.testing.create_db()
+
+    # Check that the output of audformat.utils.hash() does not change
+    assert audformat.utils.hash(db[table_id].df) == expected_hash
+
+    # Write to PARQUET file and check if correct hash is stored
     path_wo_ext = audeer.path(tmpdir, table_id)
     path = f"{path_wo_ext}.parquet"
     db[table_id].save(path_wo_ext, storage_format="parquet")
-    assert audeer.md5(path) == expected_md5sum
-    # Repeat writing after loading table
+    metadata = parquet.read_schema(path).metadata
+    assert metadata[b"hash"].decode() == expected_hash
+
+    # Load table from PARQUET file, and overwrite it
     db[table_id].load(path_wo_ext)
     os.remove(path)
     db[table_id].save(path_wo_ext, storage_format="parquet")
-    assert audeer.md5(path) == expected_md5sum
+    metadata = parquet.read_schema(path).metadata
+    assert metadata[b"hash"].decode() == expected_hash
 
 
 @pytest.mark.parametrize(
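For context, the mechanism this test exercises can be sketched with plain pandas and pyarrow: compute a deterministic hash from the dataframe's values and store it in the PARQUET schema metadata, so that table identity can be checked without requiring byte-identical files. This is a minimal sketch only; the dataframe_hash() helper is an illustrative stand-in, not necessarily the algorithm behind audformat.utils.hash().

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as parquet


def dataframe_hash(df: pd.DataFrame) -> str:
    # Illustrative stand-in for audformat.utils.hash():
    # hash index and values deterministically,
    # independent of the byte layout of the stored file
    return str(pd.util.hash_pandas_object(df, index=True).sum())


df = pd.DataFrame({"value": [0.0, 1.0, 2.0]}, index=["f1.wav", "f2.wav", "f3.wav"])

# Attach the hash to the schema metadata before writing,
# preserving any metadata pyarrow adds itself
table = pa.Table.from_pandas(df)
metadata = {**(table.schema.metadata or {}), b"hash": dataframe_hash(df).encode()}
table = table.replace_schema_metadata(metadata)
parquet.write_table(table, "table.parquet")

# The stored hash can be read back from the schema alone,
# without loading the table data
stored_hash = parquet.read_schema("table.parquet").metadata[b"hash"].decode()
assert stored_hash == dataframe_hash(df)

Comparing such a value-level hash instead of the file's MD5 sum keeps the check robust when pandas or pyarrow versions serialize the same data to slightly different bytes.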