Skip to content

Commit 6529551

Browse files
authored
CLN: test_hashing (#44815)
1 parent a755700 commit 6529551

File tree

1 file changed

+80
-61
lines changed

1 file changed

+80
-61
lines changed

pandas/tests/util/test_hashing.py

+80-61
Original file line number | Diff line number | Diff line change
@@ -37,39 +37,6 @@ def index(request):
3737
return request.param
3838

3939

40-
def _check_equal(obj, **kwargs):
41-
"""
42-
Check that hashing an object produces the same value each time.
43-
44-
Parameters
45-
----------
46-
obj : object
47-
The object to hash.
48-
kwargs : kwargs
49-
Keyword arguments to pass to the hashing function.
50-
"""
51-
a = hash_pandas_object(obj, **kwargs)
52-
b = hash_pandas_object(obj, **kwargs)
53-
tm.assert_series_equal(a, b)
54-
55-
56-
def _check_not_equal_with_index(obj):
57-
"""
58-
Check the hash of an object with and without its index is not the same.
59-
60-
Parameters
61-
----------
62-
obj : object
63-
The object to hash.
64-
"""
65-
if not isinstance(obj, Index):
66-
a = hash_pandas_object(obj, index=True)
67-
b = hash_pandas_object(obj, index=False)
68-
69-
if len(obj):
70-
assert not (a == b).all()
71-
72-
7340
def test_consistency():
7441
# Check that our hash doesn't change because of a mistake
7542
# in the actual code; this is the ground truth.
@@ -89,12 +56,10 @@ def test_hash_array(series):
8956
tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr))
9057

9158

92-
@pytest.mark.parametrize(
93-
"arr2", [np.array([3, 4, "All"], dtype="U"), np.array([3, 4, "All"], dtype=object)]
94-
)
95-
def test_hash_array_mixed(arr2):
59+
@pytest.mark.parametrize("dtype", ["U", object])
60+
def test_hash_array_mixed(dtype):
9661
result1 = hash_array(np.array(["3", "4", "All"]))
97-
result2 = hash_array(arr2)
62+
result2 = hash_array(np.array([3, 4, "All"], dtype=dtype))
9863

9964
tm.assert_numpy_array_equal(result1, result2)
10065

@@ -159,32 +124,77 @@ def test_multiindex_objects():
159124
Series(["a", None, "c"]),
160125
Series([True, False, True]),
161126
Series(dtype=object),
162-
Index([1, 2, 3]),
163-
Index([True, False, True]),
164127
DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
165128
DataFrame(),
166129
tm.makeMissingDataframe(),
167130
tm.makeMixedDataFrame(),
168131
tm.makeTimeDataFrame(),
169132
tm.makeTimeSeries(),
170-
tm.makeTimedeltaIndex(),
171-
tm.makePeriodIndex(),
172133
Series(tm.makePeriodIndex()),
173134
Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
135+
],
136+
)
137+
def test_hash_pandas_object(obj, index):
138+
a = hash_pandas_object(obj, index=index)
139+
b = hash_pandas_object(obj, index=index)
140+
tm.assert_series_equal(a, b)
141+
142+
143+
@pytest.mark.parametrize(
144+
"obj",
145+
[
146+
Series([1, 2, 3]),
147+
Series([1.0, 1.5, 3.2]),
148+
Series([1.0, 1.5, np.nan]),
149+
Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
150+
Series(["a", "b", "c"]),
151+
Series(["a", np.nan, "c"]),
152+
Series(["a", None, "c"]),
153+
Series([True, False, True]),
154+
DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
155+
tm.makeMissingDataframe(),
156+
tm.makeMixedDataFrame(),
157+
tm.makeTimeDataFrame(),
158+
tm.makeTimeSeries(),
159+
Series(tm.makePeriodIndex()),
160+
Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
161+
],
162+
)
163+
def test_hash_pandas_object_diff_index_non_empty(obj):
164+
a = hash_pandas_object(obj, index=True)
165+
b = hash_pandas_object(obj, index=False)
166+
assert not (a == b).all()
167+
168+
169+
@pytest.mark.parametrize(
170+
"obj",
171+
[
172+
Index([1, 2, 3]),
173+
Index([True, False, True]),
174+
tm.makeTimedeltaIndex(),
175+
tm.makePeriodIndex(),
174176
MultiIndex.from_product(
175177
[range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)]
176178
),
177179
MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]),
178180
],
179181
)
180-
def test_hash_pandas_object(obj, index):
181-
_check_equal(obj, index=index)
182-
_check_not_equal_with_index(obj)
182+
def test_hash_pandas_index(obj, index):
183+
a = hash_pandas_object(obj, index=index)
184+
b = hash_pandas_object(obj, index=index)
185+
tm.assert_series_equal(a, b)
183186

184187

185-
def test_hash_pandas_object2(series, index):
186-
_check_equal(series, index=index)
187-
_check_not_equal_with_index(series)
188+
def test_hash_pandas_series(series, index):
189+
a = hash_pandas_object(series, index=index)
190+
b = hash_pandas_object(series, index=index)
191+
tm.assert_series_equal(a, b)
192+
193+
194+
def test_hash_pandas_series_diff_index(series):
195+
a = hash_pandas_object(series, index=True)
196+
b = hash_pandas_object(series, index=False)
197+
assert not (a == b).all()
188198

189199

190200
@pytest.mark.parametrize(
@@ -193,7 +203,9 @@ def test_hash_pandas_object2(series, index):
193203
def test_hash_pandas_empty_object(obj, index):
194204
# These are by-definition the same with
195205
# or without the index as the data is empty.
196-
_check_equal(obj, index=index)
206+
a = hash_pandas_object(obj, index=index)
207+
b = hash_pandas_object(obj, index=index)
208+
tm.assert_series_equal(a, b)
197209

198210

199211
@pytest.mark.parametrize(
@@ -235,11 +247,10 @@ def test_categorical_with_nan_consistency():
235247
assert result[1] in expected
236248

237249

238-
@pytest.mark.parametrize("obj", [pd.Timestamp("20130101")])
239-
def test_pandas_errors(obj):
250+
def test_pandas_errors():
240251
msg = "Unexpected type for hashing"
241252
with pytest.raises(TypeError, match=msg):
242-
hash_pandas_object(obj)
253+
hash_pandas_object(pd.Timestamp("20130101"))
243254

244255

245256
def test_hash_keys():
@@ -292,12 +303,16 @@ def test_invalid_key():
292303
def test_already_encoded(index):
293304
# If already encoded, then ok.
294305
obj = Series(list("abc")).str.encode("utf8")
295-
_check_equal(obj, index=index)
306+
a = hash_pandas_object(obj, index=index)
307+
b = hash_pandas_object(obj, index=index)
308+
tm.assert_series_equal(a, b)
296309

297310

298311
def test_alternate_encoding(index):
299312
obj = Series(list("abc"))
300-
_check_equal(obj, index=index, encoding="ascii")
313+
a = hash_pandas_object(obj, index=index)
314+
b = hash_pandas_object(obj, index=index)
315+
tm.assert_series_equal(a, b)
301316

302317

303318
@pytest.mark.parametrize("l_exp", range(8))
@@ -332,20 +347,24 @@ def test_hash_collisions():
332347
tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0))
333348

334349

335-
def test_hash_with_tuple():
350+
@pytest.mark.parametrize(
351+
"data, result_data",
352+
[
353+
[[tuple("1"), tuple("2")], [10345501319357378243, 8331063931016360761]],
354+
[[(1,), (2,)], [9408946347443669104, 3278256261030523334]],
355+
],
356+
)
357+
def test_hash_with_tuple(data, result_data):
336358
# GH#28969 array containing a tuple raises on call to arr.astype(str)
337359
# apparently a numpy bug github.com/numpy/numpy/issues/9441
338360

339-
df = DataFrame({"data": [tuple("1"), tuple("2")]})
361+
df = DataFrame({"data": data})
340362
result = hash_pandas_object(df)
341-
expected = Series([10345501319357378243, 8331063931016360761], dtype=np.uint64)
363+
expected = Series(result_data, dtype=np.uint64)
342364
tm.assert_series_equal(result, expected)
343365

344-
df2 = DataFrame({"data": [(1,), (2,)]})
345-
result = hash_pandas_object(df2)
346-
expected = Series([9408946347443669104, 3278256261030523334], dtype=np.uint64)
347-
tm.assert_series_equal(result, expected)
348366

367+
def test_hashable_tuple_args():
349368
# require that the elements of such tuples are themselves hashable
350369

351370
df3 = DataFrame(

0 commit comments

Comments
 (0)