diff --git a/pandas/tests/arrays/sparse/test_indexing.py b/pandas/tests/arrays/sparse/test_indexing.py
index f639e9b18596c..d63d0fb07b404 100644
--- a/pandas/tests/arrays/sparse/test_indexing.py
+++ b/pandas/tests/arrays/sparse/test_indexing.py
@@ -6,18 +6,25 @@
 import pandas._testing as tm
 from pandas.core.arrays.sparse import SparseArray
 
-arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])
-arr = SparseArray(arr_data)
+
+@pytest.fixture
+def arr_data():
+    return np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])
+
+
+@pytest.fixture
+def arr(arr_data):
+    return SparseArray(arr_data)
 
 
 class TestGetitem:
-    def test_getitem(self):
+    def test_getitem(self, arr):
         dense = arr.to_dense()
         for i, value in enumerate(arr):
             tm.assert_almost_equal(value, dense[i])
             tm.assert_almost_equal(arr[-i], dense[-i])
 
-    def test_getitem_arraylike_mask(self):
+    def test_getitem_arraylike_mask(self, arr):
         arr = SparseArray([0, 1, 2])
         result = arr[[True, False, True]]
         expected = SparseArray([0, 2])
@@ -81,7 +88,7 @@ def test_boolean_slice_empty(self):
         res = arr[[False, False, False]]
         assert res.dtype == arr.dtype
 
-    def test_getitem_bool_sparse_array(self):
+    def test_getitem_bool_sparse_array(self, arr):
         # GH 23122
         spar_bool = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True)
         exp = SparseArray([np.nan, 2, np.nan, 5, 6])
@@ -106,7 +113,7 @@ def test_getitem_bool_sparse_array_as_comparison(self):
         exp = SparseArray([3.0, 4.0], fill_value=np.nan)
         tm.assert_sp_array_equal(res, exp)
 
-    def test_get_item(self):
+    def test_get_item(self, arr):
         zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)
 
         assert np.isnan(arr[1])
@@ -129,7 +136,7 @@ def test_get_item(self):
 
 
 class TestSetitem:
-    def test_set_item(self):
+    def test_set_item(self, arr_data):
         arr = SparseArray(arr_data).copy()
 
         def setitem():
@@ -146,12 +153,12 @@ def setslice():
 
 
 class TestTake:
-    def test_take_scalar_raises(self):
+    def test_take_scalar_raises(self, arr):
         msg = "'indices' must be an array, not a scalar '2'."
         with pytest.raises(ValueError, match=msg):
             arr.take(2)
 
-    def test_take(self):
+    def test_take(self, arr_data, arr):
         exp = SparseArray(np.take(arr_data, [2, 3]))
         tm.assert_sp_array_equal(arr.take([2, 3]), exp)
 
@@ -173,14 +180,14 @@ def test_take_fill_value(self):
         exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0)
         tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp)
 
-    def test_take_negative(self):
+    def test_take_negative(self, arr_data, arr):
         exp = SparseArray(np.take(arr_data, [-1]))
         tm.assert_sp_array_equal(arr.take([-1]), exp)
 
         exp = SparseArray(np.take(arr_data, [-4, -3, -2]))
         tm.assert_sp_array_equal(arr.take([-4, -3, -2]), exp)
 
-    def test_bad_take(self):
+    def test_bad_take(self, arr):
         with pytest.raises(IndexError, match="bounds"):
             arr.take([11])
diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py
index b7517b1b16445..7a77a2064e7e0 100644
--- a/pandas/tests/arrays/sparse/test_libsparse.py
+++ b/pandas/tests/arrays/sparse/test_libsparse.py
@@ -14,77 +14,74 @@
     make_sparse_index,
 )
 
-TEST_LENGTH = 20
-
-plain_case = [
-    [0, 7, 15],
-    [3, 5, 5],
-    [2, 9, 14],
-    [2, 3, 5],
-    [2, 9, 15],
-    [1, 3, 4],
-]
-delete_blocks = [
-    [0, 5],
-    [4, 4],
-    [1],
-    [4],
-    [1],
-    [3],
-]
-split_blocks = [
-    [0],
-    [10],
-    [0, 5],
-    [3, 7],
-    [0, 5],
-    [3, 5],
-]
-skip_block = [
-    [10],
-    [5],
-    [0, 12],
-    [5, 3],
-    [12],
-    [3],
-]
-
-no_intersect = [
-    [0, 10],
-    [4, 6],
-    [5, 17],
-    [4, 2],
-    [],
-    [],
-]
-
-one_empty = [
-    [0],
-    [5],
-    [],
-    [],
-    [],
-    [],
-]
-
-both_empty = [  # type: ignore[var-annotated]
-    [],
-    [],
-    [],
-    [],
-    [],
-    [],
-]
-
-CASES = [plain_case, delete_blocks, split_blocks, skip_block, no_intersect, one_empty]
-IDS = [
-    "plain_case",
-    "delete_blocks",
-    "split_blocks",
-    "skip_block",
-    "no_intersect",
-    "one_empty",
-]
+
+@pytest.fixture
+def test_length():
+    return 20
+
+
+@pytest.fixture(
+    params=[
+        [
+            [0, 7, 15],
+            [3, 5, 5],
+            [2, 9, 14],
+            [2, 3, 5],
+            [2, 9, 15],
+            [1, 3, 4],
+        ],
+        [
+            [0, 5],
+            [4, 4],
+            [1],
+            [4],
+            [1],
+            [3],
+        ],
+        [
+            [0],
+            [10],
+            [0, 5],
+            [3, 7],
+            [0, 5],
+            [3, 5],
+        ],
+        [
+            [10],
+            [5],
+            [0, 12],
+            [5, 3],
+            [12],
+            [3],
+        ],
+        [
+            [0, 10],
+            [4, 6],
+            [5, 17],
+            [4, 2],
+            [],
+            [],
+        ],
+        [
+            [0],
+            [5],
+            [],
+            [],
+            [],
+            [],
+        ],
+    ],
+    ids=[
+        "plain_case",
+        "delete_blocks",
+        "split_blocks",
+        "skip_block",
+        "no_intersect",
+        "one_empty",
+    ],
+)
+def cases(request):
+    return request.param
 
 
 class TestSparseIndexUnion:
@@ -101,7 +98,7 @@ class TestSparseIndexUnion:
             [[0, 10], [3, 3], [5, 15], [2, 2], [0, 5, 10, 15], [3, 2, 3, 2]],
         ],
     )
-    def test_index_make_union(self, xloc, xlen, yloc, ylen, eloc, elen):
+    def test_index_make_union(self, xloc, xlen, yloc, ylen, eloc, elen, test_length):
         # Case 1
         # x: ----
         # y:     ----
@@ -132,8 +129,8 @@ def test_index_make_union(self, xloc, xlen, yloc, ylen, eloc, elen):
         # Case 8
         # x: ----       ---
         # y:     ---       ---
-        xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
-        yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
+        xindex = BlockIndex(test_length, xloc, xlen)
+        yindex = BlockIndex(test_length, yloc, ylen)
         bresult = xindex.make_union(yindex)
         assert isinstance(bresult, BlockIndex)
         tm.assert_numpy_array_equal(bresult.blocs, np.array(eloc, dtype=np.int32))
@@ -180,12 +177,12 @@ def test_int_index_make_union(self):
 
 class TestSparseIndexIntersect:
     @td.skip_if_windows
-    @pytest.mark.parametrize("xloc, xlen, yloc, ylen, eloc, elen", CASES, ids=IDS)
-    def test_intersect(self, xloc, xlen, yloc, ylen, eloc, elen):
-        xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
-        yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
-        expected = BlockIndex(TEST_LENGTH, eloc, elen)
-        longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen)
+    def test_intersect(self, cases, test_length):
+        xloc, xlen, yloc, ylen, eloc, elen = cases
+        xindex = BlockIndex(test_length, xloc, xlen)
+        yindex = BlockIndex(test_length, yloc, ylen)
+        expected = BlockIndex(test_length, eloc, elen)
+        longer_index = BlockIndex(test_length + 1, yloc, ylen)
 
         result = xindex.intersect(yindex)
         assert result.equals(expected)
@@ -493,10 +490,10 @@ def test_equals(self):
         assert index.equals(index)
         assert not index.equals(IntIndex(10, [0, 1, 2, 3]))
 
-    @pytest.mark.parametrize("xloc, xlen, yloc, ylen, eloc, elen", CASES, ids=IDS)
-    def test_to_block_index(self, xloc, xlen, yloc, ylen, eloc, elen):
-        xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
-        yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
+    def test_to_block_index(self, cases, test_length):
+        xloc, xlen, yloc, ylen, _, _ = cases
+        xindex = BlockIndex(test_length, xloc, xlen)
+        yindex = BlockIndex(test_length, yloc, ylen)
 
         # see if survive the round trip
         xbindex = xindex.to_int_index().to_block_index()
@@ -512,13 +509,13 @@ def test_to_int_index(self):
 
 class TestSparseOperators:
     @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"])
-    @pytest.mark.parametrize("xloc, xlen, yloc, ylen, eloc, elen", CASES, ids=IDS)
-    def test_op(self, opname, xloc, xlen, yloc, ylen, eloc, elen):
+    def test_op(self, opname, cases, test_length):
+        xloc, xlen, yloc, ylen, _, _ = cases
         sparse_op = getattr(splib, f"sparse_{opname}_float64")
         python_op = getattr(operator, opname)
-        xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
-        yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
+        xindex = BlockIndex(test_length, xloc, xlen)
+        yindex = BlockIndex(test_length, yloc, ylen)
 
         xdindex = xindex.to_int_index()
         ydindex = yindex.to_int_index()
@@ -542,10 +539,10 @@ def test_op(self, opname, xloc, xlen, yloc, ylen, eloc, elen):
 
         # check versus Series...
         xseries = Series(x, xdindex.indices)
-        xseries = xseries.reindex(np.arange(TEST_LENGTH)).fillna(xfill)
+        xseries = xseries.reindex(np.arange(test_length)).fillna(xfill)
 
         yseries = Series(y, ydindex.indices)
-        yseries = yseries.reindex(np.arange(TEST_LENGTH)).fillna(yfill)
+        yseries = yseries.reindex(np.arange(test_length)).fillna(yfill)
 
         series_result = python_op(xseries, yseries)
         series_result = series_result.reindex(ri_index.indices)
diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py
index 36b7dcfe4db12..de36d52921622 100644
--- a/pandas/tests/indexing/multiindex/test_indexing_slow.py
+++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py
@@ -1,8 +1,3 @@
-from typing import (
-    Any,
-    List,
-)
-
 import numpy as np
 import pytest
 
@@ -13,78 +8,72 @@
 )
 import pandas._testing as tm
 
-m = 50
-n = 1000
-cols = ["jim", "joe", "jolie", "joline", "jolia"]
-
-vals: List[Any] = [
-    np.random.randint(0, 10, n),
-    np.random.choice(list("abcdefghij"), n),
-    np.random.choice(pd.date_range("20141009", periods=10).tolist(), n),
-    np.random.choice(list("ZYXWVUTSRQ"), n),
-    np.random.randn(n),
-]
-vals = list(map(tuple, zip(*vals)))
-
-# bunch of keys for testing
-keys: List[Any] = [
-    np.random.randint(0, 11, m),
-    np.random.choice(list("abcdefghijk"), m),
-    np.random.choice(pd.date_range("20141009", periods=11).tolist(), m),
-    np.random.choice(list("ZYXWVUTSRQP"), m),
-]
-keys = list(map(tuple, zip(*keys)))
-keys += [t[:-1] for t in vals[:: n // m]]
+
+@pytest.fixture
+def m():
+    return 50
+
+
+@pytest.fixture
+def n():
+    return 1000
+
+
+@pytest.fixture
+def cols():
+    return ["jim", "joe", "jolie", "joline", "jolia"]
+
+
+@pytest.fixture
+def vals(n):
+    vals = [
+        np.random.randint(0, 10, n),
+        np.random.choice(list("abcdefghij"), n),
+        np.random.choice(pd.date_range("20141009", periods=10).tolist(), n),
+        np.random.choice(list("ZYXWVUTSRQ"), n),
+        np.random.randn(n),
+    ]
+    vals = list(map(tuple, zip(*vals)))
+    return vals
+
+
+@pytest.fixture
+def keys(n, m, vals):
+    # bunch of keys for testing
+    keys = [
+        np.random.randint(0, 11, m),
+        np.random.choice(list("abcdefghijk"), m),
+        np.random.choice(pd.date_range("20141009", periods=11).tolist(), m),
+        np.random.choice(list("ZYXWVUTSRQP"), m),
+    ]
+    keys = list(map(tuple, zip(*keys)))
+    keys += [t[:-1] for t in vals[:: n // m]]
+    return keys
 
 
 # covers both unique index and non-unique index
-df = DataFrame(vals, columns=cols)
-a = pd.concat([df, df])
-b = df.drop_duplicates(subset=cols[:-1])
-
-
-def validate(mi, df, key):
-    # check indexing into a multi-index before & past the lexsort depth
-
-    mask = np.ones(len(df), dtype=bool)
-
-    # test for all partials of this key
-    for i, k in enumerate(key):
-        mask &= df.iloc[:, i] == k
-
-        if not mask.any():
-            assert key[: i + 1] not in mi.index
-            continue
-
-        assert key[: i + 1] in mi.index
-        right = df[mask].copy(deep=False)
-
-        if i + 1 != len(key):  # partial key
-            return_value = right.drop(cols[: i + 1], axis=1, inplace=True)
-            assert return_value is None
-            return_value = right.set_index(cols[i + 1 : -1], inplace=True)
-            assert return_value is None
-            tm.assert_frame_equal(mi.loc[key[: i + 1]], right)
-
-        else:  # full key
-            return_value = right.set_index(cols[:-1], inplace=True)
-            assert return_value is None
-            if len(right) == 1:  # single hit
-                right = Series(
-                    right["jolia"].values, name=right.index[0], index=["jolia"]
-                )
-                tm.assert_series_equal(mi.loc[key[: i + 1]], right)
-            else:  # multi hit
-                tm.assert_frame_equal(mi.loc[key[: i + 1]], right)
+@pytest.fixture
+def df(vals, cols):
+    return DataFrame(vals, columns=cols)
+
+
+@pytest.fixture
+def a(df):
+    return pd.concat([df, df])
+
+
+@pytest.fixture
+def b(df, cols):
+    return df.drop_duplicates(subset=cols[:-1])
 
 
 @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
 @pytest.mark.parametrize("lexsort_depth", list(range(5)))
-@pytest.mark.parametrize("key", keys)
-@pytest.mark.parametrize("frame", [a, b])
-def test_multiindex_get_loc(lexsort_depth, key, frame):
+@pytest.mark.parametrize("frame_fixture", ["a", "b"])
+def test_multiindex_get_loc(request, lexsort_depth, keys, frame_fixture, cols):
     # GH7724, GH2646
+    frame = request.getfixturevalue(frame_fixture)
     if lexsort_depth == 0:
         df = frame.copy(deep=False)
     else:
@@ -92,4 +81,34 @@ def test_multiindex_get_loc(lexsort_depth, key, frame):
 
     mi = df.set_index(cols[:-1])
     assert not mi.index._lexsort_depth < lexsort_depth
-    validate(mi, df, key)
+    for key in keys:
+        mask = np.ones(len(df), dtype=bool)
+
+        # test for all partials of this key
+        for i, k in enumerate(key):
+            mask &= df.iloc[:, i] == k
+
+            if not mask.any():
+                assert key[: i + 1] not in mi.index
+                continue
+
+            assert key[: i + 1] in mi.index
+            right = df[mask].copy(deep=False)
+
+            if i + 1 != len(key):  # partial key
+                return_value = right.drop(cols[: i + 1], axis=1, inplace=True)
+                assert return_value is None
+                return_value = right.set_index(cols[i + 1 : -1], inplace=True)
+                assert return_value is None
+                tm.assert_frame_equal(mi.loc[key[: i + 1]], right)
+
+            else:  # full key
+                return_value = right.set_index(cols[:-1], inplace=True)
+                assert return_value is None
+                if len(right) == 1:  # single hit
+                    right = Series(
+                        right["jolia"].values, name=right.index[0], index=["jolia"]
+                    )
+                    tm.assert_series_equal(mi.loc[key[: i + 1]], right)
+                else:  # multi hit
+                    tm.assert_frame_equal(mi.loc[key[: i + 1]], right)
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index b863e85cae457..68365c125a951 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -15,9 +15,15 @@
 import pandas._testing as tm
 
+import pandas.io.common as icom
 from pandas.io.parsers import read_csv
 
 
+@pytest.fixture
+def compression_to_extension():
+    return {value: key for key, value in icom.extension_to_compression.items()}
+
+
 @pytest.fixture
 def tips_file(datapath):
     """Path to the tips dataset"""
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index a208daaf9f77b..32509a799fa69 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -13,7 +13,6 @@
     compat,
 )
 import pandas._testing as tm
-from pandas.tests.io.test_compression import _compression_to_extension
 
 
 class TestToCSV:
@@ -543,13 +542,15 @@ def test_to_csv_write_to_open_file_with_newline_py3(self):
 
     @pytest.mark.parametrize("to_infer", [True, False])
     @pytest.mark.parametrize("read_infer", [True, False])
-    def test_to_csv_compression(self, compression_only, read_infer, to_infer):
+    def test_to_csv_compression(
+        self, compression_only, read_infer, to_infer, compression_to_extension
+    ):
         # see gh-15008
         compression = compression_only
 
         # We'll complete file extension subsequently.
         filename = "test."
-        filename += _compression_to_extension[compression]
+        filename += compression_to_extension[compression]
 
         df = DataFrame({"A": [1]})
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
index 143d2431d4147..4a7606eaf05d7 100644
--- a/pandas/tests/io/json/test_compression.py
+++ b/pandas/tests/io/json/test_compression.py
@@ -6,7 +6,6 @@
 
 import pandas as pd
 import pandas._testing as tm
-from pandas.tests.io.test_compression import _compression_to_extension
 
 
 def test_compression_roundtrip(compression):
@@ -91,13 +90,15 @@ def test_read_unsupported_compression_type():
 
 @pytest.mark.parametrize("to_infer", [True, False])
 @pytest.mark.parametrize("read_infer", [True, False])
-def test_to_json_compression(compression_only, read_infer, to_infer):
+def test_to_json_compression(
+    compression_only, read_infer, to_infer, compression_to_extension
+):
     # see gh-15008
     compression = compression_only
 
     # We'll complete file extension subsequently.
     filename = "test."
-    filename += _compression_to_extension[compression]
+    filename += compression_to_extension[compression]
 
     df = pd.DataFrame({"A": [1]})
diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py
index bcba9c4a1823d..d150b52258d47 100644
--- a/pandas/tests/io/parser/test_compression.py
+++ b/pandas/tests/io/parser/test_compression.py
@@ -12,7 +12,6 @@
 from pandas import DataFrame
 import pandas._testing as tm
 
-from pandas.tests.io.test_compression import _compression_to_extension
 
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
 
@@ -91,11 +90,18 @@ def test_zip_error_invalid_zip(parser_and_data):
 
 @skip_pyarrow
 @pytest.mark.parametrize("filename", [None, "test.{ext}"])
-def test_compression(request, parser_and_data, compression_only, buffer, filename):
+def test_compression(
+    request,
+    parser_and_data,
+    compression_only,
+    buffer,
+    filename,
+    compression_to_extension,
+):
     parser, data, expected = parser_and_data
     compress_type = compression_only
 
-    ext = _compression_to_extension[compress_type]
+    ext = compression_to_extension[compress_type]
     filename = filename if filename is None else filename.format(ext=ext)
 
     if filename and buffer:
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index a0d9c6ae99dcf..f3ae5b54d09ce 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -16,7 +16,6 @@
 from pandas import DataFrame
 import pandas._testing as tm
 
-from pandas.tests.io.test_compression import _compression_to_extension
 from pandas.io.feather_format import read_feather
 from pandas.io.parsers import read_csv
 
@@ -32,10 +31,12 @@
 )
 @pytest.mark.parametrize("mode", ["explicit", "infer"])
 @pytest.mark.parametrize("engine", ["python", "c"])
-def test_compressed_urls(salaries_table, mode, engine, compression_only):
+def test_compressed_urls(
+    salaries_table, mode, engine, compression_only, compression_to_extension
+):
     # test reading compressed urls with various engines and
     # extension inference
-    extension = _compression_to_extension[compression_only]
+    extension = compression_to_extension[compression_only]
     base_url = (
         "https://github.com/pandas-dev/pandas/raw/main/"
         "pandas/tests/io/parser/data/salaries.csv"
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index 030650ad0031d..c682963c462cc 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -26,7 +26,6 @@
     ArrowStringArray,
     StringArray,
 )
-from pandas.tests.io.test_compression import _compression_to_extension
 
 from pandas.io.common import urlopen
 from pandas.io.parsers import (
@@ -667,13 +666,13 @@ def test_default_delimiter():
 
 
 @pytest.mark.parametrize("infer", [True, False])
-def test_fwf_compression(compression_only, infer):
+def test_fwf_compression(compression_only, infer, compression_to_extension):
     data = """1111111111
    2222222222
    3333333333""".strip()
 
     compression = compression_only
-    extension = _compression_to_extension[compression]
+    extension = compression_to_extension[compression]
 
     kwargs = {"widths": [5, 5], "names": ["one", "two"]}
     expected = read_fwf(StringIO(data), **kwargs)
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
index ac11e2165eb6f..c84670f0eb69c 100644
--- a/pandas/tests/io/test_compression.py
+++ b/pandas/tests/io/test_compression.py
@@ -18,10 +18,6 @@
 
 import pandas.io.common as icom
 
-_compression_to_extension = {
-    value: key for key, value in icom.extension_to_compression.items()
-}
-
 
 @pytest.mark.parametrize(
     "obj",
@@ -84,11 +80,11 @@ def test_compression_size_fh(obj, method, compression_only):
     ],
 )
 def test_dataframe_compression_defaults_to_infer(
-    write_method, write_kwargs, read_method, compression_only
+    write_method, write_kwargs, read_method, compression_only, compression_to_extension
 ):
     # GH22004
     input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"])
-    extension = _compression_to_extension[compression_only]
+    extension = compression_to_extension[compression_only]
     with tm.ensure_clean("compressed" + extension) as path:
         getattr(input, write_method)(path, **write_kwargs)
         output = read_method(path, compression=compression_only)
@@ -104,11 +100,16 @@ def test_dataframe_compression_defaults_to_infer(
     ],
 )
 def test_series_compression_defaults_to_infer(
-    write_method, write_kwargs, read_method, read_kwargs, compression_only
+    write_method,
+    write_kwargs,
+    read_method,
+    read_kwargs,
+    compression_only,
+    compression_to_extension,
 ):
     # GH22004
     input = pd.Series([0, 5, -2, 10], name="X")
-    extension = _compression_to_extension[compression_only]
+    extension = compression_to_extension[compression_only]
     with tm.ensure_clean("compressed" + extension) as path:
         getattr(input, write_method)(path, **write_kwargs)
         if "squeeze" in read_kwargs:
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 01e1be5529bad..7b139dc45624e 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -11,7 +11,7 @@
 
 from pandas.io.feather_format import read_feather, to_feather  # isort:skip
 
-pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1")
+pyarrow = pytest.importorskip("pyarrow")
 
 
 @pytest.mark.single_cpu
diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py
index 18cc0f0b11dc9..d82cfd5bd169d 100644
--- a/pandas/tests/io/test_gcs.py
+++ b/pandas/tests/io/test_gcs.py
@@ -16,7 +16,6 @@
     read_parquet,
 )
 import pandas._testing as tm
-from pandas.tests.io.test_compression import _compression_to_extension
 from pandas.util import _test_decorators as td
 
 
@@ -132,7 +131,9 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str):
 
 @td.skip_if_no("gcsfs")
 @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"])
-def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding):
+def test_to_csv_compression_encoding_gcs(
+    gcs_buffer, compression_only, encoding, compression_to_extension
+):
     """
     Compression and encoding should with GCS.
 
@@ -161,7 +162,7 @@ def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding)
     tm.assert_frame_equal(df, read_df)
 
     # write compressed file with implicit compression
-    file_ext = _compression_to_extension[compression_only]
+    file_ext = compression_to_extension[compression_only]
     compression["method"] = "infer"
     path_gcs += f".{file_ext}"
     df.to_csv(path_gcs, compression=compression, encoding=encoding)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 36cfe5576adf9..571d9d5536e20 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -25,25 +25,19 @@ def dirpath(datapath):
     return datapath("io", "data", "orc")
 
 
-# Examples of dataframes with dtypes for which conversion to ORC
-# hasn't been implemented yet, that is, Category, unsigned integers,
-# interval, period and sparse.
-orc_writer_dtypes_not_supported = [
-    pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}),
-    pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}),
-    pd.DataFrame(
-        {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}
-    ),
-    pd.DataFrame(
-        {
-            "unimpl": [
-                pd.Period("2022-01-03", freq="D"),
-                pd.Period("2022-01-04", freq="D"),
-            ]
-        }
-    ),
-    pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)),
-]
+@pytest.fixture(
+    params=[
+        np.array([1, 20], dtype="uint64"),
+        pd.Series(["a", "b", "a"], dtype="category"),
+        [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)],
+        [pd.Period("2022-01-03", freq="D"), pd.Period("2022-01-04", freq="D")],
+    ]
+)
+def orc_writer_dtypes_not_supported(request):
+    # Examples of dataframes with dtypes for which conversion to ORC
+    # hasn't been implemented yet, that is, Category, unsigned integers,
+    # interval, period and sparse.
+    return pd.DataFrame({"unimpl": request.param})
 
 
 def test_orc_reader_empty(dirpath):
@@ -297,13 +291,12 @@ def test_orc_roundtrip_bytesio():
 
 
 @td.skip_if_no("pyarrow", min_version="7.0.0")
-@pytest.mark.parametrize("df_not_supported", orc_writer_dtypes_not_supported)
-def test_orc_writer_dtypes_not_supported(df_not_supported):
+def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
     # GH44554
     # PyArrow gained ORC write support with the current argument order
     msg = "The dtype of one or more columns is not supported yet."
     with pytest.raises(NotImplementedError, match=msg):
-        df_not_supported.to_orc()
+        orc_writer_dtypes_not_supported.to_orc()
 
 
 @td.skip_if_no("pyarrow", min_version="7.0.0")
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 1b0a1d740677b..68f9b2b64b92a 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -19,7 +19,6 @@
     DataFrame,
     Series,
 )
-from pandas.tests.io.test_compression import _compression_to_extension
 
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (
@@ -1964,13 +1963,13 @@ def test_statareader_warns_when_used_without_context(datapath):
 @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
 @pytest.mark.parametrize("use_dict", [True, False])
 @pytest.mark.parametrize("infer", [True, False])
-def test_compression(compression, version, use_dict, infer):
+def test_compression(compression, version, use_dict, infer, compression_to_extension):
     file_name = "dta_inferred_compression.dta"
     if compression:
         if use_dict:
             file_ext = compression
         else:
-            file_ext = _compression_to_extension[compression]
+            file_ext = compression_to_extension[compression]
         file_name += f".{file_ext}"
     compression_arg = compression
     if infer:
@@ -2134,10 +2133,12 @@ def test_compression_roundtrip(compression):
 
 @pytest.mark.parametrize("to_infer", [True, False])
 @pytest.mark.parametrize("read_infer", [True, False])
-def test_stata_compression(compression_only, read_infer, to_infer):
+def test_stata_compression(
+    compression_only, read_infer, to_infer, compression_to_extension
+):
     compression = compression_only
 
-    ext = _compression_to_extension[compression]
+    ext = compression_to_extension[compression]
     filename = f"test.{ext}"
 
     df = DataFrame(
diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py
index 1f1f44f408fc1..04194a68ed512 100644
--- a/pandas/tests/io/xml/test_to_xml.py
+++ b/pandas/tests/io/xml/test_to_xml.py
@@ -17,7 +17,6 @@
     Index,
 )
 import pandas._testing as tm
-from pandas.tests.io.test_compression import _compression_to_extension
 
 from pandas.io.common import get_handle
 from pandas.io.xml import read_xml
@@ -56,60 +55,69 @@
 # [X] - XSLTParseError: "failed to compile"
 # [X] - PermissionError: "Forbidden"
 
-geom_df = DataFrame(
-    {
-        "shape": ["square", "circle", "triangle"],
-        "degrees": [360, 360, 180],
-        "sides": [4, np.nan, 3],
-    }
-)
-
-planet_df = DataFrame(
-    {
-        "planet": [
-            "Mercury",
-            "Venus",
-            "Earth",
-            "Mars",
-            "Jupiter",
-            "Saturn",
-            "Uranus",
-            "Neptune",
-        ],
-        "type": [
-            "terrestrial",
-            "terrestrial",
-            "terrestrial",
-            "terrestrial",
-            "gas giant",
-            "gas giant",
-            "ice giant",
-            "ice giant",
-        ],
-        "location": [
-            "inner",
-            "inner",
-            "inner",
-            "inner",
-            "outer",
-            "outer",
-            "outer",
-            "outer",
-        ],
-        "mass": [
-            0.330114,
-            4.86747,
-            5.97237,
-            0.641712,
-            1898.187,
-            568.3174,
-            86.8127,
-            102.4126,
-        ],
-    }
-)
+@pytest.fixture
+def geom_df():
+    return DataFrame(
+        {
+            "shape": ["square", "circle", "triangle"],
+            "degrees": [360, 360, 180],
+            "sides": [4, np.nan, 3],
+        }
+    )
+
+
+@pytest.fixture
+def planet_df():
+    return DataFrame(
+        {
+            "planet": [
+                "Mercury",
+                "Venus",
+                "Earth",
+                "Mars",
+                "Jupiter",
+                "Saturn",
+                "Uranus",
+                "Neptune",
+            ],
+            "type": [
+                "terrestrial",
+                "terrestrial",
+                "terrestrial",
+                "terrestrial",
+                "gas giant",
+                "gas giant",
+                "ice giant",
+                "ice giant",
+            ],
+            "location": [
+                "inner",
+                "inner",
+                "inner",
+                "inner",
+                "outer",
+                "outer",
+                "outer",
+                "outer",
+            ],
+            "mass": [
+                0.330114,
+                4.86747,
+                5.97237,
+                0.641712,
+                1898.187,
+                568.3174,
+                86.8127,
+                102.4126,
+            ],
+        }
+    )
+
 
-from_file_expected = """\
+@pytest.fixture
+def from_file_expected():
+    return """\
@@ -163,7 +171,7 @@ def parser(request):
 # FILE OUTPUT
 
 
-def test_file_output_str_read(datapath, parser):
+def test_file_output_str_read(datapath, parser, from_file_expected):
     filename = datapath("io", "data", "xml", "books.xml")
     df_file = read_xml(filename, parser=parser)
 
@@ -177,7 +185,7 @@ def test_file_output_str_read(datapath, parser):
     assert output == from_file_expected
 
 
-def test_file_output_bytes_read(datapath, parser):
+def test_file_output_bytes_read(datapath, parser, from_file_expected):
     filename = datapath("io", "data", "xml", "books.xml")
     df_file = read_xml(filename, parser=parser)
 
@@ -191,7 +199,7 @@ def test_file_output_bytes_read(datapath, parser):
     assert output == from_file_expected
 
 
-def test_str_output(datapath, parser):
+def test_str_output(datapath, parser, from_file_expected):
    filename = datapath("io", "data", "xml", "books.xml")
     df_file = read_xml(filename, parser=parser)
 
@@ -201,7 +209,7 @@ def test_str_output(datapath, parser):
     assert output == from_file_expected
 
 
-def test_wrong_file_path(parser):
+def test_wrong_file_path(parser, geom_df):
     path = "/my/fake/path/output.xml"
 
     with pytest.raises(
@@ -299,7 +307,7 @@ def test_index_false_rename_row_root(datapath, parser):
 @pytest.mark.parametrize(
     "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]]
 )
-def test_index_false_with_offset_input_index(parser, offset_index):
+def test_index_false_with_offset_input_index(parser, offset_index, geom_df):
     """
     Tests that the output does not contain the `` field when the
     index of the input Dataframe has an offset.
@@ -361,21 +369,21 @@
 """
 
 
-def test_na_elem_output(parser):
+def test_na_elem_output(parser, geom_df):
     output = geom_df.to_xml(parser=parser)
     output = equalize_decl(output)
 
     assert output == na_expected
 
 
-def test_na_empty_str_elem_option(parser):
+def test_na_empty_str_elem_option(parser, geom_df):
     output = geom_df.to_xml(na_rep="", parser=parser)
     output = equalize_decl(output)
 
     assert output == na_expected
 
 
-def test_na_empty_elem_option(parser):
+def test_na_empty_elem_option(parser, geom_df):
     expected = """\
@@ -408,7 +416,7 @@ def test_na_empty_elem_option(parser):
 # ATTR_COLS
 
 
-def test_attrs_cols_nan_output(parser):
+def test_attrs_cols_nan_output(parser, geom_df):
     expected = """\
@@ -423,7 +431,7 @@ def test_attrs_cols_nan_output(parser):
 
     assert output == expected
 
 
-def test_attrs_cols_prefix(parser):
+def test_attrs_cols_prefix(parser, geom_df):
     expected = """\
@@ -446,12 +454,12 @@ def test_attrs_cols_prefix(parser):
 
     assert output == expected
 
 
-def test_attrs_unknown_column(parser):
+def test_attrs_unknown_column(parser, geom_df):
     with pytest.raises(KeyError, match=("no valid column")):
         geom_df.to_xml(attr_cols=["shape", "degree", "sides"], parser=parser)
 
 
-def test_attrs_wrong_type(parser):
+def test_attrs_wrong_type(parser, geom_df):
     with pytest.raises(TypeError, match=("is not a valid type for attr_cols")):
         geom_df.to_xml(attr_cols='"shape", "degree", "sides"', parser=parser)
 
@@ -459,7 +467,7 @@ def test_attrs_wrong_type(parser):
 # ELEM_COLS
 
 
-def test_elems_cols_nan_output(parser):
+def test_elems_cols_nan_output(parser, geom_df):
     elems_cols_expected = """\
@@ -488,17 +496,17 @@ def test_elems_cols_nan_output(parser):
 
     assert output == elems_cols_expected
 
 
-def test_elems_unknown_column(parser):
+def test_elems_unknown_column(parser, geom_df):
     with pytest.raises(KeyError, match=("no valid column")):
         geom_df.to_xml(elem_cols=["shape", "degree", "sides"], parser=parser)
 
 
-def test_elems_wrong_type(parser):
+def test_elems_wrong_type(parser, geom_df):
     with pytest.raises(TypeError, match=("is not a valid type for elem_cols")):
         geom_df.to_xml(elem_cols='"shape", "degree", "sides"', parser=parser)
 
 
-def test_elems_and_attrs_cols(parser):
+def test_elems_and_attrs_cols(parser, geom_df):
     elems_cols_expected = """\
@@ -530,7 +538,7 @@ def test_elems_and_attrs_cols(parser):
 # HIERARCHICAL COLUMNS
 
 
-def test_hierarchical_columns(parser):
+def test_hierarchical_columns(parser, planet_df):
     expected = """\
@@ -577,7 +585,7 @@ def test_hierarchical_columns(parser):
 
     assert output == expected
 
 
-def test_hierarchical_attrs_columns(parser):
+def test_hierarchical_attrs_columns(parser, planet_df):
     expected = """\
@@ -607,7 +615,7 @@ def test_hierarchical_attrs_columns(parser):
 # MULTIINDEX
 
 
-def test_multi_index(parser):
+def test_multi_index(parser, planet_df):
     expected = """\
@@ -646,7 +654,7 @@ def test_multi_index(parser):
 
     assert output == expected
 
 
-def test_multi_index_attrs_cols(parser):
+def test_multi_index_attrs_cols(parser, planet_df):
     expected = """\
@@ -672,7 +680,7 @@ def test_multi_index_attrs_cols(parser):
 # NAMESPACE
 
 
-def test_default_namespace(parser):
+def test_default_namespace(parser, geom_df):
     expected = """\
@@ -705,7 +713,7 @@ def test_default_namespace(parser):
 # PREFIX
 
 
-def test_namespace_prefix(parser):
+def test_namespace_prefix(parser, geom_df):
     expected = """\
@@ -737,14 +745,14 @@ def test_namespace_prefix(parser):
 
     assert output == expected
 
 
-def test_missing_prefix_in_nmsp(parser):
+def test_missing_prefix_in_nmsp(parser, geom_df):
     with pytest.raises(KeyError, match=("doc is not included in namespaces")):
         geom_df.to_xml(
             namespaces={"": "http://example.com"}, prefix="doc", parser=parser
         )
 
 
-def test_namespace_prefix_and_default(parser):
+def test_namespace_prefix_and_default(parser, geom_df):
     expected = """\
@@ -858,7 +866,7 @@ def test_wrong_encoding_option_lxml(datapath, parser, encoding):
         df_file.to_xml(path, index=False, encoding=encoding, parser=parser)
 
 
-def test_misspelled_encoding(parser):
+def test_misspelled_encoding(parser, geom_df):
     with pytest.raises(LookupError, match=("unknown encoding")):
         geom_df.to_xml(encoding="uft-8", parser=parser)
 
@@ -867,7 +875,7 @@ def test_misspelled_encoding(parser):
 
 
 @td.skip_if_no("lxml")
-def test_xml_declaration_pretty_print():
+def test_xml_declaration_pretty_print(geom_df):
     expected = """\
@@ -895,7 +903,7 @@ def test_xml_declaration_pretty_print():
 
     assert output == expected
 
 
-def test_no_pretty_print_with_decl(parser):
+def test_no_pretty_print_with_decl(parser, geom_df):
     expected = (
         "\n"
         "0square"
@@ -916,7 +924,7 @@ def test_no_pretty_print_with_decl(parser):
 
     assert output == expected
 
 
-def test_no_pretty_print_no_decl(parser):
+def test_no_pretty_print_no_decl(parser, geom_df):
     expected = (
         "0square"
         "3604.0"
@@ -939,14 +947,14 @@ def test_no_pretty_print_no_decl(parser):
 
 
 @td.skip_if_installed("lxml")
-def test_default_parser_no_lxml():
+def test_default_parser_no_lxml(geom_df):
     with pytest.raises(
         ImportError, match=("lxml not found, please install or use the etree parser.")
     ):
         geom_df.to_xml()
 
 
-def test_unknown_parser():
+def test_unknown_parser(geom_df):
     with pytest.raises(
         ValueError, match=("Values for parser can only be lxml or etree.")
     ):
@@ -980,7 +988,7 @@ def test_unknown_parser():
 
 
 @td.skip_if_no("lxml")
-def test_stylesheet_file_like(datapath, mode):
+def test_stylesheet_file_like(datapath, mode, geom_df):
     xsl = datapath("io", "data", "xml", "row_field_output.xsl")
 
     with open(xsl, mode, encoding="utf-8" if mode == "r" else None) as f:
@@ -988,7 +996,7 @@ def test_stylesheet_file_like(datapath, mode):
 
 
 @td.skip_if_no("lxml")
-def test_stylesheet_io(datapath, mode):
+def test_stylesheet_io(datapath, mode, geom_df):
     xsl_path = datapath("io", "data", "xml", "row_field_output.xsl")
 
     # note: By default the bodies of untyped functions are not checked,
@@ -1007,7 +1015,7 @@ def test_stylesheet_io(datapath, mode):
 
 
 @td.skip_if_no("lxml")
-def test_stylesheet_buffered_reader(datapath, mode):
+def test_stylesheet_buffered_reader(datapath, mode, geom_df):
     xsl = datapath("io", "data", "xml", "row_field_output.xsl")
 
     with open(xsl, mode, encoding="utf-8" if mode == "r" else None) as f:
@@ -1019,7 +1027,7 @@ def test_stylesheet_buffered_reader(datapath, mode):
 
 
 @td.skip_if_no("lxml")
-def test_stylesheet_wrong_path():
+def test_stylesheet_wrong_path(geom_df):
     from lxml.etree import XMLSyntaxError
 
     xsl = os.path.join("data", "xml", "row_field_output.xslt")
@@ -1033,7 +1041,7 @@ def test_stylesheet_wrong_path():
 
 @td.skip_if_no("lxml")
 @pytest.mark.parametrize("val", ["", b""])
-def test_empty_string_stylesheet(val):
+def test_empty_string_stylesheet(val, geom_df):
     from lxml.etree import XMLSyntaxError
 
     msg = "|".join(
@@ -1050,7 +1058,7 @@ def test_empty_string_stylesheet(val):
 
 
 @td.skip_if_no("lxml")
-def test_incorrect_xsl_syntax():
+def test_incorrect_xsl_syntax(geom_df):
     from lxml.etree import XMLSyntaxError
 
     xsl = """\
@@ -1079,7 +1087,7 @@ def test_incorrect_xsl_syntax():
 
 
 @td.skip_if_no("lxml")
-def test_incorrect_xsl_eval():
+def test_incorrect_xsl_eval(geom_df):
     from lxml.etree import XSLTParseError
 
     xsl = """\
@@ -1108,7 +1116,7 @@ def test_incorrect_xsl_eval():
 
 
 @td.skip_if_no("lxml")
-def test_incorrect_xsl_apply():
+def test_incorrect_xsl_apply(geom_df):
     from lxml.etree import XSLTApplyError
 
     xsl = """\
@@ -1128,7 +1136,7 @@ def test_incorrect_xsl_apply():
             geom_df.to_xml(path, stylesheet=xsl)
 
 
-def test_stylesheet_with_etree():
+def test_stylesheet_with_etree(geom_df):
     xsl = """\
@@ -1147,7 +1155,7 @@ def test_stylesheet_with_etree():
 
 
 @td.skip_if_no("lxml")
-def test_style_to_csv():
+def test_style_to_csv(geom_df):
     xsl = """\
@@ -1176,7 +1184,7 @@ def test_style_to_csv():
 
 
 @td.skip_if_no("lxml")
-def test_style_to_string():
+def test_style_to_string(geom_df):
     xsl = """\
@@ -1210,7 +1218,7 @@ def test_style_to_string():
 
 
 @td.skip_if_no("lxml")
-def test_style_to_json():
+def test_style_to_json(geom_df):
     xsl = """\
@@ -1281,7 +1289,7 @@ def test_style_to_json():
 """
 
 
-def test_compression_output(parser, compression_only):
+def test_compression_output(parser, compression_only, geom_df):
     with tm.ensure_clean() as path:
         geom_df.to_xml(path, parser=parser, compression=compression_only)
 
@@ -1297,8 +1305,10 @@ def test_compression_output(parser, compression_only):
     assert geom_xml == output.strip()
 
 
-def test_filename_and_suffix_comp(parser, compression_only):
-    compfile = "xml." + _compression_to_extension[compression_only]
+def test_filename_and_suffix_comp(
+    parser, compression_only, geom_df, compression_to_extension
+):
+    compfile = "xml." + compression_to_extension[compression_only]
     with tm.ensure_clean(filename=compfile) as path:
         geom_df.to_xml(path, parser=parser, compression=compression_only)
 
@@ -1328,7 +1338,7 @@ def test_ea_dtypes(any_numeric_ea_dtype, parser):
     assert equalize_decl(result).strip() == expected
 
 
-def test_unsuported_compression(parser):
+def test_unsuported_compression(parser, geom_df):
     with pytest.raises(ValueError, match="Unrecognized compression type"):
         with tm.ensure_clean() as path:
             geom_df.to_xml(path, parser=parser, compression="7z")
@@ -1340,7 +1350,7 @@ def test_unsuported_compression(parser):
 @pytest.mark.single_cpu
 @td.skip_if_no("s3fs")
 @td.skip_if_no("lxml")
-def test_s3_permission_output(parser, s3_resource):
+def test_s3_permission_output(parser, s3_resource, geom_df):
    # s3_resource hosts pandas-test
    import s3fs
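
A note for reviewers on the core pattern: module-level globals are replaced by function-scoped fixtures, so every test builds its own objects and mutation in one test (e.g. `test_set_item` copying `arr`) cannot leak into another. A minimal sketch of the before/after, using the sparse-array fixtures from this diff:

```python
import numpy as np
import pytest

from pandas.core.arrays.sparse import SparseArray


# Before: arr_data/arr were module-level globals shared by all tests.
# After: each test that requests the fixture gets a freshly built object.
@pytest.fixture
def arr_data():
    # Rebuilt for every test, so no cross-test state
    return np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])


@pytest.fixture
def arr(arr_data):
    # Fixtures can depend on other fixtures by naming them as parameters
    return SparseArray(arr_data)


def test_take(arr_data, arr):
    # The test body is unchanged; the names now resolve to fixture values
    result = arr.take([2, 3])
    assert result[0] == arr_data[2]
```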
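The `CASES`/`IDS` constants in test_libsparse.py collapse into one parametrized fixture, so the `@pytest.mark.parametrize(..., ids=IDS)` decorators disappear while the readable test ids are kept. A trimmed sketch (only two of the six cases, with shortened data, not the exact values from the diff):

```python
import pytest


# params= runs every requesting test once per entry; ids= names each run,
# so test ids look like test_block_lengths[plain_case] as before.
@pytest.fixture(
    params=[
        ([0, 7, 15], [3, 5, 5]),
        ([0, 5], [4, 4]),
    ],
    ids=["plain_case", "delete_blocks"],
)
def cases(request):
    # request.param is the value for the current parametrization
    return request.param


def test_block_lengths(cases):
    # Tests unpack the tuple themselves, mirroring the diff's
    # "xloc, xlen, ... = cases" lines
    locs, lens = cases
    assert len(locs) == len(lens)
```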
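In test_indexing_slow.py the old `@pytest.mark.parametrize("frame", [a, b])` can no longer receive fixture values directly (at collection time `a` and `b` would be the fixture functions, not the frames they return), so the test parametrizes over fixture *names* and resolves them lazily. A self-contained sketch of that pattern, with hypothetical `a`/`b` frames rather than the randomized ones from the diff:

```python
import pandas as pd
import pytest


@pytest.fixture
def a():
    # Stand-in for the concatenated, non-unique-index frame
    return pd.DataFrame({"x": [1, 2, 1, 2]})


@pytest.fixture
def b():
    # Stand-in for the de-duplicated, unique-index frame
    return pd.DataFrame({"x": [1, 2, 1, 2]}).drop_duplicates()


@pytest.mark.parametrize("frame_fixture", ["a", "b"])
def test_frames(request, frame_fixture):
    # getfixturevalue resolves the named fixture at test run time
    frame = request.getfixturevalue(frame_fixture)
    assert "x" in frame.columns
```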
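Finally, the shared `_compression_to_extension` dict moves out of `test_compression.py` into `pandas/tests/io/conftest.py` as a fixture, so the other io test modules stop importing from a sibling test module. A sketch of the conftest entry, assuming `icom.extension_to_compression` is available as in this diff:

```python
import pytest

import pandas.io.common as icom


@pytest.fixture
def compression_to_extension():
    # Invert the library's extension -> compression mapping into
    # compression -> extension for building test file names
    return {value: key for key, value in icom.extension_to_compression.items()}
```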