From ae06c02ec8237b618fd3aecc50d267119c8956a7 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Mon, 31 Mar 2025 12:00:31 -0500 Subject: [PATCH 1/3] fix: `to_pandas_batches()` respects `page_size` and `max_results` again --- bigframes/session/_io/bigquery/__init__.py | 2 ++ tests/system/load/test_large_tables.py | 8 ++++--- tests/system/small/test_dataframe_io.py | 25 ++++++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index d9f1c0f295..4fdd836777 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -245,6 +245,8 @@ def start_query_with_client( location=location, project=project, api_timeout=timeout, + page_size=page_size, + max_results=max_results, ) if metrics is not None: metrics.count_job_stats(query=sql) diff --git a/tests/system/load/test_large_tables.py b/tests/system/load/test_large_tables.py index 472be3d2ad..ee49c2703e 100644 --- a/tests/system/load/test_large_tables.py +++ b/tests/system/load/test_large_tables.py @@ -75,17 +75,19 @@ def test_index_repr_large_table(): def test_to_pandas_batches_large_table(): - df = bpd.read_gbq("load_testing.scalars_1tb") + df = bpd.read_gbq("load_testing.scalars_100gb") _, expected_column_count = df.shape # download only a few batches, since 1tb would be too much - iterable = df.to_pandas_batches(page_size=500, max_results=1500) + iterable = df.to_pandas_batches( + page_size=500, max_results=1500, allow_large_results=True + ) # use page size since client library doesn't support # streaming only part of the dataframe via bqstorage for pdf in iterable: batch_row_count, batch_column_count = pdf.shape assert batch_column_count == expected_column_count - assert batch_row_count > 0 + assert 0 < batch_row_count <= 500 @pytest.mark.skip(reason="See if it caused kokoro build aborted.") diff --git a/tests/system/small/test_dataframe_io.py 
b/tests/system/small/test_dataframe_io.py index cd21f5094c..fbaf4fcb49 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -35,6 +35,7 @@ import bigframes import bigframes.dataframe +import bigframes.enums import bigframes.features import bigframes.pandas as bpd @@ -288,6 +289,30 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): pd.testing.assert_series_equal(actual, expected) +@pytest.mark.parametrize("allow_large_results", (True, False)) +def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results): + """Verify the to_pandas_batches() API returns the expected page size. + + Regression test for b/407521010. + """ + bf_df = session.read_gbq( + "bigquery-public-data.usa_names.usa_1910_2013", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + expected_column_count = len(bf_df.columns) + + batch_count = 0 + for pd_df in bf_df.to_pandas_batches( + page_size=42, allow_large_results=allow_large_results, max_results=42 * 3 + ): + batch_row_count, batch_column_count = pd_df.shape + batch_count += 1 + assert batch_column_count == expected_column_count + assert batch_row_count == 42 + + assert batch_count == 3 + + @pytest.mark.parametrize( ("index",), [(True,), (False,)], From 735da6771ae0aa318d2d21a5eea1f5d02d5457fd Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Mon, 31 Mar 2025 13:43:07 -0500 Subject: [PATCH 2/3] fix lint --- .pre-commit-config.yaml | 4 ++-- noxfile.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8ca120bd07..863a345da1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,11 +31,11 @@ repos: hooks: - id: black - repo: https://github.com/pycqa/flake8 - rev: 6.1.0 + rev: 7.1.2 hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.0 + rev: v1.15.0 hooks: - id: mypy additional_dependencies:
[types-requests, types-tabulate, pandas-stubs<=2.2.3.241126] diff --git a/noxfile.py b/noxfile.py index bb4ba91a3a..5c7a2eacc9 100644 --- a/noxfile.py +++ b/noxfile.py @@ -29,7 +29,9 @@ import nox.sessions BLACK_VERSION = "black==22.3.0" +FLAKE8_VERSION = "flake8==7.1.2" ISORT_VERSION = "isort==5.12.0" +MYPY_VERSION = "mypy==1.15.0" # TODO: switch to 3.13 once remote functions / cloud run adds a runtime for it (internal issue 333742751) LATEST_FULLY_SUPPORTED_PYTHON = "3.12" @@ -135,7 +137,7 @@ def lint(session): Returns a failure if the linters find linting errors or sufficiently serious code quality issues. """ - session.install("flake8", BLACK_VERSION, ISORT_VERSION) + session.install(FLAKE8_VERSION, BLACK_VERSION, ISORT_VERSION) session.run( "isort", "--check", @@ -264,7 +266,7 @@ def mypy(session): deps = ( set( [ - "mypy", + MYPY_VERSION, # TODO: update to latest pandas-stubs once we resolve bigframes issues. "pandas-stubs<=2.2.3.241126", "types-protobuf", From 1a8435ae713ee230c2bc06aa7dc53090ef8417c2 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Mon, 31 Mar 2025 13:52:47 -0500 Subject: [PATCH 3/3] help with session close flakiness --- tests/system/small/test_bq_sessions.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/system/small/test_bq_sessions.py b/tests/system/small/test_bq_sessions.py index e470728061..7aad19bd8f 100644 --- a/tests/system/small/test_bq_sessions.py +++ b/tests/system/small/test_bq_sessions.py @@ -13,6 +13,7 @@ # limitations under the License. 
from concurrent.futures import ThreadPoolExecutor +import time import google import google.api_core.exceptions @@ -58,7 +59,11 @@ def test_bq_session_create_temp_table_clustered(bigquery_client: bigquery.Client session_resource_manager.close() with pytest.raises(google.api_core.exceptions.NotFound): - bigquery_client.get_table(session_table_ref) + # It may take time for the underlying tables to get cleaned up after + # closing the session, so wait at least 1 minute to check. + for _ in range(6): + bigquery_client.get_table(session_table_ref) + time.sleep(10) def test_bq_session_create_multi_temp_tables(bigquery_client: bigquery.Client):