-
Notifications
You must be signed in to change notification settings - Fork 62
Description
I have noticed a wild discrepancy between the sql obtained from DataFrame.sql and the dispatched sql from DataFrame.to_pandas(dry_run=True). The sql code I get from DataFrame.sql is much cleaner and, accordingly to the big query UI, it would consume much less bytes when executed (in my example 156.35 MB). The dispatched sql obtained via DataFrame.to_pandas(dry_run=True) is much heavier and less optimised, requiring a full table scan (~2TB in my example). As far as I understood, the sql from DataFrame.sql does not rely on any cached table.
It would be nice if the dispatched sql would be the optimised one in order to avoid a full table scan on partitioned tables.
Environment details
- OS type and version: Debian GNU/Linux 11 (bullseye)
- Python version: 3.10.16
- pip version:
pip 25.0.1 bigframesversion:2.26.0
Python: 3.10.16 | packaged by conda-forge | (main, Dec 5 2024, 14:16:10) [GCC 13.3.0]
bigframes==2.26.0
google-cloud-bigquery==3.38.0
pandas==2.3.3
pyarrow==21.0.0
sqlglot==27.28.1Code example
import datetime
import bigframes.pandas as bpd
bpd.options.bigquery.project = "<redacted>"
df = bpd.read_gbq("bigquery-public-data.crypto_bitcoin.transactions")
# This table is partitioned by block_timestamp_month
selection = df[df.block_timestamp_month == datetime.date(2025, 10, 1)][["block_timestamp_month", "size"]]
dry_run = selection.to_pandas(dry_run=True)dry_run[dry_run.index == "totalBytesProcessed"].values[0]
#Output: 2365056405079
# ~2.3TBselection.sqlOutputs
SELECT
`bfuid_col_12` AS `block_timestamp_month`,
`bfuid_col_5` AS `size`
FROM
(SELECT
`t0`.`size`,
`t0`.`block_timestamp_month`,
`t0`.`size` AS `bfuid_col_5`,
`t0`.`block_timestamp_month` AS `bfuid_col_12`,
`t0`.`block_timestamp_month` = DATE(2025, 10, 1) AS `bfuid_col_21`
FROM (
SELECT
`size`,
`block_timestamp_month`
FROM `bigquery-public-data.crypto_bitcoin.transactions` FOR SYSTEM_TIME AS OF TIMESTAMP('2025-10-23T12:47:48.073960+00:00')
) AS `t0`
WHERE
`t0`.`block_timestamp_month` = DATE(2025, 10, 1))Which gives an estimated processed bytes of 156.35 MB when pasted on the big query editor.
Checking the dispatched sql I get:
print(dry_run[dry_run.index == "dispatchedSql"].values[0])SELECT
`bfuid_col_3` AS `bfuid_col_3`,
`bfuid_col_12` AS `bfuid_col_12`,
`bfuid_col_5` AS `bfuid_col_5`
FROM
(SELECT
`t1`.`bfuid_col_3`,
`t1`.`bfuid_col_12`,
`t1`.`bfuid_col_5`,
`t1`.`bfuid_col_28` AS `bfuid_col_29`
FROM (
SELECT
ROW_NUMBER() OVER (
ORDER BY CONCAT(
CAST(FARM_FINGERPRINT(
CONCAT(
CONCAT('\\', REPLACE(COALESCE(`t0`.`hash`, ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`size` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`virtual_size` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`version` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`lock_time` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(`t0`.`block_hash`, ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`block_number` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`block_timestamp` AS STRING), ''), '\\', '\\\\')),
CONCAT(
'\\',
REPLACE(COALESCE(CAST(`t0`.`block_timestamp_month` AS STRING), ''), '\\', '\\\\')
),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`input_count` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`output_count` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`input_value` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`output_value` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`is_coinbase` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`fee` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(to_json_string(`t0`.`inputs`), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(to_json_string(`t0`.`outputs`), ''), '\\', '\\\\'))
)
) AS STRING),
CAST(FARM_FINGERPRINT(
CONCAT(
CONCAT(
CONCAT('\\', REPLACE(COALESCE(`t0`.`hash`, ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`size` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`virtual_size` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`version` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`lock_time` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(`t0`.`block_hash`, ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`block_number` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`block_timestamp` AS STRING), ''), '\\', '\\\\')),
CONCAT(
'\\',
REPLACE(COALESCE(CAST(`t0`.`block_timestamp_month` AS STRING), ''), '\\', '\\\\')
),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`input_count` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`output_count` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`input_value` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`output_value` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`is_coinbase` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`fee` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(to_json_string(`t0`.`inputs`), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(to_json_string(`t0`.`outputs`), ''), '\\', '\\\\'))
),
'_'
)
) AS STRING),
CAST(RAND() AS STRING)
) ASC
) - 1 AS `bfuid_col_3`,
`t0`.`size` AS `bfuid_col_5`,
`t0`.`block_timestamp_month` AS `bfuid_col_12`,
`t0`.`block_timestamp_month` = DATE(2025, 10, 1) AS `bfuid_col_21`,
ROW_NUMBER() OVER (
ORDER BY CONCAT(
CAST(FARM_FINGERPRINT(
CONCAT(
CONCAT('\\', REPLACE(COALESCE(`t0`.`hash`, ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`size` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`virtual_size` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`version` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`lock_time` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(`t0`.`block_hash`, ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`block_number` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`block_timestamp` AS STRING), ''), '\\', '\\\\')),
CONCAT(
'\\',
REPLACE(COALESCE(CAST(`t0`.`block_timestamp_month` AS STRING), ''), '\\', '\\\\')
),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`input_count` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`output_count` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`input_value` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`output_value` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`is_coinbase` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`fee` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(to_json_string(`t0`.`inputs`), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(to_json_string(`t0`.`outputs`), ''), '\\', '\\\\'))
)
) AS STRING),
CAST(FARM_FINGERPRINT(
CONCAT(
CONCAT(
CONCAT('\\', REPLACE(COALESCE(`t0`.`hash`, ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`size` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`virtual_size` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`version` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`lock_time` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(`t0`.`block_hash`, ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`block_number` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`block_timestamp` AS STRING), ''), '\\', '\\\\')),
CONCAT(
'\\',
REPLACE(COALESCE(CAST(`t0`.`block_timestamp_month` AS STRING), ''), '\\', '\\\\')
),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`input_count` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`output_count` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`input_value` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`output_value` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`is_coinbase` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(CAST(`t0`.`fee` AS STRING), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(to_json_string(`t0`.`inputs`), ''), '\\', '\\\\')),
CONCAT('\\', REPLACE(COALESCE(to_json_string(`t0`.`outputs`), ''), '\\', '\\\\'))
),
'_'
)
) AS STRING),
CAST(RAND() AS STRING)
) ASC
) - 1 AS `bfuid_col_28`
FROM (
SELECT
`hash`,
`size`,
`virtual_size`,
`version`,
`lock_time`,
`block_hash`,
`block_number`,
`block_timestamp`,
`block_timestamp_month`,
`input_count`,
`output_count`,
`input_value`,
`output_value`,
`is_coinbase`,
`fee`,
`inputs`,
`outputs`
FROM `bigquery-public-data.crypto_bitcoin.transactions` FOR SYSTEM_TIME AS OF TIMESTAMP('2025-10-23T12:47:48.073960+00:00')
) AS `t0`
) AS `t1`
WHERE
`t1`.`bfuid_col_21`)
ORDER BY `bfuid_col_29` ASC NULLS LAST