Skip to content

Commit 3338e0f

Browse files
authored
Read pixel parquet more efficiently (#601)
* Accept is_dir arg in read_parquet call
* Commit changes from code review
1 parent 81a5844 commit 3338e0f

File tree

1 file changed

+16
-2
lines changed

1 file changed

+16
-2
lines changed

src/hats/io/file_io/file_io.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,13 +365,22 @@ def delete_file(file_handle: str | Path | UPath):
365365
file_handle.unlink()
366366

367367

368-
def read_parquet_file_to_pandas(file_pointer: str | Path | UPath, **kwargs) -> npd.NestedFrame:
368+
def read_parquet_file_to_pandas(
369+
file_pointer: str | Path | UPath, is_dir: bool | None = None, **kwargs
370+
) -> npd.NestedFrame:
369371
"""Reads parquet file(s) to a pandas DataFrame
370372
371373
Parameters
372374
----------
373375
file_pointer: str | Path | UPath
374376
File Pointer to a parquet file or a directory containing parquet files
377+
is_dir : bool | None
378+
If True, the pointer represents a pixel directory; if False, the pointer
379+
represents a file. In both cases there is no need to check the pointer's
380+
content type. If `is_dir` is None (default), this method will resort to
381+
`upath.is_dir()` to identify the type of pointer. Inferring the type for
382+
HTTP is particularly expensive because it requires downloading the contents
383+
of the pointer in its entirety.
375384
**kwargs
376385
Additional arguments to pass to pandas read_parquet method
377386
@@ -381,18 +390,23 @@ def read_parquet_file_to_pandas(file_pointer: str | Path | UPath, **kwargs) -> n
381390
Pandas DataFrame with the data from the parquet file(s)
382391
"""
383392
file_pointer = get_upath(file_pointer)
393+
384394
# If we are trying to read a remote directory, we need to send the explicit list of files instead.
385395
# We don't want to get the list unnecessarily because it can be expensive.
386-
if file_pointer.protocol not in ("", "file") and file_pointer.is_dir(): # pragma: no cover
396+
if is_dir is None:
397+
is_dir = file_pointer.is_dir()
398+
if file_pointer.protocol not in ("", "file") and is_dir: # pragma: no cover
387399
file_pointers = [f.path for f in file_pointer.iterdir() if f.is_file()]
388400
return npd.read_parquet(
389401
file_pointers,
390402
filesystem=file_pointer.fs,
391403
partitioning=None, # Avoid the ArrowTypeError described in #367
392404
**kwargs,
393405
)
406+
394407
if _parquet_precache_all_bytes(file_pointer): # pragma: no cover
395408
return npd.read_parquet(BytesIO(file_pointer.read_bytes()), partitioning=None, **kwargs)
409+
396410
return npd.read_parquet(
397411
file_pointer.path,
398412
filesystem=file_pointer.fs,

0 commit comments

Comments
 (0)