Read pixel parquet more efficiently (#601)

camposandro · web-flow · commit 3338e0f8ef60 · 2025-11-21T15:07:00.000-05:00
* Accept is_dir arg in read_parquet call

* Commit changes from code review
diff --git a/src/hats/io/file_io/file_io.py b/src/hats/io/file_io/file_io.py
@@ -365,13 +365,22 @@ def delete_file(file_handle: str | Path | UPath):
     file_handle.unlink()
 
 
-def read_parquet_file_to_pandas(file_pointer: str | Path | UPath, **kwargs) -> npd.NestedFrame:
+def read_parquet_file_to_pandas(
+    file_pointer: str | Path | UPath, is_dir: bool | None = None, **kwargs
+) -> npd.NestedFrame:
     """Reads parquet file(s) to a pandas DataFrame
 
     Parameters
     ----------
     file_pointer: str | Path | UPath
         File Pointer to a parquet file or a directory containing parquet files
+    is_dir : bool | None
+        If True, the pointer represents a pixel directory, otherwise, the pointer
+        represents a file. In both cases there is no need to check the pointer's
+        content type. If `is_dir` is None (default), this method will resort to
+        `upath.is_dir()` to identify the type of pointer. Inferring the type for
+        HTTP is particularly expensive because it requires downloading the contents
+        of the pointer in its entirety.
     **kwargs
         Additional arguments to pass to pandas read_parquet method
 
@@ -381,18 +390,23 @@ def read_parquet_file_to_pandas(file_pointer: str | Path | UPath, **kwargs) -> n
         Pandas DataFrame with the data from the parquet file(s)
     """
     file_pointer = get_upath(file_pointer)
+
     # If we are trying to read a remote directory, we need to send the explicit list of files instead.
     # We don't want to get the list unnecessarily because it can be expensive.
-    if file_pointer.protocol not in ("", "file") and file_pointer.is_dir():  # pragma: no cover
+    if is_dir is None:
+        is_dir = file_pointer.is_dir()
+    if file_pointer.protocol not in ("", "file") and is_dir:  # pragma: no cover
         file_pointers = [f.path for f in file_pointer.iterdir() if f.is_file()]
         return npd.read_parquet(
             file_pointers,
             filesystem=file_pointer.fs,
             partitioning=None,  # Avoid the ArrowTypeError described in #367
             **kwargs,
         )
+
     if _parquet_precache_all_bytes(file_pointer):  # pragma: no cover
         return npd.read_parquet(BytesIO(file_pointer.read_bytes()), partitioning=None, **kwargs)
+
     return npd.read_parquet(
         file_pointer.path,
         filesystem=file_pointer.fs,