@@ -365,13 +365,22 @@ def delete_file(file_handle: str | Path | UPath):
365365 file_handle .unlink ()
366366
367367
368- def read_parquet_file_to_pandas (file_pointer : str | Path | UPath , ** kwargs ) -> npd .NestedFrame :
368+ def read_parquet_file_to_pandas (
369+ file_pointer : str | Path | UPath , is_dir : bool | None = None , ** kwargs
370+ ) -> npd .NestedFrame :
369371 """Reads parquet file(s) to a pandas DataFrame
370372
371373 Parameters
372374 ----------
373375 file_pointer: str | Path | UPath
374376 File Pointer to a parquet file or a directory containing parquet files
377+ is_dir : bool | None
378+ If True, the pointer is treated as a pixel directory; if False, it is
379+ treated as a single file. In both cases the pointer's type is not checked.
380+ If `is_dir` is None (default), this method will resort to
381+ `UPath.is_dir()` to identify the type of pointer. Inferring the type over
382+ HTTP is particularly expensive because it requires downloading the contents
383+ of the pointer in its entirety.
375384 **kwargs
376385 Additional arguments to pass to pandas read_parquet method
377386
@@ -381,18 +390,23 @@ def read_parquet_file_to_pandas(file_pointer: str | Path | UPath, **kwargs) -> n
381390 Pandas DataFrame with the data from the parquet file(s)
382391 """
383392 file_pointer = get_upath (file_pointer )
393+
384394 # If we are trying to read a remote directory, we need to send the explicit list of files instead.
385395 # We don't want to get the list unnecessarily because it can be expensive.
386- if file_pointer .protocol not in ("" , "file" ) and file_pointer .is_dir (): # pragma: no cover
396+ if is_dir is None :
397+ is_dir = file_pointer .is_dir ()
398+ if file_pointer .protocol not in ("" , "file" ) and is_dir : # pragma: no cover
387399 file_pointers = [f .path for f in file_pointer .iterdir () if f .is_file ()]
388400 return npd .read_parquet (
389401 file_pointers ,
390402 filesystem = file_pointer .fs ,
391403 partitioning = None , # Avoid the ArrowTypeError described in #367
392404 ** kwargs ,
393405 )
406+
394407 if _parquet_precache_all_bytes (file_pointer ): # pragma: no cover
395408 return npd .read_parquet (BytesIO (file_pointer .read_bytes ()), partitioning = None , ** kwargs )
409+
396410 return npd .read_parquet (
397411 file_pointer .path ,
398412 filesystem = file_pointer .fs ,
0 commit comments