Commit 69927ec

bveeramani and dstrodtman authored and committed

[Data] Fix broken code snippets in user guides (#55519)

In #51334, we discovered we weren't actually testing code snippets in our user guides. As a result, there are several broken code snippets in our guides. This PR fixes some of those code snippets and re-enables testing on the user guides.

Signed-off-by: Balaji Veeramani <bveeramani@berkeley.edu>
Signed-off-by: Douglas Strodtman <douglas@anyscale.com>

1 parent 1fbef17 commit 69927ec

File tree

9 files changed (+97, -51 lines)


bazel/python.bzl

Lines changed: 18 additions & 0 deletions

@@ -17,6 +17,24 @@ def _convert_target_to_import_path(t):
     # 3) Replace '/' with '.' to form an import path.
     return t.replace("/", ".")
 
+def doctest_each(files, gpu = False, deps=[], srcs=[], data=[], args=[], size="medium", tags=[], pytest_plugin_file="//bazel:default_doctest_pytest_plugin.py", **kwargs):
+    # Unlike the `doctest` macro, `doctest_each` runs `pytest` on each file separately.
+    # This is useful to run tests in parallel and more clearly report the test results.
+    for file in files:
+        doctest(
+            files = [file],
+            gpu = gpu,
+            name = paths.split_extension(file)[0],
+            deps = deps,
+            srcs = srcs,
+            data = data,
+            args = args,
+            size = size,
+            tags = tags,
+            pytest_plugin_file = pytest_plugin_file,
+            **kwargs
+        )
+
 def doctest(files, gpu = False, name="doctest", deps=[], srcs=[], data=[], args=[], size="medium", tags=[], pytest_plugin_file="//bazel:default_doctest_pytest_plugin.py", **kwargs):
     # NOTE: If you run `pytest` on `__init__.py`, it tries to test all files in that
     # package. We don't want that, so we exclude it from the list of input files.
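The macro above names each generated target after its file path with the extension stripped, via `paths.split_extension(file)[0]`. A minimal Python sketch of that naming scheme (`doctest_target_names` is a hypothetical helper mirroring the Starlark call, not part of the repo):

```python
import os

def doctest_target_names(files):
    # Mirror Starlark's `paths.split_extension(file)[0]`: one target per
    # file, named after the file path without its extension.
    return [os.path.splitext(f)[0] for f in files]

names = doctest_target_names([
    "source/data/loading-data.rst",
    "source/data/saving-data.rst",
])
print(names)  # ['source/data/loading-data', 'source/data/saving-data']
```

Because each file becomes its own target, a failure in one guide no longer hides failures in the others, and Bazel can schedule the per-file tests in parallel.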

doc/BUILD.bazel

Lines changed: 4 additions & 11 deletions

@@ -1,6 +1,6 @@
 load("@py_deps_buildkite//:requirements.bzl", ci_require = "requirement")
 load("@rules_python//python:defs.bzl", "py_test")
-load("//bazel:python.bzl", "doctest", "py_test_run_all_notebooks", "py_test_run_all_subdirectory")
+load("//bazel:python.bzl", "doctest", "doctest_each", "py_test_run_all_notebooks", "py_test_run_all_subdirectory")
 
 exports_files(["test_myst_doc.py"])
 
@@ -480,8 +480,7 @@ doctest(
     tags = ["team:core"],
 )
 
-doctest(
-    name = "doctest[data]",
+doctest_each(
     files = glob(
         include = [
             "source/data/**/*.md",
@@ -492,15 +491,9 @@ doctest(
             "source/data/batch_inference.rst",
             "source/data/transforming-data.rst",
             # These tests are currently failing.
-            "source/data/loading-data.rst",
-            "source/data/data-internals.rst",
-            "source/data/inspecting-data.rst",
-            "source/data/loading-data.rst",
-            "source/data/performance-tips.rst",
-            "source/data/saving-data.rst",
-            "source/data/working-with-images.rst",
             "source/data/working-with-llms.rst",
-            "source/data/working-with-pytorch.rst",
+            # These don't contain code snippets.
+            "source/data/api/**/*.rst",
         ],
     ),
     pytest_plugin_file = "//python/ray/data:tests/doctest_pytest_plugin.py",

doc/source/data/data-internals.rst

Lines changed: 8 additions & 1 deletion

@@ -179,12 +179,19 @@ To add custom optimization rules, implement a class that extends ``Rule`` and co
 
     import ray
     from ray.data._internal.logical.interfaces import Rule
+    from ray.data._internal.logical.optimizers import get_logical_ruleset
 
     class CustomRule(Rule):
        def apply(self, plan):
            ...
 
-    ray.data._internal.logical.optimizers.DEFAULT_LOGICAL_RULES.append(CustomRule)
+    logical_ruleset = get_logical_ruleset()
+    logical_ruleset.add(CustomRule)
+
+.. testcode::
+    :hide:
+
+    logical_ruleset.remove(CustomRule)
 
 Types of physical operators
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
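The updated doc snippet moves from appending to a module-level rule list to a ruleset object with `add`/`remove`, which lets the hidden cleanup testcode undo the registration. A toy Python sketch of that interface (the `Ruleset` class here is hypothetical, not Ray's internal implementation):

```python
class Rule:
    """Base class for the sketch; real rules extend Ray's internal Rule."""
    def apply(self, plan):
        raise NotImplementedError

class Ruleset:
    """Toy registry exposing the add/remove interface used in the doc snippet."""
    def __init__(self):
        self._rules = []

    def add(self, rule_cls):
        self._rules.append(rule_cls)

    def remove(self, rule_cls):
        self._rules.remove(rule_cls)

    def optimize(self, plan):
        # Apply each registered rule in order.
        for rule_cls in self._rules:
            plan = rule_cls().apply(plan)
        return plan

class CustomRule(Rule):
    def apply(self, plan):
        return plan  # no-op rule for the sketch

ruleset = Ruleset()
ruleset.add(CustomRule)
assert ruleset.optimize("plan") == "plan"
ruleset.remove(CustomRule)  # mirrors the hidden cleanup testcode
```

The `remove` step matters for doctests specifically: without it, a rule registered by one snippet would leak into every later snippet in the same test process.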

doc/source/data/inspecting-data.rst

Lines changed: 4 additions & 2 deletions

@@ -123,12 +123,11 @@ of the returned batch, set ``batch_format``.
     print(batch)
 
 .. testoutput::
-    :options: +NORMALIZE_WHITESPACE
+    :options: +MOCK
 
        sepal length (cm)  sepal width (cm)  ...  petal width (cm)  target
     0                5.1               3.5  ...               0.2       0
     1                4.9               3.0  ...               0.2       0
-    <BLANKLINE>
 
 For more information on working with batches, see
 :ref:`Transforming batches <transforming_batches>` and
@@ -143,7 +142,10 @@ Ray Data calculates statistics during execution for each operator, such as wall
 To view stats about your :class:`Datasets <ray.data.Dataset>`, call :meth:`Dataset.stats() <ray.data.Dataset.stats>` on an executed dataset. The stats are also persisted under `/tmp/ray/session_*/logs/ray-data/ray-data.log`.
 For more on how to read this output, see :ref:`Monitoring Your Workload with the Ray Data Dashboard <monitoring-your-workload>`.
 
+.. This snippet below is skipped because of https://github.com/ray-project/ray/issues/54101.
+
 .. testcode::
+    :skipif: True
 
     import ray
     import datasets

doc/source/data/loading-data.rst

Lines changed: 23 additions & 11 deletions

@@ -486,13 +486,16 @@ Ray Data interoperates with distributed data processing frameworks like `Daft <h
 :func:`~ray.data.from_daft`. This function executes the Daft dataframe and constructs a ``Dataset`` backed by the resultant arrow data produced
 by your Daft query.
 
+.. warning::
+    :func:`~ray.data.from_daft` doesn't work with PyArrow 14 and later. For more
+    information, see `this issue <https://github.com/ray-project/ray/issues/54837>`__.
+
 .. testcode::
+    :skipif: True
 
     import daft
     import ray
 
-    ray.init()
-
     df = daft.from_pydict({"int_col": [i for i in range(10000)], "str_col": [str(i) for i in range(10000)]})
     ds = ray.data.from_daft(df)
 
@@ -512,7 +515,12 @@ Ray Data interoperates with distributed data processing frameworks like `Daft <h
 ``Dataset`` backed by the distributed Pandas DataFrame partitions that underly
 the Dask DataFrame.
 
+..
+    We skip the code snippet below because `from_dask` doesn't work with PyArrow
+    14 and later. For more information, see https://github.com/ray-project/ray/issues/54837
+
 .. testcode::
+    :skipif: True
 
     import dask.dataframe as dd
     import pandas as pd
@@ -569,21 +577,21 @@ Ray Data interoperates with distributed data processing frameworks like `Daft <h
 call :func:`~ray.data.read_iceberg`. This function creates a ``Dataset`` backed by
 the distributed files that underlie the Iceberg table.
 
-..
-
 .. testcode::
     :skipif: True
 
-    >>> import ray
-    >>> from pyiceberg.expressions import EqualTo
-    >>> ds = ray.data.read_iceberg(
-    ...     table_identifier="db_name.table_name",
-    ...     row_filter=EqualTo("column_name", "literal_value"),
-    ...     catalog_kwargs={"name": "default", "type": "glue"}
-    ... )
+    import ray
+    from pyiceberg.expressions import EqualTo
 
+    ds = ray.data.read_iceberg(
+        table_identifier="db_name.table_name",
+        row_filter=EqualTo("column_name", "literal_value"),
+        catalog_kwargs={"name": "default", "type": "glue"}
+    )
+    ds.show(3)
 
 .. testoutput::
+    :options: +MOCK
 
     {'col1': 0, 'col2': '0'}
     {'col1': 1, 'col2': '1'}
@@ -622,6 +630,7 @@ Ray Data interoperates with distributed data processing frameworks like `Daft <h
 DataFrame.
 
 .. testcode::
+    :skipif: True
 
     import mars
     import mars.dataframe as md
@@ -668,7 +677,10 @@ Ray Data interoperates with HuggingFace, PyTorch, and TensorFlow datasets.
 `IterableDatasetDict <https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.IterableDatasetDict>`_
 objects aren't supported.
 
+.. This snippet below is skipped because of https://github.com/ray-project/ray/issues/54837.
+
 .. testcode::
+    :skipif: True
 
     import ray.data
     from datasets import load_dataset
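The Daft snippet in the first hunk builds a 10,000-row, two-column table from a dict of columns. In plain Python, the columnar layout passed to `daft.from_pydict` is just a dict mapping column names to equal-length lists:

```python
# Columnar layout like the one passed to `daft.from_pydict` above:
# column name -> list of values, all columns the same length.
data = {
    "int_col": [i for i in range(10000)],
    "str_col": [str(i) for i in range(10000)],
}

num_rows = len(data["int_col"])
# Every column must have the same number of rows.
assert all(len(col) == num_rows for col in data.values())
print(num_rows)  # 10000
```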

doc/source/data/performance-tips.rst

Lines changed: 12 additions & 6 deletions

@@ -51,7 +51,7 @@ For example, the following code batches multiple files into the same read task t
     ray.init(num_cpus=2)
 
     # Repeat the iris.csv file 16 times.
-    ds = ray.data.read_csv(["example://iris.csv"] * 16)
+    ds = ray.data.read_csv(["s3://anonymous@ray-example-data/iris.csv"] * 16)
     print(ds.materialize())
 
 .. testoutput::
@@ -81,7 +81,7 @@ Notice how the number of output blocks is equal to ``override_num_blocks`` in th
     ray.init(num_cpus=2)
 
     # Repeat the iris.csv file 16 times.
-    ds = ray.data.read_csv(["example://iris.csv"] * 16, override_num_blocks=16)
+    ds = ray.data.read_csv(["s3://anonymous@ray-example-data/iris.csv"] * 16, override_num_blocks=16)
     print(ds.materialize())
 
 .. testoutput::
@@ -143,7 +143,7 @@ For example, the following code executes :func:`~ray.data.read_csv` with only on
     # Pretend there are two CPUs.
     ray.init(num_cpus=2)
 
-    ds = ray.data.read_csv("example://iris.csv").map(lambda row: row)
+    ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv").map(lambda row: row)
     print(ds.materialize().stats())
 
 .. testoutput::
@@ -171,7 +171,7 @@ For example, this code sets the number of files equal to ``override_num_blocks``
     # Pretend there are two CPUs.
     ray.init(num_cpus=2)
 
-    ds = ray.data.read_csv("example://iris.csv", override_num_blocks=1).map(lambda row: row)
+    ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv", override_num_blocks=1).map(lambda row: row)
     print(ds.materialize().stats())
 
 .. testoutput::
@@ -205,15 +205,21 @@ calling :func:`~ray.data.Dataset.select_columns`, since column selection is push
 .. testcode::
 
     import ray
+
     # Read just two of the five columns of the Iris dataset.
-    ray.data.read_parquet(
+    ds = ray.data.read_parquet(
         "s3://anonymous@ray-example-data/iris.parquet",
         columns=["sepal.length", "variety"],
    )
+
+    print(ds.schema())
 
 .. testoutput::
 
-    Dataset(num_rows=150, schema={sepal.length: double, variety: string})
+    Column        Type
+    ------        ----
+    sepal.length  double
+    variety       string
 
 
 .. _data_memory:
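The two ``override_num_blocks`` snippets above contrast default batching (several files per read task) with one block per file. A toy partitioner illustrates the difference; `split_into_blocks` is a hypothetical helper for illustration, not a Ray API:

```python
def split_into_blocks(paths, override_num_blocks):
    # Distribute file paths round-robin into the requested number of
    # blocks, loosely mimicking how a reader splits its inputs when the
    # block count is overridden.
    blocks = [[] for _ in range(override_num_blocks)]
    for i, path in enumerate(paths):
        blocks[i % override_num_blocks].append(path)
    return blocks

paths = ["s3://anonymous@ray-example-data/iris.csv"] * 16

# Default-style batching: fewer blocks, several files per block.
print(len(split_into_blocks(paths, 4)))   # 4

# override_num_blocks=16: one file per block.
print(len(split_into_blocks(paths, 16)))  # 16
```

Fewer blocks mean less scheduling overhead; more blocks mean more parallelism, which is the trade-off the guide's two examples demonstrate.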

doc/source/data/saving-data.rst

Lines changed: 25 additions & 17 deletions

@@ -228,7 +228,7 @@ number of files & their sizes (since every block could potentially carry the row
     print_directory_tree("/tmp/sales_partitioned")
 
 .. testoutput::
-    :options: +NORMALIZE_WHITESPACE
+    :options: +MOCK
 
     sales_partitioned/
         city=NYC/
@@ -301,24 +301,10 @@ Ray Data interoperates with distributed data processing frameworks like `Daft <h
             ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
 
             df = ds.to_daft()
-
-    .. tab-item:: Dask
-
-        To convert a :class:`~ray.data.dataset.Dataset` to a
-        `Dask DataFrame <https://docs.dask.org/en/stable/dataframe.html>`__, call
-        :meth:`Dataset.to_dask() <ray.data.Dataset.to_dask>`.
-
-        .. testcode::
-
-            import ray
-
-            ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
-
-            df = ds.to_dask()
-
-            df
+            print(df)
 
         .. testoutput::
+            :options: +MOCK
 
             ╭───────────────────┬──────────────────┬───────────────────┬──────────────────┬────────╮
             │ sepal length (cm) ┆ sepal width (cm) ┆ petal length (cm) ┆ petal width (cm) ┆ target │
@@ -345,13 +331,33 @@ Ray Data interoperates with distributed data processing frameworks like `Daft <h
             (Showing first 8 of 150 rows)
 
 
+    .. tab-item:: Dask
+
+        To convert a :class:`~ray.data.dataset.Dataset` to a
+        `Dask DataFrame <https://docs.dask.org/en/stable/dataframe.html>`__, call
+        :meth:`Dataset.to_dask() <ray.data.Dataset.to_dask>`.
+
+        ..
+            We skip the code snippet below because `to_dask` doesn't work with PyArrow
+            14 and later. For more information, see https://github.com/ray-project/ray/issues/54837
+
+        .. testcode::
+            :skipif: True
+
+            import ray
+
+            ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
+
+            df = ds.to_dask()
+
     .. tab-item:: Spark
 
         To convert a :class:`~ray.data.dataset.Dataset` to a `Spark DataFrame
         <https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html>`__,
         call :meth:`Dataset.to_spark() <ray.data.Dataset.to_spark>`.
 
         .. testcode::
+            :skipif: True
 
             import ray
             import raydp
@@ -367,6 +373,7 @@ Ray Data interoperates with distributed data processing frameworks like `Daft <h
             df = ds.to_spark(spark)
 
         .. testcode::
+            :skipif: True
             :hide:
 
             raydp.stop_spark()
@@ -390,6 +397,7 @@ Ray Data interoperates with distributed data processing frameworks like `Daft <h
         :meth:`Dataset.to_mars() <ray.data.Dataset.to_mars>`.
 
         .. testcode::
+            :skipif: True
 
             import ray

doc/source/data/working-with-images.rst

Lines changed: 1 addition & 1 deletion

@@ -147,7 +147,7 @@ To view the full list of supported file formats, see the
 
     Column  Type
     ------  ----
-    image   numpy.ndarray(shape=(32, 32, 3), dtype=uint8)
+    img     struct<bytes: binary, path: string>
     label   int64

doc/source/data/working-with-pytorch.rst

Lines changed: 2 additions & 2 deletions

@@ -229,8 +229,8 @@ You can use built-in Torch transforms from ``torchvision``, ``torchtext``, and `
 
     Column          Type
     ------          ----
-    text            <class 'object'>
-    tokenized_text  <class 'object'>
+    text            string
+    tokenized_text  list<item: string>
 
 .. _batch_inference_pytorch:
