[Data] - Update PyArrow version to 23.0 for release tests + Update moto to 5.x.x (ray-project#59489)

goutamvenkat-anyscale · alexeykudinkin · ans9868 · commit 3cfd5c1ea76d · 2026-02-18T15:50:24.000-05:00
## Description

PyArrow 22 uses a newer AWS SDK that sends S3 requests with HTTP chunked transfer encoding and trailer checksums (`x-amz-checksum-crc64nvme`). Our old moto version (4.2.12) does not parse this encoding correctly, so the raw HTTP wire format leaks into test responses:

```
Expected: b'spam'
Got: b'4\r\nspam\r\n0\r\nx-amz-checksum-crc64nvme:...\r\n\r\n'
```

Related issue from moto: getmoto/moto#7198

## Related issues

> Link related issues: "Fixes ray-project#1234", "Closes ray-project#1234", or "Related to ray-project#1234".

## Additional information

> Optional: Add implementation details, API changes, usage examples, screenshots, etc.

---------

Signed-off-by: Alexey Kudinkin <ak@anyscale.com>
Signed-off-by: Goutam <goutam@anyscale.com>
Co-authored-by: Alexey Kudinkin <ak@anyscale.com>
Signed-off-by: Adel Nour <ans9868@nyu.edu>
diff --git a/.buildkite/data.rayci.yml b/.buildkite/data.rayci.yml
@@ -99,7 +99,7 @@ steps:
         --only-tags data_non_parallel
     depends_on: data9build-multipy
 
-  - label: ":database: data: arrow v21 tests"
+  - label: ":database: data: arrow v23 tests"
     tags:
       - python
       - data
@@ -113,7 +113,7 @@ steps:
         --except-tags data_integration,doctest,data_non_parallel,dask,needs_credentials,tensorflow_datasets
     depends_on: datalbuild-multipy
 
-  - label: ":database: data: arrow v21 tests (data_non_parallel)"
+  - label: ":database: data: arrow v23 tests (data_non_parallel)"
     tags:
       - python
       - data
@@ -128,7 +128,7 @@ steps:
         --only-tags data_non_parallel
     depends_on: datalbuild-multipy
 
-  - label: ":database: data: arrow v21 py{{matrix.python}} tests ({{matrix.worker_id}})"
+  - label: ":database: data: arrow v23 py{{matrix.python}} tests ({{matrix.worker_id}})"
     key: datal_python_tests
     if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
     tags:
@@ -145,7 +145,7 @@ steps:
         python: ["3.12"]
         worker_id: ["0", "1"]
 
-  - label: ":database: data: arrow v21 py{{matrix.python}} tests (data_non_parallel)"
+  - label: ":database: data: arrow v23 py{{matrix.python}} tests (data_non_parallel)"
     key: datal_python_non_parallel_tests
     if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
     tags:
diff --git a/ci/docker/datal.build.wanda.yaml b/ci/docker/datal.build.wanda.yaml
@@ -11,6 +11,6 @@ srcs:
   - python/requirements/ml/data-test-requirements.txt
 build_args:
   - DOCKER_IMAGE_BASE_BUILD=cr.ray.io/rayproject/oss-ci-base_ml-py$PYTHON
-  - ARROW_VERSION=21.*
+  - ARROW_VERSION=23.*
 tags:
   - cr.ray.io/rayproject/databuild-py$PYTHON
diff --git a/python/ray/data/tests/mock_server.py b/python/ray/data/tests/mock_server.py
@@ -60,7 +60,9 @@ def start_service(service_name, host, port):
     # Always use port conflict resolution to be safe
     port = _find_available_port(host, port)
 
-    args = [moto_svr_path, service_name, "-H", host, "-p", str(port)]
+    # moto 5.x no longer accepts a service name argument - all services
+    # are served on a single endpoint
+    args = [moto_svr_path, "-H", host, "-p", str(port)]
     # For debugging
     # args = '{0} {1} -H {2} -p {3} 2>&1 | \
     # tee -a /tmp/moto.log'.format(moto_svr_path, service_name, host, port)
diff --git a/python/ray/tests/mock_s3_server.py b/python/ray/tests/mock_s3_server.py
@@ -18,7 +18,9 @@
 
 def start_service(service_name, host, port):
     moto_svr_path = shutil.which("moto_server")
-    args = [moto_svr_path, service_name, "-H", host, "-p", str(port)]
+    # moto 5.x no longer accepts a service name argument - all services
+    # are served on a single endpoint
+    args = [moto_svr_path, "-H", host, "-p", str(port)]
     process = sp.Popen(
         args, stdin=sp.PIPE, stdout=sp.DEVNULL, stderr=sp.DEVNULL
     )  # shell=True
diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py
@@ -35,11 +35,10 @@
 from unittest import mock
 from unittest.mock import MagicMock, patch
 
-import moto
 import pytest
 import yaml
 from click.testing import CliRunner
-from moto import mock_ec2, mock_iam
+from moto import mock_aws
 from testfixtures import Replacer
 from testfixtures.popen import MockPopen, PopenBehaviour
 
@@ -113,18 +112,25 @@ def configure_aws():
     os.environ["AWS_SESSION_TOKEN"] = "testing"
 
     # moto (boto3 mock) only allows a hardcoded set of AMIs
-    dlami = (
-        moto.ec2.models.ec2_backends["us-west-2"]["us-west-2"]
-        .describe_images(filters={"name": "Deep Learning AMI Ubuntu*"})[0]
-        .id
-    )
-    aws_config.DEFAULT_AMI["us-west-2"] = dlami
-    list_instances_mock = MagicMock(return_value=boto3_list)
-    with patch(
-        "ray.autoscaler._private.aws.node_provider.list_ec2_instances",
-        list_instances_mock,
-    ):
-        yield
+    # Use mock_aws context manager and boto3 to find the AMI
+    import boto3
+
+    # In moto 5.x, AWS managed policies (e.g., AmazonEC2FullAccess) are not
+    # loaded by default for performance. Enable them since the autoscaler
+    # attaches these policies to the IAM role.
+    with mock_aws(config={"iam": {"load_aws_managed_policies": True}}):
+        ec2_client = boto3.client("ec2", region_name="us-west-2")
+        images = ec2_client.describe_images(
+            Filters=[{"Name": "name", "Values": ["Deep Learning AMI Ubuntu*"]}]
+        )["Images"]
+        dlami = images[0]["ImageId"]
+        aws_config.DEFAULT_AMI["us-west-2"] = dlami
+        list_instances_mock = MagicMock(return_value=boto3_list)
+        with patch(
+            "ray.autoscaler._private.aws.node_provider.list_ec2_instances",
+            list_instances_mock,
+        ):
+            yield
 
 
 @pytest.fixture(scope="function")
@@ -636,8 +642,6 @@ def test_ray_start_block_and_stop(configure_lang, monkeypatch, tmp_path, cleanup
     sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
     reason=("Mac builds don't provide proper locale support"),
 )
-@mock_ec2
-@mock_iam
 def test_ray_up(
     configure_lang, _unlink_test_ssh_key, configure_aws, monkeypatch, tmp_path
 ):
@@ -677,8 +681,6 @@ def commands_mock(command, stdin):
     sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
     reason=("Mac builds don't provide proper locale support"),
 )
-@mock_ec2
-@mock_iam
 def test_ray_up_docker(
     configure_lang, _unlink_test_ssh_key, configure_aws, monkeypatch, tmp_path
 ):
@@ -720,8 +722,6 @@ def commands_mock(command, stdin):
     sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
     reason=("Mac builds don't provide proper locale support"),
 )
-@mock_ec2
-@mock_iam
 def test_ray_up_record(
     configure_lang, _unlink_test_ssh_key, configure_aws, monkeypatch, tmp_path
 ):
@@ -754,8 +754,6 @@ def commands_mock(command, stdin):
     sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
     reason=("Mac builds don't provide proper locale support"),
 )
-@mock_ec2
-@mock_iam
 def test_ray_attach(configure_lang, configure_aws, _unlink_test_ssh_key):
     def commands_mock(command, stdin):
         # TODO(maximsmol): this is a hack since stdout=sys.stdout
@@ -796,8 +794,6 @@ def commands_mock(command, stdin):
     sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
     reason=("Mac builds don't provide proper locale support"),
 )
-@mock_ec2
-@mock_iam
 def test_ray_attach_with_ip(configure_lang, configure_aws, _unlink_test_ssh_key):
     from ray.autoscaler._private.commands import get_worker_node_ips
 
@@ -876,8 +872,6 @@ def commands_verifier(calls):
     sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
     reason=("Mac builds don't provide proper locale support"),
 )
-@mock_ec2
-@mock_iam
 def test_ray_dashboard(configure_lang, configure_aws, _unlink_test_ssh_key):
     def commands_mock(command, stdin):
         # TODO(maximsmol): this is a hack since stdout=sys.stdout
@@ -910,8 +904,6 @@ def commands_mock(command, stdin):
     sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
     reason=("Mac builds don't provide proper locale support"),
 )
-@mock_ec2
-@mock_iam
 def test_ray_exec(configure_lang, configure_aws, _unlink_test_ssh_key):
     def commands_mock(command, stdin):
         # TODO(maximsmol): this is a hack since stdout=sys.stdout
@@ -963,8 +955,6 @@ def commands_verifier(calls):
     sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
     reason=("Mac builds don't provide proper locale support"),
 )
-@mock_ec2
-@mock_iam
 def test_ray_submit(configure_lang, configure_aws, _unlink_test_ssh_key):
     def commands_mock(command, stdin):
         # TODO(maximsmol): this is a hack since stdout=sys.stdout
@@ -1355,8 +1345,6 @@ def test_ray_drain_node(monkeypatch):
     sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
     reason=("Mac builds don't provide proper locale support"),
 )
-@mock_ec2
-@mock_iam
 def test_ray_cluster_dump(configure_lang, configure_aws, _unlink_test_ssh_key):
     def commands_mock(command, stdin):
         print("This is a test!")
diff --git a/python/requirements/ml/data-requirements.txt b/python/requirements/ml/data-requirements.txt
@@ -14,5 +14,5 @@ modin==0.22.2; python_version < '3.12'
 pandas==1.5.3; python_version < '3.12'
 modin==0.31.0; python_version >= '3.12'
 pandas==2.2.2; python_version >= '3.12'
-responses==0.13.4
+responses>=0.15.0
 pymars>=0.8.3; python_version < "3.12"
diff --git a/python/requirements/test-requirements.txt b/python/requirements/test-requirements.txt
@@ -31,7 +31,7 @@ jsonpatch==1.32
 kubernetes==24.2.0
 llvmlite==0.42.0
 lxml>=6.0.2
-moto[s3,server]==4.2.12
+moto[s3,server]==5.1.18
 mypy==1.7.0
 numba==0.59.1
 openpyxl==3.0.10
diff --git a/python/requirements_compiled.txt b/python/requirements_compiled.txt
@@ -88,6 +88,7 @@ annotated-types==0.6.0
 antlr4-python3-runtime==4.11.1
     # via
     #   fugue-sql-antlr
+    #   moto
     #   qpd
 anyio==4.12.0
     # via
@@ -392,14 +393,13 @@ cryptography==44.0.3
     #   azure-cli-core
     #   azure-identity
     #   azure-storage-blob
+    #   joserfc
     #   moto
     #   msal
     #   paramiko
     #   pyjwt
     #   pyopenssl
-    #   python-jose
     #   snowflake-connector-python
-    #   sshpubkeys
     #   trustme
 cupy-cuda12x==13.4.0 ; sys_platform != "darwin"
     # via
@@ -484,11 +484,6 @@ docutils==0.19
     #   sphinx
 dulwich==0.21.6
     # via comet-ml
-ecdsa==0.18.0
-    # via
-    #   moto
-    #   python-jose
-    #   sshpubkeys
 entrypoints==0.4
     # via nbconvert
 et-xmlfile==1.1.0
@@ -889,17 +884,19 @@ joblib==1.2.0
     # via
     #   -r python/requirements/test-requirements.txt
     #   scikit-learn
+joserfc==1.5.0
+    # via moto
 jschema-to-python==1.2.3
     # via cfn-lint
 json5==0.9.14
     # via jupyterlab-server
-jsondiff==2.0.0
-    # via moto
 jsonpatch==1.32
     # via
     #   -r python/requirements/cloud-requirements.txt
     #   -r python/requirements/test-requirements.txt
     #   cfn-lint
+jsonpath-ng==1.7.0
+    # via moto
 jsonpickle==3.0.2
     # via jschema-to-python
 jsonpointer==2.4
@@ -1089,7 +1086,7 @@ more-itertools==10.7.0
     # via configspace
 mosaicml==0.3.1 ; python_version < "3.12"
     # via -r python/requirements/ml/train-test-requirements.txt
-moto==4.2.12
+moto==5.1.18
     # via -r python/requirements/test-requirements.txt
 moviepy==0.2.3.1
     # via -r python/requirements/ml/rllib-test-requirements.txt
@@ -1497,6 +1494,8 @@ plotly==5.23.0
     # via ax-platform
 pluggy==1.3.0
     # via pytest
+ply==3.11
+    # via jsonpath-ng
 polars==1.36.1
     # via -r python/requirements/test-requirements.txt
 polars-runtime-32==1.36.1
@@ -1579,7 +1578,7 @@ py==1.11.0
     # via pytest-forked
 py-cpuinfo==9.0.0
     # via deepspeed
-py-partiql-parser==0.5.0
+py-partiql-parser==0.6.3
     # via moto
 py-spy==0.4.0 ; python_version < "3.12"
     # via -r python/requirements.txt
@@ -1605,7 +1604,6 @@ pyasn1==0.5.1
     # via
     #   oauth2client
     #   pyasn1-modules
-    #   python-jose
     #   rsa
 pyasn1-modules==0.3.0
     # via
@@ -1786,8 +1784,6 @@ python-dateutil==2.8.2
     #   strictyaml
 python-dotenv==1.2.1
     # via testcontainers
-python-jose==3.3.0
-    # via moto
 python-json-logger==2.0.7
     # via jupyter-events
 python-lsp-jsonrpc==1.0.0
@@ -1842,6 +1838,7 @@ pyyaml==6.0.3
     #   pymars
     #   pytorch-lightning
     #   ray
+    #   responses
     #   timm
     #   transformers
     #   wandb
@@ -1926,7 +1923,7 @@ requests-oauthlib==2.0.0
     #   msrest
 requests-toolbelt==1.0.0
     # via comet-ml
-responses==0.13.4
+responses==0.25.8
     # via
     #   -r python/requirements/ml/data-requirements.txt
     #   moto
@@ -1965,7 +1962,6 @@ rsa==4.7.2
     #   gcs-oauth2-boto-plugin
     #   google-auth
     #   oauth2client
-    #   python-jose
 ruamel-yaml==0.17.40
     # via
     #   semgrep
@@ -2058,7 +2054,6 @@ six==1.16.0
     #   azure-core
     #   bleach
     #   configobj
-    #   ecdsa
     #   fs
     #   gcs-oauth2-boto-plugin
     #   google-apitools
@@ -2080,7 +2075,6 @@ six==1.16.0
     #   python-dateutil
     #   pyu2f
     #   pyvmomi
-    #   responses
     #   rfc3339-validator
     #   tensorboard
     #   tensorflow
@@ -2139,8 +2133,6 @@ sqlglot==25.6.1
     # via fugue
 sqlparse==0.5.1
     # via mlflow-skinny
-sshpubkeys==3.3.1
-    # via moto
 stack-data==0.6.3
     # via ipython
 stanio==0.3.0