diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index a89f5c21a..9b22e0582 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -93,6 +93,23 @@ jobs: cd .. + - name: Add user to KinD + uses: ./common/github-actions/kind-add-user + with: + user-name: sdk-user + + - name: Configure RBAC for sdk user with limited permissions + run: | + kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses + kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user + kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers + kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user + kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces + kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user + kubectl create clusterrole list-rayclusters --verb=get,list --resource=rayclusters + kubectl create clusterrolebinding sdk-user-list-rayclusters --clusterrole=list-rayclusters --user=sdk-user + kubectl config use-context sdk-user + - name: Run e2e tests run: | export CODEFLARE_TEST_TIMEOUT_SHORT=1m @@ -103,7 +120,10 @@ jobs: echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV set -euo pipefail - go test -timeout 30m -v ./tests/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt + pip install poetry + poetry install --with test,docs + echo "Running e2e tests..." 
+ poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 - name: Print CodeFlare operator logs if: always() && steps.deploy.outcome == 'success' diff --git a/README.md b/README.md index 8228b685b..088b2d3d5 100644 --- a/README.md +++ b/README.md @@ -51,10 +51,15 @@ To build the codeflare-sdk pre-commit image run `podman build -f .github/build/C ### Testing - To install codeflare-sdk in editable mode, run `pip install -e .` from the repo root. -- To run the unit tests, run `pytest -v tests/unit_test.py` - Any new test functions/scripts can be added into the `tests` folder - NOTE: Functional tests coming soon, will live in `tests/func_test.py` +#### Unit Testing +- To run the unit tests, run `pytest -v tests/unit_test.py` + +#### Local e2e Testing +- Please follow the [e2e documentation](https://github.com/project-codeflare/codeflare-sdk/blob/main/docs/e2e.md) + #### Code Coverage - Run tests with the following command: `coverage run -m --source=src pytest tests/unit_test.py` diff --git a/docs/e2e.md b/docs/e2e.md new file mode 100644 index 000000000..ce04c7691 --- /dev/null +++ b/docs/e2e.md @@ -0,0 +1,79 @@ +# Running e2e tests locally +#### Pre-requisites +- We recommend using Python 3.9, along with Poetry. + +## On KinD clusters +Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127.0.0.1 kind`. This will map your localhost IP address to the KinD cluster's hostname. 
This is already performed on [GitHub Actions](https://github.com/project-codeflare/codeflare-common/blob/1edd775e2d4088a5a0bfddafb06ff3a773231c08/github-actions/kind/action.yml#L70-L72) + +- Setup Phase: + - Pull the [codeflare-operator repo](https://github.com/project-codeflare/codeflare-operator) and run the following make targets: + ``` + make kind-e2e + export CLUSTER_HOSTNAME=kind + export CODEFLARE_TEST_TIMEOUT_LONG=20m + make deploy -e IMG=quay.io/project-codeflare/codeflare-operator:v1.1.0 + make setup-e2e + ``` + + - **(Optional)** - Create and add `sdk-user` with limited permissions to the cluster to run through the e2e tests: + ``` + # Get KinD certificates + docker cp kind-control-plane:/etc/kubernetes/pki/ca.crt . + docker cp kind-control-plane:/etc/kubernetes/pki/ca.key . + + # Generate certificates for new user + openssl genrsa -out user.key 2048 + openssl req -new -key user.key -out user.csr -subj '/CN=sdk-user/O=tenant' + openssl x509 -req -in user.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out user.crt -days 360 + + # Add generated certificates to KinD context + user_crt=$(base64 --wrap=0 user.crt) + user_key=$(base64 --wrap=0 user.key) + yq eval -i ".contexts += {\"context\": {\"cluster\": \"kind-kind\", \"user\": \"sdk-user\"}, \"name\": \"sdk-user\"}" $HOME/.kube/config + yq eval -i ".users += {\"name\": \"sdk-user\", \"user\": {\"client-certificate-data\": \"$user_crt\", \"client-key-data\": \"$user_key\"}}" $HOME/.kube/config + cat $HOME/.kube/config + + # Cleanup + rm ca.crt + rm ca.srl + rm ca.key + rm user.crt + rm user.key + rm user.csr + + # Add RBAC permissions to sdk-user + kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses + kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user + kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers + kubectl create clusterrolebinding sdk-user-appwrapper-creator 
--clusterrole=appwrapper-creator --user=sdk-user + kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces + kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user + kubectl create clusterrole list-rayclusters --verb=get,list --resource=rayclusters + kubectl create clusterrolebinding sdk-user-list-rayclusters --clusterrole=list-rayclusters --user=sdk-user + kubectl config use-context sdk-user + + ``` + + +- Test Phase: + - Once we have the codeflare-operator and kuberay-operator running and ready, we can run the e2e test on the codeflare-sdk repository: + ``` + poetry install --with test,docs + poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py + ``` + + + +## On OpenShift clusters +- Setup Phase: + - Pull the [codeflare-operator repo](https://github.com/project-codeflare/codeflare-operator) and run the following make targets: + ``` + make deploy -e IMG=quay.io/project-codeflare/codeflare-operator:v1.1.0 + make setup-e2e + ``` +- Test Phase: + - Once we have the codeflare-operator and kuberay-operator running and ready, we can run the e2e test on the codeflare-sdk repository: + ``` + poetry install --with test,docs + poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py + ``` diff --git a/poetry.lock b/poetry.lock index f52cd9c97..5496961b6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
[[package]] name = "aiohttp" version = "3.9.1" description = "Async http client/server framework (asyncio)" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -101,7 +100,6 @@ speedups = ["Brotli", "aiodns", "brotlicffi"] name = "aiohttp-cors" version = "0.7.0" description = "CORS support for aiohttp" -category = "main" optional = false python-versions = "*" files = [ @@ -116,7 +114,6 @@ aiohttp = ">=1.1" name = "aiosignal" version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -131,7 +128,6 @@ frozenlist = ">=1.1.0" name = "ansicon" version = "1.89.0" description = "Python wrapper for loading Jason Hood's ANSICON" -category = "main" optional = false python-versions = "*" files = [ @@ -143,7 +139,6 @@ files = [ name = "async-timeout" version = "4.0.3" description = "Timeout context manager for asyncio programs" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -155,7 +150,6 @@ files = [ name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -174,7 +168,6 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte name = "bcrypt" version = "4.0.1" description = "Modern password hashing for your software and your servers" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -209,7 +202,6 @@ typecheck = ["mypy"] name = "blessed" version = "1.20.0" description = "Easy, practical library for making terminal apps, by providing an elegant, well-documented interface to Colors, Keyboard input, and screen Positioning capabilities." 
-category = "main" optional = false python-versions = ">=2.7" files = [ @@ -226,7 +218,6 @@ wcwidth = ">=0.1.4" name = "cachetools" version = "5.3.1" description = "Extensible memoizing collections and decorators" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -238,7 +229,6 @@ files = [ name = "certifi" version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -250,7 +240,6 @@ files = [ name = "cffi" version = "1.16.0" description = "Foreign Function Interface for Python calling C code." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -315,7 +304,6 @@ pycparser = "*" name = "charset-normalizer" version = "3.3.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -415,7 +403,6 @@ files = [ name = "click" version = "8.1.7" description = "Composable command line interface toolkit" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -430,7 +417,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "codeflare-torchx" version = "0.6.0.dev1" description = "TorchX SDK and Components" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -459,7 +445,6 @@ ray = ["ray (>=1.12.1)"] name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -471,7 +456,6 @@ files = [ name = "colorful" version = "0.5.5" description = "Terminal string styling done right, in Python." 
-category = "main" optional = false python-versions = "*" files = [ @@ -486,7 +470,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "commonmark" version = "0.9.1" description = "Python parser for the CommonMark Markdown spec" -category = "main" optional = false python-versions = "*" files = [ @@ -501,7 +484,6 @@ test = ["flake8 (==3.7.8)", "hypothesis (==3.55.3)"] name = "coverage" version = "7.2.7" description = "Code coverage measurement for Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -574,7 +556,6 @@ toml = ["tomli"] name = "cryptography" version = "40.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -616,7 +597,6 @@ tox = ["tox"] name = "distlib" version = "0.3.7" description = "Distribution utilities" -category = "main" optional = false python-versions = "*" files = [ @@ -628,7 +608,6 @@ files = [ name = "docker" version = "6.1.3" description = "A Python library for the Docker Engine API." 
-category = "main" optional = false python-versions = ">=3.7" files = [ @@ -650,7 +629,6 @@ ssh = ["paramiko (>=2.4.3)"] name = "docstring-parser" version = "0.8.1" description = "\"Parse Python docstrings in reST, Google and Numpydoc format\"" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -661,7 +639,6 @@ files = [ name = "exceptiongroup" version = "1.1.3" description = "Backport of PEP 654 (exception groups)" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -676,7 +653,6 @@ test = ["pytest (>=6)"] name = "executing" version = "1.2.0" description = "Get the currently executing AST node of a frame, and other information" -category = "main" optional = false python-versions = "*" files = [ @@ -691,7 +667,6 @@ tests = ["asttokens", "littleutils", "pytest", "rich"] name = "filelock" version = "3.12.4" description = "A platform independent file lock." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -708,7 +683,6 @@ typing = ["typing-extensions (>=4.7.1)"] name = "frozenlist" version = "1.4.0" description = "A list-like structure which implements collections.abc.MutableSequence" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -779,7 +753,6 @@ files = [ name = "fsspec" version = "2023.9.2" description = "File-system specification" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -815,7 +788,6 @@ tqdm = ["tqdm"] name = "google-api-core" version = "2.15.0" description = "Google API client core library" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -838,7 +810,6 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] name = "google-auth" version = "2.23.3" description = "Google Authentication Library" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -862,7 +833,6 @@ requests = ["requests (>=2.20.0,<3.0.0.dev0)"] name = "googleapis-common-protos" version = "1.62.0" description = "Common 
protobufs used in Google APIs" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -880,7 +850,6 @@ grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] name = "gpustat" version = "1.1.1" description = "An utility to monitor NVIDIA GPU status and usage" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -900,7 +869,6 @@ test = ["mockito (>=1.2.1)", "pytest (>=5.4.1)", "pytest-runner"] name = "grpcio" version = "1.60.0" description = "HTTP/2-based RPC framework" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -967,7 +935,6 @@ protobuf = ["grpcio-tools (>=1.60.0)"] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -979,7 +946,6 @@ files = [ name = "importlib-metadata" version = "6.8.0" description = "Read metadata from Python packages" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -999,7 +965,6 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs name = "importlib-resources" version = "6.1.0" description = "Read resources from Python packages" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1018,7 +983,6 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1030,7 +994,6 @@ files = [ name = "jinxed" version = "1.2.1" description = "Jinxed Terminal Library" -category = "main" optional = false python-versions = "*" files = [ @@ -1045,7 +1008,6 @@ ansicon = {version = "*", markers = "platform_system == \"Windows\""} name = "jsonschema" version = "4.19.1" description = "An implementation of JSON Schema validation for Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1069,7 
+1031,6 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339- name = "jsonschema-specifications" version = "2023.7.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1085,7 +1046,6 @@ referencing = ">=0.28.0" name = "kubernetes" version = "26.1.0" description = "Kubernetes python client" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1103,7 +1063,7 @@ requests-oauthlib = "*" setuptools = ">=21.0.0" six = ">=1.9.0" urllib3 = ">=1.24.2" -websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.0 || >=0.43.0" +websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" [package.extras] adal = ["adal (>=1.0.2)"] @@ -1112,7 +1072,6 @@ adal = ["adal (>=1.0.2)"] name = "mako" version = "1.2.4" description = "A super-fast templating language that borrows the best ideas from the existing templating languages." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1132,7 +1091,6 @@ testing = ["pytest"] name = "markdown" version = "3.5" description = "Python implementation of John Gruber's Markdown." -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -1151,7 +1109,6 @@ testing = ["coverage", "pyyaml"] name = "markupsafe" version = "2.1.3" description = "Safely add untrusted strings to HTML/XML markup." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1175,6 +1132,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = 
"MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -1211,7 +1178,6 @@ files = [ name = "msgpack" version = "1.0.7" description = "MessagePack serializer" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1277,7 +1243,6 @@ files = [ name = "multidict" version = "6.0.4" description = "multidict implementation" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1361,7 +1326,6 @@ files = [ name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
-category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1373,7 +1337,6 @@ files = [ name = "numpy" version = "1.24.4" description = "Fundamental package for array computing in Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1411,7 +1374,6 @@ files = [ name = "nvidia-ml-py" version = "12.535.133" description = "Python Bindings for the NVIDIA Management Library" -category = "main" optional = false python-versions = "*" files = [ @@ -1423,7 +1385,6 @@ files = [ name = "oauthlib" version = "3.2.2" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1440,7 +1401,6 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] name = "opencensus" version = "0.11.3" description = "A stats collection and distributed tracing framework" -category = "main" optional = false python-versions = "*" files = [ @@ -1456,7 +1416,6 @@ opencensus-context = ">=0.1.3" name = "opencensus-context" version = "0.1.3" description = "OpenCensus Runtime Context" -category = "main" optional = false python-versions = "*" files = [ @@ -1468,7 +1427,6 @@ files = [ name = "openshift-client" version = "1.0.18" description = "OpenShift python client" -category = "main" optional = false python-versions = "*" files = [ @@ -1485,7 +1443,6 @@ six = "*" name = "packaging" version = "23.2" description = "Core utilities for Python packages" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1497,7 +1454,6 @@ files = [ name = "pandas" version = "2.0.3" description = "Powerful data structures for data analysis, time series, and statistics" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1531,8 +1487,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, 
{version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -1565,7 +1521,6 @@ xml = ["lxml (>=4.6.3)"] name = "paramiko" version = "3.3.1" description = "SSH2 protocol library" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1587,7 +1542,6 @@ invoke = ["invoke (>=2.0)"] name = "pdoc3" version = "0.10.0" description = "Auto-generate API documentation for Python projects." -category = "dev" optional = false python-versions = ">= 3.6" files = [ @@ -1603,7 +1557,6 @@ markdown = ">=3.0" name = "pkgutil-resolve-name" version = "1.3.10" description = "Resolve a name to an object." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1615,7 +1568,6 @@ files = [ name = "platformdirs" version = "3.11.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1631,7 +1583,6 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-co name = "pluggy" version = "1.3.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -1647,7 +1598,6 @@ testing = ["pytest", "pytest-benchmark"] name = "prometheus-client" version = "0.19.0" description = "Python client for the Prometheus monitoring system." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1662,7 +1612,6 @@ twisted = ["twisted"] name = "protobuf" version = "4.24.4" description = "" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1685,7 +1634,6 @@ files = [ name = "psutil" version = "5.9.6" description = "Cross-platform lib for process and system monitoring in Python." 
-category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -1714,7 +1662,6 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "py-spy" version = "0.3.14" description = "Sampling profiler for Python programs" -category = "main" optional = false python-versions = "*" files = [ @@ -1731,7 +1678,6 @@ files = [ name = "pyarrow" version = "14.0.1" description = "Python library for Apache Arrow" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1780,7 +1726,6 @@ numpy = ">=1.16.6" name = "pyasn1" version = "0.5.0" description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ @@ -1792,7 +1737,6 @@ files = [ name = "pyasn1-modules" version = "0.3.0" description = "A collection of ASN.1-based protocols modules" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ @@ -1807,7 +1751,6 @@ pyasn1 = ">=0.4.6,<0.6.0" name = "pycparser" version = "2.21" description = "C parser in Python" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1819,7 +1762,6 @@ files = [ name = "pydantic" version = "1.10.13" description = "Data validation and settings management using python type hints" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1872,7 +1814,6 @@ email = ["email-validator (>=1.0.3)"] name = "pygments" version = "2.16.1" description = "Pygments is a syntax highlighting package written in Python." 
-category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1887,7 +1828,6 @@ plugins = ["importlib-metadata"] name = "pynacl" version = "1.5.0" description = "Python binding to the Networking and Cryptography (NaCl) library" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1914,7 +1854,6 @@ tests = ["hypothesis (>=3.27.0)", "pytest (>=3.2.1,!=3.3.0)"] name = "pyre-extensions" version = "0.0.30" description = "Type system extensions for use with the pyre type checker" -category = "main" optional = false python-versions = "*" files = [ @@ -1930,7 +1869,6 @@ typing-inspect = "*" name = "pytest" version = "7.4.0" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1953,7 +1891,6 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "pytest-mock" version = "3.11.1" description = "Thin-wrapper around the mock package for easier use with pytest" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1971,7 +1908,6 @@ dev = ["pre-commit", "pytest-asyncio", "tox"] name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -1986,7 +1922,6 @@ six = ">=1.5" name = "pytz" version = "2023.3.post1" description = "World timezone definitions, modern and historical" -category = "main" optional = false python-versions = "*" files = [ @@ -1998,7 +1933,6 @@ files = [ name = "pywin32" version = "306" description = "Python for Window Extensions" -category = "main" optional = false python-versions = "*" files = [ @@ -2022,7 +1956,6 @@ files = [ name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2044,6 +1977,7 @@ files = [ {file = 
"PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2082,7 +2016,6 @@ files = [ name = "ray" version = "2.7.0" description = "Ray provides a simple, universal API for building distributed applications." 
-category = "main" optional = false python-versions = "*" files = [ @@ -2123,15 +2056,15 @@ frozenlist = "*" fsspec = {version = "*", optional = true, markers = "extra == \"data\""} gpustat = {version = ">=1.0.0", optional = true, markers = "extra == \"default\""} grpcio = [ - {version = ">=1.32.0", optional = true, markers = "python_version < \"3.10\""}, - {version = ">=1.42.0", optional = true, markers = "python_version >= \"3.10\""}, + {version = ">=1.32.0", optional = true, markers = "python_version < \"3.10\" and extra == \"default\""}, + {version = ">=1.42.0", optional = true, markers = "python_version >= \"3.10\" and extra == \"default\""}, ] jsonschema = "*" msgpack = ">=1.0.0,<2.0.0" numpy = [ {version = ">=1.16", markers = "python_version < \"3.9\""}, - {version = ">=1.19.3", markers = "python_version >= \"3.9\""}, {version = ">=1.20", optional = true, markers = "extra == \"data\""}, + {version = ">=1.19.3", markers = "python_version >= \"3.9\""}, ] opencensus = {version = "*", optional = true, markers = "extra == \"default\""} packaging = "*" @@ -2164,7 +2097,6 @@ tune = ["fsspec", "pandas", "pyarrow (>=6.0.1)", "requests", "tensorboardX (>=1. name = "referencing" version = "0.30.2" description = "JSON Referencing + Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2180,7 +2112,6 @@ rpds-py = ">=0.7.0" name = "requests" version = "2.31.0" description = "Python HTTP for Humans." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2202,7 +2133,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-oauthlib" version = "1.3.1" description = "OAuthlib authentication support for Requests." 
-category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2221,7 +2151,6 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] name = "rich" version = "12.6.0" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -category = "main" optional = false python-versions = ">=3.6.3,<4.0.0" files = [ @@ -2241,7 +2170,6 @@ jupyter = ["ipywidgets (>=7.5.1,<8.0.0)"] name = "rpds-py" version = "0.10.4" description = "Python bindings to Rust's persistent data structures (rpds)" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2350,7 +2278,6 @@ files = [ name = "rsa" version = "4.9" description = "Pure-Python RSA implementation" -category = "main" optional = false python-versions = ">=3.6,<4" files = [ @@ -2365,7 +2292,6 @@ pyasn1 = ">=0.1.3" name = "setuptools" version = "68.2.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2382,7 +2308,6 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -2394,7 +2319,6 @@ files = [ name = "smart-open" version = "6.4.0" description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" -category = "main" optional = false python-versions = ">=3.6,<4.0" files = [ @@ -2416,7 +2340,6 @@ webhdfs = ["requests"] name = "tabulate" version = "0.9.0" description = "Pretty-print tabular data" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2431,7 +2354,6 @@ widechars = ["wcwidth"] name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2443,7 
+2365,6 @@ files = [ name = "typing-extensions" version = "4.8.0" description = "Backported and Experimental Type Hints for Python 3.8+" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2455,7 +2376,6 @@ files = [ name = "typing-inspect" version = "0.9.0" description = "Runtime inspection utilities for typing module." -category = "main" optional = false python-versions = "*" files = [ @@ -2471,7 +2391,6 @@ typing-extensions = ">=3.7.4" name = "tzdata" version = "2023.3" description = "Provider of IANA time zone data" -category = "main" optional = false python-versions = ">=2" files = [ @@ -2483,7 +2402,6 @@ files = [ name = "urllib3" version = "1.26.17" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -2500,7 +2418,6 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] name = "virtualenv" version = "20.21.0" description = "Virtual Python Environment builder" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2521,7 +2438,6 @@ test = ["covdefaults (>=2.2.2)", "coverage (>=7.1)", "coverage-enable-subprocess name = "wcwidth" version = "0.2.12" description = "Measures the displayed width of unicode strings in a terminal" -category = "main" optional = false python-versions = "*" files = [ @@ -2533,7 +2449,6 @@ files = [ name = "websocket-client" version = "1.6.4" description = "WebSocket client for Python with low level API options" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2550,7 +2465,6 @@ test = ["websockets"] name = "yarl" version = "1.9.4" description = "Yet another URL library" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2654,7 +2568,6 @@ multidict = ">=4.0" name = "zipp" version = "3.17.0" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "main" 
optional = false python-versions = ">=3.8" files = [ diff --git a/tests/e2e/mnist_raycluster_sdk.py b/tests/e2e/mnist_raycluster_sdk.py deleted file mode 100644 index 0ded85e4b..000000000 --- a/tests/e2e/mnist_raycluster_sdk.py +++ /dev/null @@ -1,90 +0,0 @@ -import sys -import os - -from time import sleep - -import ray - -from torchx.specs.api import AppState, is_terminal - -from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration -from codeflare_sdk.job.jobs import DDPJobDefinition - -namespace = sys.argv[1] -ray_image = os.getenv("RAY_IMAGE") -host = os.getenv("CLUSTER_HOSTNAME") - -ingress_options = {} -if host is not None: - ingress_options = { - "ingresses": [ - { - "ingressName": "ray-dashboard", - "port": 8265, - "pathType": "Prefix", - "path": "/", - "host": host, - }, - ] - } - -cluster = Cluster( - ClusterConfiguration( - name="mnist", - namespace=namespace, - num_workers=1, - head_cpus="500m", - head_memory=2, - min_cpus="500m", - max_cpus=1, - min_memory=1, - max_memory=2, - num_gpus=0, - instascale=False, - image=ray_image, - ingress_options=ingress_options, - ) -) - - -cluster.up() - -cluster.status() - -cluster.wait_ready() - -cluster.status() - -cluster.details() - -jobdef = DDPJobDefinition( - name="mnist", - script="mnist.py", - scheduler_args={"requirements": "requirements.txt"}, -) -job = jobdef.submit(cluster) - -done = False -time = 0 -timeout = 900 -while not done: - status = job.status() - if is_terminal(status.state): - break - if not done: - print(status) - if timeout and time >= timeout: - raise TimeoutError(f"job has timed out after waiting {timeout}s") - sleep(5) - time += 5 - -print(f"Job has completed: {status.state}") - -print(job.logs()) - -cluster.down() - -if not status.state == AppState.SUCCEEDED: - exit(1) -else: - exit(0) diff --git a/tests/e2e/mnist_raycluster_sdk_test.go b/tests/e2e/mnist_raycluster_sdk_test.go deleted file mode 100644 index 963a48722..000000000 --- a/tests/e2e/mnist_raycluster_sdk_test.go +++ 
/dev/null @@ -1,208 +0,0 @@ -/* -Copyright 2023. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package e2e - -import ( - "testing" - - . "github.com/onsi/gomega" - . "github.com/project-codeflare/codeflare-common/support" - mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" - - batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// Creates a Ray cluster, and trains the MNIST dataset using the CodeFlare SDK. -// Asserts successful completion of the training job. -// -// This covers the installation of the CodeFlare SDK, as well as the RBAC required -// for the SDK to successfully perform requests to the cluster, on behalf of the -// impersonated user. 
-func TestMNISTRayClusterSDK(t *testing.T) { - test := With(t) - test.T().Parallel() - - // Create a namespace - namespace := test.NewTestNamespace() - - // Test configuration - config := CreateConfigMap(test, namespace.Name, map[string][]byte{ - // SDK script - "mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"), - // pip requirements - "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), - // MNIST training script - "mnist.py": ReadFile(test, "mnist.py"), - // codeflare-sdk installation script - "install-codeflare-sdk.sh": ReadFile(test, "install-codeflare-sdk.sh"), - }) - - // Create RBAC, retrieve token for user with limited rights - policyRules := []rbacv1.PolicyRule{ - { - Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, - APIGroups: []string{mcadv1beta1.GroupName}, - Resources: []string{"appwrappers"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{rayv1.GroupVersion.Group}, - Resources: []string{"rayclusters", "rayclusters/status"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"route.openshift.io"}, - Resources: []string{"routes"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"networking.k8s.io"}, - Resources: []string{"ingresses"}, - }, - } - - sa := CreateServiceAccount(test, namespace.Name) - role := CreateRole(test, namespace.Name, policyRules) - CreateRoleBinding(test, namespace.Name, sa, role) - - job := &batchv1.Job{ - TypeMeta: metav1.TypeMeta{ - APIVersion: batchv1.SchemeGroupVersion.String(), - Kind: "Job", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "sdk", - Namespace: namespace.Name, - }, - Spec: batchv1.JobSpec{ - Completions: Ptr(int32(1)), - Parallelism: Ptr(int32(1)), - BackoffLimit: Ptr(int32(0)), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Volumes: []corev1.Volume{ - { - Name: "test", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: 
corev1.LocalObjectReference{ - Name: config.Name, - }, - }, - }, - }, - { - Name: "codeflare-sdk", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - }, - { - Name: "workdir", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - }, - }, - Containers: []corev1.Container{ - { - Name: "test", - // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed - // See https://github.com/project-codeflare/codeflare-sdk/pull/146 - Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", - Env: []corev1.EnvVar{ - {Name: "PYTHONUSERBASE", Value: "/workdir"}, - {Name: "RAY_IMAGE", Value: GetRayImage()}, - }, - Command: []string{ - "/bin/sh", "-c", - "while [ ! -f /codeflare-sdk/pyproject.toml ]; do sleep 1; done; " + - "cp /test/* . && chmod +x install-codeflare-sdk.sh && ./install-codeflare-sdk.sh && python mnist_raycluster_sdk.py " + namespace.Name, - }, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "test", - MountPath: "/test", - }, - { - Name: "codeflare-sdk", - MountPath: "/codeflare-sdk", - }, - { - Name: "workdir", - MountPath: "/workdir", - }, - }, - WorkingDir: "/workdir", - SecurityContext: &corev1.SecurityContext{ - AllowPrivilegeEscalation: Ptr(false), - SeccompProfile: &corev1.SeccompProfile{ - Type: "RuntimeDefault", - }, - Capabilities: &corev1.Capabilities{ - Drop: []corev1.Capability{"ALL"}, - }, - RunAsNonRoot: Ptr(true), - }, - }, - }, - RestartPolicy: corev1.RestartPolicyNever, - ServiceAccountName: sa.Name, - }, - }, - }, - } - if GetClusterType(test) == KindCluster { - // Take first KinD node and redirect pod hostname requests there - node := GetNodes(test)[0] - hostname := GetClusterHostname(test) - IP := GetNodeInternalIP(test, node) - - test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP) - job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{ - { - IP: IP, - Hostnames: 
[]string{hostname}, - }, - } - - // Propagate hostname into Python code as env variable - hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname} - job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar) - } - job, err := test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) - - // Setup the codeflare-sdk inside the pod associated to the created job - SetupCodeflareSDKInsidePod(test, namespace, job.Name) - - test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) - test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( - Or( - WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), - WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), - )) - - // Assert the job has completed successfully - test.Expect(GetJob(test, job.Namespace, job.Name)). - To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) -} diff --git a/tests/e2e/mnist_raycluster_sdk_test.py b/tests/e2e/mnist_raycluster_sdk_test.py new file mode 100644 index 000000000..26f76b602 --- /dev/null +++ b/tests/e2e/mnist_raycluster_sdk_test.py @@ -0,0 +1,167 @@ +from kubernetes import client, config +import kubernetes.client + +import os + +from time import sleep + +import ray + +from torchx.specs.api import AppState, is_terminal + +from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration +from codeflare_sdk.job.jobs import DDPJobDefinition + +import pytest + +from support import random_choice, get_ray_image + +# Creates a Ray cluster, and trains the MNIST dataset using the CodeFlare SDK. +# Asserts creation of AppWrapper, RayCluster, and successful completion of the training job. 
+# Covers successful installation of CodeFlare-SDK
+
+
+class TestMNISTRayClusterSDK:
+    """End-to-end MNIST training on a Ray cluster created through the SDK."""
+
+    def setup_method(self):
+        """Load the kube config and build the Kubernetes API clients."""
+        # Load the kube config from the environment or Kube config file.
+        config.load_kube_config()
+
+        # Initialize Kubernetes client
+        self.api_instance = client.CoreV1Api()
+        self.custom_api = kubernetes.client.CustomObjectsApi(
+            self.api_instance.api_client
+        )
+
+    def teardown_method(self):
+        """Delete the test namespace if this test created one."""
+        if hasattr(self, "namespace"):
+            self.api_instance.delete_namespace(self.namespace)
+
+    def test_mnist_ray_cluster_sdk(self):
+        """Create a namespace, then run the MNIST job in it."""
+        self.create_test_namespace()
+        self.run_mnist_raycluster_sdk()
+
+    def create_test_namespace(self):
+        """Create a randomly named namespace and return its name."""
+        name = f"test-ns-{random_choice()}"
+        namespace_body = client.V1Namespace(metadata=client.V1ObjectMeta(name=name))
+        self.api_instance.create_namespace(namespace_body)
+        # Record the name only once creation succeeded, so teardown_method
+        # never tries to delete a namespace that was not created.
+        self.namespace = name
+        return self.namespace
+
+    def run_mnist_raycluster_sdk(self):
+        """Bring up the 'mnist' cluster, run the DDP job, assert it succeeded."""
+        ray_image = get_ray_image()
+        host = os.getenv("CLUSTER_HOSTNAME")
+
+        ingress_options = {}
+        if host is not None:
+            ingress_options = {
+                "ingresses": [
+                    {
+                        "ingressName": "ray-dashboard",
+                        "port": 8265,
+                        "pathType": "Prefix",
+                        "path": "/",
+                        "host": host,
+                        "annotations": {
+                            "nginx.ingress.kubernetes.io/proxy-body-size": "100M",
+                        },
+                    },
+                ]
+            }
+
+        cluster = Cluster(
+            ClusterConfiguration(
+                name="mnist",
+                namespace=self.namespace,
+                num_workers=1,
+                head_cpus="500m",
+                head_memory=2,
+                min_cpus="500m",
+                max_cpus=1,
+                min_memory=1,
+                max_memory=2,
+                num_gpus=0,
+                instascale=False,
+                image=ray_image,
+                ingress_options=ingress_options,
+            )
+        )
+
+        cluster.up()
+        try:
+            self.assert_appwrapper_exists()
+            cluster.status()
+
+            cluster.wait_ready()
+            self.assert_raycluster_exists()
+            cluster.status()
+            cluster.details()
+
+            jobdef = DDPJobDefinition(
+                name="mnist",
+                script="./tests/e2e/mnist.py",
+                scheduler_args={"requirements": "./tests/e2e/mnist_pip_requirements.txt"},
+            )
+            job = jobdef.submit(cluster)
+
+            elapsed = 0
+            timeout = 900
+            status = job.status()
+            while not is_terminal(status.state):
+                print(status)
+                if elapsed >= timeout:
+                    raise TimeoutError(f"job has timed out after waiting {timeout}s")
+                sleep(5)
+                elapsed += 5
+                status = job.status()
+
+            print(status)
+            self.assert_job_completion(status)
+            print(job.logs())
+        finally:
+            # Tear the cluster down even when an assertion or the timeout fails.
+            cluster.down()
+
+    # Assertions
+    def assert_appwrapper_exists(self):
+        """Fail unless the 'mnist' AppWrapper exists in the test namespace."""
+        try:
+            self.custom_api.get_namespaced_custom_object(
+                "workload.codeflare.dev",
+                "v1beta1",
+                self.namespace,
+                "appwrappers",
+                "mnist",
+            )
+        except Exception as e:
+            message = f"AppWrapper 'mnist' has not been created. Error: {e}"
+            raise AssertionError(message) from e
+        print(
+            f"AppWrapper 'mnist' has been created in the namespace: '{self.namespace}'"
+        )
+
+    def assert_raycluster_exists(self):
+        """Fail unless the 'mnist' RayCluster exists in the test namespace."""
+        try:
+            self.custom_api.get_namespaced_custom_object(
+                "ray.io", "v1", self.namespace, "rayclusters", "mnist"
+            )
+        except Exception as e:
+            message = f"RayCluster 'mnist' has not been created. Error: {e}"
+            raise AssertionError(message) from e
+        print(
+            f"RayCluster 'mnist' created successfully in the namespace: '{self.namespace}'"
+        )
+
+    def assert_job_completion(self, status):
+        """Fail unless the job's terminal state is SUCCEEDED."""
+        print(f"Job has completed: '{status.state}'")
+        assert status.state == AppState.SUCCEEDED, f"job ended in {status.state}"
diff --git a/tests/e2e/support.py b/tests/e2e/support.py
new file mode 100644
index 000000000..303b03c8a
--- /dev/null
+++ b/tests/e2e/support.py
@@ -0,0 +1,15 @@
+import os
+import random
+import string
+
+
+def get_ray_image():
+    """Return the RAY_IMAGE environment variable, or the default Ray image."""
+    default_ray_image = "quay.io/project-codeflare/ray:latest-py39-cu118"
+    return os.getenv("RAY_IMAGE", default_ray_image)
+
+
+def random_choice():
+    """Return 5 random characters drawn from lowercase letters and digits."""
+    alphabet = string.ascii_lowercase + string.digits
+    return "".join(random.choices(alphabet, k=5))