diff --git a/demo-notebooks/interactive/local_interactive.ipynb b/demo-notebooks/interactive/local_interactive.ipynb index 88a6ccd58..d70c00df7 100644 --- a/demo-notebooks/interactive/local_interactive.ipynb +++ b/demo-notebooks/interactive/local_interactive.ipynb @@ -32,20 +32,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "4364ac2e-dd10-4d30-ba66-12708daefb3f", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Written to: hfgputest-1.yaml\n" - ] - } - ], + "outputs": [], "source": [ "# Create our cluster and submit appwrapper\n", "namespace = \"default\"\n", @@ -89,7 +81,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "12eef53c", "metadata": {}, @@ -99,38 +90,21 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "cf1b749e-2335-42c2-b673-26768ec9895d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "rayclient-hfgputest-1-default.apps.tedbig412.cp.fyre.ibm.com\n" - ] - } - ], + "outputs": [], "source": [ - "import openshift as oc\n", "from codeflare_sdk.utils import generate_cert\n", "\n", "if local_interactive:\n", " generate_cert.generate_tls_cert(cluster_name, namespace)\n", - " generate_cert.export_env(cluster_name, namespace)\n", - "\n", - "with oc.project(namespace):\n", - " routes=oc.selector(\"route\").objects()\n", - " rayclient_url=\"\"\n", - " for r in routes:\n", - " if \"rayclient\" in r.name():\n", - " rayclient_url=r.model.spec.host\n", - "print(rayclient_url)" + " generate_cert.export_env(cluster_name, namespace)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7", "metadata": { "scrolled": true, @@ -141,15 +115,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:12:37,816\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n", - "2023-05-31 14:12:37,820\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n", - "2023-05-31 14:12:38,034\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n", - "2023-05-31 14:12:38,246\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n", - "2023-05-31 14:12:38,290\tDEBUG worker.py:807 -- Pinging server.\n", - "2023-05-31 14:12:40,521\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n", - "2023-05-31 14:12:40,523\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n", - "2023-05-31 14:12:40,535\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n", - "2023-05-31 14:12:41,379\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n" + "2023-06-27 19:14:16,088\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n", + "2023-06-27 19:14:16,100\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n", + "2023-06-27 19:14:16,308\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n", + "2023-06-27 19:14:16,434\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n", + "2023-06-27 19:14:16,436\tDEBUG worker.py:807 -- Pinging server.\n", + "2023-06-27 19:14:18,634\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n", + "2023-06-27 19:14:18,635\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n", + "2023-06-27 19:14:18,645\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n", + "2023-06-27 19:14:19,454\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n" ] }, { @@ -190,7 +164,7 @@ " \n", " \n", " Dashboard:\n", - " http://10.254.12.141:8265\n", + " http://10.254.20.41:8265\n", "\n", "\n", " \n", @@ -198,10 +172,10 @@ "\n" ], "text/plain": [ - "ClientContext(dashboard_url='10.254.12.141:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=)" + "ClientContext(dashboard_url='10.254.20.41:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=)" ] }, - "execution_count": 12, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -210,12 +184,12 @@ "import ray\n", "\n", "ray.shutdown()\n", - "ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")" + "ray.init(address=cluster.local_client_url(), logging_level=\"DEBUG\")" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "id": "3436eb4a-217c-4109-a3c3-309fda7e2442", "metadata": {}, "outputs": [], @@ -239,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "id": "5cca1874-2be3-4631-ae48-9adfa45e3af3", "metadata": { "scrolled": true, @@ -250,8 +224,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:13:29,868\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n", - "2023-05-31 14:13:29,870\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n" + "2023-06-27 19:14:28,222\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n", + "2023-06-27 19:14:28,222\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n" ] } ], @@ -261,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "id": "01172c29-e8bf-41ef-8db5-eccb07906111", "metadata": {}, "outputs": [ @@ -269,8 +243,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:13:32,643\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n", - "2023-05-31 14:13:34,677\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n" + "2023-06-27 19:14:29,202\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n", + "2023-06-27 19:14:31,224\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n" ] }, { @@ -279,7 +253,7 @@ "1789.4644387076714" ] }, - "execution_count": 15, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -290,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "id": "9e79b547-a457-4232-b77d-19147067b972", "metadata": {}, "outputs": [ @@ -298,10 +272,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:13:37,659\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n", + "2023-06-27 19:14:33,161\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n", "}\n", "\n", - "2023-05-31 14:13:38,681\tDEBUG dataclient.py:278 -- Shutting down data channel.\n" + "2023-06-27 19:14:34,460\tDEBUG dataclient.py:278 -- Shutting down data channel.\n" ] } ], @@ -312,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2", "metadata": {}, "outputs": [], diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 0e0e73c06..8004916f1 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -35,6 +35,7 @@ RayCluster, RayClusterStatus, ) +from kubernetes import client, config class Cluster: @@ -294,6 +295,13 @@ def torchx_config( to_return["requirements"] = requirements return to_return + def local_client_url(self): + if self.config.local_interactive == True: + ingress_domain = _get_ingress_domain() + return f"ray://rayclient-{self.config.name}-{self.config.namespace}.{ingress_domain}" + else: + return "None" + def list_all_clusters(namespace: str, print_to_console: bool = True): """ @@ -319,6 +327,13 @@ def list_all_queued(namespace: str, print_to_console: bool = True): # private methods +def _get_ingress_domain(): + config.load_kube_config() + api_client = client.CustomObjectsApi() + ingress = api_client.get_cluster_custom_object( + "config.openshift.io", "v1", "ingresses", "cluster" + ) + return ingress["spec"]["domain"] def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index fffc4fa9a..426203b4e 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -21,7 +21,7 @@ import sys import argparse import uuid -import openshift as oc +from kubernetes import client, config def read_template(template): @@ -239,12 +239,13 @@ def enable_local_interactive(resources, cluster_name, namespace): ][0].get("command")[2] command = command.replace("deployment-name", cluster_name) - - server_name = ( - oc.whoami("--show-server").split(":")[1].split("//")[1].replace("api", "apps") + config.load_kube_config() + api_client = client.CustomObjectsApi() + ingress = api_client.get_cluster_custom_object( + "config.openshift.io", "v1", "ingresses", "cluster" ) - - command = command.replace("server-name", server_name) + domain = ingress["spec"]["domain"] + command = command.replace("server-name", domain) item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][ "initContainers"