|
38 | 38 | "id": "288ec515", |
39 | 39 | "metadata": {}, |
40 | 40 | "outputs": [], |
41 | | - "source": "!pip install git+https://github.com/kubeflow/sdk.git@main" |
| 41 | + "source": [ |
| 42 | + "!pip install git+https://github.com/kubeflow/sdk.git@main" |
| 43 | + ] |
42 | 44 | }, |
43 | 45 | { |
44 | 46 | "cell_type": "markdown", |
|
73 | 75 | "source": [ |
74 | 76 | "# List all available Kubeflow Training Runtimes.\n", |
75 | 77 | "from kubeflow.trainer import *\n", |
| 78 | + "from kubeflow_trainer_api import models\n", |
| 79 | + "import os\n", |
76 | 80 | "\n", |
77 | 81 | "client = TrainerClient()\n", |
78 | 82 | "for runtime in client.list_runtimes():\n", |
|
154 | 158 | ], |
155 | 159 | "source": [ |
156 | 160 | "# Create a PersistentVolumeClaim for the TorchTune Llama 3.2 1B model.\n", |
157 | | - "client.core_api.create_namespaced_persistent_volume_claim(\n", |
158 | | - " namespace=\"default\",\n", |
159 | | - " body=client.V1PersistentVolumeClaim(\n", |
160 | | - " api_version=\"v1\",\n", |
161 | | - " kind=\"PersistentVolumeClaim\",\n", |
162 | | - " metadata=client.V1ObjectMeta(name=\"torchtune-llama3.2-1b\"),\n", |
163 | | - " spec=client.V1PersistentVolumeClaimSpec(\n", |
164 | | - " access_modes=[\"ReadWriteOnce\"],\n", |
165 | | - " resources=client.V1ResourceRequirements(\n", |
166 | | - " requests={\"storage\": \"20Gi\"}\n", |
167 | | - " ),\n", |
168 | | - " ),\n", |
169 | | - " ),\n", |
| 161 | + "client.backend.core_api.create_namespaced_persistent_volume_claim(\n", |
| 162 | + " namespace=\"default\",\n", |
| 163 | + " body=models.IoK8sApiCoreV1PersistentVolumeClaim(\n", |
| 164 | + " apiVersion=\"v1\",\n", |
| 165 | + " kind=\"PersistentVolumeClaim\",\n", |
| 166 | + " metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta(\n", |
| 167 | + " name=\"torchtune-llama3.2-1b\"\n", |
| 168 | + " ),\n", |
| 169 | + " spec=models.IoK8sApiCoreV1PersistentVolumeClaimSpec(\n", |
| 170 | + " accessModes=[\"ReadWriteOnce\"],\n", |
| 171 | + " resources=models.IoK8sApiCoreV1VolumeResourceRequirements(\n", |
| 172 | + " requests={\n", |
| 173 | + " \"storage\": models.IoK8sApimachineryPkgApiResourceQuantity(\"200Gi\")\n", |
| 174 | + " }\n", |
| 175 | + " ),\n", |
| 176 | + " ),\n", |
| 177 | + " ).to_dict(),\n", |
170 | 178 | ")" |
171 | 179 | ] |
172 | 180 | }, |
|
188 | 196 | "outputs": [], |
189 | 197 | "source": [ |
190 | 198 | "job_name = client.train(\n", |
191 | | - " runtime=Runtime(\n", |
192 | | - " name=\"torchtune-llama3.2-1b\"\n", |
193 | | - " ),\n", |
| 199 | + " runtime=client.get_runtime(name=\"torchtune-llama3.2-1b\"),\n", |
194 | 200 | " initializer=Initializer(\n", |
195 | 201 | " dataset=HuggingFaceDatasetInitializer(\n", |
196 | 202 | " storage_uri=\"hf://tatsu-lab/alpaca/data\"\n", |
197 | 203 | " ),\n", |
198 | 204 | " model=HuggingFaceModelInitializer(\n", |
199 | 205 | " storage_uri=\"hf://meta-llama/Llama-3.2-1B-Instruct\",\n", |
200 | | - " access_token=\"<YOUR_HF_TOKEN>\" # Replace with your Hugging Face token,\n", |
| 206 | + " access_token=os.environ[\"HF_TOKEN\"] # Set the HF_TOKEN environment variable to your Hugging Face token.\n", |
201 | 207 | " )\n", |
202 | 208 | " ),\n", |
203 | 209 | " trainer=BuiltinTrainer(\n", |
204 | 210 | " config=TorchTuneConfig(\n", |
205 | 211 | " dataset_preprocess_config=TorchTuneInstructDataset(\n", |
206 | | - " source=DataFormat.PARQUET,\n", |
| 212 | + " source=DataFormat.PARQUET, split=\"train[:1000]\"\n", |
207 | 213 | " ),\n", |
208 | 214 | " resources_per_node={\n", |
| 215 | + " \"memory\": \"200G\",\n", |
209 | 216 | " \"gpu\": 1,\n", |
210 | | - " }\n", |
| 217 | + " },\n", |
| 218 | + " \n", |
211 | 219 | " )\n", |
212 | 220 | " )\n", |
213 | 221 | ")" |
214 | 222 | ] |
215 | 223 | }, |
| 224 | + { |
| 225 | + "cell_type": "markdown", |
| 226 | + "id": "ee5fbe8e", |
| 227 | + "metadata": {}, |
| 228 | + "source": [ |
| 229 | + "## Wait for the job to reach the Running status" |
| 230 | + ] |
| 231 | + }, |
| 232 | + { |
| 233 | + "cell_type": "code", |
| 234 | + "execution_count": null, |
| 235 | + "id": "53eaa65a", |
| 236 | + "metadata": {}, |
| 237 | + "outputs": [], |
| 238 | + "source": [ |
| 239 | + "\n", |
| 240 | + "# Wait until the training job reports the Running status.\n", |
| 241 | + "client.wait_for_job_status(name=job_name, status={\"Running\"})\n" |
| 242 | + ] |
| 243 | + }, |
216 | 244 | { |
217 | 245 | "cell_type": "markdown", |
218 | 246 | "id": "75a82b76", |
|
247 | 275 | "source": [ |
248 | 276 | "from kubeflow.trainer.constants import constants\n", |
249 | 277 | "\n", |
250 | | - "log_dict = client.get_job_logs(job_name, follow=False, step=constants.DATASET_INITIALIZER)\n", |
251 | | - "print(log_dict[constants.DATASET_INITIALIZER])" |
| 278 | + "for line in client.get_job_logs(job_name, follow=True, step=constants.DATASET_INITIALIZER):\n", |
| 279 | + " print(line)" |
252 | 280 | ] |
253 | 281 | }, |
254 | 282 | { |
|
279 | 307 | } |
280 | 308 | ], |
281 | 309 | "source": [ |
282 | | - "log_dict = client.get_job_logs(job_name, follow=False, step=constants.MODEL_INITIALIZER)\n", |
283 | | - "print(log_dict[constants.MODEL_INITIALIZER])" |
| 310 | + "for line in client.get_job_logs(job_name, follow=True, step=constants.MODEL_INITIALIZER):\n", |
| 311 | + " print(line)" |
284 | 312 | ] |
285 | 313 | }, |
286 | 314 | { |
287 | 315 | "cell_type": "markdown", |
288 | 316 | "id": "b67775ea", |
289 | 317 | "metadata": {}, |
290 | 318 | "source": [ |
291 | | - "### Trainer Node" |
| 319 | + "### Trainer Node " |
292 | 320 | ] |
293 | 321 | }, |
294 | 322 | { |
|
392 | 420 | } |
393 | 421 | ], |
394 | 422 | "source": [ |
395 | | - "log_dict = client.get_job_logs(job_name, follow=False)\n", |
396 | | - "print(log_dict[f\"{constants.NODE}-0\"])" |
| 423 | + "for c in client.get_job(name=job_name).steps:\n", |
| 424 | + " print(f\"Step: {c.name}, Status: {c.status}, Devices: {c.device} x {c.device_count}\\n\")\n", |
| 425 | + "\n", |
| 426 | + "for line in client.get_job_logs(job_name, follow=True):\n", |
| 427 | + " print(line)" |
397 | 428 | ] |
398 | 429 | }, |
399 | 430 | { |
|
0 commit comments