Skip to content

Commit 3846ba6

Browse files
committed
Sync with upstream master to pull latest changes
Merge remote-tracking branch 'upstream/master' into feat/webhook-validate-trainjob-name
2 parents 45be2cc + 4cd66b4 commit 3846ba6

18 files changed

Lines changed: 537 additions & 692 deletions

File tree

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
name: Approve Workflow Runs
2+
3+
permissions:
4+
actions: write
5+
contents: read
6+
7+
on:
8+
pull_request_target:
9+
types:
10+
- labeled
11+
- synchronize
12+
13+
concurrency:
14+
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event.number }}
15+
cancel-in-progress: true
16+
17+
jobs:
18+
ok-to-test:
19+
if: contains(github.event.pull_request.labels.*.name, 'ok-to-test') || github.event_name == 'pull_request_target'
20+
runs-on: ubuntu-latest
21+
continue-on-error: true
22+
23+
steps:
24+
- name: Check if author is a Kubeflow GitHub member
25+
id: membership-check
26+
uses: actions/github-script@v7
27+
with:
28+
script: |
29+
const username = context.payload.pull_request.user.login;
30+
const org = context.repo.owner;
31+
try {
32+
const res = await github.rest.orgs.checkMembershipForUser({
33+
org,
34+
username
35+
});
36+
core.setOutput("is_member", true);
37+
} catch (error) {
38+
if (error.status === 404) {
39+
// User is not a member
40+
core.setOutput("is_member", false);
41+
} else {
42+
throw error;
43+
}
44+
}
45+
46+
- name: Approve Pending Workflow Runs
47+
if: steps.membership-check.outputs.is_member == 'true' || contains(github.event.pull_request.labels.*.name, 'ok-to-test')
48+
uses: actions/github-script@v7
49+
with:
50+
retries: 3
51+
script: |
52+
const request = {
53+
owner: context.repo.owner,
54+
repo: context.repo.repo,
55+
event: "pull_request",
56+
status: "action_required",
57+
head_sha: context.payload.pull_request.head.sha,
58+
}
59+
60+
core.info(`Getting workflow runs that need approval for commit ${request.head_sha}`)
61+
const runs = await github.paginate(github.rest.actions.listWorkflowRunsForRepo, request)
62+
63+
core.info(`Found ${runs.length} workflow runs that need approval`)
64+
for (const run of runs) {
65+
core.info(`Approving workflow run ${run.id}`)
66+
const request = {
67+
owner: context.repo.owner,
68+
repo: context.repo.repo,
69+
run_id: run.id,
70+
}
71+
await github.rest.actions.approveWorkflowRun(request)
72+
}

.github/workflows/test-e2e.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ jobs:
4141
pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
4242
4343
echo "Install Kubeflow SDK"
44-
pip install git+https://github.com/kubeflow/sdk.git@main#subdirectory=python
44+
pip install git+https://github.com/kubeflow/sdk.git@main
4545
4646
- name: Setup cluster
4747
run: |

.github/workflows/test-go.yaml

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,3 @@ jobs:
7777
with:
7878
path-to-profile: cover.out
7979
working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
80-
parallel: true
81-
82-
finish:
83-
needs: test
84-
runs-on: ubuntu-latest
85-
steps:
86-
- uses: shogo82148/actions-goveralls@v1
87-
with:
88-
parallel-finished: true

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@ __debug_bin
1616
# The default output for various artifacts (e.g. Jupyter Notebooks after Papermill execution).
1717
artifacts
1818

19-
# Python cache files
19+
# Python cache files / packaging
2020
__pycache__/
2121
*.egg-info/
22+
dist/
2223

2324
# Coverage
2425
cover.out

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,4 +135,4 @@ On ubuntu the default go package appears to be gccgo-go which has problems. It's
135135

136136
Changes to the Kubeflow Trainer Python SDK can be made in the https://github.com/kubeflow/sdk repo.
137137

138-
The Trainer SDK can be found at https://github.com/kubeflow/sdk/tree/main/python/kubeflow/trainer.
138+
The Trainer SDK can be found at https://github.com/kubeflow/sdk/tree/main/kubeflow/trainer.

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,11 +152,11 @@ endif
152152
# Instructions to run tests.
153153
.PHONY: test
154154
test: ## Run Go unit test.
155-
go test $(shell go list ./... | grep -v '/test/' | grep -v '/cmd/' | grep -v '/hack/' | grep -v '/pkg/apis' | grep -v '/pkg/client') -coverprofile cover.out
155+
go test $(shell go list ./... | grep -Ev '/(test|cmd|hack|pkg/apis|pkg/client|pkg/util/testing)') -coverprofile cover.out
156156

157157
.PHONY: test-integration
158158
test-integration: ginkgo envtest jobset-operator-crd scheduler-plugins-crd ## Run Go integration test.
159-
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(K8S_VERSION) -p path)" $(GINKGO) -coverprofile cover.out -v ./test/integration/...
159+
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(K8S_VERSION) -p path)" $(GINKGO) -v ./test/integration/...
160160

161161
.PHONY: test-python
162162
test-python: ## Run Python unit test.

README.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,26 @@
11
# Kubeflow Trainer
22

3-
[![Build Status](https://github.com/kubeflow/trainer/actions/workflows/test-go.yaml/badge.svg?branch=master)](https://github.com/kubeflow/trainer/actions/workflows/test-go.yaml?branch=master)
3+
[![Join Slack](https://img.shields.io/badge/Join_Slack-blue?logo=slack)](https://www.kubeflow.org/docs/about/community/#kubeflow-slack-channels)
44
[![Coverage Status](https://coveralls.io/repos/github/kubeflow/trainer/badge.svg?branch=master)](https://coveralls.io/github/kubeflow/trainer?branch=master)
55
[![Go Report Card](https://goreportcard.com/badge/github.com/kubeflow/trainer)](https://goreportcard.com/report/github.com/kubeflow/trainer)
66
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10435/badge)](https://www.bestpractices.dev/projects/10435)
7+
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/kubeflow/trainer)
78

89
<h1 align="center">
910
<img src="./docs/images/trainer-logo.svg" alt="logo" width="200">
1011
<br>
1112
</h1>
1213

14+
Latest News 🔥
15+
16+
- [2025/07] PyTorch on Kubernetes: Kubeflow Trainer Joins the PyTorch Ecosystem. Find the
17+
announcement in [the PyTorch blog post](https://pytorch.org/blog/pytorch-on-kubernetes-kubeflow-trainer-joins-the-pytorch-ecosystem/).
18+
- [2025/07] Kubeflow Trainer v2.0 has been officially released. Check out
19+
[the blog post announcement](https://blog.kubeflow.org/trainer/intro/) and [the
20+
release notes](https://github.com/kubeflow/trainer/releases/tag/v2.0.0).
21+
- [2025/04] From High Performance Computing To AI Workloads on Kubernetes: MPI Runtime in
22+
Kubeflow TrainJob. See the [KubeCon + CloudNativeCon London talk](https://youtu.be/Fnb1a5Kaxgo).
23+
1324
## Overview
1425

1526
Kubeflow Trainer is a Kubernetes-native project designed for large language models (LLMs)
@@ -18,7 +29,7 @@ various frameworks, including PyTorch, JAX, TensorFlow, and others.
1829

1930
You can integrate other ML libraries such as [HuggingFace](https://huggingface.co),
2031
[DeepSpeed](https://github.com/microsoft/DeepSpeed), or [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
21-
with Kubeflow Training to orchestrate their ML training on Kubernetes.
32+
with Kubeflow Trainer to run them on Kubernetes.
2233

2334
Kubeflow Trainer enables you to effortlessly develop your LLMs with the
2435
[Kubeflow Python SDK](https://github.com/kubeflow/sdk/), and build Kubernetes-native Training
@@ -37,7 +48,7 @@ The following KubeCon + CloudNativeCon 2024 talk provides an overview of Kubeflo
3748

3849
## Getting Started
3950

40-
Please check [the official Kubeflow documentation](https://www.kubeflow.org/docs/components/trainer/getting-started)
51+
Please check [the official Kubeflow Trainer documentation](https://www.kubeflow.org/docs/components/trainer/getting-started)
4152
to install and get started with Kubeflow Trainer.
4253

4354
## Community

api/python_api/pyproject.toml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,18 @@
1-
[build-system]
2-
requires = ["hatchling"]
3-
build-backend = "hatchling.build"
4-
51
[project]
62
name = "kubeflow_trainer_api"
73
dynamic = ["version"]
4+
requires-python = ">=3.9"
85
authors = [
96
{ name = "The Kubeflow Authors", email = "kubeflow-discuss@googlegroups.com" },
107
]
11-
license = { file = "../../LICENSE" }
8+
license = "Apache-2.0"
129
description = "Kubeflow Trainer API models for Kubernetes resources to interact with Kubeflow APIs."
1310
readme = "README.md"
1411
keywords = ["kubeflow", "trainer", "model training", "llm", "ai", "api"]
1512
classifiers = [
1613
"Intended Audience :: Developers",
1714
"Intended Audience :: Education",
1815
"Intended Audience :: Science/Research",
19-
"Programming Language :: Python :: 3.8",
2016
"Programming Language :: Python :: 3.9",
2117
"Programming Language :: Python :: 3.10",
2218
"Programming Language :: Python :: 3.11",
@@ -35,6 +31,10 @@ Homepage = "https://github.com/kubeflow/trainer"
3531
Documentation = "https://www.kubeflow.org/docs/components/trainer/"
3632
Source = "https://github.com/kubeflow/trainer"
3733

34+
[build-system]
35+
requires = ["hatchling"]
36+
build-backend = "hatchling.build"
37+
3838
[tool.hatch.build.targets.wheel]
3939
packages = ["kubeflow_trainer_api"]
4040

charts/kubeflow-trainer/values.yaml

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -92,16 +92,14 @@ manager:
9292
# memory: 300Mi
9393

9494
# -- Security context for manager containers.
95-
securityContext: {}
96-
# readOnlyRootFilesystem: true
97-
# privileged: false
98-
# allowPrivilegeEscalation: false
99-
# runAsNonRoot: true
100-
# capabilities:
101-
# drop:
102-
# - ALL
103-
# seccompProfile:
104-
# type: RuntimeDefault
95+
securityContext:
96+
allowPrivilegeEscalation: false
97+
runAsNonRoot: true
98+
capabilities:
99+
drop:
100+
- ALL
101+
seccompProfile:
102+
type: RuntimeDefault
105103

106104
webhook:
107105
# -- Specifies how unrecognized errors are handled.
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# Keep the same version as for Torch runtime.
22
torch==2.7.1
33
# DeepSpeed libraries.
4-
deepspeed==0.17.1
5-
mpi4py==4.0.3
4+
deepspeed==0.17.4
5+
mpi4py==4.1.0
66
# HuggingFace libraries.
7-
datasets==3.4.1
8-
transformers==4.50.0
7+
datasets==4.0.0
8+
transformers==4.55.0
99
SentencePiece==0.2.0

0 commit comments

Comments
 (0)