28 changes: 28 additions & 0 deletions .changeset/workers-ai-provider-reasoning-passthrough.md
@@ -0,0 +1,28 @@
---
"workers-ai-provider": minor
---

Forward `reasoning_effort` and `chat_template_kwargs` onto the `inputs` object of `binding.run(model, inputs)` instead of passing them through the options arg / REST query string, where they were silently ignored. This fixes reasoning models (GLM-4.7-flash, Kimi K2.5/K2.6, GPT-OSS, QwQ) burning the entire output token budget on chain-of-thought while producing no visible content.

Both settings-level and per-call usage are supported:

```ts
// Settings-level
const model = workersai("@cf/zai-org/glm-4.7-flash", {
reasoning_effort: "low",
chat_template_kwargs: { enable_thinking: false },
});

// Per-call (overrides settings)
await generateText({
model,
prompt,
providerOptions: {
"workers-ai": { reasoning_effort: "low" },
},
});
```

`reasoning_effort: null` is preserved as-is (explicit "disable reasoning" signal). The two fields are also typed directly on `WorkersAIChatSettings`.

Closes #501.
29 changes: 29 additions & 0 deletions packages/workers-ai-provider/README.md
@@ -112,6 +112,35 @@ for await (const chunk of result.textStream) {
}
```

## Reasoning Controls

Reasoning-capable Workers AI models (GLM-4.7-flash, Kimi K2.5/K2.6, GPT-OSS, QwQ) accept `reasoning_effort` and `chat_template_kwargs` on their inputs. Set them at model creation time as settings, or per call via `providerOptions["workers-ai"]` (per-call values win):

```ts
// Settings-level (applies to every request on this model instance)
const model = workersai("@cf/zai-org/glm-4.7-flash", {
reasoning_effort: "low", // "low" | "medium" | "high" | null
chat_template_kwargs: { enable_thinking: false },
});

await generateText({ model, prompt: "Summarize in one sentence." });
```

```ts
// Per-call (overrides any settings-level value)
const model = workersai("@cf/zai-org/glm-4.7-flash");

await generateText({
model,
prompt: "Summarize in one sentence.",
providerOptions: {
"workers-ai": { reasoning_effort: "low" },
},
});
```

`reasoning_effort: null` is meaningful — it's the explicit "disable reasoning" signal for models that support it. Both fields land on the `inputs` object of `binding.run()` (and the JSON body of the REST request), matching the shape expected by Workers AI. See the [model catalog](https://developers.cloudflare.com/workers-ai/models/) for per-model reasoning capabilities.
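
A minimal sketch (plain TypeScript, no provider imports) of why the null case needs an `!== undefined` check rather than a truthiness check:

```ts
// Why the provider checks `!== undefined` rather than truthiness:
// an explicit `null` must survive into the inputs object.
const reasoningEffort: "low" | "medium" | "high" | null | undefined = null;

const inputs = {
  max_tokens: 256,
  ...(reasoningEffort !== undefined ? { reasoning_effort: reasoningEffort } : {}),
};

// inputs is { max_tokens: 256, reasoning_effort: null }
```

A truthiness check (or a plain `??` fallback) would drop the `null` and silently re-enable reasoning.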

## Vision (Image Inputs)

Send images to vision-capable models like Kimi K2.5:
45 changes: 42 additions & 3 deletions packages/workers-ai-provider/src/workersai-chat-language-model.ts
@@ -123,12 +123,36 @@ export class WorkersAIChatLanguageModel implements LanguageModelV3 {
* accept this format at runtime.
*
* The binding path additionally normalises null content to empty strings.
*
* Reasoning controls (`reasoning_effort`, `chat_template_kwargs`) are
* forwarded here from settings. These belong on the INPUTS object, not on
* the 3rd-arg options / REST query string — see
* https://github.com/cloudflare/ai/issues/501. Per-call values from
* `providerOptions["workers-ai"]` override settings.
*
* `reasoning_effort: null` is a valid value ("disable reasoning"), so we
* check `!== undefined` rather than truthiness.
*/
private buildRunInputs(
args: ReturnType<typeof this.getArgs>["args"],
messages: ReturnType<typeof convertToWorkersAIChatMessages>["messages"],
options?: { stream?: boolean },
options?: { stream?: boolean; providerOptions?: Record<string, unknown> },
) {
// The AI SDK types this as `Record<string, JSONObject>` but we defensively
// accept anything and only treat it as a lookup if it's a plain object.
// `"key" in x` throws for primitives, so we can't skip the typeof guard.
const rawPerCall = options?.providerOptions?.["workers-ai"];
const perCall: Record<string, unknown> =
rawPerCall !== null && typeof rawPerCall === "object" && !Array.isArray(rawPerCall)
? (rawPerCall as Record<string, unknown>)
: {};
const reasoningEffort =
"reasoning_effort" in perCall ? perCall.reasoning_effort : this.settings.reasoning_effort;
const chatTemplateKwargs =
"chat_template_kwargs" in perCall
? perCall.chat_template_kwargs
: this.settings.chat_template_kwargs;

return {
max_tokens: args.max_tokens,
messages: this.config.isBinding ? normalizeMessagesForBinding(messages) : messages,
@@ -138,18 +162,28 @@ export class WorkersAIChatLanguageModel implements LanguageModelV3 {
top_p: args.top_p,
...(args.response_format ? { response_format: args.response_format } : {}),
...(options?.stream ? { stream: true } : {}),
...(reasoningEffort !== undefined ? { reasoning_effort: reasoningEffort } : {}),
...(chatTemplateKwargs !== undefined
? { chat_template_kwargs: chatTemplateKwargs }
: {}),
};
}
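
The precedence rules above can be sketched as a standalone helper (the name `resolveReasoningEffort` is invented for illustration, not part of the provider's exports):

```ts
// Hypothetical standalone version of the precedence logic in buildRunInputs:
// per-call providerOptions["workers-ai"] wins over settings, and presence is
// detected with `in` / `!== undefined` so an explicit `null` survives.
type Effort = "low" | "medium" | "high" | null;

function resolveReasoningEffort(
  settings: { reasoning_effort?: Effort },
  rawPerCall: unknown,
): Effort | undefined {
  // Only treat per-call options as a lookup when they are a plain object;
  // `"key" in x` throws for primitives, so the typeof guard is required.
  const perCall: Record<string, unknown> =
    rawPerCall !== null && typeof rawPerCall === "object" && !Array.isArray(rawPerCall)
      ? (rawPerCall as Record<string, unknown>)
      : {};
  return ("reasoning_effort" in perCall
    ? perCall.reasoning_effort
    : settings.reasoning_effort) as Effort | undefined;
}

resolveReasoningEffort({ reasoning_effort: "high" }, { reasoning_effort: "low" }); // "low" (per-call wins)
resolveReasoningEffort({ reasoning_effort: null }, undefined); // null (explicit disable preserved)
resolveReasoningEffort({}, undefined); // undefined (key omitted from inputs)
```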

/**
* Get passthrough options for binding.run() from settings.
*
* `reasoning_effort` and `chat_template_kwargs` are explicitly excluded
* here — they belong on the `inputs` object (see `buildRunInputs`), not on
* the `options` (3rd) arg of binding.run() or the REST query string.
*/
private getRunOptions() {
const {
gateway,
safePrompt: _safePrompt,
sessionAffinity,
extraHeaders,
reasoning_effort: _reasoningEffort,
chat_template_kwargs: _chatTemplateKwargs,
...passthroughOptions
} = this.settings;
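
The destructure-and-rest pattern above can be sketched in isolation (the `seed` field is an invented passthrough option for illustration):

```ts
const settings = {
  gateway: { id: "my-gateway" }, // consumed by the provider itself
  reasoning_effort: "low" as const, // belongs on inputs, not options
  chat_template_kwargs: { enable_thinking: false }, // likewise inputs-only
  seed: 42, // hypothetical passthrough option
};

// Pull out every key the provider handles elsewhere; only the rest is
// forwarded as the options arg of binding.run().
const {
  gateway: _gateway,
  reasoning_effort: _reasoningEffort,
  chat_template_kwargs: _chatTemplateKwargs,
  ...passthroughOptions
} = settings;

// passthroughOptions is { seed: 42 }
```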

@@ -173,7 +207,9 @@ const { args, warnings } = this.getArgs(options);
const { args, warnings } = this.getArgs(options);
const { messages } = convertToWorkersAIChatMessages(options.prompt);

const inputs = this.buildRunInputs(args, messages);
const inputs = this.buildRunInputs(args, messages, {
providerOptions: options.providerOptions,
});
const runOptions = this.getRunOptions();

const output = await this.config.binding.run(
@@ -223,7 +259,10 @@ const { args, warnings } = this.getArgs(options);
const { args, warnings } = this.getArgs(options);
const { messages } = convertToWorkersAIChatMessages(options.prompt);

const inputs = this.buildRunInputs(args, messages, { stream: true });
const inputs = this.buildRunInputs(args, messages, {
stream: true,
providerOptions: options.providerOptions,
});
const runOptions = this.getRunOptions();

const response = await this.config.binding.run(
23 changes: 23 additions & 0 deletions packages/workers-ai-provider/src/workersai-chat-settings.ts
@@ -16,6 +16,29 @@ export type WorkersAIChatSettings = {
*/
sessionAffinity?: string;

/**
* Controls the reasoning budget for reasoning-capable Workers AI models
* (e.g. `@cf/zai-org/glm-4.7-flash`, `@cf/moonshotai/kimi-k2.5`,
* `@cf/openai/gpt-oss-120b`).
*
* `null` is a valid value and disables reasoning for models that support it.
* Forwarded on the `inputs` object of `binding.run(model, inputs)`.
*/
reasoning_effort?: "low" | "medium" | "high" | null;

/**
* Chat-template overrides for reasoning-capable models that expose
* thinking toggles (e.g. GLM, Kimi).
*
* Forwarded on the `inputs` object of `binding.run(model, inputs)`.
*/
chat_template_kwargs?: {
/** Whether to enable reasoning. Enabled by default on reasoning models. */
enable_thinking?: boolean;
/** If false, preserves reasoning context between turns. */
clear_thinking?: boolean;
};

/**
* Passthrough settings that are provided directly to the run function.
* Use this for any provider-specific options not covered by the typed fields.