diff --git a/app/schema.graphql b/app/schema.graphql index e681fb97241..3d03a569b4f 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -594,7 +594,7 @@ input CreateCodeEvaluatorInput { sourceCode: String! language: Language! description: String = null - sandboxConfigId: Int = null + sandboxConfigId: ID = null outputConfigs: [AnnotationConfigInput!] = null inputMapping: EvaluatorInputMappingInput = null } @@ -1440,6 +1440,7 @@ input EvaluatorPreviewInput @oneOf { builtInEvaluatorId: ID inlineLlmEvaluator: InlineLLMEvaluatorInput codeEvaluatorId: ID + inlineCodeEvaluator: InlineCodeEvaluatorInput } input EvaluatorPreviewItemInput { @@ -2045,6 +2046,15 @@ input GoogleGenAIHttpOptionsInput { scalar Identifier +input InlineCodeEvaluatorInput { + name: String! + language: Language! + sourceCode: String! + outputConfigs: [AnnotationConfigInput!]! + sandboxConfigId: ID = null + description: String = null +} + input InlineLLMEvaluatorInput { name: String! promptVersion: ChatPromptVersionInput! @@ -3867,7 +3877,7 @@ input UpdateCodeEvaluatorInput { sourceCode: String language: Language description: String - sandboxConfigId: Int + sandboxConfigId: ID outputConfigs: [AnnotationConfigInput!] inputMapping: EvaluatorInputMappingInput } diff --git a/app/src/components/dataset/CreateCodeDatasetEvaluatorSlideover.tsx b/app/src/components/dataset/CreateCodeDatasetEvaluatorSlideover.tsx index 4080094ab73..6e456e69f7d 100644 --- a/app/src/components/dataset/CreateCodeDatasetEvaluatorSlideover.tsx +++ b/app/src/components/dataset/CreateCodeDatasetEvaluatorSlideover.tsx @@ -1,4 +1,4 @@ -import { Suspense, useMemo, useState } from "react"; +import { Suspense, useCallback, useEffect, useRef, useState } from "react"; import type { ModalOverlayProps } from "react-aria-components"; import { graphql, useLazyLoadQuery, useMutation } from "react-relay"; import invariant from "tiny-invariant"; @@ -9,11 +9,11 @@ import { Modal, ModalOverlay } from "@phoenix/components/core/overlay/Modal"; import type { CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation } from "@phoenix/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql"; import type { CreateCodeDatasetEvaluatorSlideover_createDatasetCodeEvaluatorMutation } from "@phoenix/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createDatasetCodeEvaluatorMutation.graphql"; import type { CreateCodeDatasetEvaluatorSlideoverQuery } from "@phoenix/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql"; +import { mapSandboxConfigOptions } from "@phoenix/components/evaluators/CodeEvaluatorLanguageSandboxFields"; import { DEFAULT_CODE_EVALUATOR_SOURCE } from "@phoenix/components/evaluators/codeEvaluatorUtils"; import { - EditCodeEvaluatorDialogContent, - mapSandboxConfigOptions, createDefaultContinuousOutputConfig, + EditCodeEvaluatorDialogContent, } from "@phoenix/components/evaluators/EditCodeEvaluatorDialogContent"; import { buildOutputConfigsInput } from "@phoenix/components/evaluators/utils"; import { EvaluatorStoreProvider } from "@phoenix/contexts/EvaluatorContext"; @@ -29,20 +29,49 @@ export const CreateCodeDatasetEvaluatorSlideover = ({ datasetId, updateConnectionIds, onEvaluatorCreated, + onOpenChange, + isOpen, ...props }: { datasetId: string; updateConnectionIds?: string[]; onEvaluatorCreated?: (datasetEvaluatorId: string) => void; } & ModalOverlayProps) => { + const isDirtyRef = useRef(false); + + // Reset dirty state when slideover opens + useEffect(() 
=> { + if (isOpen) { + isDirtyRef.current = false; + } + }, [isOpen]); + + const handleOpenChange = useCallback( + (nextIsOpen: boolean) => { + if (!nextIsOpen && isDirtyRef.current) { + const confirmed = window.confirm( + "You have unsaved changes. Are you sure you want to close?" + ); + if (!confirmed) return; + } + onOpenChange?.(nextIsOpen); + }, + [onOpenChange] + ); + + const handleDirtyChange = useCallback((isDirty: boolean) => { + isDirtyRef.current = isDirty; + }, []); + return ( - + {({ close }) => ( }> void; + onDirtyChange?: (isDirty: boolean) => void; datasetId: string; updateConnectionIds?: string[]; onEvaluatorCreated?: (datasetEvaluatorId: string) => void; @@ -74,19 +105,24 @@ const CreateCodeEvaluatorDialog = ({ sandboxProviders { backendType language + enabled configs { id name description } } + sandboxBackends { + backendType + status + } } `, {} ); - const sandboxConfigs = useMemo( - () => mapSandboxConfigOptions(data.sandboxProviders), - [data.sandboxProviders] + const sandboxConfigs = mapSandboxConfigOptions( + data.sandboxProviders, + data.sandboxBackends ); const [createCodeEvaluator, isCreatingCodeEvaluator] = useMutation(graphql` @@ -119,40 +155,36 @@ const CreateCodeEvaluatorDialog = ({ } } `); - const initialState = useMemo( - () => - ({ - evaluator: { - globalName: "", - name: "", - description: "", - inputMapping: { - literalMapping: {}, - pathMapping: {}, - }, - kind: "CODE", - isBuiltin: false, - includeExplanation: false, - }, - outputConfigs: [createDefaultContinuousOutputConfig("")], - dataset: { - readonly: true, - id: datasetId, - selectedExampleId: null, - selectedSplitIds: [], - }, - evaluatorMappingSource: EVALUATOR_MAPPING_SOURCE_DEFAULT, - showPromptPreview: false, - }) satisfies EvaluatorStoreProps, - [datasetId] - ); + const initialState: EvaluatorStoreProps = { + evaluator: { + globalName: "", + name: "", + description: "", + inputMapping: { + literalMapping: {}, + pathMapping: {}, + }, + kind: "CODE", + isBuiltin: false, + includeExplanation: false, + }, + outputConfigs: [createDefaultContinuousOutputConfig("")], + dataset: { + readonly: true, + id: datasetId, + selectedExampleId: null, + selectedSplitIds: [], + }, + evaluatorMappingSource: EVALUATOR_MAPPING_SOURCE_DEFAULT, + showPromptPreview: false, + }; const onSubmit = ( store: EvaluatorStoreInstance, payload: { language: "PYTHON" | "TYPESCRIPT"; sourceCode: string; - sandboxConfigId: number | null; + sandboxConfigId?: string | null; } ) => { setError(undefined); @@ -198,6 +230,7 @@ const CreateCodeEvaluatorDialog = ({ title: "Evaluator created", message: "The code evaluator has been added to the dataset.", }); + onDirtyChange?.(false); onClose(); }, onError: (mutationError) => { @@ -223,6 +256,8 @@ const CreateCodeEvaluatorDialog = ({ {({ store }) => ( onSubmit(store, payload)} + onCancel={onClose} + onDirtyChange={onDirtyChange} isSubmitting={ isCreatingCodeEvaluator || isCreatingDatasetCodeEvaluator } diff --git a/app/src/components/dataset/EditCodeDatasetEvaluatorSlideover.tsx b/app/src/components/dataset/EditCodeDatasetEvaluatorSlideover.tsx index 0f1da5fb656..ac2b299ce31 100644 --- a/app/src/components/dataset/EditCodeDatasetEvaluatorSlideover.tsx +++ b/app/src/components/dataset/EditCodeDatasetEvaluatorSlideover.tsx @@ -1,4 +1,4 @@ -import { Suspense, useMemo, useState } from "react"; +import { Suspense, useCallback, useEffect, useRef, useState } from "react"; import type { ModalOverlayProps } from "react-aria-components"; import { graphql, useLazyLoadQuery, useMutation } from 
"react-relay"; import invariant from "tiny-invariant"; @@ -13,10 +13,10 @@ import { import type { EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery } from "@phoenix/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql"; import type { EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation } from "@phoenix/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql"; import type { EditCodeDatasetEvaluatorSlideover_updateDatasetCodeEvaluatorMutation } from "@phoenix/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateDatasetCodeEvaluatorMutation.graphql"; +import { mapSandboxConfigOptions } from "@phoenix/components/evaluators/CodeEvaluatorLanguageSandboxFields"; import { - EditCodeEvaluatorDialogContent, - mapSandboxConfigOptions, createDefaultContinuousOutputConfig, + EditCodeEvaluatorDialogContent, } from "@phoenix/components/evaluators/EditCodeEvaluatorDialogContent"; import { buildOutputConfigsInput } from "@phoenix/components/evaluators/utils"; import { EvaluatorStoreProvider } from "@phoenix/contexts/EvaluatorContext"; @@ -45,10 +45,38 @@ export function EditCodeDatasetEvaluatorSlideover({ datasetId, updateConnectionIds, onUpdate, + onOpenChange, + isOpen, ...props }: EditCodeDatasetEvaluatorSlideoverProps) { + const isDirtyRef = useRef(false); + + // Reset dirty state when slideover opens + useEffect(() => { + if (isOpen) { + isDirtyRef.current = false; + } + }, [isOpen]); + + const handleOpenChange = useCallback( + (nextIsOpen: boolean) => { + if (!nextIsOpen && isDirtyRef.current) { + const confirmed = window.confirm( + "You have unsaved changes. Are you sure you want to close?" + ); + if (!confirmed) return; + } + onOpenChange?.(nextIsOpen); + }, + [onOpenChange] + ); + + const handleDirtyChange = useCallback((isDirty: boolean) => { + isDirtyRef.current = isDirty; + }, []); + return ( - + {({ close }) => ( @@ -63,6 +91,7 @@ export function EditCodeDatasetEvaluatorSlideover({ void; + onDirtyChange?: (isDirty: boolean) => void; datasetId: string; updateConnectionIds?: string[]; onUpdate?: () => void; @@ -160,18 +191,23 @@ function EditCodeDatasetEvaluatorSlideoverContent({ sandboxProviders { backendType language + enabled configs { id name description } } + sandboxBackends { + backendType + status + } } `, { datasetEvaluatorId, datasetId }, { fetchPolicy: "network-only" } ); - const { dataset, sandboxProviders } = data; + const { dataset, sandboxProviders, sandboxBackends } = data; invariant(dataset, "dataset is required"); const datasetEvaluator = dataset.datasetEvaluator; invariant(datasetEvaluator, "dataset evaluator is required"); @@ -181,17 +217,12 @@ function EditCodeDatasetEvaluatorSlideoverContent({ invariant(evaluator.sourceCode, "code evaluator source code is required"); const evaluatorLanguage = evaluator.language; const evaluatorSourceCode = evaluator.sourceCode; - const sandboxConfigs = useMemo( - () => mapSandboxConfigOptions(sandboxProviders), - [sandboxProviders] + const sandboxConfigs = mapSandboxConfigOptions( + sandboxProviders, + sandboxBackends ); const sandboxConfigGlobalId = evaluator.sandboxConfig?.id; - const initialSandboxConfigId = sandboxConfigGlobalId - ? (sandboxConfigs.find( - (config) => - String(config.id) === atob(sandboxConfigGlobalId).split(":", 2)[1] - )?.id ?? null) - : null; + const initialSandboxConfigId = sandboxConfigGlobalId ?? 
null; const [updateCodeEvaluator, isUpdatingCodeEvaluator] = useMutation(graphql` @@ -226,70 +257,47 @@ function EditCodeDatasetEvaluatorSlideoverContent({ } `); - const loadedOutputConfigs = useMemo( - () => - (datasetEvaluator.outputConfigs?.length - ? datasetEvaluator.outputConfigs - : evaluator.outputConfigs?.length - ? evaluator.outputConfigs - : [ - createDefaultContinuousOutputConfig(datasetEvaluator.name ?? ""), - ]) as Mutable< - | ContinuousEvaluatorAnnotationConfig - | ClassificationEvaluatorAnnotationConfig - >[], - [ - datasetEvaluator.name, - datasetEvaluator.outputConfigs, - evaluator.outputConfigs, - ] - ); - const initialState = useMemo( - () => - ({ - evaluator: { - id: evaluator.id, - globalName: evaluator.name ?? datasetEvaluator.name ?? "", - name: datasetEvaluator.name ?? evaluator.name ?? "", - description: - datasetEvaluator.description ?? evaluator.description ?? "", - inputMapping: datasetEvaluator.inputMapping, - kind: "CODE", - isBuiltin: false, - includeExplanation: false, - }, - datasetEvaluator: { - id: datasetEvaluatorId, - }, - outputConfigs: loadedOutputConfigs, - dataset: { - readonly: true, - id: datasetId, - selectedExampleId: null, - selectedSplitIds: [], - }, - evaluatorMappingSource: EVALUATOR_MAPPING_SOURCE_DEFAULT, - showPromptPreview: false, - }) satisfies EvaluatorStoreProps, - [ - datasetEvaluator.description, - datasetEvaluator.inputMapping, - datasetEvaluator.name, - datasetEvaluatorId, - datasetId, - evaluator.description, - evaluator.id, - evaluator.name, - loadedOutputConfigs, - ] - ); + const loadedOutputConfigs = ( + datasetEvaluator.outputConfigs?.length + ? datasetEvaluator.outputConfigs + : evaluator.outputConfigs?.length + ? evaluator.outputConfigs + : [createDefaultContinuousOutputConfig(datasetEvaluator.name ?? "")] + ) as Mutable< + | ContinuousEvaluatorAnnotationConfig + | ClassificationEvaluatorAnnotationConfig + >[]; + const initialState: EvaluatorStoreProps = { + evaluator: { + id: evaluator.id, + globalName: evaluator.name ?? datasetEvaluator.name ?? "", + name: datasetEvaluator.name ?? evaluator.name ?? "", + description: datasetEvaluator.description ?? evaluator.description ?? 
"", + inputMapping: datasetEvaluator.inputMapping, + kind: "CODE", + isBuiltin: false, + includeExplanation: false, + }, + datasetEvaluator: { + id: datasetEvaluatorId, + }, + outputConfigs: loadedOutputConfigs, + dataset: { + readonly: true, + id: datasetId, + selectedExampleId: null, + selectedSplitIds: [], + }, + evaluatorMappingSource: EVALUATOR_MAPPING_SOURCE_DEFAULT, + showPromptPreview: false, + }; const onSubmit = ( store: EvaluatorStoreInstance, payload: { language: "PYTHON" | "TYPESCRIPT"; sourceCode: string; - sandboxConfigId: number | null; + sandboxConfigId?: string | null; } ) => { setError(undefined); @@ -328,6 +336,7 @@ function EditCodeDatasetEvaluatorSlideoverContent({ }, onCompleted: () => { notifySuccess({ title: "Evaluator updated" }); + onDirtyChange?.(false); onClose(); onUpdate?.(); }, @@ -354,6 +363,8 @@ function EditCodeDatasetEvaluatorSlideoverContent({ {({ store }) => ( onSubmit(store, payload)} + onCancel={onClose} + onDirtyChange={onDirtyChange} isSubmitting={ isUpdatingCodeEvaluator || isUpdatingDatasetCodeEvaluator } diff --git a/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql.ts b/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql.ts index 56fd92ccf44..5cad19a7c55 100644 --- a/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql.ts +++ b/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<<06e620c176178145fc5a3839b02c23ca>> + * @generated SignedSource<<5e569775dd6d69ca1c8f074190c66e94>> * @lightSyntaxTransform * @nogrep */ @@ -10,8 +10,13 @@ import { ConcreteRequest } from 'relay-runtime'; export type Language = "PYTHON" | "TYPESCRIPT"; +export type SandboxBackendStatus = "AVAILABLE" | "NOT_INSTALLED" | "UNAVAILABLE"; export type CreateCodeDatasetEvaluatorSlideoverQuery$variables = Record; export type CreateCodeDatasetEvaluatorSlideoverQuery$data = { + readonly sandboxBackends: ReadonlyArray<{ + readonly backendType: string; + readonly status: SandboxBackendStatus; + }>; readonly sandboxProviders: ReadonlyArray<{ readonly backendType: string; readonly configs: ReadonlyArray<{ @@ -19,6 +24,7 @@ export type CreateCodeDatasetEvaluatorSlideoverQuery$data = { readonly id: string; readonly name: string; }>; + readonly enabled: boolean; readonly language: Language; }>; }; @@ -46,10 +52,17 @@ v2 = { "alias": null, "args": null, "kind": "ScalarField", - "name": "id", + "name": "enabled", "storageKey": null }, v3 = { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "id", + "storageKey": null +}, +v4 = { "alias": null, "args": null, "concreteType": "SandboxConfig", @@ -57,7 +70,7 @@ v3 = { "name": "configs", "plural": true, "selections": [ - (v2/*: any*/), + (v3/*: any*/), { "alias": null, "args": null, @@ -74,6 +87,25 @@ v3 = { } ], "storageKey": null +}, +v5 = { + "alias": null, + "args": null, + "concreteType": "SandboxBackendInfo", + "kind": "LinkedField", + "name": "sandboxBackends", + "plural": true, + "selections": [ + (v0/*: any*/), + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "status", + "storageKey": null + } + ], + "storageKey": null }; return { "fragment": { @@ -92,10 +124,12 @@ return { "selections": [ (v0/*: any*/), (v1/*: any*/), - (v3/*: any*/) + (v2/*: any*/), + (v4/*: any*/) ], "storageKey": null - } + }, + (v5/*: any*/) ], "type": "Query", "abstractKey": null @@ -116,24 +150,26 @@ return { 
"selections": [ (v0/*: any*/), (v1/*: any*/), - (v3/*: any*/), - (v2/*: any*/) + (v2/*: any*/), + (v4/*: any*/), + (v3/*: any*/) ], "storageKey": null - } + }, + (v5/*: any*/) ] }, "params": { - "cacheID": "a0d387f77418178c3e8ffb53ac0a8f53", + "cacheID": "a0ca8e56dc9f78382ace98a30550e113", "id": null, "metadata": {}, "name": "CreateCodeDatasetEvaluatorSlideoverQuery", "operationKind": "query", - "text": "query CreateCodeDatasetEvaluatorSlideoverQuery {\n sandboxProviders {\n backendType\n language\n configs {\n id\n name\n description\n }\n id\n }\n}\n" + "text": "query CreateCodeDatasetEvaluatorSlideoverQuery {\n sandboxProviders {\n backendType\n language\n enabled\n configs {\n id\n name\n description\n }\n id\n }\n sandboxBackends {\n backendType\n status\n }\n}\n" } }; })(); -(node as any).hash = "c0a20fe4690192a61297ef66e60b100a"; +(node as any).hash = "6dc158e2f8dcebc2e068c3d0edcb4060"; export default node; diff --git a/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql.ts b/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql.ts index 34330188e5a..69b47d9ac1d 100644 --- a/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql.ts +++ b/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<<9c2078a5db23ed3519210588c895fdee>> + * @generated SignedSource<> * @lightSyntaxTransform * @nogrep */ @@ -17,7 +17,7 @@ export type CreateCodeEvaluatorInput = { language: Language; name: string; outputConfigs?: ReadonlyArray | null; - sandboxConfigId?: number | null; + sandboxConfigId?: string | null; sourceCode: string; }; export type AnnotationConfigInput = { diff --git a/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql.ts b/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql.ts index f3b02af9449..b48439aa0fb 100644 --- a/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql.ts +++ b/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<> + * @generated SignedSource<<1d0a16e5296b8abba920428753786658>> * @lightSyntaxTransform * @nogrep */ @@ -12,6 +12,7 @@ import { ConcreteRequest } from 'relay-runtime'; export type EvaluatorKind = "BUILTIN" | "CODE" | "LLM"; export type Language = "PYTHON" | "TYPESCRIPT"; export type OptimizationDirection = "MAXIMIZE" | "MINIMIZE" | "NONE"; +export type SandboxBackendStatus = "AVAILABLE" | "NOT_INSTALLED" | "UNAVAILABLE"; export type EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery$variables = { datasetEvaluatorId: string; datasetId: string; @@ -60,6 +61,10 @@ export type EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery$data = { }; readonly id: string; }; + readonly sandboxBackends: ReadonlyArray<{ + readonly backendType: string; + readonly status: SandboxBackendStatus; + }>; readonly sandboxProviders: ReadonlyArray<{ readonly backendType: string; readonly configs: ReadonlyArray<{ @@ -67,6 +72,7 @@ export type EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery$data = { readonly id: string; readonly name: string; }>; + readonly enabled: boolean; readonly language: Language; }>; }; @@ -267,6 +273,13 @@ 
v16 = { "storageKey": null }, v17 = { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "enabled", + "storageKey": null +}, +v18 = { "alias": null, "args": null, "concreteType": "SandboxConfig", @@ -280,14 +293,33 @@ v17 = { ], "storageKey": null }, -v18 = { +v19 = { + "alias": null, + "args": null, + "concreteType": "SandboxBackendInfo", + "kind": "LinkedField", + "name": "sandboxBackends", + "plural": true, + "selections": [ + (v16/*: any*/), + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "status", + "storageKey": null + } + ], + "storageKey": null +}, +v20 = { "alias": null, "args": null, "kind": "ScalarField", "name": "__typename", "storageKey": null }, -v19 = { +v21 = { "alias": null, "args": null, "concreteType": null, @@ -295,7 +327,7 @@ v19 = { "name": "outputConfigs", "plural": true, "selections": [ - (v18/*: any*/), + (v20/*: any*/), (v8/*: any*/), (v9/*: any*/), { @@ -385,10 +417,12 @@ return { "selections": [ (v16/*: any*/), (v13/*: any*/), - (v17/*: any*/) + (v17/*: any*/), + (v18/*: any*/) ], "storageKey": null - } + }, + (v19/*: any*/) ], "type": "Query", "abstractKey": null @@ -407,7 +441,7 @@ return { "name": "node", "plural": false, "selections": [ - (v18/*: any*/), + (v20/*: any*/), (v2/*: any*/), { "kind": "InlineFragment", @@ -424,7 +458,7 @@ return { (v4/*: any*/), (v5/*: any*/), (v6/*: any*/), - (v19/*: any*/), + (v21/*: any*/), { "alias": null, "args": null, @@ -433,7 +467,7 @@ return { "name": "evaluator", "plural": false, "selections": [ - (v18/*: any*/), + (v20/*: any*/), (v2/*: any*/), (v11/*: any*/), { @@ -444,7 +478,7 @@ return { (v12/*: any*/), (v13/*: any*/), (v15/*: any*/), - (v19/*: any*/) + (v21/*: any*/) ], "type": "CodeEvaluator", "abstractKey": null @@ -473,23 +507,25 @@ return { (v16/*: any*/), (v13/*: any*/), (v17/*: any*/), + (v18/*: any*/), (v2/*: any*/) ], "storageKey": null - } + }, + (v19/*: any*/) ] }, "params": { - "cacheID": "2c53f4a8dc7f28e90872e9dc40018e4e", + "cacheID": "82065e93bd959f33205eb3f97e84170e", "id": null, "metadata": {}, "name": "EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery", "operationKind": "query", - "text": "query EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery(\n $datasetEvaluatorId: ID!\n $datasetId: ID!\n) {\n dataset: node(id: $datasetId) {\n __typename\n id\n ... on Dataset {\n datasetEvaluator(datasetEvaluatorId: $datasetEvaluatorId) {\n id\n name\n description\n inputMapping {\n literalMapping\n pathMapping\n }\n outputConfigs {\n __typename\n ... on CategoricalAnnotationConfig {\n name\n optimizationDirection\n values {\n label\n score\n }\n }\n ... on ContinuousAnnotationConfig {\n name\n optimizationDirection\n lowerBound\n upperBound\n }\n ... on Node {\n __isNode: __typename\n id\n }\n }\n evaluator {\n __typename\n id\n kind\n ... on CodeEvaluator {\n name\n description\n sourceCode\n language\n sandboxConfig {\n id\n }\n outputConfigs {\n __typename\n ... on CategoricalAnnotationConfig {\n name\n optimizationDirection\n values {\n label\n score\n }\n }\n ... on ContinuousAnnotationConfig {\n name\n optimizationDirection\n lowerBound\n upperBound\n }\n ... on Node {\n __isNode: __typename\n id\n }\n }\n }\n }\n }\n }\n }\n sandboxProviders {\n backendType\n language\n configs {\n id\n name\n description\n }\n id\n }\n}\n" + "text": "query EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery(\n $datasetEvaluatorId: ID!\n $datasetId: ID!\n) {\n dataset: node(id: $datasetId) {\n __typename\n id\n ... 
on Dataset {\n datasetEvaluator(datasetEvaluatorId: $datasetEvaluatorId) {\n id\n name\n description\n inputMapping {\n literalMapping\n pathMapping\n }\n outputConfigs {\n __typename\n ... on CategoricalAnnotationConfig {\n name\n optimizationDirection\n values {\n label\n score\n }\n }\n ... on ContinuousAnnotationConfig {\n name\n optimizationDirection\n lowerBound\n upperBound\n }\n ... on Node {\n __isNode: __typename\n id\n }\n }\n evaluator {\n __typename\n id\n kind\n ... on CodeEvaluator {\n name\n description\n sourceCode\n language\n sandboxConfig {\n id\n }\n outputConfigs {\n __typename\n ... on CategoricalAnnotationConfig {\n name\n optimizationDirection\n values {\n label\n score\n }\n }\n ... on ContinuousAnnotationConfig {\n name\n optimizationDirection\n lowerBound\n upperBound\n }\n ... on Node {\n __isNode: __typename\n id\n }\n }\n }\n }\n }\n }\n }\n sandboxProviders {\n backendType\n language\n enabled\n configs {\n id\n name\n description\n }\n id\n }\n sandboxBackends {\n backendType\n status\n }\n}\n" } }; })(); -(node as any).hash = "de2b53ad3e5ff75b83df0c917ba3be10"; +(node as any).hash = "210eabeeb373d8a2c919d19fc5042c34"; export default node; diff --git a/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql.ts b/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql.ts index be5da31f314..a4257e369cc 100644 --- a/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql.ts +++ b/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<<02dd3e4c4452e113e602dc70df251404>> + * @generated SignedSource<> * @lightSyntaxTransform * @nogrep */ @@ -18,7 +18,7 @@ export type UpdateCodeEvaluatorInput = { language?: Language | null; name?: string | null; outputConfigs?: ReadonlyArray | null; - sandboxConfigId?: number | null; + sandboxConfigId?: string | null; sourceCode?: string | null; }; export type AnnotationConfigInput = { diff --git a/app/src/components/evaluators/CodeEvaluatorLanguageSandboxFields.tsx b/app/src/components/evaluators/CodeEvaluatorLanguageSandboxFields.tsx new file mode 100644 index 00000000000..93c606878e0 --- /dev/null +++ b/app/src/components/evaluators/CodeEvaluatorLanguageSandboxFields.tsx @@ -0,0 +1,235 @@ +import { useMemo } from "react"; + +import { + Button, + ComboBox, + ComboBoxItem, + Flex, + Label, + ListBox, + Popover, + Select, + SelectChevronUpDownIcon, + SelectItem, + SelectValue, + Text, + View, +} from "@phoenix/components"; +import type { CodeEvaluatorLanguage } from "@phoenix/types"; + +export type SandboxConfigOption = { + id: string; + name: string; + description?: string | null; + providerLabel: string; + providerLanguage: CodeEvaluatorLanguage; +}; + +export type CodeEvaluatorLanguageFieldProps = { + /** Current language selection */ + language: CodeEvaluatorLanguage; + /** Callback when language changes */ + onChange: (language: CodeEvaluatorLanguage) => void; +}; + +/** + * Language selector for code evaluators (Python or TypeScript) + */ +export const CodeEvaluatorLanguageField = ({ + language, + onChange, +}: CodeEvaluatorLanguageFieldProps) => { + return ( + + ); +}; + +export type CodeEvaluatorSandboxFieldProps = { + /** All available sandbox configs (will be filtered by language) */ + sandboxConfigs: SandboxConfigOption[]; + /** Current language to 
filter configs by */ + language: CodeEvaluatorLanguage; + /** Currently selected sandbox config Relay ID */ + selectedSandboxConfigId: string | null; + /** Callback when selection changes */ + onSelectionChange: (sandboxConfigId: string | null) => void; + /** Optional size variant */ + size?: "M" | "L"; + /** Whether to show the helper text below the field */ + showHelperText?: boolean; + /** Optional warning shown when a saved selection is no longer available */ + unavailableSelectionMessage?: string; +}; + +/** + * Sandbox config selector for code evaluators. + * Automatically filters configs by the selected language. + */ +export const CodeEvaluatorSandboxField = ({ + sandboxConfigs, + language, + selectedSandboxConfigId, + onSelectionChange, + size = "L", + showHelperText = false, + unavailableSelectionMessage, +}: CodeEvaluatorSandboxFieldProps) => { + // Filter configs to only show those matching the current language + const compatibleConfigs = useMemo( + () => + sandboxConfigs.filter((config) => config.providerLanguage === language), + [sandboxConfigs, language] + ); + + // Check if the selected config is still valid for the current language + const validSelectedId = compatibleConfigs.some( + (config) => config.id === selectedSandboxConfigId + ) + ? selectedSandboxConfigId + : null; + + if (sandboxConfigs.length === 0) { + // No sandbox providers enabled at all + return ( + + + + No sandbox providers enabled. Configure in Settings. + + + ); + } + + return ( + + 0 ? "Select..." : "None available" + } + selectedKey={validSelectedId != null ? String(validSelectedId) : null} + onSelectionChange={(key) => { + if (typeof key === "string") { + onSelectionChange(key); + } else { + onSelectionChange(null); + } + }} + defaultItems={compatibleConfigs} + menuTrigger="focus" + isDisabled={compatibleConfigs.length === 0} + renderEmptyState={() => ( + + + No configs for {language === "PYTHON" ? "Python" : "TypeScript"} + + + )} + > + {(item) => ( + + + {item.name} + {item.description ? ( + + {item.description} + + ) : ( + + {item.providerLabel} + + )} + + + )} + + {showHelperText && ( + + Code evaluators run in a sandbox. Configure reusable sandbox configs + in Settings if none are available here. + + )} + {unavailableSelectionMessage ? ( + + {unavailableSelectionMessage} + + ) : null} + + ); +}; + +const BACKEND_TYPE_LABELS: Record = { + WASM: "WebAssembly", + E2B: "E2B", + DAYTONA_PYTHON: "Daytona", + VERCEL_PYTHON: "Vercel", + VERCEL_TYPESCRIPT: "Vercel", + DENO: "Deno", + MODAL: "Modal", +}; + +const backendTypeLabel = (backendType: string): string => + BACKEND_TYPE_LABELS[backendType] ?? backendType; + +/** + * Maps sandbox provider data from GraphQL to SandboxConfigOption[]. + * Only includes configs from enabled providers whose backends are available. 
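+ *
+ * A minimal usage sketch (ids and names here are hypothetical, not real data):
+ *
+ *   mapSandboxConfigOptions(
+ *     [
+ *       {
+ *         backendType: "WASM",
+ *         language: "PYTHON",
+ *         enabled: true,
+ *         configs: [{ id: "U2FuZGJveENvbmZpZzox", name: "Default", description: null }],
+ *       },
+ *     ],
+ *     [{ backendType: "WASM", status: "AVAILABLE" }]
+ *   );
+ *   // => [{ id: "U2FuZGJveENvbmZpZzox", name: "Default", description: null,
+ *   //       providerLanguage: "PYTHON", providerLabel: "WebAssembly" }]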
+ */ +export const mapSandboxConfigOptions = ( + sandboxProviders: ReadonlyArray<{ + language: CodeEvaluatorLanguage; + backendType: string; + enabled: boolean; + configs: ReadonlyArray<{ + id: string; + name: string; + description?: string | null; + }>; + }>, + sandboxBackends: ReadonlyArray<{ + backendType: string; + status: string; + }> +): SandboxConfigOption[] => { + // Build a set of available backend types + const availableBackendTypes = new Set( + sandboxBackends + .filter((backend) => backend.status === "AVAILABLE") + .map((backend) => backend.backendType) + ); + + return sandboxProviders + .filter( + (provider) => + provider.enabled && availableBackendTypes.has(provider.backendType) + ) + .flatMap((provider) => + provider.configs.map((config) => ({ + id: config.id, + name: config.name, + description: config.description, + providerLanguage: provider.language, + providerLabel: backendTypeLabel(provider.backendType), + })) + ); +}; diff --git a/app/src/components/evaluators/CodeEvaluatorSourceCodeBlock.tsx b/app/src/components/evaluators/CodeEvaluatorSourceCodeBlock.tsx new file mode 100644 index 00000000000..602fe5edec5 --- /dev/null +++ b/app/src/components/evaluators/CodeEvaluatorSourceCodeBlock.tsx @@ -0,0 +1,16 @@ +import { PythonBlock } from "@phoenix/components/code/PythonBlock"; +import { TypeScriptBlock } from "@phoenix/components/code/TypeScriptBlock"; +import type { CodeEvaluatorLanguage } from "@phoenix/types"; + +export const CodeEvaluatorSourceCodeBlock = ({ + language, + sourceCode, +}: { + language: CodeEvaluatorLanguage; + sourceCode: string; +}) => { + if (language === "PYTHON") { + return ; + } + return ; +}; diff --git a/app/src/components/evaluators/CodeEvaluatorTestSection.tsx b/app/src/components/evaluators/CodeEvaluatorTestSection.tsx new file mode 100644 index 00000000000..0adb01ec80f --- /dev/null +++ b/app/src/components/evaluators/CodeEvaluatorTestSection.tsx @@ -0,0 +1,374 @@ +import { useMemo, useState } from "react"; +import { graphql, useMutation } from "react-relay"; + +import { + Alert, + Button, + Card, + DialogTrigger, + Flex, + Heading, + Icon, + IconButton, + Icons, + Popover, + Skeleton, + Text, + View, +} from "@phoenix/components"; +import type { Annotation } from "@phoenix/components/annotation"; +import { AnnotationDetailsContent } from "@phoenix/components/annotation/AnnotationDetailsContent"; +import { getPositiveOptimization } from "@phoenix/components/annotation/optimizationUtils"; +import { JSONBlock } from "@phoenix/components/code"; +import type { CodeEvaluatorTestSectionMutation } from "@phoenix/components/evaluators/__generated__/CodeEvaluatorTestSectionMutation.graphql"; +import { buildOutputConfigsInput } from "@phoenix/components/evaluators/utils"; +import { ExperimentAnnotationButton } from "@phoenix/components/experiment/ExperimentAnnotationButton"; +import { useEvaluatorStore } from "@phoenix/contexts/EvaluatorContext"; +import type { AnnotationConfig } from "@phoenix/store/evaluatorStore"; +import type { CodeEvaluatorLanguage } from "@phoenix/types"; +import { getErrorMessagesFromRelayMutationError } from "@phoenix/utils/errorUtils"; + +type EvaluationPreviewResult = + | { kind: "success"; annotation: Annotation } + | { kind: "error"; evaluatorName: string; message: string }; + +/** + * Computes whether an annotation score represents a positive optimization result + * by matching the annotation name to the corresponding output config. 
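+ *
+ * Matching sketch (names are illustrative): with a single output config it is
+ * used directly; with several, an annotation named "my-eval.accuracy" emitted
+ * by evaluator "my-eval" matches the config named "accuracy". Categorical
+ * bounds are derived from the min/max of the value scores; continuous bounds
+ * come straight from the config. Unmatched annotations return null.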
+ */ +function computePositiveOptimization({ + annotationName, + score, + evaluatorName, + outputConfigs, +}: { + annotationName: string; + score: number | null | undefined; + evaluatorName: string; + outputConfigs: AnnotationConfig[]; +}): boolean | null { + if (outputConfigs.length === 0) { + return null; + } + + let matchedConfig: AnnotationConfig | undefined; + if (outputConfigs.length === 1) { + matchedConfig = outputConfigs[0]; + } else { + // Multi-output: annotation name is "evaluatorName.configName" + const prefix = evaluatorName + "."; + if (annotationName.startsWith(prefix)) { + const configName = annotationName.slice(prefix.length); + matchedConfig = outputConfigs.find((c) => c.name === configName); + } + } + + if (matchedConfig == null) { + return null; + } + + const optimizationDirection = + matchedConfig.optimizationDirection === "MAXIMIZE" || + matchedConfig.optimizationDirection === "MINIMIZE" + ? matchedConfig.optimizationDirection + : undefined; + + let lowerBound: number | undefined; + let upperBound: number | undefined; + + if ("values" in matchedConfig) { + // Categorical: compute bounds from values scores + const scores = matchedConfig.values + .map((v) => v.score) + .filter((s): s is number => s != null); + if (scores.length > 0) { + lowerBound = Math.min(...scores); + upperBound = Math.max(...scores); + } + } else { + // Continuous: use bounds directly + lowerBound = matchedConfig.lowerBound ?? undefined; + upperBound = matchedConfig.upperBound ?? undefined; + } + + return getPositiveOptimization({ + score, + lowerBound, + upperBound, + optimizationDirection, + }); +} + +export type CodeEvaluatorTestSectionProps = { + /** The evaluator's source code */ + sourceCode: string; + /** The language (PYTHON or TYPESCRIPT) */ + language: CodeEvaluatorLanguage; + /** The sandbox config Relay ID if selected */ + sandboxConfigId: string | null; +}; + +/** + * Test section for code evaluators - allows testing the evaluator + * against example data before saving. + */ +export const CodeEvaluatorTestSection = ({ + sourceCode, + language, + sandboxConfigId, +}: CodeEvaluatorTestSectionProps) => { + const [error, setError] = useState(null); + const [previewResults, setPreviewResults] = useState< + EvaluationPreviewResult[] + >([]); + + const outputConfigs = useEvaluatorStore((state) => state.outputConfigs); + const evaluatorName = useEvaluatorStore( + (state) => state.evaluator.name || state.evaluator.globalName || "evaluator" + ); + const evaluatorDescription = useEvaluatorStore( + (state) => state.evaluator.description + ); + const inputMapping = useEvaluatorStore( + (state) => state.evaluator.inputMapping + ); + const evaluatorMappingSource = useEvaluatorStore( + (state) => state.evaluatorMappingSource + ); + + const [testEvaluator, isLoading] = + useMutation(graphql` + mutation CodeEvaluatorTestSectionMutation($input: EvaluatorPreviewsInput!) 
{ + evaluatorPreviews(input: $input) { + results { + evaluatorName + annotation { + explanation + label + score + name + id + } + error + } + } + } + `); + + const onTestEvaluator = () => { + setError(null); + setPreviewResults([]); + + if (!sourceCode.trim()) { + setError("Source code is required"); + return; + } + + if (outputConfigs.length === 0) { + setError("At least one output configuration is required"); + return; + } + + if (sandboxConfigId == null) { + setError("Please select a sandbox configuration to test the evaluator"); + return; + } + + const gqlOutputConfigs = buildOutputConfigsInput(outputConfigs); + testEvaluator({ + variables: { + input: { + previews: [ + { + context: evaluatorMappingSource, + evaluator: { + inlineCodeEvaluator: { + name: evaluatorName, + description: evaluatorDescription || null, + language, + sourceCode, + outputConfigs: gqlOutputConfigs, + sandboxConfigId, + }, + }, + inputMapping, + }, + ], + }, + }, + onCompleted(response, errors) { + if (errors) { + const errorMessages = getErrorMessagesFromRelayMutationError(errors); + const errorMessage = + errorMessages?.join("\n") ?? + errors[0]?.message ?? + "An unknown error occurred"; + setError(errorMessage); + } else { + const results: EvaluationPreviewResult[] = + response.evaluatorPreviews.results.map((result) => { + if (result.error != null) { + return { + kind: "error" as const, + evaluatorName: result.evaluatorName, + message: result.error, + }; + } else if (result.annotation != null) { + return { + kind: "success" as const, + annotation: { + id: result.annotation.id, + name: result.annotation.name, + label: result.annotation.label, + score: result.annotation.score, + explanation: result.annotation.explanation, + }, + }; + } else { + throw new Error( + "Unknown error: no annotation or error returned" + ); + } + }); + setPreviewResults(results); + } + }, + onError(error) { + const errorMessages = getErrorMessagesFromRelayMutationError(error); + const errorMessage = + errorMessages?.join("\n") ?? + error.message ?? + "An unknown error occurred"; + setError(errorMessage); + }, + }); + }; + + const isShowingPreview = + isLoading || previewResults.length > 0 || error != null; + + return ( + + {/* Results section */} + {isShowingPreview && ( + + {isLoading && ( + + + + + + + + + )} + {previewResults.map((result, i) => ( + + {result.kind === "success" ? ( + setPreviewResults([])} + > + } /> + + } + > + + + + + + + + + + + + + ) : ( + + {result.message} + + )} + + ))} + + {error && !isLoading && previewResults.length === 0 && ( + setError(null)} + > + {error} + + )} + + )} + + {/* Test button and description */} + + + Test Evaluator + + + + + Run your evaluator against the example data to verify it works correctly + before saving. 
+ + + ); +}; + +function AnnotationPreviewJSONBlock(props: { annotation: Annotation }) { + const { name, label, score, explanation } = props.annotation; + const jsonString = useMemo(() => { + return JSON.stringify({ name, label, score, explanation }, null, 2); + }, [explanation, label, name, score]); + + return ( + + ); +} diff --git a/app/src/components/evaluators/EditCodeEvaluatorDialogContent.tsx b/app/src/components/evaluators/EditCodeEvaluatorDialogContent.tsx index aafc48d6ca9..9067fe72585 100644 --- a/app/src/components/evaluators/EditCodeEvaluatorDialogContent.tsx +++ b/app/src/components/evaluators/EditCodeEvaluatorDialogContent.tsx @@ -3,16 +3,13 @@ import { python } from "@codemirror/lang-python"; import { css } from "@emotion/react"; import { githubDark, githubLight } from "@uiw/codemirror-theme-github"; import CodeMirror from "@uiw/react-codemirror"; -import { useEffect, useMemo, useState } from "react"; +import { useEffect, useEffectEvent, useMemo, useRef, useState } from "react"; import { Group, Panel, Separator } from "react-resizable-panels"; import { Alert, Button, - ComboBox, - ComboBoxItem, Flex, - Heading, Icon, Icons, Input, @@ -27,14 +24,26 @@ import { TextField, View, } from "@phoenix/components"; -import { PythonBlock } from "@phoenix/components/code/PythonBlock"; -import { TypeScriptBlock } from "@phoenix/components/code/TypeScriptBlock"; import { DialogContent, DialogHeader, DialogTitle, DialogTitleExtra, } from "@phoenix/components/core/dialog"; +import { + Disclosure, + DisclosureGroup, + DisclosurePanel, + DisclosureTrigger, +} from "@phoenix/components/core/disclosure"; +import { createEvaluatorAutocompletion } from "@phoenix/components/evaluators/codeEvaluatorAutocomplete"; +import { + CodeEvaluatorLanguageField, + CodeEvaluatorSandboxField, + type SandboxConfigOption, +} from "@phoenix/components/evaluators/CodeEvaluatorLanguageSandboxFields"; +import { CodeEvaluatorTestSection } from "@phoenix/components/evaluators/CodeEvaluatorTestSection"; +import { generateEvaluatorTypes } from "@phoenix/components/evaluators/codeEvaluatorTypeGeneration"; import { DEFAULT_CODE_EVALUATOR_SOURCE, extractCodeEvaluatorVariables, @@ -73,16 +82,10 @@ export const createDefaultContinuousOutputConfig = ( upperBound: 1, }); -export type SandboxConfigOption = { - id: number; - name: string; - description?: string | null; - providerLabel: string; - providerLanguage: CodeEvaluatorLanguage; -}; - export const EditCodeEvaluatorDialogContent = ({ onSubmit, + onCancel, + onDirtyChange, isSubmitting, mode, error, @@ -94,31 +97,112 @@ export const EditCodeEvaluatorDialogContent = ({ onSubmit: (payload: { language: CodeEvaluatorLanguage; sourceCode: string; - sandboxConfigId: number | null; + sandboxConfigId?: string | null; }) => void; + /** + * Called when the user clicks Cancel. Parent overlays can use this to + * centralize close behavior such as unsaved-change confirmation. + */ + onCancel?: () => void; + /** + * Called whenever the dirty state changes (has unsaved changes vs. not). 
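+ * The slideover parents use this to gate closing behind a window.confirm
+ * prompt when there are unsaved edits.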
+ */ + onDirtyChange?: (isDirty: boolean) => void; isSubmitting: boolean; mode: "create" | "update"; error?: string; initialLanguage: CodeEvaluatorLanguage; initialSourceCode: string; sandboxConfigs: SandboxConfigOption[]; - initialSandboxConfigId?: number | null; + initialSandboxConfigId?: string | null; }) => { const store = useEvaluatorStoreInstance(); const [showValidationError, setShowValidationError] = useState(false); const [sourceCode, setSourceCode] = useState(initialSourceCode); const [language, setLanguage] = useState(initialLanguage); - const [sandboxConfigId, setSandboxConfigId] = useState( + const [sandboxConfigId, setSandboxConfigId] = useState( initialSandboxConfigId ?? null ); const [localValidationError, setLocalValidationError] = useState< string | undefined >(); + + // Track initial store state for dirty checking + const initialStoreStateRef = useRef<{ + name: string; + outputConfigs: string; + inputMapping: string; + } | null>(null); + + // Track last reported dirty state to avoid redundant callbacks + const lastDirtyRef = useRef(false); + + useEffect(() => { + // Capture initial store state on mount for dirty comparison + const state = store.getState(); + initialStoreStateRef.current = { + name: state.evaluator.name, + outputConfigs: JSON.stringify(state.outputConfigs), + inputMapping: JSON.stringify(state.evaluator.inputMapping), + }; + }, [store]); + + const reportDirtyState = useEffectEvent((isDirty: boolean) => { + onDirtyChange?.(isDirty); + }); + + const checkForDirtyChanges = useEffectEvent(() => { + const initial = initialStoreStateRef.current; + if (!initial) { + return; + } + + const state = store.getState(); + const codeChanged = sourceCode !== initialSourceCode; + const languageChanged = language !== initialLanguage; + const sandboxChanged = sandboxConfigId !== (initialSandboxConfigId ?? null); + const nameChanged = state.evaluator.name !== initial.name; + const outputConfigsChanged = + JSON.stringify(state.outputConfigs) !== initial.outputConfigs; + const inputMappingChanged = + JSON.stringify(state.evaluator.inputMapping) !== initial.inputMapping; + + const isDirty = + codeChanged || + languageChanged || + sandboxChanged || + nameChanged || + outputConfigsChanged || + inputMappingChanged; + + if (isDirty !== lastDirtyRef.current) { + lastDirtyRef.current = isDirty; + reportDirtyState(isDirty); + } + }); + + // Notify parent of dirty state changes from local state + useEffect(() => { + checkForDirtyChanges(); + }, [sourceCode, language, sandboxConfigId]); + + // Subscribe to store changes to notify parent of dirty state + useEffect(() => { + return store.subscribe(() => { + checkForDirtyChanges(); + }); + }, [store]); + + const handleCancel = () => { + onCancel?.(); + }; + const variables = useMemo( () => extractCodeEvaluatorVariables({ language, sourceCode }), [language, sourceCode] ); + const compatibleSandboxConfigs = useMemo( () => sandboxConfigs.filter( @@ -126,17 +210,26 @@ export const EditCodeEvaluatorDialogContent = ({ ), [language, sandboxConfigs] ); + const selectedSandboxConfigId = compatibleSandboxConfigs.some( (sandboxConfig) => sandboxConfig.id === sandboxConfigId ) ? sandboxConfigId : null; + const hasUnavailableSandboxSelection = + sandboxConfigId != null && selectedSandboxConfigId == null; + const unavailableSandboxSelectionMessage = hasUnavailableSandboxSelection + ? "The previously selected sandbox is no longer available. Save to keep the existing sandbox, or choose a new one to update it." 
+ : undefined; + const hasNoSandboxConfigs = sandboxConfigs.length === 0; const handleSubmit = async () => { const isValid = await store.getState().validateAll(); const configError = getCodeEvaluatorValidationError({ outputConfigs: store.getState().outputConfigs, sourceCode, + mode, + sandboxConfigId: selectedSandboxConfigId, }); if (!isValid || configError) { setShowValidationError(true); @@ -145,10 +238,18 @@ export const EditCodeEvaluatorDialogContent = ({ } setShowValidationError(false); setLocalValidationError(undefined); + const hasSandboxChanged = + sandboxConfigId !== (initialSandboxConfigId ?? null); + const nextSandboxConfigId = + selectedSandboxConfigId != null + ? selectedSandboxConfigId + : mode === "create" || hasSandboxChanged + ? null + : undefined; onSubmit({ language, sourceCode, - sandboxConfigId: selectedSandboxConfigId, + sandboxConfigId: nextSandboxConfigId, }); }; @@ -159,9 +260,15 @@ export const EditCodeEvaluatorDialogContent = ({ {mode === "create" ? "Create Evaluator" : "Edit Evaluator"} - + {onCancel ? ( + + ) : ( + + )} -
{ - // Prevent Escape from propagating to the modal overlay, - // which would close the slideover and discard edits. - if (e.key === "Escape") { - e.stopPropagation(); - } - }} - > - + + {/* Code editor and type footer with resizable panels */} +
+ + {/* Editable code editor panel */} + +
{ + if (e.key === "Escape") { + e.stopPropagation(); + } + }} + > + +
+
+ + {/* Read-only type footer panel */} + {typeFooter && ( + <> + + +
+ +
+
+ + )} +
- +
); }; -const CodeEvaluatorOutputConfigSection = () => { +/** + * Output configuration section (inside accordion) + */ +const OutputConfigSection = () => { const store = useEvaluatorStoreInstance(); const outputConfig = useEvaluatorStore((state) => state.outputConfigs[0]); const evaluatorName = useEvaluatorStore( @@ -391,66 +636,61 @@ const CodeEvaluatorOutputConfigSection = () => { } return ( - - - Evaluator Annotation - - - Configure the annotation produced by this evaluator. - - + + + + + The name of the annotation that will be created by this evaluator. + Fixed to the evaluator name. + + + + { - const nextType = - value as (typeof outputTypeOptions)[number]["id"]; - store.getState().setOutputConfigs([ - nextType === "categorical" - ? { - name: evaluatorName, - optimizationDirection: "NONE", - values: [ - { label: "pass", score: 1 }, - { label: "fail", score: 0 }, - ], - } - : createDefaultContinuousOutputConfig(evaluatorName), - ]); - }} - > - - - - - {outputTypeOptions.map((option) => ( - - {option.label} - - ))} - - - - - - - - - {"values" in outputConfig ? ( - - ) : ( - - )} - - + + + + The type of output that will be created by this evaluator. Your code + should return a numerical score or a categorical label. + + + + {outputTypeOptions.map((option) => ( + + {option.label} + + ))} + + + + + + + {"values" in outputConfig ? ( + + ) : ( + + )} ); }; @@ -462,15 +702,18 @@ const CategoricalChoicesEditor = ({ }) => { const setOutputConfigs = useEvaluatorStore((state) => state.setOutputConfigs); const outputConfig = useEvaluatorStore((state) => state.outputConfigs[0]); + if (!outputConfig || !("values" in outputConfig)) { return null; } + const updateValues = (nextValues: ClassificationChoice[]) => { setOutputConfigs([{ ...outputConfig, values: nextValues }]); }; + return ( - + Choices {values.map((choice, index) => ( @@ -501,28 +744,27 @@ const CategoricalChoicesEditor = ({ ))} ); @@ -534,167 +776,52 @@ const ContinuousBoundsEditor = ({ config: ContinuousEvaluatorAnnotationConfig; }) => { const setOutputConfigs = useEvaluatorStore((state) => state.setOutputConfigs); + const updateConfig = ( updates: Partial ) => { setOutputConfigs([{ ...config, ...updates }]); }; + return ( -
+ { + onChange={(value) => updateConfig({ lowerBound: value.trim() === "" ? null : Number(value), - }); - }} + }) + } > - + { + onChange={(value) => updateConfig({ upperBound: value.trim() === "" ? null : Number(value), - }); - }} + }) + } > - + - -
- ); -}; - -export const CodeEvaluatorSourceCodeBlock = ({ - language, - sourceCode, -}: { - language: CodeEvaluatorLanguage; - sourceCode: string; -}) => { - if (language === "PYTHON") { - return ; - } - return ; -}; - -const CodeEvaluatorSandboxField = ({ - sandboxConfigs, - selectedSandboxConfigId, - onSelectionChange, -}: { - sandboxConfigs: SandboxConfigOption[]; - selectedSandboxConfigId: number | null; - onSelectionChange: (sandboxConfigId: number | null) => void; -}) => { - return ( - - 0 - ? "Select a sandbox config" - : "No sandbox configs available" - } - selectedKey={ - selectedSandboxConfigId != null - ? String(selectedSandboxConfigId) - : null - } - onSelectionChange={(key) => { - if (typeof key === "string") { - onSelectionChange(Number(key)); - } else { - onSelectionChange(null); - } - }} - defaultItems={sandboxConfigs} - menuTrigger="focus" - isDisabled={sandboxConfigs.length === 0} - renderEmptyState={() => ( -
No sandbox configs found for this language
- )} - > - {(item) => ( - - - {item.name} - {item.description ? ( - - {item.description} - - ) : ( - - {item.providerLabel} - - )} - - - )} -
- - Code evaluators run in a sandbox. Configure reusable sandbox configs in - Settings if none are available here. - -
- ); -}; - -export const mapSandboxConfigOptions = ( - sandboxProviders: ReadonlyArray<{ - language: CodeEvaluatorLanguage; - backendType: string; - configs: ReadonlyArray<{ - id: string; - name: string; - description?: string | null; - }>; - }> -): SandboxConfigOption[] => { - return sandboxProviders.flatMap((provider) => - provider.configs.map((config) => ({ - id: decodeRelayNodeId(config.id), - name: config.name, - description: config.description, - providerLanguage: provider.language, - providerLabel: backendTypeLabel(provider.backendType), - })) + ); }; -const BACKEND_TYPE_LABELS: Record = { - WASM: "WebAssembly", - E2B: "E2B", - DAYTONA_PYTHON: "Daytona", - VERCEL_PYTHON: "Vercel", - VERCEL_TYPESCRIPT: "Vercel", - DENO: "Deno", - MODAL: "Modal", -}; - -const backendTypeLabel = (backendType: string): string => - BACKEND_TYPE_LABELS[backendType] ?? backendType; - -const decodeRelayNodeId = (globalId: string) => { - const decoded = globalThis.atob(globalId); - const [, rawId = ""] = decoded.split(":", 2); - return Number(rawId); -}; - +// Validation helper const getCodeEvaluatorValidationError = ({ outputConfigs, sourceCode, + mode, + sandboxConfigId, }: { outputConfigs: AnnotationConfig[]; sourceCode: string; + mode: "create" | "update"; + sandboxConfigId: string | null; }) => { if (sourceCode.trim().length === 0) { return "Source code is required."; @@ -702,6 +829,10 @@ const getCodeEvaluatorValidationError = ({ if (outputConfigs.length === 0) { return "At least one output config is required."; } + // Require sandbox selection when creating a new evaluator + if (mode === "create" && sandboxConfigId == null) { + return "Please select a sandbox configuration."; + } const outputConfig = outputConfigs[0]; if ("values" in outputConfig) { if (outputConfig.values.length < 2) { @@ -717,54 +848,137 @@ const getCodeEvaluatorValidationError = ({ return undefined; }; +// Styles const fieldsetCSS = css` all: unset; display: flex; flex-direction: column; flex: 1; min-height: 0; - gap: var(--global-dimension-size-200); - overflow: auto; + overflow: hidden; +`; + +const headerBarCSS = css` + display: flex; + flex-direction: row; + align-items: flex-start; + gap: var(--global-dimension-size-150); + padding: var(--global-dimension-size-150) var(--global-dimension-size-200); + border-bottom: 1px solid var(--global-border-color-default); + flex-shrink: 0; +`; + +const headerFieldCSS = css` + /* Ensure fields don't wrap */ `; const panelStyle = { height: "100%", - overflowY: "auto", -} as const; + display: "flex", + flexDirection: "column" as const, + minHeight: 0, + overflow: "hidden" as const, +}; -const leftPanelCSS = css` +const editorPanelCSS = css` display: flex; flex-direction: column; - padding: var(--global-dimension-size-100) var(--global-dimension-size-200); + flex: 1; + min-height: 0; + padding: var(--global-dimension-size-150); + padding-top: var(--global-dimension-size-100); box-sizing: border-box; +`; + +const editorSectionCSS = css` + display: flex; + flex-direction: column; + flex: 1; + min-height: 0; gap: var(--global-dimension-size-100); `; -const rightPanelCSS = css` +const sidebarPanelCSS = css` display: flex; flex-direction: column; - padding: var(--global-dimension-size-100) 0; + height: 100%; + padding: 0; box-sizing: border-box; + overflow-y: auto; + border-left: 1px solid var(--global-border-color-default); `; -const editorWrapCSS = css` +const accordionContentCSS = css` + padding: var(--global-dimension-size-50) 0; + padding-bottom: var(--global-dimension-size-150); +`; + +const 
editorContainerCSS = css` + display: flex; + flex-direction: column; + flex: 1; + min-height: 0; border: 1px solid var(--global-border-color-default); border-radius: var(--global-rounding-medium); overflow: hidden; + background-color: var(--code-mirror-editor-background-color); +`; + +const editorPanelStyle = { + display: "flex", + flexDirection: "column" as const, + minHeight: 0, + overflow: "hidden" as const, +}; + +const editorWrapCSS = css` + flex: 1; + min-height: 0; + overflow: hidden; + display: flex; + flex-direction: column; + + & .cm-theme { + height: 100% !important; + } + + & .cm-editor { + height: 100% !important; + } + + & .cm-scroller { + overflow: auto !important; + } +`; + +const typeFooterCSS = css` + flex: 1; + min-height: 0; + overflow: hidden; + display: flex; + flex-direction: column; + + & .cm-theme { + height: 100% !important; + } & .cm-editor { - min-height: 280px; + height: 100% !important; + background-color: var(--ac-global-color-grey-100); } - & .cm-content, - & .cm-gutter { - min-height: 280px; + & .cm-gutters { + background-color: var(--ac-global-color-grey-100); + } + + & .cm-scroller { + overflow: auto !important; } `; const choiceGridCSS = css` display: grid; - grid-template-columns: 1.5fr 1fr auto; - gap: var(--global-dimension-size-100); - align-items: end; + grid-template-columns: 1fr 100px 32px; + gap: var(--global-dimension-size-50); + align-items: center; `; diff --git a/app/src/components/evaluators/__generated__/CodeEvaluatorTestSectionMutation.graphql.ts b/app/src/components/evaluators/__generated__/CodeEvaluatorTestSectionMutation.graphql.ts new file mode 100644 index 00000000000..1742332fe5b --- /dev/null +++ b/app/src/components/evaluators/__generated__/CodeEvaluatorTestSectionMutation.graphql.ts @@ -0,0 +1,300 @@ +/** + * @generated SignedSource<<54099212158b2aedaffde087d5ae2557>> + * @lightSyntaxTransform + * @nogrep + */ + +/* tslint:disable */ +/* eslint-disable */ +// @ts-nocheck + +import { ConcreteRequest } from 'relay-runtime'; +export type GenerativeProviderKey = "ANTHROPIC" | "AWS" | "AZURE_OPENAI" | "CEREBRAS" | "DEEPSEEK" | "FIREWORKS" | "GOOGLE" | "GROQ" | "MOONSHOT" | "OLLAMA" | "OPENAI" | "PERPLEXITY" | "TOGETHER" | "XAI"; +export type Language = "PYTHON" | "TYPESCRIPT"; +export type OptimizationDirection = "MAXIMIZE" | "MINIMIZE" | "NONE"; +export type PromptMessageRole = "AI" | "SYSTEM" | "TOOL" | "USER"; +export type PromptTemplateFormat = "F_STRING" | "MUSTACHE" | "NONE"; +export type EvaluatorPreviewsInput = { + credentials?: ReadonlyArray | null; + previews: ReadonlyArray; +}; +export type EvaluatorPreviewItemInput = { + context: any; + evaluator: EvaluatorPreviewInput; + inputMapping?: EvaluatorInputMappingInput; +}; +export type EvaluatorPreviewInput = { + builtInEvaluatorId?: string | null; + codeEvaluatorId?: string | null; + inlineCodeEvaluator?: InlineCodeEvaluatorInput | null; + inlineLlmEvaluator?: InlineLLMEvaluatorInput | null; +}; +export type InlineLLMEvaluatorInput = { + description?: string | null; + name: string; + outputConfigs: ReadonlyArray; + promptVersion: ChatPromptVersionInput; +}; +export type ChatPromptVersionInput = { + customProviderId?: string | null; + description?: string | null; + invocationParameters?: any; + modelName: string; + modelProvider: GenerativeProviderKey; + responseFormat?: PromptResponseFormatJSONSchemaInput | null; + template: PromptChatTemplateInput; + templateFormat: PromptTemplateFormat; + tools?: PromptToolsInput | null; +}; +export type PromptChatTemplateInput = { + 
+export type PromptChatTemplateInput = {
+  messages: ReadonlyArray<PromptMessageInput>;
+};
+export type PromptMessageInput = {
+  content: ReadonlyArray<ContentPartInput>;
+  role: PromptMessageRole;
+};
+export type ContentPartInput = {
+  text?: TextContentValueInput | null;
+  toolCall?: ToolCallContentValueInput | null;
+  toolResult?: ToolResultContentValueInput | null;
+};
+export type TextContentValueInput = {
+  text: string;
+};
+export type ToolCallContentValueInput = {
+  toolCall: ToolCallFunctionInput;
+  toolCallId: string;
+};
+export type ToolCallFunctionInput = {
+  arguments: string;
+  name: string;
+  type?: string | null;
+};
+export type ToolResultContentValueInput = {
+  result: any;
+  toolCallId: string;
+};
+export type PromptToolsInput = {
+  disableParallelToolCalls?: boolean | null;
+  toolChoice?: PromptToolChoiceInput | null;
+  tools: ReadonlyArray<PromptToolFunctionInput>;
+};
+export type PromptToolFunctionInput = {
+  function: PromptToolFunctionDefinitionInput;
+};
+export type PromptToolFunctionDefinitionInput = {
+  description?: string | null;
+  name: string;
+  parameters?: any | null;
+  strict?: boolean | null;
+};
+export type PromptToolChoiceInput = {
+  functionName?: string | null;
+  none?: boolean | null;
+  oneOrMore?: boolean | null;
+  zeroOrMore?: boolean | null;
+};
+export type PromptResponseFormatJSONSchemaInput = {
+  jsonSchema: PromptResponseFormatJSONSchemaDefinitionInput;
+  type: string;
+};
+export type PromptResponseFormatJSONSchemaDefinitionInput = {
+  description?: string | null;
+  name: string;
+  schema?: any | null;
+  strict?: boolean | null;
+};
+export type AnnotationConfigInput = {
+  categorical?: CategoricalAnnotationConfigInput | null;
+  continuous?: ContinuousAnnotationConfigInput | null;
+  freeform?: FreeformAnnotationConfigInput | null;
+};
+export type CategoricalAnnotationConfigInput = {
+  description?: string | null;
+  name: string;
+  optimizationDirection: OptimizationDirection;
+  values: ReadonlyArray<CategoricalAnnotationConfigValueInput>;
+};
+export type CategoricalAnnotationConfigValueInput = {
+  label: string;
+  score?: number | null;
+};
+export type ContinuousAnnotationConfigInput = {
+  description?: string | null;
+  lowerBound?: number | null;
+  name: string;
+  optimizationDirection: OptimizationDirection;
+  upperBound?: number | null;
+};
+export type FreeformAnnotationConfigInput = {
+  description?: string | null;
+  name: string;
+};
+export type InlineCodeEvaluatorInput = {
+  description?: string | null;
+  language: Language;
+  name: string;
+  outputConfigs: ReadonlyArray<AnnotationConfigInput>;
+  sandboxConfigId?: string | null;
+  sourceCode: string;
+};
+export type EvaluatorInputMappingInput = {
+  literalMapping?: any;
+  pathMapping?: any;
+};
+export type GenerativeCredentialInput = {
+  envVarName: string;
+  value: string;
+};
+export type CodeEvaluatorTestSectionMutation$variables = {
+  input: EvaluatorPreviewsInput;
+};
+export type CodeEvaluatorTestSectionMutation$data = {
+  readonly evaluatorPreviews: {
+    readonly results: ReadonlyArray<{
+      readonly annotation: {
+        readonly explanation: string | null;
+        readonly id: string;
+        readonly label: string | null;
+        readonly name: string;
+        readonly score: number | null;
+      } | null;
+      readonly error: string | null;
+      readonly evaluatorName: string;
+    }>;
+  };
+};
+export type CodeEvaluatorTestSectionMutation = {
+  response: CodeEvaluatorTestSectionMutation$data;
+  variables: CodeEvaluatorTestSectionMutation$variables;
+};
+
+const node: ConcreteRequest = (function(){
+var v0 = [
+  {
+    "defaultValue": null,
+    "kind": "LocalArgument",
+    "name": "input"
+  }
+],
+v1 = [
+  {
+    "alias": null,
+    "args": [
+      {
+        "kind": "Variable",
+        "name": 
"input", + "variableName": "input" + } + ], + "concreteType": "EvaluatorPreviewsPayload", + "kind": "LinkedField", + "name": "evaluatorPreviews", + "plural": false, + "selections": [ + { + "alias": null, + "args": null, + "concreteType": "EvaluationResult", + "kind": "LinkedField", + "name": "results", + "plural": true, + "selections": [ + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "evaluatorName", + "storageKey": null + }, + { + "alias": null, + "args": null, + "concreteType": "ExperimentRunAnnotation", + "kind": "LinkedField", + "name": "annotation", + "plural": false, + "selections": [ + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "explanation", + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "label", + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "score", + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "name", + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "id", + "storageKey": null + } + ], + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "error", + "storageKey": null + } + ], + "storageKey": null + } + ], + "storageKey": null + } +]; +return { + "fragment": { + "argumentDefinitions": (v0/*: any*/), + "kind": "Fragment", + "metadata": null, + "name": "CodeEvaluatorTestSectionMutation", + "selections": (v1/*: any*/), + "type": "Mutation", + "abstractKey": null + }, + "kind": "Request", + "operation": { + "argumentDefinitions": (v0/*: any*/), + "kind": "Operation", + "name": "CodeEvaluatorTestSectionMutation", + "selections": (v1/*: any*/) + }, + "params": { + "cacheID": "880a87909fde069d7330f259c8cea2a2", + "id": null, + "metadata": {}, + "name": "CodeEvaluatorTestSectionMutation", + "operationKind": "mutation", + "text": "mutation CodeEvaluatorTestSectionMutation(\n $input: EvaluatorPreviewsInput!\n) {\n evaluatorPreviews(input: $input) {\n results {\n evaluatorName\n annotation {\n explanation\n label\n score\n name\n id\n }\n error\n }\n }\n}\n" + } +}; +})(); + +(node as any).hash = "51564928488b73ce3754b374762baa5a"; + +export default node; diff --git a/app/src/components/evaluators/__generated__/EvaluatorOutputPreviewMutation.graphql.ts b/app/src/components/evaluators/__generated__/EvaluatorOutputPreviewMutation.graphql.ts index 3b4b941b551..72b36520378 100644 --- a/app/src/components/evaluators/__generated__/EvaluatorOutputPreviewMutation.graphql.ts +++ b/app/src/components/evaluators/__generated__/EvaluatorOutputPreviewMutation.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<<3c3d1c0f71d7aee0d1c7e07898318cea>> + * @generated SignedSource<<24debaf7bb672a75d29e86628a268828>> * @lightSyntaxTransform * @nogrep */ @@ -10,6 +10,7 @@ import { ConcreteRequest } from 'relay-runtime'; export type GenerativeProviderKey = "ANTHROPIC" | "AWS" | "AZURE_OPENAI" | "CEREBRAS" | "DEEPSEEK" | "FIREWORKS" | "GOOGLE" | "GROQ" | "MOONSHOT" | "OLLAMA" | "OPENAI" | "PERPLEXITY" | "TOGETHER" | "XAI"; +export type Language = "PYTHON" | "TYPESCRIPT"; export type OptimizationDirection = "MAXIMIZE" | "MINIMIZE" | "NONE"; export type PromptMessageRole = "AI" | "SYSTEM" | "TOOL" | "USER"; export type PromptTemplateFormat = "F_STRING" | "MUSTACHE" | "NONE"; @@ -25,6 +26,7 @@ export type EvaluatorPreviewItemInput = { export type EvaluatorPreviewInput = { builtInEvaluatorId?: string | null; 
codeEvaluatorId?: string | null;
+  inlineCodeEvaluator?: InlineCodeEvaluatorInput | null;
   inlineLlmEvaluator?: InlineLLMEvaluatorInput | null;
 };
 export type InlineLLMEvaluatorInput = {
@@ -128,6 +130,14 @@ export type FreeformAnnotationConfigInput = {
   description?: string | null;
   name: string;
 };
+export type InlineCodeEvaluatorInput = {
+  description?: string | null;
+  language: Language;
+  name: string;
+  outputConfigs: ReadonlyArray<AnnotationConfigInput>;
+  sandboxConfigId?: string | null;
+  sourceCode: string;
+};
 export type EvaluatorInputMappingInput = {
   literalMapping?: any;
   pathMapping?: any;
diff --git a/app/src/components/evaluators/__tests__/codeEvaluatorAutocomplete.test.ts b/app/src/components/evaluators/__tests__/codeEvaluatorAutocomplete.test.ts
new file mode 100644
index 00000000000..0de144cbb31
--- /dev/null
+++ b/app/src/components/evaluators/__tests__/codeEvaluatorAutocomplete.test.ts
@@ -0,0 +1,83 @@
+import { describe, expect, it } from "vitest";
+
+import { createCompletionOptions } from "../codeEvaluatorAutocomplete";
+
+describe("createCompletionOptions", () => {
+  const mappingSource = {
+    output: {
+      answer: "Paris",
+      nested: {
+        score: 1,
+      },
+      items: [
+        {
+          name: "alpha",
+        },
+      ],
+    },
+    reference: {
+      answer: "Paris",
+    },
+    input: {
+      question: "What is the capital of France?",
+    },
+    metadata: {
+      isGolden: true,
+    },
+  };
+
+  it("produces sensible nested and indexed property completions", () => {
+    const options = createCompletionOptions({
+      mappingSource,
+      language: "TYPESCRIPT",
+    });
+
+    const labels = options.map((option) => option.label);
+
+    expect(labels).toContain("output");
+    expect(labels).toContain("reference");
+    expect(labels).toContain("input");
+    expect(labels).toContain("metadata");
+    expect(labels).toContain("output.answer");
+    expect(labels).toContain("output.nested");
+    expect(labels).toContain("output.nested.score");
+    expect(labels).toContain("output.items");
+    expect(labels).toContain("output.items[0]");
+    expect(labels).toContain("output.items[0].name");
+  });
+
+  it("includes useful type information for completion entries", () => {
+    const options = createCompletionOptions({
+      mappingSource,
+      language: "TYPESCRIPT",
+    });
+
+    expect(
+      options.find((option) => option.label === "output.answer")?.info
+    ).toBe('string: "Paris"');
+    expect(
+      options.find((option) => option.label === "output.items")?.info
+    ).toBe("array (1 items)");
+    expect(
+      options.find((option) => option.label === "metadata.isGolden")?.info
+    ).toBe("boolean: true");
+  });
+
+  it("adds language-specific helper completions", () => {
+    const pythonOptions = createCompletionOptions({
+      mappingSource,
+      language: "PYTHON",
+    });
+    const typescriptOptions = createCompletionOptions({
+      mappingSource,
+      language: "TYPESCRIPT",
+    });
+
+    expect(pythonOptions.map((option) => option.label)).toContain(".get(");
+    expect(pythonOptions.map((option) => option.label)).toContain(
+      "isinstance("
+    );
+    expect(typescriptOptions.map((option) => option.label)).toContain("?.");
+    expect(typescriptOptions.map((option) => option.label)).toContain("typeof");
+  });
+});
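
The indexed labels asserted above (e.g. `output.items[0].name`) come from flattening each mapping-source field with index formatting enabled. A minimal Python sketch of that flattening, assuming dict keys join with `.` and list elements index with `[i]` (the real code delegates to `flattenObject` with `formatIndices: true`):

def flatten_paths(value, prefix=""):
    # Emits every non-terminal and terminal path, matching the test expectations.
    paths = []
    if isinstance(value, dict):
        for key, child in value.items():
            path = f"{prefix}.{key}" if prefix else key
            paths.append(path)
            paths.extend(flatten_paths(child, path))
    elif isinstance(value, list):
        for i, child in enumerate(value):
            path = f"{prefix}[{i}]"
            paths.append(path)
            paths.extend(flatten_paths(child, path))
    return paths

# flatten_paths({"output": {"items": [{"name": "alpha"}]}})
# -> ["output", "output.items", "output.items[0]", "output.items[0].name"]
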
"@codemirror/autocomplete"; + +import type { EvaluatorMappingSource } from "@phoenix/types"; +import { flattenObject } from "@phoenix/utils/jsonUtils"; + +/** + * Generates a human-readable type description for a value. + */ +function getTypeDescription(value: unknown): string { + if (value === null) return "null"; + if (value === undefined) return "undefined"; + if (Array.isArray(value)) { + if (value.length === 0) return "array (empty)"; + return `array (${value.length} items)`; + } + if (typeof value === "object") { + const keys = Object.keys(value as Record); + if (keys.length <= 3) return `object { ${keys.join(", ")} }`; + return `object (${keys.length} keys)`; + } + if (typeof value === "string") { + if (value.length > 30) return `string: "${value.slice(0, 30)}..."`; + return `string: "${value}"`; + } + if (typeof value === "number") return `number: ${value}`; + if (typeof value === "boolean") return `boolean: ${value}`; + return typeof value; +} + +/** + * Creates autocomplete options from the evaluator mapping source. + */ +export function createCompletionOptions({ + mappingSource, + language, +}: { + mappingSource: EvaluatorMappingSource; + language: "PYTHON" | "TYPESCRIPT"; +}): Completion[] { + const options: Completion[] = []; + + // Add top-level parameter completions + const topLevelParams = [ + { + name: "output", + data: mappingSource.output, + info: "The output from the task being evaluated", + }, + { + name: "reference", + data: mappingSource.reference, + info: "The expected/reference output from the dataset", + }, + { + name: "input", + data: mappingSource.input, + info: "The input provided to the task", + }, + { + name: "metadata", + data: mappingSource.metadata, + info: "Additional metadata from the dataset", + }, + ]; + + for (const { name, data, info } of topLevelParams) { + options.push({ + label: name, + type: "variable", + info, + boost: 10, // Boost top-level params + }); + + // Add nested field completions + if (data && typeof data === "object" && Object.keys(data).length > 0) { + const flattened = flattenObject({ + obj: data as Record, + parentKey: name, + keepNonTerminalValues: true, + formatIndices: true, + }) as Record; + for (const [path, value] of Object.entries(flattened)) { + options.push({ + label: path, + type: "property", + info: getTypeDescription(value), + boost: 5, + }); + } + } + } + + // Add language-specific helper completions + if (language === "PYTHON") { + options.push( + { + label: ".get(", + type: "method", + info: "Safely get a dict value with optional default", + apply: '.get("key", "")', + boost: 3, + }, + { + label: "isinstance(", + type: "function", + info: "Check if value is an instance of a type", + apply: "isinstance(output, dict)", + boost: 2, + } + ); + } else { + options.push( + { + label: "?.", + type: "keyword", + info: "Optional chaining operator", + boost: 3, + }, + { + label: "typeof", + type: "keyword", + info: "Check the type of a value", + apply: 'typeof output?.field === "string"', + boost: 2, + } + ); + } + + return options; +} + +/** + * Creates a completion function for the code evaluator editor. 
+
+/**
+ * Creates a completion function for the code evaluator editor.
+ */
+function createEvaluatorCompletions(
+  mappingSource: EvaluatorMappingSource,
+  language: "PYTHON" | "TYPESCRIPT"
+): (context: CompletionContext) => CompletionResult | null {
+  return (context: CompletionContext): CompletionResult | null => {
+    // Match word characters and dots (for nested access like output.answer)
+    const word = context.matchBefore(/[\w.?]*/);
+    if (!word) return null;
+
+    // Don't autocomplete if we're not at a word boundary or explicit
+    if (word.from === word.to && !context.explicit) return null;
+
+    const options = createCompletionOptions({ mappingSource, language });
+
+    // Filter options based on what's typed
+    const typed = word.text.toLowerCase();
+    const filteredOptions = typed
+      ? options.filter((opt) => opt.label.toLowerCase().includes(typed))
+      : options;
+
+    if (filteredOptions.length === 0) return null;
+
+    return {
+      from: word.from,
+      options: filteredOptions,
+      validFor: /^[\w.?]*$/,
+    };
+  };
+}
+
+/**
+ * Creates the autocompletion extension for the code evaluator editor.
+ */
+export function createEvaluatorAutocompletion(
+  mappingSource: EvaluatorMappingSource,
+  language: "PYTHON" | "TYPESCRIPT"
+) {
+  return autocompletion({
+    override: [createEvaluatorCompletions(mappingSource, language)],
+    activateOnTyping: true,
+    maxRenderedOptions: 50,
+  });
+}
diff --git a/app/src/components/evaluators/codeEvaluatorTypeGeneration.ts b/app/src/components/evaluators/codeEvaluatorTypeGeneration.ts
new file mode 100644
index 00000000000..db4b0d17760
--- /dev/null
+++ b/app/src/components/evaluators/codeEvaluatorTypeGeneration.ts
@@ -0,0 +1,253 @@
+import type { EvaluatorMappingSource } from "@phoenix/types";
+
+/**
+ * Infers a TypeScript type string from a JavaScript value.
+ */
+function inferTypeFromValue(value: unknown, indent = 0): string {
+  const spaces = " ".repeat(indent);
+
+  if (value === null) {
+    return "null";
+  }
+
+  if (value === undefined) {
+    return "undefined";
+  }
+
+  if (typeof value === "string") {
+    return "string";
+  }
+
+  if (typeof value === "number") {
+    return "number";
+  }
+
+  if (typeof value === "boolean") {
+    return "boolean";
+  }
+
+  if (Array.isArray(value)) {
+    if (value.length === 0) {
+      return "unknown[]";
+    }
+    // Infer type from first element
+    const elementType = inferTypeFromValue(value[0], indent);
+    return `${elementType}[]`;
+  }
+
+  if (typeof value === "object") {
+    const entries = Object.entries(value as Record<string, unknown>);
+    if (entries.length === 0) {
+      return "Record<string, unknown>";
+    }
+
+    const innerSpaces = " ".repeat(indent + 1);
+    const properties = entries
+      .map(([key, val]) => {
+        const safeKey = /^[a-zA-Z_$][a-zA-Z0-9_$]*$/.test(key)
+          ? key
+          : `"${key}"`;
+        return `${innerSpaces}${safeKey}: ${inferTypeFromValue(val, indent + 1)};`;
+      })
+      .join("\n");
+
+    return `{\n${properties}\n${spaces}}`;
+  }
+
+  return "unknown";
+}
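
A worked example for `inferTypeFromValue` (whitespace approximate; one space per indent level, per the `spaces`/`innerSpaces` arithmetic above):

# inferTypeFromValue({"answer": "Paris", "tags": ["x"], "nested": {"score": 1}})
# returns the type body:
#   {
#    answer: string;
#    tags: string[];
#    nested: {
#     score: number;
#    };
#   }
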
+
+/**
+ * Generates TypeScript interface definitions from the evaluator mapping source.
+ * Returns a read-only footer block to append to the code editor.
+ */
+export function generateTypeScriptTypes(
+  mappingSource: EvaluatorMappingSource
+): string {
+  const lines: string[] = [
+    "// Auto-generated types from dataset example (read-only)",
+    "// These types reflect the structure of your dataset",
+  ];
+
+  // Generate type for each mapping source field if it has data
+  const fields: Array<{ name: string; typeName: string; data: unknown }> = [
+    { name: "input", typeName: "Input", data: mappingSource.input },
+    { name: "output", typeName: "Output", data: mappingSource.output },
+    { name: "reference", typeName: "Reference", data: mappingSource.reference },
+    { name: "metadata", typeName: "Metadata", data: mappingSource.metadata },
+  ];
+
+  for (const { typeName, data } of fields) {
+    if (data && typeof data === "object" && Object.keys(data).length > 0) {
+      const typeBody = inferTypeFromValue(data, 0);
+      lines.push(`type ${typeName} = ${typeBody};`);
+      lines.push("");
+    }
+  }
+
+  // Add the EvaluatorParams type that combines available fields
+  const availableFields = fields.filter(
+    (f) =>
+      f.data && typeof f.data === "object" && Object.keys(f.data).length > 0
+  );
+
+  if (availableFields.length > 0) {
+    lines.push("type EvaluatorParams = {");
+    for (const { name, typeName } of availableFields) {
+      lines.push(`  ${name}?: ${typeName};`);
+    }
+    lines.push("};");
+    lines.push("");
+  }
+
+  return lines.join("\n");
+}
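
For a dataset example where only `output` and `reference` carry data (say, both `{"answer": "Paris"}`), the emitted footer is roughly:

# // Auto-generated types from dataset example (read-only)
# // These types reflect the structure of your dataset
# type Output = {
#  answer: string;
# };
#
# type Reference = {
#  answer: string;
# };
#
# type EvaluatorParams = {
#   output?: Output;
#   reference?: Reference;
# };

This `EvaluatorParams` type is what the refreshed TYPESCRIPT default template below references in its destructured parameter.
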
+
+/**
+ * Infers a Python type hint string from a JavaScript value.
+ */
+function inferPythonTypeFromValue(value: unknown): string {
+  if (value === null || value === undefined) {
+    return "None";
+  }
+
+  if (typeof value === "string") {
+    return "str";
+  }
+
+  if (typeof value === "number") {
+    return Number.isInteger(value) ? "int" : "float";
+  }
+
+  if (typeof value === "boolean") {
+    return "bool";
+  }
+
+  if (Array.isArray(value)) {
+    if (value.length === 0) {
+      return "list";
+    }
+    const elementType = inferPythonTypeFromValue(value[0]);
+    return `list[${elementType}]`;
+  }
+
+  if (typeof value === "object") {
+    const entries = Object.entries(value as Record<string, unknown>);
+    if (entries.length === 0) {
+      return "dict";
+    }
+    // For complex objects, use TypedDict representation in docstring
+    return "dict";
+  }
+
+  return "Any";
+}
+
+/**
+ * Generates a formatted dict structure description for Python docstrings.
+ */
+function formatPythonDictStructure(
+  data: Record<string, unknown>,
+  indent = 0
+): string[] {
+  const lines: string[] = [];
+  const spaces = " ".repeat(indent);
+
+  for (const [key, value] of Object.entries(data)) {
+    if (value && typeof value === "object" && !Array.isArray(value)) {
+      lines.push(`${spaces}"${key}": {`);
+      lines.push(
+        ...formatPythonDictStructure(
+          value as Record<string, unknown>,
+          indent + 1
+        )
+      );
+      lines.push(`${spaces}}`);
+    } else if (
+      Array.isArray(value) &&
+      value.length > 0 &&
+      typeof value[0] === "object"
+    ) {
+      lines.push(`${spaces}"${key}": [`);
+      lines.push(`${spaces} {`);
+      lines.push(
+        ...formatPythonDictStructure(
+          value[0] as Record<string, unknown>,
+          indent + 2
+        )
+      );
+      lines.push(`${spaces} }`);
+      lines.push(`${spaces}]`);
+    } else {
+      const typeHint = inferPythonTypeFromValue(value);
+      lines.push(`${spaces}"${key}": ${typeHint}`);
+    }
+  }
+
+  return lines;
+}
+
+/**
+ * Generates Python docstring/type hints from the evaluator mapping source.
+ * Returns a read-only footer block to append to the code editor.
+ */
+export function generatePythonTypes(
+  mappingSource: EvaluatorMappingSource
+): string {
+  const lines: string[] = [
+    '"""',
+    "Auto-generated type information from dataset example (read-only)",
+    "These types reflect the structure of your dataset",
+  ];
+
+  const fields: Array<{ name: string; data: unknown }> = [
+    { name: "input", data: mappingSource.input },
+    { name: "output", data: mappingSource.output },
+    { name: "reference", data: mappingSource.reference },
+    { name: "metadata", data: mappingSource.metadata },
+  ];
+
+  for (const { name, data } of fields) {
+    if (data && typeof data === "object" && Object.keys(data).length > 0) {
+      lines.push(`${name}: dict`);
+      const structureLines = formatPythonDictStructure(
+        data as Record<string, unknown>,
+        1
+      );
+      if (structureLines.length > 0) {
+        lines.push("  {");
+        lines.push(...structureLines);
+        lines.push("  }");
+      }
+      lines.push("");
+    }
+  }
+
+  lines.push('"""');
+  lines.push("");
+
+  return lines.join("\n");
+}
+
+/**
+ * Generates type definitions based on language.
+ */
+export function generateEvaluatorTypes(
+  language: "PYTHON" | "TYPESCRIPT",
+  mappingSource: EvaluatorMappingSource
+): string {
+  // Only generate types if there's meaningful data
+  const hasData = Object.values(mappingSource).some(
+    (data) => data && typeof data === "object" && Object.keys(data).length > 0
+  );
+
+  if (!hasData) {
+    return "";
+  }
+
+  if (language === "PYTHON") {
+    return generatePythonTypes(mappingSource);
+  }
+
+  return generateTypeScriptTypes(mappingSource);
+}
diff --git a/app/src/components/evaluators/codeEvaluatorUtils.ts b/app/src/components/evaluators/codeEvaluatorUtils.ts
index 4c4903a2750..a92df4bbe52 100644
--- a/app/src/components/evaluators/codeEvaluatorUtils.ts
+++ b/app/src/components/evaluators/codeEvaluatorUtils.ts
@@ -5,11 +5,16 @@ export const DEFAULT_CODE_EVALUATOR_SOURCE: Record<
   string
 > = {
   PYTHON: `def evaluate(output, reference=None, input=None, metadata=None):
+    """
+    Evaluate the output against the reference.
+    See the auto-generated type information below for the structure of each parameter.
+    """
     candidate = output.get("answer", "") if isinstance(output, dict) else ""
     expected = reference.get("answer", "") if isinstance(reference, dict) else ""
     return 1 if candidate == expected else 0
 `,
-  TYPESCRIPT: `function evaluate({ output, reference }: { output?: Record<string, unknown>; reference?: Record<string, unknown> }) {
+  TYPESCRIPT: `function evaluate({ output, reference, input, metadata }: EvaluatorParams) {
+  // See the auto-generated type definitions below for the structure of each parameter.
   const candidate = typeof output?.answer === "string" ? output.answer : "";
   const expected = typeof reference?.answer === "string" ? reference.answer : "";
   return candidate === expected ? 1 : 0;
@@ -65,17 +70,12 @@ function extractTypeScriptVariables(sourceCode: string) {
       .map((part) => part.trim())
       .filter(Boolean)
       .map((part) => part.split(":")[0]?.trim() ?? "")
+      .map((part) => part.split("=")[0]?.trim() ?? "")
+      .map((part) => part.replace(/\?$/, "").trim())
       .filter(Boolean)
       .filter(unique);
   }
-  const firstParam = params.split(",")[0]?.trim() ?? "";
-  const paramName = firstParam.split(":")[0]?.trim() ?? 
""; - if (!paramName) { - return []; - } - const accessPattern = new RegExp(`${paramName}\\.([a-zA-Z_$][\\w$]*)`, "g"); - const matches = sourceCode.matchAll(accessPattern); - return Array.from(matches, (match) => match[1]).filter(unique); + return []; } function unique(value: string, index: number, values: string[]) { diff --git a/app/src/pages/dataset/evaluators/CodeDatasetEvaluatorDetails.tsx b/app/src/pages/dataset/evaluators/CodeDatasetEvaluatorDetails.tsx index 68d2873300c..b621625b279 100644 --- a/app/src/pages/dataset/evaluators/CodeDatasetEvaluatorDetails.tsx +++ b/app/src/pages/dataset/evaluators/CodeDatasetEvaluatorDetails.tsx @@ -7,7 +7,7 @@ import { graphql } from "relay-runtime"; import { Flex, Heading, Text } from "@phoenix/components"; import { JSONBlock } from "@phoenix/components/code"; import { EditCodeDatasetEvaluatorSlideover } from "@phoenix/components/dataset/EditCodeDatasetEvaluatorSlideover"; -import { CodeEvaluatorSourceCodeBlock } from "@phoenix/components/evaluators/EditCodeEvaluatorDialogContent"; +import { CodeEvaluatorSourceCodeBlock } from "@phoenix/components/evaluators/CodeEvaluatorSourceCodeBlock"; import type { CodeDatasetEvaluatorDetails_datasetEvaluator$key } from "@phoenix/pages/dataset/evaluators/__generated__/CodeDatasetEvaluatorDetails_datasetEvaluator.graphql"; const boxCSS = css` diff --git a/app/src/pages/dataset/evaluators/DatasetEvaluatorActionMenu.tsx b/app/src/pages/dataset/evaluators/DatasetEvaluatorActionMenu.tsx index 19f35a26bca..74b0a8c2adc 100644 --- a/app/src/pages/dataset/evaluators/DatasetEvaluatorActionMenu.tsx +++ b/app/src/pages/dataset/evaluators/DatasetEvaluatorActionMenu.tsx @@ -12,6 +12,7 @@ import { Text, } from "@phoenix/components"; import { EditBuiltInDatasetEvaluatorSlideover } from "@phoenix/components/dataset/EditBuiltInDatasetEvaluatorSlideover"; +import { EditCodeDatasetEvaluatorSlideover } from "@phoenix/components/dataset/EditCodeDatasetEvaluatorSlideover"; import { EditLLMDatasetEvaluatorSlideover } from "@phoenix/components/dataset/EditLLMDatasetEvaluatorSlideover"; import { StopPropagation } from "@phoenix/components/StopPropagation"; @@ -91,6 +92,14 @@ export function DatasetEvaluatorActionMenu({ onOpenChange={setIsEditDialogOpen} updateConnectionIds={updateConnectionIds} /> + ) : evaluatorKind === "CODE" ? ( + ) : ( + isGraphQLMutationResponse( + response, + "SandboxConfigDialogCreateSandboxConfigMutation" + ) + ), + page.getByRole("button", { name: "Create Config" }).click(), + ]); + + await expect(page.getByRole("dialog")).not.toBeVisible(); + await expect(page.getByText(configName, { exact: true })).toBeVisible(); +} + +async function selectComboboxOption( + page: Page, + label: string, + optionName: string, + container?: Locator +) { + const scope = container ?? 
page; + const combobox = scope.getByRole("combobox", { name: label }); + await combobox.click(); + await combobox.fill(optionName); + await page.getByRole("option", { name: optionName, exact: true }).click(); + await expect(combobox).toHaveValue(optionName); +} + +async function selectLanguage( + page: Page, + container: Locator, + language: "Python" | "TypeScript" +) { + const languageField = container + .getByText("Language", { exact: true }) + .locator(".."); + await languageField.getByRole("button").click(); + await page.getByRole("option", { name: language, exact: true }).click(); + await expect(languageField.getByRole("button")).toHaveText(language); +} + +async function openEvaluatorEditor(page: Page, evaluatorName: string) { + const evaluatorRow = page.getByRole("row").filter({ + has: page.getByRole("cell", { name: evaluatorName, exact: true }), + }); + + await evaluatorRow.getByRole("button").last().click(); + await page.getByRole("menuitem", { name: "Edit" }).click(); + await expect( + page.getByRole("heading", { name: "Edit Evaluator" }) + ).toBeVisible(); +} + +async function createCustomCodeEvaluator({ + page, + evaluatorName, + language, + sandboxName, + description, +}: { + page: Page; + evaluatorName: string; + language: "Python" | "TypeScript"; + sandboxName?: string; + description?: string; +}) { + await page.getByRole("button", { name: "Add evaluator" }).click(); + await page + .getByRole("menuitem", { name: "Create new code evaluator" }) + .click(); + + const dialog = page.getByRole("dialog"); + await expect( + page.getByRole("heading", { name: "Create Evaluator" }) + ).toBeVisible(); + + await dialog + .getByRole("textbox", { name: "Name", exact: true }) + .fill(evaluatorName); + + if (description) { + await dialog + .getByRole("textbox", { name: /Description/i }) + .fill(description); + } + + if (language === "TypeScript") { + await selectLanguage(page, dialog, "TypeScript"); + } + + if (sandboxName) { + await selectComboboxOption(page, "Sandbox", sandboxName, dialog); + } + + await page.getByRole("button", { name: "Create" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); +} + +async function expectEvaluatorDetailsPage(page: Page, evaluatorName: string) { + await page.getByRole("link", { name: evaluatorName, exact: true }).click(); + await page.waitForURL("**/evaluators/**"); + await expect( + page.getByRole("heading", { name: evaluatorName }) + ).toBeVisible(); +} + +test.describe.serial("Code Evaluators", () => { + const datasetName = `code-evals-${randomUUID().slice(0, 8)}`; + const pythonSandboxName = `python-sandbox-${randomUUID().slice(0, 8)}`; + const typeScriptSandboxName = `ts-sandbox-${randomUUID().slice(0, 8)}`; + const pythonEvaluatorName = `python-code-eval-${randomUUID().slice(0, 8)}`; + const updatedPythonEvaluatorName = `updated-python-code-eval-${randomUUID().slice(0, 8)}`; + const typeScriptEvaluatorName = `typescript-code-eval-${randomUUID().slice(0, 8)}`; + + test("can create prerequisites for code evaluator flows", async ({ + page, + }) => { + await ensureSandboxConfig(page, "Python", pythonSandboxName); + await ensureSandboxConfig(page, "TypeScript", typeScriptSandboxName); + await createDatasetWithExample(page, datasetName); + await gotoDatasetEvaluators(page, datasetName); + + await expect( + page.getByRole("tab", { name: /Evaluators/i }) + ).toHaveAttribute("aria-selected", "true"); + }); + + test("can create and render a Python code evaluator", async ({ page }) => { + await gotoDatasetEvaluators(page, datasetName); + 
+ await createCustomCodeEvaluator({ + page, + evaluatorName: pythonEvaluatorName, + language: "Python", + sandboxName: pythonSandboxName, + }); + + await expect( + page.getByRole("cell", { name: pythonEvaluatorName, exact: true }) + ).toBeVisible(); + + await expectEvaluatorDetailsPage(page, pythonEvaluatorName); + }); + + test("can create and render a TypeScript code evaluator", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + + await createCustomCodeEvaluator({ + page, + evaluatorName: typeScriptEvaluatorName, + language: "TypeScript", + sandboxName: typeScriptSandboxName, + }); + + await expect( + page.getByRole("cell", { name: typeScriptEvaluatorName, exact: true }) + ).toBeVisible(); + + await expectEvaluatorDetailsPage(page, typeScriptEvaluatorName); + }); + + test("can edit a custom Python code evaluator", async ({ page }) => { + await gotoDatasetEvaluators(page, datasetName); + await openEvaluatorEditor(page, pythonEvaluatorName); + + const dialog = page.getByRole("dialog"); + const nameInput = dialog.getByRole("textbox", { + name: "Name", + exact: true, + }); + await expect(nameInput).toHaveValue(pythonEvaluatorName); + + await nameInput.fill(updatedPythonEvaluatorName); + await page.getByRole("button", { name: "Update" }).click(); + + await expect(page.getByTestId("dialog")).not.toBeVisible(); + await expect( + page.getByRole("cell", { name: updatedPythonEvaluatorName, exact: true }) + ).toBeVisible(); + + await openEvaluatorEditor(page, updatedPythonEvaluatorName); + await expect( + page + .getByRole("dialog") + .getByRole("textbox", { name: "Name", exact: true }) + ).toHaveValue(updatedPythonEvaluatorName); + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + + await expectEvaluatorDetailsPage(page, updatedPythonEvaluatorName); + }); + + test("submits a cleared sandbox when switching back to the original language", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + await openEvaluatorEditor(page, updatedPythonEvaluatorName); + + const dialog = page.getByRole("dialog"); + const sandboxCombobox = dialog.getByRole("combobox", { name: "Sandbox" }); + + await expect(sandboxCombobox).toHaveValue(pythonSandboxName); + + await selectLanguage(page, dialog, "TypeScript"); + await selectComboboxOption(page, "Sandbox", typeScriptSandboxName, dialog); + await selectLanguage(page, dialog, "Python"); + + await expect(sandboxCombobox).toHaveValue(""); + + const updateCodeEvaluatorResponse = page.waitForResponse((response) => + isGraphQLMutationResponse( + response, + "EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation" + ) + ); + + await page.getByRole("button", { name: "Update" }).click(); + + const response = await updateCodeEvaluatorResponse; + const requestBody = response.request().postDataJSON() as { + variables: { + input: { + sandboxConfigId?: string | null; + }; + }; + }; + + expect(requestBody.variables.input.sandboxConfigId).toBeNull(); + + await expect(page.getByTestId("dialog")).not.toBeVisible(); + + await expect( + page.getByRole("cell", { name: updatedPythonEvaluatorName, exact: true }) + ).toBeVisible(); + + await openEvaluatorEditor(page, updatedPythonEvaluatorName); + await expect( + page.getByRole("dialog").getByRole("combobox", { name: "Sandbox" }) + ).toHaveValue(""); + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + }); + + // Store names for additional test 
cases + const evaluatorWithDescriptionName = `eval-with-desc-${randomUUID().slice(0, 8)}`; + const evaluatorWithDescriptionDesc = "This evaluator checks output quality"; + const updatedDescription = "Updated description for testing"; + + test("can create code evaluator with description and verify it persists", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + + await createCustomCodeEvaluator({ + page, + evaluatorName: evaluatorWithDescriptionName, + language: "Python", + sandboxName: pythonSandboxName, + description: evaluatorWithDescriptionDesc, + }); + + await expect( + page.getByRole("cell", { + name: evaluatorWithDescriptionName, + exact: true, + }) + ).toBeVisible(); + + // Reopen editor and verify description persisted + await openEvaluatorEditor(page, evaluatorWithDescriptionName); + const dialog = page.getByRole("dialog"); + const descriptionInput = dialog.getByRole("textbox", { + name: /Description/i, + }); + await expect(descriptionInput).toHaveValue(evaluatorWithDescriptionDesc); + + // Update the description + await descriptionInput.fill(updatedDescription); + await page.getByRole("button", { name: "Update" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + + // Verify updated description persisted + await openEvaluatorEditor(page, evaluatorWithDescriptionName); + await expect( + page.getByRole("dialog").getByRole("textbox", { name: /Description/i }) + ).toHaveValue(updatedDescription); + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + }); + + test("cannot create code evaluator without selecting a sandbox", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + + await page.getByRole("button", { name: "Add evaluator" }).click(); + await page + .getByRole("menuitem", { name: "Create new code evaluator" }) + .click(); + + const dialog = page.getByRole("dialog"); + await expect( + page.getByRole("heading", { name: "Create Evaluator" }) + ).toBeVisible(); + + await dialog + .getByRole("textbox", { name: "Name", exact: true }) + .fill("test-no-sandbox-eval"); + + // Don't select a sandbox - verify sandbox field is empty + const sandboxCombobox = dialog.getByRole("combobox", { name: "Sandbox" }); + await expect(sandboxCombobox).toHaveValue(""); + + // Attempt to create - should show validation error + await page.getByRole("button", { name: "Create" }).click(); + + // Verify validation error is shown + await expect( + dialog.getByText("Please select a sandbox configuration.") + ).toBeVisible(); + + // Dialog should still be open (not created) + await expect(dialog).toBeVisible(); + + // Verify the error alert header is also shown + await expect( + dialog.getByRole("heading", { + name: "Invalid code evaluator configuration", + }) + ).toBeVisible(); + }); + + test("can open test evaluator section and verify UI elements", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + await openEvaluatorEditor(page, evaluatorWithDescriptionName); + + const dialog = page.getByRole("dialog"); + + // Ensure sandbox is set for testing + const sandboxCombobox = dialog.getByRole("combobox", { name: "Sandbox" }); + if ((await sandboxCombobox.inputValue()) === "") { + await selectComboboxOption(page, "Sandbox", pythonSandboxName, dialog); + } + + // Expand the Test Evaluator section + const testSectionTrigger = dialog.getByRole("button", { + name: "Test Evaluator", + }); + await testSectionTrigger.click(); + + // Verify the Test 
button is visible (exact match to avoid matching "Test Evaluator") + const testButton = dialog.getByRole("button", { + name: "Test", + exact: true, + }); + await expect(testButton).toBeVisible(); + + // Verify the test section description is visible + await expect( + dialog.getByText( + "Run your evaluator against the example data to verify it works correctly" + ) + ).toBeVisible(); + + // Note: Actually running the test requires a working sandbox backend. + // The dismiss button test is skipped as it depends on sandbox execution. + + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + }); + + const categoricalEvaluatorName = `categorical-eval-${randomUUID().slice(0, 8)}`; + + test("can configure categorical choices in code evaluator", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + + await page.getByRole("button", { name: "Add evaluator" }).click(); + await page + .getByRole("menuitem", { name: "Create new code evaluator" }) + .click(); + + const dialog = page.getByRole("dialog"); + await expect( + page.getByRole("heading", { name: "Create Evaluator" }) + ).toBeVisible(); + + await dialog + .getByRole("textbox", { name: "Name", exact: true }) + .fill(categoricalEvaluatorName); + + await selectComboboxOption(page, "Sandbox", pythonSandboxName, dialog); + + // Expand Output Configuration section + const outputConfigTrigger = dialog.getByRole("button", { + name: "Output Configuration", + }); + // Click only if section is collapsed (check if panel is not visible) + const outputConfigPanel = dialog.getByText( + "Define the output type and optimization direction" + ); + if (!(await outputConfigPanel.isVisible())) { + await outputConfigTrigger.click(); + } + await expect(outputConfigPanel).toBeVisible(); + + // Change output type from Continuous to Categorical + const outputTypeSelect = dialog.getByRole("button", { + name: /Continuous score|Categorical label/, + }); + await outputTypeSelect.click(); + await page + .getByRole("option", { name: "Categorical label", exact: true }) + .click(); + + // Verify Choices section appears with default two choices + await expect(dialog.getByText("Choices", { exact: true })).toBeVisible(); + + // Fill in the first choice + const choiceInputs = dialog.locator('input[placeholder^="Choice"]'); + await expect(choiceInputs.first()).toBeVisible(); + await choiceInputs.first().fill("Good"); + + // Fill in the second choice + await choiceInputs.nth(1).fill("Bad"); + + // Add a third choice + await dialog.getByRole("button", { name: "+ Add choice" }).click(); + await choiceInputs.nth(2).fill("Neutral"); + + // Verify the third choice was added + await expect(choiceInputs).toHaveCount(3); + + // Remove the third choice using the aria-labeled button + const removeButtons = dialog.getByRole("button", { name: "Remove choice" }); + await expect(removeButtons).toHaveCount(3); + await removeButtons.last().click(); + + // Verify we're back to two choices + await expect(choiceInputs).toHaveCount(2); + + // Verify remove is disabled when only 2 choices remain + const remainingRemoveButtons = dialog.getByRole("button", { + name: "Remove choice", + }); + await expect(remainingRemoveButtons.first()).toBeDisabled(); + await expect(remainingRemoveButtons.last()).toBeDisabled(); + + // Create the evaluator + await page.getByRole("button", { name: "Create" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + + // Verify the evaluator was created + await expect( 
+ page.getByRole("cell", { name: categoricalEvaluatorName, exact: true }) + ).toBeVisible(); + + // Reopen and verify categorical config persisted + await openEvaluatorEditor(page, categoricalEvaluatorName); + await expect(dialog.getByText("Choices", { exact: true })).toBeVisible(); + await expect(choiceInputs.first()).toHaveValue("Good"); + await expect(choiceInputs.last()).toHaveValue("Bad"); + + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + }); +}); diff --git a/src/phoenix/server/api/evaluators.py b/src/phoenix/server/api/evaluators.py index 5498685766d..040d4834309 100644 --- a/src/phoenix/server/api/evaluators.py +++ b/src/phoenix/server/api/evaluators.py @@ -1,3 +1,4 @@ +import ast import json import logging import re @@ -2328,6 +2329,156 @@ def _get_template_variables_attributes(*, variables: dict[str, Any]) -> dict[str return {TEMPLATE_VARIABLES: json.dumps(variables)} +def _make_object_input_schema( + parameter_names: Sequence[str], + required_names: Sequence[str], +) -> dict[str, Any]: + return { + "type": "object", + "properties": {name: {} for name in parameter_names}, + "required": list(required_names), + } + + +_SUPPORTED_CODE_EVALUATOR_INPUT_NAMES = ("output", "reference", "input", "metadata") + + +def _validate_code_evaluator_input_names( + parameter_names: Sequence[str], + *, + language: str, +) -> Optional[str]: + unsupported_names = [ + name for name in parameter_names if name not in _SUPPORTED_CODE_EVALUATOR_INPUT_NAMES + ] + if not unsupported_names: + return None + supported_names = ", ".join(f"`{name}`" for name in _SUPPORTED_CODE_EVALUATOR_INPUT_NAMES) + invalid_names = ", ".join(f"`{name}`" for name in unsupported_names) + return ( + f"Could not infer the {language} evaluator inputs because the `evaluate(...)` signature " + f"uses unsupported parameter names: {invalid_names}. Supported parameter names are " + f"{supported_names}." + ) + + +def _infer_python_evaluate_input_schema(source_code: str) -> tuple[dict[str, Any], Optional[str]]: + try: + module = ast.parse(source_code) + except SyntaxError as exc: + return ( + {}, + ( + "Could not parse the Python evaluator signature. " + "Define a top-level function like " + "`def evaluate(output, reference=None, input=None, metadata=None):`. " + f"Parser error: {exc.msg}" + ), + ) + + evaluate_function = next( + ( + node + for node in module.body + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == "evaluate" + ), + None, + ) + if evaluate_function is None: + return ( + {}, + ( + "Could not infer the Python evaluator inputs because no top-level " + "`evaluate(...)` function was found. Define a function like " + "`def evaluate(output, reference=None, input=None, metadata=None):`." 
+ ), + ) + + args = evaluate_function.args + positional_args = [*args.posonlyargs, *args.args] + positional_required_count = len(positional_args) - len(args.defaults) + required_names = [arg.arg for arg in positional_args[:positional_required_count]] + required_names.extend( + arg.arg for arg, default in zip(args.kwonlyargs, args.kw_defaults) if default is None + ) + + parameter_names = [arg.arg for arg in positional_args] + parameter_names.extend(arg.arg for arg in args.kwonlyargs) + + invalid_name_error = _validate_code_evaluator_input_names( + parameter_names, + language="Python", + ) + if invalid_name_error is not None: + return ({}, invalid_name_error) + + return (_make_object_input_schema(parameter_names, required_names), None) + + +_TYPESCRIPT_FUNCTION_SIGNATURE_RE = re.compile(r"function\s+evaluate\s*\(([^)]*)\)") +_TYPESCRIPT_ARROW_SIGNATURE_RE = re.compile(r"(?:const|let|var)\s+evaluate\s*=\s*\(([^)]*)\)\s*=>") + + +def _extract_typescript_object_parameter_keys(params: str) -> tuple[list[str], list[str]]: + destructured = re.match(r"^\{([^}]*)\}", params.strip()) + if destructured is None: + return ([], []) + + parameter_names: list[str] = [] + for raw_part in destructured.group(1).split(","): + part = raw_part.strip() + if not part: + continue + part = part.split(":", 1)[0].strip() + if not part: + continue + name = part.split("=", 1)[0].rstrip("?").strip() + if not name: + continue + parameter_names.append(name) + return (parameter_names, []) + + +def _infer_typescript_evaluate_input_schema( + source_code: str, +) -> tuple[dict[str, Any], Optional[str]]: + signature = _TYPESCRIPT_FUNCTION_SIGNATURE_RE.search( + source_code + ) or _TYPESCRIPT_ARROW_SIGNATURE_RE.search(source_code) + if signature is None: + return ( + {}, + ( + "Could not infer the TypeScript evaluator inputs because no supported " + "`evaluate(...)` signature was found. Define `evaluate` as either " + "`function evaluate({ output, reference, input, metadata }: " + "EvaluatorParams) { ... }` or `const evaluate = ({ output, " + "reference, input, metadata }: EvaluatorParams) => { ... }`." + ), + ) + + parameter_names, required_names = _extract_typescript_object_parameter_keys(signature.group(1)) + if not parameter_names: + return ( + {}, + ( + "Could not infer the TypeScript evaluator inputs from the `evaluate(...)` " + "signature. Use a destructured object parameter like " + "`function evaluate({ output, reference, input, metadata }: " + "EvaluatorParams) { ... }`." + ), + ) + + invalid_name_error = _validate_code_evaluator_input_names( + parameter_names, + language="TypeScript", + ) + if invalid_name_error is not None: + return ({}, invalid_name_error) + + return (_make_object_input_schema(parameter_names, required_names), None) + + class CodeEvaluatorRunner(BaseEvaluator): """ Evaluator that executes user-provided source code in a sandbox. 
@@ -2372,7 +2523,15 @@ def output_configs(self) -> Sequence[OutputConfigType]: @property def input_schema(self) -> dict[str, Any]: - return {} + schema, _ = self._infer_input_schema() + return schema + + def _infer_input_schema(self) -> tuple[dict[str, Any], Optional[str]]: + if self._language == "PYTHON": + return _infer_python_evaluate_input_schema(self._source_code) + if self._language == "TYPESCRIPT": + return _infer_typescript_evaluate_input_schema(self._source_code) + return ({}, None) def _build_python_harness(self, mapped_inputs: dict[str, Any]) -> str: """Wrap source_code in a Python script that calls evaluate(**inputs).""" @@ -2428,9 +2587,16 @@ async def evaluate( start_time = datetime.now(timezone.utc) + input_schema, inference_error = self._infer_input_schema() + if inference_error is not None: + return [ + self._make_error_result(name, inference_error, start_time) + for _ in (output_configs or [None]) # type: ignore[list-item] + ] + try: mapped_inputs = apply_input_mapping( - input_schema={}, + input_schema=input_schema, input_mapping=input_mapping, context=context, ) diff --git a/src/phoenix/server/api/input_types/EvaluatorPreviewInput.py b/src/phoenix/server/api/input_types/EvaluatorPreviewInput.py index 8f603661693..41504cd1893 100644 --- a/src/phoenix/server/api/input_types/EvaluatorPreviewInput.py +++ b/src/phoenix/server/api/input_types/EvaluatorPreviewInput.py @@ -11,6 +11,7 @@ from phoenix.server.api.input_types.GenerativeCredentialInput import GenerativeCredentialInput from phoenix.server.api.input_types.PlaygroundEvaluatorInput import EvaluatorInputMappingInput from phoenix.server.api.input_types.PromptVersionInput import ChatPromptVersionInput +from phoenix.server.api.types.SandboxConfig import Language @strawberry.input @@ -23,16 +24,29 @@ class InlineLLMEvaluatorInput: description: Optional[str] = None +@strawberry.input +class InlineCodeEvaluatorInput: + """Defines an inline code evaluator without requiring persistence.""" + + name: str + language: Language + source_code: str + output_configs: list[AnnotationConfigInput] + sandbox_config_id: Optional[GlobalID] = None + description: Optional[str] = None + + @strawberry.input(one_of=True) class EvaluatorPreviewInput: """ Input for previewing an evaluator. Either provide an existing evaluator ID - or an inline LLM evaluator definition. + or an inline evaluator definition. 
""" built_in_evaluator_id: Optional[GlobalID] = UNSET inline_llm_evaluator: Optional[InlineLLMEvaluatorInput] = UNSET code_evaluator_id: Optional[GlobalID] = UNSET + inline_code_evaluator: Optional[InlineCodeEvaluatorInput] = UNSET @strawberry.input diff --git a/src/phoenix/server/api/mutations/chat_mutations.py b/src/phoenix/server/api/mutations/chat_mutations.py index eeceb6ba0ad..11a71a2aa67 100644 --- a/src/phoenix/server/api/mutations/chat_mutations.py +++ b/src/phoenix/server/api/mutations/chat_mutations.py @@ -35,7 +35,8 @@ ) from phoenix.server.api.types.Evaluator import BuiltInEvaluator, CodeEvaluator from phoenix.server.api.types.ExperimentRunAnnotation import ExperimentRunAnnotation -from phoenix.server.api.types.node import from_global_id +from phoenix.server.api.types.node import from_global_id, from_global_id_with_expected_type +from phoenix.server.api.types.SandboxConfig import SandboxConfig from phoenix.server.api.types.Trace import Trace logger = logging.getLogger(__name__) @@ -91,6 +92,70 @@ def _to_evaluation_result( ) +async def _resolve_inline_code_evaluator_backend( + *, + info: Info[Context, None], + sandbox_config_id: Optional[strawberry.relay.GlobalID], + language: str, +) -> tuple[Any, Optional[int]]: + from phoenix.server.sandbox import get_or_create_backend + + if sandbox_config_id is None: + raise BadRequest( + f"No sandbox configuration selected for language '{language}'. " + "Choose a sandbox configuration before testing this evaluator." + ) + + sandbox_config_db_id = from_global_id_with_expected_type( + sandbox_config_id, SandboxConfig.__name__ + ) + + async with info.context.db() as session: + sandbox_cfg = await session.get(models.SandboxConfig, sandbox_config_db_id) + if sandbox_cfg is None: + raise BadRequest(f"Sandbox configuration with id {sandbox_config_id} was not found") + if not sandbox_cfg.enabled: + raise BadRequest( + ( + f"Sandbox configuration '{sandbox_cfg.name}' is disabled. Enable it before " + "testing this evaluator." + ) + ) + + sandbox_timeout = sandbox_cfg.timeout + provider = await session.get(models.SandboxProvider, sandbox_cfg.sandbox_provider_id) + if provider is None: + raise BadRequest( + f"Sandbox provider for configuration '{sandbox_cfg.name}' was not found" + ) + if not provider.enabled: + raise BadRequest( + ( + f"Sandbox provider '{provider.backend_type}' is disabled. Enable it before " + "testing this evaluator." + ) + ) + + provider_language_row = await session.get(models.Language, provider.language_id) + if provider_language_row is not None and provider_language_row.name != language: + raise BadRequest("Sandbox provider language does not match code evaluator language") + + merged_config = { + **provider.config, + **sandbox_cfg.config, + } + backend_type = provider.backend_type + sandbox_backend = get_or_create_backend(backend_type, config=merged_config) + + if sandbox_backend is None: + raise BadRequest( + f"Sandbox backend '{backend_type}' is unavailable for language '{language}'. " + "Ensure the backend is installed and configured." 
+ ) + + return sandbox_backend, sandbox_timeout + + @strawberry.type class ChatCompletionMutationMixin: @strawberry.mutation(permission_classes=[IsNotReadOnly, IsNotViewer, IsLocked]) # type: ignore @@ -278,7 +343,49 @@ async def evaluator_previews( for eval_result in eval_results: all_results.append(_to_evaluation_result(eval_result, eval_result["name"])) + elif inline_code_evaluator := evaluator_input.inline_code_evaluator: + from phoenix.server.api.evaluators import CodeEvaluatorRunner + + language = inline_code_evaluator.language.value + evaluator_name = inline_code_evaluator.name + evaluator_description = inline_code_evaluator.description + source_code = inline_code_evaluator.source_code + + # Convert output configs + output_configs = [ + c + for c in _convert_output_config_inputs_to_pydantic( + inline_code_evaluator.output_configs + ) + if isinstance(c, (CategoricalOutputConfig, ContinuousOutputConfig)) + ] + + sandbox_backend, sandbox_timeout = await _resolve_inline_code_evaluator_backend( + info=info, + sandbox_config_id=inline_code_evaluator.sandbox_config_id, + language=language, + ) + + runner = CodeEvaluatorRunner( + name=evaluator_name, + description=evaluator_description, + source_code=source_code, + stored_output_configs=output_configs, + sandbox_backend=sandbox_backend, + language=language, + timeout=sandbox_timeout, + ) + eval_results = await runner.evaluate( + context=context, + input_mapping=input_mapping.to_orm(), + name=evaluator_name, + output_configs=output_configs, + session_key="", + ) + for eval_result in eval_results: + all_results.append(_to_evaluation_result(eval_result, eval_result["name"])) + else: - raise BadRequest("Either evaluator_id or inline_llm_evaluator must be provided") + raise BadRequest("Either evaluator_id or inline evaluator must be provided") return EvaluatorPreviewsPayload(results=all_results) diff --git a/src/phoenix/server/api/mutations/evaluator_mutations.py b/src/phoenix/server/api/mutations/evaluator_mutations.py index dd630103536..ddaaa86436e 100644 --- a/src/phoenix/server/api/mutations/evaluator_mutations.py +++ b/src/phoenix/server/api/mutations/evaluator_mutations.py @@ -47,7 +47,7 @@ from phoenix.server.api.types.Identifier import Identifier from phoenix.server.api.types.node import from_global_id, from_global_id_with_expected_type from phoenix.server.api.types.PromptVersion import PromptVersion -from phoenix.server.api.types.SandboxConfig import Language +from phoenix.server.api.types.SandboxConfig import Language, SandboxConfig from phoenix.server.bearer_auth import PhoenixUser @@ -315,7 +315,7 @@ class CreateCodeEvaluatorInput: source_code: str language: Language description: Optional[str] = None - sandbox_config_id: Optional[int] = None + sandbox_config_id: Optional[GlobalID] = None output_configs: Optional[list[AnnotationConfigInput]] = None input_mapping: Optional[EvaluatorInputMappingInput] = None @@ -327,7 +327,7 @@ class UpdateCodeEvaluatorInput: source_code: Optional[str] = UNSET language: Optional[Language] = UNSET description: Optional[str] = UNSET - sandbox_config_id: Optional[int] = UNSET + sandbox_config_id: Optional[GlobalID] = UNSET output_configs: Optional[list[AnnotationConfigInput]] = UNSET input_mapping: Optional[EvaluatorInputMappingInput] = UNSET @@ -1184,9 +1184,13 @@ async def create_code_evaluator( if language_id is None: raise BadRequest(f"Unknown language: {input.language!r}") + sandbox_config_id = None if input.sandbox_config_id is not None: + sandbox_config_id = from_global_id_with_expected_type( + 
input.sandbox_config_id, SandboxConfig.__name__ + ) await _validate_language_matches_sandbox( - language_id, input.sandbox_config_id, session + language_id, sandbox_config_id, session ) row = models.CodeEvaluator( @@ -1194,7 +1198,7 @@ async def create_code_evaluator( description=input.description, source_code=input.source_code, language_id=language_id, - sandbox_config_id=input.sandbox_config_id, + sandbox_config_id=sandbox_config_id, output_configs=output_configs, input_mapping=input_mapping_orm, user_id=user_id, @@ -1248,7 +1252,13 @@ async def update_code_evaluator( row.description = input.description if input.sandbox_config_id is not UNSET: - row.sandbox_config_id = input.sandbox_config_id + row.sandbox_config_id = ( + None + if input.sandbox_config_id is None + else from_global_id_with_expected_type( + input.sandbox_config_id, SandboxConfig.__name__ + ) + ) if input.output_configs is not UNSET and input.output_configs is not None: row.output_configs = list( diff --git a/tests/unit/server/api/mutations/test_code_evaluator_sandbox_mutations.py b/tests/unit/server/api/mutations/test_code_evaluator_sandbox_mutations.py index 370223752e4..c5f57e21c6f 100644 --- a/tests/unit/server/api/mutations/test_code_evaluator_sandbox_mutations.py +++ b/tests/unit/server/api/mutations/test_code_evaluator_sandbox_mutations.py @@ -425,33 +425,63 @@ async def test_update_provider_not_found_returns_error( } """ +_CREATE_CODE_EVALUATOR = """ +mutation CreateCodeEvaluator($input: CreateCodeEvaluatorInput!) { + createCodeEvaluator(input: $input) { + evaluator { + id + ... on CodeEvaluator { + sandboxConfig { + id + } + } + } + } +} +""" + +_UPDATE_CODE_EVALUATOR = """ +mutation UpdateCodeEvaluator($input: UpdateCodeEvaluatorInput!) { + updateCodeEvaluator(input: $input) { + evaluator { + id + ... 
on CodeEvaluator { + sandboxConfig { + id + } + } + } + } +} +""" + + +async def _create_code_evaluator_with_config( + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, +) -> int: + """Insert a CodeEvaluator row linked to the given sandbox config.""" + async with db() as session: + code_eval = models.CodeEvaluator( + name=Identifier(root="test-disabled-guard-eval"), + description=None, + metadata_={}, + source_code="def evaluate(input): return {'score': 1.0}", + sandbox_config_id=sandbox_config.id, + ) + session.add(code_eval) + await session.flush() + return code_eval.id -class TestDisabledProviderAndConfigGuards: - async def _create_code_evaluator_with_config( - self, - db: DbSessionFactory, - sandbox_config: models.SandboxConfig, - ) -> int: - """Insert a CodeEvaluator row (joined-table inheritance) linked to the given sandbox config.""" - async with db() as session: - code_eval = models.CodeEvaluator( - name=Identifier(root="test-disabled-guard-eval"), - description=None, - metadata_={}, - source_code="def evaluate(input): return {'score': 1.0}", - sandbox_config_id=sandbox_config.id, - ) - session.add(code_eval) - await session.flush() - return code_eval.id +class TestDisabledProviderAndConfigGuards: async def test_disabled_provider_blocks_execution( self, gql_client: AsyncGraphQLClient, db: DbSessionFactory, sandbox_config: models.SandboxConfig, ) -> None: - evaluator_db_id = await self._create_code_evaluator_with_config(db, sandbox_config) + evaluator_db_id = await _create_code_evaluator_with_config(db, sandbox_config) evaluator_gid = str(GlobalID("CodeEvaluator", str(evaluator_db_id))) # Disable the provider via the updateSandboxProvider mutation @@ -486,13 +516,88 @@ async def test_disabled_provider_blocks_execution( ) assert result.errors + +class TestCodeEvaluatorSandboxMutationIds: + async def test_create_code_evaluator_accepts_sandbox_global_id( + self, + gql_client: AsyncGraphQLClient, + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, + ) -> None: + result = await gql_client.execute( + _CREATE_CODE_EVALUATOR, + variables={ + "input": { + "name": "test_code_evaluator", + "description": "uses relay id", + "language": "PYTHON", + "sourceCode": "def evaluate(output):\n return {'score': 1.0}", + "sandboxConfigId": _config_global_id(sandbox_config.id), + "outputConfigs": [ + { + "continuous": { + "name": "score", + "optimizationDirection": "NONE", + "lowerBound": 0, + "upperBound": 1, + } + } + ], + } + }, + ) + assert result.data and not result.errors + evaluator = result.data["createCodeEvaluator"]["evaluator"] + assert evaluator["sandboxConfig"]["id"] == _config_global_id(sandbox_config.id) + + evaluator_id = GlobalID.from_id(evaluator["id"]) + async with db() as session: + row = await session.get(models.CodeEvaluator, int(evaluator_id.node_id)) + assert row is not None + assert row.sandbox_config_id == sandbox_config.id + + async def test_update_code_evaluator_accepts_sandbox_global_id( + self, + gql_client: AsyncGraphQLClient, + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, + ) -> None: + async with db() as session: + code_eval = models.CodeEvaluator( + name=Identifier(root="test_update_code_evaluator"), + description=None, + metadata_={}, + source_code="def evaluate(output): return {'score': 0.0}", + ) + session.add(code_eval) + await session.flush() + evaluator_gid = str(GlobalID("CodeEvaluator", str(code_eval.id))) + + result = await gql_client.execute( + _UPDATE_CODE_EVALUATOR, + variables={ + "input": { + "id": evaluator_gid, + 
"sandboxConfigId": _config_global_id(sandbox_config.id), + } + }, + ) + assert result.data and not result.errors + evaluator = result.data["updateCodeEvaluator"]["evaluator"] + assert evaluator["sandboxConfig"]["id"] == _config_global_id(sandbox_config.id) + + async with db() as session: + row = await session.get(models.CodeEvaluator, code_eval.id) + assert row is not None + assert row.sandbox_config_id == sandbox_config.id + async def test_disabled_config_blocks_execution( self, gql_client: AsyncGraphQLClient, db: DbSessionFactory, sandbox_config: models.SandboxConfig, ) -> None: - evaluator_db_id = await self._create_code_evaluator_with_config(db, sandbox_config) + evaluator_db_id = await _create_code_evaluator_with_config(db, sandbox_config) evaluator_gid = str(GlobalID("CodeEvaluator", str(evaluator_db_id))) # Disable the sandbox config via the mutation diff --git a/tests/unit/server/api/mutations/test_evaluator_preview_mutation.py b/tests/unit/server/api/mutations/test_evaluator_preview_mutation.py index adc0b4296b4..c67a0c5bf2f 100644 --- a/tests/unit/server/api/mutations/test_evaluator_preview_mutation.py +++ b/tests/unit/server/api/mutations/test_evaluator_preview_mutation.py @@ -1,10 +1,12 @@ from typing import Any +from unittest.mock import AsyncMock, patch import pytest from sqlalchemy import select from strawberry.relay.types import GlobalID from phoenix.db import models +from phoenix.server.sandbox.types import ExecutionResult from phoenix.server.types import DbSessionFactory from tests.unit.graphql import AsyncGraphQLClient @@ -150,3 +152,167 @@ async def test_preview_requires_evaluator_or_inline( ) assert result.errors is not None + + +class TestInlineCodeEvaluatorPreviewMutation: + async def _preview_inline_code_evaluator( + self, + gql_client: AsyncGraphQLClient, + *, + sandbox_config_id: str | None, + language: str = "PYTHON", + source_code: str = "def evaluate(output):\n return 1.0", + ) -> Any: + return await gql_client.execute( + TestEvaluatorPreviewMutation._MUTATION, + { + "input": { + "previews": [ + { + "evaluator": { + "inlineCodeEvaluator": { + "name": "inline_code_eval", + "description": "preview", + "language": language, + "sourceCode": source_code, + "sandboxConfigId": sandbox_config_id, + "outputConfigs": [ + { + "continuous": { + "name": "score", + "optimizationDirection": "NONE", + "lowerBound": 0, + "upperBound": 1, + } + } + ], + } + }, + "context": {"output": {"answer": "4"}}, + "inputMapping": {}, + } + ] + } + }, + ) + + async def test_requires_sandbox_config_selection( + self, + gql_client: AsyncGraphQLClient, + ) -> None: + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=None, + ) + + assert result.errors is not None + assert "No sandbox configuration selected" in result.errors[0].message + + async def test_rejects_wrong_global_id_type( + self, + gql_client: AsyncGraphQLClient, + sandbox_config: models.SandboxConfig, + ) -> None: + wrong_type_id = str(GlobalID("SandboxProvider", str(sandbox_config.id))) + + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=wrong_type_id, + ) + + assert result.errors is not None + assert "SandboxConfig" in result.errors[0].message + + async def test_rejects_missing_sandbox_config( + self, + gql_client: AsyncGraphQLClient, + ) -> None: + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", "999999")), + ) + + assert result.errors is not None + assert "was not found" in 
result.errors[0].message + + async def test_rejects_disabled_sandbox_config( + self, + gql_client: AsyncGraphQLClient, + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, + ) -> None: + async with db() as session: + row = await session.get(models.SandboxConfig, sandbox_config.id) + assert row is not None + row.enabled = False + await session.commit() + + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", str(sandbox_config.id))), + ) + + assert result.errors is not None + assert "is disabled" in result.errors[0].message + + async def test_rejects_disabled_sandbox_provider( + self, + gql_client: AsyncGraphQLClient, + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, + ) -> None: + async with db() as session: + provider = await session.get(models.SandboxProvider, sandbox_config.sandbox_provider_id) + assert provider is not None + provider.enabled = False + await session.commit() + + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", str(sandbox_config.id))), + ) + + assert result.errors is not None + assert "Sandbox provider" in result.errors[0].message + assert "is disabled" in result.errors[0].message + + async def test_rejects_language_mismatch( + self, + gql_client: AsyncGraphQLClient, + sandbox_config: models.SandboxConfig, + ) -> None: + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", str(sandbox_config.id))), + language="TYPESCRIPT", + source_code="function evaluate({ output }: EvaluatorParams) { return 1; }", + ) + + assert result.errors is not None + assert "language does not match" in result.errors[0].message + + async def test_returns_preview_result_for_valid_inline_code_evaluator( + self, + gql_client: AsyncGraphQLClient, + sandbox_config: models.SandboxConfig, + ) -> None: + backend = AsyncMock() + backend.execute = AsyncMock( + return_value=ExecutionResult(stdout="1.0", stderr="", error=None) + ) + + with patch( + "phoenix.server.sandbox.get_or_create_backend", + return_value=backend, + ): + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", str(sandbox_config.id))), + ) + + assert result.data and not result.errors + results = result.data["evaluatorPreviews"]["results"] + assert len(results) == 1 + assert results[0]["evaluatorName"] == "inline_code_eval" + assert results[0]["error"] is None + assert results[0]["annotation"]["score"] == 1.0 diff --git a/tests/unit/server/api/test_code_evaluator_runner.py b/tests/unit/server/api/test_code_evaluator_runner.py index 0e923e521c5..05b36a804a8 100644 --- a/tests/unit/server/api/test_code_evaluator_runner.py +++ b/tests/unit/server/api/test_code_evaluator_runner.py @@ -111,6 +111,90 @@ def test_typescript_harness_contains_json_stringify(self) -> None: assert "JSON.stringify" in harness +class TestInputSchemaInference: + def test_python_input_schema_infers_top_level_parameters(self) -> None: + runner, _ = _make_runner( + source_code=( + "def evaluate(output, reference=None, input=None, *, metadata=None):\n" + " return 1\n" + ) + ) + + assert runner.input_schema == { + "type": "object", + "properties": { + "output": {}, + "reference": {}, + "input": {}, + "metadata": {}, + }, + "required": ["output"], + } + + def test_typescript_input_schema_infers_destructured_parameters(self) -> None: + runner, _ = _make_runner( + source_code=( + "function evaluate({ 
output, reference, input, metadata }: EvaluatorParams) " + "{ return 1; }" + ), + language="TYPESCRIPT", + ) + + assert runner.input_schema == { + "type": "object", + "properties": { + "output": {}, + "reference": {}, + "input": {}, + "metadata": {}, + }, + "required": [], + } + + def test_python_input_schema_returns_error_when_evaluate_is_missing(self) -> None: + runner, _ = _make_runner(source_code="def not_evaluate(output):\n return 1\n") + + schema, error = runner._infer_input_schema() + assert schema == {} + assert error is not None + assert "no top-level `evaluate(...)` function was found" in error + + def test_typescript_input_schema_returns_error_for_non_destructured_signature(self) -> None: + runner, _ = _make_runner( + source_code="function evaluate(output: EvaluatorParams) { return 1; }", + language="TYPESCRIPT", + ) + + schema, error = runner._infer_input_schema() + assert schema == {} + assert error is not None + assert "Use a destructured object parameter" in error + + def test_python_input_schema_returns_error_for_unsupported_parameter_names(self) -> None: + runner, _ = _make_runner( + source_code="def evaluate(outputs, reference=None):\n return 1\n" + ) + + schema, error = runner._infer_input_schema() + assert schema == {} + assert error is not None + assert "unsupported parameter names: `outputs`" in error + + def test_typescript_input_schema_returns_error_for_unsupported_parameter_names(self) -> None: + runner, _ = _make_runner( + source_code=( + "function evaluate({ outputs, reference, input, metadata }: EvaluatorParams) " + "{ return 1; }" + ), + language="TYPESCRIPT", + ) + + schema, error = runner._infer_input_schema() + assert schema == {} + assert error is not None + assert "unsupported parameter names: `outputs`" in error + + class TestEvaluateSuccessPath: async def test_returns_label_from_stdout(self) -> None: runner, _ = _make_runner(backend_stdout='"pass"') @@ -191,8 +275,99 @@ async def test_none_timeout_forwarded_to_backend_execute(self) -> None: call_kwargs = backend.execute.call_args assert call_kwargs.kwargs.get("timeout") is None + async def test_python_evaluate_auto_passes_context_keys_matching_signature(self) -> None: + runner, backend = _make_runner(source_code="def evaluate(output, reference=None): return 1") + + await runner.evaluate( + context={"output": {"answer": "a"}, "reference": {"answer": "a"}}, + input_mapping=_EMPTY_MAPPING, + name="test", + output_configs=[_continuous_config()], + ) + + call_args = backend.execute.call_args + code_arg = call_args.args[0] if call_args.args else call_args.kwargs.get("code", "") + assert '"output": {"answer": "a"}' in code_arg + assert '"reference": {"answer": "a"}' in code_arg + + async def test_typescript_evaluate_auto_passes_context_keys_matching_signature(self) -> None: + runner, backend = _make_runner( + source_code=("function evaluate({ output, reference }: EvaluatorParams) { return 1; }"), + language="TYPESCRIPT", + backend_stdout="1", + ) + + await runner.evaluate( + context={"output": {"answer": "a"}, "reference": {"answer": "a"}}, + input_mapping=_EMPTY_MAPPING, + name="test", + output_configs=[_continuous_config()], + ) + + call_args = backend.execute.call_args + code_arg = call_args.args[0] if call_args.args else call_args.kwargs.get("code", "") + assert '"output": {"answer": "a"}' in code_arg + assert '"reference": {"answer": "a"}' in code_arg + class TestEvaluateErrorPaths: + async def test_inference_failure_returns_human_readable_python_error(self) -> None: + runner, backend = 
_make_runner(source_code="def not_evaluate(output): return 1")
+
+        results = await runner.evaluate(
+            context={"output": {"answer": "a"}},
+            input_mapping=_EMPTY_MAPPING,
+            name="test_py",
+            output_configs=[_categorical_config()],
+        )
+
+        assert len(results) == 1
+        assert results[0]["error"] is not None
+        assert "no top-level `evaluate(...)` function was found" in results[0]["error"]
+        backend.execute.assert_not_called()
+
+    async def test_inference_failure_returns_human_readable_typescript_error(self) -> None:
+        runner, backend = _make_runner(
+            source_code="function evaluate(output: EvaluatorParams) { return 1; }",
+            language="TYPESCRIPT",
+        )
+
+        results = await runner.evaluate(
+            context={"output": {"answer": "a"}},
+            input_mapping=_EMPTY_MAPPING,
+            name="test_ts",
+            output_configs=[_categorical_config()],
+        )
+
+        assert len(results) == 1
+        assert results[0]["error"] is not None
+        assert "Use a destructured object parameter" in results[0]["error"]
+        backend.execute.assert_not_called()
+
+    async def test_inference_failure_returns_human_readable_error_for_renamed_typescript_param(
+        self,
+    ) -> None:
+        runner, backend = _make_runner(
+            # intentionally mismatched fixture: the signature destructures
+            # `outputs`, but the body still reads `output`, mimicking a user
+            # who renamed the parameter without updating the body
+            source_code=(
+                "function evaluate({ outputs, reference, input, metadata }: EvaluatorParams) { "
+                "const candidate = typeof output?.answer === 'string' ? output.answer : ''; "
+                "return 1; }"
+            ),
+            language="TYPESCRIPT",
+        )
+
+        results = await runner.evaluate(
+            context={"output": {"answer": "a"}},
+            input_mapping=_EMPTY_MAPPING,
+            name="test_ts",
+            output_configs=[_categorical_config()],
+        )
+
+        assert len(results) == 1
+        assert results[0]["error"] is not None
+        assert "unsupported parameter names: `outputs`" in results[0]["error"]
+        backend.execute.assert_not_called()
+
     async def test_input_mapping_failure_returns_error_result(self) -> None:
         runner, _ = _make_runner()
         bad_mapping = InputMapping(
@@ -255,12 +430,99 @@ async def test_language_stored_normalized_to_uppercase(self) -> None:
     async def test_typescript_language_uses_typescript_harness(self) -> None:
         """Runner selects TypeScript harness when language is TYPESCRIPT."""
         runner, backend = _make_runner(
-            source_code="function evaluate(x) { return 1; }",
+            source_code=(
+                "function evaluate({ output }: EvaluatorParams) { return output ? 1 : 0; }"
+            ),
             language="TYPESCRIPT",
             backend_stdout="1",
        )
         await runner.evaluate(
-            context={},
+            context={"output": {"answer": "a"}},
             input_mapping=_EMPTY_MAPPING,
             name="test",
             output_configs=[_continuous_config()],
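+            # the destructured `{ output }` signature and matching context key
+            # above are now required: per TestInputSchemaInference, a
+            # non-destructured TypeScript signature fails input-schema
+            # inference before the harness is ever selected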