diff --git a/app/schema.graphql b/app/schema.graphql index e681fb97241..3d03a569b4f 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -594,7 +594,7 @@ input CreateCodeEvaluatorInput { sourceCode: String! language: Language! description: String = null - sandboxConfigId: Int = null + sandboxConfigId: ID = null outputConfigs: [AnnotationConfigInput!] = null inputMapping: EvaluatorInputMappingInput = null } @@ -1440,6 +1440,7 @@ input EvaluatorPreviewInput @oneOf { builtInEvaluatorId: ID inlineLlmEvaluator: InlineLLMEvaluatorInput codeEvaluatorId: ID + inlineCodeEvaluator: InlineCodeEvaluatorInput } input EvaluatorPreviewItemInput { @@ -2045,6 +2046,15 @@ input GoogleGenAIHttpOptionsInput { scalar Identifier +input InlineCodeEvaluatorInput { + name: String! + language: Language! + sourceCode: String! + outputConfigs: [AnnotationConfigInput!]! + sandboxConfigId: ID = null + description: String = null +} + input InlineLLMEvaluatorInput { name: String! promptVersion: ChatPromptVersionInput! @@ -3867,7 +3877,7 @@ input UpdateCodeEvaluatorInput { sourceCode: String language: Language description: String - sandboxConfigId: Int + sandboxConfigId: ID outputConfigs: [AnnotationConfigInput!] inputMapping: EvaluatorInputMappingInput } diff --git a/app/src/components/dataset/CreateCodeDatasetEvaluatorSlideover.tsx b/app/src/components/dataset/CreateCodeDatasetEvaluatorSlideover.tsx index 4080094ab73..6e456e69f7d 100644 --- a/app/src/components/dataset/CreateCodeDatasetEvaluatorSlideover.tsx +++ b/app/src/components/dataset/CreateCodeDatasetEvaluatorSlideover.tsx @@ -1,4 +1,4 @@ -import { Suspense, useMemo, useState } from "react"; +import { Suspense, useCallback, useEffect, useRef, useState } from "react"; import type { ModalOverlayProps } from "react-aria-components"; import { graphql, useLazyLoadQuery, useMutation } from "react-relay"; import invariant from "tiny-invariant"; @@ -9,11 +9,11 @@ import { Modal, ModalOverlay } from "@phoenix/components/core/overlay/Modal"; import type { CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation } from "@phoenix/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql"; import type { CreateCodeDatasetEvaluatorSlideover_createDatasetCodeEvaluatorMutation } from "@phoenix/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createDatasetCodeEvaluatorMutation.graphql"; import type { CreateCodeDatasetEvaluatorSlideoverQuery } from "@phoenix/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql"; +import { mapSandboxConfigOptions } from "@phoenix/components/evaluators/CodeEvaluatorLanguageSandboxFields"; import { DEFAULT_CODE_EVALUATOR_SOURCE } from "@phoenix/components/evaluators/codeEvaluatorUtils"; import { - EditCodeEvaluatorDialogContent, - mapSandboxConfigOptions, createDefaultContinuousOutputConfig, + EditCodeEvaluatorDialogContent, } from "@phoenix/components/evaluators/EditCodeEvaluatorDialogContent"; import { buildOutputConfigsInput } from "@phoenix/components/evaluators/utils"; import { EvaluatorStoreProvider } from "@phoenix/contexts/EvaluatorContext"; @@ -29,20 +29,49 @@ export const CreateCodeDatasetEvaluatorSlideover = ({ datasetId, updateConnectionIds, onEvaluatorCreated, + onOpenChange, + isOpen, ...props }: { datasetId: string; updateConnectionIds?: string[]; onEvaluatorCreated?: (datasetEvaluatorId: string) => void; } & ModalOverlayProps) => { + const isDirtyRef = useRef(false); + + // Reset dirty state when slideover opens + useEffect(() 
=> { + if (isOpen) { + isDirtyRef.current = false; + } + }, [isOpen]); + + const handleOpenChange = useCallback( + (nextIsOpen: boolean) => { + if (!nextIsOpen && isDirtyRef.current) { + const confirmed = window.confirm( + "You have unsaved changes. Are you sure you want to close?" + ); + if (!confirmed) return; + } + onOpenChange?.(nextIsOpen); + }, + [onOpenChange] + ); + + const handleDirtyChange = useCallback((isDirty: boolean) => { + isDirtyRef.current = isDirty; + }, []); + return ( - + {({ close }) => ( }> void; + onDirtyChange?: (isDirty: boolean) => void; datasetId: string; updateConnectionIds?: string[]; onEvaluatorCreated?: (datasetEvaluatorId: string) => void; @@ -74,19 +105,24 @@ const CreateCodeEvaluatorDialog = ({ sandboxProviders { backendType language + enabled configs { id name description } } + sandboxBackends { + backendType + status + } } `, {} ); - const sandboxConfigs = useMemo( - () => mapSandboxConfigOptions(data.sandboxProviders), - [data.sandboxProviders] + const sandboxConfigs = mapSandboxConfigOptions( + data.sandboxProviders, + data.sandboxBackends ); const [createCodeEvaluator, isCreatingCodeEvaluator] = useMutation(graphql` @@ -119,40 +155,36 @@ const CreateCodeEvaluatorDialog = ({ } } `); - const initialState = useMemo( - () => - ({ - evaluator: { - globalName: "", - name: "", - description: "", - inputMapping: { - literalMapping: {}, - pathMapping: {}, - }, - kind: "CODE", - isBuiltin: false, - includeExplanation: false, - }, - outputConfigs: [createDefaultContinuousOutputConfig("")], - dataset: { - readonly: true, - id: datasetId, - selectedExampleId: null, - selectedSplitIds: [], - }, - evaluatorMappingSource: EVALUATOR_MAPPING_SOURCE_DEFAULT, - showPromptPreview: false, - }) satisfies EvaluatorStoreProps, - [datasetId] - ); + const initialState: EvaluatorStoreProps = { + evaluator: { + globalName: "", + name: "", + description: "", + inputMapping: { + literalMapping: {}, + pathMapping: {}, + }, + kind: "CODE", + isBuiltin: false, + includeExplanation: false, + }, + outputConfigs: [createDefaultContinuousOutputConfig("")], + dataset: { + readonly: true, + id: datasetId, + selectedExampleId: null, + selectedSplitIds: [], + }, + evaluatorMappingSource: EVALUATOR_MAPPING_SOURCE_DEFAULT, + showPromptPreview: false, + }; const onSubmit = ( store: EvaluatorStoreInstance, payload: { language: "PYTHON" | "TYPESCRIPT"; sourceCode: string; - sandboxConfigId: number | null; + sandboxConfigId?: string | null; } ) => { setError(undefined); @@ -198,6 +230,7 @@ const CreateCodeEvaluatorDialog = ({ title: "Evaluator created", message: "The code evaluator has been added to the dataset.", }); + onDirtyChange?.(false); onClose(); }, onError: (mutationError) => { @@ -223,6 +256,8 @@ const CreateCodeEvaluatorDialog = ({ {({ store }) => ( onSubmit(store, payload)} + onCancel={onClose} + onDirtyChange={onDirtyChange} isSubmitting={ isCreatingCodeEvaluator || isCreatingDatasetCodeEvaluator } diff --git a/app/src/components/dataset/EditCodeDatasetEvaluatorSlideover.tsx b/app/src/components/dataset/EditCodeDatasetEvaluatorSlideover.tsx index 0f1da5fb656..ac2b299ce31 100644 --- a/app/src/components/dataset/EditCodeDatasetEvaluatorSlideover.tsx +++ b/app/src/components/dataset/EditCodeDatasetEvaluatorSlideover.tsx @@ -1,4 +1,4 @@ -import { Suspense, useMemo, useState } from "react"; +import { Suspense, useCallback, useEffect, useRef, useState } from "react"; import type { ModalOverlayProps } from "react-aria-components"; import { graphql, useLazyLoadQuery, useMutation } from 
"react-relay"; import invariant from "tiny-invariant"; @@ -13,10 +13,10 @@ import { import type { EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery } from "@phoenix/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql"; import type { EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation } from "@phoenix/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql"; import type { EditCodeDatasetEvaluatorSlideover_updateDatasetCodeEvaluatorMutation } from "@phoenix/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateDatasetCodeEvaluatorMutation.graphql"; +import { mapSandboxConfigOptions } from "@phoenix/components/evaluators/CodeEvaluatorLanguageSandboxFields"; import { - EditCodeEvaluatorDialogContent, - mapSandboxConfigOptions, createDefaultContinuousOutputConfig, + EditCodeEvaluatorDialogContent, } from "@phoenix/components/evaluators/EditCodeEvaluatorDialogContent"; import { buildOutputConfigsInput } from "@phoenix/components/evaluators/utils"; import { EvaluatorStoreProvider } from "@phoenix/contexts/EvaluatorContext"; @@ -45,10 +45,38 @@ export function EditCodeDatasetEvaluatorSlideover({ datasetId, updateConnectionIds, onUpdate, + onOpenChange, + isOpen, ...props }: EditCodeDatasetEvaluatorSlideoverProps) { + const isDirtyRef = useRef(false); + + // Reset dirty state when slideover opens + useEffect(() => { + if (isOpen) { + isDirtyRef.current = false; + } + }, [isOpen]); + + const handleOpenChange = useCallback( + (nextIsOpen: boolean) => { + if (!nextIsOpen && isDirtyRef.current) { + const confirmed = window.confirm( + "You have unsaved changes. Are you sure you want to close?" + ); + if (!confirmed) return; + } + onOpenChange?.(nextIsOpen); + }, + [onOpenChange] + ); + + const handleDirtyChange = useCallback((isDirty: boolean) => { + isDirtyRef.current = isDirty; + }, []); + return ( - + {({ close }) => ( @@ -63,6 +91,7 @@ export function EditCodeDatasetEvaluatorSlideover({ void; + onDirtyChange?: (isDirty: boolean) => void; datasetId: string; updateConnectionIds?: string[]; onUpdate?: () => void; @@ -160,18 +191,23 @@ function EditCodeDatasetEvaluatorSlideoverContent({ sandboxProviders { backendType language + enabled configs { id name description } } + sandboxBackends { + backendType + status + } } `, { datasetEvaluatorId, datasetId }, { fetchPolicy: "network-only" } ); - const { dataset, sandboxProviders } = data; + const { dataset, sandboxProviders, sandboxBackends } = data; invariant(dataset, "dataset is required"); const datasetEvaluator = dataset.datasetEvaluator; invariant(datasetEvaluator, "dataset evaluator is required"); @@ -181,17 +217,12 @@ function EditCodeDatasetEvaluatorSlideoverContent({ invariant(evaluator.sourceCode, "code evaluator source code is required"); const evaluatorLanguage = evaluator.language; const evaluatorSourceCode = evaluator.sourceCode; - const sandboxConfigs = useMemo( - () => mapSandboxConfigOptions(sandboxProviders), - [sandboxProviders] + const sandboxConfigs = mapSandboxConfigOptions( + sandboxProviders, + sandboxBackends ); const sandboxConfigGlobalId = evaluator.sandboxConfig?.id; - const initialSandboxConfigId = sandboxConfigGlobalId - ? (sandboxConfigs.find( - (config) => - String(config.id) === atob(sandboxConfigGlobalId).split(":", 2)[1] - )?.id ?? null) - : null; + const initialSandboxConfigId = sandboxConfigGlobalId ?? 
null; const [updateCodeEvaluator, isUpdatingCodeEvaluator] = useMutation(graphql` @@ -226,70 +257,47 @@ function EditCodeDatasetEvaluatorSlideoverContent({ } `); - const loadedOutputConfigs = useMemo( - () => - (datasetEvaluator.outputConfigs?.length - ? datasetEvaluator.outputConfigs - : evaluator.outputConfigs?.length - ? evaluator.outputConfigs - : [ - createDefaultContinuousOutputConfig(datasetEvaluator.name ?? ""), - ]) as Mutable< - | ContinuousEvaluatorAnnotationConfig - | ClassificationEvaluatorAnnotationConfig - >[], - [ - datasetEvaluator.name, - datasetEvaluator.outputConfigs, - evaluator.outputConfigs, - ] - ); - const initialState = useMemo( - () => - ({ - evaluator: { - id: evaluator.id, - globalName: evaluator.name ?? datasetEvaluator.name ?? "", - name: datasetEvaluator.name ?? evaluator.name ?? "", - description: - datasetEvaluator.description ?? evaluator.description ?? "", - inputMapping: datasetEvaluator.inputMapping, - kind: "CODE", - isBuiltin: false, - includeExplanation: false, - }, - datasetEvaluator: { - id: datasetEvaluatorId, - }, - outputConfigs: loadedOutputConfigs, - dataset: { - readonly: true, - id: datasetId, - selectedExampleId: null, - selectedSplitIds: [], - }, - evaluatorMappingSource: EVALUATOR_MAPPING_SOURCE_DEFAULT, - showPromptPreview: false, - }) satisfies EvaluatorStoreProps, - [ - datasetEvaluator.description, - datasetEvaluator.inputMapping, - datasetEvaluator.name, - datasetEvaluatorId, - datasetId, - evaluator.description, - evaluator.id, - evaluator.name, - loadedOutputConfigs, - ] - ); + const loadedOutputConfigs = ( + datasetEvaluator.outputConfigs?.length + ? datasetEvaluator.outputConfigs + : evaluator.outputConfigs?.length + ? evaluator.outputConfigs + : [createDefaultContinuousOutputConfig(datasetEvaluator.name ?? "")] + ) as Mutable< + | ContinuousEvaluatorAnnotationConfig + | ClassificationEvaluatorAnnotationConfig + >[]; + const initialState: EvaluatorStoreProps = { + evaluator: { + id: evaluator.id, + globalName: evaluator.name ?? datasetEvaluator.name ?? "", + name: datasetEvaluator.name ?? evaluator.name ?? "", + description: datasetEvaluator.description ?? evaluator.description ?? 
"", + inputMapping: datasetEvaluator.inputMapping, + kind: "CODE", + isBuiltin: false, + includeExplanation: false, + }, + datasetEvaluator: { + id: datasetEvaluatorId, + }, + outputConfigs: loadedOutputConfigs, + dataset: { + readonly: true, + id: datasetId, + selectedExampleId: null, + selectedSplitIds: [], + }, + evaluatorMappingSource: EVALUATOR_MAPPING_SOURCE_DEFAULT, + showPromptPreview: false, + }; const onSubmit = ( store: EvaluatorStoreInstance, payload: { language: "PYTHON" | "TYPESCRIPT"; sourceCode: string; - sandboxConfigId: number | null; + sandboxConfigId?: string | null; } ) => { setError(undefined); @@ -328,6 +336,7 @@ function EditCodeDatasetEvaluatorSlideoverContent({ }, onCompleted: () => { notifySuccess({ title: "Evaluator updated" }); + onDirtyChange?.(false); onClose(); onUpdate?.(); }, @@ -354,6 +363,8 @@ function EditCodeDatasetEvaluatorSlideoverContent({ {({ store }) => ( onSubmit(store, payload)} + onCancel={onClose} + onDirtyChange={onDirtyChange} isSubmitting={ isUpdatingCodeEvaluator || isUpdatingDatasetCodeEvaluator } diff --git a/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql.ts b/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql.ts index 56fd92ccf44..5cad19a7c55 100644 --- a/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql.ts +++ b/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideoverQuery.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<<06e620c176178145fc5a3839b02c23ca>> + * @generated SignedSource<<5e569775dd6d69ca1c8f074190c66e94>> * @lightSyntaxTransform * @nogrep */ @@ -10,8 +10,13 @@ import { ConcreteRequest } from 'relay-runtime'; export type Language = "PYTHON" | "TYPESCRIPT"; +export type SandboxBackendStatus = "AVAILABLE" | "NOT_INSTALLED" | "UNAVAILABLE"; export type CreateCodeDatasetEvaluatorSlideoverQuery$variables = Record; export type CreateCodeDatasetEvaluatorSlideoverQuery$data = { + readonly sandboxBackends: ReadonlyArray<{ + readonly backendType: string; + readonly status: SandboxBackendStatus; + }>; readonly sandboxProviders: ReadonlyArray<{ readonly backendType: string; readonly configs: ReadonlyArray<{ @@ -19,6 +24,7 @@ export type CreateCodeDatasetEvaluatorSlideoverQuery$data = { readonly id: string; readonly name: string; }>; + readonly enabled: boolean; readonly language: Language; }>; }; @@ -46,10 +52,17 @@ v2 = { "alias": null, "args": null, "kind": "ScalarField", - "name": "id", + "name": "enabled", "storageKey": null }, v3 = { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "id", + "storageKey": null +}, +v4 = { "alias": null, "args": null, "concreteType": "SandboxConfig", @@ -57,7 +70,7 @@ v3 = { "name": "configs", "plural": true, "selections": [ - (v2/*: any*/), + (v3/*: any*/), { "alias": null, "args": null, @@ -74,6 +87,25 @@ v3 = { } ], "storageKey": null +}, +v5 = { + "alias": null, + "args": null, + "concreteType": "SandboxBackendInfo", + "kind": "LinkedField", + "name": "sandboxBackends", + "plural": true, + "selections": [ + (v0/*: any*/), + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "status", + "storageKey": null + } + ], + "storageKey": null }; return { "fragment": { @@ -92,10 +124,12 @@ return { "selections": [ (v0/*: any*/), (v1/*: any*/), - (v3/*: any*/) + (v2/*: any*/), + (v4/*: any*/) ], "storageKey": null - } + }, + (v5/*: any*/) ], "type": "Query", "abstractKey": null @@ -116,24 +150,26 @@ return { 
"selections": [ (v0/*: any*/), (v1/*: any*/), - (v3/*: any*/), - (v2/*: any*/) + (v2/*: any*/), + (v4/*: any*/), + (v3/*: any*/) ], "storageKey": null - } + }, + (v5/*: any*/) ] }, "params": { - "cacheID": "a0d387f77418178c3e8ffb53ac0a8f53", + "cacheID": "a0ca8e56dc9f78382ace98a30550e113", "id": null, "metadata": {}, "name": "CreateCodeDatasetEvaluatorSlideoverQuery", "operationKind": "query", - "text": "query CreateCodeDatasetEvaluatorSlideoverQuery {\n sandboxProviders {\n backendType\n language\n configs {\n id\n name\n description\n }\n id\n }\n}\n" + "text": "query CreateCodeDatasetEvaluatorSlideoverQuery {\n sandboxProviders {\n backendType\n language\n enabled\n configs {\n id\n name\n description\n }\n id\n }\n sandboxBackends {\n backendType\n status\n }\n}\n" } }; })(); -(node as any).hash = "c0a20fe4690192a61297ef66e60b100a"; +(node as any).hash = "6dc158e2f8dcebc2e068c3d0edcb4060"; export default node; diff --git a/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql.ts b/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql.ts index 34330188e5a..69b47d9ac1d 100644 --- a/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql.ts +++ b/app/src/components/dataset/__generated__/CreateCodeDatasetEvaluatorSlideover_createCodeEvaluatorMutation.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<<9c2078a5db23ed3519210588c895fdee>> + * @generated SignedSource<> * @lightSyntaxTransform * @nogrep */ @@ -17,7 +17,7 @@ export type CreateCodeEvaluatorInput = { language: Language; name: string; outputConfigs?: ReadonlyArray | null; - sandboxConfigId?: number | null; + sandboxConfigId?: string | null; sourceCode: string; }; export type AnnotationConfigInput = { diff --git a/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql.ts b/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql.ts index f3b02af9449..b48439aa0fb 100644 --- a/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql.ts +++ b/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<> + * @generated SignedSource<<1d0a16e5296b8abba920428753786658>> * @lightSyntaxTransform * @nogrep */ @@ -12,6 +12,7 @@ import { ConcreteRequest } from 'relay-runtime'; export type EvaluatorKind = "BUILTIN" | "CODE" | "LLM"; export type Language = "PYTHON" | "TYPESCRIPT"; export type OptimizationDirection = "MAXIMIZE" | "MINIMIZE" | "NONE"; +export type SandboxBackendStatus = "AVAILABLE" | "NOT_INSTALLED" | "UNAVAILABLE"; export type EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery$variables = { datasetEvaluatorId: string; datasetId: string; @@ -60,6 +61,10 @@ export type EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery$data = { }; readonly id: string; }; + readonly sandboxBackends: ReadonlyArray<{ + readonly backendType: string; + readonly status: SandboxBackendStatus; + }>; readonly sandboxProviders: ReadonlyArray<{ readonly backendType: string; readonly configs: ReadonlyArray<{ @@ -67,6 +72,7 @@ export type EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery$data = { readonly id: string; readonly name: string; }>; + readonly enabled: boolean; readonly language: Language; }>; }; @@ -267,6 +273,13 @@ 
v16 = { "storageKey": null }, v17 = { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "enabled", + "storageKey": null +}, +v18 = { "alias": null, "args": null, "concreteType": "SandboxConfig", @@ -280,14 +293,33 @@ v17 = { ], "storageKey": null }, -v18 = { +v19 = { + "alias": null, + "args": null, + "concreteType": "SandboxBackendInfo", + "kind": "LinkedField", + "name": "sandboxBackends", + "plural": true, + "selections": [ + (v16/*: any*/), + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "status", + "storageKey": null + } + ], + "storageKey": null +}, +v20 = { "alias": null, "args": null, "kind": "ScalarField", "name": "__typename", "storageKey": null }, -v19 = { +v21 = { "alias": null, "args": null, "concreteType": null, @@ -295,7 +327,7 @@ v19 = { "name": "outputConfigs", "plural": true, "selections": [ - (v18/*: any*/), + (v20/*: any*/), (v8/*: any*/), (v9/*: any*/), { @@ -385,10 +417,12 @@ return { "selections": [ (v16/*: any*/), (v13/*: any*/), - (v17/*: any*/) + (v17/*: any*/), + (v18/*: any*/) ], "storageKey": null - } + }, + (v19/*: any*/) ], "type": "Query", "abstractKey": null @@ -407,7 +441,7 @@ return { "name": "node", "plural": false, "selections": [ - (v18/*: any*/), + (v20/*: any*/), (v2/*: any*/), { "kind": "InlineFragment", @@ -424,7 +458,7 @@ return { (v4/*: any*/), (v5/*: any*/), (v6/*: any*/), - (v19/*: any*/), + (v21/*: any*/), { "alias": null, "args": null, @@ -433,7 +467,7 @@ return { "name": "evaluator", "plural": false, "selections": [ - (v18/*: any*/), + (v20/*: any*/), (v2/*: any*/), (v11/*: any*/), { @@ -444,7 +478,7 @@ return { (v12/*: any*/), (v13/*: any*/), (v15/*: any*/), - (v19/*: any*/) + (v21/*: any*/) ], "type": "CodeEvaluator", "abstractKey": null @@ -473,23 +507,25 @@ return { (v16/*: any*/), (v13/*: any*/), (v17/*: any*/), + (v18/*: any*/), (v2/*: any*/) ], "storageKey": null - } + }, + (v19/*: any*/) ] }, "params": { - "cacheID": "2c53f4a8dc7f28e90872e9dc40018e4e", + "cacheID": "82065e93bd959f33205eb3f97e84170e", "id": null, "metadata": {}, "name": "EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery", "operationKind": "query", - "text": "query EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery(\n $datasetEvaluatorId: ID!\n $datasetId: ID!\n) {\n dataset: node(id: $datasetId) {\n __typename\n id\n ... on Dataset {\n datasetEvaluator(datasetEvaluatorId: $datasetEvaluatorId) {\n id\n name\n description\n inputMapping {\n literalMapping\n pathMapping\n }\n outputConfigs {\n __typename\n ... on CategoricalAnnotationConfig {\n name\n optimizationDirection\n values {\n label\n score\n }\n }\n ... on ContinuousAnnotationConfig {\n name\n optimizationDirection\n lowerBound\n upperBound\n }\n ... on Node {\n __isNode: __typename\n id\n }\n }\n evaluator {\n __typename\n id\n kind\n ... on CodeEvaluator {\n name\n description\n sourceCode\n language\n sandboxConfig {\n id\n }\n outputConfigs {\n __typename\n ... on CategoricalAnnotationConfig {\n name\n optimizationDirection\n values {\n label\n score\n }\n }\n ... on ContinuousAnnotationConfig {\n name\n optimizationDirection\n lowerBound\n upperBound\n }\n ... on Node {\n __isNode: __typename\n id\n }\n }\n }\n }\n }\n }\n }\n sandboxProviders {\n backendType\n language\n configs {\n id\n name\n description\n }\n id\n }\n}\n" + "text": "query EditCodeDatasetEvaluatorSlideover_datasetEvaluatorQuery(\n $datasetEvaluatorId: ID!\n $datasetId: ID!\n) {\n dataset: node(id: $datasetId) {\n __typename\n id\n ... 
on Dataset {\n datasetEvaluator(datasetEvaluatorId: $datasetEvaluatorId) {\n id\n name\n description\n inputMapping {\n literalMapping\n pathMapping\n }\n outputConfigs {\n __typename\n ... on CategoricalAnnotationConfig {\n name\n optimizationDirection\n values {\n label\n score\n }\n }\n ... on ContinuousAnnotationConfig {\n name\n optimizationDirection\n lowerBound\n upperBound\n }\n ... on Node {\n __isNode: __typename\n id\n }\n }\n evaluator {\n __typename\n id\n kind\n ... on CodeEvaluator {\n name\n description\n sourceCode\n language\n sandboxConfig {\n id\n }\n outputConfigs {\n __typename\n ... on CategoricalAnnotationConfig {\n name\n optimizationDirection\n values {\n label\n score\n }\n }\n ... on ContinuousAnnotationConfig {\n name\n optimizationDirection\n lowerBound\n upperBound\n }\n ... on Node {\n __isNode: __typename\n id\n }\n }\n }\n }\n }\n }\n }\n sandboxProviders {\n backendType\n language\n enabled\n configs {\n id\n name\n description\n }\n id\n }\n sandboxBackends {\n backendType\n status\n }\n}\n" } }; })(); -(node as any).hash = "de2b53ad3e5ff75b83df0c917ba3be10"; +(node as any).hash = "210eabeeb373d8a2c919d19fc5042c34"; export default node; diff --git a/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql.ts b/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql.ts index be5da31f314..a4257e369cc 100644 --- a/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql.ts +++ b/app/src/components/dataset/__generated__/EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<<02dd3e4c4452e113e602dc70df251404>> + * @generated SignedSource<> * @lightSyntaxTransform * @nogrep */ @@ -18,7 +18,7 @@ export type UpdateCodeEvaluatorInput = { language?: Language | null; name?: string | null; outputConfigs?: ReadonlyArray | null; - sandboxConfigId?: number | null; + sandboxConfigId?: string | null; sourceCode?: string | null; }; export type AnnotationConfigInput = { diff --git a/app/src/components/evaluators/CodeEvaluatorLanguageSandboxFields.tsx b/app/src/components/evaluators/CodeEvaluatorLanguageSandboxFields.tsx new file mode 100644 index 00000000000..93c606878e0 --- /dev/null +++ b/app/src/components/evaluators/CodeEvaluatorLanguageSandboxFields.tsx @@ -0,0 +1,235 @@ +import { useMemo } from "react"; + +import { + Button, + ComboBox, + ComboBoxItem, + Flex, + Label, + ListBox, + Popover, + Select, + SelectChevronUpDownIcon, + SelectItem, + SelectValue, + Text, + View, +} from "@phoenix/components"; +import type { CodeEvaluatorLanguage } from "@phoenix/types"; + +export type SandboxConfigOption = { + id: string; + name: string; + description?: string | null; + providerLabel: string; + providerLanguage: CodeEvaluatorLanguage; +}; + +export type CodeEvaluatorLanguageFieldProps = { + /** Current language selection */ + language: CodeEvaluatorLanguage; + /** Callback when language changes */ + onChange: (language: CodeEvaluatorLanguage) => void; +}; + +/** + * Language selector for code evaluators (Python or TypeScript) + */ +export const CodeEvaluatorLanguageField = ({ + language, + onChange, +}: CodeEvaluatorLanguageFieldProps) => { + return ( + + ); +}; + +export type CodeEvaluatorSandboxFieldProps = { + /** All available sandbox configs (will be filtered by language) */ + sandboxConfigs: SandboxConfigOption[]; + /** Current language to 
filter configs by */ + language: CodeEvaluatorLanguage; + /** Currently selected sandbox config Relay ID */ + selectedSandboxConfigId: string | null; + /** Callback when selection changes */ + onSelectionChange: (sandboxConfigId: string | null) => void; + /** Optional size variant */ + size?: "M" | "L"; + /** Whether to show the helper text below the field */ + showHelperText?: boolean; + /** Optional warning shown when a saved selection is no longer available */ + unavailableSelectionMessage?: string; +}; + +/** + * Sandbox config selector for code evaluators. + * Automatically filters configs by the selected language. + */ +export const CodeEvaluatorSandboxField = ({ + sandboxConfigs, + language, + selectedSandboxConfigId, + onSelectionChange, + size = "L", + showHelperText = false, + unavailableSelectionMessage, +}: CodeEvaluatorSandboxFieldProps) => { + // Filter configs to only show those matching the current language + const compatibleConfigs = useMemo( + () => + sandboxConfigs.filter((config) => config.providerLanguage === language), + [sandboxConfigs, language] + ); + + // Check if the selected config is still valid for the current language + const validSelectedId = compatibleConfigs.some( + (config) => config.id === selectedSandboxConfigId + ) + ? selectedSandboxConfigId + : null; + + if (sandboxConfigs.length === 0) { + // No sandbox providers enabled at all + return ( + + + + No sandbox providers enabled. Configure in Settings. + + + ); + } + + return ( + + 0 ? "Select..." : "None available" + } + selectedKey={validSelectedId != null ? String(validSelectedId) : null} + onSelectionChange={(key) => { + if (typeof key === "string") { + onSelectionChange(key); + } else { + onSelectionChange(null); + } + }} + defaultItems={compatibleConfigs} + menuTrigger="focus" + isDisabled={compatibleConfigs.length === 0} + renderEmptyState={() => ( + + + No configs for {language === "PYTHON" ? "Python" : "TypeScript"} + + + )} + > + {(item) => ( + + + {item.name} + {item.description ? ( + + {item.description} + + ) : ( + + {item.providerLabel} + + )} + + + )} + + {showHelperText && ( + + Code evaluators run in a sandbox. Configure reusable sandbox configs + in Settings if none are available here. + + )} + {unavailableSelectionMessage ? ( + + {unavailableSelectionMessage} + + ) : null} + + ); +}; + +const BACKEND_TYPE_LABELS: Record = { + WASM: "WebAssembly", + E2B: "E2B", + DAYTONA_PYTHON: "Daytona", + VERCEL_PYTHON: "Vercel", + VERCEL_TYPESCRIPT: "Vercel", + DENO: "Deno", + MODAL: "Modal", +}; + +const backendTypeLabel = (backendType: string): string => + BACKEND_TYPE_LABELS[backendType] ?? backendType; + +/** + * Maps sandbox provider data from GraphQL to SandboxConfigOption[]. + * Only includes configs from enabled providers whose backends are available. 
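+ *
+ * A minimal usage sketch (ids and names here are hypothetical, not real data):
+ *
+ *   mapSandboxConfigOptions(
+ *     [
+ *       {
+ *         backendType: "WASM",
+ *         language: "PYTHON",
+ *         enabled: true,
+ *         configs: [{ id: "U2FuZGJveENvbmZpZzox", name: "Default", description: null }],
+ *       },
+ *     ],
+ *     [{ backendType: "WASM", status: "AVAILABLE" }]
+ *   );
+ *   // => [{ id: "U2FuZGJveENvbmZpZzox", name: "Default", description: null,
+ *   //       providerLanguage: "PYTHON", providerLabel: "WebAssembly" }]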
+ */ +export const mapSandboxConfigOptions = ( + sandboxProviders: ReadonlyArray<{ + language: CodeEvaluatorLanguage; + backendType: string; + enabled: boolean; + configs: ReadonlyArray<{ + id: string; + name: string; + description?: string | null; + }>; + }>, + sandboxBackends: ReadonlyArray<{ + backendType: string; + status: string; + }> +): SandboxConfigOption[] => { + // Build a set of available backend types + const availableBackendTypes = new Set( + sandboxBackends + .filter((backend) => backend.status === "AVAILABLE") + .map((backend) => backend.backendType) + ); + + return sandboxProviders + .filter( + (provider) => + provider.enabled && availableBackendTypes.has(provider.backendType) + ) + .flatMap((provider) => + provider.configs.map((config) => ({ + id: config.id, + name: config.name, + description: config.description, + providerLanguage: provider.language, + providerLabel: backendTypeLabel(provider.backendType), + })) + ); +}; diff --git a/app/src/components/evaluators/CodeEvaluatorSourceCodeBlock.tsx b/app/src/components/evaluators/CodeEvaluatorSourceCodeBlock.tsx new file mode 100644 index 00000000000..602fe5edec5 --- /dev/null +++ b/app/src/components/evaluators/CodeEvaluatorSourceCodeBlock.tsx @@ -0,0 +1,16 @@ +import { PythonBlock } from "@phoenix/components/code/PythonBlock"; +import { TypeScriptBlock } from "@phoenix/components/code/TypeScriptBlock"; +import type { CodeEvaluatorLanguage } from "@phoenix/types"; + +export const CodeEvaluatorSourceCodeBlock = ({ + language, + sourceCode, +}: { + language: CodeEvaluatorLanguage; + sourceCode: string; +}) => { + if (language === "PYTHON") { + return ; + } + return ; +}; diff --git a/app/src/components/evaluators/CodeEvaluatorTestSection.tsx b/app/src/components/evaluators/CodeEvaluatorTestSection.tsx new file mode 100644 index 00000000000..0adb01ec80f --- /dev/null +++ b/app/src/components/evaluators/CodeEvaluatorTestSection.tsx @@ -0,0 +1,374 @@ +import { useMemo, useState } from "react"; +import { graphql, useMutation } from "react-relay"; + +import { + Alert, + Button, + Card, + DialogTrigger, + Flex, + Heading, + Icon, + IconButton, + Icons, + Popover, + Skeleton, + Text, + View, +} from "@phoenix/components"; +import type { Annotation } from "@phoenix/components/annotation"; +import { AnnotationDetailsContent } from "@phoenix/components/annotation/AnnotationDetailsContent"; +import { getPositiveOptimization } from "@phoenix/components/annotation/optimizationUtils"; +import { JSONBlock } from "@phoenix/components/code"; +import type { CodeEvaluatorTestSectionMutation } from "@phoenix/components/evaluators/__generated__/CodeEvaluatorTestSectionMutation.graphql"; +import { buildOutputConfigsInput } from "@phoenix/components/evaluators/utils"; +import { ExperimentAnnotationButton } from "@phoenix/components/experiment/ExperimentAnnotationButton"; +import { useEvaluatorStore } from "@phoenix/contexts/EvaluatorContext"; +import type { AnnotationConfig } from "@phoenix/store/evaluatorStore"; +import type { CodeEvaluatorLanguage } from "@phoenix/types"; +import { getErrorMessagesFromRelayMutationError } from "@phoenix/utils/errorUtils"; + +type EvaluationPreviewResult = + | { kind: "success"; annotation: Annotation } + | { kind: "error"; evaluatorName: string; message: string }; + +/** + * Computes whether an annotation score represents a positive optimization result + * by matching the annotation name to the corresponding output config. 
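+ *
+ * Matching sketch (names are illustrative): with a single output config it is
+ * used directly; with several, an annotation named "my-eval.accuracy" emitted
+ * by evaluator "my-eval" matches the config named "accuracy". Categorical
+ * bounds are derived from the min/max of the value scores; continuous bounds
+ * come straight from the config. Unmatched annotations return null.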
+ */ +function computePositiveOptimization({ + annotationName, + score, + evaluatorName, + outputConfigs, +}: { + annotationName: string; + score: number | null | undefined; + evaluatorName: string; + outputConfigs: AnnotationConfig[]; +}): boolean | null { + if (outputConfigs.length === 0) { + return null; + } + + let matchedConfig: AnnotationConfig | undefined; + if (outputConfigs.length === 1) { + matchedConfig = outputConfigs[0]; + } else { + // Multi-output: annotation name is "evaluatorName.configName" + const prefix = evaluatorName + "."; + if (annotationName.startsWith(prefix)) { + const configName = annotationName.slice(prefix.length); + matchedConfig = outputConfigs.find((c) => c.name === configName); + } + } + + if (matchedConfig == null) { + return null; + } + + const optimizationDirection = + matchedConfig.optimizationDirection === "MAXIMIZE" || + matchedConfig.optimizationDirection === "MINIMIZE" + ? matchedConfig.optimizationDirection + : undefined; + + let lowerBound: number | undefined; + let upperBound: number | undefined; + + if ("values" in matchedConfig) { + // Categorical: compute bounds from values scores + const scores = matchedConfig.values + .map((v) => v.score) + .filter((s): s is number => s != null); + if (scores.length > 0) { + lowerBound = Math.min(...scores); + upperBound = Math.max(...scores); + } + } else { + // Continuous: use bounds directly + lowerBound = matchedConfig.lowerBound ?? undefined; + upperBound = matchedConfig.upperBound ?? undefined; + } + + return getPositiveOptimization({ + score, + lowerBound, + upperBound, + optimizationDirection, + }); +} + +export type CodeEvaluatorTestSectionProps = { + /** The evaluator's source code */ + sourceCode: string; + /** The language (PYTHON or TYPESCRIPT) */ + language: CodeEvaluatorLanguage; + /** The sandbox config Relay ID if selected */ + sandboxConfigId: string | null; +}; + +/** + * Test section for code evaluators - allows testing the evaluator + * against example data before saving. + */ +export const CodeEvaluatorTestSection = ({ + sourceCode, + language, + sandboxConfigId, +}: CodeEvaluatorTestSectionProps) => { + const [error, setError] = useState(null); + const [previewResults, setPreviewResults] = useState< + EvaluationPreviewResult[] + >([]); + + const outputConfigs = useEvaluatorStore((state) => state.outputConfigs); + const evaluatorName = useEvaluatorStore( + (state) => state.evaluator.name || state.evaluator.globalName || "evaluator" + ); + const evaluatorDescription = useEvaluatorStore( + (state) => state.evaluator.description + ); + const inputMapping = useEvaluatorStore( + (state) => state.evaluator.inputMapping + ); + const evaluatorMappingSource = useEvaluatorStore( + (state) => state.evaluatorMappingSource + ); + + const [testEvaluator, isLoading] = + useMutation(graphql` + mutation CodeEvaluatorTestSectionMutation($input: EvaluatorPreviewsInput!) 
{ + evaluatorPreviews(input: $input) { + results { + evaluatorName + annotation { + explanation + label + score + name + id + } + error + } + } + } + `); + + const onTestEvaluator = () => { + setError(null); + setPreviewResults([]); + + if (!sourceCode.trim()) { + setError("Source code is required"); + return; + } + + if (outputConfigs.length === 0) { + setError("At least one output configuration is required"); + return; + } + + if (sandboxConfigId == null) { + setError("Please select a sandbox configuration to test the evaluator"); + return; + } + + const gqlOutputConfigs = buildOutputConfigsInput(outputConfigs); + testEvaluator({ + variables: { + input: { + previews: [ + { + context: evaluatorMappingSource, + evaluator: { + inlineCodeEvaluator: { + name: evaluatorName, + description: evaluatorDescription || null, + language, + sourceCode, + outputConfigs: gqlOutputConfigs, + sandboxConfigId, + }, + }, + inputMapping, + }, + ], + }, + }, + onCompleted(response, errors) { + if (errors) { + const errorMessages = getErrorMessagesFromRelayMutationError(errors); + const errorMessage = + errorMessages?.join("\n") ?? + errors[0]?.message ?? + "An unknown error occurred"; + setError(errorMessage); + } else { + const results: EvaluationPreviewResult[] = + response.evaluatorPreviews.results.map((result) => { + if (result.error != null) { + return { + kind: "error" as const, + evaluatorName: result.evaluatorName, + message: result.error, + }; + } else if (result.annotation != null) { + return { + kind: "success" as const, + annotation: { + id: result.annotation.id, + name: result.annotation.name, + label: result.annotation.label, + score: result.annotation.score, + explanation: result.annotation.explanation, + }, + }; + } else { + throw new Error( + "Unknown error: no annotation or error returned" + ); + } + }); + setPreviewResults(results); + } + }, + onError(error) { + const errorMessages = getErrorMessagesFromRelayMutationError(error); + const errorMessage = + errorMessages?.join("\n") ?? + error.message ?? + "An unknown error occurred"; + setError(errorMessage); + }, + }); + }; + + const isShowingPreview = + isLoading || previewResults.length > 0 || error != null; + + return ( + + {/* Results section */} + {isShowingPreview && ( + + {isLoading && ( + + + + + + + + + )} + {previewResults.map((result, i) => ( + + {result.kind === "success" ? ( + setPreviewResults([])} + > + } /> + + } + > + + + + + + + + + + + + + ) : ( + + {result.message} + + )} + + ))} + + {error && !isLoading && previewResults.length === 0 && ( + setError(null)} + > + {error} + + )} + + )} + + {/* Test button and description */} + + + Test Evaluator + + + + + Run your evaluator against the example data to verify it works correctly + before saving. 
+ + + ); +}; + +function AnnotationPreviewJSONBlock(props: { annotation: Annotation }) { + const { name, label, score, explanation } = props.annotation; + const jsonString = useMemo(() => { + return JSON.stringify({ name, label, score, explanation }, null, 2); + }, [explanation, label, name, score]); + + return ( + + ); +} diff --git a/app/src/components/evaluators/EditCodeEvaluatorDialogContent.tsx b/app/src/components/evaluators/EditCodeEvaluatorDialogContent.tsx index aafc48d6ca9..9067fe72585 100644 --- a/app/src/components/evaluators/EditCodeEvaluatorDialogContent.tsx +++ b/app/src/components/evaluators/EditCodeEvaluatorDialogContent.tsx @@ -3,16 +3,13 @@ import { python } from "@codemirror/lang-python"; import { css } from "@emotion/react"; import { githubDark, githubLight } from "@uiw/codemirror-theme-github"; import CodeMirror from "@uiw/react-codemirror"; -import { useEffect, useMemo, useState } from "react"; +import { useEffect, useEffectEvent, useMemo, useRef, useState } from "react"; import { Group, Panel, Separator } from "react-resizable-panels"; import { Alert, Button, - ComboBox, - ComboBoxItem, Flex, - Heading, Icon, Icons, Input, @@ -27,14 +24,26 @@ import { TextField, View, } from "@phoenix/components"; -import { PythonBlock } from "@phoenix/components/code/PythonBlock"; -import { TypeScriptBlock } from "@phoenix/components/code/TypeScriptBlock"; import { DialogContent, DialogHeader, DialogTitle, DialogTitleExtra, } from "@phoenix/components/core/dialog"; +import { + Disclosure, + DisclosureGroup, + DisclosurePanel, + DisclosureTrigger, +} from "@phoenix/components/core/disclosure"; +import { createEvaluatorAutocompletion } from "@phoenix/components/evaluators/codeEvaluatorAutocomplete"; +import { + CodeEvaluatorLanguageField, + CodeEvaluatorSandboxField, + type SandboxConfigOption, +} from "@phoenix/components/evaluators/CodeEvaluatorLanguageSandboxFields"; +import { CodeEvaluatorTestSection } from "@phoenix/components/evaluators/CodeEvaluatorTestSection"; +import { generateEvaluatorTypes } from "@phoenix/components/evaluators/codeEvaluatorTypeGeneration"; import { DEFAULT_CODE_EVALUATOR_SOURCE, extractCodeEvaluatorVariables, @@ -73,16 +82,10 @@ export const createDefaultContinuousOutputConfig = ( upperBound: 1, }); -export type SandboxConfigOption = { - id: number; - name: string; - description?: string | null; - providerLabel: string; - providerLanguage: CodeEvaluatorLanguage; -}; - export const EditCodeEvaluatorDialogContent = ({ onSubmit, + onCancel, + onDirtyChange, isSubmitting, mode, error, @@ -94,31 +97,112 @@ export const EditCodeEvaluatorDialogContent = ({ onSubmit: (payload: { language: CodeEvaluatorLanguage; sourceCode: string; - sandboxConfigId: number | null; + sandboxConfigId?: string | null; }) => void; + /** + * Called when the user clicks Cancel. Parent overlays can use this to + * centralize close behavior such as unsaved-change confirmation. + */ + onCancel?: () => void; + /** + * Called whenever the dirty state changes (has unsaved changes vs. not). 
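+ * The slideover parents use this to gate closing behind a window.confirm
+ * prompt when there are unsaved edits.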
+ */ + onDirtyChange?: (isDirty: boolean) => void; isSubmitting: boolean; mode: "create" | "update"; error?: string; initialLanguage: CodeEvaluatorLanguage; initialSourceCode: string; sandboxConfigs: SandboxConfigOption[]; - initialSandboxConfigId?: number | null; + initialSandboxConfigId?: string | null; }) => { const store = useEvaluatorStoreInstance(); const [showValidationError, setShowValidationError] = useState(false); const [sourceCode, setSourceCode] = useState(initialSourceCode); const [language, setLanguage] = useState(initialLanguage); - const [sandboxConfigId, setSandboxConfigId] = useState( + const [sandboxConfigId, setSandboxConfigId] = useState( initialSandboxConfigId ?? null ); const [localValidationError, setLocalValidationError] = useState< string | undefined >(); + + // Track initial store state for dirty checking + const initialStoreStateRef = useRef<{ + name: string; + outputConfigs: string; + inputMapping: string; + } | null>(null); + + // Track last reported dirty state to avoid redundant callbacks + const lastDirtyRef = useRef(false); + + useEffect(() => { + // Capture initial store state on mount for dirty comparison + const state = store.getState(); + initialStoreStateRef.current = { + name: state.evaluator.name, + outputConfigs: JSON.stringify(state.outputConfigs), + inputMapping: JSON.stringify(state.evaluator.inputMapping), + }; + }, [store]); + + const reportDirtyState = useEffectEvent((isDirty: boolean) => { + onDirtyChange?.(isDirty); + }); + + const checkForDirtyChanges = useEffectEvent(() => { + const initial = initialStoreStateRef.current; + if (!initial) { + return; + } + + const state = store.getState(); + const codeChanged = sourceCode !== initialSourceCode; + const languageChanged = language !== initialLanguage; + const sandboxChanged = sandboxConfigId !== (initialSandboxConfigId ?? null); + const nameChanged = state.evaluator.name !== initial.name; + const outputConfigsChanged = + JSON.stringify(state.outputConfigs) !== initial.outputConfigs; + const inputMappingChanged = + JSON.stringify(state.evaluator.inputMapping) !== initial.inputMapping; + + const isDirty = + codeChanged || + languageChanged || + sandboxChanged || + nameChanged || + outputConfigsChanged || + inputMappingChanged; + + if (isDirty !== lastDirtyRef.current) { + lastDirtyRef.current = isDirty; + reportDirtyState(isDirty); + } + }); + + // Notify parent of dirty state changes from local state + useEffect(() => { + checkForDirtyChanges(); + }, [sourceCode, language, sandboxConfigId]); + + // Subscribe to store changes to notify parent of dirty state + useEffect(() => { + return store.subscribe(() => { + checkForDirtyChanges(); + }); + }, [store]); + + const handleCancel = () => { + onCancel?.(); + }; + const variables = useMemo( () => extractCodeEvaluatorVariables({ language, sourceCode }), [language, sourceCode] ); + const compatibleSandboxConfigs = useMemo( () => sandboxConfigs.filter( @@ -126,17 +210,26 @@ export const EditCodeEvaluatorDialogContent = ({ ), [language, sandboxConfigs] ); + const selectedSandboxConfigId = compatibleSandboxConfigs.some( (sandboxConfig) => sandboxConfig.id === sandboxConfigId ) ? sandboxConfigId : null; + const hasUnavailableSandboxSelection = + sandboxConfigId != null && selectedSandboxConfigId == null; + const unavailableSandboxSelectionMessage = hasUnavailableSandboxSelection + ? "The previously selected sandbox is no longer available. Save to keep the existing sandbox, or choose a new one to update it." 
+ : undefined; + const hasNoSandboxConfigs = sandboxConfigs.length === 0; const handleSubmit = async () => { const isValid = await store.getState().validateAll(); const configError = getCodeEvaluatorValidationError({ outputConfigs: store.getState().outputConfigs, sourceCode, + mode, + sandboxConfigId: selectedSandboxConfigId, }); if (!isValid || configError) { setShowValidationError(true); @@ -145,10 +238,18 @@ export const EditCodeEvaluatorDialogContent = ({ } setShowValidationError(false); setLocalValidationError(undefined); + const hasSandboxChanged = + sandboxConfigId !== (initialSandboxConfigId ?? null); + const nextSandboxConfigId = + selectedSandboxConfigId != null + ? selectedSandboxConfigId + : mode === "create" || hasSandboxChanged + ? null + : undefined; onSubmit({ language, sourceCode, - sandboxConfigId: selectedSandboxConfigId, + sandboxConfigId: nextSandboxConfigId, }); }; @@ -159,9 +260,15 @@ export const EditCodeEvaluatorDialogContent = ({ {mode === "create" ? "Create Evaluator" : "Edit Evaluator"} - + {onCancel ? ( + + ) : ( + + )} -
{ - // Prevent Escape from propagating to the modal overlay, - // which would close the slideover and discard edits. - if (e.key === "Escape") { - e.stopPropagation(); - } - }} - > - + + {/* Code editor and type footer with resizable panels */} +
+ + {/* Editable code editor panel */} + +
{ + if (e.key === "Escape") { + e.stopPropagation(); + } + }} + > + +
+
+ + {/* Read-only type footer panel */} + {typeFooter && ( + <> + + +
+ +
+
+ + )} +
- +
); }; -const CodeEvaluatorOutputConfigSection = () => { +/** + * Output configuration section (inside accordion) + */ +const OutputConfigSection = () => { const store = useEvaluatorStoreInstance(); const outputConfig = useEvaluatorStore((state) => state.outputConfigs[0]); const evaluatorName = useEvaluatorStore( @@ -391,66 +636,61 @@ const CodeEvaluatorOutputConfigSection = () => { } return ( - - - Evaluator Annotation - - - Configure the annotation produced by this evaluator. - - + + + + + The name of the annotation that will be created by this evaluator. + Fixed to the evaluator name. + + + + { - const nextType = - value as (typeof outputTypeOptions)[number]["id"]; - store.getState().setOutputConfigs([ - nextType === "categorical" - ? { - name: evaluatorName, - optimizationDirection: "NONE", - values: [ - { label: "pass", score: 1 }, - { label: "fail", score: 0 }, - ], - } - : createDefaultContinuousOutputConfig(evaluatorName), - ]); - }} - > - - - - - {outputTypeOptions.map((option) => ( - - {option.label} - - ))} - - - - - - - - - {"values" in outputConfig ? ( - - ) : ( - - )} - - + + + + The type of output that will be created by this evaluator. Your code + should return a numerical score or a categorical label. + + + + {outputTypeOptions.map((option) => ( + + {option.label} + + ))} + + + + + + + {"values" in outputConfig ? ( + + ) : ( + + )} ); }; @@ -462,15 +702,18 @@ const CategoricalChoicesEditor = ({ }) => { const setOutputConfigs = useEvaluatorStore((state) => state.setOutputConfigs); const outputConfig = useEvaluatorStore((state) => state.outputConfigs[0]); + if (!outputConfig || !("values" in outputConfig)) { return null; } + const updateValues = (nextValues: ClassificationChoice[]) => { setOutputConfigs([{ ...outputConfig, values: nextValues }]); }; + return ( - + Choices {values.map((choice, index) => ( @@ -501,28 +744,27 @@ const CategoricalChoicesEditor = ({ ))} ); @@ -534,167 +776,52 @@ const ContinuousBoundsEditor = ({ config: ContinuousEvaluatorAnnotationConfig; }) => { const setOutputConfigs = useEvaluatorStore((state) => state.setOutputConfigs); + const updateConfig = ( updates: Partial ) => { setOutputConfigs([{ ...config, ...updates }]); }; + return ( -
+ { + onChange={(value) => updateConfig({ lowerBound: value.trim() === "" ? null : Number(value), - }); - }} + }) + } > - + { + onChange={(value) => updateConfig({ upperBound: value.trim() === "" ? null : Number(value), - }); - }} + }) + } > - + - -
- ); -}; - -export const CodeEvaluatorSourceCodeBlock = ({ - language, - sourceCode, -}: { - language: CodeEvaluatorLanguage; - sourceCode: string; -}) => { - if (language === "PYTHON") { - return ; - } - return ; -}; - -const CodeEvaluatorSandboxField = ({ - sandboxConfigs, - selectedSandboxConfigId, - onSelectionChange, -}: { - sandboxConfigs: SandboxConfigOption[]; - selectedSandboxConfigId: number | null; - onSelectionChange: (sandboxConfigId: number | null) => void; -}) => { - return ( - - 0 - ? "Select a sandbox config" - : "No sandbox configs available" - } - selectedKey={ - selectedSandboxConfigId != null - ? String(selectedSandboxConfigId) - : null - } - onSelectionChange={(key) => { - if (typeof key === "string") { - onSelectionChange(Number(key)); - } else { - onSelectionChange(null); - } - }} - defaultItems={sandboxConfigs} - menuTrigger="focus" - isDisabled={sandboxConfigs.length === 0} - renderEmptyState={() => ( -
No sandbox configs found for this language
- )} - > - {(item) => ( - - - {item.name} - {item.description ? ( - - {item.description} - - ) : ( - - {item.providerLabel} - - )} - - - )} -
- - Code evaluators run in a sandbox. Configure reusable sandbox configs in - Settings if none are available here. - -
- ); -}; - -export const mapSandboxConfigOptions = ( - sandboxProviders: ReadonlyArray<{ - language: CodeEvaluatorLanguage; - backendType: string; - configs: ReadonlyArray<{ - id: string; - name: string; - description?: string | null; - }>; - }> -): SandboxConfigOption[] => { - return sandboxProviders.flatMap((provider) => - provider.configs.map((config) => ({ - id: decodeRelayNodeId(config.id), - name: config.name, - description: config.description, - providerLanguage: provider.language, - providerLabel: backendTypeLabel(provider.backendType), - })) + ); }; -const BACKEND_TYPE_LABELS: Record = { - WASM: "WebAssembly", - E2B: "E2B", - DAYTONA_PYTHON: "Daytona", - VERCEL_PYTHON: "Vercel", - VERCEL_TYPESCRIPT: "Vercel", - DENO: "Deno", - MODAL: "Modal", -}; - -const backendTypeLabel = (backendType: string): string => - BACKEND_TYPE_LABELS[backendType] ?? backendType; - -const decodeRelayNodeId = (globalId: string) => { - const decoded = globalThis.atob(globalId); - const [, rawId = ""] = decoded.split(":", 2); - return Number(rawId); -}; - +// Validation helper const getCodeEvaluatorValidationError = ({ outputConfigs, sourceCode, + mode, + sandboxConfigId, }: { outputConfigs: AnnotationConfig[]; sourceCode: string; + mode: "create" | "update"; + sandboxConfigId: string | null; }) => { if (sourceCode.trim().length === 0) { return "Source code is required."; @@ -702,6 +829,10 @@ const getCodeEvaluatorValidationError = ({ if (outputConfigs.length === 0) { return "At least one output config is required."; } + // Require sandbox selection when creating a new evaluator + if (mode === "create" && sandboxConfigId == null) { + return "Please select a sandbox configuration."; + } const outputConfig = outputConfigs[0]; if ("values" in outputConfig) { if (outputConfig.values.length < 2) { @@ -717,54 +848,137 @@ const getCodeEvaluatorValidationError = ({ return undefined; }; +// Styles const fieldsetCSS = css` all: unset; display: flex; flex-direction: column; flex: 1; min-height: 0; - gap: var(--global-dimension-size-200); - overflow: auto; + overflow: hidden; +`; + +const headerBarCSS = css` + display: flex; + flex-direction: row; + align-items: flex-start; + gap: var(--global-dimension-size-150); + padding: var(--global-dimension-size-150) var(--global-dimension-size-200); + border-bottom: 1px solid var(--global-border-color-default); + flex-shrink: 0; +`; + +const headerFieldCSS = css` + /* Ensure fields don't wrap */ `; const panelStyle = { height: "100%", - overflowY: "auto", -} as const; + display: "flex", + flexDirection: "column" as const, + minHeight: 0, + overflow: "hidden" as const, +}; -const leftPanelCSS = css` +const editorPanelCSS = css` display: flex; flex-direction: column; - padding: var(--global-dimension-size-100) var(--global-dimension-size-200); + flex: 1; + min-height: 0; + padding: var(--global-dimension-size-150); + padding-top: var(--global-dimension-size-100); box-sizing: border-box; +`; + +const editorSectionCSS = css` + display: flex; + flex-direction: column; + flex: 1; + min-height: 0; gap: var(--global-dimension-size-100); `; -const rightPanelCSS = css` +const sidebarPanelCSS = css` display: flex; flex-direction: column; - padding: var(--global-dimension-size-100) 0; + height: 100%; + padding: 0; box-sizing: border-box; + overflow-y: auto; + border-left: 1px solid var(--global-border-color-default); `; -const editorWrapCSS = css` +const accordionContentCSS = css` + padding: var(--global-dimension-size-50) 0; + padding-bottom: var(--global-dimension-size-150); +`; + +const 
editorContainerCSS = css` + display: flex; + flex-direction: column; + flex: 1; + min-height: 0; border: 1px solid var(--global-border-color-default); border-radius: var(--global-rounding-medium); overflow: hidden; + background-color: var(--code-mirror-editor-background-color); +`; + +const editorPanelStyle = { + display: "flex", + flexDirection: "column" as const, + minHeight: 0, + overflow: "hidden" as const, +}; + +const editorWrapCSS = css` + flex: 1; + min-height: 0; + overflow: hidden; + display: flex; + flex-direction: column; + + & .cm-theme { + height: 100% !important; + } + + & .cm-editor { + height: 100% !important; + } + + & .cm-scroller { + overflow: auto !important; + } +`; + +const typeFooterCSS = css` + flex: 1; + min-height: 0; + overflow: hidden; + display: flex; + flex-direction: column; + + & .cm-theme { + height: 100% !important; + } & .cm-editor { - min-height: 280px; + height: 100% !important; + background-color: var(--ac-global-color-grey-100); } - & .cm-content, - & .cm-gutter { - min-height: 280px; + & .cm-gutters { + background-color: var(--ac-global-color-grey-100); + } + + & .cm-scroller { + overflow: auto !important; } `; const choiceGridCSS = css` display: grid; - grid-template-columns: 1.5fr 1fr auto; - gap: var(--global-dimension-size-100); - align-items: end; + grid-template-columns: 1fr 100px 32px; + gap: var(--global-dimension-size-50); + align-items: center; `; diff --git a/app/src/components/evaluators/__generated__/CodeEvaluatorTestSectionMutation.graphql.ts b/app/src/components/evaluators/__generated__/CodeEvaluatorTestSectionMutation.graphql.ts new file mode 100644 index 00000000000..1742332fe5b --- /dev/null +++ b/app/src/components/evaluators/__generated__/CodeEvaluatorTestSectionMutation.graphql.ts @@ -0,0 +1,300 @@ +/** + * @generated SignedSource<<54099212158b2aedaffde087d5ae2557>> + * @lightSyntaxTransform + * @nogrep + */ + +/* tslint:disable */ +/* eslint-disable */ +// @ts-nocheck + +import { ConcreteRequest } from 'relay-runtime'; +export type GenerativeProviderKey = "ANTHROPIC" | "AWS" | "AZURE_OPENAI" | "CEREBRAS" | "DEEPSEEK" | "FIREWORKS" | "GOOGLE" | "GROQ" | "MOONSHOT" | "OLLAMA" | "OPENAI" | "PERPLEXITY" | "TOGETHER" | "XAI"; +export type Language = "PYTHON" | "TYPESCRIPT"; +export type OptimizationDirection = "MAXIMIZE" | "MINIMIZE" | "NONE"; +export type PromptMessageRole = "AI" | "SYSTEM" | "TOOL" | "USER"; +export type PromptTemplateFormat = "F_STRING" | "MUSTACHE" | "NONE"; +export type EvaluatorPreviewsInput = { + credentials?: ReadonlyArray | null; + previews: ReadonlyArray; +}; +export type EvaluatorPreviewItemInput = { + context: any; + evaluator: EvaluatorPreviewInput; + inputMapping?: EvaluatorInputMappingInput; +}; +export type EvaluatorPreviewInput = { + builtInEvaluatorId?: string | null; + codeEvaluatorId?: string | null; + inlineCodeEvaluator?: InlineCodeEvaluatorInput | null; + inlineLlmEvaluator?: InlineLLMEvaluatorInput | null; +}; +export type InlineLLMEvaluatorInput = { + description?: string | null; + name: string; + outputConfigs: ReadonlyArray; + promptVersion: ChatPromptVersionInput; +}; +export type ChatPromptVersionInput = { + customProviderId?: string | null; + description?: string | null; + invocationParameters?: any; + modelName: string; + modelProvider: GenerativeProviderKey; + responseFormat?: PromptResponseFormatJSONSchemaInput | null; + template: PromptChatTemplateInput; + templateFormat: PromptTemplateFormat; + tools?: PromptToolsInput | null; +}; +export type PromptChatTemplateInput = { + 
+export type PromptChatTemplateInput = {
+  messages: ReadonlyArray<PromptMessageInput>;
+};
+export type PromptMessageInput = {
+  content: ReadonlyArray<ContentPartInput>;
+  role: PromptMessageRole;
+};
+export type ContentPartInput = {
+  text?: TextContentValueInput | null;
+  toolCall?: ToolCallContentValueInput | null;
+  toolResult?: ToolResultContentValueInput | null;
+};
+export type TextContentValueInput = {
+  text: string;
+};
+export type ToolCallContentValueInput = {
+  toolCall: ToolCallFunctionInput;
+  toolCallId: string;
+};
+export type ToolCallFunctionInput = {
+  arguments: string;
+  name: string;
+  type?: string | null;
+};
+export type ToolResultContentValueInput = {
+  result: any;
+  toolCallId: string;
+};
+export type PromptToolsInput = {
+  disableParallelToolCalls?: boolean | null;
+  toolChoice?: PromptToolChoiceInput | null;
+  tools: ReadonlyArray<PromptToolFunctionInput>;
+};
+export type PromptToolFunctionInput = {
+  function: PromptToolFunctionDefinitionInput;
+};
+export type PromptToolFunctionDefinitionInput = {
+  description?: string | null;
+  name: string;
+  parameters?: any | null;
+  strict?: boolean | null;
+};
+export type PromptToolChoiceInput = {
+  functionName?: string | null;
+  none?: boolean | null;
+  oneOrMore?: boolean | null;
+  zeroOrMore?: boolean | null;
+};
+export type PromptResponseFormatJSONSchemaInput = {
+  jsonSchema: PromptResponseFormatJSONSchemaDefinitionInput;
+  type: string;
+};
+export type PromptResponseFormatJSONSchemaDefinitionInput = {
+  description?: string | null;
+  name: string;
+  schema?: any | null;
+  strict?: boolean | null;
+};
+export type AnnotationConfigInput = {
+  categorical?: CategoricalAnnotationConfigInput | null;
+  continuous?: ContinuousAnnotationConfigInput | null;
+  freeform?: FreeformAnnotationConfigInput | null;
+};
+export type CategoricalAnnotationConfigInput = {
+  description?: string | null;
+  name: string;
+  optimizationDirection: OptimizationDirection;
+  values: ReadonlyArray<CategoricalAnnotationConfigValueInput>;
+};
+export type CategoricalAnnotationConfigValueInput = {
+  label: string;
+  score?: number | null;
+};
+export type ContinuousAnnotationConfigInput = {
+  description?: string | null;
+  lowerBound?: number | null;
+  name: string;
+  optimizationDirection: OptimizationDirection;
+  upperBound?: number | null;
+};
+export type FreeformAnnotationConfigInput = {
+  description?: string | null;
+  name: string;
+};
+export type InlineCodeEvaluatorInput = {
+  description?: string | null;
+  language: Language;
+  name: string;
+  outputConfigs: ReadonlyArray<AnnotationConfigInput>;
+  sandboxConfigId?: string | null;
+  sourceCode: string;
+};
+export type EvaluatorInputMappingInput = {
+  literalMapping?: any;
+  pathMapping?: any;
+};
+export type GenerativeCredentialInput = {
+  envVarName: string;
+  value: string;
+};
+export type CodeEvaluatorTestSectionMutation$variables = {
+  input: EvaluatorPreviewsInput;
+};
+export type CodeEvaluatorTestSectionMutation$data = {
+  readonly evaluatorPreviews: {
+    readonly results: ReadonlyArray<{
+      readonly annotation: {
+        readonly explanation: string | null;
+        readonly id: string;
+        readonly label: string | null;
+        readonly name: string;
+        readonly score: number | null;
+      } | null;
+      readonly error: string | null;
+      readonly evaluatorName: string;
+    }>;
+  };
+};
+export type CodeEvaluatorTestSectionMutation = {
+  response: CodeEvaluatorTestSectionMutation$data;
+  variables: CodeEvaluatorTestSectionMutation$variables;
+};
+
+const node: ConcreteRequest = (function(){
+var v0 = [
+  {
+    "defaultValue": null,
+    "kind": "LocalArgument",
+    "name": "input"
+  }
+],
+v1 = [
+  {
+    "alias": null,
+    "args": [
+      {
+        "kind": "Variable",
+        "name": 
"input", + "variableName": "input" + } + ], + "concreteType": "EvaluatorPreviewsPayload", + "kind": "LinkedField", + "name": "evaluatorPreviews", + "plural": false, + "selections": [ + { + "alias": null, + "args": null, + "concreteType": "EvaluationResult", + "kind": "LinkedField", + "name": "results", + "plural": true, + "selections": [ + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "evaluatorName", + "storageKey": null + }, + { + "alias": null, + "args": null, + "concreteType": "ExperimentRunAnnotation", + "kind": "LinkedField", + "name": "annotation", + "plural": false, + "selections": [ + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "explanation", + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "label", + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "score", + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "name", + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "id", + "storageKey": null + } + ], + "storageKey": null + }, + { + "alias": null, + "args": null, + "kind": "ScalarField", + "name": "error", + "storageKey": null + } + ], + "storageKey": null + } + ], + "storageKey": null + } +]; +return { + "fragment": { + "argumentDefinitions": (v0/*: any*/), + "kind": "Fragment", + "metadata": null, + "name": "CodeEvaluatorTestSectionMutation", + "selections": (v1/*: any*/), + "type": "Mutation", + "abstractKey": null + }, + "kind": "Request", + "operation": { + "argumentDefinitions": (v0/*: any*/), + "kind": "Operation", + "name": "CodeEvaluatorTestSectionMutation", + "selections": (v1/*: any*/) + }, + "params": { + "cacheID": "880a87909fde069d7330f259c8cea2a2", + "id": null, + "metadata": {}, + "name": "CodeEvaluatorTestSectionMutation", + "operationKind": "mutation", + "text": "mutation CodeEvaluatorTestSectionMutation(\n $input: EvaluatorPreviewsInput!\n) {\n evaluatorPreviews(input: $input) {\n results {\n evaluatorName\n annotation {\n explanation\n label\n score\n name\n id\n }\n error\n }\n }\n}\n" + } +}; +})(); + +(node as any).hash = "51564928488b73ce3754b374762baa5a"; + +export default node; diff --git a/app/src/components/evaluators/__generated__/EvaluatorOutputPreviewMutation.graphql.ts b/app/src/components/evaluators/__generated__/EvaluatorOutputPreviewMutation.graphql.ts index 3b4b941b551..72b36520378 100644 --- a/app/src/components/evaluators/__generated__/EvaluatorOutputPreviewMutation.graphql.ts +++ b/app/src/components/evaluators/__generated__/EvaluatorOutputPreviewMutation.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<<3c3d1c0f71d7aee0d1c7e07898318cea>> + * @generated SignedSource<<24debaf7bb672a75d29e86628a268828>> * @lightSyntaxTransform * @nogrep */ @@ -10,6 +10,7 @@ import { ConcreteRequest } from 'relay-runtime'; export type GenerativeProviderKey = "ANTHROPIC" | "AWS" | "AZURE_OPENAI" | "CEREBRAS" | "DEEPSEEK" | "FIREWORKS" | "GOOGLE" | "GROQ" | "MOONSHOT" | "OLLAMA" | "OPENAI" | "PERPLEXITY" | "TOGETHER" | "XAI"; +export type Language = "PYTHON" | "TYPESCRIPT"; export type OptimizationDirection = "MAXIMIZE" | "MINIMIZE" | "NONE"; export type PromptMessageRole = "AI" | "SYSTEM" | "TOOL" | "USER"; export type PromptTemplateFormat = "F_STRING" | "MUSTACHE" | "NONE"; @@ -25,6 +26,7 @@ export type EvaluatorPreviewItemInput = { export type EvaluatorPreviewInput = { builtInEvaluatorId?: string | null; 
codeEvaluatorId?: string | null;
+  inlineCodeEvaluator?: InlineCodeEvaluatorInput | null;
   inlineLlmEvaluator?: InlineLLMEvaluatorInput | null;
 };
 export type InlineLLMEvaluatorInput = {
@@ -128,6 +130,14 @@ export type FreeformAnnotationConfigInput = {
   description?: string | null;
   name: string;
 };
+export type InlineCodeEvaluatorInput = {
+  description?: string | null;
+  language: Language;
+  name: string;
+  outputConfigs: ReadonlyArray<AnnotationConfigInput>;
+  sandboxConfigId?: string | null;
+  sourceCode: string;
+};
 export type EvaluatorInputMappingInput = {
   literalMapping?: any;
   pathMapping?: any;
diff --git a/app/src/components/evaluators/__tests__/codeEvaluatorAutocomplete.test.ts b/app/src/components/evaluators/__tests__/codeEvaluatorAutocomplete.test.ts
new file mode 100644
index 00000000000..0de144cbb31
--- /dev/null
+++ b/app/src/components/evaluators/__tests__/codeEvaluatorAutocomplete.test.ts
@@ -0,0 +1,83 @@
+import { describe, expect, it } from "vitest";
+
+import { createCompletionOptions } from "../codeEvaluatorAutocomplete";
+
+describe("createCompletionOptions", () => {
+  const mappingSource = {
+    output: {
+      answer: "Paris",
+      nested: {
+        score: 1,
+      },
+      items: [
+        {
+          name: "alpha",
+        },
+      ],
+    },
+    reference: {
+      answer: "Paris",
+    },
+    input: {
+      question: "What is the capital of France?",
+    },
+    metadata: {
+      isGolden: true,
+    },
+  };
+
+  it("produces sensible nested and indexed property completions", () => {
+    const options = createCompletionOptions({
+      mappingSource,
+      language: "TYPESCRIPT",
+    });
+
+    const labels = options.map((option) => option.label);
+
+    expect(labels).toContain("output");
+    expect(labels).toContain("reference");
+    expect(labels).toContain("input");
+    expect(labels).toContain("metadata");
+    expect(labels).toContain("output.answer");
+    expect(labels).toContain("output.nested");
+    expect(labels).toContain("output.nested.score");
+    expect(labels).toContain("output.items");
+    expect(labels).toContain("output.items[0]");
+    expect(labels).toContain("output.items[0].name");
+  });
+
+  it("includes useful type information for completion entries", () => {
+    const options = createCompletionOptions({
+      mappingSource,
+      language: "TYPESCRIPT",
+    });
+
+    expect(
+      options.find((option) => option.label === "output.answer")?.info
+    ).toBe('string: "Paris"');
+    expect(
+      options.find((option) => option.label === "output.items")?.info
+    ).toBe("array (1 items)");
+    expect(
+      options.find((option) => option.label === "metadata.isGolden")?.info
+    ).toBe("boolean: true");
+  });
+
+  it("adds language-specific helper completions", () => {
+    const pythonOptions = createCompletionOptions({
+      mappingSource,
+      language: "PYTHON",
+    });
+    const typescriptOptions = createCompletionOptions({
+      mappingSource,
+      language: "TYPESCRIPT",
+    });
+
+    expect(pythonOptions.map((option) => option.label)).toContain(".get(");
+    expect(pythonOptions.map((option) => option.label)).toContain(
+      "isinstance("
+    );
+    expect(typescriptOptions.map((option) => option.label)).toContain("?.");
+    expect(typescriptOptions.map((option) => option.label)).toContain("typeof");
+  });
+});
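
The indexed labels asserted above (e.g. `output.items[0].name`) come from flattening each mapping-source field with index formatting enabled. A minimal Python sketch of that flattening, assuming dict keys join with `.` and list elements index with `[i]` (the real code delegates to `flattenObject` with `formatIndices: true`):

def flatten_paths(value, prefix=""):
    # Emits every non-terminal and terminal path, matching the test expectations.
    paths = []
    if isinstance(value, dict):
        for key, child in value.items():
            path = f"{prefix}.{key}" if prefix else key
            paths.append(path)
            paths.extend(flatten_paths(child, path))
    elif isinstance(value, list):
        for i, child in enumerate(value):
            path = f"{prefix}[{i}]"
            paths.append(path)
            paths.extend(flatten_paths(child, path))
    return paths

# flatten_paths({"output": {"items": [{"name": "alpha"}]}})
# -> ["output", "output.items", "output.items[0]", "output.items[0].name"]
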
"@codemirror/autocomplete"; + +import type { EvaluatorMappingSource } from "@phoenix/types"; +import { flattenObject } from "@phoenix/utils/jsonUtils"; + +/** + * Generates a human-readable type description for a value. + */ +function getTypeDescription(value: unknown): string { + if (value === null) return "null"; + if (value === undefined) return "undefined"; + if (Array.isArray(value)) { + if (value.length === 0) return "array (empty)"; + return `array (${value.length} items)`; + } + if (typeof value === "object") { + const keys = Object.keys(value as Record); + if (keys.length <= 3) return `object { ${keys.join(", ")} }`; + return `object (${keys.length} keys)`; + } + if (typeof value === "string") { + if (value.length > 30) return `string: "${value.slice(0, 30)}..."`; + return `string: "${value}"`; + } + if (typeof value === "number") return `number: ${value}`; + if (typeof value === "boolean") return `boolean: ${value}`; + return typeof value; +} + +/** + * Creates autocomplete options from the evaluator mapping source. + */ +export function createCompletionOptions({ + mappingSource, + language, +}: { + mappingSource: EvaluatorMappingSource; + language: "PYTHON" | "TYPESCRIPT"; +}): Completion[] { + const options: Completion[] = []; + + // Add top-level parameter completions + const topLevelParams = [ + { + name: "output", + data: mappingSource.output, + info: "The output from the task being evaluated", + }, + { + name: "reference", + data: mappingSource.reference, + info: "The expected/reference output from the dataset", + }, + { + name: "input", + data: mappingSource.input, + info: "The input provided to the task", + }, + { + name: "metadata", + data: mappingSource.metadata, + info: "Additional metadata from the dataset", + }, + ]; + + for (const { name, data, info } of topLevelParams) { + options.push({ + label: name, + type: "variable", + info, + boost: 10, // Boost top-level params + }); + + // Add nested field completions + if (data && typeof data === "object" && Object.keys(data).length > 0) { + const flattened = flattenObject({ + obj: data as Record, + parentKey: name, + keepNonTerminalValues: true, + formatIndices: true, + }) as Record; + for (const [path, value] of Object.entries(flattened)) { + options.push({ + label: path, + type: "property", + info: getTypeDescription(value), + boost: 5, + }); + } + } + } + + // Add language-specific helper completions + if (language === "PYTHON") { + options.push( + { + label: ".get(", + type: "method", + info: "Safely get a dict value with optional default", + apply: '.get("key", "")', + boost: 3, + }, + { + label: "isinstance(", + type: "function", + info: "Check if value is an instance of a type", + apply: "isinstance(output, dict)", + boost: 2, + } + ); + } else { + options.push( + { + label: "?.", + type: "keyword", + info: "Optional chaining operator", + boost: 3, + }, + { + label: "typeof", + type: "keyword", + info: "Check the type of a value", + apply: 'typeof output?.field === "string"', + boost: 2, + } + ); + } + + return options; +} + +/** + * Creates a completion function for the code evaluator editor. 
+
+/**
+ * Creates a completion function for the code evaluator editor.
+ */
+function createEvaluatorCompletions(
+  mappingSource: EvaluatorMappingSource,
+  language: "PYTHON" | "TYPESCRIPT"
+): (context: CompletionContext) => CompletionResult | null {
+  return (context: CompletionContext): CompletionResult | null => {
+    // Match word characters and dots (for nested access like output.answer)
+    const word = context.matchBefore(/[\w.?]*/);
+    if (!word) return null;
+
+    // Don't autocomplete if we're not at a word boundary or explicit
+    if (word.from === word.to && !context.explicit) return null;
+
+    const options = createCompletionOptions({ mappingSource, language });
+
+    // Filter options based on what's typed
+    const typed = word.text.toLowerCase();
+    const filteredOptions = typed
+      ? options.filter((opt) => opt.label.toLowerCase().includes(typed))
+      : options;
+
+    if (filteredOptions.length === 0) return null;
+
+    return {
+      from: word.from,
+      options: filteredOptions,
+      validFor: /^[\w.?]*$/,
+    };
+  };
+}
+
+/**
+ * Creates the autocompletion extension for the code evaluator editor.
+ */
+export function createEvaluatorAutocompletion(
+  mappingSource: EvaluatorMappingSource,
+  language: "PYTHON" | "TYPESCRIPT"
+) {
+  return autocompletion({
+    override: [createEvaluatorCompletions(mappingSource, language)],
+    activateOnTyping: true,
+    maxRenderedOptions: 50,
+  });
+}
diff --git a/app/src/components/evaluators/codeEvaluatorTypeGeneration.ts b/app/src/components/evaluators/codeEvaluatorTypeGeneration.ts
new file mode 100644
index 00000000000..db4b0d17760
--- /dev/null
+++ b/app/src/components/evaluators/codeEvaluatorTypeGeneration.ts
@@ -0,0 +1,253 @@
+import type { EvaluatorMappingSource } from "@phoenix/types";
+
+/**
+ * Infers a TypeScript type string from a JavaScript value.
+ */
+function inferTypeFromValue(value: unknown, indent = 0): string {
+  const spaces = " ".repeat(indent);
+
+  if (value === null) {
+    return "null";
+  }
+
+  if (value === undefined) {
+    return "undefined";
+  }
+
+  if (typeof value === "string") {
+    return "string";
+  }
+
+  if (typeof value === "number") {
+    return "number";
+  }
+
+  if (typeof value === "boolean") {
+    return "boolean";
+  }
+
+  if (Array.isArray(value)) {
+    if (value.length === 0) {
+      return "unknown[]";
+    }
+    // Infer type from first element
+    const elementType = inferTypeFromValue(value[0], indent);
+    return `${elementType}[]`;
+  }
+
+  if (typeof value === "object") {
+    const entries = Object.entries(value as Record<string, unknown>);
+    if (entries.length === 0) {
+      return "Record<string, unknown>";
+    }
+
+    const innerSpaces = " ".repeat(indent + 1);
+    const properties = entries
+      .map(([key, val]) => {
+        const safeKey = /^[a-zA-Z_$][a-zA-Z0-9_$]*$/.test(key)
+          ? key
+          : `"${key}"`;
+        return `${innerSpaces}${safeKey}: ${inferTypeFromValue(val, indent + 1)};`;
+      })
+      .join("\n");
+
+    return `{\n${properties}\n${spaces}}`;
+  }
+
+  return "unknown";
+}
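
A worked example for `inferTypeFromValue` (whitespace approximate; one space per indent level, per the `spaces`/`innerSpaces` arithmetic above):

# inferTypeFromValue({"answer": "Paris", "tags": ["x"], "nested": {"score": 1}})
# returns the type body:
#   {
#    answer: string;
#    tags: string[];
#    nested: {
#     score: number;
#    };
#   }
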
+
+/**
+ * Generates TypeScript interface definitions from the evaluator mapping source.
+ * Returns a read-only footer block to append to the code editor.
+ */
+export function generateTypeScriptTypes(
+  mappingSource: EvaluatorMappingSource
+): string {
+  const lines: string[] = [
+    "// Auto-generated types from dataset example (read-only)",
+    "// These types reflect the structure of your dataset",
+  ];
+
+  // Generate type for each mapping source field if it has data
+  const fields: Array<{ name: string; typeName: string; data: unknown }> = [
+    { name: "input", typeName: "Input", data: mappingSource.input },
+    { name: "output", typeName: "Output", data: mappingSource.output },
+    { name: "reference", typeName: "Reference", data: mappingSource.reference },
+    { name: "metadata", typeName: "Metadata", data: mappingSource.metadata },
+  ];
+
+  for (const { typeName, data } of fields) {
+    if (data && typeof data === "object" && Object.keys(data).length > 0) {
+      const typeBody = inferTypeFromValue(data, 0);
+      lines.push(`type ${typeName} = ${typeBody};`);
+      lines.push("");
+    }
+  }
+
+  // Add the EvaluatorParams type that combines available fields
+  const availableFields = fields.filter(
+    (f) =>
+      f.data && typeof f.data === "object" && Object.keys(f.data).length > 0
+  );
+
+  if (availableFields.length > 0) {
+    lines.push("type EvaluatorParams = {");
+    for (const { name, typeName } of availableFields) {
+      lines.push(`  ${name}?: ${typeName};`);
+    }
+    lines.push("};");
+    lines.push("");
+  }
+
+  return lines.join("\n");
+}
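
For a dataset example where only `output` and `reference` carry data (say, both `{"answer": "Paris"}`), the emitted footer is roughly:

# // Auto-generated types from dataset example (read-only)
# // These types reflect the structure of your dataset
# type Output = {
#  answer: string;
# };
#
# type Reference = {
#  answer: string;
# };
#
# type EvaluatorParams = {
#   output?: Output;
#   reference?: Reference;
# };

This `EvaluatorParams` type is what the refreshed TYPESCRIPT default template below references in its destructured parameter.
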
+
+/**
+ * Infers a Python type hint string from a JavaScript value.
+ */
+function inferPythonTypeFromValue(value: unknown): string {
+  if (value === null || value === undefined) {
+    return "None";
+  }
+
+  if (typeof value === "string") {
+    return "str";
+  }
+
+  if (typeof value === "number") {
+    return Number.isInteger(value) ? "int" : "float";
+  }
+
+  if (typeof value === "boolean") {
+    return "bool";
+  }
+
+  if (Array.isArray(value)) {
+    if (value.length === 0) {
+      return "list";
+    }
+    const elementType = inferPythonTypeFromValue(value[0]);
+    return `list[${elementType}]`;
+  }
+
+  if (typeof value === "object") {
+    const entries = Object.entries(value as Record<string, unknown>);
+    if (entries.length === 0) {
+      return "dict";
+    }
+    // For complex objects, use TypedDict representation in docstring
+    return "dict";
+  }
+
+  return "Any";
+}
+
+/**
+ * Generates a formatted dict structure description for Python docstrings.
+ */
+function formatPythonDictStructure(
+  data: Record<string, unknown>,
+  indent = 0
+): string[] {
+  const lines: string[] = [];
+  const spaces = " ".repeat(indent);
+
+  for (const [key, value] of Object.entries(data)) {
+    if (value && typeof value === "object" && !Array.isArray(value)) {
+      lines.push(`${spaces}"${key}": {`);
+      lines.push(
+        ...formatPythonDictStructure(
+          value as Record<string, unknown>,
+          indent + 1
+        )
+      );
+      lines.push(`${spaces}}`);
+    } else if (
+      Array.isArray(value) &&
+      value.length > 0 &&
+      typeof value[0] === "object"
+    ) {
+      lines.push(`${spaces}"${key}": [`);
+      lines.push(`${spaces} {`);
+      lines.push(
+        ...formatPythonDictStructure(
+          value[0] as Record<string, unknown>,
+          indent + 2
+        )
+      );
+      lines.push(`${spaces} }`);
+      lines.push(`${spaces}]`);
+    } else {
+      const typeHint = inferPythonTypeFromValue(value);
+      lines.push(`${spaces}"${key}": ${typeHint}`);
+    }
+  }
+
+  return lines;
+}
+
+/**
+ * Generates Python docstring/type hints from the evaluator mapping source.
+ * Returns a read-only footer block to append to the code editor.
+ */
+export function generatePythonTypes(
+  mappingSource: EvaluatorMappingSource
+): string {
+  const lines: string[] = [
+    '"""',
+    "Auto-generated type information from dataset example (read-only)",
+    "These types reflect the structure of your dataset",
+  ];
+
+  const fields: Array<{ name: string; data: unknown }> = [
+    { name: "input", data: mappingSource.input },
+    { name: "output", data: mappingSource.output },
+    { name: "reference", data: mappingSource.reference },
+    { name: "metadata", data: mappingSource.metadata },
+  ];
+
+  for (const { name, data } of fields) {
+    if (data && typeof data === "object" && Object.keys(data).length > 0) {
+      lines.push(`${name}: dict`);
+      const structureLines = formatPythonDictStructure(
+        data as Record<string, unknown>,
+        1
+      );
+      if (structureLines.length > 0) {
+        lines.push("  {");
+        lines.push(...structureLines);
+        lines.push("  }");
+      }
+      lines.push("");
+    }
+  }
+
+  lines.push('"""');
+  lines.push("");
+
+  return lines.join("\n");
+}
+
+/**
+ * Generates type definitions based on language.
+ */
+export function generateEvaluatorTypes(
+  language: "PYTHON" | "TYPESCRIPT",
+  mappingSource: EvaluatorMappingSource
+): string {
+  // Only generate types if there's meaningful data
+  const hasData = Object.values(mappingSource).some(
+    (data) => data && typeof data === "object" && Object.keys(data).length > 0
+  );
+
+  if (!hasData) {
+    return "";
+  }
+
+  if (language === "PYTHON") {
+    return generatePythonTypes(mappingSource);
+  }
+
+  return generateTypeScriptTypes(mappingSource);
+}
diff --git a/app/src/components/evaluators/codeEvaluatorUtils.ts b/app/src/components/evaluators/codeEvaluatorUtils.ts
index 4c4903a2750..a92df4bbe52 100644
--- a/app/src/components/evaluators/codeEvaluatorUtils.ts
+++ b/app/src/components/evaluators/codeEvaluatorUtils.ts
@@ -5,11 +5,16 @@ export const DEFAULT_CODE_EVALUATOR_SOURCE: Record<
   string
 > = {
   PYTHON: `def evaluate(output, reference=None, input=None, metadata=None):
+    """
+    Evaluate the output against the reference.
+    See the auto-generated type information below for the structure of each parameter.
+    """
     candidate = output.get("answer", "") if isinstance(output, dict) else ""
     expected = reference.get("answer", "") if isinstance(reference, dict) else ""
     return 1 if candidate == expected else 0
 `,
-  TYPESCRIPT: `function evaluate({ output, reference }: { output?: Record<string, unknown>; reference?: Record<string, unknown> }) {
+  TYPESCRIPT: `function evaluate({ output, reference, input, metadata }: EvaluatorParams) {
+  // See the auto-generated type definitions below for the structure of each parameter.
   const candidate = typeof output?.answer === "string" ? output.answer : "";
   const expected = typeof reference?.answer === "string" ? reference.answer : "";
   return candidate === expected ? 1 : 0;
@@ -65,17 +70,12 @@ function extractTypeScriptVariables(sourceCode: string) {
       .map((part) => part.trim())
       .filter(Boolean)
       .map((part) => part.split(":")[0]?.trim() ?? "")
+      .map((part) => part.split("=")[0]?.trim() ?? "")
+      .map((part) => part.replace(/\?$/, "").trim())
       .filter(Boolean)
       .filter(unique);
   }
-  const firstParam = params.split(",")[0]?.trim() ?? "";
-  const paramName = firstParam.split(":")[0]?.trim() ?? 
""; - if (!paramName) { - return []; - } - const accessPattern = new RegExp(`${paramName}\\.([a-zA-Z_$][\\w$]*)`, "g"); - const matches = sourceCode.matchAll(accessPattern); - return Array.from(matches, (match) => match[1]).filter(unique); + return []; } function unique(value: string, index: number, values: string[]) { diff --git a/app/src/pages/dataset/evaluators/CodeDatasetEvaluatorDetails.tsx b/app/src/pages/dataset/evaluators/CodeDatasetEvaluatorDetails.tsx index 68d2873300c..b621625b279 100644 --- a/app/src/pages/dataset/evaluators/CodeDatasetEvaluatorDetails.tsx +++ b/app/src/pages/dataset/evaluators/CodeDatasetEvaluatorDetails.tsx @@ -7,7 +7,7 @@ import { graphql } from "relay-runtime"; import { Flex, Heading, Text } from "@phoenix/components"; import { JSONBlock } from "@phoenix/components/code"; import { EditCodeDatasetEvaluatorSlideover } from "@phoenix/components/dataset/EditCodeDatasetEvaluatorSlideover"; -import { CodeEvaluatorSourceCodeBlock } from "@phoenix/components/evaluators/EditCodeEvaluatorDialogContent"; +import { CodeEvaluatorSourceCodeBlock } from "@phoenix/components/evaluators/CodeEvaluatorSourceCodeBlock"; import type { CodeDatasetEvaluatorDetails_datasetEvaluator$key } from "@phoenix/pages/dataset/evaluators/__generated__/CodeDatasetEvaluatorDetails_datasetEvaluator.graphql"; const boxCSS = css` diff --git a/app/src/pages/dataset/evaluators/DatasetEvaluatorActionMenu.tsx b/app/src/pages/dataset/evaluators/DatasetEvaluatorActionMenu.tsx index 19f35a26bca..74b0a8c2adc 100644 --- a/app/src/pages/dataset/evaluators/DatasetEvaluatorActionMenu.tsx +++ b/app/src/pages/dataset/evaluators/DatasetEvaluatorActionMenu.tsx @@ -12,6 +12,7 @@ import { Text, } from "@phoenix/components"; import { EditBuiltInDatasetEvaluatorSlideover } from "@phoenix/components/dataset/EditBuiltInDatasetEvaluatorSlideover"; +import { EditCodeDatasetEvaluatorSlideover } from "@phoenix/components/dataset/EditCodeDatasetEvaluatorSlideover"; import { EditLLMDatasetEvaluatorSlideover } from "@phoenix/components/dataset/EditLLMDatasetEvaluatorSlideover"; import { StopPropagation } from "@phoenix/components/StopPropagation"; @@ -91,6 +92,14 @@ export function DatasetEvaluatorActionMenu({ onOpenChange={setIsEditDialogOpen} updateConnectionIds={updateConnectionIds} /> + ) : evaluatorKind === "CODE" ? ( + ) : ( + isGraphQLMutationResponse( + response, + "SandboxConfigDialogCreateSandboxConfigMutation" + ) + ), + page.getByRole("button", { name: "Create Config" }).click(), + ]); + + await expect(page.getByRole("dialog")).not.toBeVisible(); + await expect(page.getByText(configName, { exact: true })).toBeVisible(); +} + +async function selectComboboxOption( + page: Page, + label: string, + optionName: string, + container?: Locator +) { + const scope = container ?? 
page; + const combobox = scope.getByRole("combobox", { name: label }); + await combobox.click(); + await combobox.fill(optionName); + await page.getByRole("option", { name: optionName, exact: true }).click(); + await expect(combobox).toHaveValue(optionName); +} + +async function selectLanguage( + page: Page, + container: Locator, + language: "Python" | "TypeScript" +) { + const languageField = container + .getByText("Language", { exact: true }) + .locator(".."); + await languageField.getByRole("button").click(); + await page.getByRole("option", { name: language, exact: true }).click(); + await expect(languageField.getByRole("button")).toHaveText(language); +} + +async function openEvaluatorEditor(page: Page, evaluatorName: string) { + const evaluatorRow = page.getByRole("row").filter({ + has: page.getByRole("cell", { name: evaluatorName, exact: true }), + }); + + await evaluatorRow.getByRole("button").last().click(); + await page.getByRole("menuitem", { name: "Edit" }).click(); + await expect( + page.getByRole("heading", { name: "Edit Evaluator" }) + ).toBeVisible(); +} + +async function createCustomCodeEvaluator({ + page, + evaluatorName, + language, + sandboxName, + description, +}: { + page: Page; + evaluatorName: string; + language: "Python" | "TypeScript"; + sandboxName?: string; + description?: string; +}) { + await page.getByRole("button", { name: "Add evaluator" }).click(); + await page + .getByRole("menuitem", { name: "Create new code evaluator" }) + .click(); + + const dialog = page.getByRole("dialog"); + await expect( + page.getByRole("heading", { name: "Create Evaluator" }) + ).toBeVisible(); + + await dialog + .getByRole("textbox", { name: "Name", exact: true }) + .fill(evaluatorName); + + if (description) { + await dialog + .getByRole("textbox", { name: /Description/i }) + .fill(description); + } + + if (language === "TypeScript") { + await selectLanguage(page, dialog, "TypeScript"); + } + + if (sandboxName) { + await selectComboboxOption(page, "Sandbox", sandboxName, dialog); + } + + await page.getByRole("button", { name: "Create" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); +} + +async function expectEvaluatorDetailsPage(page: Page, evaluatorName: string) { + await page.getByRole("link", { name: evaluatorName, exact: true }).click(); + await page.waitForURL("**/evaluators/**"); + await expect( + page.getByRole("heading", { name: evaluatorName }) + ).toBeVisible(); +} + +test.describe.serial("Code Evaluators", () => { + const datasetName = `code-evals-${randomUUID().slice(0, 8)}`; + const pythonSandboxName = `python-sandbox-${randomUUID().slice(0, 8)}`; + const typeScriptSandboxName = `ts-sandbox-${randomUUID().slice(0, 8)}`; + const pythonEvaluatorName = `python-code-eval-${randomUUID().slice(0, 8)}`; + const updatedPythonEvaluatorName = `updated-python-code-eval-${randomUUID().slice(0, 8)}`; + const typeScriptEvaluatorName = `typescript-code-eval-${randomUUID().slice(0, 8)}`; + + test("can create prerequisites for code evaluator flows", async ({ + page, + }) => { + await ensureSandboxConfig(page, "Python", pythonSandboxName); + await ensureSandboxConfig(page, "TypeScript", typeScriptSandboxName); + await createDatasetWithExample(page, datasetName); + await gotoDatasetEvaluators(page, datasetName); + + await expect( + page.getByRole("tab", { name: /Evaluators/i }) + ).toHaveAttribute("aria-selected", "true"); + }); + + test("can create and render a Python code evaluator", async ({ page }) => { + await gotoDatasetEvaluators(page, datasetName); + 
+ await createCustomCodeEvaluator({ + page, + evaluatorName: pythonEvaluatorName, + language: "Python", + sandboxName: pythonSandboxName, + }); + + await expect( + page.getByRole("cell", { name: pythonEvaluatorName, exact: true }) + ).toBeVisible(); + + await expectEvaluatorDetailsPage(page, pythonEvaluatorName); + }); + + test("can create and render a TypeScript code evaluator", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + + await createCustomCodeEvaluator({ + page, + evaluatorName: typeScriptEvaluatorName, + language: "TypeScript", + sandboxName: typeScriptSandboxName, + }); + + await expect( + page.getByRole("cell", { name: typeScriptEvaluatorName, exact: true }) + ).toBeVisible(); + + await expectEvaluatorDetailsPage(page, typeScriptEvaluatorName); + }); + + test("can edit a custom Python code evaluator", async ({ page }) => { + await gotoDatasetEvaluators(page, datasetName); + await openEvaluatorEditor(page, pythonEvaluatorName); + + const dialog = page.getByRole("dialog"); + const nameInput = dialog.getByRole("textbox", { + name: "Name", + exact: true, + }); + await expect(nameInput).toHaveValue(pythonEvaluatorName); + + await nameInput.fill(updatedPythonEvaluatorName); + await page.getByRole("button", { name: "Update" }).click(); + + await expect(page.getByTestId("dialog")).not.toBeVisible(); + await expect( + page.getByRole("cell", { name: updatedPythonEvaluatorName, exact: true }) + ).toBeVisible(); + + await openEvaluatorEditor(page, updatedPythonEvaluatorName); + await expect( + page + .getByRole("dialog") + .getByRole("textbox", { name: "Name", exact: true }) + ).toHaveValue(updatedPythonEvaluatorName); + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + + await expectEvaluatorDetailsPage(page, updatedPythonEvaluatorName); + }); + + test("submits a cleared sandbox when switching back to the original language", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + await openEvaluatorEditor(page, updatedPythonEvaluatorName); + + const dialog = page.getByRole("dialog"); + const sandboxCombobox = dialog.getByRole("combobox", { name: "Sandbox" }); + + await expect(sandboxCombobox).toHaveValue(pythonSandboxName); + + await selectLanguage(page, dialog, "TypeScript"); + await selectComboboxOption(page, "Sandbox", typeScriptSandboxName, dialog); + await selectLanguage(page, dialog, "Python"); + + await expect(sandboxCombobox).toHaveValue(""); + + const updateCodeEvaluatorResponse = page.waitForResponse((response) => + isGraphQLMutationResponse( + response, + "EditCodeDatasetEvaluatorSlideover_updateCodeEvaluatorMutation" + ) + ); + + await page.getByRole("button", { name: "Update" }).click(); + + const response = await updateCodeEvaluatorResponse; + const requestBody = response.request().postDataJSON() as { + variables: { + input: { + sandboxConfigId?: string | null; + }; + }; + }; + + expect(requestBody.variables.input.sandboxConfigId).toBeNull(); + + await expect(page.getByTestId("dialog")).not.toBeVisible(); + + await expect( + page.getByRole("cell", { name: updatedPythonEvaluatorName, exact: true }) + ).toBeVisible(); + + await openEvaluatorEditor(page, updatedPythonEvaluatorName); + await expect( + page.getByRole("dialog").getByRole("combobox", { name: "Sandbox" }) + ).toHaveValue(""); + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + }); + + // Store names for additional test 
cases + const evaluatorWithDescriptionName = `eval-with-desc-${randomUUID().slice(0, 8)}`; + const evaluatorWithDescriptionDesc = "This evaluator checks output quality"; + const updatedDescription = "Updated description for testing"; + + test("can create code evaluator with description and verify it persists", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + + await createCustomCodeEvaluator({ + page, + evaluatorName: evaluatorWithDescriptionName, + language: "Python", + sandboxName: pythonSandboxName, + description: evaluatorWithDescriptionDesc, + }); + + await expect( + page.getByRole("cell", { + name: evaluatorWithDescriptionName, + exact: true, + }) + ).toBeVisible(); + + // Reopen editor and verify description persisted + await openEvaluatorEditor(page, evaluatorWithDescriptionName); + const dialog = page.getByRole("dialog"); + const descriptionInput = dialog.getByRole("textbox", { + name: /Description/i, + }); + await expect(descriptionInput).toHaveValue(evaluatorWithDescriptionDesc); + + // Update the description + await descriptionInput.fill(updatedDescription); + await page.getByRole("button", { name: "Update" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + + // Verify updated description persisted + await openEvaluatorEditor(page, evaluatorWithDescriptionName); + await expect( + page.getByRole("dialog").getByRole("textbox", { name: /Description/i }) + ).toHaveValue(updatedDescription); + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + }); + + test("cannot create code evaluator without selecting a sandbox", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + + await page.getByRole("button", { name: "Add evaluator" }).click(); + await page + .getByRole("menuitem", { name: "Create new code evaluator" }) + .click(); + + const dialog = page.getByRole("dialog"); + await expect( + page.getByRole("heading", { name: "Create Evaluator" }) + ).toBeVisible(); + + await dialog + .getByRole("textbox", { name: "Name", exact: true }) + .fill("test-no-sandbox-eval"); + + // Don't select a sandbox - verify sandbox field is empty + const sandboxCombobox = dialog.getByRole("combobox", { name: "Sandbox" }); + await expect(sandboxCombobox).toHaveValue(""); + + // Attempt to create - should show validation error + await page.getByRole("button", { name: "Create" }).click(); + + // Verify validation error is shown + await expect( + dialog.getByText("Please select a sandbox configuration.") + ).toBeVisible(); + + // Dialog should still be open (not created) + await expect(dialog).toBeVisible(); + + // Verify the error alert header is also shown + await expect( + dialog.getByRole("heading", { + name: "Invalid code evaluator configuration", + }) + ).toBeVisible(); + }); + + test("can open test evaluator section and verify UI elements", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + await openEvaluatorEditor(page, evaluatorWithDescriptionName); + + const dialog = page.getByRole("dialog"); + + // Ensure sandbox is set for testing + const sandboxCombobox = dialog.getByRole("combobox", { name: "Sandbox" }); + if ((await sandboxCombobox.inputValue()) === "") { + await selectComboboxOption(page, "Sandbox", pythonSandboxName, dialog); + } + + // Expand the Test Evaluator section + const testSectionTrigger = dialog.getByRole("button", { + name: "Test Evaluator", + }); + await testSectionTrigger.click(); + + // Verify the Test 
button is visible (exact match to avoid matching "Test Evaluator") + const testButton = dialog.getByRole("button", { + name: "Test", + exact: true, + }); + await expect(testButton).toBeVisible(); + + // Verify the test section description is visible + await expect( + dialog.getByText( + "Run your evaluator against the example data to verify it works correctly" + ) + ).toBeVisible(); + + // Note: Actually running the test requires a working sandbox backend. + // The dismiss button test is skipped as it depends on sandbox execution. + + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + }); + + const categoricalEvaluatorName = `categorical-eval-${randomUUID().slice(0, 8)}`; + + test("can configure categorical choices in code evaluator", async ({ + page, + }) => { + await gotoDatasetEvaluators(page, datasetName); + + await page.getByRole("button", { name: "Add evaluator" }).click(); + await page + .getByRole("menuitem", { name: "Create new code evaluator" }) + .click(); + + const dialog = page.getByRole("dialog"); + await expect( + page.getByRole("heading", { name: "Create Evaluator" }) + ).toBeVisible(); + + await dialog + .getByRole("textbox", { name: "Name", exact: true }) + .fill(categoricalEvaluatorName); + + await selectComboboxOption(page, "Sandbox", pythonSandboxName, dialog); + + // Expand Output Configuration section + const outputConfigTrigger = dialog.getByRole("button", { + name: "Output Configuration", + }); + // Click only if section is collapsed (check if panel is not visible) + const outputConfigPanel = dialog.getByText( + "Define the output type and optimization direction" + ); + if (!(await outputConfigPanel.isVisible())) { + await outputConfigTrigger.click(); + } + await expect(outputConfigPanel).toBeVisible(); + + // Change output type from Continuous to Categorical + const outputTypeSelect = dialog.getByRole("button", { + name: /Continuous score|Categorical label/, + }); + await outputTypeSelect.click(); + await page + .getByRole("option", { name: "Categorical label", exact: true }) + .click(); + + // Verify Choices section appears with default two choices + await expect(dialog.getByText("Choices", { exact: true })).toBeVisible(); + + // Fill in the first choice + const choiceInputs = dialog.locator('input[placeholder^="Choice"]'); + await expect(choiceInputs.first()).toBeVisible(); + await choiceInputs.first().fill("Good"); + + // Fill in the second choice + await choiceInputs.nth(1).fill("Bad"); + + // Add a third choice + await dialog.getByRole("button", { name: "+ Add choice" }).click(); + await choiceInputs.nth(2).fill("Neutral"); + + // Verify the third choice was added + await expect(choiceInputs).toHaveCount(3); + + // Remove the third choice using the aria-labeled button + const removeButtons = dialog.getByRole("button", { name: "Remove choice" }); + await expect(removeButtons).toHaveCount(3); + await removeButtons.last().click(); + + // Verify we're back to two choices + await expect(choiceInputs).toHaveCount(2); + + // Verify remove is disabled when only 2 choices remain + const remainingRemoveButtons = dialog.getByRole("button", { + name: "Remove choice", + }); + await expect(remainingRemoveButtons.first()).toBeDisabled(); + await expect(remainingRemoveButtons.last()).toBeDisabled(); + + // Create the evaluator + await page.getByRole("button", { name: "Create" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + + // Verify the evaluator was created + await expect( 
+ page.getByRole("cell", { name: categoricalEvaluatorName, exact: true }) + ).toBeVisible(); + + // Reopen and verify categorical config persisted + await openEvaluatorEditor(page, categoricalEvaluatorName); + await expect(dialog.getByText("Choices", { exact: true })).toBeVisible(); + await expect(choiceInputs.first()).toHaveValue("Good"); + await expect(choiceInputs.last()).toHaveValue("Bad"); + + await page.getByRole("button", { name: "Cancel" }).click(); + await expect(page.getByTestId("dialog")).not.toBeVisible(); + }); +}); diff --git a/src/phoenix/server/api/evaluators.py b/src/phoenix/server/api/evaluators.py index 5498685766d..040d4834309 100644 --- a/src/phoenix/server/api/evaluators.py +++ b/src/phoenix/server/api/evaluators.py @@ -1,3 +1,4 @@ +import ast import json import logging import re @@ -2328,6 +2329,156 @@ def _get_template_variables_attributes(*, variables: dict[str, Any]) -> dict[str return {TEMPLATE_VARIABLES: json.dumps(variables)} +def _make_object_input_schema( + parameter_names: Sequence[str], + required_names: Sequence[str], +) -> dict[str, Any]: + return { + "type": "object", + "properties": {name: {} for name in parameter_names}, + "required": list(required_names), + } + + +_SUPPORTED_CODE_EVALUATOR_INPUT_NAMES = ("output", "reference", "input", "metadata") + + +def _validate_code_evaluator_input_names( + parameter_names: Sequence[str], + *, + language: str, +) -> Optional[str]: + unsupported_names = [ + name for name in parameter_names if name not in _SUPPORTED_CODE_EVALUATOR_INPUT_NAMES + ] + if not unsupported_names: + return None + supported_names = ", ".join(f"`{name}`" for name in _SUPPORTED_CODE_EVALUATOR_INPUT_NAMES) + invalid_names = ", ".join(f"`{name}`" for name in unsupported_names) + return ( + f"Could not infer the {language} evaluator inputs because the `evaluate(...)` signature " + f"uses unsupported parameter names: {invalid_names}. Supported parameter names are " + f"{supported_names}." + ) + + +def _infer_python_evaluate_input_schema(source_code: str) -> tuple[dict[str, Any], Optional[str]]: + try: + module = ast.parse(source_code) + except SyntaxError as exc: + return ( + {}, + ( + "Could not parse the Python evaluator signature. " + "Define a top-level function like " + "`def evaluate(output, reference=None, input=None, metadata=None):`. " + f"Parser error: {exc.msg}" + ), + ) + + evaluate_function = next( + ( + node + for node in module.body + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == "evaluate" + ), + None, + ) + if evaluate_function is None: + return ( + {}, + ( + "Could not infer the Python evaluator inputs because no top-level " + "`evaluate(...)` function was found. Define a function like " + "`def evaluate(output, reference=None, input=None, metadata=None):`." 
+ ), + ) + + args = evaluate_function.args + positional_args = [*args.posonlyargs, *args.args] + positional_required_count = len(positional_args) - len(args.defaults) + required_names = [arg.arg for arg in positional_args[:positional_required_count]] + required_names.extend( + arg.arg for arg, default in zip(args.kwonlyargs, args.kw_defaults) if default is None + ) + + parameter_names = [arg.arg for arg in positional_args] + parameter_names.extend(arg.arg for arg in args.kwonlyargs) + + invalid_name_error = _validate_code_evaluator_input_names( + parameter_names, + language="Python", + ) + if invalid_name_error is not None: + return ({}, invalid_name_error) + + return (_make_object_input_schema(parameter_names, required_names), None) + + +_TYPESCRIPT_FUNCTION_SIGNATURE_RE = re.compile(r"function\s+evaluate\s*\(([^)]*)\)") +_TYPESCRIPT_ARROW_SIGNATURE_RE = re.compile(r"(?:const|let|var)\s+evaluate\s*=\s*\(([^)]*)\)\s*=>") + + +def _extract_typescript_object_parameter_keys(params: str) -> tuple[list[str], list[str]]: + destructured = re.match(r"^\{([^}]*)\}", params.strip()) + if destructured is None: + return ([], []) + + parameter_names: list[str] = [] + for raw_part in destructured.group(1).split(","): + part = raw_part.strip() + if not part: + continue + part = part.split(":", 1)[0].strip() + if not part: + continue + name = part.split("=", 1)[0].rstrip("?").strip() + if not name: + continue + parameter_names.append(name) + return (parameter_names, []) + + +def _infer_typescript_evaluate_input_schema( + source_code: str, +) -> tuple[dict[str, Any], Optional[str]]: + signature = _TYPESCRIPT_FUNCTION_SIGNATURE_RE.search( + source_code + ) or _TYPESCRIPT_ARROW_SIGNATURE_RE.search(source_code) + if signature is None: + return ( + {}, + ( + "Could not infer the TypeScript evaluator inputs because no supported " + "`evaluate(...)` signature was found. Define `evaluate` as either " + "`function evaluate({ output, reference, input, metadata }: " + "EvaluatorParams) { ... }` or `const evaluate = ({ output, " + "reference, input, metadata }: EvaluatorParams) => { ... }`." + ), + ) + + parameter_names, required_names = _extract_typescript_object_parameter_keys(signature.group(1)) + if not parameter_names: + return ( + {}, + ( + "Could not infer the TypeScript evaluator inputs from the `evaluate(...)` " + "signature. Use a destructured object parameter like " + "`function evaluate({ output, reference, input, metadata }: " + "EvaluatorParams) { ... }`." + ), + ) + + invalid_name_error = _validate_code_evaluator_input_names( + parameter_names, + language="TypeScript", + ) + if invalid_name_error is not None: + return ({}, invalid_name_error) + + return (_make_object_input_schema(parameter_names, required_names), None) + + class CodeEvaluatorRunner(BaseEvaluator): """ Evaluator that executes user-provided source code in a sandbox. 
@@ -2372,7 +2523,15 @@ def output_configs(self) -> Sequence[OutputConfigType]: @property def input_schema(self) -> dict[str, Any]: - return {} + schema, _ = self._infer_input_schema() + return schema + + def _infer_input_schema(self) -> tuple[dict[str, Any], Optional[str]]: + if self._language == "PYTHON": + return _infer_python_evaluate_input_schema(self._source_code) + if self._language == "TYPESCRIPT": + return _infer_typescript_evaluate_input_schema(self._source_code) + return ({}, None) def _build_python_harness(self, mapped_inputs: dict[str, Any]) -> str: """Wrap source_code in a Python script that calls evaluate(**inputs).""" @@ -2428,9 +2587,16 @@ async def evaluate( start_time = datetime.now(timezone.utc) + input_schema, inference_error = self._infer_input_schema() + if inference_error is not None: + return [ + self._make_error_result(name, inference_error, start_time) + for _ in (output_configs or [None]) # type: ignore[list-item] + ] + try: mapped_inputs = apply_input_mapping( - input_schema={}, + input_schema=input_schema, input_mapping=input_mapping, context=context, ) diff --git a/src/phoenix/server/api/input_types/EvaluatorPreviewInput.py b/src/phoenix/server/api/input_types/EvaluatorPreviewInput.py index 8f603661693..41504cd1893 100644 --- a/src/phoenix/server/api/input_types/EvaluatorPreviewInput.py +++ b/src/phoenix/server/api/input_types/EvaluatorPreviewInput.py @@ -11,6 +11,7 @@ from phoenix.server.api.input_types.GenerativeCredentialInput import GenerativeCredentialInput from phoenix.server.api.input_types.PlaygroundEvaluatorInput import EvaluatorInputMappingInput from phoenix.server.api.input_types.PromptVersionInput import ChatPromptVersionInput +from phoenix.server.api.types.SandboxConfig import Language @strawberry.input @@ -23,16 +24,29 @@ class InlineLLMEvaluatorInput: description: Optional[str] = None +@strawberry.input +class InlineCodeEvaluatorInput: + """Defines an inline code evaluator without requiring persistence.""" + + name: str + language: Language + source_code: str + output_configs: list[AnnotationConfigInput] + sandbox_config_id: Optional[GlobalID] = None + description: Optional[str] = None + + @strawberry.input(one_of=True) class EvaluatorPreviewInput: """ Input for previewing an evaluator. Either provide an existing evaluator ID - or an inline LLM evaluator definition. + or an inline evaluator definition. 
""" built_in_evaluator_id: Optional[GlobalID] = UNSET inline_llm_evaluator: Optional[InlineLLMEvaluatorInput] = UNSET code_evaluator_id: Optional[GlobalID] = UNSET + inline_code_evaluator: Optional[InlineCodeEvaluatorInput] = UNSET @strawberry.input diff --git a/src/phoenix/server/api/mutations/chat_mutations.py b/src/phoenix/server/api/mutations/chat_mutations.py index eeceb6ba0ad..11a71a2aa67 100644 --- a/src/phoenix/server/api/mutations/chat_mutations.py +++ b/src/phoenix/server/api/mutations/chat_mutations.py @@ -35,7 +35,8 @@ ) from phoenix.server.api.types.Evaluator import BuiltInEvaluator, CodeEvaluator from phoenix.server.api.types.ExperimentRunAnnotation import ExperimentRunAnnotation -from phoenix.server.api.types.node import from_global_id +from phoenix.server.api.types.node import from_global_id, from_global_id_with_expected_type +from phoenix.server.api.types.SandboxConfig import SandboxConfig from phoenix.server.api.types.Trace import Trace logger = logging.getLogger(__name__) @@ -91,6 +92,70 @@ def _to_evaluation_result( ) +async def _resolve_inline_code_evaluator_backend( + *, + info: Info[Context, None], + sandbox_config_id: Optional[strawberry.relay.GlobalID], + language: str, +) -> tuple[Any, Optional[int]]: + from phoenix.server.sandbox import get_or_create_backend + + if sandbox_config_id is None: + raise BadRequest( + f"No sandbox configuration selected for language '{language}'. " + "Choose a sandbox configuration before testing this evaluator." + ) + + sandbox_config_db_id = from_global_id_with_expected_type( + sandbox_config_id, SandboxConfig.__name__ + ) + + async with info.context.db() as session: + sandbox_cfg = await session.get(models.SandboxConfig, sandbox_config_db_id) + if sandbox_cfg is None: + raise BadRequest(f"Sandbox configuration with id {sandbox_config_id} was not found") + if not sandbox_cfg.enabled: + raise BadRequest( + ( + f"Sandbox configuration '{sandbox_cfg.name}' is disabled. Enable it before " + "testing this evaluator." + ) + ) + + sandbox_timeout = sandbox_cfg.timeout + provider = await session.get(models.SandboxProvider, sandbox_cfg.sandbox_provider_id) + if provider is None: + raise BadRequest( + f"Sandbox provider for configuration '{sandbox_cfg.name}' was not found" + ) + if not provider.enabled: + raise BadRequest( + ( + f"Sandbox provider '{provider.backend_type}' is disabled. Enable it before " + "testing this evaluator." + ) + ) + + provider_language_row = await session.get(models.Language, provider.language_id) + if provider_language_row is not None and provider_language_row.name != language: + raise BadRequest("Sandbox provider language does not match code evaluator language") + + merged_config = { + **provider.config, + **sandbox_cfg.config, + } + backend_type = provider.backend_type + sandbox_backend = get_or_create_backend(backend_type, config=merged_config) + + if sandbox_backend is None: + raise BadRequest( + f"Sandbox backend '{backend_type}' is unavailable for language '{language}'. " + "Ensure the backend is installed and configured." 
+ ) + + return sandbox_backend, sandbox_timeout + + @strawberry.type class ChatCompletionMutationMixin: @strawberry.mutation(permission_classes=[IsNotReadOnly, IsNotViewer, IsLocked]) # type: ignore @@ -278,7 +343,49 @@ async def evaluator_previews( for eval_result in eval_results: all_results.append(_to_evaluation_result(eval_result, eval_result["name"])) + elif inline_code_evaluator := evaluator_input.inline_code_evaluator: + from phoenix.server.api.evaluators import CodeEvaluatorRunner + + language = inline_code_evaluator.language.value + evaluator_name = inline_code_evaluator.name + evaluator_description = inline_code_evaluator.description + source_code = inline_code_evaluator.source_code + + # Convert output configs + output_configs = [ + c + for c in _convert_output_config_inputs_to_pydantic( + inline_code_evaluator.output_configs + ) + if isinstance(c, (CategoricalOutputConfig, ContinuousOutputConfig)) + ] + + sandbox_backend, sandbox_timeout = await _resolve_inline_code_evaluator_backend( + info=info, + sandbox_config_id=inline_code_evaluator.sandbox_config_id, + language=language, + ) + + runner = CodeEvaluatorRunner( + name=evaluator_name, + description=evaluator_description, + source_code=source_code, + stored_output_configs=output_configs, + sandbox_backend=sandbox_backend, + language=language, + timeout=sandbox_timeout, + ) + eval_results = await runner.evaluate( + context=context, + input_mapping=input_mapping.to_orm(), + name=evaluator_name, + output_configs=output_configs, + session_key="", + ) + for eval_result in eval_results: + all_results.append(_to_evaluation_result(eval_result, eval_result["name"])) + else: - raise BadRequest("Either evaluator_id or inline_llm_evaluator must be provided") + raise BadRequest("Either evaluator_id or inline evaluator must be provided") return EvaluatorPreviewsPayload(results=all_results) diff --git a/src/phoenix/server/api/mutations/evaluator_mutations.py b/src/phoenix/server/api/mutations/evaluator_mutations.py index dd630103536..ddaaa86436e 100644 --- a/src/phoenix/server/api/mutations/evaluator_mutations.py +++ b/src/phoenix/server/api/mutations/evaluator_mutations.py @@ -47,7 +47,7 @@ from phoenix.server.api.types.Identifier import Identifier from phoenix.server.api.types.node import from_global_id, from_global_id_with_expected_type from phoenix.server.api.types.PromptVersion import PromptVersion -from phoenix.server.api.types.SandboxConfig import Language +from phoenix.server.api.types.SandboxConfig import Language, SandboxConfig from phoenix.server.bearer_auth import PhoenixUser @@ -315,7 +315,7 @@ class CreateCodeEvaluatorInput: source_code: str language: Language description: Optional[str] = None - sandbox_config_id: Optional[int] = None + sandbox_config_id: Optional[GlobalID] = None output_configs: Optional[list[AnnotationConfigInput]] = None input_mapping: Optional[EvaluatorInputMappingInput] = None @@ -327,7 +327,7 @@ class UpdateCodeEvaluatorInput: source_code: Optional[str] = UNSET language: Optional[Language] = UNSET description: Optional[str] = UNSET - sandbox_config_id: Optional[int] = UNSET + sandbox_config_id: Optional[GlobalID] = UNSET output_configs: Optional[list[AnnotationConfigInput]] = UNSET input_mapping: Optional[EvaluatorInputMappingInput] = UNSET @@ -1184,9 +1184,13 @@ async def create_code_evaluator( if language_id is None: raise BadRequest(f"Unknown language: {input.language!r}") + sandbox_config_id = None if input.sandbox_config_id is not None: + sandbox_config_id = from_global_id_with_expected_type( + 
input.sandbox_config_id, SandboxConfig.__name__ + ) await _validate_language_matches_sandbox( - language_id, input.sandbox_config_id, session + language_id, sandbox_config_id, session ) row = models.CodeEvaluator( @@ -1194,7 +1198,7 @@ async def create_code_evaluator( description=input.description, source_code=input.source_code, language_id=language_id, - sandbox_config_id=input.sandbox_config_id, + sandbox_config_id=sandbox_config_id, output_configs=output_configs, input_mapping=input_mapping_orm, user_id=user_id, @@ -1248,7 +1252,13 @@ async def update_code_evaluator( row.description = input.description if input.sandbox_config_id is not UNSET: - row.sandbox_config_id = input.sandbox_config_id + row.sandbox_config_id = ( + None + if input.sandbox_config_id is None + else from_global_id_with_expected_type( + input.sandbox_config_id, SandboxConfig.__name__ + ) + ) if input.output_configs is not UNSET and input.output_configs is not None: row.output_configs = list( diff --git a/tests/unit/server/api/mutations/test_code_evaluator_sandbox_mutations.py b/tests/unit/server/api/mutations/test_code_evaluator_sandbox_mutations.py index 370223752e4..c5f57e21c6f 100644 --- a/tests/unit/server/api/mutations/test_code_evaluator_sandbox_mutations.py +++ b/tests/unit/server/api/mutations/test_code_evaluator_sandbox_mutations.py @@ -425,33 +425,63 @@ async def test_update_provider_not_found_returns_error( } """ +_CREATE_CODE_EVALUATOR = """ +mutation CreateCodeEvaluator($input: CreateCodeEvaluatorInput!) { + createCodeEvaluator(input: $input) { + evaluator { + id + ... on CodeEvaluator { + sandboxConfig { + id + } + } + } + } +} +""" + +_UPDATE_CODE_EVALUATOR = """ +mutation UpdateCodeEvaluator($input: UpdateCodeEvaluatorInput!) { + updateCodeEvaluator(input: $input) { + evaluator { + id + ... 
on CodeEvaluator { + sandboxConfig { + id + } + } + } + } +} +""" + + +async def _create_code_evaluator_with_config( + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, +) -> int: + """Insert a CodeEvaluator row linked to the given sandbox config.""" + async with db() as session: + code_eval = models.CodeEvaluator( + name=Identifier(root="test-disabled-guard-eval"), + description=None, + metadata_={}, + source_code="def evaluate(input): return {'score': 1.0}", + sandbox_config_id=sandbox_config.id, + ) + session.add(code_eval) + await session.flush() + return code_eval.id -class TestDisabledProviderAndConfigGuards: - async def _create_code_evaluator_with_config( - self, - db: DbSessionFactory, - sandbox_config: models.SandboxConfig, - ) -> int: - """Insert a CodeEvaluator row (joined-table inheritance) linked to the given sandbox config.""" - async with db() as session: - code_eval = models.CodeEvaluator( - name=Identifier(root="test-disabled-guard-eval"), - description=None, - metadata_={}, - source_code="def evaluate(input): return {'score': 1.0}", - sandbox_config_id=sandbox_config.id, - ) - session.add(code_eval) - await session.flush() - return code_eval.id +class TestDisabledProviderAndConfigGuards: async def test_disabled_provider_blocks_execution( self, gql_client: AsyncGraphQLClient, db: DbSessionFactory, sandbox_config: models.SandboxConfig, ) -> None: - evaluator_db_id = await self._create_code_evaluator_with_config(db, sandbox_config) + evaluator_db_id = await _create_code_evaluator_with_config(db, sandbox_config) evaluator_gid = str(GlobalID("CodeEvaluator", str(evaluator_db_id))) # Disable the provider via the updateSandboxProvider mutation @@ -486,13 +516,88 @@ async def test_disabled_provider_blocks_execution( ) assert result.errors + +class TestCodeEvaluatorSandboxMutationIds: + async def test_create_code_evaluator_accepts_sandbox_global_id( + self, + gql_client: AsyncGraphQLClient, + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, + ) -> None: + result = await gql_client.execute( + _CREATE_CODE_EVALUATOR, + variables={ + "input": { + "name": "test_code_evaluator", + "description": "uses relay id", + "language": "PYTHON", + "sourceCode": "def evaluate(output):\n return {'score': 1.0}", + "sandboxConfigId": _config_global_id(sandbox_config.id), + "outputConfigs": [ + { + "continuous": { + "name": "score", + "optimizationDirection": "NONE", + "lowerBound": 0, + "upperBound": 1, + } + } + ], + } + }, + ) + assert result.data and not result.errors + evaluator = result.data["createCodeEvaluator"]["evaluator"] + assert evaluator["sandboxConfig"]["id"] == _config_global_id(sandbox_config.id) + + evaluator_id = GlobalID.from_id(evaluator["id"]) + async with db() as session: + row = await session.get(models.CodeEvaluator, int(evaluator_id.node_id)) + assert row is not None + assert row.sandbox_config_id == sandbox_config.id + + async def test_update_code_evaluator_accepts_sandbox_global_id( + self, + gql_client: AsyncGraphQLClient, + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, + ) -> None: + async with db() as session: + code_eval = models.CodeEvaluator( + name=Identifier(root="test_update_code_evaluator"), + description=None, + metadata_={}, + source_code="def evaluate(output): return {'score': 0.0}", + ) + session.add(code_eval) + await session.flush() + evaluator_gid = str(GlobalID("CodeEvaluator", str(code_eval.id))) + + result = await gql_client.execute( + _UPDATE_CODE_EVALUATOR, + variables={ + "input": { + "id": evaluator_gid, + 
"sandboxConfigId": _config_global_id(sandbox_config.id), + } + }, + ) + assert result.data and not result.errors + evaluator = result.data["updateCodeEvaluator"]["evaluator"] + assert evaluator["sandboxConfig"]["id"] == _config_global_id(sandbox_config.id) + + async with db() as session: + row = await session.get(models.CodeEvaluator, code_eval.id) + assert row is not None + assert row.sandbox_config_id == sandbox_config.id + async def test_disabled_config_blocks_execution( self, gql_client: AsyncGraphQLClient, db: DbSessionFactory, sandbox_config: models.SandboxConfig, ) -> None: - evaluator_db_id = await self._create_code_evaluator_with_config(db, sandbox_config) + evaluator_db_id = await _create_code_evaluator_with_config(db, sandbox_config) evaluator_gid = str(GlobalID("CodeEvaluator", str(evaluator_db_id))) # Disable the sandbox config via the mutation diff --git a/tests/unit/server/api/mutations/test_evaluator_preview_mutation.py b/tests/unit/server/api/mutations/test_evaluator_preview_mutation.py index adc0b4296b4..c67a0c5bf2f 100644 --- a/tests/unit/server/api/mutations/test_evaluator_preview_mutation.py +++ b/tests/unit/server/api/mutations/test_evaluator_preview_mutation.py @@ -1,10 +1,12 @@ from typing import Any +from unittest.mock import AsyncMock, patch import pytest from sqlalchemy import select from strawberry.relay.types import GlobalID from phoenix.db import models +from phoenix.server.sandbox.types import ExecutionResult from phoenix.server.types import DbSessionFactory from tests.unit.graphql import AsyncGraphQLClient @@ -150,3 +152,167 @@ async def test_preview_requires_evaluator_or_inline( ) assert result.errors is not None + + +class TestInlineCodeEvaluatorPreviewMutation: + async def _preview_inline_code_evaluator( + self, + gql_client: AsyncGraphQLClient, + *, + sandbox_config_id: str | None, + language: str = "PYTHON", + source_code: str = "def evaluate(output):\n return 1.0", + ) -> Any: + return await gql_client.execute( + TestEvaluatorPreviewMutation._MUTATION, + { + "input": { + "previews": [ + { + "evaluator": { + "inlineCodeEvaluator": { + "name": "inline_code_eval", + "description": "preview", + "language": language, + "sourceCode": source_code, + "sandboxConfigId": sandbox_config_id, + "outputConfigs": [ + { + "continuous": { + "name": "score", + "optimizationDirection": "NONE", + "lowerBound": 0, + "upperBound": 1, + } + } + ], + } + }, + "context": {"output": {"answer": "4"}}, + "inputMapping": {}, + } + ] + } + }, + ) + + async def test_requires_sandbox_config_selection( + self, + gql_client: AsyncGraphQLClient, + ) -> None: + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=None, + ) + + assert result.errors is not None + assert "No sandbox configuration selected" in result.errors[0].message + + async def test_rejects_wrong_global_id_type( + self, + gql_client: AsyncGraphQLClient, + sandbox_config: models.SandboxConfig, + ) -> None: + wrong_type_id = str(GlobalID("SandboxProvider", str(sandbox_config.id))) + + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=wrong_type_id, + ) + + assert result.errors is not None + assert "SandboxConfig" in result.errors[0].message + + async def test_rejects_missing_sandbox_config( + self, + gql_client: AsyncGraphQLClient, + ) -> None: + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", "999999")), + ) + + assert result.errors is not None + assert "was not found" in 
result.errors[0].message + + async def test_rejects_disabled_sandbox_config( + self, + gql_client: AsyncGraphQLClient, + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, + ) -> None: + async with db() as session: + row = await session.get(models.SandboxConfig, sandbox_config.id) + assert row is not None + row.enabled = False + await session.commit() + + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", str(sandbox_config.id))), + ) + + assert result.errors is not None + assert "is disabled" in result.errors[0].message + + async def test_rejects_disabled_sandbox_provider( + self, + gql_client: AsyncGraphQLClient, + db: DbSessionFactory, + sandbox_config: models.SandboxConfig, + ) -> None: + async with db() as session: + provider = await session.get(models.SandboxProvider, sandbox_config.sandbox_provider_id) + assert provider is not None + provider.enabled = False + await session.commit() + + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", str(sandbox_config.id))), + ) + + assert result.errors is not None + assert "Sandbox provider" in result.errors[0].message + assert "is disabled" in result.errors[0].message + + async def test_rejects_language_mismatch( + self, + gql_client: AsyncGraphQLClient, + sandbox_config: models.SandboxConfig, + ) -> None: + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", str(sandbox_config.id))), + language="TYPESCRIPT", + source_code="function evaluate({ output }: EvaluatorParams) { return 1; }", + ) + + assert result.errors is not None + assert "language does not match" in result.errors[0].message + + async def test_returns_preview_result_for_valid_inline_code_evaluator( + self, + gql_client: AsyncGraphQLClient, + sandbox_config: models.SandboxConfig, + ) -> None: + backend = AsyncMock() + backend.execute = AsyncMock( + return_value=ExecutionResult(stdout="1.0", stderr="", error=None) + ) + + with patch( + "phoenix.server.sandbox.get_or_create_backend", + return_value=backend, + ): + result = await self._preview_inline_code_evaluator( + gql_client, + sandbox_config_id=str(GlobalID("SandboxConfig", str(sandbox_config.id))), + ) + + assert result.data and not result.errors + results = result.data["evaluatorPreviews"]["results"] + assert len(results) == 1 + assert results[0]["evaluatorName"] == "inline_code_eval" + assert results[0]["error"] is None + assert results[0]["annotation"]["score"] == 1.0 diff --git a/tests/unit/server/api/test_code_evaluator_runner.py b/tests/unit/server/api/test_code_evaluator_runner.py index 0e923e521c5..05b36a804a8 100644 --- a/tests/unit/server/api/test_code_evaluator_runner.py +++ b/tests/unit/server/api/test_code_evaluator_runner.py @@ -111,6 +111,90 @@ def test_typescript_harness_contains_json_stringify(self) -> None: assert "JSON.stringify" in harness +class TestInputSchemaInference: + def test_python_input_schema_infers_top_level_parameters(self) -> None: + runner, _ = _make_runner( + source_code=( + "def evaluate(output, reference=None, input=None, *, metadata=None):\n" + " return 1\n" + ) + ) + + assert runner.input_schema == { + "type": "object", + "properties": { + "output": {}, + "reference": {}, + "input": {}, + "metadata": {}, + }, + "required": ["output"], + } + + def test_typescript_input_schema_infers_destructured_parameters(self) -> None: + runner, _ = _make_runner( + source_code=( + "function evaluate({ 
output, reference, input, metadata }: EvaluatorParams) " + "{ return 1; }" + ), + language="TYPESCRIPT", + ) + + assert runner.input_schema == { + "type": "object", + "properties": { + "output": {}, + "reference": {}, + "input": {}, + "metadata": {}, + }, + "required": [], + } + + def test_python_input_schema_returns_error_when_evaluate_is_missing(self) -> None: + runner, _ = _make_runner(source_code="def not_evaluate(output):\n return 1\n") + + schema, error = runner._infer_input_schema() + assert schema == {} + assert error is not None + assert "no top-level `evaluate(...)` function was found" in error + + def test_typescript_input_schema_returns_error_for_non_destructured_signature(self) -> None: + runner, _ = _make_runner( + source_code="function evaluate(output: EvaluatorParams) { return 1; }", + language="TYPESCRIPT", + ) + + schema, error = runner._infer_input_schema() + assert schema == {} + assert error is not None + assert "Use a destructured object parameter" in error + + def test_python_input_schema_returns_error_for_unsupported_parameter_names(self) -> None: + runner, _ = _make_runner( + source_code="def evaluate(outputs, reference=None):\n return 1\n" + ) + + schema, error = runner._infer_input_schema() + assert schema == {} + assert error is not None + assert "unsupported parameter names: `outputs`" in error + + def test_typescript_input_schema_returns_error_for_unsupported_parameter_names(self) -> None: + runner, _ = _make_runner( + source_code=( + "function evaluate({ outputs, reference, input, metadata }: EvaluatorParams) " + "{ return 1; }" + ), + language="TYPESCRIPT", + ) + + schema, error = runner._infer_input_schema() + assert schema == {} + assert error is not None + assert "unsupported parameter names: `outputs`" in error + + class TestEvaluateSuccessPath: async def test_returns_label_from_stdout(self) -> None: runner, _ = _make_runner(backend_stdout='"pass"') @@ -191,8 +275,99 @@ async def test_none_timeout_forwarded_to_backend_execute(self) -> None: call_kwargs = backend.execute.call_args assert call_kwargs.kwargs.get("timeout") is None + async def test_python_evaluate_auto_passes_context_keys_matching_signature(self) -> None: + runner, backend = _make_runner(source_code="def evaluate(output, reference=None): return 1") + + await runner.evaluate( + context={"output": {"answer": "a"}, "reference": {"answer": "a"}}, + input_mapping=_EMPTY_MAPPING, + name="test", + output_configs=[_continuous_config()], + ) + + call_args = backend.execute.call_args + code_arg = call_args.args[0] if call_args.args else call_args.kwargs.get("code", "") + assert '"output": {"answer": "a"}' in code_arg + assert '"reference": {"answer": "a"}' in code_arg + + async def test_typescript_evaluate_auto_passes_context_keys_matching_signature(self) -> None: + runner, backend = _make_runner( + source_code=("function evaluate({ output, reference }: EvaluatorParams) { return 1; }"), + language="TYPESCRIPT", + backend_stdout="1", + ) + + await runner.evaluate( + context={"output": {"answer": "a"}, "reference": {"answer": "a"}}, + input_mapping=_EMPTY_MAPPING, + name="test", + output_configs=[_continuous_config()], + ) + + call_args = backend.execute.call_args + code_arg = call_args.args[0] if call_args.args else call_args.kwargs.get("code", "") + assert '"output": {"answer": "a"}' in code_arg + assert '"reference": {"answer": "a"}' in code_arg + class TestEvaluateErrorPaths: + async def test_inference_failure_returns_human_readable_python_error(self) -> None: + runner, backend = 
_make_runner(source_code="def not_evaluate(output): return 1")
+
+        results = await runner.evaluate(
+            context={"output": {"answer": "a"}},
+            input_mapping=_EMPTY_MAPPING,
+            name="test_py",
+            output_configs=[_categorical_config()],
+        )
+
+        assert len(results) == 1
+        assert results[0]["error"] is not None
+        assert "no top-level `evaluate(...)` function was found" in results[0]["error"]
+        backend.execute.assert_not_called()
+
+    async def test_inference_failure_returns_human_readable_typescript_error(self) -> None:
+        runner, backend = _make_runner(
+            source_code="function evaluate(output: EvaluatorParams) { return 1; }",
+            language="TYPESCRIPT",
+        )
+
+        results = await runner.evaluate(
+            context={"output": {"answer": "a"}},
+            input_mapping=_EMPTY_MAPPING,
+            name="test_ts",
+            output_configs=[_categorical_config()],
+        )
+
+        assert len(results) == 1
+        assert results[0]["error"] is not None
+        assert "Use a destructured object parameter" in results[0]["error"]
+        backend.execute.assert_not_called()
+
+    async def test_inference_failure_returns_human_readable_error_for_renamed_typescript_param(
+        self,
+    ) -> None:
+        runner, backend = _make_runner(
+            # intentionally mismatched fixture: the signature destructures
+            # `outputs`, but the body still reads `output`, mimicking a user
+            # who renamed the parameter without updating the body
+            source_code=(
+                "function evaluate({ outputs, reference, input, metadata }: EvaluatorParams) { "
+                "const candidate = typeof output?.answer === 'string' ? output.answer : ''; "
+                "return 1; }"
+            ),
+            language="TYPESCRIPT",
+        )
+
+        results = await runner.evaluate(
+            context={"output": {"answer": "a"}},
+            input_mapping=_EMPTY_MAPPING,
+            name="test_ts",
+            output_configs=[_categorical_config()],
+        )
+
+        assert len(results) == 1
+        assert results[0]["error"] is not None
+        assert "unsupported parameter names: `outputs`" in results[0]["error"]
+        backend.execute.assert_not_called()
+
     async def test_input_mapping_failure_returns_error_result(self) -> None:
         runner, _ = _make_runner()
         bad_mapping = InputMapping(
@@ -255,12 +430,99 @@ async def test_language_stored_normalized_to_uppercase(self) -> None:
     async def test_typescript_language_uses_typescript_harness(self) -> None:
         """Runner selects TypeScript harness when language is TYPESCRIPT."""
         runner, backend = _make_runner(
-            source_code="function evaluate(x) { return 1; }",
+            source_code=(
+                "function evaluate({ output }: EvaluatorParams) { return output ? 1 : 0; }"
+            ),
             language="TYPESCRIPT",
             backend_stdout="1",
        )
         await runner.evaluate(
-            context={},
+            context={"output": {"answer": "a"}},
             input_mapping=_EMPTY_MAPPING,
             name="test",
             output_configs=[_continuous_config()],
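+            # the destructured `{ output }` signature and matching context key
+            # above are now required: per TestInputSchemaInference, a
+            # non-destructured TypeScript signature fails input-schema
+            # inference before the harness is ever selected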