feat: Add property tests for evaluation safety and determinism (#133) (#134)

andreasronge · github-actions[bot] · claude · web-flow · commit 910ec7dcb9c0 · 2025-12-09T10:11:43.000Z
* feat: Add property tests for evaluation safety and determinism Adds two new describe blocks to test/support/lisp_generators_test.exs: 1. "evaluation safety" - Property test that verifies generated PTC-Lisp expressions evaluate without crashing the interpreter. Handles both successful evaluation and expected runtime errors gracefully. 2. "determinism" - Property test that verifies the same input always produces the same output across multiple runs with identical context and tools. Both properties dynamically extract tool names from generated source code and provide matching mock tool implementations to ensure evaluation can complete. Fixes #133 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * fix: Extract duplicated tool setup and evaluation patterns in lisp_generators_test.exs - Extract tool extraction logic into `build_tools_for_source/2` helper (previously duplicated in evaluation safety and determinism tests) - Extract try/rescue pattern into `safe_run/2` helper (previously duplicated 3 times across both tests) Resolves PR review issues from #134. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com> Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/test/support/lisp_generators_test.exs b/test/support/lisp_generators_test.exs
@@ -261,6 +261,39 @@ defmodule PtcRunner.TestSupport.LispGeneratorsTest do
     end
   end
 
+  describe "evaluation safety" do
+    property "valid programs evaluate without crashes" do
+      check all(generated <- Gen.gen_expr(2)) do
+        program = Formatter.format(generated)
+        bindings = %{items: [1, 2, 3], user: %{name: "test", active: true}}
+        stubs = build_tools_for_source(program)
+
+        # The run must end in a 4-tuple success or a tagged error; anything else is a failure.
+        outcome = safe_run(program, context: bindings, tools: stubs)
+        well_formed? = match?({:ok, _, _, _}, outcome) or match?({:error, _}, outcome)
+        assert well_formed?,
+               "Unexpected result for source: #{program}\nResult: #{inspect(outcome)}"
+      end
+    end
+  end
+
+  describe "determinism" do
+    property "same input always produces same output" do
+      check all(ast <- Gen.gen_expr(2)) do
+        source = Formatter.format(ast)
+        ctx = %{x: 42, items: [1, 2, 3]}
+        tools = build_tools_for_source(source, "fixed")
+
+        result1 = safe_run(source, context: ctx, tools: tools)
+        result2 = safe_run(source, context: ctx, tools: tools)
+        # assert/2 with a message prints only that message, so it must carry both results.
+        assert result1 == result2,
+               "Non-deterministic evaluation for: #{source}\n" <>
+                 "First: #{inspect(result1)}\nSecond: #{inspect(result2)}"
+      end
+    end
+  end
+
   # Helpers
 
   defp valid_ast?(value) do
@@ -320,4 +353,19 @@ defmodule PtcRunner.TestSupport.LispGeneratorsTest do
   defp ast_equivalent?(a, b) do
     a == b
   end
+
+  defp build_tools_for_source(source, default_result \\ :result) do
+    # Every stub ignores its arguments and replies with the same canned value.
+    stub = fn _args -> default_result end
+    # "test_tool" is always present; the rest are the names in each `(call "name"` form.
+    called = Regex.scan(~r/\(call "([^"]+)"/, source, capture: :all_but_first)
+
+    Map.new([["test_tool"] | called], fn [tool_name] -> {tool_name, stub} end)
+  end
+  # Runs `source`; a raised exception becomes `{:error, {:runtime_error, banner}}`.
+  defp safe_run(source, opts) do
+    PtcRunner.Lisp.run(source, opts)
+  rescue
+    exception -> {:error, {:runtime_error, Exception.format_banner(:error, exception)}}
+  end
 end