Commit e728d8b

jsondai authored and copybara-github committed
feat: GenAI SDK client (evals) - add support for third-party model inference via litellm library
This change introduces support for running inference on third-party models, such as those from OpenAI, by integrating the `litellm` library. Users can now call `run_inference` with a model string (e.g., "gpt-4o") to generate responses from external models. The function also supports input datasets in the OpenAI Chat Completion request format.

PiperOrigin-RevId: 775409256
1 parent 6a7a451 commit e728d8b
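
A minimal usage sketch of the new inference path, pieced together from the tests added in this commit (not part of the diff itself). It assumes the public `client.evals` surface forwards to `Evals.run_inference` the way the internal tests do, that litellm provider credentials (e.g. an OpenAI API key) are already configured in the environment, and that the project and location values are placeholders.

# Sketch only: third-party model inference through litellm, based on the
# calls exercised in test_evals.py. "my-project"/"us-central1" and the
# public `client.evals` attribute are assumptions, not taken from the diff.
import pandas as pd
import vertexai

client = vertexai.Client(project="my-project", location="us-central1")

# Simple string-prompt format: one "prompt" column per row.
prompt_df = pd.DataFrame([{"prompt": "What is LiteLLM?"}])
result = client.evals.run_inference(model="gpt-4o", src=prompt_df)
print(result.eval_dataset_df["response"][0])  # serialized chat.completion payload

# Rows may instead carry a full OpenAI Chat Completion request body.
chat_df = pd.DataFrame(
    [
        {
            "model": "gpt-4o",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Hello!"},
            ],
        }
    ]
)
result = client.evals.run_inference(model="gpt-4o", src=chat_df)

Per the tests, each row of the returned dataset's `eval_dataset_df` stores the full chat.completion response as JSON in a `response` column, so the generated text can be read at `["choices"][0]["message"]["content"]` after parsing.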

File tree

4 files changed: +432 -96 lines changed

setup.py

Lines changed: 1 addition & 0 deletions
@@ -172,6 +172,7 @@
     "jsonschema",
     "ruamel.yaml",
     "pyyaml",
+    "litellm >= 1.72.4",
 ]

 langchain_extra_require = [

tests/unit/vertexai/genai/test_evals.py

Lines changed: 252 additions & 79 deletions
@@ -17,6 +17,7 @@
 import os
 import statistics
 from unittest import mock
+import google.auth.credentials
 import warnings

 from google.cloud import aiplatform
@@ -44,6 +45,84 @@
 pytestmark = pytest.mark.usefixtures("google_auth_mock")


+@pytest.fixture
+def mock_api_client_fixture():
+    mock_client = mock.Mock(spec=client.Client)
+    mock_client.project = _TEST_PROJECT
+    mock_client.location = _TEST_LOCATION
+    mock_client._credentials = mock.create_autospec(
+        google.auth.credentials.Credentials, instance=True
+    )
+    mock_client._credentials.universe_domain = "googleapis.com"
+    mock_client._evals_client = mock.Mock(spec=evals.Evals)
+    return mock_client
+
+
+@pytest.fixture
+def mock_eval_dependencies(mock_api_client_fixture):
+    with mock.patch("google.cloud.storage.Client") as mock_storage_client, mock.patch(
+        "google.cloud.bigquery.Client"
+    ) as mock_bq_client, mock.patch(
+        "vertexai._genai.evals.Evals.evaluate_instances"
+    ) as mock_evaluate_instances, mock.patch(
+        "vertexai._genai._evals_utils.GcsUtils.upload_json_to_prefix"
+    ) as mock_upload_to_gcs, mock.patch(
+        "vertexai._genai._evals_utils.LazyLoadedPrebuiltMetric._fetch_and_parse"
+    ) as mock_fetch_prebuilt_metric:
+
+        def mock_evaluate_instances_side_effect(*args, **kwargs):
+            metric_config = kwargs.get("metric_config", {})
+            if "exact_match_input" in metric_config:
+                return vertexai_genai_types.EvaluateInstancesResponse(
+                    exact_match_results=vertexai_genai_types.ExactMatchResults(
+                        exact_match_metric_values=[
+                            vertexai_genai_types.ExactMatchMetricValue(score=1.0)
+                        ]
+                    )
+                )
+            elif "rouge_input" in metric_config:
+                return vertexai_genai_types.EvaluateInstancesResponse(
+                    rouge_results=vertexai_genai_types.RougeResults(
+                        rouge_metric_values=[
+                            vertexai_genai_types.RougeMetricValue(score=0.8)
+                        ]
+                    )
+                )
+            elif "pointwise_metric_input" in metric_config:
+                return vertexai_genai_types.EvaluateInstancesResponse(
+                    pointwise_metric_result=vertexai_genai_types.PointwiseMetricResult(
+                        score=0.9, explanation="Mocked LLM explanation"
+                    )
+                )
+            elif "comet_input" in metric_config:
+                return vertexai_genai_types.EvaluateInstancesResponse(
+                    comet_result=vertexai_genai_types.CometResult(score=0.75)
+                )
+            return vertexai_genai_types.EvaluateInstancesResponse()
+
+        mock_evaluate_instances.side_effect = mock_evaluate_instances_side_effect
+        mock_upload_to_gcs.return_value = (
+            "gs://mock-bucket/mock_path/evaluation_result_timestamp.json"
+        )
+        mock_prebuilt_safety_metric = vertexai_genai_types.LLMMetric(
+            name="safety", prompt_template="Is this safe? {response}"
+        )
+        mock_prebuilt_safety_metric._is_predefined = True
+        mock_prebuilt_safety_metric._config_source = "gs://mock-metrics/safety/v1.yaml"
+        mock_prebuilt_safety_metric._version = "v1"
+
+        mock_fetch_prebuilt_metric.return_value = mock_prebuilt_safety_metric
+
+        yield {
+            "mock_storage_client": mock_storage_client,
+            "mock_bq_client": mock_bq_client,
+            "mock_evaluate_instances": mock_evaluate_instances,
+            "mock_upload_to_gcs": mock_upload_to_gcs,
+            "mock_fetch_prebuilt_metric": mock_fetch_prebuilt_metric,
+            "mock_prebuilt_safety_metric": mock_prebuilt_safety_metric,
+        }
+
+
 class TestEvals:
     """Unit tests for the GenAI client."""

@@ -716,26 +795,38 @@ def mock_generate_content_logic(*args, **kwargs):
             mock.call(
                 model="gemini-pro",
                 contents=[
-                    {"parts": [{"text": "Placeholder prompt 1"}], "role": "user"}
+                    {
+                        "parts": [{"text": "Placeholder prompt 1"}],
+                        "role": "user",
+                    }
                 ],
                 config=genai_types.GenerateContentConfig(),
             ),
             mock.call(
                 model="gemini-pro",
                 contents=[
-                    {"parts": [{"text": "Placeholder prompt 2.1"}], "role": "user"},
+                    {
+                        "parts": [{"text": "Placeholder prompt 2.1"}],
+                        "role": "user",
+                    },
                     {
                         "parts": [{"text": "Placeholder model response 2.1"}],
                         "role": "model",
                     },
-                    {"parts": [{"text": "Placeholder prompt 2.2"}], "role": "user"},
+                    {
+                        "parts": [{"text": "Placeholder prompt 2.2"}],
+                        "role": "user",
+                    },
                 ],
                 config=genai_types.GenerateContentConfig(temperature=0.7, top_k=5),
             ),
             mock.call(
                 model="gemini-pro",
                 contents=[
-                    {"parts": [{"text": "Placeholder prompt 3"}], "role": "user"}
+                    {
+                        "parts": [{"text": "Placeholder prompt 3"}],
+                        "role": "user",
+                    }
                 ],
                 config=genai_types.GenerateContentConfig(),
             ),
@@ -858,6 +949,163 @@ def test_inference_with_multimodal_content(
         assert inference_result.candidate_name == "gemini-pro"
         assert inference_result.gcs_source is None

+    def test_run_inference_with_litellm_string_prompt_format(
+        self,
+        mock_api_client_fixture,
+    ):
+        """Tests inference with LiteLLM using a simple prompt string."""
+        with mock.patch(
+            "vertexai._genai._evals_common.litellm"
+        ) as mock_litellm, mock.patch(
+            "vertexai._genai._evals_common._call_litellm_completion"
+        ) as mock_call_litellm_completion:
+            mock_litellm.utils.get_valid_models.return_value = ["gpt-4o"]
+            prompt_df = pd.DataFrame([{"prompt": "What is LiteLLM?"}])
+            expected_messages = [{"role": "user", "content": "What is LiteLLM?"}]
+
+            mock_response_dict = {
+                "id": "test",
+                "created": 123456,
+                "model": "gpt-4o",
+                "object": "chat.completion",
+                "system_fingerprint": "123456",
+                "choices": [
+                    {
+                        "finish_reason": "stop",
+                        "index": 0,
+                        "message": {
+                            "content": "LiteLLM is a library...",
+                            "role": "assistant",
+                            "annotations": [],
+                        },
+                        "provider_specific_fields": {},
+                    }
+                ],
+                "usage": {
+                    "completion_tokens": 114,
+                    "prompt_tokens": 13,
+                    "total_tokens": 127,
+                },
+                "service_tier": "default",
+            }
+            mock_call_litellm_completion.return_value = mock_response_dict
+            evals_module = evals.Evals(api_client_=mock_api_client_fixture)
+
+            result_dataset = evals_module.run_inference(
+                model="gpt-4o",
+                src=prompt_df,
+            )
+
+            mock_call_litellm_completion.assert_called_once()
+            _, call_kwargs = mock_call_litellm_completion.call_args
+
+            assert call_kwargs["model"] == "gpt-4o"
+            assert call_kwargs["messages"] == expected_messages
+            assert "response" in result_dataset.eval_dataset_df.columns
+            response_content = json.loads(result_dataset.eval_dataset_df["response"][0])
+            assert (
+                response_content["choices"][0]["message"]["content"]
+                == "LiteLLM is a library..."
+            )
+
+    def test_run_inference_with_litellm_openai_request_format(
+        self,
+        mock_api_client_fixture,
+    ):
+        """Tests inference with LiteLLM where the row contains a chat completion request body."""
+        with mock.patch(
+            "vertexai._genai._evals_common.litellm"
+        ) as mock_litellm, mock.patch(
+            "vertexai._genai._evals_common._call_litellm_completion"
+        ) as mock_call_litellm_completion:
+            mock_litellm.utils.get_valid_models.return_value = ["gpt-4o"]
+            prompt_df = pd.DataFrame(
+                [
+                    {
+                        "model": "gpt-4o",
+                        "messages": [
+                            {
+                                "role": "system",
+                                "content": "You are a helpful assistant.",
+                            },
+                            {"role": "user", "content": "Hello!"},
+                        ],
+                    }
+                ]
+            )
+            expected_messages = [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Hello!"},
+            ]
+
+            mock_response_dict = {
+                "id": "test",
+                "created": 123456,
+                "model": "gpt-4o",
+                "object": "chat.completion",
+                "system_fingerprint": "123456",
+                "choices": [
+                    {
+                        "finish_reason": "stop",
+                        "index": 0,
+                        "message": {
+                            "content": "Hello there",
+                            "role": "assistant",
+                            "annotations": [],
+                        },
+                        "provider_specific_fields": {},
+                    }
+                ],
+                "usage": {
+                    "completion_tokens": 114,
+                    "prompt_tokens": 13,
+                    "total_tokens": 127,
+                },
+                "service_tier": "default",
+            }
+            mock_call_litellm_completion.return_value = mock_response_dict
+            evals_module = evals.Evals(api_client_=mock_api_client_fixture)
+
+            result_dataset = evals_module.run_inference(
+                model="gpt-4o",
+                src=prompt_df,
+            )
+
+            mock_call_litellm_completion.assert_called_once()
+            _, call_kwargs = mock_call_litellm_completion.call_args
+
+            assert call_kwargs["model"] == "gpt-4o"
+            assert call_kwargs["messages"] == expected_messages
+            assert "response" in result_dataset.eval_dataset_df.columns
+            response_content = json.loads(result_dataset.eval_dataset_df["response"][0])
+            assert response_content["choices"][0]["message"]["content"] == "Hello there"
+
+    def test_run_inference_with_unsupported_model_string(
+        self,
+        mock_api_client_fixture,
+    ):
+        with mock.patch(
+            "vertexai._genai._evals_common.litellm"
+        ) as mock_litellm_package:
+            mock_litellm_package.utils.get_valid_models.return_value = []
+            evals_module = evals.Evals(api_client_=mock_api_client_fixture)
+            prompt_df = pd.DataFrame([{"prompt": "test"}])
+
+            with pytest.raises(TypeError, match="Unsupported string model name"):
+                evals_module.run_inference(
+                    model="some-random-model/name", src=prompt_df
+                )
+
+    @mock.patch("vertexai._genai._evals_common.litellm", None)
+    def test_run_inference_with_litellm_import_error(self, mock_api_client_fixture):
+        evals_module = evals.Evals(api_client_=mock_api_client_fixture)
+        prompt_df = pd.DataFrame([{"prompt": "test"}])
+        with pytest.raises(
+            ImportError,
+            match="The 'litellm' library is required to use third-party models",
+        ):
+            evals_module.run_inference(model="gpt-4o", src=prompt_df)
+

 class TestMetricPromptBuilder:
     """Unit tests for the MetricPromptBuilder class."""
@@ -2738,81 +2986,6 @@ def test_auto_detect_empty_dataset(self):
         )


-@pytest.fixture
-def mock_api_client_fixture():
-    mock_client = mock.Mock(spec=client.Client)
-    mock_client.project = _TEST_PROJECT
-    mock_client.location = _TEST_LOCATION
-    mock_client._credentials = mock.Mock()
-    mock_client._evals_client = mock.Mock(spec=evals.Evals)
-    return mock_client
-
-
-@pytest.fixture
-def mock_eval_dependencies(mock_api_client_fixture):
-    with mock.patch("google.cloud.storage.Client") as mock_storage_client, mock.patch(
-        "google.cloud.bigquery.Client"
-    ) as mock_bq_client, mock.patch(
-        "vertexai._genai.evals.Evals.evaluate_instances"
-    ) as mock_evaluate_instances, mock.patch(
-        "vertexai._genai._evals_utils.GcsUtils.upload_json_to_prefix"
-    ) as mock_upload_to_gcs, mock.patch(
-        "vertexai._genai._evals_utils.LazyLoadedPrebuiltMetric._fetch_and_parse"
-    ) as mock_fetch_prebuilt_metric:
-
-        def mock_evaluate_instances_side_effect(*args, **kwargs):
-            metric_config = kwargs.get("metric_config", {})
-            if "exact_match_input" in metric_config:
-                return vertexai_genai_types.EvaluateInstancesResponse(
-                    exact_match_results=vertexai_genai_types.ExactMatchResults(
-                        exact_match_metric_values=[
-                            vertexai_genai_types.ExactMatchMetricValue(score=1.0)
-                        ]
-                    )
-                )
-            elif "rouge_input" in metric_config:
-                return vertexai_genai_types.EvaluateInstancesResponse(
-                    rouge_results=vertexai_genai_types.RougeResults(
-                        rouge_metric_values=[
-                            vertexai_genai_types.RougeMetricValue(score=0.8)
-                        ]
-                    )
-                )
-            elif "pointwise_metric_input" in metric_config:
-                return vertexai_genai_types.EvaluateInstancesResponse(
-                    pointwise_metric_result=vertexai_genai_types.PointwiseMetricResult(
-                        score=0.9, explanation="Mocked LLM explanation"
-                    )
-                )
-            elif "comet_input" in metric_config:
-                return vertexai_genai_types.EvaluateInstancesResponse(
-                    comet_result=vertexai_genai_types.CometResult(score=0.75)
-                )
-            return vertexai_genai_types.EvaluateInstancesResponse()
-
-        mock_evaluate_instances.side_effect = mock_evaluate_instances_side_effect
-        mock_upload_to_gcs.return_value = (
-            "gs://mock-bucket/mock_path/evaluation_result_timestamp.json"
-        )
-        mock_prebuilt_safety_metric = vertexai_genai_types.LLMMetric(
-            name="safety", prompt_template="Is this safe? {response}"
-        )
-        mock_prebuilt_safety_metric._is_predefined = True
-        mock_prebuilt_safety_metric._config_source = "gs://mock-metrics/safety/v1.yaml"
-        mock_prebuilt_safety_metric._version = "v1"
-
-        mock_fetch_prebuilt_metric.return_value = mock_prebuilt_safety_metric
-
-        yield {
-            "mock_storage_client": mock_storage_client,
-            "mock_bq_client": mock_bq_client,
-            "mock_evaluate_instances": mock_evaluate_instances,
-            "mock_upload_to_gcs": mock_upload_to_gcs,
-            "mock_fetch_prebuilt_metric": mock_fetch_prebuilt_metric,
-            "mock_prebuilt_safety_metric": mock_prebuilt_safety_metric,
-        }
-
-
 class TestEvalsRunEvaluation:
     """Unit tests for the evaluate method."""
