
Commit e9d8c31

Authored by: ofermend, sulekz, cjcenizal, vish119, toastedqu
Version 0.2.1 RC from dev (#133)
Changes rolled up from dev:

* Merge dev to main for v0.1.4 (#82): upgraded libraries, made packaging compliant with PEP 625, updated MANIFEST.in, bumped dependency versions to remove security vulnerabilities, and reformatted "Open-RAG-Eval" to "Open RAG Eval" (#76).
* Update publish_release.yml (#80): added the OPENAI key (from secrets) for the publish script; later fixed an ONNX install issue in the release process (#95, #96).
* Update test.yml (#79).
* LlamaIndex connector (#78): refactored the connector into a true base class with fetch_data; removed CSVConnector (and its unit test), since it is really a results loader rather than a true connector; moved connector tests to Pandas instead of CSV; moved configs to a separate folder; added llama_index to requirements; added test_rag_results_loader; updated the connector to include citations; upgraded the transformers version; fixed lint issues and updated the README per review comments.
* Update _version.py (#81).
* Clean up merge conflicts from dev -> main (#90).
* Added support to process queries in parallel across all connectors.
* Release 0.1.6 (#103): enabled base_url; updated .gitignore; removed a print statement; added a fixed seed (seed=42) for UMBRELA to boost consistency; updated the README for the new UI, condensing the separate "visualize" step into a single "Visualization" section; added evaluation screenshots to the README; added a print of "no answer" in the Vectara connector; fixed a bug so Gemini catches genai.exceptions (#104, #105); fixed lint issues (#106).
* Bumped version to 0.1.7 and fixed lint issues (#110); resolved merge conflicts (#112); post-rebase cleanup (#116).
* Updated CitationMetric.
* Fixed lint issues (#127).
* Added LLM judges for Anthropic and Together: removed bad kwargs for Anthropic models, updated the structured-output method for TogetherModel, and added keyword handling for Gemini models; addressed PR comments.
* Changed version to 0.2.1 (#132): improvements, especially for Together.AI models; version bump; bugfix in unit tests; minor updates.
* Added anthropic and together requirements and a config example for llama_index; added a TRANSFORMERS_VERBOSITY override.

Co-authored-by: Ofer Mendelevitch <ofermend@gmail.com>
Co-authored-by: Suleman <108358100+sulekz@users.noreply.github.com>
Co-authored-by: Suleman Kazi <suleman@vectara.com>
Co-authored-by: CJ Cenizal <cj@cenizal.com>
Co-authored-by: Renyi Qu <mikustokes@gmail.com>
Co-authored-by: Stokes Q <33497497+toastedqu@users.noreply.github.com>
Co-authored-by: Donna <yu.donna.dong@gmail.com>
Co-authored-by: Vishal Naik <naik.vishalishwar@gmail.com>
Co-authored-by: david-oplatka <david.oplatka@vectara.com>
1 parent 29aa601 commit e9d8c31

File tree

10 files changed: +509 additions, -59 deletions

Makefile

Lines changed: 1 addition & 1 deletion
```diff
@@ -7,6 +7,6 @@ mypy:
 	mypy open_rag_eval || true

 test:
-	python -m unittest discover -s tests -b
+	TRANSFORMERS_VERBOSITY=error python -m unittest discover -s tests -b

.PHONY: all lint mypy test
```
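The Makefile change prepends the environment variable to the test command so it is set before Python starts. A minimal sketch of why that ordering matters: transformers reads TRANSFORMERS_VERBOSITY from the environment when it configures its logger, so the variable must exist before the library is first imported. The check below is a hypothetical stand-in for that import-time read, not the library's actual code.

```python
import os

# Simulate what `TRANSFORMERS_VERBOSITY=error python -m unittest ...` does:
# the variable is present in the environment before any import runs.
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# Hypothetical stand-in for a library's import-time check: fall back to a
# default level when the variable is unset.
verbosity = os.environ.get("TRANSFORMERS_VERBOSITY", "warning")
```

Setting the variable inside a test file instead would be fragile: if transformers were imported first (e.g. by another test module), the override could arrive too late.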

config_examples/eval_config_llama_index.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -34,3 +34,5 @@ connector:
   top_k: 10
   max_workers: -1 # -1 to use all available CPU cores for parallel processing.
   repeat_query: 5 # set this more than one to repeat the query multiple times for consistency evaluation
+  openai_embedding_model: "text-embedding-3-small"
+  openai_llm_model: "gpt-4.1-mini"
```
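A sketch of how a connector might consume the two new config keys, falling back to the same defaults the connector's signature declares. The dict below stands in for the parsed YAML; the exact loading code is an assumption, not taken from the repository.

```python
# Hypothetical parsed form of the YAML `connector:` section above.
config = {
    "top_k": 10,
    "max_workers": -1,
    "repeat_query": 5,
    # The two keys added in this commit:
    "openai_embedding_model": "text-embedding-3-small",
    "openai_llm_model": "gpt-4.1-mini",
}

# Defaults mirror the LlamaIndexConnector.__init__ defaults, so omitting
# the keys from older config files keeps the previous behavior.
embedding_model = config.get("openai_embedding_model", "text-embedding-3-small")
llm_model = config.get("openai_llm_model", "gpt-4.1-mini")
```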

open_rag_eval/_version.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,4 +1,4 @@
 """
 Define the version of the package.
 """
-__version__ = "0.2.0"
+__version__ = "0.2.1"
```
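A patch-level bump like 0.2.0 to 0.2.1 is easy to verify mechanically. As an illustration (not code from the repository), a simple `X.Y.Z` string can be turned into a tuple so versions compare correctly as numbers rather than as strings:

```python
def parse_version(v: str) -> tuple:
    """Split a simple 'X.Y.Z' version string into a comparable int tuple."""
    return tuple(int(part) for part in v.split("."))

# Tuple comparison is element-wise, so 0.2.1 correctly sorts after 0.2.0.
assert parse_version("0.2.1") > parse_version("0.2.0")
```
For real-world version strings (pre-releases, local suffixes), `packaging.version.Version` would be the robust choice; this sketch assumes plain numeric components.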

open_rag_eval/connectors/llama_index_connector.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -1,27 +1,33 @@
 import logging
 import os

-from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
+from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
 from llama_index.core.base.base_query_engine import BaseQueryEngine
 from llama_index.core.query_engine.citation_query_engine import CitationQueryEngine
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.llms.openai import OpenAI

 from open_rag_eval.connectors.connector import Connector
 from open_rag_eval.utils.constants import NO_ANSWER, API_ERROR

 # Configure logging for tenacity
 logger = logging.getLogger(__name__)

-
 class LlamaIndexConnector(Connector):

     def __init__(
         self,
         config: dict,
         folder: str,
         top_k: int = 10,
+        openai_embedding_model: str = "text-embedding-3-small",
+        openai_llm_model: str = "gpt-4.1-mini",
         max_workers: int = -1,
         repeat_query: int = 1,  # Add repeat_query parameter
     ) -> BaseQueryEngine:
+        Settings.embed_model = OpenAIEmbedding(model=openai_embedding_model)
+        Settings.llm = OpenAI(model=openai_llm_model, temperature=0.0)
+
         documents = SimpleDirectoryReader(folder).load_data()
         index = VectorStoreIndex.from_documents(documents)
         retriever = index.as_retriever(similarity_top_k=top_k)
```
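The new lines assign to `Settings.embed_model` and `Settings.llm` before building the index; in LlamaIndex, `Settings` is a module-level singleton, so components built afterwards pick up whatever was assigned last. The toy stand-in below (not LlamaIndex's actual implementation) shows the shared-state pattern this relies on:

```python
# Hypothetical stand-in for llama_index's global Settings object, to show
# why assigning Settings.embed_model / Settings.llm in __init__ affects
# every index built afterwards: it is shared module-level state.
class _Settings:
    embed_model = None
    llm = None

Settings = _Settings()

def build_index():
    # Any code that builds an index sees whatever was assigned last.
    return (Settings.embed_model, Settings.llm)

Settings.embed_model = "text-embedding-3-small"
Settings.llm = "gpt-4.1-mini"
```

One consequence of this design: constructing a second LlamaIndexConnector with different model names would silently change the models used by the first, since both read the same global `Settings`.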

open_rag_eval/metrics/umbrela_metric.py

Lines changed: 68 additions & 39 deletions
```diff
@@ -24,42 +24,68 @@ class UMBRELAMetric(RetrievalMetric):
     Bing RELevance Assessor paper: https://arxiv.org/pdf/2406.06519
     """

-    _UMBRELA_PROMPT = """Given a query and a passage, you must provide a score on an
-    integer scale of 0 to 3 with the following meanings:
-    0 = represent that the passage has nothing to do with the query,
-    1 = represents that the passage seems related to the query but
-    does not answer it,
-    2 = represents that the passage has some answer for the query,
-    but the answer may be a bit unclear, or hidden amongst extraneous
-    information and
-    3 = represents that the passage is dedicated to the query and
-    contains the exact answer.
-    Important Instruction: Assign category 1 if the passage is
-    somewhat related to the topic but not completely, category 2 if
-    passage presents something very important related to the entire
-    topic but also has some extra information and category 3 if the
-    passage only and entirely refers to the topic. If none of the
-    above satisfies give it category 0.
-    Query: {query}
-    Passage: {passage}
-    Split this problem into steps:
-    Consider the underlying intent of the search.
-    Measure how well the content matches a likely intent of the query
-    (M).
-    Measure how trustworthy the passage is (T).
-    Consider the aspects above and the relative importance of each,
-    and decide on a final score (O). Final score must be an integer
-    value only.
-    Do not provide any code in result. Provide each score in the
-    format of: a single integer without any reasoning."""
+    _UMBRELA_PROMPT = """
+    Given a query and a passage, you must provide a score on an
+    integer scale of 0 to 3 with the following meanings:
+    0 = represent that the passage has nothing to do with the query,
+    1 = represents that the passage seems related to the query but
+    does not answer it,
+    2 = represents that the passage has some answer for the query,
+    but the answer may be a bit unclear, or hidden amongst extraneous
+    information and
+    3 = represents that the passage is dedicated to the query and
+    contains the exact answer.
+    Important Instruction: Assign category 1 if the passage is
+    somewhat related to the topic but not completely, category 2 if
+    passage presents something very important related to the entire
+    topic but also has some extra information and category 3 if the
+    passage only and entirely refers to the topic. If none of the
+    above satisfies give it category 0.
+    Query: {query}
+    Passage: {passage}
+    Split this problem into steps:
+    Consider the underlying intent of the search.
+    Measure how well the content matches a likely intent of the query
+    (M).
+    Measure how trustworthy the passage is (T).
+    Consider the aspects above and the relative importance of each,
+    and decide on a final score (O). Final score must be an integer
+    value only.
+    Do not provide any code in result. Provide each score in the
+    format of: a single integer without any reasoning.
+    """
+
+    _UMBRELA_NEW_PROMPT = """
+    Given a query and a passage, you must provide a score on an integer scale of 0 to 3 with the following meanings:
+    0 = represent that the passage has nothing to do with the query,
+    1 = represents that the passage seems related to the query but does not answer it,
+    2 = represents that the passage has some answer for the query,
+    but the answer may be a bit unclear, or hidden amongst extraneous information and
+    3 = represents that the passage is dedicated to the query and contains the exact answer.
+    Important Instructions about score assignment:
+    - score=1 if the passage is somewhat related to the topic but not completely.
+    - score=2 if the passage presents something very important related to the entire topic but also has some extra information.
+    - score=3 if the passage only and entirely refers to the topic.
+    - score=0 if none of the above is satisfied.
+    Split this problem into steps:
+    1. Consider the underlying intent of the search.
+    2. Measure how well the content matches a likely intent of the query (M).
+    3. Measure how trustworthy the passage is (T).
+    4. Consider the aspects above and the relative importance of each, and decide on a final score (O).
+    Your response must be the final score value (0, 1, 2 or 3) only, without any additional text.
+    <query>
+    {query}
+    </query>
+    <passage>
+    {passage}
+    </passage>
+    """

     def __init__(self, model: LLMJudgeModel):
         """Initialize the UMBRELA metric.

         Args:
-            model (str): The model to use for the metric assesment.
-            prompt_override (str): An optional prompt to override the default UMBRELA prompt.
-            Must hvae placeholders for {query} and {passage}.
+            model (LLMJudgeModel): The model to use for the metric assesment.
         """
         self.model = model
         # kwargs to match the UMBRELA paper.
@@ -68,11 +94,8 @@ def __init__(self, model: LLMJudgeModel):
             "top_p": 1.0,
             "presence_penalty": 0.5,
             "frequency_penalty": 0.0,
-            "seed": 42
+            "seed": 42,
         }
-        self.prompt = self._UMBRELA_PROMPT
-        # Any UMBRELA score above this threshold is considered relevant for
-        # calculation of traditional retrieval metrics like MAP, Precison@k, etc.
         self._umbrela_relevant_threshold = 2

     def compute(
@@ -85,7 +108,15 @@ def compute(
         for key, passage in retrieval_result.retrieved_passages.items():
             try:
                 query = retrieval_result.query
-                prompt = self.prompt.format(query=query, passage=passage)
+                if (
+                    "gpt-oss" in self.model.model_name.lower()
+                    or "qwen" in self.model.model_name.lower()
+                ):
+                    prompt = self._UMBRELA_NEW_PROMPT.format(
+                        query=query, passage=passage
+                    )
+                else:
+                    prompt = self._UMBRELA_PROMPT.format(query=query, passage=passage)
                 response = self.model.parse(prompt, UMBRELAScore, self.model_kwargs)

                 if not response.score:
@@ -129,9 +160,7 @@ def add_retrieval_metrics(
         relevant_at_k = sum(binary_relevance[:k])

         # Calculate precision@K
-        retrieval_scores["precision@"][f"{k}"] = (
-            relevant_at_k / k if k > 0 else 0.0
-        )
+        retrieval_scores["precision@"][f"{k}"] = relevant_at_k / k if k > 0 else 0.0

         # Calculate Average Precision (AP@K)
         retrieval_scores["AP@"][f"{k}"] = self._calculate_average_precision(
```
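The new branch in compute() routes some model families to the restructured prompt (with `<query>`/`<passage>` tags) by substring-matching the model name. The helper below mirrors that selection logic in isolation so it can be exercised without an LLM; the function name is illustrative, not from the repository:

```python
# Mirrors the model-name check added in compute(): "gpt-oss" and "qwen"
# models get the restructured _UMBRELA_NEW_PROMPT; everything else keeps
# the original _UMBRELA_PROMPT.
def pick_prompt(model_name: str) -> str:
    name = model_name.lower()
    if "gpt-oss" in name or "qwen" in name:
        return "new"
    return "original"
```

Substring matching on the lowercased name keeps the check robust to provider prefixes like `Qwen/Qwen2.5-72B-Instruct`, at the cost of also matching any future model whose name happens to contain those substrings.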

open_rag_eval/models/__init__.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,3 +1,3 @@
-from .llm_judges import LLMJudgeModel, OpenAIModel, GeminiModel
+from .llm_judges import LLMJudgeModel, OpenAIModel, GeminiModel, AnthropicModel, TogetherModel

-__all__ = ["LLMJudgeModel", "OpenAIModel", "GeminiModel"]
+__all__ = ["LLMJudgeModel", "OpenAIModel", "GeminiModel", "AnthropicModel", "TogetherModel"]
```
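Extending `__all__` matters because it defines what a star import re-exports from the package. The stub below (a hypothetical stand-in for open_rag_eval.models, not the real module) demonstrates the semantics:

```python
import types

# Hypothetical stub standing in for open_rag_eval.models.
mod = types.ModuleType("models_stub")
mod.LLMJudgeModel = object
mod.OpenAIModel = object
mod.GeminiModel = object
mod.AnthropicModel = object   # newly re-exported in this commit
mod.TogetherModel = object    # newly re-exported in this commit
mod._llm_judges = object      # helper name, deliberately not exported
mod.__all__ = [
    "LLMJudgeModel", "OpenAIModel", "GeminiModel",
    "AnthropicModel", "TogetherModel",
]

# What `from open_rag_eval.models import *` would bind: only names in
# __all__ are copied into the importer's namespace.
exported = {name: getattr(mod, name) for name in mod.__all__}
```

Without the `__all__` update, `AnthropicModel` and `TogetherModel` would be importable explicitly but invisible to star imports and to tools that treat `__all__` as the public API.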
