2 changes: 2 additions & 0 deletions .gitignore
@@ -167,3 +167,5 @@ cython_debug/
queries.csv
vectara_results.csv
open_eval_results/

tests/outputs
24 changes: 24 additions & 0 deletions config_examples/eval_config_langchain.yaml
@@ -0,0 +1,24 @@

input_queries: "queries.csv" # file with a list of queries to use for evaluation

# Evaluation results are written to the folder given by "results_folder".
# You can override the file names used in this folder by specifying 'generated_answers', 'eval_results_file', and 'metrics_file'.
results_folder: "results/"
generated_answers: "answers.csv"
eval_results_file: "results.csv"
metrics_file: "metrics.png"

evaluator:
type: "TRECEvaluator"
model:
type: "OpenAIModel"
name: "gpt-4o-mini"
api_key: ${oc.env:OPENAI_API_KEY} # Reads from environment variable.

connector:
type: "LangChainConnector"
options:
    # the folder with the files to be indexed by LangChain
    # all files in this folder and any subfolders will be indexed
folder: /path/to/folder-with-files/
top_k: 10
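
As a sketch of how this file is consumed, assuming OmegaConf as the loader (the repo's tests already build configs with omegaconf.OmegaConf.create), the ${oc.env:OPENAI_API_KEY} interpolation is resolved from the environment when the value is accessed:

from omegaconf import OmegaConf

# Hypothetical load of the example config above; the path is illustrative.
cfg = OmegaConf.load("config_examples/eval_config_langchain.yaml")
print(cfg.connector.options.folder)   # /path/to/folder-with-files/
print(cfg.evaluator.model.api_key)    # resolved from $OPENAI_API_KEY on access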
106 changes: 106 additions & 0 deletions open_rag_eval/connectors/langchain_connector.py
@@ -0,0 +1,106 @@
import csv
import logging
import os

from tqdm import tqdm

from open_rag_eval.connectors.connector import Connector

from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub

logger = logging.getLogger(__name__)

class LangchainConnector(Connector):
def __init__(
self,
config: dict,
folder: str,
top_k: int = 10,
) -> None:
        super().__init__()  # Initialize the Connector base class

self.top_k = top_k

# Configuration for paths
self.queries_csv = config.get("input_queries")
if not self.queries_csv:
logger.error("Config dictionary must contain 'input_queries' path.")
raise ValueError("Config dictionary must contain 'input_queries' path.")

results_folder = config.get("results_folder", ".") # Default to current directory
generated_answers_filename = config.get("generated_answers", "langchain_generated_answers.csv")
self.outputs_csv = os.path.join(results_folder, generated_answers_filename)

# Ensure the results directory exists
os.makedirs(results_folder, exist_ok=True)

logger.info(f"Loading documents from folder: {folder}")
loader = DirectoryLoader(folder, glob="**/*.*")
docs = loader.load()

        # Chunk the documents and index them into a Chroma vector store
        # using OpenAI embeddings.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(docs)
        vectorstore = Chroma.from_documents(
            documents=splits,
            embedding=OpenAIEmbeddings()
        )
        self.retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
        logger.info(f"Loaded {len(docs)} documents and split into {len(splits)} chunks.")
        # Build a standard LCEL RAG chain: retrieve, format the retrieved
        # chunks into the hub prompt, generate with the LLM, parse to string.
        prompt = hub.pull("rlm/rag-prompt")

        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        llm = ChatOpenAI(model="gpt-4o", temperature=0)
        self.rag_chain = (
            {"context": self.retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

def fetch_data(self) -> None:
        queries = self.read_queries(self.queries_csv)  # read_queries comes from the Connector base class
logger.info(f"Starting to process {len(queries)} queries using LangChain connector.")
with open(self.outputs_csv, "w", newline='', encoding='utf-8') as csvfile:
fieldnames = ["query_id", "query", "passage_id", "passage", "generated_answer"]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

for query_data in tqdm(queries, desc="Running LangChain queries"):
query_id = query_data["queryId"]
actual_query = query_data["query"]

try:
generated_answer = self.rag_chain.invoke(actual_query)
source_documents = self.retriever.invoke(actual_query)
for idx, doc in enumerate(source_documents, start=1):
passage_text = doc.page_content
row_data = {
"query_id": query_id,
"query": actual_query,
"passage_id": f"[{idx}]", # Match LlamaIndex output format
"passage": passage_text,
"generated_answer": generated_answer if idx == 1 else ""
}
writer.writerow(row_data)

except Exception as e:
logger.error(f"Failed to process query_id {query_id} ('{actual_query}'): {e}", exc_info=True)
# Write a row with error information for this specific query
writer.writerow({
"query_id": query_id,
"query": actual_query,
"passage_id": "ERROR",
"passage": f"Runtime error: {e}",
"generated_answer": "ERROR"
})
continue # Continue with the next query

logger.info(f"LangChain processing complete. Results saved to {self.outputs_csv}")
4 changes: 3 additions & 1 deletion requirements.txt
@@ -16,4 +16,6 @@ tenacity~=9.1.2
torch==2.6.0
tqdm==4.67.1
transformers==4.50.0
llama_index==0.12.34
llama_index>=0.12.34
langchain_chroma>=0.2.3
langchain_openai>=0.3.16
69 changes: 69 additions & 0 deletions tests/test_langchain_connector.py
@@ -0,0 +1,69 @@
import os
import unittest
from pathlib import Path
from open_rag_eval.connectors.langchain_connector import (
LangchainConnector
)
import omegaconf
import pandas as pd

TOP_K = 10

class TestLangchainConnector(unittest.TestCase):
def setUp(self):
        # Create a temporary CSV file with test queries.
self.outputs_path = 'tests/outputs'
self.data_path = 'data/pdfs/'
os.makedirs(self.outputs_path, exist_ok=True)

self.queries = ["What is the meaning of life?", "what is a transformer?", "what is attention?"]
queries_df = pd.DataFrame(self.queries, columns=["query"])
queries_df["query_id"] = [f"query_{inx}" for inx in range(len(self.queries))]
self.input_queries = os.path.join(self.outputs_path, "test_langchain_queries.csv")
queries_df.to_csv(self.input_queries, index=False)

# Output CSV file for testing.
        self.generated_answers = os.path.join(self.outputs_path, 'results.csv')
self.connector = LangchainConnector(
config=omegaconf.OmegaConf.create({
'input_queries': self.input_queries,
'results_folder': '.',
'generated_answers': self.generated_answers
}),
            folder=self.data_path,
top_k=TOP_K
)

def tearDown(self):
# Cleanup the temporary test CSV and output CSV.
if os.path.exists(self.input_queries):
Path(self.input_queries).unlink()
if os.path.exists(self.generated_answers):
Path(self.generated_answers).unlink()

def test_fetch_data(self):
# Call the fetch_data method.
self.connector.fetch_data()

# Now read the output CSV and validate its contents.
results = pd.read_csv(self.generated_answers, header=0, encoding="utf-8")
self.assertEqual(results.shape[0], len(self.queries) * TOP_K)

        # Each query should produce a block of TOP_K rows; the first row of
        # each block carries the generated answer and passage_id "[1]".
        count = results["query_id"].value_counts().iloc[0]
for idx, row in results.iterrows():
query_idx = idx // count
passage_idx = idx % count
self.assertEqual(row["query_id"], f"query_{query_idx}")
self.assertEqual(row["query"], self.queries[query_idx])
self.assertEqual(row["passage_id"], f"[{passage_idx+1}]")


if __name__ == "__main__":
unittest.main()
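
Note that this test exercises the real OpenAI API for both embeddings and generation (nothing is mocked), so it requires OPENAI_API_KEY in the environment and sample documents under data/pdfs/. It can be run on its own with: python -m unittest tests.test_langchain_connector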
2 changes: 1 addition & 1 deletion tests/test_llamaindex_connector.py
@@ -15,7 +15,7 @@

TOP_K = 10

class TestVectaraConnector(unittest.TestCase):
class TestLlamaIndexConnector(unittest.TestCase):
def setUp(self):
# Create a temporary CSV file with one test query.
self.outputs_path = 'tests/outputs'