
Commit 83a968f

Merge pull request #58 from grace-sng7/augmented_suggester
2 parents 0d1c2b5 + 00b61a2 commit 83a968f

6 files changed: +63 −61 lines

docs/notebooks/augmented_model_suggester_examples.ipynb

Lines changed: 18 additions & 0 deletions
@@ -53,6 +53,15 @@
       "execution_count": null,
       "outputs": []
      },
+     {
+      "cell_type": "markdown",
+      "source": [
+       "Here we introduce the AugmentedModelSuggester class. Creating an instance of it lets the chosen LLM use Retrieval Augmented Generation (RAG) to assess causality. It currently does this by searching the CauseNet dataset for a relevant causal pair and augmenting the LLM with the corresponding evidence stored in CauseNet."
+      ],
+      "metadata": {
+       "id": "DjYECuX84vbN"
+      }
+     },
      {
       "cell_type": "code",
       "source": [
@@ -66,6 +75,15 @@
       "execution_count": null,
       "outputs": []
      },
+     {
+      "cell_type": "markdown",
+      "source": [
+       "AugmentedModelSuggester suggests the pairwise causal relationship between two variables. If a relevant causal pair is found in CauseNet, the LLM is augmented with the corresponding CauseNet evidence; if none is found, the LLM falls back on its own knowledge by default."
+      ],
+      "metadata": {
+       "id": "dES0LwHV57eX"
+      }
+     },
      {
       "cell_type": "code",
       "source": [

poetry.lock

Lines changed: 1 addition & 18 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 0 additions & 1 deletion
@@ -60,7 +60,6 @@ langchain-chroma = ">=0.2.4"
 langchain-community = ">=0.3.24"
 langchain-core = ">=0.3.60"
 langchain-huggingface = ">=0.2.0"
-langchain-openai = ">=0.3.17"
 rank-bm25 = ">=0.2.2"
 sentence-transformers = ">=4.1.0"

pywhyllm/suggesters/augmented_model_suggester.py

Lines changed: 33 additions & 3 deletions
@@ -2,12 +2,29 @@
 import re
 
 from .simple_model_suggester import SimpleModelSuggester
-from pywhyllm.utils.data_loader import *
+from pywhyllm.utils.data_loader import download_causenet, load_causenet_json, create_causenet_dict
 from pywhyllm.utils.augmented_model_suggester_utils import *
 
 
 class AugmentedModelSuggester(SimpleModelSuggester):
+    """
+    A class that extends SimpleModelSuggester and currently provides methods for suggesting causal relationships between variables by leveraging the CauseNet dataset for Retrieval Augmented Generation (RAG).
+
+    Methods:
+    - suggest_pairwise_relationship(variable1: str, variable2: str) -> List[str]:
+        Suggests the causal relationship between two variables and returns a list containing the cause, the effect, and a description of the relationship.
+    """
+
     def __init__(self, llm, file_path: str = 'data/causenet-precision.jsonl.bz2'):
+        """
+        Initialize the AugmentedModelSuggester with a language model and download CauseNet data.
+
+        Args:
+            llm: The language model instance to be used for querying.
+            file_path (str, optional): Path to save the downloaded CauseNet JSONL file.
+                Defaults to 'data/causenet-precision.jsonl.bz2'.
+        """
+
         super().__init__(llm)
         self.file_path = file_path
 
@@ -23,13 +40,26 @@ def __init__(self, llm, file_path: str = 'data/causenet-precision.jsonl.bz2'):
             print("Download failed")
 
     def suggest_pairwise_relationship(self, variable1: str, variable2: str):
+        """
+        Suggests a cause-and-effect relationship between two variables, leveraging the CauseNet dataset for Retrieval Augmented Generation (RAG).
+        If a relevant causal pair is found in CauseNet, the LLM is augmented with the corresponding information about the relationship stored
+        in CauseNet; if none is found, the LLM falls back on its own knowledge by default.
+
+        Args:
+            variable1 (str): The name of the first variable.
+            variable2 (str): The name of the second variable.
+
+        Returns:
+            list: A list containing the suggested cause variable, the suggested effect variable, and a description of the reasoning behind the suggestion. If there is no relationship between the two variables, the first two elements will be None.
+        """
+
         result = find_top_match_in_causenet(self.causenet_dict, variable1, variable2)
         if result:
             source_text = get_source_text(result)
             retriever = split_data_and_create_vectorstore_retriever(source_text)
-            response = query_llm(variable1, variable2, source_text, retriever)
+            response = query_llm(self.llm, variable1, variable2, source_text, retriever)
         else:
-            response = query_llm(variable1, variable2)
+            response = query_llm(self.llm, variable1, variable2)
 
         answer = re.findall(r'<answer>(.*?)</answer>', response)
         answer = [ans.strip() for ans in answer]
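
The substantive change in this file is dependency injection: query_llm no longer builds its own ChatOpenAI (see the utils diff below) but receives self.llm from the suggester. A sketch of the helper's new call shape; the backend and variable names are illustrative assumptions:

```python
# Sketch of the new query_llm signature; ChatOpenAI and the
# "exercise"/"stress" pair are assumptions for illustration.
from langchain_openai import ChatOpenAI

from pywhyllm.utils.augmented_model_suggester_utils import query_llm

llm = ChatOpenAI(model="gpt-4")  # any LangChain chat model can be injected

# With no source_text/retriever, the helper skips RAG and the model
# answers the A/B/C causal question from its own knowledge.
response = query_llm(llm, "exercise", "stress")
```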

pywhyllm/utils/augmented_model_suggester_utils.py

Lines changed: 4 additions & 27 deletions
@@ -3,7 +3,6 @@
 from langchain_core.documents import Document
 from langchain_chroma import Chroma
 from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_openai import ChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
 from langchain.chains import create_retrieval_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
@@ -13,49 +12,39 @@
 
 
 def find_top_match_in_causenet(causenet_dict, variable1, variable2, threshold=0.7):
-    # Sample dictionary
     pair_strings = [
         f"{causenet_dict[key]['causal_relation']['cause']}-{causenet_dict[key]['causal_relation']['effect']}"
         for key in causenet_dict]
 
-    # Tokenize for BM25
     tokenized_pairs = [text.split() for text in pair_strings]
     bm25 = BM25Okapi(tokenized_pairs)
 
-    # Original and reverse queries
     query = variable1 + "-" + variable2
     reverse_query = variable2 + "-" + variable1
     tokenized_query = query.split()
     tokenized_reverse_query = reverse_query.split()
 
-    # Combine tokens from both queries (remove duplicates)
     combined_query = list(set(tokenized_query + tokenized_reverse_query))
 
-    # Get top-k candidates using BM25 with combined query
     k = 5
     scores = bm25.get_scores(combined_query)
     top_k_indices = np.argsort(scores)[::-1][:k]
     candidate_pairs = [pair_strings[i] for i in top_k_indices]
 
-    # Apply SBERT to candidates
     model = SentenceTransformer('all-MiniLM-L6-v2')
     query_embedding = model.encode(query, convert_to_tensor=True)
     reverse_query_embedding = model.encode(reverse_query, convert_to_tensor=True)
     candidate_embeddings = model.encode(candidate_pairs, convert_to_tensor=True)
 
-    # Compute similarities for both original and reverse queries
     similarities = util.cos_sim(query_embedding, candidate_embeddings).flatten()
     reverse_similarities = util.cos_sim(reverse_query_embedding, candidate_embeddings).flatten()
 
-    # Take the maximum similarity for each candidate (original or reverse)
     max_similarities = np.maximum(similarities, reverse_similarities)
 
-    # Get the top match and its similarity score
     top_idx = np.argmax(max_similarities)
     top_similarity = max_similarities[top_idx]
     top_pair = candidate_pairs[top_idx]
 
-    # Check if the top similarity meets the threshold
     if top_similarity >= threshold:
         print(f"Best match: {top_pair} (Similarity: {top_similarity:.4f})")
         return causenet_dict[top_pair]
@@ -77,36 +66,29 @@ def get_source_text(causenet_query_result):
 
 def split_data_and_create_vectorstore_retriever(source_text):
     document = Document(page_content=source_text)
 
-    # Initialize the text splitter
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=100,  # Adjust chunk size as needed
-        chunk_overlap=20  # Overlap for context
+        chunk_size=100,
+        chunk_overlap=20
     )
-    # Split the documents
     splits = text_splitter.split_documents([document])
 
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
-    # Create a vector store from the document splits
     vectorstore = Chroma.from_documents(
         documents=splits,
         embedding=embeddings,
         persist_directory="./chroma_db"  # Optional: Save to disk for reuse
     )
 
-    # Create a retriever from the vector store
     retriever = vectorstore.as_retriever(
         search_type="similarity",
-        search_kwargs={"k": 5}  # Retrieve top 5 relevant chunks
+        search_kwargs={"k": 5}
     )
 
     return retriever
 
 
-def query_llm(variable1, variable2, source_text=None, retriever=None):
-    # Initialize the language model
-    llm = ChatOpenAI(model="gpt-4")
-
+def query_llm(llm, variable1, variable2, source_text=None, retriever=None):
     if source_text:
         system_prompt = """You are a helpful assistant for causal reasoning.
@@ -116,7 +98,6 @@ def query_llm(variable1, variable2, source_text=None, retriever=None):
         system_prompt = """You are a helpful assistant for causal reasoning.
         """
 
-    # prompt template
     prompt = ChatPromptTemplate.from_messages([
         ("system", system_prompt),
         ("human", "{input}")
@@ -125,12 +106,8 @@
     query = f"""Which cause-and-effect-relationship is more likely? Provide reasoning and you must give your final answer (A, B, or C) in <answer> </answer> tags with the letter only.
     A. {variable1} causes {variable2} B. {variable2} causes {variable1} C. neither {variable1} nor {variable2} cause each other."""
 
-    # Define the system prompt
     if source_text:
-        # Create a document chain to combine retrieved documents
         question_answer_chain = create_stuff_documents_chain(llm, prompt)
-
-        # Create the RAG chain
         rag_chain = create_retrieval_chain(retriever, question_answer_chain)
 
         response = rag_chain.invoke({"input": query})
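
find_top_match_in_causenet expects a dictionary keyed by "cause-effect" strings, the shape create_causenet_dict (next file) produces. A toy call on made-up entries illustrates the two-stage match: BM25 shortlists up to five candidates, SBERT scores both query directions, and the stored entry is returned only when the best cosine similarity clears the 0.7 default threshold:

```python
# Toy illustration with made-up entries; real dictionaries come from
# create_causenet_dict over the CauseNet dump.
from pywhyllm.utils.augmented_model_suggester_utils import find_top_match_in_causenet

toy_dict = {
    "smoking-cancer": {
        "causal_relation": {"cause": "smoking", "effect": "cancer"},
        "sources": [],
    },
    "rain-flooding": {
        "causal_relation": {"cause": "rain", "effect": "flooding"},
        "sources": [],
    },
}

# BM25 shortlist -> SBERT rerank of "smoking-cancer" and "cancer-smoking"
# -> returns toy_dict["smoking-cancer"], whose similarity clears 0.7.
match = find_top_match_in_causenet(toy_dict, "smoking", "cancer")
```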

pywhyllm/utils/data_loader.py

Lines changed: 7 additions & 12 deletions
@@ -20,7 +20,9 @@ def download_causenet(url: str, file_path: str) -> bool:
     International Conference on Information & Knowledge Management (CIKM '20). Association for
     Computing Machinery, New York, NY, USA, 3023–3030. https://doi.org/10.1145/3340531.3412763
 
-    TODO: Add license
+    License:
+        CauseNet data is licensed under the Creative Commons Attribution (CC BY) license.
+        For full license details, see: https://creativecommons.org/licenses/by/4.0/
 
     Args:
         url (str): The URL of the file to download.
@@ -30,21 +32,16 @@
         bool: True if the download was successful, False otherwise.
     """
     try:
-        # Ensure the output directory exists
         os.makedirs(os.path.dirname(file_path), exist_ok=True)
 
-        # Send a GET request to the URL
         response = requests.get(url, stream=True)
 
-        # Check if the request was successful
         if response.status_code != 200:
             logging.error(f"Failed to download file from {url}. Status code: {response.status_code}")
             return False
 
-        # Get the total file size for progress bar (if available)
         total_size = int(response.headers.get("content-length", 0))
 
-        # Download and save the file with a progress bar
         with open(file_path, "wb") as file, tqdm(
             desc="Downloading",
             total=total_size,
@@ -73,12 +70,11 @@ def load_causenet_json(file_path):
     print("Loading CauseNet using json")
     with bz2.open(file_path, 'rt',
                   encoding='utf-8') as file:
-        # Read each line and parse as JSON
         for line in file:
-            line = line.strip()  # Remove trailing newlines
-            if line:  # Skip empty lines
-                json_obj = json.loads(line)  # Parse the line as JSON
-                json_data.append(json_obj)  # Add to list
+            line = line.strip()
+            if line:
+                json_obj = json.loads(line)
+                json_data.append(json_obj)
     print("Done loading CauseNet using json")
     return json_data
 
@@ -97,7 +93,6 @@ def create_causenet_dict(json_data):
                 'sources': item['sources']
             }
         else:
-            # Append sources to existing list
             causenet_dict[key]['sources'].extend(item['sources'])
     print("Done creating dictionary from CauseNet json data")
     return causenet_dict
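
Taken together, the three loaders named in the suggester's now-explicit import form a small pipeline. A sketch; the URL is a placeholder, since this diff does not show the one AugmentedModelSuggester.__init__ actually passes to download_causenet:

```python
# Pipeline sketch; the URL below is a placeholder, not the real source.
from pywhyllm.utils.data_loader import (
    create_causenet_dict, download_causenet, load_causenet_json)

file_path = "data/causenet-precision.jsonl.bz2"
url = "https://example.org/causenet-precision.jsonl.bz2"  # placeholder

if download_causenet(url, file_path):                # streamed, with a tqdm bar
    json_data = load_causenet_json(file_path)        # one JSON object per bz2 line
    causenet_dict = create_causenet_dict(json_data)  # keyed by "cause-effect"
```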
