Skip to content

Commit 2ef9a73

Browse files
committed
Fixed data enrichment with current info + removed exposed secrets
- Enhanced search queries to include current year for up-to-date results
- Added company name normalization for better search accuracy
- Improved LLM extraction to prioritize recent information
- Added time-based search parameters (days=365) to Tavily
1 parent aab3464 commit 2ef9a73

File tree

7 files changed

+248
-90
lines changed

7 files changed

+248
-90
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,3 +173,4 @@ cython_debug/
173173
# PyPI configuration file
174174
.pypirc
175175
/ui/node_modules
176+
ui/.env.development

app.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
# Configure CORS
4848
app.add_middleware(
4949
CORSMiddleware,
50-
allow_origins=["http://localhost:5173"],
50+
allow_origins=["http://localhost:5173", "http://localhost:5174"],
5151
allow_credentials=True,
5252
allow_methods=["*"],
5353
allow_headers=["*"],
@@ -75,6 +75,8 @@ def init_clients(tavily_api_key: str):
7575

7676
# Gemini provider (optional)
7777
if gemini_api_key:
78+
import google.generativeai as genai
79+
genai.configure(api_key=gemini_api_key)
7880
gemini_model = GenerativeModel(model_name="gemini-1.5-flash")
7981
gemini_provider = GeminiProvider(gemini_model)
8082

@@ -106,21 +108,30 @@ class EnrichmentRequest(BaseModel):
106108
column_name: str
107109
target_value: str
108110
context_values: Dict[str, str]
109-
answer: str = None
110-
search_result: str = None
111-
111+
answer: Optional[str] = None
112+
search_result: Optional[str] = None
113+
# Add AI agent configuration fields
114+
input_source_type: Optional[str] = "ENTITY"
115+
input_data: Optional[str] = None
116+
custom_prompt: Optional[str] = None
117+
118+
# Update BatchEnrichmentRequest to include AI agent configuration
112119
class BatchEnrichmentRequest(BaseModel):
113120
column_name: str
114121
rows: List[str] # List of target values to enrich
115122
context_values: Dict[str, str]
116-
answer: str = None
117-
search_result: str = None
123+
answer: Optional[str] = None
124+
search_result: Optional[str] = None
125+
# Add AI agent configuration fields
126+
input_source_type: Optional[str] = "ENTITY"
127+
input_data: Optional[str] = None
128+
custom_prompt: Optional[str] = None
118129

119130
class TableData(BaseModel):
120131
rows: List[str] # List of target values to enrich
121132
context_values: Dict[str, str]
122-
answer: str = None
123-
search_result: str = None
133+
answer: Optional[str] = None
134+
search_result: Optional[str] = None
124135

125136
class TableEnrichmentRequest(BaseModel):
126137
data: Dict[str, TableData]
@@ -165,7 +176,7 @@ async def verify_jwt(jwt_token: str = Cookie(None)):
165176
async def enrich_data(
166177
request: EnrichmentRequest,
167178
fastapi_request: Request,
168-
provider: str = "openai"
179+
provider: str = "gemini"
169180
):
170181
"""Enrich a single cell's data."""
171182
start_time = time.time()
@@ -188,7 +199,10 @@ async def enrich_data(
188199
target_value=request.target_value,
189200
context_values=request.context_values,
190201
tavily_client=tavily_client,
191-
llm_provider=llm_provider
202+
llm_provider=llm_provider,
203+
input_source_type=request.input_source_type,
204+
input_data=request.input_data,
205+
custom_prompt=request.custom_prompt
192206
)
193207
enrich_time = time.time() - enrich_start_time
194208

@@ -235,7 +249,7 @@ async def enrich_data(
235249
async def enrich_batch(
236250
request: BatchEnrichmentRequest,
237251
fastapi_request: Request,
238-
provider: str = "openai"
252+
provider: str = "gemini"
239253
):
240254
"""Enrich multiple rows in parallel."""
241255
start_time = time.time()
@@ -260,7 +274,10 @@ async def enrich_batch(
260274
target_value=row,
261275
context_values=request.context_values,
262276
tavily_client=tavily_client,
263-
llm_provider=llm_provider
277+
llm_provider=llm_provider,
278+
input_source_type=request.input_source_type,
279+
input_data=request.input_data,
280+
custom_prompt=request.custom_prompt
264281
)
265282
tasks.append(task)
266283

@@ -337,7 +354,7 @@ async def enrich_batch(
337354
async def enrich_table(
338355
request: TableEnrichmentRequest,
339356
fastapi_request: Request,
340-
provider: str = "openai"
357+
provider: str = "gemini"
341358
):
342359
"""Enrich the entire table (all columns) in parallel."""
343360
start_time = time.time()

backend/graph.py

Lines changed: 112 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ class EnrichmentContext:
9494
input_source_type: Optional[str] = None # 'ENTITY', 'URL', or 'TEXT_FROM_COLUMN' (where the input comes from)
9595
input_data: Optional[str] = None # The actual data to use (e.g., "plaid.com" or text from another cell)
9696
plan: Optional[dict] = None # Output of the Planner node: structured plan for what to do next
97+
custom_prompt: Optional[str] = None # Custom prompt/instruction for AI agent
9798

9899
# Existing/legacy fields for enrichment
99100
search_result: Optional[dict] = None # Results from Tavily search or other sources
@@ -108,23 +109,81 @@ def __init__(self, tavily_client, llm_provider: LLMProvider):
108109
async def search_tavily(self, state: EnrichmentContext):
109110
"""Run Tavily search in a separate thread"""
110111
try:
111-
# Use a custom search_query from the plan if available, otherwise fall back to the default pattern
112+
# DEBUG: Log all the state information
113+
logger.info(f"DEBUG search_tavily - column_name: {state.column_name}")
114+
logger.info(f"DEBUG search_tavily - target_value: {state.target_value}")
115+
logger.info(f"DEBUG search_tavily - input_source_type: {state.input_source_type}")
116+
logger.info(f"DEBUG search_tavily - input_data: {state.input_data}")
117+
logger.info(f"DEBUG search_tavily - custom_prompt: {state.custom_prompt}")
118+
logger.info(f"DEBUG search_tavily - plan: {state.plan}")
119+
120+
# Use a custom search_query from the plan if available, otherwise fall back to improved query construction
112121
query = None
113122
if state.plan and isinstance(state.plan, dict):
114123
query = state.plan.get("search_query")
115124
if not query:
116-
query = f"{state.column_name} of {state.target_value}?"
117-
logger.info(f"Searching Tavily with query: {query}")
125+
# Get current year for time-sensitive searches
126+
import datetime
127+
current_year = datetime.datetime.now().year
128+
129+
# Normalize company names for better search results
130+
def normalize_company_name(company_name):
131+
"""Normalize company names for better search results"""
132+
if not company_name:
133+
return company_name
134+
135+
# Convert to lowercase for comparison
136+
name_lower = company_name.lower().strip()
137+
138+
# Handle common variations
139+
if name_lower in ['linked in', 'linkedin inc', 'linkedin corp']:
140+
return 'LinkedIn'
141+
elif name_lower in ['amazon inc', 'amazon.com', 'amazon com']:
142+
return 'Amazon'
143+
elif name_lower in ['microsoft corp', 'microsoft inc', 'msft']:
144+
return 'Microsoft'
145+
elif name_lower in ['google inc', 'alphabet inc', 'alphabet']:
146+
return 'Google'
147+
elif name_lower in ['apple inc', 'apple computer']:
148+
return 'Apple'
149+
elif name_lower in ['meta', 'facebook inc', 'meta platforms']:
150+
return 'Meta'
151+
152+
# Return original with proper capitalization
153+
return company_name.strip()
154+
155+
# Improved fallback query construction using custom prompt and input data
156+
if state.custom_prompt and state.input_data:
157+
# Use custom prompt with input data for AI agent queries
158+
normalized_company = normalize_company_name(state.input_data)
159+
query = f"{state.custom_prompt} {normalized_company} current {current_year}"
160+
elif state.custom_prompt:
161+
# Use custom prompt with target value
162+
normalized_company = normalize_company_name(state.target_value)
163+
query = f"{state.custom_prompt} {normalized_company} current {current_year}"
164+
else:
165+
# Default fallback with current year for time-sensitive information
166+
normalized_company = normalize_company_name(state.target_value)
167+
query = f"current {state.column_name} of {normalized_company} {current_year}"
168+
logger.info(f"FINAL SEARCH QUERY: {query}")
169+
print(f"🔍 FINAL SEARCH QUERY: {query}")
170+
print(f"🎯 Query components - custom_prompt: {state.custom_prompt}, input_data: {state.input_data}, target_value: {state.target_value}")
118171
# Run the Tavily search in a thread to avoid blocking
119172
result = await asyncio.to_thread(
120173
lambda: self.tavily.search(
121-
query=query, auto_parameters=True, search_depth="advanced", max_results = 5, include_raw_content=True
174+
query=query,
175+
auto_parameters=True,
176+
search_depth="advanced",
177+
max_results=5,
178+
include_raw_content=True,
179+
days=365 # Search within the last year for more recent information
122180
)
123181
)
124182
print(result["auto_parameters"])
125183
logger.info(f"Tavily search result: {result}")
126184
# Return the search result to be added to the context
127185
return {"search_result": result}
186+
l
128187
except Exception as e:
129188
logger.error(f"Error in search_tavily: {str(e)}")
130189
raise
@@ -147,22 +206,35 @@ async def extract_minimal_answer( self, state: EnrichmentContext ) -> dict:
147206
content = "\n\n---\n\n".join(result_contents)
148207
print(f"Content: {content}")
149208
try:
209+
# Get current year for time-sensitive extraction
210+
import datetime
211+
current_year = datetime.datetime.now().year
212+
150213
# Build the prompt for the LLM, instructing it to extract only the direct answer
151214
extraction_instructions = ""
152215
if state.plan and isinstance(state.plan, dict):
153216
extraction_instructions = state.plan.get("extraction_instructions", "")
154217
prompt = f"""
155-
Extract the {state.column_name} of {state.target_value} from this search result:
218+
Extract the CURRENT {state.column_name} of {state.target_value} from this search result:
156219
157220
{content}
158221
159222
Rules:
160-
1. Provide ONLY the direct answer - no explanations
161-
2. Be concise
162-
3. If not found, respond \"Information not found\"
163-
4. No citations or references
223+
1. PRIORITIZE the most recent information (look for dates from {current_year} or late 2023)
224+
2. If there are multiple entries, choose the most recent one
225+
3. For leadership positions (CEO, President, etc.), always look for "current" or most recent information
226+
4. Provide ONLY the direct answer - no explanations
227+
5. Be concise
228+
6. If not found, respond \"Information not found\"
229+
7. No citations or references
230+
231+
Example:
232+
Search result: "Amazon's CEO is Andy Jassy as of 2021. Previously, Jeff Bezos was CEO until July 2021."
233+
Direct Answer: Andy Jassy
234+
164235
{extraction_instructions}
165-
Direct Answer:
236+
237+
Current {state.column_name} of {state.target_value}:
166238
"""
167239
logger.info(f"Extracting answer for {state.target_value}")
168240

@@ -181,18 +253,33 @@ async def generate_plan(self, state: EnrichmentContext) -> dict:
181253
Planner node: Uses the LLM to analyze the user's request and input data, and produces a structured plan.
182254
The plan determines the next action (e.g., analyze a URL, search the web, analyze text, etc.).
183255
"""
256+
# Get custom prompt and context for better planning
257+
custom_prompt = state.custom_prompt
258+
context_info = ""
259+
if state.context_values:
260+
context_info = f"Additional context: {state.context_values}"
261+
262+
# Get current year for time-sensitive searches
263+
import datetime
264+
current_year = datetime.datetime.now().year
265+
184266
# Compose a prompt for the LLM to analyze the user's intent and input
185267
prompt = f"""
186268
You are an AI research agent planner. Your job is to analyze the user's enrichment request and input data, and return a JSON plan for the next step.
187269
188270
User's column/question: {state.column_name}
189271
Input source type: {state.input_source_type}
190272
Input data: {state.input_data}
273+
Custom prompt/instruction: {custom_prompt or "None"}
274+
{context_info}
191275
192276
Instructions:
193277
- If the input source type is 'URL', and the question is about analyzing or extracting from that URL, set action to 'analyze_url' and include 'source_url' and 'extraction_instructions'.
194278
- If the input source type is 'TEXT_FROM_COLUMN', and the question is about analyzing or extracting from that text, set action to 'analyze_text' and include 'source_text' and 'extraction_instructions'.
195279
- If the input source type is 'ENTITY' or the question is general, set action to 'search_web' and include 'search_query' and 'extraction_instructions'.
280+
- When creating search_query, ALWAYS include "current {current_year}" or "latest" for time-sensitive information like CEO, leadership, current status, etc.
281+
- For leadership/CEO queries, include both "current" and "{current_year}" in the search query.
282+
- If custom prompt is provided, incorporate it into the search query and extraction instructions.
196283
- Always include a concise 'extraction_instructions' field that tells the next node what to extract or analyze.
197284
- Respond ONLY with a valid JSON object, no explanation.
198285
@@ -205,8 +292,8 @@ async def generate_plan(self, state: EnrichmentContext) -> dict:
205292
or
206293
{{
207294
"action": "search_web",
208-
"search_query": "CEO of Amazon",
209-
"extraction_instructions": "Extract the name of the CEO."
295+
"search_query": "current CEO of Amazon {current_year} latest",
296+
"extraction_instructions": "Extract the name of the current CEO as of {current_year}."
210297
}}
211298
or
212299
{{
@@ -219,15 +306,17 @@ async def generate_plan(self, state: EnrichmentContext) -> dict:
219306
"""
220307
try:
221308
# Use the LLM to generate the plan as a JSON string
309+
#plan_json = await self.llm.generate(prompt)
222310
plan_json = await self.llm.generate(prompt)
223311
import json
224312
plan = json.loads(plan_json)
225313
# Attach the plan to the state
226314
return {"plan": plan}
227315
except Exception as e:
228316
logger.error(f"Error in generate_plan: {str(e)}")
229-
# Fallback: default to web search
230-
return {"plan": {"action": "search_web", "search_query": state.column_name, "extraction_instructions": f"Extract the answer to: {state.column_name}"}}
317+
# Fallback: default to web search with current year
318+
fallback_query = f"current {state.column_name} of {state.target_value} {current_year}"
319+
return {"plan": {"action": "search_web", "search_query": fallback_query, "extraction_instructions": f"Extract the current answer to: {state.column_name}"}}
231320

232321
async def scrape_website(self, state: EnrichmentContext):
233322
"""
@@ -345,7 +434,10 @@ async def enrich_cell_with_graph(
345434
target_value: str,
346435
context_values: Dict[str, str],
347436
tavily_client,
348-
llm_provider: LLMProvider
437+
llm_provider: LLMProvider,
438+
input_source_type: Optional[str] = "ENTITY",
439+
input_data: Optional[str] = None,
440+
custom_prompt: Optional[str] = None
349441
) -> Dict:
350442
"""Helper function to enrich a single cell using langgraph."""
351443
try:
@@ -357,6 +449,9 @@ async def enrich_cell_with_graph(
357449
column_name=column_name,
358450
target_value=target_value,
359451
context_values=context_values,
452+
input_source_type=input_source_type,
453+
input_data=input_data or target_value,
454+
custom_prompt=custom_prompt,
360455
search_result=None,
361456
answer=None
362457
)
@@ -367,7 +462,7 @@ async def enrich_cell_with_graph(
367462
return result #, result['urls']
368463
except Exception as e:
369464
logger.error(f"Error in enrich_cell_with_graph: {str(e)}")
370-
return "Error during enrichment"
465+
return {"answer": "Error during enrichment", "search_result": None}
371466

372467
# Example usage block for running the enrichment pipeline directly
373468
if __name__ == "__main__":
@@ -388,7 +483,7 @@ async def enrich_cell_with_graph(
388483
openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
389484
# Initialize Gemini model (Google Generative AI)
390485
gemini_model = GenerativeModel(model_name="gemini-1.5-flash")
391-
486+
#gemini_model = GenerativeModel(api_key=os.getenv("GEMINI_API_KEY"))
392487
# Example: Create OpenAI provider and pipeline
393488
openai_provider = OpenAIProvider(openai_client)
394489
pipeline_openai = EnrichmentPipeline(tavily_client, openai_provider)

package-lock.json

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ui/.env.development

Lines changed: 0 additions & 4 deletions
This file was deleted.

ui/.env.development.example

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
VITE_API_URL=http://localhost:8000
2+
VITE_WS_URL=ws://localhost:8000
3+
TAVILY_API_KEY=your_tavily_api_key_here
4+
OPENAI_API_KEY=your_openai_api_key_here
5+
VITE_APP_URL=http://localhost:5173

0 commit comments

Comments (0)