Commit 2363728
refactor(extractor): Enhance HTML processing and base64 image removal
- Consolidate BeautifulSoup operations for more robust HTML processing
- Implement comprehensive base64 image removal strategy with detailed debugging
- Add multiple removal techniques for base64 images in img tags, anchors, and styles
- Improve error handling and type checking during HTML manipulation
- Update example code to use different LLM model and async extraction method
- Add debug print statements to track base64 image removal process
- Refactor main content extraction and tag filtering logic
1 parent 338d7b4 commit 2363728

File tree

1 file changed: +84 -21 lines changed

agentle/web/extractor.py

Lines changed: 84 additions & 21 deletions
@@ -178,45 +178,108 @@ async def extract_markdown_async(
 
     html = await page.content()
 
-    # Process HTML based on preferences
-    if _preferences.remove_base_64_images:
-        import re
-
-        html = re.sub(
-            r'<img[^>]+src="data:image/[^"]+"[^>]*>',
-            "",
-            html,
-            flags=re.IGNORECASE,
-        )
-
-    # Filter HTML by tags if specified
-    if _preferences.include_tags or _preferences.exclude_tags:
+    # Process HTML based on preferences - consolidate all BeautifulSoup operations
+    if (
+        _preferences.remove_base_64_images
+        or _preferences.include_tags
+        or _preferences.exclude_tags
+        or _preferences.only_main_content
+    ):
         from bs4 import BeautifulSoup
 
         soup = BeautifulSoup(html, "html.parser")
 
+        # Remove base64 images first
+        if _preferences.remove_base_64_images:
+            import re
+
+            # Debug: Check what we have before processing
+            all_imgs = soup.find_all("img")
+            print(f"DEBUG: Found {len(all_imgs)} img tags total")
+            base64_count = 0
+            for img in all_imgs:
+                src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                if isinstance(src, str) and "data:image/" in src:
+                    base64_count += 1
+                    print(f"DEBUG: Found base64 img: {src[:100]}...")
+            print(f"DEBUG: {base64_count} images have base64 data")
+
+            # First, remove any anchor tags that contain img children with base64
+            # (must be done before removing img tags themselves)
+            removed_anchors = 0
+            for a_tag in soup.find_all("a"):
+                imgs = a_tag.find_all("img")  # type: ignore[union-attr]
+                for img in imgs:
+                    src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(src, str) and src.startswith("data:image/"):
+                        # Remove the entire anchor tag if it contains base64 image
+                        a_tag.decompose()
+                        removed_anchors += 1
+                        break
+            print(
+                f"DEBUG: Removed {removed_anchors} anchor tags with base64 images"
+            )
+
+            # Remove standalone img tags with base64 src
+            removed_imgs = 0
+            for img in soup.find_all("img"):
+                src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                if isinstance(src, str) and src.startswith("data:image/"):
+                    img.decompose()
+                    removed_imgs += 1
+            print(f"DEBUG: Removed {removed_imgs} standalone img tags")
+
+            # Remove any element with base64 in href (like anchor tags with image data)
+            for elem in soup.find_all(attrs={"href": True}):
+                href = elem.attrs.get("href") if hasattr(elem, "attrs") else None  # type: ignore[union-attr]
+                if isinstance(href, str) and href.startswith("data:image/"):
+                    elem.decompose()
+
+            # Remove any element with base64 in style attribute
+            for elem in soup.find_all(attrs={"style": True}):
+                style = elem.attrs.get("style") if hasattr(elem, "attrs") else None  # type: ignore[union-attr]
+                if isinstance(style, str) and "data:image/" in style:
+                    elem.decompose()
+
+            # Remove SVG tags (they often contain base64 or are converted to base64 by markdown)
+            for svg in soup.find_all("svg"):
+                svg.decompose()
+
+            # Remove any anchor tags that contain SVG children
+            for a_tag in soup.find_all("a"):
+                if a_tag.find("svg"):  # type: ignore[union-attr]
+                    a_tag.decompose()
+
+            # Final check: see if any base64 remains in the HTML string
+            html_str = str(soup)
+            remaining = len(re.findall(r'data:image/[^"\')\s]+', html_str))
+            print(
+                f"DEBUG: After processing, {remaining} base64 data URIs remain in HTML"
+            )
+
+        # Extract main content if requested
         if _preferences.only_main_content:
-            # Try to find main content area
             main_content = (
                 soup.find("main")
                 or soup.find("article")
                 or soup.find("div", {"id": "content"})
                 or soup.find("div", {"class": "content"})
             )
             if main_content:
-                soup = BeautifulSoup(str(main_content), "html.parser")
+                soup = main_content  # type: ignore[assignment]
 
+        # Exclude specific tags
         if _preferences.exclude_tags:
             for tag in _preferences.exclude_tags:
-                for element in soup.find_all(tag):
+                for element in soup.find_all(tag):  # type: ignore[union-attr]
                     element.decompose()
 
+        # Include only specific tags
         if _preferences.include_tags:
-            # Keep only specified tags
             new_soup = BeautifulSoup("", "html.parser")
             for tag in _preferences.include_tags:
-                for element in soup.find_all(tag):
-                    new_soup.append(element)
+                for element in soup.find_all(tag):  # type: ignore[union-attr]
+                    new_soup.append(element)  # type: ignore[arg-type]
             soup = new_soup
 
         html = str(soup)
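
The first hunk swaps a single regex substitution for DOM-level removal. As a rough illustration of what the old re.sub call missed, here is a minimal standalone sketch; it uses only bs4 and re on a made-up HTML snippet and is not part of the agentle codebase:

# Standalone sketch (not from the repository): compare the removed regex with
# DOM-level removal on a made-up snippet containing the cases the regex misses.
import re

from bs4 import BeautifulSoup

sample = (
    '<a href="data:image/png;base64,AAAA"><img src="data:image/png;base64,AAAA"></a>'
    '<div style="background:url(data:image/gif;base64,BBBB)">text</div>'
    '<img src="https://example.com/logo.png">'
)

# Old approach: strips the <img src="data:image/..."> tag itself, but the data
# URIs in the anchor's href and in the div's style attribute survive.
regex_cleaned = re.sub(
    r'<img[^>]+src="data:image/[^"]+"[^>]*>', "", sample, flags=re.IGNORECASE
)
print("data:image/" in regex_cleaned)  # True

# New approach: parse once and decompose offending elements.
soup = BeautifulSoup(sample, "html.parser")
for a_tag in soup.find_all("a"):  # anchors wrapping base64 images go first
    if any(
        str(img.get("src", "")).startswith("data:image/")
        for img in a_tag.find_all("img")
    ):
        a_tag.decompose()
for img in soup.find_all("img"):  # then any remaining base64 images
    if str(img.get("src", "")).startswith("data:image/"):
        img.decompose()
for elem in soup.find_all(attrs={"style": True}):  # then inline-style data URIs
    if "data:image/" in elem["style"]:
        elem.decompose()
print("data:image/" in str(soup))  # False - only the https:// image remains

Note the ordering in both the sketch and the commit: anchors are inspected before the standalone img pass, because once the images are decomposed, a_tag.find_all("img") finds nothing and the now-empty wrapping anchors would survive, which appears to be the reason the commit removes anchors first.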
@@ -315,7 +378,7 @@ class PossiveisRedirecionamentos(BaseModel):
 
     extractor = Extractor(
         llm=Responder.openrouter(),
-        model="openai/gpt-5-nano",
+        model="google/gemini-2.5-flash",
     )
 
     # Example with custom extraction preferences
@@ -330,7 +393,7 @@ class PossiveisRedirecionamentos(BaseModel):
     async with async_api.async_playwright() as p:
         browser = await p.chromium.launch(headless=True)
 
-        result = extractor.extract(
+        result = await extractor.extract_async(
             browser=browser,
             urls=[site_uniube],
             output=PossiveisRedirecionamentos,
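
One smaller change in the first hunk is worth a note: when only_main_content is set, the code now narrows soup to the found Tag directly instead of serialising it and re-parsing. A minimal sketch of why the two are interchangeable for the steps that follow (made-up HTML, not from the repository; the type: ignore is needed because a Tag is not a BeautifulSoup instance):

# Sketch (not from the repository): re-parsing the main-content subtree versus
# using the Tag directly, as the commit now does.
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<body><nav>menu</nav><main><h1>Title</h1><p>Body text</p></main></body>",
    "html.parser",
)
main_content = soup.find("main")

reparsed = BeautifulSoup(str(main_content), "html.parser")  # old: serialise + parse again
narrowed = main_content                                     # new: keep the Tag as-is

# Both expose the same subtree to the later find_all() filtering and str() call,
# but the second skips a full re-parse of the serialised HTML.
assert str(reparsed) == str(narrowed)
print(type(reparsed).__name__, type(narrowed).__name__)  # BeautifulSoup Tag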
