@@ -178,45 +178,108 @@ async def extract_markdown_async(
 
     html = await page.content()
 
-    # Process HTML based on preferences
-    if _preferences.remove_base_64_images:
-        import re
-
-        html = re.sub(
-            r'<img[^>]+src="data:image/[^"]+"[^>]*>',
-            "",
-            html,
-            flags=re.IGNORECASE,
-        )
-
-    # Filter HTML by tags if specified
-    if _preferences.include_tags or _preferences.exclude_tags:
+    # Process HTML based on preferences - consolidate all BeautifulSoup operations
+    if (
+        _preferences.remove_base_64_images
+        or _preferences.include_tags
+        or _preferences.exclude_tags
+        or _preferences.only_main_content
+    ):
         from bs4 import BeautifulSoup
 
         soup = BeautifulSoup(html, "html.parser")
 
+        # Remove base64 images first
+        if _preferences.remove_base_64_images:
+            import re
+
+            # Debug: Check what we have before processing
+            all_imgs = soup.find_all("img")
+            print(f"DEBUG: Found {len(all_imgs)} img tags total")
+            base64_count = 0
+            for img in all_imgs:
+                src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                if isinstance(src, str) and "data:image/" in src:
+                    base64_count += 1
+                    print(f"DEBUG: Found base64 img: {src[:100]}...")
+            print(f"DEBUG: {base64_count} images have base64 data")
+
+            # First, remove any anchor tags that contain img children with base64
+            # (must be done before removing the img tags themselves)
+            removed_anchors = 0
+            for a_tag in soup.find_all("a"):
+                imgs = a_tag.find_all("img")  # type: ignore[union-attr]
+                for img in imgs:
+                    src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                    if isinstance(src, str) and src.startswith("data:image/"):
+                        # Remove the entire anchor tag if it contains a base64 image
+                        a_tag.decompose()
+                        removed_anchors += 1
+                        break
+            print(
+                f"DEBUG: Removed {removed_anchors} anchor tags with base64 images"
+            )
+
+            # Remove standalone img tags with base64 src
+            removed_imgs = 0
+            for img in soup.find_all("img"):
+                src = img.attrs.get("src") if hasattr(img, "attrs") else None  # type: ignore[union-attr]
+                if isinstance(src, str) and src.startswith("data:image/"):
+                    img.decompose()
+                    removed_imgs += 1
+            print(f"DEBUG: Removed {removed_imgs} standalone img tags")
+
+            # Remove any element with base64 in href (like anchor tags with image data)
+            for elem in soup.find_all(attrs={"href": True}):
+                href = elem.attrs.get("href") if hasattr(elem, "attrs") else None  # type: ignore[union-attr]
+                if isinstance(href, str) and href.startswith("data:image/"):
+                    elem.decompose()
+
+            # Remove any element with base64 in the style attribute
+            for elem in soup.find_all(attrs={"style": True}):
+                style = elem.attrs.get("style") if hasattr(elem, "attrs") else None  # type: ignore[union-attr]
+                if isinstance(style, str) and "data:image/" in style:
+                    elem.decompose()
+
+            # Remove SVG tags (they often contain base64 or are converted to base64 by markdown)
+            for svg in soup.find_all("svg"):
+                svg.decompose()
+
+            # Remove any anchor tags that contain SVG children
+            for a_tag in soup.find_all("a"):
+                if a_tag.find("svg"):  # type: ignore[union-attr]
+                    a_tag.decompose()
+
+            # Final check: see if any base64 remains in the HTML string
+            html_str = str(soup)
+            remaining = len(re.findall(r'data:image/[^"\')\s]+', html_str))
+            print(
+                f"DEBUG: After processing, {remaining} base64 data URIs remain in HTML"
+            )
+
+        # Extract main content if requested
         if _preferences.only_main_content:
-            # Try to find main content area
             main_content = (
                 soup.find("main")
                 or soup.find("article")
                 or soup.find("div", {"id": "content"})
                 or soup.find("div", {"class": "content"})
             )
             if main_content:
-                soup = BeautifulSoup(str(main_content), "html.parser")
+                soup = main_content  # type: ignore[assignment]
 
+        # Exclude specific tags
         if _preferences.exclude_tags:
             for tag in _preferences.exclude_tags:
-                for element in soup.find_all(tag):
+                for element in soup.find_all(tag):  # type: ignore[union-attr]
                     element.decompose()
 
+        # Include only specific tags
         if _preferences.include_tags:
-            # Keep only specified tags
             new_soup = BeautifulSoup("", "html.parser")
             for tag in _preferences.include_tags:
-                for element in soup.find_all(tag):
-                    new_soup.append(element)
+                for element in soup.find_all(tag):  # type: ignore[union-attr]
+                    new_soup.append(element)  # type: ignore[arg-type]
             soup = new_soup
 
         html = str(soup)
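
For quick verification outside the pipeline, here is a minimal standalone sketch of the same stripping pass. It assumes only that beautifulsoup4 is installed; the sample HTML and variable names are illustrative and not part of the patch:

```python
from bs4 import BeautifulSoup

# Illustrative input: one base64 image wrapped in an anchor, one normal image, one SVG.
sample = (
    '<main><p>kept</p>'
    '<a href="#"><img src="data:image/png;base64,AAAA"></a>'
    '<img src="https://example.com/logo.png">'
    '<svg><circle r="1"/></svg></main>'
)

soup = BeautifulSoup(sample, "html.parser")

# Drop anchors wrapping base64 images before touching the images themselves,
# otherwise empty <a> shells would survive into the markdown output.
for a_tag in soup.find_all("a"):
    if any(
        str(img.get("src", "")).startswith("data:image/")
        for img in a_tag.find_all("img")
    ):
        a_tag.decompose()

# Then remove remaining standalone base64 images and all SVGs.
for img in soup.find_all("img"):
    if str(img.get("src", "")).startswith("data:image/"):
        img.decompose()
for svg in soup.find_all("svg"):
    svg.decompose()

print(str(soup))
# -> <main><p>kept</p><img src="https://example.com/logo.png"/></main>
```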
@@ -315,7 +378,7 @@ class PossiveisRedirecionamentos(BaseModel):
 
     extractor = Extractor(
         llm=Responder.openrouter(),
-        model="openai/gpt-5-nano",
+        model="google/gemini-2.5-flash",
     )
 
     # Example with custom extraction preferences
@@ -330,7 +393,7 @@ class PossiveisRedirecionamentos(BaseModel):
 
     async with async_api.async_playwright() as p:
         browser = await p.chromium.launch(headless=True)
-        result = extractor.extract(
+        result = await extractor.extract_async(
             browser=browser,
             urls=[site_uniube],
             output=PossiveisRedirecionamentos,
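
For reference, a hedged end-to-end sketch of the new async call path. The import path, the output model's field, and the URL are assumptions; `extract_async`, `Responder.openrouter()`, the model string, and the class name `PossiveisRedirecionamentos` come from the diff above:

```python
import asyncio

from playwright import async_api
from pydantic import BaseModel

from extractor import Extractor, Responder  # hypothetical import path


class PossiveisRedirecionamentos(BaseModel):
    redirecionamentos: list[str]  # illustrative field; the real schema is not shown


async def main() -> None:
    extractor = Extractor(
        llm=Responder.openrouter(),
        model="google/gemini-2.5-flash",
    )
    async with async_api.async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # The new API is awaited, unlike the old synchronous extractor.extract()
        result = await extractor.extract_async(
            browser=browser,
            urls=["https://www.uniube.br"],  # stand-in for site_uniube
            output=PossiveisRedirecionamentos,
        )
        print(result)


asyncio.run(main())
```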