4040from intellibricks .llms import Synapse , SynapseCascade
4141from intellibricks .llms .util import get_struct_from_schema
4242from markdownify import markdownify as md
43- from playwright .async_api import async_playwright
43+ from playwright .async_api import async_playwright , Browser
44+ from playwright .async_api ._generated import Playwright as AsyncPlaywright
4445
4546from ._const import SYSTEM_PROMPT
4647from ._types import JsonSchema
@@ -119,15 +120,16 @@ def get_contents(self, url: str) -> str:
119120 """
120121 return run_sync (self .get_contents_async , url )
121122
async def get_contents_async(self, url: str, browser: Browser | None = None) -> str:
    """
    Asynchronously fetch the full HTML content of a webpage through Playwright.

    When ``browser`` is supplied, the page is loaded in that browser and this
    method does not close it. When it is omitted, a Playwright session is
    opened for the duration of the call and the retrieval is delegated to
    ``_get_contents``.

    Args:
        url (str): The URL of the webpage to retrieve.
        browser (Browser | None): Optional Playwright browser instance to reuse.

    Returns:
        str: The HTML content of the webpage.

    Example:
        >>> contents = await webson.get_contents_async("https://example.com")
        >>> print(contents)
    """
    # Guard clause: a caller-provided browser is used as-is and never closed here.
    if browser is not None:
        return await self._get_contents_with_browser(url, browser)

    # No browser supplied: hold a Playwright session only for this call.
    async with async_playwright() as playwright:
        return await self._get_contents(url, playwright)
146+
async def _get_contents(self, url: str, playwright: AsyncPlaywright) -> str:
    """
    Launch a headless Chromium browser, fetch ``url`` with it, then close it.

    Args:
        url (str): The URL of the webpage to retrieve.
        playwright (AsyncPlaywright): Playwright instance used to launch Chromium.

    Returns:
        str: The value returned by ``_get_contents_with_browser`` for ``url``.
    """
    launched = await playwright.chromium.launch(headless=True)
    try:
        html = await self._get_contents_with_browser(url, launched)
    finally:
        # The browser was launched in this method, so it is closed here
        # whether or not the retrieval succeeded.
        await launched.close()
    return html
155+
async def _get_contents_with_browser(self, url: str, browser: Browser) -> str:
    """
    Open a new page in ``browser``, navigate to ``url`` and return its content.

    The page is closed before this method returns or raises; the browser
    itself is not closed here.

    Args:
        url (str): The URL of the webpage to retrieve.
        browser (Browser): Playwright browser instance in which the page is opened.

    Returns:
        str: The content of the page as reported by ``page.content()``.
    """
    tab = await browser.new_page()
    try:
        await tab.goto(url, timeout=self.timeout)
        debug_logger.debug("Getting page contents")
        html = await tab.content()
    finally:
        # Always release the page, even when navigation or reading fails.
        await tab.close()
    return html
148165
149166 def cast [T : msgspec .Struct ](self , url : str , * , to : type [T ]) -> T :
150167 """
0 commit comments