Skip to content

Commit e3c5fdc

Browse files
committed
feat: new parameter to reuse a browser
1 parent f00fa88 commit e3c5fdc

File tree

2 files changed

+29
-12
lines changed

2 files changed

+29
-12
lines changed

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Changelog
22

3-
## v0.0.4
3+
## v0.1.0
44

5-
-
5+
- feat: new parameter to allow reusing an open browser.

src/webson/_webson.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@
4040
from intellibricks.llms import Synapse, SynapseCascade
4141
from intellibricks.llms.util import get_struct_from_schema
4242
from markdownify import markdownify as md
43-
from playwright.async_api import async_playwright
43+
from playwright.async_api import async_playwright, Browser
44+
from playwright.async_api._generated import Playwright as AsyncPlaywright
4445

4546
from ._const import SYSTEM_PROMPT
4647
from ._types import JsonSchema
@@ -119,15 +120,16 @@ def get_contents(self, url: str) -> str:
119120
"""
120121
return run_sync(self.get_contents_async, url)
121122

122-
async def get_contents_async(self, url: str) -> str:
123+
async def get_contents_async(self, url: str, browser: Browser | None = None) -> str:
123124
"""
124125
Asynchronously retrieves the entire HTML content of a webpage using Playwright.
125126
126-
This method launches a headless Chromium browser, opens a new page,
127+
This method launches a headless Chromium browser (unless an existing browser is provided), opens a new page,
127128
navigates to the specified URL, and returns the page's HTML content.
128129
129130
Args:
130131
url (str): The URL of the webpage to retrieve.
132+
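browser (Browser | None): Optional Playwright browser instance to reuse. When provided, only the page opened for this call is closed; the browser itself is left open for the caller to manage.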
131133
132134
Returns:
133135
str: The HTML content of the webpage.
@@ -136,15 +138,30 @@ async def get_contents_async(self, url: str) -> str:
136138
>>> contents = await webson.get_contents_async("https://example.com")
137139
>>> print(contents)
138140
"""
139-
async with async_playwright() as p:
140-
chromium = p.chromium
141-
browser = await chromium.launch(headless=True)
142-
page = await browser.new_page()
141+
if browser is None:
142+
async with async_playwright() as p:
143+
return await self._get_contents(url, p)
144+
else:
145+
return await self._get_contents_with_browser(url, browser)
146+
147+
async def _get_contents(self, url: str, playwright: AsyncPlaywright) -> str:
148+
"""Helper method to handle content retrieval with a new Playwright instance."""
149+
chromium = playwright.chromium
150+
browser = await chromium.launch(headless=True)
151+
try:
152+
return await self._get_contents_with_browser(url, browser)
153+
finally:
154+
await browser.close()
155+
156+
async def _get_contents_with_browser(self, url: str, browser: Browser) -> str:
157+
"""Helper method to retrieve content using an existing browser instance."""
158+
page = await browser.new_page()
159+
try:
143160
await page.goto(url, timeout=self.timeout)
144161
debug_logger.debug("Getting page contents")
145-
contents = await page.content()
146-
await browser.close() # Good practice to close the browser
147-
return contents
162+
return await page.content()
163+
finally:
164+
await page.close()
148165

149166
def cast[T: msgspec.Struct](self, url: str, *, to: type[T]) -> T:
150167
"""

0 commit comments

Comments
 (0)