Skip to content

Commit 7993f03

Browse files
Merge pull request #121 from SethPaul/filter-html-options
allow filtering of html
2 parents 39f2277 + b779b7d commit 7993f03

File tree

5 files changed

+261
-7
lines changed

5 files changed

+261
-7
lines changed

docs/docs/playwright-web/Examples.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,26 @@ When I extract the HTML content of the page
116116
Then I should receive the complete HTML structure of the page
117117
```
118118

119+
You can also filter HTML content for easier analysis:
120+
121+
```bdd
122+
Given I navigate to website "https://example.com/products"
123+
When I extract the HTML content of the page filtered to remove scripts and styles
124+
Then I should receive clean HTML without JavaScript or CSS code
125+
126+
Given I navigate to website "https://example.com/products"
127+
When I extract the HTML content of the page filtered to remove meta tags
128+
Then I should receive HTML without metadata like charset, viewport, and SEO tags
129+
130+
Given I navigate to website "https://example.com/products"
131+
When I extract the HTML content using the cleanHtml option
132+
Then I should receive a clean version of the HTML without scripts, styles, comments, and meta tags
133+
134+
Given I navigate to website "https://example.com/products"
135+
When I extract only the HTML for the main product container using selector "#product-listings"
136+
Then I should receive just the HTML for the products section for easier analysis
137+
```
138+
119139
Example use case for content analysis:
120140

121141
```bdd

docs/docs/playwright-web/Supported-Tools.mdx

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,9 +304,25 @@ Get the visible text content of the current page.
304304
### playwright_get_visible_html
305305
Get the HTML content of the current page.
306306

307+
- **Inputs:**
308+
- **`selector`** *(string, optional)*:
309+
CSS selector to limit the HTML to a specific container. If provided, only the HTML of the first matching element is returned; if no element matches, the tool returns an error.
310+
- **`removeScripts`** *(boolean, optional, default: false)*:
311+
Remove all script tags from the HTML to reduce noise.
312+
- **`removeComments`** *(boolean, optional, default: false)*:
313+
Remove all HTML comments to clean up the output.
314+
- **`removeStyles`** *(boolean, optional, default: false)*:
315+
Remove all style tags from the HTML.
316+
- **`removeMeta`** *(boolean, optional, default: false)*:
317+
Remove all meta tags from the HTML, wherever they appear in the document.
318+
- **`minify`** *(boolean, optional, default: false)*:
319+
Minify the HTML output by removing whitespace between adjacent tags and trimming the result.
320+
- **`cleanHtml`** *(boolean, optional, default: false)*:
321+
Convenience option that enables `removeScripts`, `removeComments`, `removeStyles`, and `removeMeta` together for cleaner HTML output. It does not enable `minify`.
322+
307323
- **Response:**
308324
- **`content`** *(string)*:
309-
The complete HTML content of the current page.
325+
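The HTML content of the current page (or of the element matched by `selector`), optionally filtered based on the provided parameters. When any filter or `minify` is applied, the HTML is re-parsed as a full document and the serialized `<html>` element is returned.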
310326

311327
---
312328

src/__tests__/tools/browser/visiblePage.test.ts

Lines changed: 127 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
import { VisibleTextTool, VisibleHtmlTool } from '../../../tools/browser/visiblePage.js';
22
import { ToolContext } from '../../../tools/common/types.js';
3-
import { Page, Browser } from 'playwright';
3+
import { Page, Browser, ElementHandle } from 'playwright';
44
import { jest } from '@jest/globals';
55

66
// Mock the Page object
7-
const mockEvaluate = jest.fn();
7+
const mockEvaluate = jest.fn() as jest.MockedFunction<(pageFunction: Function | string, arg?: any) => Promise<any>>;
88
const mockContent = jest.fn();
99
const mockIsClosed = jest.fn().mockReturnValue(false);
10+
const mock$ = jest.fn() as jest.MockedFunction<(selector: string) => Promise<ElementHandle | null>>;
1011

1112
const mockPage = {
1213
evaluate: mockEvaluate,
1314
content: mockContent,
14-
isClosed: mockIsClosed
15+
isClosed: mockIsClosed,
16+
$: mock$
1517
} as unknown as Page;
1618

1719
// Mock the browser
@@ -135,6 +137,128 @@ describe('VisibleHtmlTool', () => {
135137
expect(result.content[0].text).toContain('<html><body>Sample HTML content</body></html>');
136138
});
137139

140+
test('should supply the correct filters', async () => {
141+
const args = {
142+
removeScripts: true,
143+
removeComments: true,
144+
removeStyles: true,
145+
removeMeta: true,
146+
minify: true,
147+
cleanHtml: true
148+
};
149+
150+
// Mock the page.evaluate to capture the filter arguments
151+
mockEvaluate.mockImplementationOnce((callback, params) => {
152+
expect(params).toEqual({
153+
html: '<html><body>Sample HTML content</body></html>',
154+
removeScripts: true,
155+
removeComments: true,
156+
removeStyles: true,
157+
removeMeta: true,
158+
minify: true
159+
});
160+
return Promise.resolve('<html><body>Processed HTML content</body></html>');
161+
});
162+
163+
const result = await visibleHtmlTool.execute(args, mockContext);
164+
165+
expect(mockContent).toHaveBeenCalled();
166+
expect(mockEvaluate).toHaveBeenCalled();
167+
expect(result.isError).toBe(false);
168+
expect(result.content[0].text).toContain('HTML content');
169+
expect(result.content[0].text).toContain('Processed HTML content');
170+
});
171+
172+
test('should handle individual filter combinations', async () => {
173+
const args = {
174+
removeScripts: true,
175+
minify: true
176+
};
177+
178+
// Mock content to return HTML
179+
mockContent.mockImplementationOnce(() =>
180+
Promise.resolve('<html><body>Sample HTML content</body></html>')
181+
);
182+
183+
mockEvaluate.mockImplementationOnce((callback, params: any) => {
184+
expect(params).toEqual({
185+
html: '<html><body>Sample HTML content</body></html>',
186+
removeScripts: true,
187+
removeComments: undefined,
188+
removeStyles: undefined,
189+
removeMeta: undefined,
190+
minify: true
191+
});
192+
return Promise.resolve('<html><body>Filtered content</body></html>');
193+
});
194+
195+
const result = await visibleHtmlTool.execute(args, mockContext);
196+
expect(result.isError).toBe(false);
197+
expect(result.content[0].text).toContain('Filtered content');
198+
});
199+
200+
test('should handle selector parameter', async () => {
201+
const args = {
202+
selector: '#main-content',
203+
removeScripts: true
204+
};
205+
206+
// Mock element selection
207+
const mockElement = {
208+
outerHTML: '<div id="main-content">Selected content</div>'
209+
} as unknown as ElementHandle<Element>;
210+
mock$.mockResolvedValueOnce(mockElement);
211+
212+
// Mock evaluate for filtering
213+
mockEvaluate.mockImplementation((_: any, params: any) =>
214+
Promise.resolve('<div>Processed selected content</div>')
215+
);
216+
217+
const result = await visibleHtmlTool.execute(args, mockContext);
218+
expect(mock$).toHaveBeenCalledWith('#main-content');
219+
expect(result.isError).toBe(false);
220+
expect(result.content[0].text).toContain('Processed selected content');
221+
});
222+
223+
test('should handle empty HTML content', async () => {
224+
const args = {
225+
removeScripts: true
226+
};
227+
228+
// Mock content to return empty HTML
229+
mockContent.mockImplementationOnce(() => Promise.resolve(''));
230+
231+
mockEvaluate.mockImplementationOnce((callback, params: any) => {
232+
expect(params.html).toBe('');
233+
return Promise.resolve('');
234+
});
235+
236+
const result = await visibleHtmlTool.execute(args, mockContext);
237+
expect(result.isError).toBe(false);
238+
expect(result.content[0].text).toContain('HTML content');
239+
});
240+
241+
test('should handle cleanHtml flag setting all filters', async () => {
242+
const args = {
243+
cleanHtml: true
244+
};
245+
246+
mockEvaluate.mockImplementationOnce((callback, params) => {
247+
expect(params).toEqual({
248+
html: '<html><body>Sample HTML content</body></html>',
249+
removeScripts: true,
250+
removeComments: true,
251+
removeStyles: true,
252+
removeMeta: true,
253+
minify: undefined
254+
});
255+
return Promise.resolve('<html><body>Processed HTML content</body></html>');
256+
});
257+
258+
const result = await visibleHtmlTool.execute(args, mockContext);
259+
expect(result.isError).toBe(false);
260+
});
261+
138262
test('should handle missing page', async () => {
139263
const args = {};
140264

src/tools.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,15 @@ export function createToolDefinitions() {
353353
description: "Get the HTML content of the current page",
354354
inputSchema: {
355355
type: "object",
356-
properties: {},
356+
properties: {
357+
selector: { type: "string", description: "CSS selector to limit the HTML to a specific container" },
358+
removeScripts: { type: "boolean", description: "Remove all script tags from the HTML (default: false)" },
359+
removeComments: { type: "boolean", description: "Remove all HTML comments (default: false)" },
360+
removeStyles: { type: "boolean", description: "Remove all style tags from the HTML (default: false)" },
361+
removeMeta: { type: "boolean", description: "Remove all meta tags from the HTML (default: false)" },
362+
cleanHtml: { type: "boolean", description: "Perform comprehensive HTML cleaning (default: false)" },
363+
minify: { type: "boolean", description: "Minify the HTML output (default: false)" }
364+
},
357365
required: [],
358366
},
359367
},
@@ -485,4 +493,4 @@ export const tools = [
485493
...BROWSER_TOOLS,
486494
...API_TOOLS,
487495
...CODEGEN_TOOLS
488-
];
496+
];

src/tools/browser/visiblePage.ts

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,93 @@ export class VisibleHtmlTool extends BrowserToolBase {
8383
}
8484
return this.safeExecute(context, async (page) => {
8585
try {
86-
const htmlContent = await page!.content();
86+
const { selector, removeScripts, removeComments, removeStyles, removeMeta, minify, cleanHtml } = args;
87+
88+
// Get the HTML content
89+
let htmlContent: string;
90+
91+
if (selector) {
92+
// If a selector is provided, get only the HTML for that element
93+
const element = await page.$(selector);
94+
if (!element) {
95+
return createErrorResponse(`Element with selector "${selector}" not found`);
96+
}
97+
htmlContent = await page.evaluate((el) => el.outerHTML, element);
98+
} else {
99+
// Otherwise get the full page HTML
100+
htmlContent = await page.content();
101+
}
102+
103+
// Determine if we need to apply filters
104+
const shouldRemoveScripts = removeScripts || cleanHtml;
105+
const shouldRemoveComments = removeComments || cleanHtml;
106+
const shouldRemoveStyles = removeStyles || cleanHtml;
107+
const shouldRemoveMeta = removeMeta || cleanHtml;
108+
109+
// Apply filters in the browser context
110+
if (shouldRemoveScripts || shouldRemoveComments || shouldRemoveStyles || shouldRemoveMeta || minify) {
111+
htmlContent = await page.evaluate(
112+
({ html, removeScripts, removeComments, removeStyles, removeMeta, minify }) => {
113+
// Create a DOM parser to work with the HTML
114+
const parser = new DOMParser();
115+
const doc = parser.parseFromString(html, 'text/html');
116+
117+
// Remove script tags if requested
118+
if (removeScripts) {
119+
const scripts = doc.querySelectorAll('script');
120+
scripts.forEach(script => script.remove());
121+
}
122+
123+
// Remove style tags if requested
124+
if (removeStyles) {
125+
const styles = doc.querySelectorAll('style');
126+
styles.forEach(style => style.remove());
127+
}
128+
129+
// Remove meta tags if requested
130+
if (removeMeta) {
131+
const metaTags = doc.querySelectorAll('meta');
132+
metaTags.forEach(meta => meta.remove());
133+
}
134+
135+
// Remove HTML comments if requested
136+
if (removeComments) {
137+
const removeComments = (node) => {
138+
const childNodes = node.childNodes;
139+
for (let i = childNodes.length - 1; i >= 0; i--) {
140+
const child = childNodes[i];
141+
if (child.nodeType === 8) { // 8 is for comment nodes
142+
node.removeChild(child);
143+
} else if (child.nodeType === 1) { // 1 is for element nodes
144+
removeComments(child);
145+
}
146+
}
147+
};
148+
removeComments(doc.documentElement);
149+
}
150+
151+
// Get the processed HTML
152+
let result = doc.documentElement.outerHTML;
153+
154+
// Minify if requested
155+
if (minify) {
156+
// Simple minification: remove extra whitespace
157+
result = result.replace(/>\s+</g, '><').trim();
158+
}
159+
160+
return result;
161+
},
162+
{
163+
html: htmlContent,
164+
removeScripts: shouldRemoveScripts,
165+
removeComments: shouldRemoveComments,
166+
removeStyles: shouldRemoveStyles,
167+
removeMeta: shouldRemoveMeta,
168+
minify
169+
}
170+
);
171+
}
172+
87173
return createSuccessResponse(`HTML content:\n${htmlContent}`);
88174
} catch (error) {
89175
return createErrorResponse(`Failed to get visible HTML content: ${(error as Error).message}`);

0 commit comments

Comments
 (0)