diff --git a/docs/docs/playwright-web/Examples.md b/docs/docs/playwright-web/Examples.md index ff60264..88528f4 100644 --- a/docs/docs/playwright-web/Examples.md +++ b/docs/docs/playwright-web/Examples.md @@ -116,6 +116,26 @@ When I extract the HTML content of the page Then I should receive the complete HTML structure of the page ``` +You can also filter HTML content for easier analysis: + +```bdd +Given I navigate to website "https://example.com/products" +When I extract the HTML content of the page filtered to remove scripts and styles +Then I should receive clean HTML without JavaScript or CSS code + +Given I navigate to website "https://example.com/products" +When I extract the HTML content of the page filtered to remove meta tags +Then I should receive HTML without metadata like charset, viewport, and SEO tags + +Given I navigate to website "https://example.com/products" +When I extract the HTML content using the cleanHtml option +Then I should receive a clean version of the HTML without scripts, styles, comments, and meta tags + +Given I navigate to website "https://example.com/products" +When I extract only the HTML for the main product container using selector "#product-listings" +Then I should receive just the HTML for the products section for easier analysis +``` + Example use case for content analysis: ```bdd diff --git a/docs/docs/playwright-web/Supported-Tools.mdx b/docs/docs/playwright-web/Supported-Tools.mdx index 155a743..b6cfd67 100644 --- a/docs/docs/playwright-web/Supported-Tools.mdx +++ b/docs/docs/playwright-web/Supported-Tools.mdx @@ -293,9 +293,25 @@ Get the visible text content of the current page. ### playwright_get_visible_html Get the HTML content of the current page. +- **Inputs:** + - **`selector`** *(string, optional)*: + CSS selector to limit the HTML to a specific container. If provided, only returns the HTML for that element. 
+ - **`removeScripts`** *(boolean, optional, default: false)*: + Remove all script tags from the HTML to reduce noise. + - **`removeComments`** *(boolean, optional, default: false)*: + Remove all HTML comments to clean up the output. + - **`removeStyles`** *(boolean, optional, default: false)*: + Remove all style tags from the HTML. + - **`removeMeta`** *(boolean, optional, default: false)*: + Remove all meta tags from the HTML head section. + - **`minify`** *(boolean, optional, default: false)*: + Minify the HTML output by removing extra whitespace. + - **`cleanHtml`** *(boolean, optional, default: false)*: + Convenience option that combines removeScripts, removeComments, removeStyles, and removeMeta for a cleaner HTML output. + - **Response:** - **`content`** *(string)*: - The complete HTML content of the current page. + The HTML content of the current page, optionally filtered based on the provided parameters. --- diff --git a/src/__tests__/tools/browser/visiblePage.test.ts b/src/__tests__/tools/browser/visiblePage.test.ts index 0c1a2be..d7f18e9 100644 --- a/src/__tests__/tools/browser/visiblePage.test.ts +++ b/src/__tests__/tools/browser/visiblePage.test.ts @@ -1,17 +1,19 @@ import { VisibleTextTool, VisibleHtmlTool } from '../../../tools/browser/visiblePage.js'; import { ToolContext } from '../../../tools/common/types.js'; -import { Page, Browser } from 'playwright'; +import { Page, Browser, ElementHandle } from 'playwright'; import { jest } from '@jest/globals'; // Mock the Page object -const mockEvaluate = jest.fn(); +const mockEvaluate = jest.fn() as jest.MockedFunction<(pageFunction: Function | string, arg?: any) => Promise<any>>; const mockContent = jest.fn(); const mockIsClosed = jest.fn().mockReturnValue(false); +const mock$ = jest.fn() as jest.MockedFunction<(selector: string) => Promise<ElementHandle | null>>; const mockPage = { evaluate: mockEvaluate, content: mockContent, - isClosed: mockIsClosed + isClosed: mockIsClosed, + $: mock$ } as unknown as Page; // Mock the browser 
@@ -135,6 +137,128 @@ describe('VisibleHtmlTool', () => { expect(result.content[0].text).toContain('Sample HTML content'); }); + test('should supply the correct filters', async () => { + const args = { + removeScripts: true, + removeComments: true, + removeStyles: true, + removeMeta: true, + minify: true, + cleanHtml: true + }; + + // Mock the page.evaluate to capture the filter arguments + mockEvaluate.mockImplementationOnce((callback, params) => { + expect(params).toEqual({ + html: 'Sample HTML content', + removeScripts: true, + removeComments: true, + removeStyles: true, + removeMeta: true, + minify: true + }); + return Promise.resolve('Processed HTML content'); + }); + + const result = await visibleHtmlTool.execute(args, mockContext); + + expect(mockContent).toHaveBeenCalled(); + expect(mockEvaluate).toHaveBeenCalled(); + expect(result.isError).toBe(false); + expect(result.content[0].text).toContain('HTML content'); + expect(result.content[0].text).toContain('Processed HTML content'); + }); + + test('should handle individual filter combinations', async () => { + const args = { + removeScripts: true, + minify: true + }; + + // Mock content to return HTML + mockContent.mockImplementationOnce(() => + Promise.resolve('Sample HTML content') + ); + + mockEvaluate.mockImplementationOnce((callback, params: any) => { + expect(params).toEqual({ + html: 'Sample HTML content', + removeScripts: true, + removeComments: undefined, + removeStyles: undefined, + removeMeta: undefined, + minify: true + }); + return Promise.resolve('Filtered content'); + }); + + const result = await visibleHtmlTool.execute(args, mockContext); + expect(result.isError).toBe(false); + expect(result.content[0].text).toContain('Filtered content'); + }); + + test('should handle selector parameter', async () => { + const args = { + selector: '#main-content', + removeScripts: true + }; + + // Mock element selection + const mockElement = { + outerHTML: '
Selected content
' + } as unknown as ElementHandle; + mock$.mockResolvedValueOnce(mockElement); + + // Mock evaluate for filtering + mockEvaluate.mockImplementation((_: any, params: any) => + Promise.resolve('
Processed selected content
') + ); + + const result = await visibleHtmlTool.execute(args, mockContext); + expect(mock$).toHaveBeenCalledWith('#main-content'); + expect(result.isError).toBe(false); + expect(result.content[0].text).toContain('Processed selected content'); + }); + + test('should handle empty HTML content', async () => { + const args = { + removeScripts: true + }; + + // Mock content to return empty HTML + mockContent.mockImplementationOnce(() => Promise.resolve('')); + + mockEvaluate.mockImplementationOnce((callback, params: any) => { + expect(params.html).toBe(''); + return Promise.resolve(''); + }); + + const result = await visibleHtmlTool.execute(args, mockContext); + expect(result.isError).toBe(false); + expect(result.content[0].text).toContain('HTML content'); + }); + + test('should handle cleanHtml flag setting all filters', async () => { + const args = { + cleanHtml: true + }; + + mockEvaluate.mockImplementationOnce((callback, params) => { + expect(params).toEqual({ + html: 'Sample HTML content', + removeScripts: true, + removeComments: true, + removeStyles: true, + removeMeta: true, + minify: undefined + }); + return Promise.resolve('Processed HTML content'); + }); + + const result = await visibleHtmlTool.execute(args, mockContext); + expect(result.isError).toBe(false); + }); + test('should handle missing page', async () => { const args = {}; diff --git a/src/tools.ts b/src/tools.ts index 937491c..5093a9f 100644 --- a/src/tools.ts +++ b/src/tools.ts @@ -341,7 +341,15 @@ export function createToolDefinitions() { description: "Get the HTML content of the current page", inputSchema: { type: "object", - properties: {}, + properties: { + selector: { type: "string", description: "CSS selector to limit the HTML to a specific container" }, + removeScripts: { type: "boolean", description: "Remove all script tags from the HTML (default: false)" }, + removeComments: { type: "boolean", description: "Remove all HTML comments (default: false)" }, + removeStyles: { type: "boolean", 
description: "Remove all style tags from the HTML (default: false)" }, + removeMeta: { type: "boolean", description: "Remove all meta tags from the HTML (default: false)" }, + cleanHtml: { type: "boolean", description: "Perform comprehensive HTML cleaning (default: false)" }, + minify: { type: "boolean", description: "Minify the HTML output (default: false)" } + }, required: [], }, }, @@ -472,4 +480,4 @@ export const tools = [ ...BROWSER_TOOLS, ...API_TOOLS, ...CODEGEN_TOOLS -]; \ No newline at end of file +]; diff --git a/src/tools/browser/visiblePage.ts b/src/tools/browser/visiblePage.ts index a837894..6e99c78 100644 --- a/src/tools/browser/visiblePage.ts +++ b/src/tools/browser/visiblePage.ts @@ -83,7 +83,93 @@ export class VisibleHtmlTool extends BrowserToolBase { } return this.safeExecute(context, async (page) => { try { - const htmlContent = await page!.content(); + const { selector, removeScripts, removeComments, removeStyles, removeMeta, minify, cleanHtml } = args; + + // Get the HTML content + let htmlContent: string; + + if (selector) { + // If a selector is provided, get only the HTML for that element + const element = await page.$(selector); + if (!element) { + return createErrorResponse(`Element with selector "${selector}" not found`); + } + htmlContent = await page.evaluate((el) => el.outerHTML, element); + } else { + // Otherwise get the full page HTML + htmlContent = await page.content(); + } + + // Determine if we need to apply filters + const shouldRemoveScripts = removeScripts || cleanHtml; + const shouldRemoveComments = removeComments || cleanHtml; + const shouldRemoveStyles = removeStyles || cleanHtml; + const shouldRemoveMeta = removeMeta || cleanHtml; + + // Apply filters in the browser context + if (shouldRemoveScripts || shouldRemoveComments || shouldRemoveStyles || shouldRemoveMeta || minify) { + htmlContent = await page.evaluate( + ({ html, removeScripts, removeComments, removeStyles, removeMeta, minify }) => { + // Create a DOM parser to 
work with the HTML + const parser = new DOMParser(); + const doc = parser.parseFromString(html, 'text/html'); + + // Remove script tags if requested + if (removeScripts) { + const scripts = doc.querySelectorAll('script'); + scripts.forEach(script => script.remove()); + } + + // Remove style tags if requested + if (removeStyles) { + const styles = doc.querySelectorAll('style'); + styles.forEach(style => style.remove()); + } + + // Remove meta tags if requested + if (removeMeta) { + const metaTags = doc.querySelectorAll('meta'); + metaTags.forEach(meta => meta.remove()); + } + + // Remove HTML comments if requested + if (removeComments) { + const removeComments = (node) => { + const childNodes = node.childNodes; + for (let i = childNodes.length - 1; i >= 0; i--) { + const child = childNodes[i]; + if (child.nodeType === 8) { // 8 is for comment nodes + node.removeChild(child); + } else if (child.nodeType === 1) { // 1 is for element nodes + removeComments(child); + } + } + }; + removeComments(doc.documentElement); + } + + // Get the processed HTML + let result = doc.documentElement.outerHTML; + + // Minify if requested + if (minify) { + // Simple minification: remove extra whitespace + result = result.replace(/>\s+</g, '><').trim(); + } + + return result; + }, + { + html: htmlContent, + removeScripts: shouldRemoveScripts, + removeComments: shouldRemoveComments, + removeStyles: shouldRemoveStyles, + removeMeta: shouldRemoveMeta, + minify + } + ); + } + return createSuccessResponse(`HTML content:\n${htmlContent}`); } catch (error) { return createErrorResponse(`Failed to get visible HTML content: ${(error as Error).message}`);