diff --git a/packages/code-infra/package.json b/packages/code-infra/package.json
index 3fd013108..3615c0614 100644
--- a/packages/code-infra/package.json
+++ b/packages/code-infra/package.json
@@ -111,6 +111,7 @@
     "git-url-parse": "^16.1.0",
     "globals": "^17.4.0",
     "globby": "^16.1.1",
+    "html-validate": "^10.11.2",
    "minimatch": "^10.2.4",
     "node-html-parser": "^7.1.0",
     "open": "^11.0.0",
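The new html-validate dependency is consumed through its programmatic API rather than its CLI. As an illustration only (not part of the diff), the snippet below mirrors the resolver and loader setup that crawlWorker.mjs introduces further down; the markup string is invented, and the loader config literal is an assumption standing in for the checker's htmlValidate option.

// Illustrative sketch, not part of the diff. Mirrors the html-validate API usage
// in crawlWorker.mjs below; the markup string and loader config are examples.
import { HtmlValidate, StaticConfigLoader, staticResolver } from 'html-validate';

const resolver = staticResolver({
  configs: {
    'mui:recommended': {
      extends: ['html-validate:standard', 'html-validate:document', 'html-validate:browser'],
      rules: { 'require-sri': 'off' },
    },
  },
});

const htmlValidator = new HtmlValidate(
  new StaticConfigLoader([resolver], { extends: ['mui:recommended'] }),
);

// A duplicated id and an unescaped "&" are the kind of errors the new fixture exercises.
const report = await htmlValidator.validateString(
  '<p id="x">Tom & Jerry</p><p id="x">Second</p>',
  'invalid-html.html',
);
for (const result of report.results) {
  for (const message of result.messages) {
    console.error(`${message.ruleId}: ${message.message} (line ${message.line})`);
  }
}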

diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html
index c9ba541e5..c79573ba0 100644
--- a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html
+++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html
@@ -22,6 +22,7 @@
 
 <h1>Test Site Home</h1>
 
   <li><a href="…">Page with API Links</a></li>
   <li><a href="…">Example Markdown</a></li>
   <li><a href="…">Page with Unclosed Tags</a></li>
+  <li><a href="invalid-html.html">Invalid HTML Page</a></li>
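The list above also links to an "Example Markdown" page. In the worker added further down, markdown responses are converted to HTML before targets and links are extracted, and rehype-slug is the step that gives headings id attributes so fragment links into markdown pages can be verified. A minimal sketch of that effect (illustrative, not part of the diff, and omitting remark-gfm for brevity):

// Illustrative sketch, not part of the diff: rehype-slug adds ids to headings,
// which is what makes '#getting-started' style fragments resolvable as targets.
import { unified } from 'unified';
import remarkParse from 'remark-parse';
import remarkRehype from 'remark-rehype';
import rehypeSlug from 'rehype-slug';
import rehypeStringify from 'rehype-stringify';

const html = String(
  await unified()
    .use(remarkParse)
    .use(remarkRehype)
    .use(rehypeSlug)
    .use(rehypeStringify)
    .process('## Getting Started'),
);
// html === '<h2 id="getting-started">Getting Started</h2>'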
diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/invalid-html.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/invalid-html.html
new file mode 100644
index 000000000..fd4f01f73
--- /dev/null
+++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/invalid-html.html
@@ -0,0 +1,15 @@
[15-line fixture whose markup did not survive extraction; the recoverable text is the title "Invalid HTML Page", a heading "Invalid HTML", the strings "First" and "Second", and an unescaped "Tom & Jerry"]
diff --git a/packages/code-infra/src/brokenLinksChecker/crawlWorker.mjs b/packages/code-infra/src/brokenLinksChecker/crawlWorker.mjs
new file mode 100644
index 000000000..a2e55ab39
--- /dev/null
+++ b/packages/code-infra/src/brokenLinksChecker/crawlWorker.mjs
@@ -0,0 +1,173 @@
+import { workerData, parentPort } from 'node:worker_threads';
+import { parse } from 'node-html-parser';
+import contentType from 'content-type';
+import { HtmlValidate, StaticConfigLoader, staticResolver } from 'html-validate';
+import { unified } from 'unified';
+import remarkParse from 'remark-parse';
+import remarkGfm from 'remark-gfm';
+import remarkRehype from 'remark-rehype';
+import rehypeSlug from 'rehype-slug';
+import rehypeStringify from 'rehype-stringify';
+
+/** @type {import('./index.mjs').CrawlWorkerInput} */
+const { pageUrl, options } = workerData;
+
+/**
+ * Posts the crawl result back to the parent thread.
+ * @param {import('./index.mjs').CrawlWorkerOutput} output
+ */
+function postResult(output) {
+  if (!parentPort) {
+    throw new Error('crawlWorker must be run as a worker thread');
+  }
+  parentPort.postMessage(output);
+}
+
+/**
+ * Computes the accessible name of an element according to ARIA rules.
+ * @param {import('node-html-parser').HTMLElement | null} elm
+ * @param {import('node-html-parser').HTMLElement} ownerDocument
+ * @returns {string}
+ */
+function getAccessibleName(elm, ownerDocument) {
+  if (!elm) {
+    return '';
+  }
+
+  const ariaLabel = elm.getAttribute('aria-label')?.trim();
+  if (ariaLabel) {
+    return ariaLabel;
+  }
+
+  const labelledby = elm.getAttribute('aria-labelledby');
+  if (labelledby) {
+    const labels = [];
+    for (const id of labelledby.split(/\s+/)) {
+      const label = getAccessibleName(ownerDocument.getElementById(id), ownerDocument);
+      if (label) {
+        labels.push(label);
+      }
+    }
+    const label = labels.join(' ').trim();
+    if (label) {
+      return label;
+    }
+  }
+
+  if (elm.id) {
+    const label = ownerDocument.querySelector(`label[for="${elm.id}"]`);
+    if (label) {
+      return getAccessibleName(label, ownerDocument);
+    }
+  }
+
+  if (elm.tagName === 'IMG') {
+    const alt = elm.getAttribute('alt')?.trim();
+    if (alt) {
+      return alt;
+    }
+  }
+
+  return elm.innerText.trim();
+}
+
+/**
+ * Converts markdown content to HTML using unified pipeline.
+ * @param {string} markdown
+ * @returns {Promise<string>}
+ */
+async function markdownToHtml(markdown) {
+  const result = await unified()
+    .use(remarkParse)
+    .use(remarkGfm)
+    .use(remarkRehype)
+    .use(rehypeSlug)
+    .use(rehypeStringify)
+    .process(markdown);
+  return String(result);
+}
+
+const res = await fetch(new URL(pageUrl, options.host));
+
+const contentTypeHeader = res.headers.get('content-type');
+let type = 'text/html';
+
+if (contentTypeHeader) {
+  try {
+    const parsed = contentType.parse(contentTypeHeader);
+    type = parsed.type;
+  } catch {
+    // invalid content-type, default to text/html
+  }
+}
+
+/** @type {import('./index.mjs').CrawlWorkerPageData} */
+const pageData = {
+  url: pageUrl,
+  status: res.status,
+  targets: [],
+  contentType: type,
+};
+
+if (pageData.status < 200 || pageData.status >= 400) {
+  postResult({ pageData, links: [], htmlValidateResults: null });
+} else if (type.startsWith('image/') || (type !== 'text/html' && type !== 'text/markdown')) {
+  postResult({ pageData, links: [], htmlValidateResults: null });
+} else {
+  const rawContent = await res.text();
+
+  const content = type === 'text/markdown' ? await markdownToHtml(rawContent) : rawContent;
+
+  const dom = parse(content, { parseNoneClosedTags: true });
+
+  // Extract targets
+  for (const target of dom.querySelectorAll('*[id]')) {
+    if (!options.ignoredTargets.has(target.id)) {
+      pageData.targets.push(`#${target.id}`);
+    }
+  }
+
+  // Extract links
+  let ignoredSelector = ':not(*)';
+  if (options.ignoredContent.length > 0) {
+    ignoredSelector = Array.from(options.ignoredContent)
+      .flatMap((selector) => [selector, `${selector} *`])
+      .join(',');
+  }
+  const linksSelector = `a[href]:not(${ignoredSelector})`;
+
+  const links = dom.querySelectorAll(linksSelector).map((a) => ({
+    src: pageUrl,
+    text: getAccessibleName(a, dom),
+    href: a.getAttribute('href') ?? '',
+    contentType: type,
+  }));
+
+  // HTML validation
+  /** @type {{ pageUrl: string, results: import('html-validate').Result[] } | null} */
+  let htmlValidateResults = null;
+  if (options.htmlValidate && type === 'text/html') {
+    const muiHtmlValidateResolver = staticResolver({
+      configs: {
+        'mui:recommended': {
+          extends: ['html-validate:standard', 'html-validate:document', 'html-validate:browser'],
+          rules: {
+            // TODO: Enable when subresource integrity is adopted across projects
+            'require-sri': 'off',
+          },
+        },
+      },
+    });
+
+    const htmlValidator = new HtmlValidate(
+      new StaticConfigLoader([muiHtmlValidateResolver], options.htmlValidate),
+    );
+
+    const report = await htmlValidator.validateString(rawContent, pageUrl);
+    if (!report.valid) {
+      htmlValidateResults = { pageUrl, results: report.results };
+    }
+  }
+
+  postResult({ pageData, links, htmlValidateResults });
+}
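The parent-side code that spawns this worker is outside the portion of index.mjs shown below; that diff only adds the CrawlWorkerInput and CrawlWorkerOutput typedefs and the crawlWorkerUrl constant. A minimal sketch of how the parent might drive one crawl per page, with runCrawlWorker as a hypothetical helper name (illustrative, not part of the diff):

// Illustrative sketch, not part of the diff. `runCrawlWorker` is a hypothetical
// helper: it passes a CrawlWorkerInput via workerData and resolves with the single
// CrawlWorkerOutput message the worker posts before exiting.
import { Worker } from 'node:worker_threads';

const crawlWorkerUrl = new URL('./crawlWorker.mjs', import.meta.url);

function runCrawlWorker(pageUrl, options) {
  return new Promise((resolve, reject) => {
    const worker = new Worker(crawlWorkerUrl, { workerData: { pageUrl, options } });
    worker.once('message', (output) => resolve(output));
    worker.once('error', reject);
    worker.once('exit', (code) => {
      if (code !== 0) {
        reject(new Error(`crawlWorker stopped with exit code ${code}`));
      }
    });
  });
}

// Usage: const { pageData, links, htmlValidateResults } = await runCrawlWorker('/index.html', options);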
diff --git a/packages/code-infra/src/brokenLinksChecker/index.mjs b/packages/code-infra/src/brokenLinksChecker/index.mjs
index 64718cec2..d22161eac 100644
--- a/packages/code-infra/src/brokenLinksChecker/index.mjs
+++ b/packages/code-infra/src/brokenLinksChecker/index.mjs
@@ -1,21 +1,17 @@
 /* eslint-disable no-console */
 import { execaCommand } from 'execa';
 import timers from 'node:timers/promises';
-import { parse } from 'node-html-parser';
 import * as fs from 'node:fs/promises';
 import * as path from 'node:path';
+import { pathToFileURL } from 'node:url';
 import chalk from 'chalk';
 import { Transform } from 'node:stream';
-import contentType from 'content-type';
-import { unified } from 'unified';
-import remarkParse from 'remark-parse';
-import remarkGfm from 'remark-gfm';
-import remarkRehype from 'remark-rehype';
-import rehypeSlug from 'rehype-slug';
-import rehypeStringify from 'rehype-stringify';
+import { Worker } from 'node:worker_threads';
 
 const DEFAULT_CONCURRENCY = 4;
 
+const crawlWorkerUrl = new URL('./crawlWorker.mjs', import.meta.url);
+
 /**
  * Creates a Transform stream that prefixes each line with a given string.
  * Useful for distinguishing server logs from other output.
@@ -105,6 +101,30 @@ function deserializeLinkStructure(data) {
   return linkStructure;
 }
 
+/**
+ * Input data passed to the crawl worker via workerData.
+ * @typedef {Object} CrawlWorkerInput
+ * @property {string} pageUrl - The page URL to crawl
+ * @property {ResolvedCrawlOptions} options - Fully resolved crawl options
+ */
+
+/**
+ * Serialized page data returned by the crawl worker (uses arrays instead of Sets for structured clone).
+ * @typedef {Object} CrawlWorkerPageData
+ * @property {string} url - The normalized page URL
+ * @property {number} status - HTTP status code
+ * @property {string[]} targets - Array of anchor targets (e.g., '#intro')
+ * @property {string} contentType - Content-type of the page
+ */
+
+/**
+ * Output message posted by the crawl worker.
+ * @typedef {Object} CrawlWorkerOutput
+ * @property {CrawlWorkerPageData} pageData - Serialized page data
+ * @property {Link[]} links - Links discovered on the page
+ * @property {{ pageUrl: string, results: import('html-validate').Result[] } | null} htmlValidateResults - HTML validation results, or null if validation was skipped/passed
+ */
+
 /**
  * Data about a crawled page including its URL, HTTP status, and available link targets.
  * @typedef {Object} PageData
@@ -131,77 +151,6 @@ async function writePagesToFile(pages, outPath) {
   await fs.writeFile(outPath, JSON.stringify(fileContent, null, 2), 'utf-8');
 }
 
-/**
- * Computes the accessible name of an element according to ARIA rules.
- * Polyfill for `node.computedName` available only in Chrome v112+.
- * Checks in order: aria-label, aria-labelledby, label[for], img alt, innerText.
- * @param {import('node-html-parser').HTMLElement | null} elm - Element to compute name for
- * @param {import('node-html-parser').HTMLElement} ownerDocument - Document containing the element
- * @returns {string} The computed accessible name, or empty string if none found
- */
-function getAccessibleName(elm, ownerDocument) {
-  if (!elm) {
-    return '';
-  }
-
-  // 1. aria-label
-  const ariaLabel = elm.getAttribute('aria-label')?.trim();
-  if (ariaLabel) {
-    return ariaLabel;
-  }
-
-  // 2. aria-labelledby
-  const labelledby = elm.getAttribute('aria-labelledby');
-  if (labelledby) {
-    const labels = [];
-    for (const id of labelledby.split(/\s+/)) {
-      const label = getAccessibleName(ownerDocument.getElementById(id), ownerDocument);
-      if (label) {
-        labels.push(label);
-      }
-    }
-    const label = labels.join(' ').trim();
-    if (label) {
-      return label;
-    }
-  }
-
-  // 3.