Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/code-infra/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@
"git-url-parse": "^16.1.0",
"globals": "^17.4.0",
"globby": "^16.1.1",
"html-validate": "^10.11.2",
"minimatch": "^10.2.4",
"node-html-parser": "^7.1.0",
"open": "^11.0.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ <h1>Test Site Home</h1>
<li><a href="/page-with-api-links.html">Page with API Links</a></li>
<li><a href="/example.md">Example Markdown</a></li>
<li><a href="/unclosed-tags.html">Page with Unclosed Tags</a></li>
<li><a href="/invalid-html.html">Invalid HTML Page</a></li>
</ul>
</nav>
</body>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>Invalid HTML Page</title>
</head>
<body>
<h1>Invalid HTML</h1>
<!-- Duplicate ID (violates no-duplicate-id rule) -->
<div id="dup">First</div>
<div id="dup">Second</div>
<!-- Raw ampersand (violates no-raw-characters rule) -->
<p>Tom & Jerry</p>
</body>
</html>
173 changes: 173 additions & 0 deletions packages/code-infra/src/brokenLinksChecker/crawlWorker.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import { workerData, parentPort } from 'node:worker_threads';
import { parse } from 'node-html-parser';
import contentType from 'content-type';
import { HtmlValidate, StaticConfigLoader, staticResolver } from 'html-validate';
import { unified } from 'unified';
import remarkParse from 'remark-parse';
import remarkGfm from 'remark-gfm';
import remarkRehype from 'remark-rehype';
import rehypeSlug from 'rehype-slug';
import rehypeStringify from 'rehype-stringify';

/** @type {import('./index.mjs').CrawlWorkerInput} */
const { pageUrl, options } = workerData;

/**
* Posts the crawl result back to the parent thread.
* @param {import('./index.mjs').CrawlWorkerOutput} output
*/
function postResult(output) {
if (!parentPort) {
throw new Error('crawlWorker must be run as a worker thread');
}
parentPort.postMessage(output);
}

/**
* Computes the accessible name of an element according to ARIA rules.
* @param {import('node-html-parser').HTMLElement | null} elm
* @param {import('node-html-parser').HTMLElement} ownerDocument
* @returns {string}
*/
function getAccessibleName(elm, ownerDocument) {
if (!elm) {
return '';
}

const ariaLabel = elm.getAttribute('aria-label')?.trim();
if (ariaLabel) {
return ariaLabel;
}

const labelledby = elm.getAttribute('aria-labelledby');
if (labelledby) {
const labels = [];
for (const id of labelledby.split(/\s+/)) {
const label = getAccessibleName(ownerDocument.getElementById(id), ownerDocument);
if (label) {
labels.push(label);
}
}
const label = labels.join(' ').trim();
if (label) {
return label;
}
}

if (elm.id) {
const label = ownerDocument.querySelector(`label[for="${elm.id}"]`);
if (label) {
return getAccessibleName(label, ownerDocument);
}
}

if (elm.tagName === 'IMG') {
const alt = elm.getAttribute('alt')?.trim();
if (alt) {
return alt;
}
}

return elm.innerText.trim();
}

/**
* Converts markdown content to HTML using unified pipeline.
* @param {string} markdown
* @returns {Promise<string>}
*/
async function markdownToHtml(markdown) {
const result = await unified()
.use(remarkParse)
.use(remarkGfm)
.use(remarkRehype)
.use(rehypeSlug)
.use(rehypeStringify)
.process(markdown);
return String(result);
}

const res = await fetch(new URL(pageUrl, options.host));

const contentTypeHeader = res.headers.get('content-type');
let type = 'text/html';

if (contentTypeHeader) {
try {
const parsed = contentType.parse(contentTypeHeader);
type = parsed.type;
} catch {
// invalid content-type, default to text/html
}
}

/** @type {import('./index.mjs').CrawlWorkerPageData} */
const pageData = {
url: pageUrl,
status: res.status,
targets: [],
contentType: type,
};

if (pageData.status < 200 || pageData.status >= 400) {
postResult({ pageData, links: [], htmlValidateResults: null });
} else if (type.startsWith('image/') || (type !== 'text/html' && type !== 'text/markdown')) {
postResult({ pageData, links: [], htmlValidateResults: null });
} else {
const rawContent = await res.text();

const content = type === 'text/markdown' ? await markdownToHtml(rawContent) : rawContent;

const dom = parse(content, { parseNoneClosedTags: true });

// Extract targets
for (const target of dom.querySelectorAll('*[id]')) {
if (!options.ignoredTargets.has(target.id)) {
pageData.targets.push(`#${target.id}`);
}
}

// Extract links
let ignoredSelector = ':not(*)';
if (options.ignoredContent.length > 0) {
ignoredSelector = Array.from(options.ignoredContent)
.flatMap((selector) => [selector, `${selector} *`])
.join(',');
}
const linksSelector = `a[href]:not(${ignoredSelector})`;

const links = dom.querySelectorAll(linksSelector).map((a) => ({
src: pageUrl,
text: getAccessibleName(a, dom),
href: a.getAttribute('href') ?? '',
contentType: type,
}));

// HTML validation
/** @type {{ pageUrl: string, results: import('html-validate').Result[] } | null} */
let htmlValidateResults = null;
if (options.htmlValidate && type === 'text/html') {
const muiHtmlValidateResolver = staticResolver({
configs: {
'mui:recommended': {
extends: ['html-validate:standard', 'html-validate:document', 'html-validate:browser'],
rules: {
// TODO: Enable when subresource integrity is adopted across projects
'require-sri': 'off',
},
},
},
});

const htmlValidator = new HtmlValidate(
new StaticConfigLoader([muiHtmlValidateResolver], options.htmlValidate),
);

const report = await htmlValidator.validateString(rawContent, pageUrl);
if (!report.valid) {
htmlValidateResults = { pageUrl, results: report.results };
}
}

postResult({ pageData, links, htmlValidateResults });
}
Loading
Loading