Open
Description
- Are you running the latest version?
- Have you included sample input, output, error, and expected output?
- Have you checked if you are using correct configuration?
- Did you try online tool?
Description
I find it unsafe to rely on this library for dependable HTML parsing.
Input
Create test-fxp.js:
const util = require('node:util');
const {XMLParser, XMLValidator} = require('fast-xml-parser');
// Repro driver: for each listed site, download the page and try to parse its HTML with XMLParser, logging success or failure per URL.
(async () => {
for (const url of [
'https://nytimes.com',
'https://cnn.com',
'https://nypost.com',
'https://reddit.com',
'https://github.com'
]) {
const html = await (await fetch(url)).text(); // runs outside the try below, so a fetch failure rejects the whole IIFE instead of being logged per URL
const parsingOptions = {
ignoreAttributes: false,
preserveOrder: true,
unpairedTags: ['hr', 'br', 'link', 'meta'], // tags declared as having no closing tag; NOTE(review): other HTML void elements (img, input, ...) are not listed — confirm intent
stopNodes: ['*.pre', '*.script'], // presumably leaves <pre>/<script> content unparsed at any depth — verify against fast-xml-parser docs
processEntities: true,
htmlEntities: true,
};
const parser = new XMLParser(parsingOptions); // a fresh parser is built for every URL
try {
const result = await parser.parse(html); // NOTE(review): parse() is presumably synchronous, making this await a no-op — confirm
console.log(`Success: ${url}:`, util.inspect(result, { depth: 1, colors: true })); // depth: 1 keeps the dump short; deeper nodes print as [Array]/[Object]
} catch (err) {
console.error(`Fail: ${url}:`, err); // parse errors are logged and the loop moves on to the next URL
}
}
})();
Output
run:
node test-fxp.js
Fail: https://nytimes.com: Error: Unexpected end of script
at OrderedObjParser.parseXml (<pwd>/node_modules/fast-xml-parser/src/xmlparser/OrderedObjParser.js:323:31)
at XMLParser.parse (<pwd>/node_modules/fast-xml-parser/src/xmlparser/XMLParser.js:35:48)
at <pwd>/test-fxp.js:23:35
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
Fail: https://cnn.com: Error: StopNode is not closed.
at findClosingIndex (<pwd>/node_modules/fast-xml-parser/src/xmlparser/OrderedObjParser.js:489:11)
at OrderedObjParser.readStopNodeData (<pwd>/node_modules/fast-xml-parser/src/xmlparser/OrderedObjParser.js:558:30)
at OrderedObjParser.parseXml (<pwd>/node_modules/fast-xml-parser/src/xmlparser/OrderedObjParser.js:322:33)
at XMLParser.parse (<pwd>/node_modules/fast-xml-parser/src/xmlparser/XMLParser.js:35:48)
at <pwd>/test-fxp.js:23:35
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
Fail: https://nypost.com: Error: Unexpected end of script
at OrderedObjParser.parseXml (<pwd>/node_modules/fast-xml-parser/src/xmlparser/OrderedObjParser.js:323:31)
at XMLParser.parse (<pwd>/node_modules/fast-xml-parser/src/xmlparser/XMLParser.js:35:48)
at <pwd>/test-fxp.js:23:35
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
Success: https://reddit.com: [ { '!doctype': [Array] }, { p: [Array] }, { p: [Array] } ]
Success: https://github.com: [ { html: [Array], ':@': [Object] } ]
expected data
Some way to handle broken HTML without failing the entire parsing process.
Would you like to work on this issue?
- Yes
- No