Polish PRs #1072, #1074

puzrin · puzrin · commit 59955f2ad35c · 2026-05-21T06:26:18.000+03:00
diff --git a/lib/common/utils.mjs b/lib/common/utils.mjs
@@ -279,12 +279,12 @@ function normalizeReference (str) {
   return str.toLowerCase().toUpperCase()
 }
 
-// Light trim for blocks like paragraph/header, where simple whitespaces should be trimmed, but unicode ones should stay intact.
 function isAsciiTrimmable (c) {
   return c === 0x20 || c === 0x09 || c === 0x0a || c === 0x0d
 }
 
-// Removes space-like characters that are allowed to be removed in CommonMark spec.
+// "Light" .trim() for blocks (headers, paragraphs), where unicode spaces
+// should be preserved.
 function asciiTrim (str) {
   let start = 0
   for (; start < str.length; start++) {
diff --git a/lib/rules_inline/state_inline.mjs b/lib/rules_inline/state_inline.mjs
@@ -79,21 +79,6 @@ StateInline.prototype.push = function (type, tag, nesting) {
   return token
 }
 
-// Get the last character code before the given position, considering surrogate pairs.
-function getLastCharCode (str, pos) {
-  // treat beginning of the line as a whitespace
-  if (pos <= 0) { return 0x20 }
-  const charCode = str.charCodeAt(pos - 1)
-  // not a low-surrogate code unit (is BMP code point)
-  if ((charCode & 0xFC00) !== 0xDC00) { return charCode }
-
-  // undefined if out of range (typically due to isolated low-surrogate code unit
-  // at the beginning of the string)
-  const codePoint = str.codePointAt(pos - 2)
-  // undefined > 0xffff = false, so we don't need extra check here
-  return codePoint > 0xffff ? codePoint : charCode
-}
-
 // Scan a sequence of emphasis-like markers, and determine whether
 // it can start an emphasis sequence or end an emphasis sequence.
 //
@@ -104,15 +89,47 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) {
   const max = this.posMax
   const marker = this.src.charCodeAt(start)
 
-  const lastChar = getLastCharCode(this.src, start)
+  // Astral characters below are combined manually, because .codePointAt()
+  // is not guaranteed to return a number (it yields undefined when out of
+  // range), and we want to avoid JIT cache issues. Broken surrogate pairs
+  // are evaluated as U+FFFD to prevent possible crashes.
+
+  let lastChar
+  if (start === 0) {
+    // treat beginning of the line as a whitespace
+    lastChar = 0x20
+  } else if (start === 1) {
+    lastChar = this.src.charCodeAt(0)
+    if ((lastChar & 0xF800) === 0xD800) { lastChar = 0xFFFD }
+  } else {
+    lastChar = this.src.charCodeAt(start - 1)
+    if ((lastChar & 0xFC00) === 0xDC00) {
+      // low surrogate => add high one, replace broken pair with U+FFFD
+      const highSurr = this.src.charCodeAt(start - 2)
+      lastChar = (highSurr & 0xFC00) === 0xD800
+        ? 0x10000 + ((highSurr - 0xD800) << 10) + (lastChar - 0xDC00)
+        : 0xFFFD
+    } else if ((lastChar & 0xFC00) === 0xD800) {
+      lastChar = 0xFFFD
+    }
+  }
 
   let pos = start
   while (pos < max && this.src.charCodeAt(pos) === marker) { pos++ }
 
   const count = pos - start
 
   // treat end of the line as a whitespace
-  const nextChar = pos < max ? this.src.codePointAt(pos) : 0x20
+  let nextChar = pos < max ? this.src.charCodeAt(pos) : 0x20
+  if ((nextChar & 0xFC00) === 0xD800) {
+    // high surrogate => add low one, replace broken pair with U+FFFD
+    const lowSurr = this.src.charCodeAt(pos + 1)
+    nextChar = (lowSurr & 0xFC00) === 0xDC00
+      ? 0x10000 + ((nextChar - 0xD800) << 10) + (lowSurr - 0xDC00)
+      : 0xFFFD
+  } else if ((nextChar & 0xFC00) === 0xDC00) {
+    nextChar = 0xFFFD
+  }
 
   const isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(fromCodePoint(lastChar))
   const isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(fromCodePoint(nextChar))

Original file line number	Diff line number	Diff line change
`@@ -279,12 +279,12 @@ function normalizeReference (str) {`
`279`	`279`	`return str.toLowerCase().toUpperCase()`
`280`	`280`	`}`
`281`	`281`
`282`		`-// Light trim for blocks like paragraph/header, where simple whitespaces should be trimmed, but unicode ones should stay intact.`
`283`	`282`	`function isAsciiTrimmable (c) {`
`284`	`283`	`return c === 0x20 \|\| c === 0x09 \|\| c === 0x0a \|\| c === 0x0d`
`285`	`284`	`}`
`286`	`285`
`287`		`-// Removes space-like characters that are allowed to be removed in CommonMark spec.`
	`286`	`+// "Light" .trim() for blocks (headers, paragraphs), where unicode spaces`
	`287`	`+// should be preserved.`
`288`	`288`	`function asciiTrim (str) {`
`289`	`289`	`let start = 0`
`290`	`290`	`for (; start < str.length; start++) {`