Skip to content

Commit 59955f2

Browse files
committed
Polish PRs #1072, #1074
1 parent 2c1e305 commit 59955f2

2 files changed

Lines changed: 36 additions & 19 deletions

File tree

lib/common/utils.mjs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,12 +279,12 @@ function normalizeReference (str) {
279279
return str.toLowerCase().toUpperCase()
280280
}
281281

282-
// Light trim for blocks like paragraph/header, where simple whitespaces should be trimmed, but unicode ones should stay intact.
283282
function isAsciiTrimmable (c) {
284283
return c === 0x20 || c === 0x09 || c === 0x0a || c === 0x0d
285284
}
286285

287-
// Removes space-like characters that are allowed to be removed in CommonMark spec.
286+
// "Light" .trim() for blocks (headers, paragraphs), where unicode spaces
287+
// should be preserved.
288288
function asciiTrim (str) {
289289
let start = 0
290290
for (; start < str.length; start++) {

lib/rules_inline/state_inline.mjs

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -79,21 +79,6 @@ StateInline.prototype.push = function (type, tag, nesting) {
7979
return token
8080
}
8181

82-
// Get the last character code before the given position, considering surrogate pairs.
83-
function getLastCharCode (str, pos) {
84-
// treat beginning of the line as a whitespace
85-
if (pos <= 0) { return 0x20 }
86-
const charCode = str.charCodeAt(pos - 1)
87-
// not a low-surrogate code unit (is BMP code point)
88-
if ((charCode & 0xFC00) !== 0xDC00) { return charCode }
89-
90-
// undefined if out of range (typically due to isolated low-surrogate code unit
91-
// at the beginning of the string)
92-
const codePoint = str.codePointAt(pos - 2)
93-
// undefined > 0xffff = false, so we don't need extra check here
94-
return codePoint > 0xffff ? codePoint : charCode
95-
}
96-
9782
// Scan a sequence of emphasis-like markers, and determine whether
9883
// it can start an emphasis sequence or end an emphasis sequence.
9984
//
@@ -104,15 +89,47 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) {
10489
const max = this.posMax
10590
const marker = this.src.charCodeAt(start)
10691

107-
const lastChar = getLastCharCode(this.src, start)
92+
// Astral characters below are combined manually, because .codePointAt()
93+
// does not guarantee numeric type output. And we don't wish JIT cache issues.
94+
// The broken surrogate pairs are evaluated as U+FFFD to prevent possible
95+
// crashes.
96+
97+
let lastChar
98+
if (start === 0) {
99+
// treat beginning of the line as a whitespace
100+
lastChar = 0x20
101+
} else if (start === 1) {
102+
lastChar = this.src.charCodeAt(0)
103+
if ((lastChar & 0xF800) === 0xD800) { lastChar = 0xFFFD }
104+
} else {
105+
lastChar = this.src.charCodeAt(start - 1)
106+
if ((lastChar & 0xFC00) === 0xDC00) {
107+
// low surrogate => add high one, replace broken pair with U+FFFD
108+
const highSurr = this.src.charCodeAt(start - 2)
109+
lastChar = (highSurr & 0xFC00) === 0xD800
110+
? 0x10000 + ((highSurr - 0xD800) << 10) + (lastChar - 0xDC00)
111+
: 0xFFFD
112+
} else if ((lastChar & 0xFC00) === 0xD800) {
113+
lastChar = 0xFFFD
114+
}
115+
}
108116

109117
let pos = start
110118
while (pos < max && this.src.charCodeAt(pos) === marker) { pos++ }
111119

112120
const count = pos - start
113121

114122
// treat end of the line as a whitespace
115-
const nextChar = pos < max ? this.src.codePointAt(pos) : 0x20
123+
let nextChar = pos < max ? this.src.charCodeAt(pos) : 0x20
124+
if ((nextChar & 0xFC00) === 0xD800) {
125+
// high surrogate => add low one, replace broken pair with U+FFFD
126+
const lowSurr = this.src.charCodeAt(pos + 1)
127+
nextChar = (lowSurr & 0xFC00) === 0xDC00
128+
? 0x10000 + ((nextChar - 0xD800) << 10) + (lowSurr - 0xDC00)
129+
: 0xFFFD
130+
} else if ((nextChar & 0xFC00) === 0xDC00) {
131+
nextChar = 0xFFFD
132+
}
116133

117134
const isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(fromCodePoint(lastChar))
118135
const isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(fromCodePoint(nextChar))

0 commit comments

Comments
 (0)