Skip to content

Commit 7fd8c21

Browse files
committed
Clarifications
1 parent 59164a0 commit 7fd8c21

File tree

2 files changed

+82
-44
lines changed

2 files changed

+82
-44
lines changed

src/html/mod.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,13 @@ pub use self::text_type::TextType;
1818
pub(crate) fn escape_body_text(mut content: &str, output_handler: &mut impl FnMut(&str)) {
1919
loop {
2020
if let Some(pos) = memchr3(b'&', b'<', b'>', content.as_bytes()) {
21-
let Some((chunk_before, (matched, rest))) = content
22-
.split_at_checked(pos)
23-
.and_then(|(before, rest)| Some((before, rest.split_at_checked(1)?)))
24-
else {
21+
let Some((chunk_before, rest)) = content.split_at_checked(pos) else {
2522
return;
2623
};
24+
let Some((matched, rest)) = rest.split_at_checked(1) else {
25+
return;
26+
};
27+
2728
content = rest;
2829
let matched = matched.as_bytes()[0];
2930

src/rewritable_units/text_encoder.rs

Lines changed: 77 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,36 @@ impl<'output_handler> StreamingHandlerSinkInner<'output_handler> {
125125
}
126126
}
127127

128+
/// Temporary buffer used for encoding_rs output
128129
enum Buffer {
130+
/// Stack buffer avoids heap allocation, and lets us go back quickly to the ASCII fast path.
131+
Stack([u8; 63]), // leave a byte for the enum's tag, so that the enum has 64-byte size
132+
/// Used when encoding_rs asks for a larger buffer, or the content is large enough for small buffer roundtrips to add up
129133
Heap(Vec<u8>),
130-
Stack([u8; 63]), // leave a byte for the tag
134+
}
135+
136+
impl Buffer {
137+
/// Arbitrary content length at which to switch from the small on-stack buffer to a heap allocation
138+
const CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER: usize = 1 << 20;
139+
140+
/// Arbitrary, about a page size
141+
const DEFAULT_HEAP_BUFFER_SIZE: usize = 4096;
142+
143+
fn buffer_for_length(&mut self, content_len: usize) -> &mut [u8] {
144+
let buffer = match self {
145+
Buffer::Heap(buf) => buf.as_mut_slice(),
146+
// Long non-ASCII content could take lots of roundtrips through the encoder
147+
buf if content_len >= Self::CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER => {
148+
*buf = Buffer::Heap(vec![0; Self::DEFAULT_HEAP_BUFFER_SIZE]);
149+
match buf {
150+
Buffer::Heap(buf) => buf.as_mut(),
151+
_ => unreachable!(),
152+
}
153+
}
154+
Buffer::Stack(buf) => buf.as_mut_slice(),
155+
};
156+
buffer
157+
}
131158
}
132159

133160
struct TextEncoder {
@@ -152,6 +179,7 @@ impl TextEncoder {
152179
#[inline(never)]
153180
fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) {
154181
loop {
182+
// First, fast path for ASCII-only prefix
155183
debug_assert!(!self.encoder.has_pending_state()); // ASCII-compatible encodings are not supposed to have it
156184
let ascii_len = Encoding::ascii_valid_up_to(content.as_bytes());
157185
if let Some((ascii, remainder)) = content.split_at_checked(ascii_len) {
@@ -164,41 +192,34 @@ impl TextEncoder {
164192
content = remainder;
165193
}
166194

167-
let buffer = match &mut self.buffer {
168-
Buffer::Heap(buf) => buf.as_mut_slice(),
169-
// Long non-ASCII content could take lots of roundtrips through the encoder
170-
buf if content.len() >= 1 << 20 => {
171-
*buf = Buffer::Heap(vec![0; 4096]);
172-
match buf {
173-
Buffer::Heap(buf) => buf.as_mut(),
174-
_ => unreachable!(),
175-
}
176-
}
177-
Buffer::Stack(buf) => buf.as_mut_slice(),
178-
};
195+
// Now the content starts with a non-ASCII byte, so encoding_rs may need a buffer to convert into.
196+
let buffer = self.buffer.buffer_for_length(content.len());
179197

198+
// last == true is needed only for the stateful ISO-2022-JP encoding, which this library doesn't allow
180199
let (result, read, written, _) = self.encoder.encode_from_utf8(content, buffer, false);
200+
181201
if written > 0 && written <= buffer.len() {
182202
(output_handler)(&buffer[..written]);
183203
}
184204
if read >= content.len() {
185205
return;
186206
}
187207
content = &content[read..];
208+
188209
match result {
189210
CoderResult::InputEmpty => {
190211
debug_assert!(content.is_empty());
191212
return;
192213
}
214+
// we've made progress, and can try again without growing the buffer
215+
CoderResult::OutputFull if written > 0 => {}
193216
CoderResult::OutputFull => {
194-
match &mut self.buffer {
195-
Buffer::Heap(buf) if buf.len() >= 1024 => {
196-
if written == 0 {
197-
panic!("encoding_rs infinite loop"); // encoding_rs only needs a dozen bytes
198-
}
199-
}
200-
buf => *buf = Buffer::Heap(vec![0; 1024]),
201-
}
217+
// encoding_rs only needs a dozen bytes. If a large buffer is insufficient, it must be a bug.
218+
assert!(
219+
buffer.len() < Buffer::DEFAULT_HEAP_BUFFER_SIZE,
220+
"encoding_rs infinite loop"
221+
);
222+
self.buffer = Buffer::Heap(vec![0; Buffer::DEFAULT_HEAP_BUFFER_SIZE]);
202223
}
203224
}
204225
}
@@ -213,45 +234,60 @@ const fn utf8_width(b: u8) -> u8 {
213234
b.leading_ones() as _
214235
}
215236

237+
/// Stitches together UTF-8 from byte writes that may split UTF-8 sequences into multiple fragments
216238
struct IncompleteUtf8Resync {
217-
bytes: [u8; 4],
218-
len: u8,
239+
/// Buffers an incomplete UTF-8 sequence
240+
char_bytes: [u8; 4],
241+
/// Number of bytes in `char_bytes`
242+
char_len: u8,
219243
}
220244

221245
impl IncompleteUtf8Resync {
222246
pub fn new() -> Self {
223247
Self {
224-
bytes: [0; 4],
225-
len: 0,
248+
char_bytes: [0; 4],
249+
char_len: 0,
226250
}
227251
}
228252

253+
/// Returns a valid UTF-8 fragment, and not-yet-checked remainder of the bytes.
254+
///
255+
/// Call `discard_incomplete()` after the last write to drop any partially-written char.
229256
pub fn utf8_bytes_to_slice<'buf, 'src: 'buf>(
230257
&'buf mut self,
231258
mut content: &'src [u8],
232259
) -> Result<(&'buf str, &'src [u8]), Utf8Error> {
233-
if self.len > 0 {
234-
let mut found_end_byte = false;
260+
// There may be an incomplete char buffered from the previous write that must be continued now
261+
if self.char_len > 0 {
262+
let mut must_emit_now = false;
235263
while let Some((&next_byte, rest)) = content.split_first() {
236264
if is_continuation_byte(next_byte) {
237-
if let Some(buf) = self.bytes.get_mut(self.len as usize) {
265+
if let Some(buf) = self.char_bytes.get_mut(self.char_len as usize) {
238266
*buf = next_byte;
239-
self.len += 1;
267+
self.char_len += 1;
240268
content = rest;
241269
continue;
242270
}
271+
// overlong sequences fall through here, and will be checked when `char_bytes` is flushed
243272
}
244-
found_end_byte = true;
273+
must_emit_now = true;
245274
break;
246275
}
247276

248-
if found_end_byte || self.len >= utf8_width(self.bytes[0]) {
249-
let char_buf = self.bytes.get(..self.len as usize).ok_or(Utf8Error)?;
250-
self.len = 0;
251-
std::str::from_utf8(char_buf)
252-
.map_err(|_| Utf8Error)
253-
.map(|ch| (ch, content))
277+
if self.char_len >= utf8_width(self.char_bytes[0]) {
278+
must_emit_now = true;
279+
}
280+
281+
if must_emit_now {
282+
let char_buf = self
283+
.char_bytes
284+
.get(..self.char_len as usize)
285+
.ok_or(Utf8Error)?;
286+
self.char_len = 0;
287+
let ch = std::str::from_utf8(char_buf).map_err(|_| Utf8Error)?;
288+
Ok((ch, content))
254289
} else {
290+
// a partial write has ended without fully completing a char (it's possible to write 1 byte at a time)
255291
debug_assert!(content.is_empty());
256292
Ok(("", b""))
257293
}
@@ -264,11 +300,12 @@ impl IncompleteUtf8Resync {
264300
let (valid, invalid) = content
265301
.split_at_checked(err.valid_up_to())
266302
.ok_or(Utf8Error)?;
267-
self.bytes
303+
// save the incomplete bytes from the end for the next write
304+
self.char_bytes
268305
.get_mut(..invalid.len())
269306
.ok_or(Utf8Error)?
270307
.copy_from_slice(invalid);
271-
self.len = invalid.len() as _;
308+
self.char_len = invalid.len() as _;
272309
// valid_up_to promises it is valid
273310
debug_assert!(std::str::from_utf8(valid).is_ok());
274311
let valid = unsafe { std::str::from_utf8_unchecked(valid) };
@@ -280,8 +317,8 @@ impl IncompleteUtf8Resync {
280317

281318
/// True if there were incomplete invalid bytes in the buffer
282319
pub fn discard_incomplete(&mut self) -> bool {
283-
if self.len > 0 {
284-
self.len = 0;
320+
if self.char_len > 0 {
321+
self.char_len = 0;
285322
true
286323
} else {
287324
false

0 commit comments

Comments
 (0)